This commit is contained in:
guange 2019-01-17 22:41:27 +08:00
parent f8a769ed49
commit da57a3a604
4 changed files with 153 additions and 0 deletions

View File

@@ -40,6 +40,7 @@ class CommentItem(scrapy.Item):
class JDProductItem(scrapy.Item):
id = scrapy.Field()
type = scrapy.Field()
name = scrapy.Field()
price = scrapy.Field()
url = scrapy.Field()

View File

@@ -87,6 +87,7 @@ class JDHbasePipeline(object):
if isinstance(item, JDProductItem):
self.product_table.put(item["id"],
{"product:name": item["name"],
"product:type": item["type"],
"product:price": item["price"],
"product:url": item["url"],
"product:img": item["img"],

View File

@@ -114,6 +114,7 @@ class JdSpider(scrapy.Spider):
print(shop)
yield JDProductItem(
type=0,
name=name,
price=price,
url=url,

View File

@@ -0,0 +1,150 @@
# coding=utf-8
import json
import time
import scrapy
from scrapy import Request
from scrapy_splash import SplashRequest
import pdb
from taobao.items import JDProductItem, JDCommentItem, JDCommentSummary
import re
from taobao.utils import now_time
class Jd1Spider(scrapy.Spider):
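    """Crawl JD.com category listings via Splash and follow each product's
    JSONP comment endpoint for reviews and rating summaries."""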
name = 'jd1'
allowed_domains = ['jd.com']
start_urls = [
"https://list.jd.com/list.html?cat=670,671,672"
]
custom_settings = {
'LOG_LEVEL': "WARN",
'ITEM_PIPELINES': {
'taobao.pipelines.JDCleanDataPipeline': 300,
'taobao.pipelines.JDHbasePipeline': 400,
}
}
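
    # JD serves comments from a JSONP endpoint; the callback name baked into
    # this URL (fetchJSON_comment98vv33164) must match the regex that unwraps
    # the response in parse_comment below.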
def make_comment_url(self, product_id, page):
comment_url = "https://club.jd.com/comment/productPageComments.action?callback=fetchJSON" \
"_comment98vv33164&productId=%s&score=0&sortType=5&page=%d&pageSize=10" \
"&isShadowSku=0&rid=0&fold=1" % (product_id, page)
return comment_url
def start_requests(self):
for url in self.start_urls:
yield SplashRequest(url,
self.parse,
args={})
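
    # The endpoint returns "fetchJSON_comment98vv33164({...});", so the JSON
    # payload has to be cut out of the JSONP wrapper before parsing.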
def parse_comment(self, response):
# pdb.set_trace()
product_id = response.meta["product_id"]
page = response.meta["page"]
print("抓取评论, %s, 当前 %d" % (product_id, page))
m = re.match(r'^fetchJSON_comment98vv33164\((.+)\);$', response.text)
if m:
t = m.group(1)
j = json.loads(t)
commentSummary = j['comments']
for comment in commentSummary:
c_content = comment['content']
c_time = comment['referenceTime']
c_name = comment['nickname']
c_client = comment['userClientShow']
c_id = "%s_%s" % (product_id, comment['id'])
print('{}, {} {} {}\n{}\n'.format(c_id, c_name, c_time, c_client, c_content))
yield JDCommentItem(
id=c_id,
user_name=c_name,
from_type=c_client,
content=c_content,
comment_time=c_time,
created_at=now_time()
)
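            # Only the first page carries the aggregate rating counters.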
if page == 0:
productCommentSummary = j['productCommentSummary']
yield JDCommentSummary(
                id=product_id,
                good_count=productCommentSummary['goodCount'],
                general_count=productCommentSummary['generalCount'],
                poor_count=productCommentSummary['poorCount'],
                comment_count=productCommentSummary['commentCount'],
                default_good_count=productCommentSummary['defaultGoodCount'],
                good_rate=productCommentSummary['goodRate']
)
page = page + 1
next_comment_url = self.make_comment_url(product_id, page)
# time.sleep(10)
            if page > self.settings.getint('COMMENT_MAX_PAGE'):
                print("Comment crawl hit max depth, %s, pages: %d" % (product_id, page))
else:
yield Request(next_comment_url, self.parse_comment,
meta={'product_id': product_id, 'page': page})
else:
print("评论抓取完成, %s, 页数: %d" % (product_id, page))
def parse(self, response):
items = response.xpath('//div[@id="plist"]//li[@class="gl-item"]')
for item in items:
# pdb.set_trace()
name = item.xpath('.//div[contains(@class, "p-name")]/a/em/text()').extract_first()
print(name)
price = item.xpath(
'.//div[contains(@class, "p-price")]/strong/i/text()').extract_first()
print(price)
url = item.xpath('.//div[contains(@class, "p-name")]/a/@href').extract_first()
print('https:' + url)
img = item.xpath('.//div[@class="p-img"]/a/img/@data-lazy-img').extract_first()
if not img:
img = item.xpath('.//div[@class="p-img"]/a/img/@src').extract_first()
comment_num = item.xpath('.//div[@class="p-commit"]/strong/a/text()').extract_first()
print(comment_num)
shop = item.xpath('.//div[@class="p-shop"]/span/a/@title').extract_first()
print(shop)
yield JDProductItem(
type=1,
name=name,
price=price,
url=url,
img=img,
comment_num=comment_num,
shop=shop,
created_at=now_time()
)
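            # The product id is the digit run before ".html" in the detail
            # URL; it keys the comment API.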
m = re.match(r'.+\/(\d+)\.html', url)
if m:
product_id = m.group(1)
comment_url = self.make_comment_url(product_id, 0)
yield Request(comment_url, self.parse_comment,
meta={'product_id': product_id, 'page': 0},
priority=100)
        # fetch the next listing page
next_page = response.xpath('//a[@class="pn-next"]/@href').extract_first()
if next_page:
page = 1
m = re.match(r'.+page=(\d+).+', next_page)
if m:
                page = int(m.group(1))
            print("Crawling page {0}".format(page))
# time.sleep(10)
            yield SplashRequest(
                url=response.urljoin(next_page)
            )
else:
print("抓取结束")