.

2019-01-17 22:41:27 +08:00 · 2019-01-17 22:41:27 +08:00 · da57a3a604
parent f8a769ed49
commit da57a3a604
4 changed files with 153 additions and 0 deletions
--- a/chapter1/crawler/taobao/items.py
+++ b/chapter1/crawler/taobao/items.py
@ -40,6 +40,7 @@ class CommentItem(scrapy.Item):
 class JDProductItem(scrapy.Item):
    id = scrapy.Field()
    type = scrapy.Field()
    name = scrapy.Field()
    price = scrapy.Field()
    url = scrapy.Field()
--- a/chapter1/crawler/taobao/pipelines.py
+++ b/chapter1/crawler/taobao/pipelines.py
@ -87,6 +87,7 @@ class JDHbasePipeline(object):
        if isinstance(item, JDProductItem):
            self.product_table.put(item["id"],
                                   {"product:name": item["name"],
                                    "product:type": item["type"],
                                    "product:price": item["price"],
                                    "product:url": item["url"],
                                    "product:img": item["img"],
--- a/chapter1/crawler/taobao/spiders/jd.py
+++ b/chapter1/crawler/taobao/spiders/jd.py
@ -114,6 +114,7 @@ class JdSpider(scrapy.Spider):
            print(shop)
            yield JDProductItem(
                type=0,
                name=name,
                price=price,
                url=url,
--- a/chapter1/crawler/taobao/spiders/jd1.py
+++ b/chapter1/crawler/taobao/spiders/jd1.py
@ -0,0 +1,150 @@
 # coding=utf-8
 import json
 import time
 import scrapy
 from scrapy import Request
 from scrapy.conf import settings
 from scrapy_splash import SplashRequest
 import pdb
 from taobao.items import JDProductItem, JDCommentItem, JDCommentSummary
 import re
 from taobao.utils import now_time
 class Jd1Spider(scrapy.Spider):
    """Spider that walks a JD product listing and collects product comments.

    Listing pages are requested with ``SplashRequest``. For every product
    found on a listing page a ``JDProductItem`` is emitted, and the comment
    endpoint is then paged through with plain ``Request`` objects, emitting
    one ``JDCommentItem`` per comment and one ``JDCommentSummary`` per
    product (taken from the first comment page).
    """
    name = 'jd1'
    allowed_domains = ['jd.com']
    start_urls = [
        "https://list.jd.com/list.html?cat=670,671,672"
    ]
    custom_settings = {
        'LOG_LEVEL': "WARN",
        'ITEM_PIPELINES': {
            'taobao.pipelines.JDCleanDataPipeline': 300,
            'taobao.pipelines.JDHbasePipeline': 400,
        }
    }

    # JSONP callback name sent to the comment endpoint. The same value is
    # needed to unwrap the response, so it is defined exactly once.
    _comment_callback = "fetchJSON_comment98vv33164"
    # Matches ``<callback>(<json>);``; DOTALL lets the JSON payload span lines.
    _comment_jsonp_re = re.compile(
        r'^%s\((.+)\);$' % re.escape(_comment_callback), re.DOTALL)
    # Extracts the numeric product id from a product page link.
    _product_id_re = re.compile(r'.+\/(\d+)\.html')
    # Extracts the page number from the "next page" link (used for logging).
    _next_page_re = re.compile(r'[?&]page=(\d+)')
    # Fallback used when the COMMENT_MAX_PAGE setting is not configured.
    default_comment_max_page = 10

    def make_comment_url(self, product_id, page):
        """Build the comment endpoint URL for one product and page.

        :param product_id: product identifier, interpolated with ``%s``.
        :param page: zero-based comment page number (an integer).
        :return: the absolute URL as a string.
        """
        comment_url = ("https://club.jd.com/comment/productPageComments.action"
                       "?callback=%s&productId=%s&score=0&sortType=5&page=%d"
                       "&pageSize=10&isShadowSku=0&rid=0&fold=1"
                       % (self._comment_callback, product_id, page))
        return comment_url

    def start_requests(self):
        """Yield one ``SplashRequest`` per start URL, handled by ``parse``."""
        for url in self.start_urls:
            yield SplashRequest(url,
                                self.parse,
                                args={})

    def parse_comment(self, response):
        """Parse one comment page and schedule the next one.

        Expects ``response.meta`` to carry ``product_id`` and ``page``.
        Yields a ``JDCommentItem`` per comment, a ``JDCommentSummary`` when
        ``page`` is 0, and a follow-up ``Request`` for the next page unless
        the page was empty or the ``COMMENT_MAX_PAGE`` setting is exceeded.
        """
        product_id = response.meta["product_id"]
        page = response.meta["page"]
        print("抓取评论, %s, 当前 %d页" % (product_id, page))
        # Strip first so trailing whitespace/newlines do not defeat the ``$``.
        m = self._comment_jsonp_re.match(response.text.strip())
        if not m:
            # Not a JSONP payload: treat it as the end of the comments.
            print("评论抓取完成, %s, 页数: %d" % (product_id, page))
            return
        j = json.loads(m.group(1))
        # The key may be present but null; treat that like an empty page.
        comments = j.get('comments') or []
        for comment in comments:
            c_content = comment['content']
            c_time = comment['referenceTime']
            c_name = comment['nickname']
            c_client = comment['userClientShow']
            # Prefix with the product id so comment ids are unique per product.
            c_id = "%s_%s" % (product_id, comment['id'])
            print('{}, {} {} {}\n{}\n'.format(c_id, c_name, c_time, c_client, c_content))
            yield JDCommentItem(
                id=c_id,
                user_name=c_name,
                from_type=c_client,
                content=c_content,
                comment_time=c_time,
                created_at=now_time()
            )
        if page == 0:
            # The summary is emitted once, from the first page only.
            summary = j['productCommentSummary']
            yield JDCommentSummary(
                id=product_id,
                good_count=summary['goodCount'],
                general_count=summary['generalCount'],
                poor_count=summary['poorCount'],
                comment_count=summary['commentCount'],
                default_good_count=summary['defaultGoodCount'],
                good_rate=summary['goodRate']
            )
        if not comments:
            # An empty page means there is nothing further to fetch.
            print("评论抓取完成, %s, 页数: %d" % (product_id, page))
            return
        page = page + 1
        # Read from the spider's own settings instead of the deprecated
        # ``scrapy.conf`` global; getint also guards against an unset value.
        max_page = self.settings.getint('COMMENT_MAX_PAGE',
                                        self.default_comment_max_page)
        if page > max_page:
            print("评论抓取达到最大深度, %s, 页数: %d" % (product_id, page))
            return
        next_comment_url = self.make_comment_url(product_id, page)
        yield Request(next_comment_url, self.parse_comment,
                      meta={'product_id': product_id, 'page': page})

    def parse(self, response):
        """Parse one listing page.

        Yields a ``JDProductItem`` per product, a high-priority ``Request``
        for the first comment page of each product whose link contains a
        numeric id, and a ``SplashRequest`` for the next listing page when
        a "next" link is present.
        """
        items = response.xpath('//div[@id="plist"]//li[@class="gl-item"]')
        for item in items:
            name = item.xpath('.//div[contains(@class, "p-name")]/a/em/text()').extract_first()
            print(name)
            price = item.xpath(
                './/div[contains(@class, "p-price")]/strong/i/text()').extract_first()
            print(price)
            url = item.xpath('.//div[contains(@class, "p-name")]/a/@href').extract_first()
            if not url:
                # Without a link neither the URL nor the product id can be
                # derived; skip the entry instead of failing the whole page.
                self.logger.warning(
                    "Skipping listing entry without a product link: %r", name)
                continue
            print('https:' + url)
            # Prefer the lazy-load attribute and fall back to ``src``.
            img = item.xpath('.//div[@class="p-img"]/a/img/@data-lazy-img').extract_first()
            if not img:
                img = item.xpath('.//div[@class="p-img"]/a/img/@src').extract_first()
            comment_num = item.xpath('.//div[@class="p-commit"]/strong/a/text()').extract_first()
            print(comment_num)
            shop = item.xpath('.//div[@class="p-shop"]/span/a/@title').extract_first()
            print(shop)
            yield JDProductItem(
                type=1,
                name=name,
                price=price,
                url=url,
                img=img,
                comment_num=comment_num,
                shop=shop,
                created_at=now_time()
            )
            m = self._product_id_re.match(url)
            if m:
                product_id = m.group(1)
                comment_url = self.make_comment_url(product_id, 0)
                yield Request(comment_url, self.parse_comment,
                              meta={'product_id': product_id, 'page': 0},
                              priority=100)
        # Follow the link to the next listing page, if any.
        next_page = response.xpath('//a[@class="pn-next"]/@href').extract_first()
        if next_page:
            page = 1
            # ``search`` also handles a ``page=N`` that ends the URL.
            m = self._next_page_re.search(next_page)
            if m:
                page = m.group(1)
            print("开始抓取:{0}页".format(page))
            yield SplashRequest(
                url='https://list.jd.com' + next_page,
                callback=self.parse
            )
        else:
            print("抓取结束")