diff --git a/chapter1/crawler/taobao/items.py b/chapter1/crawler/taobao/items.py
index a936d68..d03dbfd 100644
--- a/chapter1/crawler/taobao/items.py
+++ b/chapter1/crawler/taobao/items.py
@@ -40,6 +40,7 @@ class CommentItem(scrapy.Item):
 
 class JDProductItem(scrapy.Item):
     id = scrapy.Field()
+    type = scrapy.Field()
     name = scrapy.Field()
     price = scrapy.Field()
     url = scrapy.Field()
diff --git a/chapter1/crawler/taobao/pipelines.py b/chapter1/crawler/taobao/pipelines.py
index e8cf800..38069b9 100644
--- a/chapter1/crawler/taobao/pipelines.py
+++ b/chapter1/crawler/taobao/pipelines.py
@@ -87,6 +87,7 @@ class JDHbasePipeline(object):
         if isinstance(item, JDProductItem):
             self.product_table.put(item["id"],
                                    {"product:name": item["name"],
+                                    "product:type": item["type"],
                                     "product:price": item["price"],
                                     "product:url": item["url"],
                                     "product:img": item["img"],
diff --git a/chapter1/crawler/taobao/spiders/jd.py b/chapter1/crawler/taobao/spiders/jd.py
index 3f12d53..65d4fb9 100644
--- a/chapter1/crawler/taobao/spiders/jd.py
+++ b/chapter1/crawler/taobao/spiders/jd.py
@@ -114,6 +114,7 @@ class JdSpider(scrapy.Spider):
             print(shop)
 
             yield JDProductItem(
+                type=0,
                 name=name,
                 price=price,
                 url=url,
diff --git a/chapter1/crawler/taobao/spiders/jd1.py b/chapter1/crawler/taobao/spiders/jd1.py
new file mode 100644
index 0000000..605159b
--- /dev/null
+++ b/chapter1/crawler/taobao/spiders/jd1.py
@@ -0,0 +1,191 @@
+# coding=utf-8
+import json
+import time
+
+import scrapy
+from scrapy import Request
+from scrapy.conf import settings
+from scrapy_splash import SplashRequest
+import pdb
+from taobao.items import JDProductItem, JDCommentItem, JDCommentSummary
+import re
+
+from taobao.utils import now_time
+
+
+class Jd1Spider(scrapy.Spider):
+    """Crawl a JD.com category listing and the comments of each product.
+
+    Listing pages are requested through Splash and handled by ``parse``,
+    which yields one ``JDProductItem`` (``type=1``) per product and schedules
+    that product's comment feed.  ``parse_comment`` walks the comment pages,
+    yielding a ``JDCommentItem`` per comment and, for page 0 only, a
+    ``JDCommentSummary``.
+    """
+
+    name = 'jd1'
+    allowed_domains = ['jd.com']
+    start_urls = [
+        "https://list.jd.com/list.html?cat=670,671,672"
+    ]
+
+    custom_settings = {
+        'LOG_LEVEL': "WARN",
+        'ITEM_PIPELINES': {
+            'taobao.pipelines.JDCleanDataPipeline': 300,
+            'taobao.pipelines.JDHbasePipeline': 400,
+        }
+    }
+
+    # JSONP wrapper around the comment payload.  DOTALL lets the captured
+    # payload span several lines.
+    _JSONP_RE = re.compile(r'^fetchJSON_comment98vv33164\((.+)\);$', re.S)
+    # Numeric id taken from a ".../<id>.html" product link.
+    _PRODUCT_ID_RE = re.compile(r'.+\/(\d+)\.html')
+    # Page number carried by the "next page" link.
+    _PAGE_RE = re.compile(r'.+page=(\d+).+')
+
+    def make_comment_url(self, product_id, page):
+        """Return the comment-feed URL for ``product_id`` at ``page``.
+
+        :param product_id: value inserted as the ``productId`` query parameter.
+        :param page: page index; formatted with ``%d``, so it must be a number.
+        :return: the URL as a string.
+        """
+        comment_url = "https://club.jd.com/comment/productPageComments.action?callback=fetchJSON" \
+                      "_comment98vv33164&productId=%s&score=0&sortType=5&page=%d&pageSize=10" \
+                      "&isShadowSku=0&rid=0&fold=1" % (product_id, page)
+        return comment_url
+
+    def start_requests(self):
+        """Request every URL in ``start_urls`` through Splash, parsed by ``parse``."""
+        for url in self.start_urls:
+            yield SplashRequest(url,
+                                self.parse,
+                                args={})
+
+    def parse_comment(self, response):
+        """Parse one page of a product's comment feed.
+
+        ``product_id`` and ``page`` are read from ``response.meta``.  Yields a
+        ``JDCommentItem`` per comment, a ``JDCommentSummary`` when ``page`` is
+        0, and a request for the following page unless its number exceeds the
+        ``COMMENT_MAX_PAGE`` setting.  Yields nothing when the body is not the
+        expected JSONP payload.
+        """
+        product_id = response.meta["product_id"]
+        page = response.meta["page"]
+        print("抓取评论, %s, 当前 %d页" % (product_id, page))
+        m = self._JSONP_RE.match(response.text)
+        if not m:
+            # The body is not wrapped in the expected callback, so there is
+            # nothing to parse: stop paging for this product.
+            print("评论抓取完成, %s, 页数: %d" % (product_id, page))
+            return
+
+        j = json.loads(m.group(1))
+        # A missing or null "comments" entry is treated as an empty page.
+        for comment in j.get('comments') or []:
+            c_content = comment['content']
+            c_time = comment['referenceTime']
+            c_name = comment['nickname']
+            c_client = comment['userClientShow']
+            # Prefix with the product id to form the comment item's id.
+            c_id = "%s_%s" % (product_id, comment['id'])
+            print('{}, {} {} {}\n{}\n'.format(c_id, c_name, c_time, c_client, c_content))
+            yield JDCommentItem(
+                id=c_id,
+                user_name=c_name,
+                from_type=c_client,
+                content=c_content,
+                comment_time=c_time,
+                created_at=now_time()
+            )
+
+        if page == 0:
+            # Emitted only while handling page 0.
+            summary = j['productCommentSummary']
+            yield JDCommentSummary(
+                id=product_id,
+                good_count=summary['goodCount'],
+                general_count=summary['generalCount'],
+                poor_count=summary['poorCount'],
+                comment_count=summary['commentCount'],
+                default_good_count=summary['defaultGoodCount'],
+                good_rate=summary['goodRate']
+            )
+
+        page = page + 1
+        # NOTE(review): COMMENT_MAX_PAGE is not defined in this file;
+        # presumably an integer in the project settings - confirm.
+        if page > settings['COMMENT_MAX_PAGE']:
+            print("评论抓取达到最大深度, %s, 页数: %d" % (product_id, page))
+        else:
+            yield Request(self.make_comment_url(product_id, page),
+                          self.parse_comment,
+                          meta={'product_id': product_id, 'page': page})
+
+    def parse(self, response):
+        """Parse a listing page: yield products, comment requests and paging.
+
+        Listing entries without a product link are skipped, because the
+        product URL and the product id are both derived from that link.
+        """
+        items = response.xpath('//div[@id="plist"]//li[@class="gl-item"]')
+
+        for item in items:
+            name = item.xpath('.//div[contains(@class, "p-name")]/a/em/text()').extract_first()
+            print(name)
+            price = item.xpath(
+                './/div[contains(@class, "p-price")]/strong/i/text()').extract_first()
+            print(price)
+            url = item.xpath('.//div[contains(@class, "p-name")]/a/@href').extract_first()
+            if not url:
+                # extract_first() returns None when nothing matches; the
+                # string concatenation below would then raise TypeError and
+                # abort the rest of the page, pagination included.
+                continue
+            print('https:' + url)
+            # Prefer the data-lazy-img attribute and fall back to src.
+            img = item.xpath('.//div[@class="p-img"]/a/img/@data-lazy-img').extract_first()
+            if not img:
+                img = item.xpath('.//div[@class="p-img"]/a/img/@src').extract_first()
+
+            comment_num = item.xpath('.//div[@class="p-commit"]/strong/a/text()').extract_first()
+            print(comment_num)
+            shop = item.xpath('.//div[@class="p-shop"]/span/a/@title').extract_first()
+            print(shop)
+
+            yield JDProductItem(
+                type=1,
+                name=name,
+                price=price,
+                url=url,
+                img=img,
+                comment_num=comment_num,
+                shop=shop,
+                created_at=now_time()
+            )
+
+            m = self._PRODUCT_ID_RE.match(url)
+            if m:
+                product_id = m.group(1)
+                comment_url = self.make_comment_url(product_id, 0)
+                yield Request(comment_url, self.parse_comment,
+                              meta={'product_id': product_id, 'page': 0},
+                              priority=100)
+
+        # Follow the "next page" link when the listing has one.
+        next_page = response.xpath('//a[@class="pn-next"]/@href').extract_first()
+        if next_page:
+            page = 1
+            m = self._PAGE_RE.match(next_page)
+            if m:
+                page = m.group(1)
+            print("开始抓取:{0}页".format(page))
+
+            yield SplashRequest(
+                url='https://list.jd.com' + next_page
+            )
+        else:
+            print("抓取结束")