From 2b906606b9568ac637c0c077d9dac0ff47d7f71b Mon Sep 17 00:00:00 2001 From: guange <8863824@gmail.com> Date: Fri, 25 Jan 2019 16:09:19 +0800 Subject: [PATCH] 1 --- chapter1/crawler/taobao/items.py | 1 + chapter1/crawler/taobao/pipelines.py | 2 +- chapter1/crawler/taobao/spiders/jd.py | 6 +++++- 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/chapter1/crawler/taobao/items.py b/chapter1/crawler/taobao/items.py index 5d4a5df..6a2c096 100644 --- a/chapter1/crawler/taobao/items.py +++ b/chapter1/crawler/taobao/items.py @@ -14,6 +14,7 @@ class ZhihuItem(scrapy.Item): pass class ProductSpecItem(scrapy.Item): + pid = scrapy.Field() cpu = scrapy.Field() rom = scrapy.Field() ram = scrapy.Field() diff --git a/chapter1/crawler/taobao/pipelines.py b/chapter1/crawler/taobao/pipelines.py index 2c49775..e626615 100644 --- a/chapter1/crawler/taobao/pipelines.py +++ b/chapter1/crawler/taobao/pipelines.py @@ -130,7 +130,7 @@ class JDHbasePipeline(object): "comment:good_rate": str(item["good_rate"]) }) elif isinstance(item, ProductSpecItem): - self.product_table.put(item["id"], + self.product_table.put(item["pid"], { "spec:cpu": str(item["cpu"]), "spec:rom": str(item['rom']), diff --git a/chapter1/crawler/taobao/spiders/jd.py b/chapter1/crawler/taobao/spiders/jd.py index 88d394e..c192846 100644 --- a/chapter1/crawler/taobao/spiders/jd.py +++ b/chapter1/crawler/taobao/spiders/jd.py @@ -51,6 +51,7 @@ class JdSpider(scrapy.Spider): def parse_spec(self, response): #spec + product_id = response.meta["product_id"] cpu = response.xpath('//dt[text()="CPU型号"]/following-sibling::dd/text()').extract_first() rom = response.xpath('//dt[text()="ROM"]/following-sibling::dd[2]/text()').extract_first() ram = response.xpath('//dt[text()="RAM"]/following-sibling::dd[2]/text()').extract_first() @@ -62,6 +63,7 @@ class JdSpider(scrapy.Spider): print(cpu, rom, ram, resolution, charge, weight, brand) yield ProductSpecItem( + pid = product_id, cpu = cpu, rom = rom, ram = ram, @@ -164,7 +166,9 @@ class JdSpider(scrapy.Spider): # priority=100) if url: - yield SplashRequest(url='https:'+url, callback=self.parse_spec) + yield SplashRequest(url='https:'+url, + callback=self.parse_spec, + meta={'product_id': product_id}) # 获取下一页 next_page = response.xpath('//a[@class="pn-next"]/@href').extract_first()