This commit is contained in:
guange 2019-01-25 16:09:19 +08:00
parent 41aa5ed3fe
commit 2b906606b9
3 changed files with 7 additions and 2 deletions

View File

@@ -14,6 +14,7 @@ class ZhihuItem(scrapy.Item):
pass
class ProductSpecItem(scrapy.Item):
pid = scrapy.Field()
cpu = scrapy.Field()
rom = scrapy.Field()
ram = scrapy.Field()

View File

@@ -130,7 +130,7 @@ class JDHbasePipeline(object):
"comment:good_rate": str(item["good_rate"])
})
elif isinstance(item, ProductSpecItem):
self.product_table.put(item["id"],
self.product_table.put(item["pid"],
{
"spec:cpu": str(item["cpu"]),
"spec:rom": str(item['rom']),

View File

@@ -51,6 +51,7 @@ class JdSpider(scrapy.Spider):
def parse_spec(self, response):
#spec
product_id = response.meta["product_id"]
cpu = response.xpath('//dt[text()="CPU型号"]/following-sibling::dd/text()').extract_first()
rom = response.xpath('//dt[text()="ROM"]/following-sibling::dd[2]/text()').extract_first()
ram = response.xpath('//dt[text()="RAM"]/following-sibling::dd[2]/text()').extract_first()
@@ -62,6 +63,7 @@ class JdSpider(scrapy.Spider):
print(cpu, rom, ram, resolution, charge, weight, brand)
yield ProductSpecItem(
pid = product_id,
cpu = cpu,
rom = rom,
ram = ram,
@@ -164,7 +166,9 @@ class JdSpider(scrapy.Spider):
# priority=100)
if url:
yield SplashRequest(url='https:'+url, callback=self.parse_spec)
yield SplashRequest(url='https:'+url,
callback=self.parse_spec,
meta={'product_id': product_id})
# 获取下一页
next_page = response.xpath('//a[@class="pn-next"]/@href').extract_first()