commit 2b906606b9
parent 41aa5ed3fe
Author: guange
Date:   2019-01-25 16:09:19 +08:00

3 changed files with 7 additions and 2 deletions

@@ -14,6 +14,7 @@ class ZhihuItem(scrapy.Item):
     pass
 
 class ProductSpecItem(scrapy.Item):
+    pid = scrapy.Field()
     cpu = scrapy.Field()
     rom = scrapy.Field()
     ram = scrapy.Field()
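
The new pid field is the row key the pipeline below writes under. Scrapy Item subclasses behave like dicts over their declared fields, which is exactly how the pipeline reads item["pid"]; a minimal sketch of that round trip (the field names come from this diff, the sample values are made up):

import scrapy

class ProductSpecItem(scrapy.Item):
    pid = scrapy.Field()  # product id, used as the HBase row key
    cpu = scrapy.Field()
    rom = scrapy.Field()
    ram = scrapy.Field()

item = ProductSpecItem(pid="100002795959", cpu="Snapdragon 845")  # hypothetical values
item["rom"] = "128GB"            # dict-style write
print(item["pid"])               # dict-style read, as the pipeline does
print(item.get("ram", "n/a"))    # an unset field raises KeyError without .get()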

@@ -130,7 +130,7 @@ class JDHbasePipeline(object):
                 "comment:good_rate": str(item["good_rate"])
             })
         elif isinstance(item, ProductSpecItem):
-            self.product_table.put(item["id"],
+            self.product_table.put(item["pid"],
             {
                 "spec:cpu": str(item["cpu"]),
                 "spec:rom": str(item['rom']),

@@ -51,6 +51,7 @@ class JdSpider(scrapy.Spider):
 
     def parse_spec(self, response):
         #spec
+        product_id = response.meta["product_id"]
         cpu = response.xpath('//dt[text()="CPU型号"]/following-sibling::dd/text()').extract_first()
         rom = response.xpath('//dt[text()="ROM"]/following-sibling::dd[2]/text()').extract_first()
         ram = response.xpath('//dt[text()="RAM"]/following-sibling::dd[2]/text()').extract_first()
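
The following-sibling axis does the real work in these selectors: the spec page lays each attribute out as a <dt> label followed by one or more <dd> values, and dd[2] suggests the ROM/RAM rows carry an extra first <dd> (a tooltip or icon) that must be skipped. A self-contained sketch against made-up markup of that shape, so the inferred structure is explicit:

from scrapy import Selector

# Illustrative HTML; the real JD markup is only inferred from the XPaths above
html = '''
<dl>
  <dt>CPU型号</dt><dd>Snapdragon 845</dd>
  <dt>ROM</dt><dd class="tip">?</dd><dd>128GB</dd>
</dl>
'''
sel = Selector(text=html)
cpu = sel.xpath('//dt[text()="CPU型号"]/following-sibling::dd/text()').extract_first()
rom = sel.xpath('//dt[text()="ROM"]/following-sibling::dd[2]/text()').extract_first()
print(cpu, rom)  # -> Snapdragon 845 128GB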
@@ -62,6 +63,7 @@ class JdSpider(scrapy.Spider):
 
         print(cpu, rom, ram, resolution, charge, weight, brand)
         yield ProductSpecItem(
+            pid = product_id,
             cpu = cpu,
             rom = rom,
             ram = ram,
@@ -164,7 +166,9 @@ class JdSpider(scrapy.Spider):
 
             # priority=100)
         if url:
-            yield SplashRequest(url='https:'+url, callback=self.parse_spec)
+            yield SplashRequest(url='https:'+url,
+                                callback=self.parse_spec,
+                                meta={'product_id': product_id})
 
         # fetch the next page
         next_page = response.xpath('//a[@class="pn-next"]/@href').extract_first()
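
The last hunk closes the loop: product_id rides along in the request's meta dict, and Scrapy hands the same dict back as response.meta, which is what the new line in parse_spec reads. SplashRequest from scrapy_splash forwards meta to the underlying Request, so the pattern is identical to a plain scrapy.Request; a condensed stand-in for the spider (the id extraction and selectors are placeholders, not JD's real structure):

import scrapy
from scrapy_splash import SplashRequest

class SpecSketchSpider(scrapy.Spider):  # hypothetical, condensed from JdSpider
    name = 'spec_sketch'

    def parse(self, response):
        product_id = response.url.rstrip('/').split('/')[-1]  # placeholder id source
        url = response.xpath('//a[@id="spec"]/@href').extract_first()  # placeholder selector
        if url:
            yield SplashRequest(url='https:' + url,
                                callback=self.parse_spec,
                                meta={'product_id': product_id})

    def parse_spec(self, response):
        # the dict passed as meta above comes back attached to the response
        product_id = response.meta['product_id']
        self.logger.info('spec page for product %s', product_id)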