This commit is contained in:
parent
f8a769ed49
commit
da57a3a604
|
@ -40,6 +40,7 @@ class CommentItem(scrapy.Item):
|
|||
|
||||
class JDProductItem(scrapy.Item):
|
||||
id = scrapy.Field()
|
||||
type = scrapy.Field()
|
||||
name = scrapy.Field()
|
||||
price = scrapy.Field()
|
||||
url = scrapy.Field()
|
||||
|
|
|
@ -87,6 +87,7 @@ class JDHbasePipeline(object):
|
|||
if isinstance(item, JDProductItem):
|
||||
self.product_table.put(item["id"],
|
||||
{"product:name": item["name"],
|
||||
"product:type": item["type"],
|
||||
"product:price": item["price"],
|
||||
"product:url": item["url"],
|
||||
"product:img": item["img"],
|
||||
|
|
|
@ -114,6 +114,7 @@ class JdSpider(scrapy.Spider):
|
|||
print(shop)
|
||||
|
||||
yield JDProductItem(
|
||||
type=0,
|
||||
name=name,
|
||||
price=price,
|
||||
url=url,
|
||||
|
|
|
@ -0,0 +1,150 @@
|
|||
# coding=utf-8
|
||||
import json
|
||||
import time
|
||||
|
||||
import scrapy
|
||||
from scrapy import Request
|
||||
from scrapy.conf import settings
|
||||
from scrapy_splash import SplashRequest
|
||||
import pdb
|
||||
from taobao.items import JDProductItem, JDCommentItem, JDCommentSummary
|
||||
import re
|
||||
|
||||
from taobao.utils import now_time
|
||||
|
||||
|
||||
class Jd1Spider(scrapy.Spider):
    """Crawl JD.com product listings (cat=670,671,672) and their comments.

    Listing pages are JavaScript-rendered, so they go through Splash
    (SplashRequest); the comment endpoint is a plain JSONP URL fetched
    with ordinary Requests. Yields JDProductItem, JDCommentItem and
    JDCommentSummary, which flow through the JD pipelines below.
    """

    name = 'jd1'
    allowed_domains = ['jd.com']
    start_urls = [
        "https://list.jd.com/list.html?cat=670,671,672"
    ]

    # Per-spider settings: quiet logging plus the JD-specific pipelines.
    custom_settings = {
        'LOG_LEVEL': "WARN",
        'ITEM_PIPELINES': {
            'taobao.pipelines.JDCleanDataPipeline': 300,
            'taobao.pipelines.JDHbasePipeline': 400,
        }
    }

    def make_comment_url(self, product_id, page):
        """Return the JSONP comment-page URL for *product_id* / *page* (0-based)."""
        comment_url = "https://club.jd.com/comment/productPageComments.action?callback=fetchJSON" \
                      "_comment98vv33164&productId=%s&score=0&sortType=5&page=%d&pageSize=10" \
                      "&isShadowSku=0&rid=0&fold=1" % (product_id, page)
        return comment_url

    def start_requests(self):
        """Kick off one Splash-rendered request per start URL."""
        for url in self.start_urls:
            yield SplashRequest(url,
                                self.parse,
                                args={})

    def parse_comment(self, response):
        """Parse one JSONP comment page for a product.

        Yields a JDCommentItem per comment; on page 0 additionally yields a
        JDCommentSummary built from the aggregate counters. Follows the next
        comment page until the JSONP wrapper stops matching (no more pages)
        or the COMMENT_MAX_PAGE setting is reached.
        """
        # pdb.set_trace()
        product_id = response.meta["product_id"]
        page = response.meta["page"]
        print("抓取评论, %s, 当前 %d页" % (product_id, page))
        # Strip the fetchJSON_comment98vv33164(...) JSONP wrapper around the JSON body.
        m = re.match(r'^fetchJSON_comment98vv33164\((.+)\);$', response.text)
        if m:
            j = json.loads(m.group(1))
            for comment in j['comments']:
                c_content = comment['content']
                c_time = comment['referenceTime']
                c_name = comment['nickname']
                c_client = comment['userClientShow']
                # Comment ids are only unique per product, so prefix with the product id.
                c_id = "%s_%s" % (product_id, comment['id'])
                print('{}, {} {} {}\n{}\n'.format(c_id, c_name, c_time, c_client, c_content))
                yield JDCommentItem(
                    id=c_id,
                    user_name=c_name,
                    from_type=c_client,
                    content=c_content,
                    comment_time=c_time,
                    created_at=now_time()
                )
            if page == 0:
                # The aggregate counters are identical on every page; store them once.
                productCommentSummary = j['productCommentSummary']
                yield JDCommentSummary(
                    id=product_id,
                    good_count=productCommentSummary['goodCount'],
                    general_count=productCommentSummary['generalCount'],
                    poor_count=productCommentSummary['poorCount'],
                    comment_count=productCommentSummary['commentCount'],
                    default_good_count=productCommentSummary['defaultGoodCount'],
                    good_rate=productCommentSummary['goodRate']
                )

            page = page + 1

            next_comment_url = self.make_comment_url(product_id, page)

            # time.sleep(10)

            # FIX: read the crawl-depth limit from the spider's own crawler
            # settings instead of the deprecated/removed scrapy.conf.settings.
            if page > self.settings['COMMENT_MAX_PAGE']:
                print("评论抓取达到最大深度, %s, 页数: %d" % (product_id, page))
            else:
                yield Request(next_comment_url, self.parse_comment,
                              meta={'product_id': product_id, 'page': page})
        else:
            print("评论抓取完成, %s, 页数: %d" % (product_id, page))

    def parse(self, response):
        """Parse a Splash-rendered listing page.

        Yields one JDProductItem per product, schedules the first comment
        page for each product (high priority), and follows pagination with
        another SplashRequest.
        """
        items = response.xpath('//div[@id="plist"]//li[@class="gl-item"]')

        for item in items:
            # pdb.set_trace()
            name = item.xpath('.//div[contains(@class, "p-name")]/a/em/text()').extract_first()
            print(name)
            price = item.xpath(
                './/div[contains(@class, "p-price")]/strong/i/text()').extract_first()
            print(price)
            url = item.xpath('.//div[contains(@class, "p-name")]/a/@href').extract_first()
            # FIX: a missing href previously raised TypeError ('https:' + None),
            # aborting the whole callback and losing every later item on the
            # page; skip just the broken <li> instead.
            if not url:
                continue
            print('https:' + url)
            img = item.xpath('.//div[@class="p-img"]/a/img/@data-lazy-img').extract_first()
            if not img:
                # Below-the-fold images are lazy-loaded; fall back to plain src.
                img = item.xpath('.//div[@class="p-img"]/a/img/@src').extract_first()

            comment_num = item.xpath('.//div[@class="p-commit"]/strong/a/text()').extract_first()
            print(comment_num)
            shop = item.xpath('.//div[@class="p-shop"]/span/a/@title').extract_first()
            print(shop)

            yield JDProductItem(
                type=1,
                name=name,
                price=price,
                url=url,
                img=img,
                comment_num=comment_num,
                shop=shop,
                created_at=now_time()
            )

            # The numeric product id is embedded in the detail-page URL
            # (e.g. //item.jd.com/123456.html).
            m = re.match(r'.+\/(\d+)\.html', url)
            if m:
                product_id = m.group(1)
                comment_url = self.make_comment_url(product_id, 0)
                yield Request(comment_url, self.parse_comment,
                              meta={'product_id': product_id, 'page': 0},
                              priority=100)

        # Fetch the next listing page, if any.
        next_page = response.xpath('//a[@class="pn-next"]/@href').extract_first()
        if next_page:
            page = 1
            m = re.match(r'.+page=(\d+).+', next_page)
            if m:
                # FIX: keep *page* an int — m.group(1) is a str, which silently
                # changed the variable's type from the int default above.
                page = int(m.group(1))
            print("开始抓取:{0}页".format(page))

            # time.sleep(10)

            yield SplashRequest(
                url='https://list.jd.com' + next_page
            )
        else:
            print("抓取结束")
|
Loading…
Reference in New Issue