This commit is contained in:
parent
f8a769ed49
commit
da57a3a604
|
@ -40,6 +40,7 @@ class CommentItem(scrapy.Item):
|
||||||
|
|
||||||
class JDProductItem(scrapy.Item):
|
class JDProductItem(scrapy.Item):
|
||||||
id = scrapy.Field()
|
id = scrapy.Field()
|
||||||
|
type = scrapy.Field()
|
||||||
name = scrapy.Field()
|
name = scrapy.Field()
|
||||||
price = scrapy.Field()
|
price = scrapy.Field()
|
||||||
url = scrapy.Field()
|
url = scrapy.Field()
|
||||||
|
|
|
@ -87,6 +87,7 @@ class JDHbasePipeline(object):
|
||||||
if isinstance(item, JDProductItem):
|
if isinstance(item, JDProductItem):
|
||||||
self.product_table.put(item["id"],
|
self.product_table.put(item["id"],
|
||||||
{"product:name": item["name"],
|
{"product:name": item["name"],
|
||||||
|
"product:type": item["type"],
|
||||||
"product:price": item["price"],
|
"product:price": item["price"],
|
||||||
"product:url": item["url"],
|
"product:url": item["url"],
|
||||||
"product:img": item["img"],
|
"product:img": item["img"],
|
||||||
|
|
|
@ -114,6 +114,7 @@ class JdSpider(scrapy.Spider):
|
||||||
print(shop)
|
print(shop)
|
||||||
|
|
||||||
yield JDProductItem(
|
yield JDProductItem(
|
||||||
|
type=0,
|
||||||
name=name,
|
name=name,
|
||||||
price=price,
|
price=price,
|
||||||
url=url,
|
url=url,
|
||||||
|
|
|
@ -0,0 +1,150 @@
|
||||||
|
# coding=utf-8
|
||||||
|
import json
|
||||||
|
import time
|
||||||
|
|
||||||
|
import scrapy
|
||||||
|
from scrapy import Request
|
||||||
|
from scrapy.conf import settings
|
||||||
|
from scrapy_splash import SplashRequest
|
||||||
|
import pdb
|
||||||
|
from taobao.items import JDProductItem, JDCommentItem, JDCommentSummary
|
||||||
|
import re
|
||||||
|
|
||||||
|
from taobao.utils import now_time
|
||||||
|
|
||||||
|
|
||||||
|
class Jd1Spider(scrapy.Spider):
    """Crawl a JD.com category listing via Splash, yielding product items and
    following each product's paginated JSONP comment feed.

    Pipeline: `parse` scrapes the rendered list page (name/price/url/img/
    comment count/shop), yields a ``JDProductItem`` per product, then schedules
    `parse_comment` for the product's comment API and `parse` again for the
    next listing page.
    """

    name = 'jd1'
    allowed_domains = ['jd.com']
    start_urls = [
        "https://list.jd.com/list.html?cat=670,671,672"
    ]

    # JSONP callback token used by JD's comment endpoint. It appears both in
    # the request URL and in the response wrapper, so it is defined once here
    # instead of being duplicated as a magic string in make_comment_url and
    # the parse_comment regex (they previously had to be kept in sync by hand).
    COMMENT_CALLBACK = "fetchJSON_comment98vv33164"

    custom_settings = {
        'LOG_LEVEL': "WARN",
        'ITEM_PIPELINES': {
            'taobao.pipelines.JDCleanDataPipeline': 300,
            'taobao.pipelines.JDHbasePipeline': 400,
        }
    }

    def make_comment_url(self, product_id, page):
        """Build the comment-API URL for *product_id* at 0-based *page*.

        The endpoint returns JSONP wrapped in ``COMMENT_CALLBACK(...)``.
        """
        comment_url = "https://club.jd.com/comment/productPageComments.action?callback=%s" \
                      "&productId=%s&score=0&sortType=5&page=%d&pageSize=10" \
                      "&isShadowSku=0&rid=0&fold=1" % (self.COMMENT_CALLBACK, product_id, page)
        return comment_url

    def start_requests(self):
        """Kick off crawling through Splash so the listing JS is rendered."""
        for url in self.start_urls:
            yield SplashRequest(url,
                                self.parse,
                                args={})

    def parse_comment(self, response):
        """Parse one JSONP page of product comments.

        Yields one ``JDCommentItem`` per comment; on the first page also
        yields a ``JDCommentSummary`` with the aggregate counts. Recurses to
        the next page until the response no longer matches the JSONP wrapper
        or ``COMMENT_MAX_PAGE`` (from project settings) is exceeded.
        """
        # pdb.set_trace()
        product_id = response.meta["product_id"]
        page = response.meta["page"]
        print("抓取评论, %s, 当前 %d页" % (product_id, page))
        # Unwrap the JSONP envelope: COMMENT_CALLBACK(<json>);
        m = re.match(r'^%s\((.+)\);$' % re.escape(self.COMMENT_CALLBACK), response.text)
        if m:
            t = m.group(1)
            j = json.loads(t)
            commentSummary = j['comments']
            for comment in commentSummary:
                c_content = comment['content']
                c_time = comment['referenceTime']
                c_name = comment['nickname']
                c_client = comment['userClientShow']
                # Comment ids are only unique per product, so namespace them.
                c_id = "%s_%s" % (product_id, comment['id'])
                print('{}, {} {} {}\n{}\n'.format(c_id, c_name, c_time, c_client, c_content))
                yield JDCommentItem(
                    id=c_id,
                    user_name=c_name,
                    from_type=c_client,
                    content=c_content,
                    comment_time=c_time,
                    created_at=now_time()
                )
            if page == 0:
                # Aggregate stats are identical on every page; emit them once.
                productCommentSummary = j['productCommentSummary']
                yield JDCommentSummary(
                    id=product_id,
                    good_count=productCommentSummary['goodCount'],
                    general_count=productCommentSummary['generalCount'],
                    poor_count=productCommentSummary['poorCount'],
                    comment_count=productCommentSummary['commentCount'],
                    default_good_count=productCommentSummary['defaultGoodCount'],
                    good_rate=productCommentSummary['goodRate']
                )

            page = page + 1

            next_comment_url = self.make_comment_url(product_id, page)

            # time.sleep(10)

            # NOTE(review): assumes COMMENT_MAX_PAGE is defined in the
            # project settings — confirm, a missing key would raise here.
            if page > settings['COMMENT_MAX_PAGE']:
                print("评论抓取达到最大深度, %s, 页数: %d" % (product_id, page))
            else:
                yield Request(next_comment_url, self.parse_comment,
                              meta={'product_id': product_id, 'page': page})
        else:
            # No JSONP wrapper => past the last page (or blocked): stop here.
            print("评论抓取完成, %s, 页数: %d" % (product_id, page))

    def parse(self, response):
        """Parse one Splash-rendered listing page.

        Yields a ``JDProductItem`` per listed product, a comment Request per
        product whose id can be derived from its URL, and a SplashRequest for
        the next listing page when one exists.
        """
        items = response.xpath('//div[@id="plist"]//li[@class="gl-item"]')

        for item in items:
            # pdb.set_trace()
            name = item.xpath('.//div[contains(@class, "p-name")]/a/em/text()').extract_first()
            print(name)
            price = item.xpath(
                './/div[contains(@class, "p-price")]/strong/i/text()').extract_first()
            print(price)
            url = item.xpath('.//div[contains(@class, "p-name")]/a/@href').extract_first()
            # extract_first() returns None when the node is missing; the
            # original code did 'https:' + url unconditionally and crashed the
            # whole page parse with TypeError on such products.
            if url:
                print('https:' + url)
            # Lazy-loaded images keep the real source in @data-lazy-img.
            img = item.xpath('.//div[@class="p-img"]/a/img/@data-lazy-img').extract_first()
            if not img:
                img = item.xpath('.//div[@class="p-img"]/a/img/@src').extract_first()

            comment_num = item.xpath('.//div[@class="p-commit"]/strong/a/text()').extract_first()
            print(comment_num)
            shop = item.xpath('.//div[@class="p-shop"]/span/a/@title').extract_first()
            print(shop)

            yield JDProductItem(
                type=1,
                name=name,
                price=price,
                url=url,
                img=img,
                comment_num=comment_num,
                shop=shop,
                created_at=now_time()
            )

            # Product URLs look like …/<product_id>.html; the id keys the
            # comment API. Guard against url being None (see above).
            m = re.match(r'.+\/(\d+)\.html', url) if url else None
            if m:
                product_id = m.group(1)
                comment_url = self.make_comment_url(product_id, 0)
                # Higher priority so comments drain before more list pages.
                yield Request(comment_url, self.parse_comment,
                              meta={'product_id': product_id, 'page': 0},
                              priority=100)

        # 获取下一页 (fetch the next listing page)
        next_page = response.xpath('//a[@class="pn-next"]/@href').extract_first()
        if next_page:
            page = 1
            m = re.match(r'.+page=(\d+).+', next_page)
            if m:
                page = m.group(1)
            print("开始抓取:{0}页".format(page))

            # time.sleep(10)

            yield SplashRequest(
                url='https://list.jd.com' + next_page
            )
        else:
            print("抓取结束")
|
Loading…
Reference in New Issue