This commit is contained in:
guange 2019-01-17 22:41:27 +08:00
parent f8a769ed49
commit da57a3a604
4 changed files with 153 additions and 0 deletions

View File

@@ -40,6 +40,7 @@ class CommentItem(scrapy.Item):
class JDProductItem(scrapy.Item):
id = scrapy.Field()
type = scrapy.Field()
name = scrapy.Field()
price = scrapy.Field()
url = scrapy.Field()

View File

@@ -87,6 +87,7 @@ class JDHbasePipeline(object):
if isinstance(item, JDProductItem):
self.product_table.put(item["id"],
{"product:name": item["name"],
"product:type": item["type"],
"product:price": item["price"],
"product:url": item["url"],
"product:img": item["img"],

View File

@@ -114,6 +114,7 @@ class JdSpider(scrapy.Spider):
print(shop)
yield JDProductItem(
type=0,
name=name,
price=price,
url=url,

View File

@@ -0,0 +1,150 @@
# coding=utf-8
import json
import time
import scrapy
from scrapy import Request
from scrapy_splash import SplashRequest
import pdb
from taobao.items import JDProductItem, JDCommentItem, JDCommentSummary
import re
from taobao.utils import now_time
class Jd1Spider(scrapy.Spider):
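    """Crawl JD.com category listings via Splash and follow each product's
    JSONP comment endpoint for reviews and rating summaries."""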
name = 'jd1'
allowed_domains = ['jd.com']
start_urls = [
"https://list.jd.com/list.html?cat=670,671,672"
]
custom_settings = {
'LOG_LEVEL': "WARN",
'ITEM_PIPELINES': {
'taobao.pipelines.JDCleanDataPipeline': 300,
'taobao.pipelines.JDHbasePipeline': 400,
}
}
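
    # JD serves comments from a JSONP endpoint; the callback name baked into
    # this URL (fetchJSON_comment98vv33164) must match the regex that unwraps
    # the response in parse_comment below.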
def make_comment_url(self, product_id, page):
comment_url = "https://club.jd.com/comment/productPageComments.action?callback=fetchJSON" \
"_comment98vv33164&productId=%s&score=0&sortType=5&page=%d&pageSize=10" \
"&isShadowSku=0&rid=0&fold=1" % (product_id, page)
return comment_url
def start_requests(self):
for url in self.start_urls:
yield SplashRequest(url,
self.parse,
args={})
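
    # The endpoint returns "fetchJSON_comment98vv33164({...});", so the JSON
    # payload has to be cut out of the JSONP wrapper before parsing.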
def parse_comment(self, response):
# pdb.set_trace()
product_id = response.meta["product_id"]
page = response.meta["page"]
print("抓取评论, %s, 当前 %d" % (product_id, page))
m = re.match(r'^fetchJSON_comment98vv33164\((.+)\);$', response.text)
if m:
t = m.group(1)
j = json.loads(t)
commentSummary = j['comments']
for comment in commentSummary:
c_content = comment['content']
c_time = comment['referenceTime']
c_name = comment['nickname']
c_client = comment['userClientShow']
c_id = "%s_%s" % (product_id, comment['id'])
print('{}, {} {} {}\n{}\n'.format(c_id, c_name, c_time, c_client, c_content))
yield JDCommentItem(
id=c_id,
user_name=c_name,
from_type=c_client,
content=c_content,
comment_time=c_time,
created_at=now_time()
)
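            # Only the first page carries the aggregate rating counters.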
if page == 0:
productCommentSummary = j['productCommentSummary']
yield JDCommentSummary(
                id=product_id,
                good_count=productCommentSummary['goodCount'],
                general_count=productCommentSummary['generalCount'],
                poor_count=productCommentSummary['poorCount'],
                comment_count=productCommentSummary['commentCount'],
                default_good_count=productCommentSummary['defaultGoodCount'],
                good_rate=productCommentSummary['goodRate']
)
page = page + 1
next_comment_url = self.make_comment_url(product_id, page)
# time.sleep(10)
            if page > self.settings.getint('COMMENT_MAX_PAGE'):
                print("Comment crawl hit max depth, %s, pages: %d" % (product_id, page))
else:
yield Request(next_comment_url, self.parse_comment,
meta={'product_id': product_id, 'page': page})
else:
print("评论抓取完成, %s, 页数: %d" % (product_id, page))
def parse(self, response):
items = response.xpath('//div[@id="plist"]//li[@class="gl-item"]')
for item in items:
# pdb.set_trace()
name = item.xpath('.//div[contains(@class, "p-name")]/a/em/text()').extract_first()
print(name)
price = item.xpath(
'.//div[contains(@class, "p-price")]/strong/i/text()').extract_first()
print(price)
url = item.xpath('.//div[contains(@class, "p-name")]/a/@href').extract_first()
print('https:' + url)
img = item.xpath('.//div[@class="p-img"]/a/img/@data-lazy-img').extract_first()
if not img:
img = item.xpath('.//div[@class="p-img"]/a/img/@src').extract_first()
comment_num = item.xpath('.//div[@class="p-commit"]/strong/a/text()').extract_first()
print(comment_num)
shop = item.xpath('.//div[@class="p-shop"]/span/a/@title').extract_first()
print(shop)
yield JDProductItem(
type=1,
name=name,
price=price,
url=url,
img=img,
comment_num=comment_num,
shop=shop,
created_at=now_time()
)
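            # The product id is the digit run before ".html" in the detail
            # URL; it keys the comment API.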
m = re.match(r'.+\/(\d+)\.html', url)
if m:
product_id = m.group(1)
comment_url = self.make_comment_url(product_id, 0)
yield Request(comment_url, self.parse_comment,
meta={'product_id': product_id, 'page': 0},
priority=100)
        # fetch the next listing page
next_page = response.xpath('//a[@class="pn-next"]/@href').extract_first()
if next_page:
page = 1
m = re.match(r'.+page=(\d+).+', next_page)
if m:
                page = int(m.group(1))
            print("Crawling page {0}".format(page))
# time.sleep(10)
            yield SplashRequest(
                url=response.urljoin(next_page)
            )
else:
print("抓取结束")