This commit is contained in:
guange 2019-01-21 10:17:09 +08:00
parent 2fcd8ba6e0
commit 5639278337
10 changed files with 136 additions and 45 deletions

1
.gitignore vendored
View File

@ -7,3 +7,4 @@ chapter1/crawler/datas/comments1/
chapter1/crawler/datas/products/ chapter1/crawler/datas/products/
chapter1/crawler/taobao/settings.py chapter1/crawler/taobao/settings.py
chapter2/mysite/mysite/settings.py chapter2/mysite/mysite/settings.py
.env

3
.vscode/settings.json vendored Normal file
View File

@ -0,0 +1,3 @@
{
"python.pythonPath": ".env/bin/python"
}

View File

@ -1,31 +1,44 @@
import re,requests,json import re
import requests
import json
s = requests.session() s = requests.session()
url = 'https://club.jd.com/comment/productPageComments.action' url = "https://club.jd.com/comment/productPageComments.action"
data = { data = {
'callback':'fetchJSON_comment98vv61', 'callback': 'fetchJSON_comment98vv61',
'productId':'3888284', 'productId': '3888284',
'score':0, 'score': 0,
'sortType':5, 'sortType': 5,
'pageSize':10, 'pageSize': 10,
'isShadowSku':0, 'isShadowSku': 0,
'page':0 'page': 0
} }
while True:
t = s.get(url,params = data).text
try:
t = re.search(r'(?<=fetchJSON_comment98vv61\().*(?=\);)',t).group(0)
except Exception as e:
break
j = json.loads(t) def main():
commentSummary = j['comments'] while True:
for comment in commentSummary: t = s.get(url, params=data).text
c_content = comment['content'] try:
c_time = comment['referenceTime'] t = re.search(
c_name = comment['nickname'] r'(?<=fetchjson_comment98vv61\().*(?=\);)', t).group(0)
c_client = comment['userClientShow'] except Exception as e:
print('{} {} {}\n{}\n'.format(c_name,c_time,c_client,c_content)) print(e)
data['page'] += 1 j = json.loads(t)
commentsummary = j["comments"]
for comment in commentsummary:
c_content = comment["content"]
c_time = comment['referencetime']
c_name = comment['nickname']
c_client = comment['userclientshow']
print('{} {} {}\n{}\n'.format(c_name, c_time, c_client, c_content))
data['page'] += 1
if __name__ == "__main__":
    import time
    import datetime

    # Scratch check of timedelta arithmetic: show a one-day delta and the
    # local timestamp exactly one day before now. main() is not invoked here.
    one_day = datetime.timedelta(days=1)
    print(one_day)
    one_day_ago = datetime.datetime.now() - one_day
    print(one_day_ago)

View File

@ -22,7 +22,7 @@ class ProductItem(scrapy.Item):
author = scrapy.Field() author = scrapy.Field()
shop = scrapy.Field() shop = scrapy.Field()
pubtime = scrapy.Field() pubtime = scrapy.Field()
publisher = scrapy.Field() publisher = scrapy.Field()
url = scrapy.Field() url = scrapy.Field()
detail = scrapy.Field() detail = scrapy.Field()
comment_num = scrapy.Field() comment_num = scrapy.Field()
@ -67,4 +67,3 @@ class JDCommentSummary(scrapy.Item):
comment_count = scrapy.Field() comment_count = scrapy.Field()
default_good_count = scrapy.Field() default_good_count = scrapy.Field()
good_rate = scrapy.Field() good_rate = scrapy.Field()

View File

@ -55,7 +55,8 @@ class Jd1Spider(scrapy.Spider):
c_name = comment['nickname'] c_name = comment['nickname']
c_client = comment['userClientShow'] c_client = comment['userClientShow']
c_id = "%s_%s" % (product_id, comment['id']) c_id = "%s_%s" % (product_id, comment['id'])
print('{}, {} {} {}\n{}\n'.format(c_id, c_name, c_time, c_client, c_content)) print('{}, {} {} {}\n{}\n'.format(
c_id, c_name, c_time, c_client, c_content))
yield JDCommentItem( yield JDCommentItem(
id=c_id, id=c_id,
user_name=c_name, user_name=c_name,
@ -67,27 +68,26 @@ class Jd1Spider(scrapy.Spider):
if page == 0: if page == 0:
productCommentSummary = j['productCommentSummary'] productCommentSummary = j['productCommentSummary']
yield JDCommentSummary( yield JDCommentSummary(
id = product_id, id=product_id,
good_count = productCommentSummary['goodCount'], good_count=productCommentSummary['goodCount'],
general_count= productCommentSummary['generalCount'], general_count=productCommentSummary['generalCount'],
poor_count= productCommentSummary['poorCount'], poor_count=productCommentSummary['poorCount'],
comment_count= productCommentSummary['commentCount'], comment_count=productCommentSummary['commentCount'],
default_good_count= productCommentSummary['defaultGoodCount'], default_good_count=productCommentSummary['defaultGoodCount'],
good_rate= productCommentSummary['goodRate'] good_rate=productCommentSummary['goodRate']
) )
page = page + 1 page = page + 1
next_comment_url = self.make_comment_url(product_id, page) next_comment_url = self.make_comment_url(product_id, page)
# time.sleep(10) # time.sleep(10)
if page>settings['COMMENT_MAX_PAGE']: if page > settings['COMMENT_MAX_PAGE']:
print("评论抓取达到最大深度, %s, 页数: %d" % (product_id, page)) print("评论抓取达到最大深度, %s, 页数: %d" % (product_id, page))
else: else:
yield Request(next_comment_url, self.parse_comment, yield Request(next_comment_url, self.parse_comment,
meta={'product_id': product_id, 'page': page}) meta={'product_id': product_id, 'page': page})
else: else:
print("评论抓取完成, %s, 页数: %d" % (product_id, page)) print("评论抓取完成, %s, 页数: %d" % (product_id, page))
@ -96,20 +96,26 @@ class Jd1Spider(scrapy.Spider):
for item in items: for item in items:
# pdb.set_trace() # pdb.set_trace()
name = item.xpath('.//div[contains(@class, "p-name")]/a/em/text()').extract_first() name = item.xpath(
'.//div[contains(@class, "p-name")]/a/em/text()').extract_first()
print(name) print(name)
price = item.xpath( price = item.xpath(
'.//div[contains(@class, "p-price")]/strong/i/text()').extract_first() './/div[contains(@class, "p-price")]/strong/i/text()').extract_first()
print(price) print(price)
url = item.xpath('.//div[contains(@class, "p-name")]/a/@href').extract_first() url = item.xpath(
'.//div[contains(@class, "p-name")]/a/@href').extract_first()
print('https:' + url) print('https:' + url)
img = item.xpath('.//div[@class="p-img"]/a/img/@data-lazy-img').extract_first() img = item.xpath(
'.//div[@class="p-img"]/a/img/@data-lazy-img').extract_first()
if not img: if not img:
img = item.xpath('.//div[@class="p-img"]/a/img/@src').extract_first() img = item.xpath(
'.//div[@class="p-img"]/a/img/@src').extract_first()
comment_num = item.xpath('.//div[@class="p-commit"]/strong/a/text()').extract_first() comment_num = item.xpath(
'.//div[@class="p-commit"]/strong/a/text()').extract_first()
print(comment_num) print(comment_num)
shop = item.xpath('.//div[@class="p-shop"]/span/a/@title').extract_first() shop = item.xpath(
'.//div[@class="p-shop"]/span/a/@title').extract_first()
print(shop) print(shop)
yield JDProductItem( yield JDProductItem(
@ -132,7 +138,8 @@ class Jd1Spider(scrapy.Spider):
priority=100) priority=100)
# 获取下一页 # 获取下一页
next_page = response.xpath('//a[@class="pn-next"]/@href').extract_first() next_page = response.xpath(
'//a[@class="pn-next"]/@href').extract_first()
if next_page: if next_page:
page = 1 page = 1
m = re.match(r'.+page=(\d+).+', next_page) m = re.match(r'.+page=(\d+).+', next_page)

View File

@ -158,6 +158,7 @@ function init(){
] ]
}); });
if(document.getElementById('histogramChart')){
var histogramChart = echarts.init(document.getElementById('histogramChart')); var histogramChart = echarts.init(document.getElementById('histogramChart'));
histogramChart.setOption({ histogramChart.setOption({
@ -255,6 +256,8 @@ function init(){
] ]
}); });
}
var lineChart2 = echarts.init(document.getElementById('lineChart2')); var lineChart2 = echarts.init(document.getElementById('lineChart2'));
lineChart2.setOption({ lineChart2.setOption({

View File

@ -1042,3 +1042,44 @@ border-radius: 3px;
.popBox .ttBox{height: 30px; line-height: 30px; padding: 14px 30px; border-bottom: solid 1px #eef0f1;text-align: center;-webkit-box-sizing: content-box; -moz-box-sizing: content-box;box-sizing: content-box;} .popBox .ttBox{height: 30px; line-height: 30px; padding: 14px 30px; border-bottom: solid 1px #eef0f1;text-align: center;-webkit-box-sizing: content-box; -moz-box-sizing: content-box;box-sizing: content-box;}
.popBox .ttBox .tt{font-size: 20px; display: inline-block; height: 30px;} .popBox .ttBox .tt{font-size: 20px; display: inline-block; height: 30px;}
.popBox .txtBox{height: calc(100% - 80px);overflow: auto;padding: 10px 0;} .popBox .txtBox{height: calc(100% - 80px);overflow: auto;padding: 10px 0;}
/* Container of the percentage bars: 90px top and 40px right padding, 14px base text. */
.percentChart {
    padding: 90px 40px 0px 0px;
    font-size: 14px;
}
/* One row per entry: a flex line, 35px tall, with 25px spacing below it. */
.percentChart li {
    display: flex;
    clear: both;
    line-height: 35px;
    margin-bottom: 25px;
}
/* Row label: fixed 130px column, white text aligned to the right edge. */
.percentChart li span.leftTitle {
    display: block;
    width: 130px;
    text-align: right;
    color: #ffffff;
}
/* Bar track: fills the remaining row width (flex: 1) with a green background.
   position: relative makes it the anchor for the absolutely positioned span inside. */
.percentChart li .rightPercent {
    position: relative;
    flex: 1;
    height: 35px;
    margin-left: 10px;
    background-color: #32CD32;
}
/* Number pinned 5px from the right edge of the bar track. */
.percentChart li .rightPercent span {
    position: absolute;
    right: 5px;
    color: #fff;
    font-size: 12px;
    line-height: 35px;
}
/* Filled portion of the bar (orange) with its right-aligned value.
   width defaults to 50%; border-box keeps the 5px right padding inside that width. */
.percentChart li .rightPercent p {
    box-sizing: border-box;
    width: 50%;
    padding-right: 5px;
    background-color: #FF7F50;
    color: #ffffff;
    font-size: 12px;
    line-height: 35px;
    text-align: right;
}

View File

@ -139,7 +139,29 @@
</div> </div>
<div class="div_any_child"> <div class="div_any_child">
<div class="div_any_title"><img src="{% static "images/title_4.png" %}">商品数据采集(当日)</div> <div class="div_any_title"><img src="{% static "images/title_4.png" %}">商品数据采集(当日)</div>
<p id="lineChart2" class="p_chart"></p> <div class="percentChart">
<li>
<span class="leftTitle">京东Android</span>
<div class="rightPercent">
<span>120</span>
<p style="width:30%">40</p >
</div>
</li>
<li>
<span class="leftTitle">京东iPhone</span>
<div class="rightPercent">
<span>500</span>
<p style="width:60%">300</p >
</div>
</li>
<li>
<span class="leftTitle">微信购物</span>
<div class="rightPercent">
<span>300</span>
<p style="width:50%">150</p >
</div>
</li>
</div>
</div> </div>
</div> </div>
</div> </div>

View File

@ -89,6 +89,7 @@ def collect_crawl_info(spark):
def collect_news(spark): def collect_news(spark):
"""获取最新的20条采集"""
df = spark.sql("select * from jd_comment order by created_at desc limit 20") df = spark.sql("select * from jd_comment order by created_at desc limit 20")
for row in df.rdd.collect(): for row in df.rdd.collect():

View File

@ -33,6 +33,7 @@ jupyter-core==4.4.0
lxml==4.3.0 lxml==4.3.0
MarkupSafe==1.1.0 MarkupSafe==1.1.0
mistune==0.8.4 mistune==0.8.4
mysqlclient==1.3.14
nbconvert==5.4.0 nbconvert==5.4.0
nbformat==4.4.0 nbformat==4.4.0
notebook==5.7.4 notebook==5.7.4