commit 5639278337 (parent 2fcd8ba6e0)
@@ -7,3 +7,4 @@ chapter1/crawler/datas/comments1/
 chapter1/crawler/datas/products/
 chapter1/crawler/taobao/settings.py
 chapter2/mysite/mysite/settings.py
+.env

@@ -0,0 +1,3 @@
+{
+    "python.pythonPath": ".env/bin/python"
+}
@@ -1,31 +1,45 @@
-import re,requests,json
+import re
+import requests
+import json
 
 s = requests.session()
-url = 'https://club.jd.com/comment/productPageComments.action'
+url = "https://club.jd.com/comment/productPageComments.action"
 data = {
-    'callback':'fetchJSON_comment98vv61',
-    'productId':'3888284',
-    'score':0,
-    'sortType':5,
-    'pageSize':10,
-    'isShadowSku':0,
-    'page':0
+    'callback': 'fetchJSON_comment98vv61',
+    'productId': '3888284',
+    'score': 0,
+    'sortType': 5,
+    'pageSize': 10,
+    'isShadowSku': 0,
+    'page': 0
 }
 
-while True:
-    t = s.get(url,params = data).text
-    try:
-        t = re.search(r'(?<=fetchJSON_comment98vv61\().*(?=\);)',t).group(0)
-    except Exception as e:
-        break
+
+def main():
+    while True:
+        t = s.get(url, params=data).text
+        try:
+            t = re.search(
+                r'(?<=fetchJSON_comment98vv61\().*(?=\);)', t).group(0)
+        except Exception as e:
+            print(e)
+            break  # no JSONP wrapper means no more pages
 
-    j = json.loads(t)
-    commentSummary = j['comments']
-    for comment in commentSummary:
-        c_content = comment['content']
-        c_time = comment['referenceTime']
-        c_name = comment['nickname']
-        c_client = comment['userClientShow']
-        print('{} {} {}\n{}\n'.format(c_name,c_time,c_client,c_content))
+        j = json.loads(t)
+        commentSummary = j["comments"]
+        for comment in commentSummary:
+            c_content = comment["content"]
+            c_time = comment['referenceTime']
+            c_name = comment['nickname']
+            c_client = comment['userClientShow']
+            print('{} {} {}\n{}\n'.format(c_name, c_time, c_client, c_content))
 
-    data['page'] += 1
+        data['page'] += 1
+
+
+if __name__ == "__main__":
+    import time
+    import datetime
+    oldtime = datetime.timedelta(days=1)
+    print(oldtime)
+    print(datetime.datetime.now()-oldtime)
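Note: the refactored script stops paging only when the JSONP wrapper stops matching. An alternative is an explicit emptiness check that breaks as soon as a page returns no comments. A minimal sketch, assuming the same endpoint, callback, and response keys used above (fetch_comments and the max_pages guard are illustrative, not part of the commit):

import json
import re
import requests

def fetch_comments(product_id, max_pages=10):
    """Yield comment dicts until a page comes back empty (a sketch, not the committed script)."""
    s = requests.session()
    url = "https://club.jd.com/comment/productPageComments.action"
    params = {
        'callback': 'fetchJSON_comment98vv61',
        'productId': product_id,
        'score': 0,
        'sortType': 5,
        'pageSize': 10,
        'isShadowSku': 0,
        'page': 0,
    }
    for page in range(max_pages):
        params['page'] = page
        text = s.get(url, params=params).text
        m = re.search(r'(?<=fetchJSON_comment98vv61\().*(?=\);)', text)
        if not m:
            break  # response is no longer JSONP: treat as the end
        comments = json.loads(m.group(0)).get('comments', [])
        if not comments:
            break  # empty page: no more comments
        yield from comments

# usage: for c in fetch_comments('3888284'): print(c['content'])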
@@ -22,7 +22,7 @@ class ProductItem(scrapy.Item):
     author = scrapy.Field()
     shop = scrapy.Field()
     pubtime = scrapy.Field()
     publisher = scrapy.Field()
     url = scrapy.Field()
     detail = scrapy.Field()
     comment_num = scrapy.Field()
@@ -67,4 +67,3 @@ class JDCommentSummary(scrapy.Item):
     comment_count = scrapy.Field()
     default_good_count = scrapy.Field()
     good_rate = scrapy.Field()
-
@@ -55,7 +55,8 @@ class Jd1Spider(scrapy.Spider):
             c_name = comment['nickname']
             c_client = comment['userClientShow']
             c_id = "%s_%s" % (product_id, comment['id'])
-            print('{}, {} {} {}\n{}\n'.format(c_id, c_name, c_time, c_client, c_content))
+            print('{}, {} {} {}\n{}\n'.format(
+                c_id, c_name, c_time, c_client, c_content))
             yield JDCommentItem(
                 id=c_id,
                 user_name=c_name,
@@ -67,27 +68,26 @@ class Jd1Spider(scrapy.Spider):
             if page == 0:
                 productCommentSummary = j['productCommentSummary']
                 yield JDCommentSummary(
-                    id = product_id,
-                    good_count = productCommentSummary['goodCount'],
-                    general_count= productCommentSummary['generalCount'],
-                    poor_count= productCommentSummary['poorCount'],
-                    comment_count= productCommentSummary['commentCount'],
-                    default_good_count= productCommentSummary['defaultGoodCount'],
-                    good_rate= productCommentSummary['goodRate']
+                    id=product_id,
+                    good_count=productCommentSummary['goodCount'],
+                    general_count=productCommentSummary['generalCount'],
+                    poor_count=productCommentSummary['poorCount'],
+                    comment_count=productCommentSummary['commentCount'],
+                    default_good_count=productCommentSummary['defaultGoodCount'],
+                    good_rate=productCommentSummary['goodRate']
                 )
 
-
             page = page + 1
 
             next_comment_url = self.make_comment_url(product_id, page)
 
             # time.sleep(10)
 
-            if page>settings['COMMENT_MAX_PAGE']:
+            if page > settings['COMMENT_MAX_PAGE']:
                 print("Comment crawl hit the max depth, %s, pages: %d" % (product_id, page))
             else:
                 yield Request(next_comment_url, self.parse_comment,
                               meta={'product_id': product_id, 'page': page})
         else:
             print("Comment crawl finished, %s, pages: %d" % (product_id, page))
 
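The hunk above schedules the next page through self.make_comment_url, whose body falls outside this diff. Based on the endpoint and parameters used by the standalone script earlier in this commit, it plausibly looks like the following sketch (a hypothetical reconstruction; the callback value and pageSize are assumptions, and the real version is a method on the spider):

from urllib.parse import urlencode

def make_comment_url(product_id, page):
    """Build the JSONP comment-page URL for a product (sketch, not the committed code)."""
    query = urlencode({
        'callback': 'fetchJSON_comment98vv61',  # assumed: same callback as the standalone crawler
        'productId': product_id,
        'score': 0,
        'sortType': 5,
        'pageSize': 10,  # assumed page size
        'isShadowSku': 0,
        'page': page,
    })
    return 'https://club.jd.com/comment/productPageComments.action?' + query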
@@ -96,20 +96,26 @@ class Jd1Spider(scrapy.Spider):
 
         for item in items:
             # pdb.set_trace()
-            name = item.xpath('.//div[contains(@class, "p-name")]/a/em/text()').extract_first()
+            name = item.xpath(
+                './/div[contains(@class, "p-name")]/a/em/text()').extract_first()
             print(name)
             price = item.xpath(
                 './/div[contains(@class, "p-price")]/strong/i/text()').extract_first()
             print(price)
-            url = item.xpath('.//div[contains(@class, "p-name")]/a/@href').extract_first()
+            url = item.xpath(
+                './/div[contains(@class, "p-name")]/a/@href').extract_first()
             print('https:' + url)
-            img = item.xpath('.//div[@class="p-img"]/a/img/@data-lazy-img').extract_first()
+            img = item.xpath(
+                './/div[@class="p-img"]/a/img/@data-lazy-img').extract_first()
             if not img:
-                img = item.xpath('.//div[@class="p-img"]/a/img/@src').extract_first()
+                img = item.xpath(
+                    './/div[@class="p-img"]/a/img/@src').extract_first()
 
-            comment_num = item.xpath('.//div[@class="p-commit"]/strong/a/text()').extract_first()
+            comment_num = item.xpath(
+                './/div[@class="p-commit"]/strong/a/text()').extract_first()
             print(comment_num)
-            shop = item.xpath('.//div[@class="p-shop"]/span/a/@title').extract_first()
+            shop = item.xpath(
+                './/div[@class="p-shop"]/span/a/@title').extract_first()
             print(shop)
 
             yield JDProductItem(
@@ -132,7 +138,8 @@ class Jd1Spider(scrapy.Spider):
                           priority=100)
 
         # Get the next page
-        next_page = response.xpath('//a[@class="pn-next"]/@href').extract_first()
+        next_page = response.xpath(
+            '//a[@class="pn-next"]/@href').extract_first()
         if next_page:
             page = 1
             m = re.match(r'.+page=(\d+).+', next_page)
@@ -158,6 +158,7 @@ function init(){
         ]
     });
 
+    if(document.getElementById('histogramChart')){
     var histogramChart = echarts.init(document.getElementById('histogramChart'));
     histogramChart.setOption({
 
@@ -255,6 +256,8 @@ function init(){
         ]
     });
 
+    }
+
     var lineChart2 = echarts.init(document.getElementById('lineChart2'));
     lineChart2.setOption({
 
@@ -1042,3 +1042,44 @@ border-radius: 3px;
 .popBox .ttBox{height: 30px; line-height: 30px; padding: 14px 30px; border-bottom: solid 1px #eef0f1;text-align: center;-webkit-box-sizing: content-box; -moz-box-sizing: content-box;box-sizing: content-box;}
 .popBox .ttBox .tt{font-size: 20px; display: inline-block; height: 30px;}
 .popBox .txtBox{height: calc(100% - 80px);overflow: auto;padding: 10px 0;}
+
+.percentChart{
+    font-size: 14px;
+    padding: 90px 40px 0px 0px;
+}
+.percentChart li{
+    margin-bottom: 25px;
+    clear:both;
+    line-height: 35px;
+    display: flex;
+}
+.percentChart li span.leftTitle{
+    text-align: right;
+    display: block;
+    color: #ffffff;
+    width: 130px;
+}
+.percentChart li .rightPercent{
+    flex:1;
+    position: relative;
+    background-color: #32CD32;
+    height: 35px;
+    margin-left: 10px;
+}
+.percentChart li .rightPercent span{
+    position: absolute;
+    color: #fff;
+    right: 5px;
+    font-size: 12px;
+    line-height: 35px
+}
+.percentChart li .rightPercent p{
+    background-color: #FF7F50;
+    width: 50%;
+    text-align: right;
+    color: #ffffff;
+    line-height: 35px;
+    font-size: 12px;
+    padding-right: 5px;
+    box-sizing:border-box;
+}
@@ -139,7 +139,29 @@
                 </div>
                 <div class="div_any_child">
                     <div class="div_any_title"><img src="{% static "images/title_4.png" %}">Product data collection (today)</div>
-                    <p id="lineChart2" class="p_chart"></p>
+                    <div class="percentChart">
+                        <li>
+                            <span class="leftTitle">JD Android</span>
+                            <div class="rightPercent">
+                                <span>120</span>
+                                <p style="width:30%">40</p>
+                            </div>
+                        </li>
+                        <li>
+                            <span class="leftTitle">JD iPhone</span>
+                            <div class="rightPercent">
+                                <span>500</span>
+                                <p style="width:60%">300</p>
+                            </div>
+                        </li>
+                        <li>
+                            <span class="leftTitle">WeChat shopping</span>
+                            <div class="rightPercent">
+                                <span>300</span>
+                                <p style="width:50%">150</p>
+                            </div>
+                        </li>
+                    </div>
                 </div>
             </div>
         </div>
@@ -89,6 +89,7 @@ def collect_crawl_info(spark):
 
 
 def collect_news(spark):
+    """Fetch the latest 20 crawled records"""
     df = spark.sql("select * from jd_comment order by created_at desc limit 20")
 
     for row in df.rdd.collect():
@@ -33,6 +33,7 @@ jupyter-core==4.4.0
 lxml==4.3.0
 MarkupSafe==1.1.0
 mistune==0.8.4
+mysqlclient==1.3.14
 nbconvert==5.4.0
 nbformat==4.4.0
 notebook==5.7.4