This commit is contained in:
guange 2019-01-18 11:31:19 +08:00
parent 1beabd758e
commit 406fab63fe
6 changed files with 48 additions and 40 deletions

1
.gitignore vendored
View File

@ -6,3 +6,4 @@ chapter1/crawler/datas/comments/
chapter1/crawler/datas/comments1/ chapter1/crawler/datas/comments1/
chapter1/crawler/datas/products/ chapter1/crawler/datas/products/
chapter1/crawler/taobao/settings.py chapter1/crawler/taobao/settings.py
chapter2/mysite/mysite/settings.py

View File

@ -9,7 +9,6 @@ from scrapy_splash import SplashRequest
import pdb import pdb
from taobao.items import JDProductItem, JDCommentItem, JDCommentSummary from taobao.items import JDProductItem, JDCommentItem, JDCommentSummary
import re import re
from taobao.utils import now_time from taobao.utils import now_time

View File

@ -34,11 +34,27 @@ create table platform_infos (
create table news ( create table news (
`id` int(11) NOT NULL AUTO_INCREMENT, `id` int(11) NOT NULL AUTO_INCREMENT,
comment_time varchar(255) comment '评论时间', comment_time varchar(255) comment '评论时间',
content varchar(1024) comment '评论内容', content varchar(1024) comment '评论内容',
comment_id varchar(255) comment '评论ID', comment_id varchar(255) comment '评论ID',
PRIMARY KEY (`id`) PRIMARY KEY (`id`)
)comment='最新抓取的20条信息'; )comment='最新抓取的20条信息';
-- Rolling per-day crawl-volume table for the last five days.
-- last_day appears to be a day offset (1 = yesterday ... 5 = five days ago,
-- matching the seed rows below and the getYesterday(i+1) loop in the Spark
-- job) — confirm against the consumer code.
-- product_c / comment_c hold that day's product and comment row counts.
create table last_day_counts (
`id` int(11) NOT NULL AUTO_INCREMENT,
last_day int default 0,
product_c int default 0,
comment_c int default 0,
PRIMARY KEY (`id`)
)comment ='最后5天历史采集量';
-- Seed one row per day offset so later jobs can UPDATE in place and
-- assume the five rows always exist.
insert into last_day_counts(last_day, product_c, comment_c) values(1, 0, 0);
insert into last_day_counts(last_day, product_c, comment_c) values(2, 0, 0);
insert into last_day_counts(last_day, product_c, comment_c) values(3, 0, 0);
insert into last_day_counts(last_day, product_c, comment_c) values(4, 0, 0);
insert into last_day_counts(last_day, product_c, comment_c) values(5, 0, 0);

View File

@ -304,7 +304,7 @@ function init(){
color : '#ffffff', color : '#ffffff',
}, },
data : ['厦门第一医院','厦门中山医院','厦门中医院','厦门第五医院',], data : ['手机','笔记本',],
}, },
calculable : false, calculable : false,
tooltip : { tooltip : {
@ -340,7 +340,7 @@ function init(){
xAxis: [ xAxis: [
{ {
type: 'category', type: 'category',
data : ['8:00','10:00','12:00','14:00','16:00','18:00'], data : ['四天前','三天前','二天前','昨天','今天'],
axisLine:{ axisLine:{
lineStyle:{ lineStyle:{
color: '#034c6a' color: '#034c6a'
@ -373,7 +373,7 @@ function init(){
}, },
series : [ series : [
{ {
name:'厦门第一医院', name:'手机',
type:'line', type:'line',
smooth:true, smooth:true,
itemStyle: { itemStyle: {
@ -383,10 +383,10 @@ function init(){
} }
} }
}, },
data:[15, 0, 20, 45, 22.1, 25,].reverse() data:[15, 0, 20, 45, 22.1,]
}, },
{ {
name:'厦门中山医院', name:'笔记本',
type:'line', type:'line',
smooth:true, smooth:true,
itemStyle: { itemStyle: {
@ -396,34 +396,9 @@ function init(){
} }
} }
}, },
data:[25, 10, 30, 55, 32.1, 35, ].reverse() data:[25, 10, 30, 55, 32.1, ]
}, },
{
name:'厦门中医院',
type:'line',
smooth:true,
itemStyle: {
normal: {
lineStyle: {
shadowColor : 'rgba(0,0,0,0.4)'
}
}
},
data:[35, 20, 40, 65, 42.1, 45, ].reverse()
},
{
name:'厦门第五医院',
type:'line',
smooth:true,
itemStyle: {
normal: {
lineStyle: {
shadowColor : 'rgba(0,0,0,0.4)'
}
}
},
data:[45, 30, 50, 75, 52.1, 55, 6].reverse()
}
] ]
}); });

View File

@ -131,11 +131,11 @@
</div> </div>
<div class="right div_any01"> <div class="right div_any01">
<div class="div_any_child"> <div class="div_any_child">
<div class="div_any_title"><img src="{% static "images/title_3.png" %}">数据采集条数(当日)</div> <div class="div_any_title"><img src="{% static "images/title_3.png" %}">评论数据采集条数(当日)</div>
<p id="lineChart" class="p_chart"></p> <p id="lineChart" class="p_chart"></p>
</div> </div>
<div class="div_any_child"> <div class="div_any_child">
<div class="div_any_title"><img src="{% static "images/title_4.png" %}"> 采集人数(当日)</div> <div class="div_any_title"><img src="{% static "images/title_4.png" %}">商品数据采集(当日)</div>
<p id="lineChart2" class="p_chart"></p> <p id="lineChart2" class="p_chart"></p>
</div> </div>
</div> </div>

View File

@ -3,6 +3,7 @@ from pyspark.sql import Row
# $example off:spark_hive$ # $example off:spark_hive$
import os import os
import pymysql import pymysql
import datetime
import time import time
def mysql_query(sql): def mysql_query(sql):
@ -29,6 +30,12 @@ def mysql_execute(sql):
def today():
    """Return the current local date as a 'YYYY-MM-DD' string."""
    # strftime defaults to localtime(); passing it explicitly for clarity.
    now = time.localtime()
    return time.strftime('%Y-%m-%d', now)
def getYesterday(day):
    """Return the date `day` days before today, formatted 'YYYY-MM-DD'.

    Despite the name, this is not limited to yesterday:
    getYesterday(1) is yesterday, getYesterday(2) is two days ago, etc.

    :param day: number of days to subtract from today's local date.
    :return: date string in '%Y-%m-%d' format.
    """
    # Use a distinct local name: the original bound this to `today`,
    # shadowing the module-level today() helper defined just above.
    current = datetime.date.today()
    offset = datetime.timedelta(days=day)
    return (current - offset).strftime('%Y-%m-%d')
def collect_crawl_info(spark): def collect_crawl_info(spark):
df = spark.sql("select count(*) as N from jd_comment") df = spark.sql("select count(*) as N from jd_comment")
@ -57,6 +64,16 @@ def collect_news(spark):
mysql_execute("insert into news (comment_time, content, comment_id) values ('{}', '{}', '{}')".format( mysql_execute("insert into news (comment_time, content, comment_id) values ('{}', '{}', '{}')".format(
row["comment_time"], row["content"], row["id"])) row["comment_time"], row["content"], row["id"]))
def get_last_day_count(spark):
    """Collect the crawl volume for each of the last five days.

    For each of the previous 5 days it counts rows in the Hive tables
    `jd` (products) and `jd_comment` (comments) whose `created_at`
    column starts with that day's 'YYYY-MM-DD' date string.

    NOTE(review): in the span visible here the counts are only bound to
    locals and never used — presumably they should be written into the
    `last_day_counts` MySQL table seeded elsewhere; confirm against the
    full file.

    NOTE(review): the SQL is built by string concatenation; safe only
    because getYesterday() produces internally generated date strings.

    :param spark: active SparkSession with the `jd` / `jd_comment`
        Hive tables available.
    """
    for i in range(5):
        # getYesterday(i+1) walks from yesterday back to five days ago;
        # the date is matched as a prefix of created_at via LIKE '...%'.
        df = spark.sql("select count(*) as N from jd where created_at like '"+getYesterday(i+1)+"%'")
        jd_last_count = df.rdd.collect()[0]["N"]
        df = spark.sql("select count(*) as N from jd_comment where created_at like '"+getYesterday(i+1)+"%'")
        jd_comment_last_count = df.rdd.collect()[0]["N"]
if __name__ == "__main__": if __name__ == "__main__":
# $example on:spark_hive$ # $example on:spark_hive$