diff --git a/.gitignore b/.gitignore index 8f295a1..46e2435 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,4 @@ chapter1/crawler/datas/comments/ chapter1/crawler/datas/comments1/ chapter1/crawler/datas/products/ chapter1/crawler/taobao/settings.py +chapter2/mysite/mysite/settings.py diff --git a/chapter1/crawler/taobao/spiders/jd1.py b/chapter1/crawler/taobao/spiders/jd1.py index 97dca08..2a03084 100644 --- a/chapter1/crawler/taobao/spiders/jd1.py +++ b/chapter1/crawler/taobao/spiders/jd1.py @@ -9,7 +9,6 @@ from scrapy_splash import SplashRequest import pdb from taobao.items import JDProductItem, JDCommentItem, JDCommentSummary import re - from taobao.utils import now_time diff --git a/chapter2/init.sql b/chapter2/init.sql index fb3465a..f269bca 100644 --- a/chapter2/init.sql +++ b/chapter2/init.sql @@ -34,11 +34,27 @@ create table platform_infos ( create table news ( -`id` int(11) NOT NULL AUTO_INCREMENT, + `id` int(11) NOT NULL AUTO_INCREMENT, -comment_time varchar(255) comment '评论时间', -content varchar(1024) comment '评论内容', -comment_id varchar(255) comment '评论ID', + comment_time varchar(255) comment '评论时间', + content varchar(1024) comment '评论内容', + comment_id varchar(255) comment '评论ID', -PRIMARY KEY (`id`) + PRIMARY KEY (`id`) )comment='最新抓取的20条信息'; + +create table last_day_counts ( + `id` int(11) NOT NULL AUTO_INCREMENT, + last_day int default 0, + product_c int default 0, + comment_c int default 0, + PRIMARY KEY (`id`) +)comment ='最后5天历史采集量'; + +insert into last_day_counts(last_day, product_c, comment_c) values(1, 0, 0); +insert into last_day_counts(last_day, product_c, comment_c) values(2, 0, 0); +insert into last_day_counts(last_day, product_c, comment_c) values(3, 0, 0); +insert into last_day_counts(last_day, product_c, comment_c) values(4, 0, 0); +insert into last_day_counts(last_day, product_c, comment_c) values(5, 0, 0); + + diff --git a/chapter2/mysite/myapp/static/scripts/index.js b/chapter2/mysite/myapp/static/scripts/index.js index aa5ed57..522a704 100755 --- a/chapter2/mysite/myapp/static/scripts/index.js +++ b/chapter2/mysite/myapp/static/scripts/index.js @@ -304,7 +304,7 @@ function init(){ color : '#ffffff', }, - data : ['厦门第一医院','厦门中山医院','厦门中医院','厦门第五医院',], + data : ['手机','笔记本',], }, calculable : false, tooltip : { @@ -340,7 +340,7 @@ function init(){ xAxis: [ { type: 'category', - data : ['8:00','10:00','12:00','14:00','16:00','18:00'], + data : ['四天前','三天前','二天前','昨天','今天'], axisLine:{ lineStyle:{ color: '#034c6a' @@ -373,7 +373,7 @@ function init(){ }, series : [ { - name:'厦门第一医院', + name:'手机', type:'line', smooth:true, itemStyle: { @@ -383,10 +383,10 @@ function init(){ } } }, - data:[15, 0, 20, 45, 22.1, 25,].reverse() + data:[15, 0, 20, 45, 22.1,] }, { - name:'厦门中山医院', + name:'笔记本', type:'line', smooth:true, itemStyle: { @@ -396,34 +396,9 @@ function init(){ } } }, - data:[25, 10, 30, 55, 32.1, 35, ].reverse() + data:[25, 10, 30, 55, 32.1, ] }, - { - name:'厦门中医院', - type:'line', - smooth:true, - itemStyle: { - normal: { - lineStyle: { - shadowColor : 'rgba(0,0,0,0.4)' - } - } - }, - data:[35, 20, 40, 65, 42.1, 45, ].reverse() - }, - { - name:'厦门第五医院', - type:'line', - smooth:true, - itemStyle: { - normal: { - lineStyle: { - shadowColor : 'rgba(0,0,0,0.4)' - } - } - }, - data:[45, 30, 50, 75, 52.1, 55, 6].reverse() - } + ] }); diff --git a/chapter2/mysite/myapp/templates/myapp/crawl.html b/chapter2/mysite/myapp/templates/myapp/crawl.html index 0e2e1a9..0f7b60e 100644 --- a/chapter2/mysite/myapp/templates/myapp/crawl.html +++ b/chapter2/mysite/myapp/templates/myapp/crawl.html @@ -131,11 +131,11 @@
-
数据采集条数(当日)
+
评论数据采集条数(当日)

-
采集人数(当日)
+
商品数据采集(当日)

diff --git a/chapter3/spark.py b/chapter3/spark.py index b391f8f..2969268 100644 --- a/chapter3/spark.py +++ b/chapter3/spark.py @@ -3,6 +3,7 @@ from pyspark.sql import Row # $example off:spark_hive$ import os import pymysql +import datetime import time def mysql_query(sql): @@ -29,6 +30,12 @@ def mysql_execute(sql): def today(): return time.strftime('%Y-%m-%d') +def getYesterday(day): + today=datetime.date.today() + oneday=datetime.timedelta(days=day) + yesterday=today-oneday + return yesterday.strftime('%Y-%m-%d') + def collect_crawl_info(spark): df = spark.sql("select count(*) as N from jd_comment") @@ -57,6 +64,16 @@ def collect_news(spark): mysql_execute("insert into news (comment_time, content, comment_id) values ('{}', '{}', '{}')".format( row["comment_time"], row["content"], row["id"])) +def get_last_day_count(spark): + """获取过去几天的采集量""" + for i in range(5): + df = spark.sql("select count(*) as N from jd where created_at like '"+getYesterday(i+1)+"%'") + jd_last_count = df.rdd.collect()[0]["N"] + + df = spark.sql("select count(*) as N from jd_comment where created_at like '"+getYesterday(i+1)+"%'") + jd_comment_last_count = df.rdd.collect()[0]["N"] + + if __name__ == "__main__": # $example on:spark_hive$