From 5639278337d3b41daee5089ccff6cc1e161c1688 Mon Sep 17 00:00:00 2001 From: guange <8863824@gmail.com> Date: Mon, 21 Jan 2019 10:17:09 +0800 Subject: [PATCH] . --- .gitignore | 1 + .vscode/settings.json | 3 + chapter1/crawler/demo.py | 61 +++++++++++-------- chapter1/crawler/taobao/items.py | 3 +- chapter1/crawler/taobao/spiders/jd1.py | 43 +++++++------ chapter2/mysite/myapp/static/scripts/index.js | 3 + .../mysite/myapp/static/styles/common.css | 41 +++++++++++++ .../mysite/myapp/templates/myapp/crawl.html | 24 +++++++- chapter3/spark.py | 1 + chapter1/requirements.txt => requirements.txt | 1 + 10 files changed, 136 insertions(+), 45 deletions(-) create mode 100644 .vscode/settings.json rename chapter1/requirements.txt => requirements.txt (98%) diff --git a/.gitignore b/.gitignore index 46e2435..ebc0014 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,4 @@ chapter1/crawler/datas/comments1/ chapter1/crawler/datas/products/ chapter1/crawler/taobao/settings.py chapter2/mysite/mysite/settings.py +.env \ No newline at end of file diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..4a5e2f5 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,3 @@ +{ + "python.pythonPath": ".env/bin/python" +} \ No newline at end of file diff --git a/chapter1/crawler/demo.py b/chapter1/crawler/demo.py index 31968ab..46f57d6 100644 --- a/chapter1/crawler/demo.py +++ b/chapter1/crawler/demo.py @@ -1,31 +1,44 @@ -import re,requests,json +import re +import requests +import json s = requests.session() -url = 'https://club.jd.com/comment/productPageComments.action' +url = "https://club.jd.com/comment/productPageComments.action" data = { -'callback':'fetchJSON_comment98vv61', -'productId':'3888284', -'score':0, -'sortType':5, -'pageSize':10, -'isShadowSku':0, -'page':0 + 'callback': 'fetchJSON_comment98vv61', + 'productId': '3888284', + 'score': 0, + 'sortType': 5, + 'pageSize': 10, + 'isShadowSku': 0, + 'page': 0 } -while True: - t = s.get(url,params = data).text - try: - t = re.search(r'(?<=fetchJSON_comment98vv61\().*(?=\);)',t).group(0) - except Exception as e: - break - j = json.loads(t) - commentSummary = j['comments'] - for comment in commentSummary: - c_content = comment['content'] - c_time = comment['referenceTime'] - c_name = comment['nickname'] - c_client = comment['userClientShow'] - print('{} {} {}\n{}\n'.format(c_name,c_time,c_client,c_content)) +def main(): + while True: + t = s.get(url, params=data).text + try: + t = re.search( + r'(?<=fetchjson_comment98vv61\().*(?=\);)', t).group(0) + except Exception as e: + print(e) - data['page'] += 1 \ No newline at end of file + j = json.loads(t) + commentsummary = j["comments"] + for comment in commentsummary: + c_content = comment["content"] + c_time = comment['referencetime'] + c_name = comment['nickname'] + c_client = comment['userclientshow'] + print('{} {} {}\n{}\n'.format(c_name, c_time, c_client, c_content)) + + data['page'] += 1 + + +if __name__ == "__main__": + import time + import datetime + oldtime = datetime.timedelta(days=1) + print(oldtime) + print(datetime.datetime.now()-oldtime) diff --git a/chapter1/crawler/taobao/items.py b/chapter1/crawler/taobao/items.py index d03dbfd..6ff862b 100644 --- a/chapter1/crawler/taobao/items.py +++ b/chapter1/crawler/taobao/items.py @@ -22,7 +22,7 @@ class ProductItem(scrapy.Item): author = scrapy.Field() shop = scrapy.Field() pubtime = scrapy.Field() - publisher = scrapy.Field() + publisher = scrapy.Field() url = scrapy.Field() detail = scrapy.Field() comment_num = scrapy.Field() @@ -67,4 +67,3 @@ class JDCommentSummary(scrapy.Item): comment_count = scrapy.Field() default_good_count = scrapy.Field() good_rate = scrapy.Field() - diff --git a/chapter1/crawler/taobao/spiders/jd1.py b/chapter1/crawler/taobao/spiders/jd1.py index 2a03084..df18422 100644 --- a/chapter1/crawler/taobao/spiders/jd1.py +++ b/chapter1/crawler/taobao/spiders/jd1.py @@ -55,7 +55,8 @@ class Jd1Spider(scrapy.Spider): c_name = comment['nickname'] c_client = comment['userClientShow'] c_id = "%s_%s" % (product_id, comment['id']) - print('{}, {} {} {}\n{}\n'.format(c_id, c_name, c_time, c_client, c_content)) + print('{}, {} {} {}\n{}\n'.format( + c_id, c_name, c_time, c_client, c_content)) yield JDCommentItem( id=c_id, user_name=c_name, @@ -67,27 +68,26 @@ class Jd1Spider(scrapy.Spider): if page == 0: productCommentSummary = j['productCommentSummary'] yield JDCommentSummary( - id = product_id, - good_count = productCommentSummary['goodCount'], - general_count= productCommentSummary['generalCount'], - poor_count= productCommentSummary['poorCount'], - comment_count= productCommentSummary['commentCount'], - default_good_count= productCommentSummary['defaultGoodCount'], - good_rate= productCommentSummary['goodRate'] + id=product_id, + good_count=productCommentSummary['goodCount'], + general_count=productCommentSummary['generalCount'], + poor_count=productCommentSummary['poorCount'], + comment_count=productCommentSummary['commentCount'], + default_good_count=productCommentSummary['defaultGoodCount'], + good_rate=productCommentSummary['goodRate'] ) - page = page + 1 next_comment_url = self.make_comment_url(product_id, page) # time.sleep(10) - if page>settings['COMMENT_MAX_PAGE']: + if page > settings['COMMENT_MAX_PAGE']: print("评论抓取达到最大深度, %s, 页数: %d" % (product_id, page)) else: yield Request(next_comment_url, self.parse_comment, - meta={'product_id': product_id, 'page': page}) + meta={'product_id': product_id, 'page': page}) else: print("评论抓取完成, %s, 页数: %d" % (product_id, page)) @@ -96,20 +96,26 @@ class Jd1Spider(scrapy.Spider): for item in items: # pdb.set_trace() - name = item.xpath('.//div[contains(@class, "p-name")]/a/em/text()').extract_first() + name = item.xpath( + './/div[contains(@class, "p-name")]/a/em/text()').extract_first() print(name) price = item.xpath( './/div[contains(@class, "p-price")]/strong/i/text()').extract_first() print(price) - url = item.xpath('.//div[contains(@class, "p-name")]/a/@href').extract_first() + url = item.xpath( + './/div[contains(@class, "p-name")]/a/@href').extract_first() print('https:' + url) - img = item.xpath('.//div[@class="p-img"]/a/img/@data-lazy-img').extract_first() + img = item.xpath( + './/div[@class="p-img"]/a/img/@data-lazy-img').extract_first() if not img: - img = item.xpath('.//div[@class="p-img"]/a/img/@src').extract_first() + img = item.xpath( + './/div[@class="p-img"]/a/img/@src').extract_first() - comment_num = item.xpath('.//div[@class="p-commit"]/strong/a/text()').extract_first() + comment_num = item.xpath( + './/div[@class="p-commit"]/strong/a/text()').extract_first() print(comment_num) - shop = item.xpath('.//div[@class="p-shop"]/span/a/@title').extract_first() + shop = item.xpath( + './/div[@class="p-shop"]/span/a/@title').extract_first() print(shop) yield JDProductItem( @@ -132,7 +138,8 @@ class Jd1Spider(scrapy.Spider): priority=100) # 获取下一页 - next_page = response.xpath('//a[@class="pn-next"]/@href').extract_first() + next_page = response.xpath( + '//a[@class="pn-next"]/@href').extract_first() if next_page: page = 1 m = re.match(r'.+page=(\d+).+', next_page) diff --git a/chapter2/mysite/myapp/static/scripts/index.js b/chapter2/mysite/myapp/static/scripts/index.js index 629a227..cb24aa9 100755 --- a/chapter2/mysite/myapp/static/scripts/index.js +++ b/chapter2/mysite/myapp/static/scripts/index.js @@ -158,6 +158,7 @@ function init(){ ] }); + if(document.getElementById('histogramChart')){ var histogramChart = echarts.init(document.getElementById('histogramChart')); histogramChart.setOption({ @@ -255,6 +256,8 @@ function init(){ ] }); + } + var lineChart2 = echarts.init(document.getElementById('lineChart2')); lineChart2.setOption({ diff --git a/chapter2/mysite/myapp/static/styles/common.css b/chapter2/mysite/myapp/static/styles/common.css index 6f97d87..6aafd2a 100755 --- a/chapter2/mysite/myapp/static/styles/common.css +++ b/chapter2/mysite/myapp/static/styles/common.css @@ -1042,3 +1042,44 @@ border-radius: 3px; .popBox .ttBox{height: 30px; line-height: 30px; padding: 14px 30px; border-bottom: solid 1px #eef0f1;text-align: center;-webkit-box-sizing: content-box; -moz-box-sizing: content-box;box-sizing: content-box;} .popBox .ttBox .tt{font-size: 20px; display: inline-block; height: 30px;} .popBox .txtBox{height: calc(100% - 80px);overflow: auto;padding: 10px 0;} + +.percentChart{ + font-size: 14px; + padding: 90px 40px 0px 0px; +} +.percentChart li{ + margin-bottom: 25px; + clear:both; + line-height: 35px; + display: flex; +} +.percentChart li span.leftTitle{ + text-align: right; + display: block; + color: #ffffff; + width: 130px; +} +.percentChart li .rightPercent{ + flex:1; + position: relative; + background-color: #32CD32; + height: 35px; + margin-left: 10px; +} +.percentChart li .rightPercent span{ + position: absolute; + color: #fff; + right: 5px; + font-size: 12px; + line-height: 35px +} +.percentChart li .rightPercent p{ + background-color: #FF7F50; + width: 50%; + text-align: right; + color: #ffffff; + line-height: 35px; + font-size: 12px; + padding-right: 5px; + box-sizing:border-box; +} \ No newline at end of file diff --git a/chapter2/mysite/myapp/templates/myapp/crawl.html b/chapter2/mysite/myapp/templates/myapp/crawl.html index 24cb49c..385d2fd 100644 --- a/chapter2/mysite/myapp/templates/myapp/crawl.html +++ b/chapter2/mysite/myapp/templates/myapp/crawl.html @@ -139,7 +139,29 @@
40
+300
+150
+