diff --git a/.gitignore b/.gitignore
index ebc0014..40dad18 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,4 +7,5 @@ chapter1/crawler/datas/comments1/
 chapter1/crawler/datas/products/
 chapter1/crawler/taobao/settings.py
 chapter2/mysite/mysite/settings.py
-.env
\ No newline at end of file
+.env
+chapter4/results.csv
diff --git a/chapter1/crawler/taobao/__pycache__/middlewares.cpython-37.pyc b/chapter1/crawler/taobao/__pycache__/middlewares.cpython-37.pyc
index a0323b1..f5c6419 100644
Binary files a/chapter1/crawler/taobao/__pycache__/middlewares.cpython-37.pyc and b/chapter1/crawler/taobao/__pycache__/middlewares.cpython-37.pyc differ
diff --git a/chapter1/crawler/taobao/items.py b/chapter1/crawler/taobao/items.py
index aa8f4c5..5d4a5df 100644
--- a/chapter1/crawler/taobao/items.py
+++ b/chapter1/crawler/taobao/items.py
@@ -13,6 +13,17 @@ class ZhihuItem(scrapy.Item):
     # name = scrapy.Field()
     pass
 
+class ProductSpecItem(scrapy.Item):
+    # product id; doubles as the HBase row key in JDHbasePipeline
+    id = scrapy.Field()
+    cpu = scrapy.Field()
+    rom = scrapy.Field()
+    ram = scrapy.Field()
+    resolution = scrapy.Field()
+    charge = scrapy.Field()
+    weight = scrapy.Field()
+    brand = scrapy.Field()
+
 class ProxyItem(scrapy.Item):
     ip = scrapy.Field()
     port = scrapy.Field()
diff --git a/chapter1/crawler/taobao/middlewares.py b/chapter1/crawler/taobao/middlewares.py
index fc129aa..0eae133 100644
--- a/chapter1/crawler/taobao/middlewares.py
+++ b/chapter1/crawler/taobao/middlewares.py
@@ -6,12 +6,14 @@
 # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
 import pdb
 import time
+import json
+import requests
 
 from scrapy.http import HtmlResponse
 from selenium.common.exceptions import TimeoutException
 from selenium import webdriver
 from selenium.webdriver.support.wait import WebDriverWait
-
+from scrapy_splash import SplashRequest
 
 from scrapy import signals
 
@@ -144,3 +146,27 @@ class ZhihuChromeMiddleware(object):
     def spider_closed(self, spider, reason):
        print('驱动关闭')
        self.driver.close()
+
+
+class RandomProxyMiddleware(object):
+    # cache of proxies fetched from the remote pool
+    _ip_lists = []
+
+    def get_ip(self):
+        # refill the cache from the proxy API whenever it runs dry
+        if len(self._ip_lists) <= 0:
+            r = requests.get('http://bigdata1.educoder.net/myapp/api/proxys.json')
+            self._ip_lists = json.loads(r.text)
+
+        data = self._ip_lists.pop()
+        return "http://%s:%s" % (data['ip'], data['port'])
+
+
+    def process_request(self, request, spider):
+        # Splash fetches the page itself, so the proxy has to go into the
+        # Splash args rather than the usual request.meta['proxy']
+        if isinstance(request, SplashRequest):
+            ip = self.get_ip()
+            print(ip)
+            request.meta['splash']['args']['proxy'] = ip
+            # request.meta['proxy'] = ip
\ No newline at end of file
diff --git a/chapter1/crawler/taobao/pipelines.py b/chapter1/crawler/taobao/pipelines.py
index 71ce326..2c49775 100644
--- a/chapter1/crawler/taobao/pipelines.py
+++ b/chapter1/crawler/taobao/pipelines.py
@@ -12,7 +12,7 @@
 import happybase
 from scrapy.conf import settings
 from scrapy.pipelines.images import ImagesPipeline
-from taobao.items import JDProductItem, JDCommentItem, JDCommentSummary
+from taobao.items import JDProductItem, JDCommentItem, JDCommentSummary, ProductSpecItem
 from taobao.utils import check_alive_proxy
 from scrapy.exceptions import DropItem
 
@@ -129,5 +129,18 @@
                 "comment:default_good_count": str(item["default_good_count"]),
                 "comment:good_rate": str(item["good_rate"])
                 })
+        elif isinstance(item, ProductSpecItem):
+            # the row key is the product id filled in by JdSpider.parse_spec
+            self.product_table.put(item["id"],
+                {
+                    "spec:cpu": str(item["cpu"]),
+                    "spec:rom": str(item["rom"]),
+                    "spec:ram": str(item["ram"]),
+                    "spec:resolution": str(item["resolution"]),
+                    "spec:charge": str(item["charge"]),
+                    "spec:weight": str(item["weight"]),
+                    "spec:brand": str(item["brand"]),
+                }
+            )
         return item
diff --git a/chapter1/crawler/taobao/spiders/jd.py b/chapter1/crawler/taobao/spiders/jd.py
index 5ddef03..86f3a42 100644
--- a/chapter1/crawler/taobao/spiders/jd.py
+++ b/chapter1/crawler/taobao/spiders/jd.py
@@ -7,7 +7,7 @@
 from scrapy import Request
 from scrapy.conf import settings
 from scrapy_splash import SplashRequest
 import pdb
-from taobao.items import JDProductItem, JDCommentItem, JDCommentSummary
+from taobao.items import JDProductItem, JDCommentItem, JDCommentSummary, ProductSpecItem
 import re
 from taobao.utils import now_time
@@ -21,11 +21,19 @@ class JdSpider(scrapy.Spider):
     ]
 
     custom_settings = {
-        'LOG_LEVEL': "WARN",
+        'LOG_LEVEL': "INFO",
         'ITEM_PIPELINES': {
             'taobao.pipelines.JDCleanDataPipeline': 300,
-            'taobao.pipelines.JDHbasePipeline': 400,
-        }
+            #'taobao.pipelines.JDHbasePipeline': 400,
+        },
+        'DOWNLOADER_MIDDLEWARES': {
+            'scrapy.downloadermiddlewares.retry.RetryMiddleware': 90,
+            #'taobao.middlewares.RandomProxyMiddleware': 100,
+            # 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 110,
+            'scrapy_splash.SplashCookiesMiddleware': 723,
+            'scrapy_splash.SplashMiddleware': 725,
+            'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
+        },
     }
 
     def make_comment_url(self, product_id, page):
@@ -38,10 +46,35 @@
         for url in self.start_urls:
             yield SplashRequest(url, self.parse,
+                                method='GET',
                                 args={})
 
 
+    def parse_spec(self, response):
+        # scrape the 规格参数 (spec) table on the product detail page
+        # pdb.set_trace()
+        cpu = response.xpath('//dt[text()="CPU型号"]/following-sibling::dd/text()').extract_first()
+        rom = response.xpath('//dt[text()="ROM"]/following-sibling::dd[2]/text()').extract_first()
+        ram = response.xpath('//dt[text()="RAM"]/following-sibling::dd[2]/text()').extract_first()
+        resolution = response.xpath('//dt[text()="分辨率"]/following-sibling::dd/text()').extract_first()
+        charge = response.xpath('//dt[text()="电池容量(mAh)"]/following-sibling::dd/text()').extract_first()
+        weight = response.xpath('//dt[text()="机身重量(g)"]/following-sibling::dd/text()').extract_first()
+        brand = response.xpath('//dt[text()="品牌"]/following-sibling::dd/text()').extract_first()
+
+        print(cpu, rom, ram, resolution, charge, weight, brand)
+
+        yield ProductSpecItem(
+            id=response.meta.get('product_id'),
+            cpu=cpu,
+            rom=rom,
+            ram=ram,
+            resolution=resolution,
+            charge=charge,
+            weight=weight,
+            brand=brand,
+        )
+
+
     def parse_comment(self, response):
-        # pdb.set_trace()
         product_id = response.meta["product_id"]
         page = response.meta["page"]
         print("抓取评论, %s, 当前 %d页" % (product_id, page))
@@ -128,9 +161,13 @@
             if m:
                 product_id = m.group(1)
                 comment_url = self.make_comment_url(product_id, 0)
-                yield Request(comment_url, self.parse_comment,
-                              meta={'product_id': product_id, 'page': 0},
-                              priority=100)
+                # yield Request(comment_url, self.parse_comment,
+                #               meta={'product_id': product_id, 'page': 0},
+                #               priority=100)
+
+                if url:
+                    # carry the product id so parse_spec can fill the item's row key
+                    yield SplashRequest(url='https:' + url, callback=self.parse_spec, meta={'product_id': product_id})
 
         # 获取下一页
         next_page = response.xpath('//a[@class="pn-next"]/@href').extract_first()
diff --git a/chapter4/check_proxy.py b/chapter4/check_proxy.py
new file mode 100644
index 0000000..8eb6294
--- /dev/null
+++ b/chapter4/check_proxy.py
@@ -0,0 +1,34 @@
+import requests
+import time
+import re
+
+def check_alive_proxy(ip, port):
+    begin_time = int(time.time())
+    proxies = {
+        "http": "http://%s:%s" % (ip, port),
+        "https": "https://%s:%s" % (ip, port),
+    }
+    response = requests.get(
+        'http://2019.ip138.com/ic.asp', proxies=proxies, timeout=3)
+
+    check_time = int(time.time()) - begin_time
+
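+    # ip138 echoes the visitor's IP between square brackets; decode the
+    # gb2312 page and check that the address it saw matches the proxy's.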
+    response.encoding = 'gb2312'
+    m = re.search(r'.+\[((\d+\.){3}\d+)\].+', response.text)
+    if m:
+        if m.group(1) == ip:
+            return check_time
+    raise RuntimeError("连接出错")
+
+# results.csv is expected to hold one proxy IP per line, all on port 9999
+if __name__ == "__main__":
+    with open('results.csv') as f:
+        for line in f:
+            try:
+                check_time = check_alive_proxy(line.strip(), 9999)
+                if check_time < 5:
+                    print("%s:%d %d" % (line.strip(), 9999, check_time))
+            except Exception as e:
+                print(e)
\ No newline at end of file