Add product specifications (添加产品规格)

guange 2019-01-25 15:54:57 +08:00
parent 48dd0711b0
commit 5955efe77b
7 changed files with 121 additions and 11 deletions

.gitignore

@@ -8,3 +8,4 @@ chapter1/crawler/datas/products/
 chapter1/crawler/taobao/settings.py
 chapter2/mysite/mysite/settings.py
 .env
+chapter4/results.csv

chapter1/crawler/taobao/items.py

@@ -13,6 +13,15 @@ class ZhihuItem(scrapy.Item):
     # name = scrapy.Field()
     pass

+class ProductSpecItem(scrapy.Item):
+    id = scrapy.Field()   # JD product id; JDHbasePipeline uses it as the row key
+    cpu = scrapy.Field()
+    rom = scrapy.Field()
+    ram = scrapy.Field()
+    resolution = scrapy.Field()
+    charge = scrapy.Field()
+    weight = scrapy.Field()
+    brand = scrapy.Field()
+
 class ProxyItem(scrapy.Item):
     ip = scrapy.Field()
     port = scrapy.Field()
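Scrapy items are dict-like containers, so ProductSpecItem can be built from keyword arguments or filled key by key. A minimal usage sketch with invented sample values (the abbreviated class mirrors the definition above):

    import scrapy

    class ProductSpecItem(scrapy.Item):
        id = scrapy.Field()
        cpu = scrapy.Field()
        ram = scrapy.Field()
        brand = scrapy.Field()
        # remaining fields elided for brevity

    item = ProductSpecItem(cpu="SDM845", brand="Xiaomi")   # keyword init
    item["id"] = "100000177760"   # dict-style assignment; sample id is made up
    print(item.get("ram"))        # declared but unset fields read back as None
    print(dict(item))             # pipelines often flatten items to plain dicts

Assigning to an undeclared key raises KeyError, which catches field-name typos early.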

chapter1/crawler/taobao/middlewares.py

@@ -6,12 +6,14 @@
 # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
 import pdb
 import time
+import json
+import requests
 from scrapy.http import HtmlResponse
 from selenium.common.exceptions import TimeoutException
 from selenium import webdriver
 from selenium.webdriver.support.wait import WebDriverWait
 from scrapy_splash import SplashRequest
 from scrapy import signals
@@ -144,3 +146,23 @@ class ZhihuChromeMiddleware(object):
     def spider_closed(self, spider, reason):
         print('driver closed')
         self.driver.close()
+
+
+class RandomProxyMiddleware(object):
+    _ip_lists = []
+
+    def get_ip(self):
+        # refill the local pool from the proxy API only when it runs dry
+        if len(self._ip_lists) <= 0:
+            r = requests.get('http://bigdata1.educoder.net/myapp/api/proxys.json')
+            self._ip_lists = json.loads(r.text)
+        data = self._ip_lists.pop()
+        return "http://%s:%d" % (data['ip'], data['port'])
+
+    def process_request(self, request, spider):
+        # only Splash requests are proxied; the proxy is handed to Splash
+        # through its args rather than Scrapy's own proxy meta key
+        if isinstance(request, SplashRequest):
+            ip = self.get_ip()
+            print(ip)
+            request.meta['splash']['args']['proxy'] = ip
+            # request.meta['proxy'] = ip
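The pool is a simple LIFO: get_ip refills it from the API only when empty and pops one entry per request. A network-free sketch of that behaviour; the stub data mirrors the shape get_ip assumes the API returns, i.e. a JSON array of objects with 'ip' and 'port' keys (an inference from how the fields are used, not a documented contract):

    # Standalone sketch; the stub list stands in for requests.get(...) + json.loads.
    class FakeProxyPool:
        _ip_lists = []

        def get_ip(self):
            if len(self._ip_lists) <= 0:
                self._ip_lists = [{'ip': '1.2.3.4', 'port': 8080},
                                  {'ip': '5.6.7.8', 'port': 3128}]
            data = self._ip_lists.pop()
            return "http://%s:%d" % (data['ip'], data['port'])

    pool = FakeProxyPool()
    print(pool.get_ip())  # http://5.6.7.8:3128 -- pop() takes from the end
    print(pool.get_ip())  # http://1.2.3.4:8080
    print(pool.get_ip())  # pool is empty again, so it refills first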

chapter1/crawler/taobao/pipelines.py

@@ -12,7 +12,7 @@ import happybase
 from scrapy.conf import settings
 from scrapy.pipelines.images import ImagesPipeline
-from taobao.items import JDProductItem, JDCommentItem, JDCommentSummary
+from taobao.items import JDProductItem, JDCommentItem, JDCommentSummary, ProductSpecItem
 from taobao.utils import check_alive_proxy
 from scrapy.exceptions import DropItem
@@ -129,5 +129,17 @@ class JDHbasePipeline(object):
                 "comment:default_good_count": str(item["default_good_count"]),
                 "comment:good_rate": str(item["good_rate"])
             })
+        elif isinstance(item, ProductSpecItem):
+            # write the spec fields into the product row under the spec: family
+            self.product_table.put(item["id"],
+                {
+                    "spec:cpu": str(item["cpu"]),
+                    "spec:rom": str(item["rom"]),
+                    "spec:ram": str(item["ram"]),
+                    "spec:resolution": str(item["resolution"]),
+                    "spec:charge": str(item["charge"]),
+                    "spec:weight": str(item["weight"]),
+                    "spec:brand": str(item["brand"]),
+                }
+            )
         return item
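Because every spec field lands in the same HBase row as the rest of the product data (keyed by item["id"]), the columns can be read back with happybase. A sketch under assumptions: the Thrift host, the 'jd_products' table name, and the sample row key are illustrative guesses, not taken from this commit:

    import happybase

    # Assumed: an HBase Thrift server on localhost and a 'jd_products' table
    # with a 'spec' column family, matching the puts above.
    conn = happybase.Connection('localhost')
    table = conn.table('jd_products')

    row = table.row(b'100000177760', columns=[b'spec'])  # sample row key
    for column, value in row.items():
        # keys come back as b'spec:cpu', b'spec:ram', ... per the puts above
        print(column.decode(), value.decode())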

chapter1/crawler/taobao/spiders/jd.py

@@ -7,7 +7,7 @@ from scrapy import Request
 from scrapy.conf import settings
 from scrapy_splash import SplashRequest
 import pdb
-from taobao.items import JDProductItem, JDCommentItem, JDCommentSummary
+from taobao.items import JDProductItem, JDCommentItem, JDCommentSummary, ProductSpecItem
 import re
 from taobao.utils import now_time
@@ -21,11 +21,19 @@ class JdSpider(scrapy.Spider):
     ]
     custom_settings = {
-        'LOG_LEVEL': "WARN",
+        'LOG_LEVEL': "INFO",
         'ITEM_PIPELINES': {
             'taobao.pipelines.JDCleanDataPipeline': 300,
-            'taobao.pipelines.JDHbasePipeline': 400,
-        }
+            #'taobao.pipelines.JDHbasePipeline': 400,
+        },
+        'DOWNLOADER_MIDDLEWARES': {
+            'scrapy.downloadermiddlewares.retry.RetryMiddleware': 90,
+            #'taobao.middlewares.RandomProxyMiddleware': 100,
+            # 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 110,
+            'scrapy_splash.SplashCookiesMiddleware': 723,
+            'scrapy_splash.SplashMiddleware': 725,
+            'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
+        },
     }

     def make_comment_url(self, product_id, page):
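On the DOWNLOADER_MIDDLEWARES numbers: Scrapy sorts middlewares by value, running process_request from low to high and process_response in reverse, so the retry entry at 90 sits closest to the engine, while 723/725/810 follow the values the scrapy-splash README recommends around the built-in HttpCompressionMiddleware. A toy illustration of that documented ordering rule (not Scrapy internals):

    # Toy model: print the request-path order implied by the settings above.
    middlewares = {
        'RetryMiddleware': 90,
        'RandomProxyMiddleware (disabled above)': 100,
        'SplashCookiesMiddleware': 723,
        'SplashMiddleware': 725,
        'HttpCompressionMiddleware': 810,
    }
    for name, order in sorted(middlewares.items(), key=lambda kv: kv[1]):
        print("%3d  %s" % (order, name))  # responses traverse the same list bottom-up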
@@ -38,10 +46,34 @@
         for url in self.start_urls:
             yield SplashRequest(url,
                                 self.parse,
                                 method='GET',
                                 args={})

+    def parse_spec(self, response):
+        # spec page: pull the spec fields off JD's detail-parameter list
+        # pdb.set_trace()
+        cpu = response.xpath('//dt[text()="CPU型号"]/following-sibling::dd/text()').extract_first()
+        rom = response.xpath('//dt[text()="ROM"]/following-sibling::dd[2]/text()').extract_first()
+        ram = response.xpath('//dt[text()="RAM"]/following-sibling::dd[2]/text()').extract_first()
+        resolution = response.xpath('//dt[text()="分辨率"]/following-sibling::dd/text()').extract_first()
+        charge = response.xpath('//dt[text()="电池容量mAh"]/following-sibling::dd/text()').extract_first()
+        weight = response.xpath('//dt[text()="机身重量g"]/following-sibling::dd/text()').extract_first()
+        brand = response.xpath('//dt[text()="品牌"]/following-sibling::dd/text()').extract_first()
+        print(cpu, rom, ram, resolution, charge, weight, brand)
+        yield ProductSpecItem(
+            id=response.meta['product_id'],   # row key for the HBase pipeline
+            cpu=cpu,
+            rom=rom,
+            ram=ram,
+            resolution=resolution,
+            charge=charge,
+            weight=weight,
+            brand=brand,
+        )
+
     def parse_comment(self, response):
         # pdb.set_trace()
         product_id = response.meta["product_id"]
         page = response.meta["page"]
         print("Fetching comments for %s, page %d" % (product_id, page))
@@ -128,9 +160,12 @@
         if m:
             product_id = m.group(1)
             comment_url = self.make_comment_url(product_id, 0)
-            yield Request(comment_url, self.parse_comment,
-                          meta={'product_id': product_id, 'page': 0},
-                          priority=100)
+            # yield Request(comment_url, self.parse_comment,
+            #               meta={'product_id': product_id, 'page': 0},
+            #               priority=100)
+            if url:
+                # crawl the spec page instead; pass the product id along for the item
+                yield SplashRequest(url='https:'+url, callback=self.parse_spec,
+                                    meta={'product_id': product_id})
         # fetch the next page
         next_page = response.xpath('//a[@class="pn-next"]/@href').extract_first()
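A small detail in the two URL hand-offs above: the spec link comes back scheme-relative ('//item.jd.com/...'), hence the 'https:'+url concatenation, while next_page may be relative to the listing page. response.urljoin covers both forms; a hedged sketch of the equivalent follow-up, not part of this commit:

    # Hypothetical continuation of parse(); urljoin resolves relative and
    # scheme-relative ('//...') hrefs against the current page URL.
    if next_page:
        yield SplashRequest(response.urljoin(next_page), callback=self.parse)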

chapter4/check_proxy.py (new file)

@@ -0,0 +1,31 @@
+import requests
+import time
+import re
+
+
+def check_alive_proxy(ip, port):
+    begin_time = int(time.time())
+    proxies = {
+        "http": "http://%s:%s" % (ip, port),
+        "https": "https://%s:%s" % (ip, port),
+    }
+    response = requests.get(
+        'http://2019.ip138.com/ic.asp', proxies=proxies, timeout=3)
+    check_time = int(time.time()) - begin_time
+    response.encoding = 'gb2312'
+    # ip138 echoes the apparent client IP as "[x.x.x.x]"
+    m = re.search(r'.+\[((\d+\.){3}\d+)\].+', response.text)
+    if m:
+        if m.group(1) == ip:
+            return check_time
+    raise RuntimeError("connection error")
+
+
+if __name__ == "__main__":
+    with open('results.csv') as f:
+        for line in f:
+            try:
+                check_time = check_alive_proxy(line.strip(), 9999)
+                if check_time < 5:
+                    print("%s:%d %d" % (line.strip(), 9999, check_time))
+            except Exception as e:
+                print(e)
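check_proxy.py judges a proxy alive by fetching an IP-echo page through it and checking that the echoed address equals the proxy's own IP, returning the elapsed seconds as a crude latency score. A sketch of that comparison on a canned body; the format imitates ip138's '您的IP是：[x.x.x.x]' page and may not match the live site:

    import re

    # Canned stand-in for the ip138 response body (assumed format).
    body = "您的IP是：[1.2.3.4] 来自：示例"
    m = re.search(r'.+\[((\d+\.){3}\d+)\].+', body)
    if m and m.group(1) == "1.2.3.4":
        print("proxy forwarded the request as expected")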