代理抓取

2019-01-23 15:30:02 +08:00 · 2019-01-23 15:30:02 +08:00 · 30f04b9aeb
parent 13044e3244
commit 30f04b9aeb
10 changed files with 10138 additions and 0 deletions
--- a/chapter1/crawler/proxy.csv
+++ b/chapter1/crawler/proxy.csv
--- a/chapter1/crawler/proxy.list
+++ b/chapter1/crawler/proxy.list
--- a/chapter1/crawler/taobao/pycache/items.cpython-37.pyc
+++ b/chapter1/crawler/taobao/pycache/items.cpython-37.pyc
--- a/chapter1/crawler/taobao/pycache/pipelines.cpython-37.pyc
+++ b/chapter1/crawler/taobao/pycache/pipelines.cpython-37.pyc
--- a/chapter1/crawler/taobao/pycache/settings.cpython-37.pyc
+++ b/chapter1/crawler/taobao/pycache/settings.cpython-37.pyc
--- a/chapter1/crawler/taobao/items.py
+++ b/chapter1/crawler/taobao/items.py
@ -13,6 +13,14 @@ class ZhihuItem(scrapy.Item):
    # name = scrapy.Field()
    pass

+class ProxyItem(scrapy.Item):
+    """Scrapy item describing one proxy server candidate."""
+    # Proxy host; CheckProxyPipeline passes it to check_alive_proxy.
+    ip = scrapy.Field()
+    # Proxy port; passed to check_alive_proxy together with ``ip``.
+    port = scrapy.Field()
+    # NOTE(review): presumably the proxy's geographic location -- nothing in
+    # this change populates it; confirm against the producer.
+    address = scrapy.Field()
+    # NOTE(review): presumably the proxy protocol (http/https) -- not
+    # populated in this change; confirm.
+    http = scrapy.Field()
+    # NOTE(review): presumably a speed rating -- not populated in this
+    # change; confirm.
+    fast = scrapy.Field()
+    # NOTE(review): presumably a liveness indicator -- not populated in this
+    # change; confirm.
+    alive = scrapy.Field()
+    # Set by CheckProxyPipeline to the value returned by check_alive_proxy
+    # (the duration of the probe request, in seconds).
+    checktime = scrapy.Field()

 class ProductItem(scrapy.Item):
    id = scrapy.Field()
--- a/chapter1/crawler/taobao/pipelines.py
+++ b/chapter1/crawler/taobao/pipelines.py
@ -13,6 +13,21 @@ from scrapy.conf import settings
 from scrapy.pipelines.images import ImagesPipeline

 from taobao.items import JDProductItem, JDCommentItem, JDCommentSummary
+from taobao.utils import check_alive_proxy
+from scrapy.exceptions import DropItem
+
+
+class CheckProxyPipeline(object):
+
+    def process_item(self, item, spider):
+        try:
+            time = check_alive_proxy(item["ip"], item["port"])
+            item['checktime'] = time
+            return item
+        except Exception as e:
+            print(e)
+        
+        raise DropItem("校验出错")


 class MyImagesPipeline(ImagesPipeline):
--- a/chapter1/crawler/taobao/spiders/proxy.py
+++ b/chapter1/crawler/taobao/spiders/proxy.py
@ -0,0 +1,30 @@
+#coding=utf-8
+
+"""
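+Scrape a list of free proxies. The original note names Xici (西刺) as the
+source, but the spider in this module reads the fate0/proxylist feed on GitHub.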
+"""
+
+import scrapy
+from scrapy import Request
+from scrapy.conf import settings
+from scrapy_splash import SplashRequest
+import pdb
+from taobao.items import ProxyItem
+
+
+class PorxySpider(scrapy.Spider):
+    name = 'proxy'
+    allowed_domains = ['raw.githubusercontent.com']
+    start_urls = [
+        "https://raw.githubusercontent.com/fate0/proxylist/master/proxy.list"
+    ]
+
+    custom_settings = {
+        'LOG_LEVEL': "WARN",
+        'ITEM_PIPELINES': {
+            'taobao.pipelines.CheckProxyPipeline': 300,
+        }
+    }
+
+    def parse(self, response):
+        print(response.text)
--- a/chapter1/crawler/taobao/utils.py
+++ b/chapter1/crawler/taobao/utils.py
@ -1,6 +1,79 @@
 # coding=utf-8
 import time
+import requests
+import re
+import time
+import json
+import pymysql
+
+
+def mysql_query(sql):
+    db = pymysql.connect("localhost", "root", "123456789",
+                         "sparkproject", port=12346)
+    cursor = db.cursor()
+    cursor.execute(sql)
+    data = cursor.fetchone()
+    db.close()
+    return data
+
+
+def mysql_execute(sql):
+    print("execute: %s" % sql)
+    db = pymysql.connect(host="127.0.0.1",
+                         user="root",
+                         password="123456789",
+                         database="sparkproject",
+                         port=12346)
+    cursor = db.cursor()
+
+    try:
+        cursor.execute(sql)
+        db.commit()
+    except Exception as e:
+        print(e)
+        db.rollback()
+    finally:
+        db.close()


 def now_time():
    """Return the current local time formatted as ``YYYY-MM-DD HH:MM:SS``."""
    stamp_format = '%Y-%m-%d %H:%M:%S'
    current = time.localtime()
    return time.strftime(stamp_format, current)
+
+
+def check_alive_proxy(ip, port):
+    begin_time = int(time.time())
+    proxies = {
+        "http": "http://%s:%s" % (ip, port),
+        "https": "https://%s:%s" % (ip, port),
+    }
+    response = requests.get(
+        'http://2019.ip138.com/ic.asp', proxies=proxies, timeout=5)
+
+    check_time = int(time.time()) - begin_time
+
+    response.encoding = 'gb2312'
+    m = re.search(r'.+\[((\d+\.){3}\d+)\].+', response.text)
+    if m:
+        if m.group(1) == ip:
+            return check_time
+    raise RuntimeError("连接出错")
+
+
+if __name__ == "__main__":
+    r = requests.get(
+        'https://raw.githubusercontent.com/fate0/proxylist/master/proxy.list')
+    print(r.status_code)
+    f = r.text.split("\n")
+    for line in f:
+        o = json.loads(line)
+        if o["host"]:
+            try:
+                check_time = check_alive_proxy(o["host"], str(o["port"]))
+                if check_time < 3:
+                    print("%s:%d" % (o["host"], o["port"]))
+                    mysql_execute("insert into proxys (ip,port,checktime) values ('{}', {}, {})".format(
+                        o['host'], o['port'], check_time
+                    ))
+            except Exception as e:
+                # print(e)
+                pass
--- a/readme.md
+++ b/readme.md
@ -1,3 +1,5 @@
 1. 数据抓取

 2. 数据清洗
+
+3. spark数据统计