Proxy scraping

guange 2019-01-23 15:30:02 +08:00
parent 13044e3244
commit 30f04b9aeb
10 changed files with 10138 additions and 0 deletions

chapter1/crawler/proxy.list (new file, 10010 lines)

File diff suppressed because it is too large.

taobao/items.py

@@ -13,6 +13,14 @@ class ZhihuItem(scrapy.Item):
    # name = scrapy.Field()
    pass

class ProxyItem(scrapy.Item):
    """One crawled proxy server record."""
    ip = scrapy.Field()
    port = scrapy.Field()
    address = scrapy.Field()
    http = scrapy.Field()
    fast = scrapy.Field()
    alive = scrapy.Field()
    checktime = scrapy.Field()  # measured response time in seconds, set by CheckProxyPipeline

class ProductItem(scrapy.Item):
    id = scrapy.Field()
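Since Scrapy items behave like dicts, here is a minimal sketch of filling the new ProxyItem; the values below are hypothetical, not from the commit:

    item = ProxyItem()
    item["ip"] = "1.2.3.4"       # hypothetical proxy address
    item["port"] = "8080"
    item["checktime"] = 2        # seconds; normally set by CheckProxyPipeline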

taobao/pipelines.py

@@ -13,6 +13,21 @@ from scrapy.conf import settings
from scrapy.pipelines.images import ImagesPipeline
from taobao.items import JDProductItem, JDCommentItem, JDCommentSummary
from taobao.utils import check_alive_proxy
from scrapy.exceptions import DropItem

class CheckProxyPipeline(object):
    def process_item(self, item, spider):
        try:
            # Measure the proxy's response time; raises if it is unusable.
            time = check_alive_proxy(item["ip"], item["port"])
            item['checktime'] = time
            return item
        except Exception as e:
            print(e)
            raise DropItem("proxy check failed")

class MyImagesPipeline(ImagesPipeline):
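The commit enables CheckProxyPipeline per-spider (see custom_settings in the proxy spider below). For reference, a sketch of the equivalent project-wide registration, assuming one wanted it in settings.py instead; the priority value 300 matches the commit:

    # settings.py -- hypothetical project-wide alternative to the spider's
    # custom_settings; lower numbers run earlier in the pipeline chain.
    ITEM_PIPELINES = {
        "taobao.pipelines.CheckProxyPipeline": 300,
    }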

taobao/spiders/proxy.py (new file)

@@ -0,0 +1,30 @@
# coding=utf-8
"""
Scrape a free proxy list (the fate0/proxylist feed on GitHub).
"""
import scrapy
from scrapy import Request
from scrapy.conf import settings
from scrapy_splash import SplashRequest
import pdb
from taobao.items import ProxyItem
class ProxySpider(scrapy.Spider):
    name = 'proxy'
    allowed_domains = ['raw.githubusercontent.com']
    start_urls = [
        "https://raw.githubusercontent.com/fate0/proxylist/master/proxy.list"
    ]
    custom_settings = {
        'LOG_LEVEL': "WARN",
        'ITEM_PIPELINES': {
            'taobao.pipelines.CheckProxyPipeline': 300,
        }
    }

    def parse(self, response):
        print(response.text)
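As committed, parse() only prints the downloaded list. A hedged sketch of a drop-in replacement that actually yields items, assuming the same one-JSON-object-per-line format that the utils.py __main__ block further below relies on (it is not part of the commit):

    import json

    def parse(self, response):
        for line in response.text.splitlines():
            if not line.strip():
                continue  # skip blank lines at the end of the feed
            o = json.loads(line)
            item = ProxyItem()
            item["ip"] = o["host"]
            item["port"] = str(o["port"])
            yield item  # CheckProxyPipeline sets checktime or drops the item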

taobao/utils.py

@@ -1,6 +1,79 @@
# coding=utf-8
import time
import requests
import re
import json
import pymysql
def mysql_query(sql):
    """Run a SELECT and return the first row."""
    db = pymysql.connect(host="localhost", user="root", password="123456789",
                         database="sparkproject", port=12346)
    cursor = db.cursor()
    cursor.execute(sql)
    data = cursor.fetchone()
    db.close()
    return data
def mysql_execute(sql):
    """Run a write statement, committing on success and rolling back on error."""
    print("execute: %s" % sql)
    db = pymysql.connect(host="127.0.0.1",
                         user="root",
                         password="123456789",
                         database="sparkproject",
                         port=12346)
    cursor = db.cursor()
    try:
        cursor.execute(sql)
        db.commit()
    except Exception as e:
        print(e)
        db.rollback()
    finally:
        db.close()
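A usage sketch for the two helpers; the proxys table matches the insert statement in the __main__ block further down, but the exact queries here are illustrative:

    # Illustrative calls; assumes the proxys table used below already exists.
    mysql_execute("insert into proxys (ip, port, checktime) values ('1.2.3.4', 8080, 2)")
    row = mysql_query("select count(*) from proxys")
    print(row[0])  # fetchone() returns a tuple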
def now_time():
    return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
def check_alive_proxy(ip, port):
    """Fetch an IP-echo page through the proxy and verify the echoed IP is the proxy's.

    Returns the round-trip time in seconds; raises on any failure.
    """
    begin_time = int(time.time())
    proxies = {
        "http": "http://%s:%s" % (ip, port),
        "https": "https://%s:%s" % (ip, port),
    }
    response = requests.get(
        'http://2019.ip138.com/ic.asp', proxies=proxies, timeout=5)
    check_time = int(time.time()) - begin_time
    response.encoding = 'gb2312'
    # ip138 echoes the caller's IP in square brackets, e.g. "[1.2.3.4]".
    m = re.search(r'.+\[((\d+\.){3}\d+)\].+', response.text)
    if m:
        if m.group(1) == ip:
            return check_time
    raise RuntimeError("connection failed")
if __name__ == "__main__":
    r = requests.get(
        'https://raw.githubusercontent.com/fate0/proxylist/master/proxy.list')
    print(r.status_code)
    f = r.text.split("\n")
    for line in f:
        if not line.strip():
            continue  # split("\n") leaves a trailing empty line; json.loads would choke on it
        o = json.loads(line)
        if o["host"]:
            try:
                check_time = check_alive_proxy(o["host"], str(o["port"]))
                if check_time < 3:
                    print("%s:%d" % (o["host"], o["port"]))
                    mysql_execute("insert into proxys (ip,port,checktime) values ('{}', {}, {})".format(
                        o['host'], o['port'], check_time
                    ))
            except Exception as e:
                # A dead or slow proxy is simply skipped.
                pass
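For context, the loop assumes the fate0/proxylist line format: each line is a standalone JSON object, of which only the host and port keys are used here. A hypothetical line, parsed the same way:

    # Hypothetical proxy.list line; only "host" and "port" matter to this script.
    line = '{"host": "1.2.3.4", "port": 8080}'
    o = json.loads(line)
    print("%s:%d" % (o["host"], o["port"]))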

@@ -1,3 +1,5 @@
1. Data scraping
2. Data cleaning
3. Statistics with Spark