Proxy crawling

guange 2019-01-23 15:30:02 +08:00
parent 13044e3244
commit 30f04b9aeb
10 changed files with 10138 additions and 0 deletions

chapter1/crawler/proxy.list (new file, 10010 lines)

File diff suppressed because it is too large.
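
For reference, proxy.list is the JSON-lines dump published by the fate0/proxylist project: every line is a standalone JSON object, and the code in this commit only relies on its "host" and "port" keys. A minimal parsing sketch with a hypothetical sample line (the extra key and all values are illustrative, not taken from the real file):

import json

# Hypothetical proxy.list line; only "host" and "port" are used by this commit.
sample_line = '{"host": "1.2.3.4", "port": 8080, "type": "http"}'
o = json.loads(sample_line)
print(o["host"], o["port"])  # -> 1.2.3.4 8080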


@@ -13,6 +13,14 @@ class ZhihuItem(scrapy.Item):
    # name = scrapy.Field()
    pass

class ProxyItem(scrapy.Item):
    ip = scrapy.Field()
    port = scrapy.Field()
    address = scrapy.Field()
    http = scrapy.Field()
    fast = scrapy.Field()
    alive = scrapy.Field()
    checktime = scrapy.Field()

class ProductItem(scrapy.Item):
    id = scrapy.Field()


@@ -13,6 +13,21 @@ from scrapy.conf import settings
from scrapy.pipelines.images import ImagesPipeline
from taobao.items import JDProductItem, JDCommentItem, JDCommentSummary
from taobao.utils import check_alive_proxy
from scrapy.exceptions import DropItem


class CheckProxyPipeline(object):
    def process_item(self, item, spider):
        try:
            # Time how long the proxy takes to answer a test request.
            check_time = check_alive_proxy(item["ip"], item["port"])
            item['checktime'] = check_time
            return item
        except Exception as e:
            print(e)
            raise DropItem("proxy check failed")


class MyImagesPipeline(ImagesPipeline):


@@ -0,0 +1,30 @@
# coding=utf-8
"""
Crawl Xici free proxies.
"""
import scrapy
from scrapy import Request
from scrapy.conf import settings
from scrapy_splash import SplashRequest
import pdb

from taobao.items import ProxyItem


class PorxySpider(scrapy.Spider):
    name = 'proxy'
    allowed_domains = ['raw.githubusercontent.com']
    start_urls = [
        "https://raw.githubusercontent.com/fate0/proxylist/master/proxy.list"
    ]
    custom_settings = {
        'LOG_LEVEL': "WARN",
        'ITEM_PIPELINES': {
            'taobao.pipelines.CheckProxyPipeline': 300,
        }
    }

    def parse(self, response):
        print(response.text)
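
As committed, parse() only prints the raw response. A minimal sketch (not part of this commit, names are illustrative) of how it could instead yield ProxyItem objects so that CheckProxyPipeline validates each proxy:

import json
import scrapy
from taobao.items import ProxyItem


class ProxySpiderSketch(scrapy.Spider):
    """Hypothetical variant of the spider above."""
    name = 'proxy_sketch'
    allowed_domains = ['raw.githubusercontent.com']
    start_urls = [
        "https://raw.githubusercontent.com/fate0/proxylist/master/proxy.list"
    ]
    custom_settings = {
        'ITEM_PIPELINES': {
            'taobao.pipelines.CheckProxyPipeline': 300,
        }
    }

    def parse(self, response):
        # One JSON object per line; skip blank lines.
        for line in response.text.split("\n"):
            line = line.strip()
            if not line:
                continue
            o = json.loads(line)
            item = ProxyItem()
            item["ip"] = o["host"]
            item["port"] = str(o["port"])
            yield item  # CheckProxyPipeline fills in "checktime" or drops the item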


@@ -1,6 +1,79 @@
# coding=utf-8
import time
import requests
import re
import json
import pymysql


def mysql_query(sql):
    db = pymysql.connect("localhost", "root", "123456789",
                         "sparkproject", port=12346)
    cursor = db.cursor()
    cursor.execute(sql)
    data = cursor.fetchone()
    db.close()
    return data


def mysql_execute(sql):
    print("execute: %s" % sql)
    db = pymysql.connect(host="127.0.0.1",
                         user="root",
                         password="123456789",
                         database="sparkproject",
                         port=12346)
    cursor = db.cursor()
    try:
        cursor.execute(sql)
        db.commit()
    except Exception as e:
        print(e)
        db.rollback()
    finally:
        db.close()


def now_time():
    return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))


def check_alive_proxy(ip, port):
    begin_time = int(time.time())
    proxies = {
        "http": "http://%s:%s" % (ip, port),
        "https": "https://%s:%s" % (ip, port),
    }
    # Fetch an IP-echo page through the proxy and time the round trip.
    response = requests.get(
        'http://2019.ip138.com/ic.asp', proxies=proxies, timeout=5)
    check_time = int(time.time()) - begin_time
    response.encoding = 'gb2312'
    # The page echoes the caller's IP in square brackets; it should equal the
    # proxy's IP if the request really went out through the proxy.
    m = re.search(r'.+\[((\d+\.){3}\d+)\].+', response.text)
    if m:
        if m.group(1) == ip:
            return check_time
    raise RuntimeError("proxy connection check failed")


if __name__ == "__main__":
    r = requests.get(
        'https://raw.githubusercontent.com/fate0/proxylist/master/proxy.list')
    print(r.status_code)
    f = r.text.split("\n")
    for line in f:
        line = line.strip()
        if not line:
            continue  # skip blank lines such as the trailing newline
        o = json.loads(line)
        if o["host"]:
            try:
                check_time = check_alive_proxy(o["host"], str(o["port"]))
                if check_time < 3:
                    print("%s:%d" % (o["host"], o["port"]))
                    mysql_execute("insert into proxys (ip,port,checktime) values ('{}', {}, {})".format(
                        o['host'], o['port'], check_time
                    ))
            except Exception as e:
                # print(e)
                pass
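
The insert statement above assumes a proxys table that is not created anywhere in this commit. A one-off setup call compatible with that statement might look roughly like this (the column types are assumptions):

# Hypothetical schema setup for the `proxys` table used above; the commit
# contains no CREATE TABLE, so the id/ip/port/checktime types are guesses.
mysql_execute(
    "create table if not exists proxys ("
    "  id int auto_increment primary key,"
    "  ip varchar(64) not null,"
    "  port int not null,"
    "  checktime int not null"
    ")"
)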


@@ -1,3 +1,5 @@
1. Data crawling
2. Data cleaning
3. Data statistics with Spark