代理抓取
This commit is contained in:
parent
13044e3244
commit
30f04b9aeb
|
File diff suppressed because it is too large
Load Diff
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
@ -13,6 +13,14 @@ class ZhihuItem(scrapy.Item):
|
|||
# name = scrapy.Field()
|
||||
pass
|
||||
|
||||
class ProxyItem(scrapy.Item):
    """Scrapy item holding one proxy server record.

    ``taobao.pipelines.CheckProxyPipeline`` reads ``ip`` and ``port`` and
    stores the value returned by ``check_alive_proxy`` in ``checktime``.
    """

    # Endpoint of the proxy.
    ip = scrapy.Field()
    port = scrapy.Field()

    # NOTE(review): the meaning of the next four fields is not visible in
    # this change -- presumably location, protocol, speed and liveness flags;
    # confirm against the spider that fills them.
    address = scrapy.Field()
    http = scrapy.Field()
    fast = scrapy.Field()
    alive = scrapy.Field()

    # Result of the liveness check, written by the pipeline.
    checktime = scrapy.Field()
|
||||
|
||||
class ProductItem(scrapy.Item):
|
||||
id = scrapy.Field()
|
||||
|
|
|
@ -13,6 +13,21 @@ from scrapy.conf import settings
|
|||
from scrapy.pipelines.images import ImagesPipeline
|
||||
|
||||
from taobao.items import JDProductItem, JDCommentItem, JDCommentSummary
|
||||
from taobao.utils import check_alive_proxy
|
||||
from scrapy.exceptions import DropItem
|
||||
|
||||
|
||||
class CheckProxyPipeline(object):
    """Item pipeline that only lets through proxies passing the liveness check."""

    def process_item(self, item, spider):
        """Check the proxy described by *item* and record the check result.

        Calls ``check_alive_proxy`` with the item's ``ip`` and ``port`` and
        stores the returned value under ``checktime``.

        Returns the updated item.  Raises ``DropItem`` when the check (or
        storing its result) raises any ``Exception``; the original error is
        printed first.
        """
        try:
            elapsed = check_alive_proxy(item["ip"], item["port"])
            item['checktime'] = elapsed
        except Exception as err:
            print(err)
            # Discard the item so later pipeline stages never see it.
            raise DropItem("校验出错")
        return item
|
||||
|
||||
|
||||
class MyImagesPipeline(ImagesPipeline):
|
||||
|
|
|
@ -0,0 +1,30 @@
|
|||
#coding=utf-8
|
||||
|
||||
"""
|
||||
Crawl a free proxy list (the spider's start URL is the fate0/proxylist file on GitHub).
|
||||
"""
|
||||
|
||||
import scrapy
|
||||
from scrapy import Request
|
||||
from scrapy.conf import settings
|
||||
from scrapy_splash import SplashRequest
|
||||
import pdb
|
||||
from taobao.items import ProxyItem
|
||||
|
||||
|
||||
class PorxySpider(scrapy.Spider):
    """Spider named ``proxy`` that fetches the fate0/proxylist ``proxy.list`` file."""

    name = 'proxy'
    allowed_domains = ['raw.githubusercontent.com']
    start_urls = ["https://raw.githubusercontent.com/fate0/proxylist/master/proxy.list"]

    # Per-spider overrides: log at WARN level and route items through the
    # proxy-checking pipeline with priority 300.
    custom_settings = {
        'LOG_LEVEL': "WARN",
        'ITEM_PIPELINES': {'taobao.pipelines.CheckProxyPipeline': 300},
    }

    def parse(self, response):
        """Print the text of the downloaded response; no items are yielded."""
        print(response.text)
|
|
@ -1,6 +1,79 @@
|
|||
# coding=utf-8
|
||||
import time
|
||||
import requests
|
||||
import re
|
||||
import time
|
||||
import json
|
||||
import pymysql
|
||||
|
||||
|
||||
def mysql_query(sql, args=None):
    """Run *sql* on the ``sparkproject`` database and return the first row.

    Parameters:
        sql:  the statement to execute.
        args: optional tuple/list/dict of values bound by the driver to the
              ``%s`` / ``%(name)s`` placeholders in *sql*.  Leave as ``None``
              (the default) to execute *sql* verbatim.

    Returns whatever ``cursor.fetchone()`` yields for the statement.

    Exceptions raised while connecting or executing propagate to the caller;
    the connection is closed in either case.
    """
    # Keyword arguments keep this call consistent with mysql_execute() and
    # work on PyMySQL 1.0+, where connect() no longer takes positional
    # arguments.
    # NOTE(review): credentials are hard-coded here -- consider moving them
    # to project settings.
    db = pymysql.connect(host="localhost",
                         user="root",
                         password="123456789",
                         database="sparkproject",
                         port=12346)
    try:
        cursor = db.cursor()
        cursor.execute(sql, args)
        return cursor.fetchone()
    finally:
        # Release the connection even when execute()/fetchone() raises.
        db.close()
|
||||
|
||||
|
||||
def mysql_execute(sql, args=None):
    """Execute a data-modifying statement on ``sparkproject`` and commit it.

    Parameters:
        sql:  the statement to execute; it is printed before execution.
        args: optional tuple/list/dict of values bound by the driver to the
              ``%s`` / ``%(name)s`` placeholders in *sql*, so callers do not
              have to build SQL by string formatting.  Leave as ``None``
              (the default) to execute *sql* verbatim.

    Returns ``None``.  This is deliberately best-effort: an error raised by
    the statement or the commit is printed and the transaction is rolled
    back instead of the error being re-raised.
    """
    print("execute: %s" % sql)
    # NOTE(review): credentials are hard-coded here -- consider moving them
    # to project settings.
    db = pymysql.connect(host="127.0.0.1",
                         user="root",
                         password="123456789",
                         database="sparkproject",
                         port=12346)
    cursor = db.cursor()

    try:
        cursor.execute(sql, args)
        db.commit()
    except Exception as e:
        print(e)
        db.rollback()
    finally:
        db.close()
|
||||
|
||||
|
||||
def now_time():
    """Return the current local time formatted as ``YYYY-MM-DD HH:MM:SS``."""
    current = time.localtime()
    return time.strftime('%Y-%m-%d %H:%M:%S', current)
|
||||
|
||||
|
||||
def check_alive_proxy(ip, port):
    """Fetch an IP-echo page through the proxy and time the request.

    Parameters:
        ip:   proxy host; it must equal the bracketed dotted-quad address
              found in the response text for the check to pass.
        port: proxy port (formatted with ``%s``, so str or int both work).

    Returns the request duration in whole seconds (int, truncated).

    Raises ``RuntimeError`` when the response contains no bracketed address
    or the address differs from *ip*.  Errors raised by ``requests.get``
    (the request uses a 5 second timeout) propagate unchanged.
    """
    endpoint = "%s:%s" % (ip, port)
    proxies = {
        "http": "http://" + endpoint,
        "https": "https://" + endpoint,
    }

    started = time.time()
    response = requests.get(
        'http://2019.ip138.com/ic.asp', proxies=proxies, timeout=5)
    # Truncate the real elapsed time.  Subtracting two already-truncated
    # timestamps could report a full second more than actually passed.
    check_time = int(time.time() - started)

    response.encoding = 'gb2312'
    m = re.search(r'.+\[((\d+\.){3}\d+)\].+', response.text)
    # The proxy counts as alive only if the page reports the proxy's own
    # address as the requester.
    if m and m.group(1) == ip:
        return check_time
    raise RuntimeError("连接出错")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Download the proxy list (one JSON object per line), check every entry
    # and store the ones that answer in under three seconds.
    r = requests.get(
        'https://raw.githubusercontent.com/fate0/proxylist/master/proxy.list')
    print(r.status_code)
    for line in r.text.splitlines():
        line = line.strip()
        if not line:
            # Blank lines are not JSON documents; json.loads() would raise
            # on them and abort the whole run.
            continue
        try:
            o = json.loads(line)
            host = o["host"]
        except (ValueError, KeyError, TypeError):
            # Malformed line or entry without a host: skip it.
            continue
        if not host:
            continue
        try:
            check_time = check_alive_proxy(host, str(o["port"]))
            if check_time < 3:
                print("%s:%d" % (o["host"], o["port"]))
                mysql_execute("insert into proxys (ip,port,checktime) values ('{}', {}, {})".format(
                    o['host'], o['port'], check_time
                ))
        except Exception:
            # Best effort: a proxy that fails the check (timeout, refused
            # connection, wrong echoed address, ...) is simply not stored.
            pass
|
||||
|
|
Loading…
Reference in New Issue