Proxy scraping
This commit is contained in:
parent 13044e3244
commit 30f04b9aeb
File diff suppressed because it is too large
Binary file not shown.
Binary file not shown.
Binary file not shown.
taobao/items.py
@@ -13,6 +13,14 @@ class ZhihuItem(scrapy.Item):
    # name = scrapy.Field()
    pass


class ProxyItem(scrapy.Item):
    ip = scrapy.Field()
    port = scrapy.Field()
    address = scrapy.Field()
    http = scrapy.Field()
    fast = scrapy.Field()
    alive = scrapy.Field()
    checktime = scrapy.Field()


class ProductItem(scrapy.Item):
    id = scrapy.Field()
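The new ProxyItem carries what the liveness checker needs (ip, port) plus metadata fields (address, http, fast, alive, checktime). For reference, scrapy.Item fields are accessed like dict keys; a minimal sketch of populating one, with illustrative values:

# Sketch: scrapy.Item supports dict-style field access (values are illustrative).
from taobao.items import ProxyItem

item = ProxyItem()
item["ip"] = "127.0.0.1"
item["port"] = "8080"
item["checktime"] = 2          # seconds, as set by CheckProxyPipeline below
print(dict(item))              # {'ip': '127.0.0.1', 'port': '8080', 'checktime': 2}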
taobao/pipelines.py
@@ -13,6 +13,21 @@ from scrapy.conf import settings
from scrapy.pipelines.images import ImagesPipeline

from taobao.items import JDProductItem, JDCommentItem, JDCommentSummary
from taobao.utils import check_alive_proxy
from scrapy.exceptions import DropItem


class CheckProxyPipeline(object):

    def process_item(self, item, spider):
        try:
            check_time = check_alive_proxy(item["ip"], item["port"])
            item['checktime'] = check_time
            return item
        except Exception as e:
            print(e)

        raise DropItem("proxy check failed")


class MyImagesPipeline(ImagesPipeline):
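CheckProxyPipeline validates each scraped proxy by routing a real request through it and drops the item on any failure. A minimal harness showing both outcomes, assuming check_alive_proxy raises for dead proxies (the address is illustrative):

# Sketch: driving the pipeline outside Scrapy; the spider argument is unused here.
from scrapy.exceptions import DropItem
from taobao.pipelines import CheckProxyPipeline

pipeline = CheckProxyPipeline()
item = {"ip": "127.0.0.1", "port": "8080"}
try:
    item = pipeline.process_item(item, spider=None)
    print("kept, latency %ss" % item["checktime"])
except DropItem:
    print("dropped: proxy failed the liveness check")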
@@ -0,0 +1,30 @@
#coding=utf-8

"""
Scrape Xici free proxies.
"""

import scrapy
from scrapy import Request
from scrapy.conf import settings
from scrapy_splash import SplashRequest
import pdb
from taobao.items import ProxyItem


class ProxySpider(scrapy.Spider):
    name = 'proxy'
    allowed_domains = ['raw.githubusercontent.com']
    start_urls = [
        "https://raw.githubusercontent.com/fate0/proxylist/master/proxy.list"
    ]

    custom_settings = {
        'LOG_LEVEL': "WARN",
        'ITEM_PIPELINES': {
            'taobao.pipelines.CheckProxyPipeline': 300,
        }
    }

    def parse(self, response):
        print(response.text)
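As committed, parse() only prints the response body, so nothing ever reaches the pipeline. A sketch of a parse() that turns each JSON line of proxy.list into a ProxyItem; no feed fields beyond "host" and "port" (both used in the __main__ block below) are assumed:

# Sketch: yielding one ProxyItem per JSON line so CheckProxyPipeline receives items.
import json

def parse(self, response):
    for line in response.text.split("\n"):
        if not line.strip():
            continue                 # skip blank trailing lines in the feed
        o = json.loads(line)
        item = ProxyItem()
        item["ip"] = o["host"]
        item["port"] = str(o["port"])
        yield item                   # the pipeline fills in checktime or drops it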
taobao/utils.py
@@ -1,6 +1,79 @@
# coding=utf-8
import time
import requests
import re
import json
import pymysql


def mysql_query(sql):
    db = pymysql.connect("localhost", "root", "123456789",
                         "sparkproject", port=12346)
    cursor = db.cursor()
    cursor.execute(sql)
    data = cursor.fetchone()
    db.close()
    return data
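mysql_query passes (host, user, password, database) positionally and fetches a single row. A quick usage sketch against the proxys table the __main__ block below writes to (the query text is illustrative):

# Sketch: fetchone() returns a tuple, or None when no row matches.
row = mysql_query("select ip, port from proxys limit 1")
if row:
    ip, port = row
    print("%s:%s" % (ip, port))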
def mysql_execute(sql):
    print("execute: %s" % sql)
    db = pymysql.connect(host="127.0.0.1",
                         user="root",
                         password="123456789",
                         database="sparkproject",
                         port=12346)
    cursor = db.cursor()

    try:
        cursor.execute(sql)
        db.commit()
    except Exception as e:
        print(e)
        db.rollback()
    finally:
        db.close()
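Both helpers expect fully formatted SQL strings, and the __main__ block below builds its insert with str.format. pymysql's cursor.execute also accepts a parameter tuple, which handles quoting and escaping of the values; a sketch of that variant (the helper name mysql_execute_params is hypothetical, not part of this commit):

# Sketch: parameterized insert via pymysql placeholders (%s works for all types).
def mysql_execute_params(sql, args):
    db = pymysql.connect(host="127.0.0.1", user="root", password="123456789",
                         database="sparkproject", port=12346)
    try:
        with db.cursor() as cursor:
            cursor.execute(sql, args)    # pymysql escapes each bound value
        db.commit()
    except Exception as e:
        print(e)
        db.rollback()
    finally:
        db.close()

# Usage:
# mysql_execute_params(
#     "insert into proxys (ip, port, checktime) values (%s, %s, %s)",
#     ("127.0.0.1", 8080, 2))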
def now_time():
    return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
def check_alive_proxy(ip, port):
    begin_time = int(time.time())
    proxies = {
        "http": "http://%s:%s" % (ip, port),
        "https": "https://%s:%s" % (ip, port),
    }
    response = requests.get(
        'http://2019.ip138.com/ic.asp', proxies=proxies, timeout=5)

    check_time = int(time.time()) - begin_time

    response.encoding = 'gb2312'
    m = re.search(r'.+\[((\d+\.){3}\d+)\].+', response.text)
    if m:
        if m.group(1) == ip:
            return check_time
    raise RuntimeError("connection error")
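The check works because ip138 echoes the caller's IP inside square brackets in a gb2312-encoded page; it passes only when the echoed address equals the proxy's own, i.e. the proxy actually masked the request. Note that int(time.time()) measures whole seconds, so with timeout=5 the returned latency is a coarse 0 to 5. A direct usage sketch (the address is illustrative):

# Sketch: calling the checker directly; any failure surfaces as an exception.
try:
    latency = check_alive_proxy("127.0.0.1", "8080")
    print("alive, about %d s round trip" % latency)
except Exception as e:
    print("dead, too slow, or not anonymous:", e)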
if __name__ == "__main__":
    r = requests.get(
        'https://raw.githubusercontent.com/fate0/proxylist/master/proxy.list')
    print(r.status_code)
    f = r.text.split("\n")
    for line in f:
        if not line.strip():
            continue  # split("\n") leaves a trailing empty line; json.loads would fail on it
        o = json.loads(line)
        if o["host"]:
            try:
                check_time = check_alive_proxy(o["host"], str(o["port"]))
                if check_time < 3:
                    print("%s:%d" % (o["host"], o["port"]))
                    mysql_execute("insert into proxys (ip,port,checktime) values ('{}', {}, {})".format(
                        o['host'], o['port'], check_time
                    ))
            except Exception as e:
                # print(e)
                pass
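The insert statement implies a proxys table with ip, port and checktime columns. A schema sketch that would accept these rows, reusing mysql_execute (the column types and the id column are assumptions; the actual DDL is not part of this commit):

# Sketch: assumed DDL for the proxys table (types are guesses, not from the commit).
mysql_execute("""
create table if not exists proxys (
    id int auto_increment primary key,
    ip varchar(64) not null,
    port int not null,
    checktime int not null
)
""")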