bigdata/chapter1/crawler/taobao/utils.py

91 lines
2.2 KiB
Python

# coding=utf-8
import json
import re
import sys
import time
import pymysql
import requests
def mysql_query(sql, port=12346):
    """Run *sql* against the local `sparkproject` database and return one row.

    Args:
        sql: raw SQL statement to execute.
        port: MySQL server port. Defaults to 12346, preserving the original
            hard-coded value; NOTE(review): `mysql_execute` defaults to
            3306 — confirm which port is actually intended.

    Returns:
        The first result row as a tuple, or None when the query yields no rows.
    """
    # Keyword arguments: positional connect() arguments were removed in
    # PyMySQL 1.0, and keywords match the style used by mysql_execute below.
    db = pymysql.connect(host="localhost",
                         user="root",
                         password="123456789",
                         database="sparkproject",
                         port=port)
    try:
        # Context manager closes the cursor even if execute() raises.
        with db.cursor() as cursor:
            cursor.execute(sql)
            return cursor.fetchone()
    finally:
        # Always release the connection (the original leaked it on error).
        db.close()
def mysql_execute(sql, port=3306):
    """Execute a write statement (INSERT/UPDATE/...) and commit it.

    Best-effort by design: on failure the error is printed and the
    transaction rolled back instead of raising, so callers are never
    interrupted (this preserves the original behavior).

    Args:
        sql: raw SQL statement to execute.
        port: MySQL server port, default 3306.
    """
    print("execute: %s" % sql)
    db = pymysql.connect(host="127.0.0.1",
                         user="root",
                         password="123456789",
                         database="sparkproject",
                         port=port)
    try:
        # Context manager closes the cursor (the original never closed it).
        with db.cursor() as cursor:
            cursor.execute(sql)
        db.commit()
    except Exception as e:
        # Deliberate best-effort: report and undo, but don't crash the caller.
        print(e)
        db.rollback()
    finally:
        db.close()
def now_time():
    """Return the current local time formatted as 'YYYY-MM-DD HH:MM:SS'."""
    # strftime() defaults to the current local time when no struct_time
    # argument is supplied, so the explicit localtime(time()) is redundant.
    return time.strftime('%Y-%m-%d %H:%M:%S')
def check_alive_proxy(ip, port):
    """Probe a proxy by fetching an echo-IP page through it.

    Args:
        ip: proxy host, as a string.
        port: proxy port (string or int).

    Returns:
        Round-trip time in whole seconds when the echo page reports *ip*
        back, i.e. the proxy is alive and non-transparent.

    Raises:
        RuntimeError: when the page does not echo the proxy IP.
        requests.RequestException: on connection failure or the 5 s timeout.
    """
    begin_time = int(time.time())
    # Bug fix: entries from proxylist are plain HTTP proxies, so the proxy
    # URL scheme must be http:// for BOTH keys. The original
    # "https": "https://ip:port" instructed requests to speak TLS to the
    # proxy itself, which fails against an ordinary HTTP proxy.
    proxy_url = "http://%s:%s" % (ip, port)
    proxies = {
        "http": proxy_url,
        "https": proxy_url,
    }
    # NOTE(review): 2019.ip138.com may no longer be reachable — confirm the
    # echo endpoint is still valid before relying on this check.
    response = requests.get(
        'http://2019.ip138.com/ic.asp', proxies=proxies, timeout=5)
    check_time = int(time.time()) - begin_time
    # The echo page is GB2312-encoded HTML containing "[x.x.x.x]".
    response.encoding = 'gb2312'
    m = re.search(r'.+\[((\d+\.){3}\d+)\].+', response.text)
    if m and m.group(1) == ip:
        return check_time
    raise RuntimeError("连接出错")
def proxy(port):
    """Download a public proxy list, test each entry, and store live ones.

    Fetches fate0/proxylist (one JSON object per line), probes each proxy
    with check_alive_proxy(), and inserts every proxy that answers within
    3 seconds into the `proxys` table via mysql_execute().

    Args:
        port: MySQL server port, passed through to mysql_execute().
    """
    r = requests.get(
        'https://raw.githubusercontent.com/fate0/proxylist/master/proxy.list')
    print(r.status_code)
    for line in r.text.split("\n"):
        # Bug fix: a trailing newline makes split("\n") produce an empty
        # final string, which json.loads() would reject and crash the run.
        if not line.strip():
            continue
        o = json.loads(line)
        if not o["host"]:
            continue
        try:
            check_time = check_alive_proxy(o["host"], str(o["port"]))
            if check_time < 3:
                print("%s:%d" % (o["host"], o["port"]))
                # NOTE(review): SQL built by string formatting from external
                # data; mysql_execute() only accepts a raw SQL string, so
                # parameterizing would require changing its interface.
                mysql_execute("insert into proxys (ip,port,checktime) values ('{}', {}, {})".format(
                    o['host'], o['port'], check_time
                ), port)
        except Exception:
            # Best-effort by design: dead or slow proxies are expected,
            # so failures for individual entries are skipped quietly.
            pass
if __name__ == "__main__":
    """Entry point: refresh the proxy table every 15 minutes, forever."""
    # MySQL port: default 3306, optionally overridden by the first CLI arg.
    db_port = int(sys.argv[1]) if len(sys.argv) > 1 else 3306
    while True:
        proxy(db_port)
        time.sleep(15 * 60)