Add product specifications (添加产品规格)

guange 2019-01-25 15:54:57 +08:00
parent 48dd0711b0
commit 5955efe77b
7 changed files with 121 additions and 11 deletions

.gitignore

@@ -8,3 +8,4 @@ chapter1/crawler/datas/products/
 chapter1/crawler/taobao/settings.py
 chapter2/mysite/mysite/settings.py
 .env
+chapter4/results.csv

chapter1/crawler/taobao/items.py

@@ -13,6 +13,15 @@ class ZhihuItem(scrapy.Item):
     # name = scrapy.Field()
     pass

+class ProductSpecItem(scrapy.Item):
+    id = scrapy.Field()   # JD product id; JDHbasePipeline uses it as the row key
+    cpu = scrapy.Field()
+    rom = scrapy.Field()
+    ram = scrapy.Field()
+    resolution = scrapy.Field()
+    charge = scrapy.Field()
+    weight = scrapy.Field()
+    brand = scrapy.Field()
+
 class ProxyItem(scrapy.Item):
     ip = scrapy.Field()
     port = scrapy.Field()
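Scrapy items are dict-like containers, so ProductSpecItem can be built from keyword arguments or filled key by key. A minimal usage sketch with invented sample values (the abbreviated class mirrors the definition above):

    import scrapy

    class ProductSpecItem(scrapy.Item):
        id = scrapy.Field()
        cpu = scrapy.Field()
        ram = scrapy.Field()
        brand = scrapy.Field()
        # remaining fields elided for brevity

    item = ProductSpecItem(cpu="SDM845", brand="Xiaomi")   # keyword init
    item["id"] = "100000177760"   # dict-style assignment; sample id is made up
    print(item.get("ram"))        # declared but unset fields read back as None
    print(dict(item))             # pipelines often flatten items to plain dicts

Assigning to an undeclared key raises KeyError, which catches field-name typos early.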

chapter1/crawler/taobao/middlewares.py

@@ -6,12 +6,14 @@
 # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
 import pdb
 import time
+import json
+import requests
 from scrapy.http import HtmlResponse
 from selenium.common.exceptions import TimeoutException
 from selenium import webdriver
 from selenium.webdriver.support.wait import WebDriverWait
 from scrapy_splash import SplashRequest
 from scrapy import signals
@@ -144,3 +146,23 @@ class ZhihuChromeMiddleware(object):
     def spider_closed(self, spider, reason):
         print('driver closed')
         self.driver.close()
+
+
+class RandomProxyMiddleware(object):
+    _ip_lists = []
+
+    def get_ip(self):
+        # refill the local pool from the proxy API only when it runs dry
+        if len(self._ip_lists) <= 0:
+            r = requests.get('http://bigdata1.educoder.net/myapp/api/proxys.json')
+            self._ip_lists = json.loads(r.text)
+        data = self._ip_lists.pop()
+        return "http://%s:%d" % (data['ip'], data['port'])
+
+    def process_request(self, request, spider):
+        # only Splash requests are proxied; the proxy is handed to Splash
+        # through its args rather than Scrapy's own proxy meta key
+        if isinstance(request, SplashRequest):
+            ip = self.get_ip()
+            print(ip)
+            request.meta['splash']['args']['proxy'] = ip
+            # request.meta['proxy'] = ip
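The pool is a simple LIFO: get_ip refills it from the API only when empty and pops one entry per request. A network-free sketch of that behaviour; the stub data mirrors the shape get_ip assumes the API returns, i.e. a JSON array of objects with 'ip' and 'port' keys (an inference from how the fields are used, not a documented contract):

    # Standalone sketch; the stub list stands in for requests.get(...) + json.loads.
    class FakeProxyPool:
        _ip_lists = []

        def get_ip(self):
            if len(self._ip_lists) <= 0:
                self._ip_lists = [{'ip': '1.2.3.4', 'port': 8080},
                                  {'ip': '5.6.7.8', 'port': 3128}]
            data = self._ip_lists.pop()
            return "http://%s:%d" % (data['ip'], data['port'])

    pool = FakeProxyPool()
    print(pool.get_ip())  # http://5.6.7.8:3128 -- pop() takes from the end
    print(pool.get_ip())  # http://1.2.3.4:8080
    print(pool.get_ip())  # pool is empty again, so it refills first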

chapter1/crawler/taobao/pipelines.py

@@ -12,7 +12,7 @@ import happybase
 from scrapy.conf import settings
 from scrapy.pipelines.images import ImagesPipeline
-from taobao.items import JDProductItem, JDCommentItem, JDCommentSummary
+from taobao.items import JDProductItem, JDCommentItem, JDCommentSummary, ProductSpecItem
 from taobao.utils import check_alive_proxy
 from scrapy.exceptions import DropItem
@@ -129,5 +129,17 @@ class JDHbasePipeline(object):
                 "comment:default_good_count": str(item["default_good_count"]),
                 "comment:good_rate": str(item["good_rate"])
             })
+        elif isinstance(item, ProductSpecItem):
+            # write the spec fields into the product row under the spec: family
+            self.product_table.put(item["id"],
+                {
+                    "spec:cpu": str(item["cpu"]),
+                    "spec:rom": str(item["rom"]),
+                    "spec:ram": str(item["ram"]),
+                    "spec:resolution": str(item["resolution"]),
+                    "spec:charge": str(item["charge"]),
+                    "spec:weight": str(item["weight"]),
+                    "spec:brand": str(item["brand"]),
+                }
+            )
         return item
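Because every spec field lands in the same HBase row as the rest of the product data (keyed by item["id"]), the columns can be read back with happybase. A sketch under assumptions: the Thrift host, the 'jd_products' table name, and the sample row key are illustrative guesses, not taken from this commit:

    import happybase

    # Assumed: an HBase Thrift server on localhost and a 'jd_products' table
    # with a 'spec' column family, matching the puts above.
    conn = happybase.Connection('localhost')
    table = conn.table('jd_products')

    row = table.row(b'100000177760', columns=[b'spec'])  # sample row key
    for column, value in row.items():
        # keys come back as b'spec:cpu', b'spec:ram', ... per the puts above
        print(column.decode(), value.decode())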

chapter1/crawler/taobao/spiders/jd.py

@@ -7,7 +7,7 @@ from scrapy import Request
 from scrapy.conf import settings
 from scrapy_splash import SplashRequest
 import pdb
-from taobao.items import JDProductItem, JDCommentItem, JDCommentSummary
+from taobao.items import JDProductItem, JDCommentItem, JDCommentSummary, ProductSpecItem
 import re
 from taobao.utils import now_time
@@ -21,11 +21,19 @@ class JdSpider(scrapy.Spider):
     ]
     custom_settings = {
-        'LOG_LEVEL': "WARN",
+        'LOG_LEVEL': "INFO",
         'ITEM_PIPELINES': {
             'taobao.pipelines.JDCleanDataPipeline': 300,
-            'taobao.pipelines.JDHbasePipeline': 400,
-        }
+            #'taobao.pipelines.JDHbasePipeline': 400,
+        },
+        'DOWNLOADER_MIDDLEWARES': {
+            'scrapy.downloadermiddlewares.retry.RetryMiddleware': 90,
+            #'taobao.middlewares.RandomProxyMiddleware': 100,
+            # 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 110,
+            'scrapy_splash.SplashCookiesMiddleware': 723,
+            'scrapy_splash.SplashMiddleware': 725,
+            'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
+        },
     }

     def make_comment_url(self, product_id, page):
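On the DOWNLOADER_MIDDLEWARES numbers: Scrapy sorts middlewares by value, running process_request from low to high and process_response in reverse, so the retry entry at 90 sits closest to the engine, while 723/725/810 follow the values the scrapy-splash README recommends around the built-in HttpCompressionMiddleware. A toy illustration of that documented ordering rule (not Scrapy internals):

    # Toy model: print the request-path order implied by the settings above.
    middlewares = {
        'RetryMiddleware': 90,
        'RandomProxyMiddleware (disabled above)': 100,
        'SplashCookiesMiddleware': 723,
        'SplashMiddleware': 725,
        'HttpCompressionMiddleware': 810,
    }
    for name, order in sorted(middlewares.items(), key=lambda kv: kv[1]):
        print("%3d  %s" % (order, name))  # responses traverse the same list bottom-up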
@@ -38,10 +46,34 @@
         for url in self.start_urls:
             yield SplashRequest(url,
                                 self.parse,
                                 method='GET',
                                 args={})

+    def parse_spec(self, response):
+        # spec page: pull the spec fields off JD's detail-parameter list
+        # pdb.set_trace()
+        cpu = response.xpath('//dt[text()="CPU型号"]/following-sibling::dd/text()').extract_first()
+        rom = response.xpath('//dt[text()="ROM"]/following-sibling::dd[2]/text()').extract_first()
+        ram = response.xpath('//dt[text()="RAM"]/following-sibling::dd[2]/text()').extract_first()
+        resolution = response.xpath('//dt[text()="分辨率"]/following-sibling::dd/text()').extract_first()
+        charge = response.xpath('//dt[text()="电池容量mAh"]/following-sibling::dd/text()').extract_first()
+        weight = response.xpath('//dt[text()="机身重量g"]/following-sibling::dd/text()').extract_first()
+        brand = response.xpath('//dt[text()="品牌"]/following-sibling::dd/text()').extract_first()
+        print(cpu, rom, ram, resolution, charge, weight, brand)
+        yield ProductSpecItem(
+            id=response.meta['product_id'],   # row key for the HBase pipeline
+            cpu=cpu,
+            rom=rom,
+            ram=ram,
+            resolution=resolution,
+            charge=charge,
+            weight=weight,
+            brand=brand,
+        )
+
     def parse_comment(self, response):
         # pdb.set_trace()
         product_id = response.meta["product_id"]
         page = response.meta["page"]
         print("Fetching comments for %s, page %d" % (product_id, page))
@@ -128,9 +160,12 @@
         if m:
             product_id = m.group(1)
             comment_url = self.make_comment_url(product_id, 0)
-            yield Request(comment_url, self.parse_comment,
-                          meta={'product_id': product_id, 'page': 0},
-                          priority=100)
+            # yield Request(comment_url, self.parse_comment,
+            #               meta={'product_id': product_id, 'page': 0},
+            #               priority=100)
+            if url:
+                # crawl the spec page instead; pass the product id along for the item
+                yield SplashRequest(url='https:'+url, callback=self.parse_spec,
+                                    meta={'product_id': product_id})
         # fetch the next page
         next_page = response.xpath('//a[@class="pn-next"]/@href').extract_first()
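A small detail in the two URL hand-offs above: the spec link comes back scheme-relative ('//item.jd.com/...'), hence the 'https:'+url concatenation, while next_page may be relative to the listing page. response.urljoin covers both forms; a hedged sketch of the equivalent follow-up, not part of this commit:

    # Hypothetical continuation of parse(); urljoin resolves relative and
    # scheme-relative ('//...') hrefs against the current page URL.
    if next_page:
        yield SplashRequest(response.urljoin(next_page), callback=self.parse)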

chapter4/check_proxy.py (new file)

@@ -0,0 +1,31 @@
+import requests
+import time
+import re
+
+
+def check_alive_proxy(ip, port):
+    begin_time = int(time.time())
+    proxies = {
+        "http": "http://%s:%s" % (ip, port),
+        "https": "https://%s:%s" % (ip, port),
+    }
+    response = requests.get(
+        'http://2019.ip138.com/ic.asp', proxies=proxies, timeout=3)
+    check_time = int(time.time()) - begin_time
+    response.encoding = 'gb2312'
+    # ip138 echoes the apparent client IP as "[x.x.x.x]"
+    m = re.search(r'.+\[((\d+\.){3}\d+)\].+', response.text)
+    if m:
+        if m.group(1) == ip:
+            return check_time
+    raise RuntimeError("connection error")
+
+
+if __name__ == "__main__":
+    with open('results.csv') as f:
+        for line in f:
+            try:
+                check_time = check_alive_proxy(line.strip(), 9999)
+                if check_time < 5:
+                    print("%s:%d %d" % (line.strip(), 9999, check_time))
+            except Exception as e:
+                print(e)
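check_proxy.py judges a proxy alive by fetching an IP-echo page through it and checking that the echoed address equals the proxy's own IP, returning the elapsed seconds as a crude latency score. A sketch of that comparison on a canned body; the format imitates ip138's '您的IP是：[x.x.x.x]' page and may not match the live site:

    import re

    # Canned stand-in for the ip138 response body (assumed format).
    body = "您的IP是：[1.2.3.4] 来自：示例"
    m = re.search(r'.+\[((\d+\.){3}\d+)\].+', body)
    if m and m.group(1) == "1.2.3.4":
        print("proxy forwarded the request as expected")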