Add product specifications
parent 48dd0711b0
commit 5955efe77b
@@ -7,4 +7,5 @@ chapter1/crawler/datas/comments1/
 chapter1/crawler/datas/products/
 chapter1/crawler/taobao/settings.py
 chapter2/mysite/mysite/settings.py
 .env
+chapter4/results.csv
Binary file not shown.
@@ -13,6 +13,15 @@ class ZhihuItem(scrapy.Item):
     # name = scrapy.Field()
     pass
 
+class ProductSpecItem(scrapy.Item):
+    cpu = scrapy.Field()
+    rom = scrapy.Field()
+    ram = scrapy.Field()
+    resolution = scrapy.Field()
+    charge = scrapy.Field()
+    weight = scrapy.Field()
+    brand = scrapy.Field()
+
 class ProxyItem(scrapy.Item):
     ip = scrapy.Field()
     port = scrapy.Field()
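As an aside, scrapy items declared this way behave like dictionaries; a minimal standalone sketch (field names taken from the hunk above, sample values made up) of how a ProductSpecItem is filled and read:

import scrapy

class ProductSpecItem(scrapy.Item):
    # same declaration style as in the diff above (abridged to three fields)
    cpu = scrapy.Field()
    ram = scrapy.Field()
    brand = scrapy.Field()

# keyword construction and dict-style access, the same pattern parse_spec
# and the HBase pipeline rely on further down
item = ProductSpecItem(cpu="骁龙855", ram="8GB", brand="小米")
print(item["cpu"])   # dict-style read of a single field
print(dict(item))    # plain dict, e.g. for JSON export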
@@ -6,12 +6,14 @@
 # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
 import pdb
 import time
+import json
+import requests
 
 from scrapy.http import HtmlResponse
 from selenium.common.exceptions import TimeoutException
 from selenium import webdriver
 from selenium.webdriver.support.wait import WebDriverWait
 
 from scrapy_splash import SplashRequest
 
 from scrapy import signals
@@ -144,3 +146,23 @@ class ZhihuChromeMiddleware(object):
     def spider_closed(self, spider, reason):
         print('driver closed')
         self.driver.close()
+
+
+class RandomProxyMiddleware(object):
+    _ip_lists = []
+
+    def get_ip(self):
+        if len(self._ip_lists) <= 0:
+            r = requests.get('http://bigdata1.educoder.net/myapp/api/proxys.json')
+            self._ip_lists = json.loads(r.text)
+
+        data = self._ip_lists.pop()
+        return "http://%s:%d" % (data['ip'], data['port'])
+
+
+    def process_request(self, request, spider):
+        if isinstance(request, SplashRequest):
+            ip = self.get_ip()
+            print(ip)
+            request.meta['splash']['args']['proxy'] = ip
+            # request.meta['proxy'] = ip
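For context, a downloader middleware like this only runs if it is registered. A minimal sketch of the settings that would enable it; the entries and priorities mirror the commented-out lines in the spider's custom_settings further down, so treat the exact values as an assumption:

# settings.py (sketch): register the proxy middleware at a lower priority than
# the scrapy-splash middlewares, so its process_request sets
# request.meta['splash']['args']['proxy'] before the request is rendered.
DOWNLOADER_MIDDLEWARES = {
    'taobao.middlewares.RandomProxyMiddleware': 100,
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
}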
@@ -12,7 +12,7 @@ import happybase
 from scrapy.conf import settings
 from scrapy.pipelines.images import ImagesPipeline
 
-from taobao.items import JDProductItem, JDCommentItem, JDCommentSummary
+from taobao.items import JDProductItem, JDCommentItem, JDCommentSummary, ProductSpecItem
 from taobao.utils import check_alive_proxy
 from scrapy.exceptions import DropItem
 
@@ -129,5 +129,17 @@ class JDHbasePipeline(object):
                 "comment:default_good_count": str(item["default_good_count"]),
                 "comment:good_rate": str(item["good_rate"])
             })
+        elif isinstance(item, ProductSpecItem):
+            self.product_table.put(item["id"],
+                {
+                    "spec:cpu": str(item["cpu"]),
+                    "spec:rom": str(item['rom']),
+                    "spec:ram": str(item["ram"]),
+                    "spec:resolution": str(item["resolution"]),
+                    "spec:charge": str(item["charge"]),
+                    "spec:weight": str(item["weight"]),
+                    "spec:brand": str(item["brand"]),
+                }
+            )
 
         return item
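One caveat worth noting: this branch reads item["id"], but ProductSpecItem in the items hunk above does not declare an id field, so the put would raise a KeyError unless an id field is added and populated (for example from the product id) in parse_spec. For reference, the underlying happybase call looks like the following standalone sketch; the host, table name and values are assumptions:

import happybase

connection = happybase.Connection('localhost')   # assumed HBase Thrift server
table = connection.table('jd_products')          # assumed table name
# one row per product id; spec columns live under the 'spec' column family
table.put(b'100012043978', {
    b'spec:cpu': '骁龙855'.encode('utf-8'),
    b'spec:ram': b'8GB',
})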
@@ -7,7 +7,7 @@ from scrapy import Request
 from scrapy.conf import settings
 from scrapy_splash import SplashRequest
 import pdb
-from taobao.items import JDProductItem, JDCommentItem, JDCommentSummary
+from taobao.items import JDProductItem, JDCommentItem, JDCommentSummary, ProductSpecItem
 import re
 
 from taobao.utils import now_time
@@ -21,11 +21,19 @@ class JdSpider(scrapy.Spider):
     ]
 
     custom_settings = {
-        'LOG_LEVEL': "WARN",
+        'LOG_LEVEL': "INFO",
         'ITEM_PIPELINES': {
             'taobao.pipelines.JDCleanDataPipeline': 300,
-            'taobao.pipelines.JDHbasePipeline': 400,
-        }
+            #'taobao.pipelines.JDHbasePipeline': 400,
+        },
+        'DOWNLOADER_MIDDLEWARES': {
+            'scrapy.downloadermiddlewares.retry.RetryMiddleware': 90,
+            #'taobao.middlewares.RandomProxyMiddleware': 100,
+            # 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 110,
+            'scrapy_splash.SplashCookiesMiddleware': 723,
+            'scrapy_splash.SplashMiddleware': 725,
+            'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
+        },
     }
 
     def make_comment_url(self, product_id, page):
@@ -38,10 +46,34 @@ class JdSpider(scrapy.Spider):
         for url in self.start_urls:
             yield SplashRequest(url,
                                 self.parse,
                                 method='GET',
                                 args={})
+
+    def parse_spec(self, response):
+        # spec
+        pdb.set_trace()
+        cpu = response.xpath('//dt[text()="CPU型号"]/following-sibling::dd/text()').extract_first()
+        rom = response.xpath('//dt[text()="ROM"]/following-sibling::dd[2]/text()').extract_first()
+        ram = response.xpath('//dt[text()="RAM"]/following-sibling::dd[2]/text()').extract_first()
+        resolution = response.xpath('//dt[text()="分辨率"]/following-sibling::dd/text()').extract_first()
+        charge = response.xpath('//dt[text()="电池容量(mAh)"]/following-sibling::dd/text()').extract_first()
+        weight = response.xpath('//dt[text()="机身重量(g)"]/following-sibling::dd/text()').extract_first()
+        brand = response.xpath('//dt[text()="品牌"]/following-sibling::dd/text()').extract_first()
+
+        print(cpu, rom, ram, resolution, charge, weight, brand)
+
+        yield ProductSpecItem(
+            cpu=cpu,
+            rom=rom,
+            ram=ram,
+            resolution=resolution,
+            charge=charge,
+            weight=weight,
+            brand=brand,
+        )
+
     def parse_comment(self, response):
         # pdb.set_trace()
         product_id = response.meta["product_id"]
         page = response.meta["page"]
         print("Fetching comments: %s, page %d" % (product_id, page))
@@ -128,9 +160,12 @@ class JdSpider(scrapy.Spider):
         if m:
             product_id = m.group(1)
             comment_url = self.make_comment_url(product_id, 0)
-            yield Request(comment_url, self.parse_comment,
-                          meta={'product_id': product_id, 'page': 0},
-                          priority=100)
+            # yield Request(comment_url, self.parse_comment,
+            #               meta={'product_id': product_id, 'page': 0},
+            #               priority=100)
+
+        if url:
+            yield SplashRequest(url='https:'+url, callback=self.parse_spec)
 
         # fetch the next page
         next_page = response.xpath('//a[@class="pn-next"]/@href').extract_first()
@@ -0,0 +1,31 @@
+import requests
+import time
+import re
+
+def check_alive_proxy(ip, port):
+    begin_time = int(time.time())
+    proxies = {
+        "http": "http://%s:%s" % (ip, port),
+        "https": "https://%s:%s" % (ip, port),
+    }
+    response = requests.get(
+        'http://2019.ip138.com/ic.asp', proxies=proxies, timeout=3)
+
+    check_time = int(time.time()) - begin_time
+
+    response.encoding = 'gb2312'
+    m = re.search(r'.+\[((\d+\.){3}\d+)\].+', response.text)
+    if m:
+        if m.group(1) == ip:
+            return check_time
+    raise RuntimeError("connection check failed")
+
+if __name__ == "__main__":
+    with open('results.csv') as f:
+        for line in f:
+            try:
+                check_time = check_alive_proxy(line.strip(), 9999)
+                if check_time < 5:
+                    print("%s:%d %d" % (line.strip(), 9999, check_time))
+            except Exception as e:
+                print(e)
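A possible follow-up, not part of this commit and purely a sketch, would be to use check_alive_proxy to drop dead proxies before RandomProxyMiddleware hands an address to Splash; the dict shape (data['ip'], data['port']) matches what get_ip pops from the fetched list:

from taobao.utils import check_alive_proxy

def filter_alive(ip_lists, max_seconds=5):
    # hypothetical helper: keep only proxies that answer within max_seconds
    alive = []
    for data in ip_lists:
        try:
            if check_alive_proxy(data['ip'], data['port']) < max_seconds:
                alive.append(data)
        except Exception:
            pass  # dead or slow proxies are silently dropped
    return alive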