Add product specifications (添加产品规格)

commit 5955efe77b (parent 48dd0711b0)
guange, 2019-01-25 15:54:57 +08:00
7 changed files with 121 additions and 11 deletions

.gitignore (vendored)

@@ -8,3 +8,4 @@ chapter1/crawler/datas/products/
 chapter1/crawler/taobao/settings.py
 chapter2/mysite/mysite/settings.py
 .env
+chapter4/results.csv

taobao/items.py

@@ -13,6 +13,15 @@ class ZhihuItem(scrapy.Item):
     # name = scrapy.Field()
     pass
 
+class ProductSpecItem(scrapy.Item):
+    cpu = scrapy.Field()
+    rom = scrapy.Field()
+    ram = scrapy.Field()
+    resolution = scrapy.Field()
+    charge = scrapy.Field()
+    weight = scrapy.Field()
+    brand = scrapy.Field()
+
 class ProxyItem(scrapy.Item):
     ip = scrapy.Field()
     port = scrapy.Field()
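Note: the JDHbasePipeline change further down keys its HBase put() on item["id"], but ProductSpecItem declares no id field and parse_spec never sets one, so the spec branch would raise a KeyError once that pipeline is re-enabled. A minimal sketch of the extra field the item would need (the name id is taken from the pipeline code; the sketch itself is not part of this commit):

    # sketch only, assuming the pipeline keeps using item["id"] as the row key
    import scrapy

    class ProductSpecItem(scrapy.Item):
        id = scrapy.Field()   # row key expected by JDHbasePipeline
        cpu = scrapy.Field()
        rom = scrapy.Field()
        ram = scrapy.Field()
        resolution = scrapy.Field()
        charge = scrapy.Field()
        weight = scrapy.Field()
        brand = scrapy.Field()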

taobao/middlewares.py

@@ -6,12 +6,14 @@
 # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
 import pdb
 import time
+import json
+import requests
 from scrapy.http import HtmlResponse
 from selenium.common.exceptions import TimeoutException
 from selenium import webdriver
 from selenium.webdriver.support.wait import WebDriverWait
+from scrapy_splash import SplashRequest
 from scrapy import signals
@@ -144,3 +146,23 @@ class ZhihuChromeMiddleware(object):
     def spider_closed(self, spider, reason):
         print('驱动关闭')
         self.driver.close()
+
+
+class RandomProxyMiddleware(object):
+    _ip_lists = []
+
+    def get_ip(self):
+        if len(self._ip_lists) <= 0:
+            r = requests.get('http://bigdata1.educoder.net/myapp/api/proxys.json')
+            self._ip_lists = json.loads(r.text)
+        data = self._ip_lists.pop()
+        return "http://%s:%d" % (data['ip'], data['port'])
+
+    def process_request(self, request, spider):
+        if isinstance(request, SplashRequest):
+            ip = self.get_ip()
+            print(ip)
+            request.meta['splash']['args']['proxy'] = ip
+            # request.meta['proxy'] = ip
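For RandomProxyMiddleware to take effect it still has to be registered under DOWNLOADER_MIDDLEWARES; in this commit the corresponding entry in JdSpider's custom_settings is left commented out. A rough sketch of enabling it (the priority 100 comes from that commented line, the other entries mirror the ones already listed in the spider; treat this as an assumption, not part of the commit):

    # sketch: settings.py / custom_settings with the proxy middleware switched on
    DOWNLOADER_MIDDLEWARES = {
        'scrapy.downloadermiddlewares.retry.RetryMiddleware': 90,
        'taobao.middlewares.RandomProxyMiddleware': 100,
        'scrapy_splash.SplashCookiesMiddleware': 723,
        'scrapy_splash.SplashMiddleware': 725,
        'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
    }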

taobao/pipelines.py

@@ -12,7 +12,7 @@ import happybase
 from scrapy.conf import settings
 from scrapy.pipelines.images import ImagesPipeline
-from taobao.items import JDProductItem, JDCommentItem, JDCommentSummary
+from taobao.items import JDProductItem, JDCommentItem, JDCommentSummary, ProductSpecItem
 from taobao.utils import check_alive_proxy
 from scrapy.exceptions import DropItem
@@ -129,5 +129,17 @@ class JDHbasePipeline(object):
                 "comment:default_good_count": str(item["default_good_count"]),
                 "comment:good_rate": str(item["good_rate"])
             })
+        elif isinstance(item, ProductSpecItem):
+            self.product_table.put(item["id"],
+                {
+                    "spec:cpu": str(item["cpu"]),
+                    "spec:rom": str(item['rom']),
+                    "spec:ram": str(item["ram"]),
+                    "spec:resolution": str(item["resolution"]),
+                    "spec:charge": str(item["charge"]),
+                    "spec:weight": str(item["weight"]),
+                    "spec:brand": str(item["brand"]),
+                }
+            )
         return item
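The new branch writes into a spec column family, so the underlying HBase table must have that family before JDHbasePipeline can store specs. A minimal happybase sketch for a one-off setup (the connection host, the table name jd_products and the extra family name are assumptions; only spec is implied by this diff):

    # sketch: one-off HBase table setup with happybase (names are assumptions)
    import happybase

    conn = happybase.Connection('localhost')
    if b'jd_products' not in conn.tables():
        conn.create_table('jd_products', {
            'spec': dict(),      # family used by the new ProductSpecItem branch
            'comment': dict(),   # family used by the existing comment branch
        })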

taobao/spiders/… (JdSpider)

@@ -7,7 +7,7 @@ from scrapy import Request
 from scrapy.conf import settings
 from scrapy_splash import SplashRequest
 import pdb
-from taobao.items import JDProductItem, JDCommentItem, JDCommentSummary
+from taobao.items import JDProductItem, JDCommentItem, JDCommentSummary, ProductSpecItem
 import re
 from taobao.utils import now_time
@@ -21,11 +21,19 @@ class JdSpider(scrapy.Spider):
     ]
 
     custom_settings = {
-        'LOG_LEVEL': "WARN",
+        'LOG_LEVEL': "INFO",
         'ITEM_PIPELINES': {
            'taobao.pipelines.JDCleanDataPipeline': 300,
-           'taobao.pipelines.JDHbasePipeline': 400,
-        }
+           #'taobao.pipelines.JDHbasePipeline': 400,
+        },
+        'DOWNLOADER_MIDDLEWARES': {
+            'scrapy.downloadermiddlewares.retry.RetryMiddleware': 90,
+            #'taobao.middlewares.RandomProxyMiddleware': 100,
+            # 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 110,
+            'scrapy_splash.SplashCookiesMiddleware': 723,
+            'scrapy_splash.SplashMiddleware': 725,
+            'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
+        },
     }
 
     def make_comment_url(self, product_id, page):
@@ -38,10 +46,34 @@
         for url in self.start_urls:
             yield SplashRequest(url,
                                 self.parse,
+                                method='GET',
                                 args={})
 
+    def parse_spec(self, response):
+        #spec
+        pdb.set_trace()
+        cpu = response.xpath('//dt[text()="CPU型号"]/following-sibling::dd/text()').extract_first()
+        rom = response.xpath('//dt[text()="ROM"]/following-sibling::dd[2]/text()').extract_first()
+        ram = response.xpath('//dt[text()="RAM"]/following-sibling::dd[2]/text()').extract_first()
+        resolution = response.xpath('//dt[text()="分辨率"]/following-sibling::dd/text()').extract_first()
+        charge = response.xpath('//dt[text()="电池容量mAh"]/following-sibling::dd/text()').extract_first()
+        weight = response.xpath('//dt[text()="机身重量g"]/following-sibling::dd/text()').extract_first()
+        brand = response.xpath('//dt[text()="品牌"]/following-sibling::dd/text()').extract_first()
+        print(cpu, rom, ram, resolution, charge, weight, brand)
+        yield ProductSpecItem(
+            cpu = cpu,
+            rom = rom,
+            ram = ram,
+            resolution = resolution,
+            charge = charge,
+            weight = weight,
+            brand = brand,
+        )
+
     def parse_comment(self, response):
+        # pdb.set_trace()
         product_id = response.meta["product_id"]
         page = response.meta["page"]
         print("抓取评论, %s, 当前 %d" % (product_id, page))
@@ -128,9 +160,12 @@
         if m:
             product_id = m.group(1)
             comment_url = self.make_comment_url(product_id, 0)
-            yield Request(comment_url, self.parse_comment,
-                          meta={'product_id': product_id, 'page': 0},
-                          priority=100)
+            # yield Request(comment_url, self.parse_comment,
+            #               meta={'product_id': product_id, 'page': 0},
+            #               priority=100)
+
+            if url:
+                yield SplashRequest(url='https:'+url, callback=self.parse_spec)
 
         # 获取下一页
         next_page = response.xpath('//a[@class="pn-next"]/@href').extract_first()
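Two practical notes on parse_spec as committed: the pdb.set_trace() at the top of the method pauses the crawl at every spec page until a debugger prompt is answered, and extract_first() returns None whenever a <dt> label is missing, which the HBase pipeline would then store as the literal string "None". A small sketch of a more defensive lookup (the helper name and default are assumptions, not part of the commit):

    # sketch: hypothetical helper for the simple one-<dd> case
    def first_text(response, label, default=''):
        # first <dd> text following the <dt> whose text equals `label`
        return response.xpath(
            '//dt[text()="%s"]/following-sibling::dd/text()' % label
        ).extract_first(default=default)

    # e.g. cpu = first_text(response, 'CPU型号')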

chapter4/check_proxy.py (new file, 31 lines)

@@ -0,0 +1,31 @@
+import requests
+import time
+import re
+
+
+def check_alive_proxy(ip, port):
+    begin_time = int(time.time())
+    proxies = {
+        "http": "http://%s:%s" % (ip, port),
+        "https": "https://%s:%s" % (ip, port),
+    }
+    response = requests.get(
+        'http://2019.ip138.com/ic.asp', proxies=proxies, timeout=3)
+    check_time = int(time.time()) - begin_time
+    response.encoding = 'gb2312'
+    m = re.search(r'.+\[((\d+\.){3}\d+)\].+', response.text)
+    if m:
+        if m.group(1) == ip:
+            return check_time
+    raise RuntimeError("连接出错")
+
+
+if __name__ == "__main__":
+    with open('results.csv') as f:
+        for line in f:
+            try:
+                check_time = check_alive_proxy(line.strip(), 9999)
+                if check_time < 5:
+                    print("%s:%d %d" % (line.strip(), 9999, check_time))
+            except Exception as e:
+                print(e)
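check_proxy.py reads results.csv (the file added to .gitignore above as chapter4/results.csv) from the working directory, treats each stripped line as a proxy IP, probes it on port 9999 and prints the ones that answer within five seconds. It also defines its own check_alive_proxy, the same name that pipelines.py imports from taobao.utils. A sketch of reusing the checker from other code (the address is illustrative):

    # sketch: calling the checker programmatically (illustrative address)
    from check_proxy import check_alive_proxy

    try:
        latency = check_alive_proxy('203.0.113.5', 9999)
        print('alive, responded in %d s' % latency)
    except Exception as exc:
        print('dead or unreachable: %s' % exc)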