Data scraping

guange 2019-01-08 16:37:37 +08:00
parent f8fe624d77
commit a19364e304
8 changed files with 20 additions and 32171 deletions

View File

@@ -19,6 +19,7 @@ class ProductItem(scrapy.Item):
     title = scrapy.Field()
     price = scrapy.Field()
     author = scrapy.Field()
+    shop = scrapy.Field()
     pubtime = scrapy.Field()
     publisher = scrapy.Field()
     url = scrapy.Field()
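Note: the shop field is the single line this hunk adds; the surrounding fields are unchanged context. A minimal sketch of building a ProductItem with the new field — field names come from the diff, while img is assumed from the dangdang spider's yield further below, and all values here are placeholders:

# Sketch: constructing a ProductItem including the newly added shop field.
import scrapy

class ProductItem(scrapy.Item):
    img = scrapy.Field()       # assumed: the dangdang spider below passes img=
    title = scrapy.Field()
    price = scrapy.Field()
    author = scrapy.Field()
    shop = scrapy.Field()      # the field this commit adds
    pubtime = scrapy.Field()
    publisher = scrapy.Field()
    url = scrapy.Field()

item = ProductItem(title='Example Book', price='42.00', author='Jane Doe',
                   shop='Example Flagship Store', pubtime='2019-01',
                   publisher='Example Press', url='https://example.com/book/1')
print(dict(item))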

View File

@@ -125,7 +125,7 @@ class ZhihuChromeMiddleware(object):
     def __init__(self):
         options = webdriver.ChromeOptions()
-        options.add_argument('--headless')
+        # options.add_argument('--headless')
         options.add_argument('--window-size=1366,768')
         options.add_argument('--blink-settings=imagesEnabled=false')
         SERVICE_ARGS = ['--load-images=false', '--disk-cache=true', '--ignore-ssl-errors=true']
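Note: the only change here comments out --headless, so the middleware now launches a visible Chrome window — handy when debugging sites that behave differently under headless mode. A standalone sketch of the same options setup, using the selenium 3 style API seen in this project (assumes chromedriver is on PATH):

# Sketch: the middleware's ChromeOptions, runnable on its own for debugging.
from selenium import webdriver

options = webdriver.ChromeOptions()
# options.add_argument('--headless')               # toggle to go back to headless
options.add_argument('--window-size=1366,768')
options.add_argument('--blink-settings=imagesEnabled=false')  # skip images to speed up loads
driver = webdriver.Chrome(chrome_options=options)  # selenium 3 signature, as in this repo
driver.get('https://www.zhihu.com')
print(driver.title)
driver.quit()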

File diff suppressed because it is too large

View File

@@ -4,13 +4,27 @@
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
+from scrapy.exceptions import DropItem
+import json
+import pdb
+from scrapy.pipelines.images import ImagesPipeline
+
+
+class MyImagesPipeline(ImagesPipeline):
+    pass
+
+
 class ZhihuPipeline(object):
     def process_item(self, item, spider):
         return item
+
+
+class JsonWithEncodingCnblogsPipeline(object):
+    def open_spider(self, spider):
+        self.file = open('../../datas/%s.json' % spider.keyword, 'w', encoding='utf-8')
+
+    def process_item(self, item, spider):
+        line = json.dumps(dict(item), ensure_ascii=False) + "\n"
+        self.file.write(line)
+        return item
+
+    def close_spider(self, spider):  # Scrapy calls close_spider on pipelines, not spider_closed
+        self.file.close()
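Note: JsonWithEncodingCnblogsPipeline writes one JSON object per line and names the output file after spider.keyword, so it only works for spiders that define that attribute (e.g. scrapy crawl dangdang -a keyword=python, since -a sets spider attributes). The pipelines also need to be registered in settings.py; a sketch, assuming the project package is named zhihu as in the middleware paths below (the priority numbers are arbitrary, and the paths are not shown in the commit):

# settings.py sketch: enable the pipelines defined above.
ITEM_PIPELINES = {
    'zhihu.pipelines.MyImagesPipeline': 1,
    'zhihu.pipelines.JsonWithEncodingCnblogsPipeline': 300,
}
IMAGES_STORE = '../../images'   # required by ImagesPipeline; placeholder path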

View File

@@ -1,10 +1,10 @@
# -*- coding: utf-8 -*-
# //div[@class="con shoplist"]//div[@id="search_nature_rg"]//ul/li//p[@class="name"]/a/@title
import pdb
import scrapy
-import zhihu.items
+import taobao.items
from scrapy import Request
@@ -49,7 +49,7 @@ class DangdangSpider(scrapy.Spider):
         url = self.safe_fetch(product.xpath('p[@class="name"]/a/@href'))
         comment_num = self.safe_fetch(product.xpath('p[@class="search_star_line"]//a[@class="search_comment_num"]/text()'))
-        yield zhihu.items.ProductItem(title=title,
+        yield taobao.items.ProductItem(title=title,
                                       img=img,
                                       price=price,
                                       author=author,
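Note: safe_fetch is called here but never shown in the diff. A hypothetical reconstruction consistent with these call sites — a method on the spider that takes a SelectorList and returns the first match or an empty string (this is a guess, not the author's code):

# Hypothetical helper, not part of the diff: a null-safe extract_first().
def safe_fetch(self, selector):
    value = selector.extract_first()
    return value.strip() if value else ''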

View File

@@ -1,124 +0,0 @@
# -*- coding: utf-8 -*-
import time
import scrapy
from scrapy import Request
from scrapy.http import HtmlResponse
from selenium.common.exceptions import TimeoutException
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
import pdb


class MeituSpider(scrapy.Spider):
    name = 'meitu'
    allowed_domains = ['zhihu.com']
    main_url = 'https://www.zhihu.com/collection/78172986'
    start_urls = [main_url]
    cookies = {'z_c0': '2|1:0|10:1543151267|4:z_c0|92:Mi4xcFpjMkFBQUFBQUFBWUdnWG5naDZEaWNBQUFDRUFsVk5veXNpWEFELWM4WHU0b3c3anJvR3hobzhvMEpIR004b2xB|aada1968c0f5284002a0bbb552b95af6e87cf3848acc15adfe6d91ed4d9151d8'}
    custom_settings = {
        'LOG_LEVEL': "INFO"
    }
    CHROME_DRIVER_PATH = r'/Users/guange/.bin/chromedriver'

    def __init__(self):
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')
        options.add_argument('--window-size=1366,768')
        options.add_argument('--blink-settings=imagesEnabled=false')
        SERVICE_ARGS = ['--load-images=false', '--disk-cache=true', '--ignore-ssl-errors=true']
        self.driver = webdriver.Chrome(chrome_options=options, service_args=SERVICE_ARGS, executable_path=self.CHROME_DRIVER_PATH)
        self.driver.set_page_load_timeout(30)
        self.wait = WebDriverWait(self.driver, 10)

    def closed(self, reason):
        # Scrapy passes the close reason (not the spider) to Spider.closed()
        print("spider closed")
        self.driver.close()

    def start_requests(self):
        print("start_requests")
        for url in self.start_urls:
            print('start request %s' % url)
            yield Request(url, cookies=self.cookies)

    def scroll_until_loaded(self):
        # Scroll to the bottom until document.body.scrollHeight stops growing.
        check_height = self.driver.execute_script("return document.body.scrollHeight;")
        print(check_height)
        time.sleep(1)
        while True:
            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            try:
                self.wait.until(
                    lambda driver: self.driver.execute_script("return document.body.scrollHeight;") > check_height)
                check_height = self.driver.execute_script("return document.body.scrollHeight;")
                print(check_height)
            except TimeoutException:
                break
        # pdb.set_trace()
        return HtmlResponse(url=self.driver.current_url, body=self.driver.page_source, encoding="utf-8")

    def scroll_until_loaded1(self):
        # Variant: scroll down in fixed 100px steps instead of waiting on height changes.
        check_height = self.driver.execute_script("return document.body.scrollHeight;")
        print(check_height)
        time.sleep(1)
        # pdb.set_trace()
        scrollH = 0
        while True:
            scrollH += 100
            self.driver.execute_script("window.scrollTo(0, %d);" % scrollH)
            time.sleep(0.1)
            if scrollH > self.driver.execute_script("return document.body.scrollHeight;"):
                break
        # try:
        #     self.wait.until(
        #         lambda driver: self.driver.execute_script("return document.body.scrollHeight;") > check_height)
        #     check_height = self.driver.execute_script("return document.body.scrollHeight;")
        #     print(check_height)
        # except TimeoutException:
        #     break
        # pdb.set_trace()
        return HtmlResponse(url=self.driver.current_url, body=self.driver.page_source, encoding="utf-8")

    def parse_question(self, response):
        print(response.url)
        # Visit the domain first so add_cookie() may set a zhihu.com cookie.
        self.driver.get('https://www.zhihu.com')
        self.driver.add_cookie({'name': 'z_c0', 'value': self.cookies['z_c0'], 'domain': '.zhihu.com', 'path': '/'})
        self.driver.get(response.url)
        response = self.scroll_until_loaded()
        imgs = response.xpath('//div[@class="Question-main"]//div[@id="QuestionAnswers-answers"]//div[@class="List-item"]//div[@class="ContentItem AnswerItem"]//div[@class="RichContent-inner"]//div[@class="VagueImage origin_image zh-lightbox-thumb"]/@data-src').extract()
        hrefs = [img for img in imgs if img is not None and img.startswith('http')]
        print(hrefs)
        yield {"image_urls": hrefs}
        authors = response.xpath('//div[@class="Question-main"]//div[@id="QuestionAnswers-answers"]//div[@class="List-item"]//div[@class="ContentItem AnswerItem"]//div[@class="AuthorInfo-head"]//a[@class="UserLink-link"]/text()').extract()
        print(authors)

    def parse(self, response):
        self.logger.warn(response.url)
        titles = response.xpath('//div[@class="zu-main-content"]//div[@class="zm-item"]//h2[@class="zm-item-title"]/a/text()').extract()
        print(titles)
        hrefs = response.xpath(
            '//div[@class="zu-main-content"]//div[@class="zm-item"]//h2[@class="zm-item-title"]/a/@href').extract()
        print(hrefs)
        yield Request('https://www.zhihu.com' + hrefs[5], callback=self.parse_question, cookies=self.cookies)
        # for href in hrefs:
        #     yield Request('https://www.zhihu.com' + href, callback=self.parse_question, cookies=self.cookies)
        next_href = response.xpath('//div[@class="border-pager"]//div[@class="zm-invite-pager"]//span/a[text()="下一页"]/@href').extract_first()
        print(next_href)
        # if next_href:
        #     yield Request('https://www.zhihu.com/collection/78172986' + next_href)
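Note: the heart of this deleted spider is scroll_until_loaded — scroll to the bottom, wait up to 10 seconds for document.body.scrollHeight to grow, stop once it stabilizes, then wrap driver.page_source in an HtmlResponse so ordinary XPath extraction works. The same pattern reduced to a standalone sketch (driver is any Selenium WebDriver):

# Sketch of the infinite-scroll pattern used above.
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.wait import WebDriverWait

def scroll_to_end(driver, timeout=10):
    wait = WebDriverWait(driver, timeout)
    height = driver.execute_script("return document.body.scrollHeight;")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        try:
            # Block until lazy-loaded content makes the page taller.
            wait.until(lambda d: d.execute_script("return document.body.scrollHeight;") > height)
            height = driver.execute_script("return document.body.scrollHeight;")
        except TimeoutException:
            break   # height stopped growing: nothing more to load
    return driver.page_source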

View File

@@ -1,19 +0,0 @@
# -*- coding: utf-8 -*-
import scrapy


class PornSpider(scrapy.Spider):
    name = 'porn'
    allowed_domains = ['www.baidu.com']
    start_urls = ['https://www.baidu.com/']
    custom_settings = {
        'LOG_LEVEL': "INFO",
        'SPIDER_MIDDLEWARES': {
            'zhihu.middlewares.ZhihuSpiderMiddleware': 543,
        }
    }

    def parse(self, response):
        print("parse response")

File diff suppressed because it is too large