Data scraping
parent f8fe624d77
commit a19364e304
@@ -19,6 +19,7 @@ class ProductItem(scrapy.Item):
    title = scrapy.Field()
    price = scrapy.Field()
    author = scrapy.Field()
    shop = scrapy.Field()
    pubtime = scrapy.Field()
    publisher = scrapy.Field()
    url = scrapy.Field()
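For reference, a scrapy.Item behaves like a dict restricted to its declared fields; the JSON pipeline further down relies on this when it calls dict(item). A minimal sketch with a cut-down, hypothetical item (not the project's full ProductItem):

import scrapy

class ProductItemSketch(scrapy.Item):
    # Only declared fields are legal keys on the item.
    title = scrapy.Field()
    price = scrapy.Field()
    url = scrapy.Field()

item = ProductItemSketch(title='Example book', price='59.00', url='http://example.com/p/1')
# item['shop'] = 'x'   # would raise KeyError: the field is not declared on this sketch
print(dict(item))       # plain dict, which is what json.dumps() in the pipeline needs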
@@ -125,7 +125,7 @@ class ZhihuChromeMiddleware(object):

    def __init__(self):
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')
        # options.add_argument('--headless')
        options.add_argument('--window-size=1366,768')
        options.add_argument('--blink-settings=imagesEnabled=false')
        SERVICE_ARGS = ['--load-images=false', '--disk-cache=true', '--ignore-ssl-errors=true']
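This hunk only touches __init__; in a Selenium-backed downloader middleware like this one, the page fetch usually happens in process_request. A hedged sketch of that method (assumed, not shown in this commit):

from scrapy.http import HtmlResponse

class SeleniumFetchSketch(object):
    # Hypothetical downloader-middleware method; assumes self.driver was
    # created in __init__ the same way as in the hunk above.
    def process_request(self, request, spider):
        self.driver.get(request.url)
        # Returning an HtmlResponse here short-circuits Scrapy's own
        # downloader, so the spider parses the Selenium-rendered page.
        return HtmlResponse(url=self.driver.current_url,
                            body=self.driver.page_source,
                            encoding='utf-8',
                            request=request)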
File diff suppressed because it is too large
@@ -4,13 +4,27 @@
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy.exceptions import DropItem
import json
import pdb

from scrapy.pipelines.images import ImagesPipeline


class MyImagesPipeline(ImagesPipeline):
    pass


class ZhihuPipeline(object):
    def process_item(self, item, spider):
        return item


class JsonWithEncodingCnblogsPipeline(object):
    def open_spider(self, spider):
        self.file = open('../../datas/%s.json' % spider.keyword, 'w', encoding='utf-8')
    def process_item(self, item, spider):
        line = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(line)
        return item
    def spider_closed(self, spider):
        self.file.close()
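A note on JsonWithEncodingCnblogsPipeline: Scrapy calls open_spider, process_item and close_spider on item pipelines; a method named spider_closed is only invoked if it is explicitly connected to the spider_closed signal, so as committed the output file is never closed. A sketch of the two usual fixes (only the closing hooks are shown):

from scrapy import signals

class JsonWithEncodingCnblogsPipeline(object):
    # ... open_spider / process_item as in the diff above ...

    # Option 1: Scrapy calls this name automatically when the spider finishes.
    def close_spider(self, spider):
        self.file.close()

    # Option 2: keep the spider_closed name but wire it to the signal.
    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_closed, signal=signals.spider_closed)
        return pipeline

    def spider_closed(self, spider):
        self.file.close()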
@@ -1,10 +1,10 @@
# -*- coding: utf-8 -*-

# //div[@class="con shoplist"]//div[@id="search_nature_rg"]//ul/li//p[@class="name"]/a/@title

import pdb

import scrapy
import zhihu.items
import taobao.items
from scrapy import Request
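The commented XPath at the top of the file points at Dangdang's result list; parse presumably iterates the li nodes under search_nature_rg and extracts one field per product, roughly as in this hypothetical sketch (selectors beyond that comment are assumptions, and the next hunk shows the real yield):

import scrapy

class DangdangListSketch(scrapy.Spider):
    # Illustration only; not the project's actual DangdangSpider.parse().
    name = 'dangdang_list_sketch'

    def parse(self, response):
        # Each li under the result list is assumed to be one product card.
        for product in response.xpath('//div[@class="con shoplist"]//div[@id="search_nature_rg"]//ul/li'):
            yield {
                'title': product.xpath('p[@class="name"]/a/@title').extract_first(),
                'url': product.xpath('p[@class="name"]/a/@href').extract_first(),
            }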
@@ -49,7 +49,7 @@ class DangdangSpider(scrapy.Spider):
            url = self.safe_fetch(product.xpath('p[@class="name"]/a/@href'))
            comment_num = self.safe_fetch(product.xpath('p[@class="search_star_line"]//a[@class="search_comment_num"]/text()'))

            yield zhihu.items.ProductItem(title=title,
            yield taobao.items.ProductItem(title=title,
                                          img=img,
                                          price=price,
                                          author=author,
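safe_fetch is not defined anywhere in this diff; helpers with that name usually just guard extract_first() against empty selections. A hypothetical version (the real method on DangdangSpider may differ):

class DangdangSpiderHelpers(object):
    # Hypothetical stand-in for the safe_fetch used in the hunk above.
    def safe_fetch(self, selector, default=''):
        # Return a stripped string, or the default when the XPath matched nothing.
        value = selector.extract_first()
        return value.strip() if value else default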
@@ -1,124 +0,0 @@
# -*- coding: utf-8 -*-
import time

import scrapy

from scrapy import Request
from scrapy.http import HtmlResponse
from selenium.common.exceptions import TimeoutException
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
import pdb


class MeituSpider(scrapy.Spider):
    name = 'meitu'
    allowed_domains = ['zhihu.com']
    main_url = 'https://www.zhihu.com/collection/78172986'

    start_urls = [main_url]
    cookies = {'z_c0': '2|1:0|10:1543151267|4:z_c0|92:Mi4xcFpjMkFBQUFBQUFBWUdnWG5naDZEaWNBQUFDRUFsVk5veXNpWEFELWM4WHU0b3c3anJvR3hobzhvMEpIR004b2xB|aada1968c0f5284002a0bbb552b95af6e87cf3848acc15adfe6d91ed4d9151d8'}

    custom_settings = {
        'LOG_LEVEL': "INFO"
    }

    CHROME_DRIVER_PATH = r'/Users/guange/.bin/chromedriver'

    def __init__(self):
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')
        options.add_argument('--window-size=1366,768')
        options.add_argument('--blink-settings=imagesEnabled=false')
        SERVICE_ARGS = ['--load-images=false', '--disk-cache=true', '--ignore-ssl-errors=true']
        self.driver = webdriver.Chrome(chrome_options=options, service_args=SERVICE_ARGS, executable_path=self.CHROME_DRIVER_PATH)
        self.driver.set_page_load_timeout(30)
        self.wait = WebDriverWait(self.driver, 10)

    def closed(self, spider):
        print("spider closed")
        self.driver.close()

    def start_requests(self):
        print("start_requests ")
        for url in self.start_urls:
            print('start request %s' % url)
            yield Request(url, cookies=self.cookies)

    def scroll_until_loaded(self):
        check_height = self.driver.execute_script("return document.body.scrollHeight;")
        print(check_height)
        time.sleep(1)
        while True:
            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            try:
                self.wait.until(
                    lambda driver: self.driver.execute_script("return document.body.scrollHeight;") > check_height)
                check_height = self.driver.execute_script("return document.body.scrollHeight;")
                print(check_height)
            except TimeoutException:
                break

        # pdb.set_trace()
        return HtmlResponse(url=self.driver.current_url, body=self.driver.page_source, encoding="utf-8")

    def scroll_until_loaded1(self):
        check_height = self.driver.execute_script("return document.body.scrollHeight;")
        print(check_height)
        time.sleep(1)
        # pdb.set_trace()
        scrollH = 0
        while True:
            scrollH += 100
            self.driver.execute_script("window.scrollTo(0, %d);" % scrollH)
            time.sleep(0.1)

            if scrollH > self.driver.execute_script("return document.body.scrollHeight;"):
                break
            # try:
            #     self.wait.until(
            #         lambda driver: self.driver.execute_script("return document.body.scrollHeight;") > check_height)
            #     check_height = self.driver.execute_script("return document.body.scrollHeight;")
            #     print(check_height)
            # except TimeoutException:
            #     break

        # pdb.set_trace()
        return HtmlResponse(url=self.driver.current_url, body=self.driver.page_source, encoding="utf-8")

    def parse_question(self, response):
        print(response.url)
        self.driver.get('https://www.zhihu.com')

        self.driver.add_cookie({'name': 'z_c0', 'value': self.cookies['z_c0'], 'domain': '.zhihu.com', 'path': '/'})
        self.driver.get(response.url)
        response = self.scroll_until_loaded()

        imgs = response.xpath('//div[@class="Question-main"]//div[@id="QuestionAnswers-answers"]//div[@class="List-item"]//div[@class="ContentItem AnswerItem"]//div[@class="RichContent-inner"]//div[@class="VagueImage origin_image zh-lightbox-thumb"]/@data-src').extract()

        hrefs = [img for img in imgs if img is not None and img.startswith('http')]
        print(hrefs)
        yield {"image_urls": hrefs}

        authors = response.xpath('//div[@class="Question-main"]//div[@id="QuestionAnswers-answers"]//div[@class="List-item"]//div[@class="ContentItem AnswerItem"]//div[@class="AuthorInfo-head"]//a[@class="UserLink-link"]/text()').extract()
        print(authors)

    def parse(self, response):
        self.logger.warn(response.url)
        titles = response.xpath('//div[@class="zu-main-content"]//div[@class="zm-item"]//h2[@class="zm-item-title"]/a/text()').extract()
        print(titles)

        hrefs = response.xpath(
            '//div[@class="zu-main-content"]//div[@class="zm-item"]//h2[@class="zm-item-title"]/a/@href').extract()
        print(hrefs)

        yield Request('https://www.zhihu.com' + hrefs[5], callback=self.parse_question, cookies=self.cookies)
        #for href in hrefs:
        #    yield Request('https://www.zhihu.com' + href, callback=self.parse_question, cookies=self.cookies)

        next_href = response.xpath('//div[@class="border-pager"]//div[@class="zm-invite-pager"]//span/a[text()="下一页"]/@href').extract_first()
        print(next_href)

        # if next_href:
        #     yield Request('https://www.zhihu.com/collection/78172986' + next_href)
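The dict yielded from parse_question, {"image_urls": hrefs}, is the shape that ImagesPipeline subclasses such as MyImagesPipeline expect: every URL listed under image_urls is downloaded and the results are recorded under an images key. That only happens if the pipeline is enabled and given a storage location; a minimal settings sketch (module path and directory are placeholders for this project's actual values):

# settings.py sketch
ITEM_PIPELINES = {
    'zhihu.pipelines.MyImagesPipeline': 1,
}
IMAGES_STORE = '/tmp/meitu_images'   # directory where downloaded images are written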
@@ -1,19 +0,0 @@
# -*- coding: utf-8 -*-
import scrapy


class PornSpider(scrapy.Spider):
    name = 'porn'
    allowed_domains = ['www.baidu.com']
    start_urls = ['https://www.baidu.com/']

    custom_settings = {
        'LOG_LEVEL': "INFO",
        'SPIDER_MIDDLEWARES': {
            'zhihu.middlewares.ZhihuSpiderMiddleware': 543,
        }
    }

    def parse(self, response):
        print("parse response")
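ZhihuSpiderMiddleware, referenced by that per-spider SPIDER_MIDDLEWARES entry, is not part of this diff. For orientation, a spider middleware generally implements some of the hooks below; this is a generic skeleton, not the project's actual class:

class ZhihuSpiderMiddlewareSketch(object):
    # Skeleton of the hooks Scrapy calls on a spider middleware; the real
    # class in zhihu/middlewares.py may implement more or fewer of them.
    def process_spider_input(self, response, spider):
        return None                      # let the response continue to the spider

    def process_spider_output(self, response, result, spider):
        for item_or_request in result:   # pass items/requests through unchanged
            yield item_or_request

    def process_start_requests(self, start_requests, spider):
        for request in start_requests:
            yield request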
File diff suppressed because it is too large