数据抓取 (data scraping)
parent f8fe624d77
commit a19364e304
@@ -19,6 +19,7 @@ class ProductItem(scrapy.Item):
     title = scrapy.Field()
     price = scrapy.Field()
     author = scrapy.Field()
+    shop = scrapy.Field()
     pubtime = scrapy.Field()
     publisher = scrapy.Field()
     url = scrapy.Field()
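For context: after this hunk the item class presumably reads roughly as in the sketch below. This is a reconstruction from the visible context lines, not the exact file; the img field is only inferred from the img=... argument in the Dangdang spider's yield further down.

# Sketch: reconstruction of ProductItem as it likely looks after this commit.
import scrapy


class ProductItem(scrapy.Item):
    title = scrapy.Field()
    img = scrapy.Field()        # inferred: the spider below passes img=...
    price = scrapy.Field()
    author = scrapy.Field()
    shop = scrapy.Field()       # the field added by this commit
    pubtime = scrapy.Field()
    publisher = scrapy.Field()
    url = scrapy.Field()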
@@ -125,7 +125,7 @@ class ZhihuChromeMiddleware(object):
 
     def __init__(self):
         options = webdriver.ChromeOptions()
-        options.add_argument('--headless')
+        # options.add_argument('--headless')
         options.add_argument('--window-size=1366,768')
         options.add_argument('--blink-settings=imagesEnabled=false')
         SERVICE_ARGS = ['--load-images=false', '--disk-cache=true', '--ignore-ssl-errors=true']
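Commenting out --headless means this middleware now launches a visible Chrome window, which is handy for debugging or manual logins but fails on display-less servers. A minimal sketch of making the flag switchable instead of editing the code each time; the HEADLESS environment variable is a name chosen here for illustration, not something the project defines.

# Sketch: build ChromeOptions with headless controlled by an env var (assumption).
import os

from selenium import webdriver


def build_chrome_options():
    options = webdriver.ChromeOptions()
    if os.environ.get('HEADLESS', '0') == '1':
        options.add_argument('--headless')          # only when HEADLESS=1
    options.add_argument('--window-size=1366,768')
    options.add_argument('--blink-settings=imagesEnabled=false')
    return options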
File diff suppressed because it is too large
@@ -4,13 +4,27 @@
 #
 # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
-from scrapy.exceptions import DropItem
+import json
+import pdb
 
 from scrapy.pipelines.images import ImagesPipeline
 
 
 class MyImagesPipeline(ImagesPipeline):
     pass
 
 
 class ZhihuPipeline(object):
     def process_item(self, item, spider):
         return item
 
 
+class JsonWithEncodingCnblogsPipeline(object):
+    def open_spider(self, spider):
+        self.file = open('../../datas/%s.json' % spider.keyword, 'w', encoding='utf-8')
+
+    def process_item(self, item, spider):
+        line = json.dumps(dict(item), ensure_ascii=False) + "\n"
+        self.file.write(line)
+        return item
+
+    def spider_closed(self, spider):
+        self.file.close()
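A wiring note that is not part of this diff: the new pipeline only runs once it is listed in ITEM_PIPELINES, and Scrapy's item-pipeline hooks are open_spider / process_item / close_spider, so a method named spider_closed is not called automatically (the file would stay open) unless it is renamed to close_spider or connected to the spider_closed signal in from_crawler. A minimal sketch under those assumptions, with the module path guessed:

# settings.py sketch -- 'zhihu.pipelines' is a guessed module path; use the real package name.
ITEM_PIPELINES = {
    'zhihu.pipelines.JsonWithEncodingCnblogsPipeline': 300,
}

# Equivalent pipeline using the hook names Scrapy invokes directly.
import json


class JsonWithEncodingCnblogsPipeline(object):
    def open_spider(self, spider):
        # spider.keyword is assumed to exist, as in the diff above
        self.file = open('../../datas/%s.json' % spider.keyword, 'w', encoding='utf-8')

    def process_item(self, item, spider):
        self.file.write(json.dumps(dict(item), ensure_ascii=False) + "\n")
        return item

    def close_spider(self, spider):     # called by Scrapy when the spider finishes
        self.file.close()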
@@ -1,10 +1,10 @@
 # -*- coding: utf-8 -*-
 
 # //div[@class="con shoplist"]//div[@id="search_nature_rg"]//ul/li//p[@class="name"]/a/@title
+import pdb
 
 import scrapy
-import zhihu.items
+import taobao.items
 from scrapy import Request
 
 
@@ -49,7 +49,7 @@ class DangdangSpider(scrapy.Spider):
             url = self.safe_fetch(product.xpath('p[@class="name"]/a/@href'))
             comment_num = self.safe_fetch(product.xpath('p[@class="search_star_line"]//a[@class="search_comment_num"]/text()'))
 
-            yield zhihu.items.ProductItem(title=title,
+            yield taobao.items.ProductItem(title=title,
                                           img=img,
                                           price=price,
                                           author=author,
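safe_fetch is defined elsewhere in the spider and does not appear in this diff; judging from how it is called, it presumably wraps extract_first() with a safe default. A hypothetical sketch only, not the project's actual helper:

# Hypothetical reconstruction of the spider's safe_fetch method.
def safe_fetch(self, selector, default=''):
    value = selector.extract_first()            # None when the XPath matches nothing
    return value.strip() if value else default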
@@ -1,124 +0,0 @@
-# -*- coding: utf-8 -*-
-import time
-
-import scrapy
-
-from scrapy import Request
-from scrapy.http import HtmlResponse
-from selenium.common.exceptions import TimeoutException
-from selenium import webdriver
-from selenium.webdriver.support.wait import WebDriverWait
-import pdb
-
-class MeituSpider(scrapy.Spider):
-    name = 'meitu'
-    allowed_domains = ['zhihu.com']
-    main_url = 'https://www.zhihu.com/collection/78172986'
-
-    start_urls = [main_url]
-    cookies = {'z_c0': '2|1:0|10:1543151267|4:z_c0|92:Mi4xcFpjMkFBQUFBQUFBWUdnWG5naDZEaWNBQUFDRUFsVk5veXNpWEFELWM4WHU0b3c3anJvR3hobzhvMEpIR004b2xB|aada1968c0f5284002a0bbb552b95af6e87cf3848acc15adfe6d91ed4d9151d8'}
-
-    custom_settings = {
-        'LOG_LEVEL': "INFO"
-    }
-
-    CHROME_DRIVER_PATH = r'/Users/guange/.bin/chromedriver'
-
-    def __init__(self):
-        options = webdriver.ChromeOptions()
-        options.add_argument('--headless')
-        options.add_argument('--window-size=1366,768')
-        options.add_argument('--blink-settings=imagesEnabled=false')
-        SERVICE_ARGS = ['--load-images=false', '--disk-cache=true', '--ignore-ssl-errors=true']
-        self.driver = webdriver.Chrome(chrome_options=options, service_args=SERVICE_ARGS, executable_path=self.CHROME_DRIVER_PATH)
-        self.driver.set_page_load_timeout(30)
-        self.wait = WebDriverWait(self.driver, 10)
-
-    def closed(self, spider):
-        print("spider closed")
-        self.driver.close()
-
-    def start_requests(self):
-        print("start_requests ")
-        for url in self.start_urls:
-            print('start request %s' % url)
-            yield Request(url, cookies=self.cookies)
-
-    def scroll_until_loaded(self):
-        check_height = self.driver.execute_script("return document.body.scrollHeight;")
-        print(check_height)
-        time.sleep(1)
-        while True:
-            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
-            try:
-                self.wait.until(
-                    lambda driver: self.driver.execute_script("return document.body.scrollHeight;") > check_height)
-                check_height = self.driver.execute_script("return document.body.scrollHeight;")
-                print(check_height)
-            except TimeoutException:
-                break
-
-        # pdb.set_trace()
-        return HtmlResponse(url=self.driver.current_url, body=self.driver.page_source, encoding="utf-8")
-
-    def scroll_until_loaded1(self):
-        check_height = self.driver.execute_script("return document.body.scrollHeight;")
-        print(check_height)
-        time.sleep(1)
-        # pdb.set_trace()
-        scrollH = 0
-        while True:
-            scrollH += 100
-            self.driver.execute_script("window.scrollTo(0, %d);" % scrollH)
-            time.sleep(0.1)
-
-            if scrollH > self.driver.execute_script("return document.body.scrollHeight;"):
-                break
-            # try:
-            #     self.wait.until(
-            #         lambda driver: self.driver.execute_script("return document.body.scrollHeight;") > check_height)
-            #     check_height = self.driver.execute_script("return document.body.scrollHeight;")
-            #     print(check_height)
-            # except TimeoutException:
-            #     break
-
-        # pdb.set_trace()
-        return HtmlResponse(url=self.driver.current_url, body=self.driver.page_source, encoding="utf-8")
-
-
-    def parse_question(self, response):
-        print(response.url)
-        self.driver.get('https://www.zhihu.com')
-
-        self.driver.add_cookie({'name': 'z_c0', 'value': self.cookies['z_c0'], 'domain': '.zhihu.com', 'path': '/'})
-        self.driver.get(response.url)
-        response = self.scroll_until_loaded()
-
-        imgs = response.xpath('//div[@class="Question-main"]//div[@id="QuestionAnswers-answers"]//div[@class="List-item"]//div[@class="ContentItem AnswerItem"]//div[@class="RichContent-inner"]//div[@class="VagueImage origin_image zh-lightbox-thumb"]/@data-src').extract()
-
-        hrefs = [img for img in imgs if img is not None and img.startswith('http')]
-        print(hrefs)
-        yield {"image_urls": hrefs}
-
-        authors = response.xpath('//div[@class="Question-main"]//div[@id="QuestionAnswers-answers"]//div[@class="List-item"]//div[@class="ContentItem AnswerItem"]//div[@class="AuthorInfo-head"]//a[@class="UserLink-link"]/text()').extract()
-        print(authors)
-
-
-    def parse(self, response):
-        self.logger.warn(response.url)
-        titles = response.xpath('//div[@class="zu-main-content"]//div[@class="zm-item"]//h2[@class="zm-item-title"]/a/text()').extract()
-        print(titles)
-
-        hrefs = response.xpath(
-            '//div[@class="zu-main-content"]//div[@class="zm-item"]//h2[@class="zm-item-title"]/a/@href').extract()
-        print(hrefs)
-
-        yield Request('https://www.zhihu.com' + hrefs[5], callback=self.parse_question, cookies=self.cookies)
-        # for href in hrefs:
-        #     yield Request('https://www.zhihu.com' + href, callback=self.parse_question, cookies=self.cookies)
-
-        next_href = response.xpath('//div[@class="border-pager"]//div[@class="zm-invite-pager"]//span/a[text()="下一页"]/@href').extract_first()
-        print(next_href)
-
-        # if next_href:
-        #     yield Request('https://www.zhihu.com/collection/78172986' + next_href )
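For reference, the deleted spider's scroll_until_loaded is the usual Selenium pattern for exhausting an infinite-scroll page: scroll to the bottom, wait for document.body.scrollHeight to grow, and stop when a timeout signals that nothing new was appended. A standalone sketch distilled from the code above; driver can be any Selenium WebDriver instance.

# Sketch: scroll an infinite-scroll page until its height stops growing.
import time

from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.wait import WebDriverWait


def scroll_until_loaded(driver, timeout=10):
    wait = WebDriverWait(driver, timeout)
    height = driver.execute_script("return document.body.scrollHeight;")
    time.sleep(1)
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        try:
            # a TimeoutException here means nothing new was appended -> real bottom
            wait.until(lambda d: d.execute_script("return document.body.scrollHeight;") > height)
            height = driver.execute_script("return document.body.scrollHeight;")
        except TimeoutException:
            break
    return driver.page_source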
@@ -1,19 +0,0 @@
-# -*- coding: utf-8 -*-
-import scrapy
-
-
-class PornSpider(scrapy.Spider):
-    name = 'porn'
-    allowed_domains = ['www.baidu.com']
-    start_urls = ['https://www.baidu.com/']
-
-    custom_settings = {
-        'LOG_LEVEL': "INFO",
-        'SPIDER_MIDDLEWARES': {
-            'zhihu.middlewares.ZhihuSpiderMiddleware': 543,
-        }
-
-    }
-
-    def parse(self, response):
-        print("parse response")
File diff suppressed because it is too large