数据抓取 (data scraping)
parent f8fe624d77
commit a19364e304
@@ -19,6 +19,7 @@ class ProductItem(scrapy.Item):
     title = scrapy.Field()
     price = scrapy.Field()
     author = scrapy.Field()
+    shop = scrapy.Field()
     pubtime = scrapy.Field()
     publisher = scrapy.Field()
     url = scrapy.Field()
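For context: after this hunk the item class presumably reads roughly as in the sketch below. This is a reconstruction from the visible context lines, not the exact file; the img field is only inferred from the img=... argument in the Dangdang spider's yield further down.

# Sketch: reconstruction of ProductItem as it likely looks after this commit.
import scrapy


class ProductItem(scrapy.Item):
    title = scrapy.Field()
    img = scrapy.Field()        # inferred: the spider below passes img=...
    price = scrapy.Field()
    author = scrapy.Field()
    shop = scrapy.Field()       # the field added by this commit
    pubtime = scrapy.Field()
    publisher = scrapy.Field()
    url = scrapy.Field()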
@@ -125,7 +125,7 @@ class ZhihuChromeMiddleware(object):
 
     def __init__(self):
         options = webdriver.ChromeOptions()
-        options.add_argument('--headless')
+        # options.add_argument('--headless')
         options.add_argument('--window-size=1366,768')
         options.add_argument('--blink-settings=imagesEnabled=false')
         SERVICE_ARGS = ['--load-images=false', '--disk-cache=true', '--ignore-ssl-errors=true']
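Commenting out --headless means this middleware now launches a visible Chrome window, which is handy for debugging or manual logins but fails on display-less servers. A minimal sketch of making the flag switchable instead of editing the code each time; the HEADLESS environment variable is a name chosen here for illustration, not something the project defines.

# Sketch: build ChromeOptions with headless controlled by an env var (assumption).
import os

from selenium import webdriver


def build_chrome_options():
    options = webdriver.ChromeOptions()
    if os.environ.get('HEADLESS', '0') == '1':
        options.add_argument('--headless')          # only when HEADLESS=1
    options.add_argument('--window-size=1366,768')
    options.add_argument('--blink-settings=imagesEnabled=false')
    return options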
File diff suppressed because it is too large
@@ -4,13 +4,27 @@
 #
 # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
-from scrapy.exceptions import DropItem
+import json
+import pdb
 
 from scrapy.pipelines.images import ImagesPipeline
 
 
 class MyImagesPipeline(ImagesPipeline):
     pass
 
 
 class ZhihuPipeline(object):
     def process_item(self, item, spider):
         return item
 
 
+class JsonWithEncodingCnblogsPipeline(object):
+    def open_spider(self, spider):
+        self.file = open('../../datas/%s.json' % spider.keyword, 'w', encoding='utf-8')
+
+    def process_item(self, item, spider):
+        line = json.dumps(dict(item), ensure_ascii=False) + "\n"
+        self.file.write(line)
+        return item
+
+    def spider_closed(self, spider):
+        self.file.close()
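A wiring note that is not part of this diff: the new pipeline only runs once it is listed in ITEM_PIPELINES, and Scrapy's item-pipeline hooks are open_spider / process_item / close_spider, so a method named spider_closed is not called automatically (the file would stay open) unless it is renamed to close_spider or connected to the spider_closed signal in from_crawler. A minimal sketch under those assumptions, with the module path guessed:

# settings.py sketch -- 'zhihu.pipelines' is a guessed module path; use the real package name.
ITEM_PIPELINES = {
    'zhihu.pipelines.JsonWithEncodingCnblogsPipeline': 300,
}

# Equivalent pipeline using the hook names Scrapy invokes directly.
import json


class JsonWithEncodingCnblogsPipeline(object):
    def open_spider(self, spider):
        # spider.keyword is assumed to exist, as in the diff above
        self.file = open('../../datas/%s.json' % spider.keyword, 'w', encoding='utf-8')

    def process_item(self, item, spider):
        self.file.write(json.dumps(dict(item), ensure_ascii=False) + "\n")
        return item

    def close_spider(self, spider):     # called by Scrapy when the spider finishes
        self.file.close()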
@@ -1,10 +1,10 @@
 # -*- coding: utf-8 -*-
 
 # //div[@class="con shoplist"]//div[@id="search_nature_rg"]//ul/li//p[@class="name"]/a/@title
+import pdb
 
 import scrapy
-import zhihu.items
+import taobao.items
 from scrapy import Request
 
 
@@ -49,7 +49,7 @@ class DangdangSpider(scrapy.Spider):
             url = self.safe_fetch(product.xpath('p[@class="name"]/a/@href'))
             comment_num = self.safe_fetch(product.xpath('p[@class="search_star_line"]//a[@class="search_comment_num"]/text()'))
 
-            yield zhihu.items.ProductItem(title=title,
+            yield taobao.items.ProductItem(title=title,
                                           img=img,
                                           price=price,
                                           author=author,
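safe_fetch is defined elsewhere in the spider and does not appear in this diff; judging from how it is called, it presumably wraps extract_first() with a safe default. A hypothetical sketch only, not the project's actual helper:

# Hypothetical reconstruction of the spider's safe_fetch method.
def safe_fetch(self, selector, default=''):
    value = selector.extract_first()            # None when the XPath matches nothing
    return value.strip() if value else default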
@@ -1,124 +0,0 @@
-# -*- coding: utf-8 -*-
-import time
-
-import scrapy
-
-from scrapy import Request
-from scrapy.http import HtmlResponse
-from selenium.common.exceptions import TimeoutException
-from selenium import webdriver
-from selenium.webdriver.support.wait import WebDriverWait
-import pdb
-
-class MeituSpider(scrapy.Spider):
-    name = 'meitu'
-    allowed_domains = ['zhihu.com']
-    main_url = 'https://www.zhihu.com/collection/78172986'
-
-    start_urls = [main_url]
-    cookies = {'z_c0': '2|1:0|10:1543151267|4:z_c0|92:Mi4xcFpjMkFBQUFBQUFBWUdnWG5naDZEaWNBQUFDRUFsVk5veXNpWEFELWM4WHU0b3c3anJvR3hobzhvMEpIR004b2xB|aada1968c0f5284002a0bbb552b95af6e87cf3848acc15adfe6d91ed4d9151d8'}
-
-    custom_settings = {
-        'LOG_LEVEL': "INFO"
-    }
-
-    CHROME_DRIVER_PATH = r'/Users/guange/.bin/chromedriver'
-
-    def __init__(self):
-        options = webdriver.ChromeOptions()
-        options.add_argument('--headless')
-        options.add_argument('--window-size=1366,768')
-        options.add_argument('--blink-settings=imagesEnabled=false')
-        SERVICE_ARGS = ['--load-images=false', '--disk-cache=true', '--ignore-ssl-errors=true']
-        self.driver = webdriver.Chrome(chrome_options=options, service_args=SERVICE_ARGS, executable_path=self.CHROME_DRIVER_PATH)
-        self.driver.set_page_load_timeout(30)
-        self.wait = WebDriverWait(self.driver, 10)
-
-    def closed(self, spider):
-        print("spider closed")
-        self.driver.close()
-
-    def start_requests(self):
-        print("start_requests ")
-        for url in self.start_urls:
-            print('start request %s' % url)
-            yield Request(url, cookies=self.cookies)
-
-    def scroll_until_loaded(self):
-        check_height = self.driver.execute_script("return document.body.scrollHeight;")
-        print(check_height)
-        time.sleep(1)
-        while True:
-            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
-            try:
-                self.wait.until(
-                    lambda driver: self.driver.execute_script("return document.body.scrollHeight;") > check_height)
-                check_height = self.driver.execute_script("return document.body.scrollHeight;")
-                print(check_height)
-            except TimeoutException:
-                break
-
-        # pdb.set_trace()
-        return HtmlResponse(url=self.driver.current_url, body=self.driver.page_source, encoding="utf-8")
-
-    def scroll_until_loaded1(self):
-        check_height = self.driver.execute_script("return document.body.scrollHeight;")
-        print(check_height)
-        time.sleep(1)
-        # pdb.set_trace()
-        scrollH = 0
-        while True:
-            scrollH += 100
-            self.driver.execute_script("window.scrollTo(0, %d);" % scrollH)
-            time.sleep(0.1)
-
-            if scrollH > self.driver.execute_script("return document.body.scrollHeight;"):
-                break
-            # try:
-            #     self.wait.until(
-            #         lambda driver: self.driver.execute_script("return document.body.scrollHeight;") > check_height)
-            #     check_height = self.driver.execute_script("return document.body.scrollHeight;")
-            #     print(check_height)
-            # except TimeoutException:
-            #     break
-
-        # pdb.set_trace()
-        return HtmlResponse(url=self.driver.current_url, body=self.driver.page_source, encoding="utf-8")
-
-
-    def parse_question(self, response):
-        print(response.url)
-        self.driver.get('https://www.zhihu.com')
-
-        self.driver.add_cookie({'name': 'z_c0', 'value': self.cookies['z_c0'], 'domain': '.zhihu.com', 'path': '/'})
-        self.driver.get(response.url)
-        response = self.scroll_until_loaded()
-
-        imgs = response.xpath('//div[@class="Question-main"]//div[@id="QuestionAnswers-answers"]//div[@class="List-item"]//div[@class="ContentItem AnswerItem"]//div[@class="RichContent-inner"]//div[@class="VagueImage origin_image zh-lightbox-thumb"]/@data-src').extract()
-
-        hrefs = [img for img in imgs if img is not None and img.startswith('http')]
-        print(hrefs)
-        yield {"image_urls": hrefs}
-
-        authors = response.xpath('//div[@class="Question-main"]//div[@id="QuestionAnswers-answers"]//div[@class="List-item"]//div[@class="ContentItem AnswerItem"]//div[@class="AuthorInfo-head"]//a[@class="UserLink-link"]/text()').extract()
-        print(authors)
-
-
-    def parse(self, response):
-        self.logger.warn(response.url)
-        titles = response.xpath('//div[@class="zu-main-content"]//div[@class="zm-item"]//h2[@class="zm-item-title"]/a/text()').extract()
-        print(titles)
-
-        hrefs = response.xpath(
-            '//div[@class="zu-main-content"]//div[@class="zm-item"]//h2[@class="zm-item-title"]/a/@href').extract()
-        print(hrefs)
-
-        yield Request('https://www.zhihu.com' + hrefs[5], callback=self.parse_question, cookies=self.cookies)
-        # for href in hrefs:
-        #     yield Request('https://www.zhihu.com' + href, callback=self.parse_question, cookies=self.cookies)
-
-        next_href = response.xpath('//div[@class="border-pager"]//div[@class="zm-invite-pager"]//span/a[text()="下一页"]/@href').extract_first()
-        print(next_href)
-
-        # if next_href:
-        #     yield Request('https://www.zhihu.com/collection/78172986' + next_href )
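For reference, the deleted spider's scroll_until_loaded is the usual Selenium pattern for exhausting an infinite-scroll page: scroll to the bottom, wait for document.body.scrollHeight to grow, and stop when a timeout signals that nothing new was appended. A standalone sketch distilled from the code above; driver can be any Selenium WebDriver instance.

# Sketch: scroll an infinite-scroll page until its height stops growing.
import time

from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.wait import WebDriverWait


def scroll_until_loaded(driver, timeout=10):
    wait = WebDriverWait(driver, timeout)
    height = driver.execute_script("return document.body.scrollHeight;")
    time.sleep(1)
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        try:
            # a TimeoutException here means nothing new was appended -> real bottom
            wait.until(lambda d: d.execute_script("return document.body.scrollHeight;") > height)
            height = driver.execute_script("return document.body.scrollHeight;")
        except TimeoutException:
            break
    return driver.page_source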
@@ -1,19 +0,0 @@
-# -*- coding: utf-8 -*-
-import scrapy
-
-
-class PornSpider(scrapy.Spider):
-    name = 'porn'
-    allowed_domains = ['www.baidu.com']
-    start_urls = ['https://www.baidu.com/']
-
-    custom_settings = {
-        'LOG_LEVEL': "INFO",
-        'SPIDER_MIDDLEWARES': {
-            'zhihu.middlewares.ZhihuSpiderMiddleware': 543,
-        }
-
-    }
-
-    def parse(self, response):
-        print("parse response")
File diff suppressed because it is too large