From 62feffac4b6324c1adf176b41e5cbde7034e6940 Mon Sep 17 00:00:00 2001
From: guange <8863824@gmail.com>
Date: Thu, 17 Jan 2019 22:46:46 +0800
Subject: [PATCH] Add Scrapy settings.py and use string type codes in JD spiders

---
 .../taobao/__pycache__/items.cpython-37.pyc   | Bin 1838 -> 1854 bytes
 .../__pycache__/pipelines.cpython-37.pyc      | Bin 4499 -> 4527 bytes
 .../__pycache__/settings.cpython-37.pyc       | Bin 1163 -> 1168 bytes
 chapter1/crawler/taobao/settings.py           | 123 ++++++++++++++++++
 chapter1/crawler/taobao/spiders/jd.py         |   2 +-
 chapter1/crawler/taobao/spiders/jd1.py        |   2 +-
 6 files changed, 125 insertions(+), 2 deletions(-)
 create mode 100644 chapter1/crawler/taobao/settings.py
diff --git a/chapter1/crawler/taobao/__pycache__/items.cpython-37.pyc b/chapter1/crawler/taobao/__pycache__/items.cpython-37.pyc
index aed5c8d06c073d40256cb936db0ac85ebf4207fc..3b602a38a2d7d65f49f4597c0d6fc29818694aeb 100644
GIT binary patch
delta 146
zcmZ3-w~vq4iI<m)0SN9)c8KZR$a|HUF=X;x7VF7!ECL*<%nMi+GDPuC)?*Q8<eMzO
zEXQ_>rKGYTb#e-eCz}vZ7YF0yjVz}br6$+1N=Qos1x$gsm<KGw!3cvz(u+5+GP247
hh3zJ9WEN+XoqU!}l?|l6NEWD7d@?_~3^R`qCjftW9asPW

delta 147
zcmdnTw~mk3iI<m)0SH{GZDaB`@?K?T^qKsY#d@+Fi-1H5V=#jz(=A>vmw=-Dl+xr9
z&yv(!KTY1r0?cxgi&#8a1b`YRA7nYrC^@;8RYF<{$Tb1tVjdvD!H9rGQi~6;GP245
hg>5HqWEN+XnS7T`l?|l6NCv1?e6l>d3^TV7Cji^F9-#mL

diff --git a/chapter1/crawler/taobao/__pycache__/pipelines.cpython-37.pyc b/chapter1/crawler/taobao/__pycache__/pipelines.cpython-37.pyc
index 0a8bcd20676dde467d7615d0215c66e196f108d6..c4897c47c22ba00c9d18936721965a9aa729f3fc 100644
GIT binary patch
delta 252
zcmbQNyk42tiI<m)0SNw0c8FQGk#`S|I1iBL48+B6fJ6yH4PyyoGgDE(<Oe)vDK(5K
zoRSPFTyr=UGBGmb%>YZY0HynC7_wLwFazZoC4pK}xWOb3nB)bMd|(o!n|}@~Sa%dy
zHycp*3!rW$nC{7SyhfTO>_9_AB^jC-Q^e-5ft1%U1~X`i`xR*dT~(wCByO>kR2HO8
xKFMn+!%@XkP?Voinp|R)mzbN1-~m-^=HOFg;x++lU|{58<Y8pnY$TA#2mtkcJmCNU

delta 224
zcmZ3lJXx97iI<m)0SIavZDVpa^6udg=LYhefw=e?kSJlOVJu;6W-9WU{D8+SBZYGg
z`$8s0hP)YI2^OG4M-4+3>jLH)#uP?LpyCuRFv$%jdB7wunB<UTNa35q3f3J3*3Aag
z{RpU=38uS*9jKpQlA)P#auu(UWQyn<Hjwff#$X0bF~1^BpnHl`fyCsiyoP-2RXhbn
g`6;EzC02Qfxto>v6qz`UfRYT1T#P)M{R9#j0mNT2(f|Me

diff --git a/chapter1/crawler/taobao/__pycache__/settings.cpython-37.pyc b/chapter1/crawler/taobao/__pycache__/settings.cpython-37.pyc
index 1aaf7e38e09af9a3634c3ab3d898c52ebf03ccc6..f754777be4cbb8abfcc67ab016147a34ea73af4f 100644
GIT binary patch
delta 61
zcmeC?oWRNL#LLUY00d8`I85Yj;uMP#O%V@f(3IFXZ5<<{++<fKL)Mod1)Iy5x*6F-
Mfvh6Y$?PmG0IkCfrT_o{

delta 55
zcmbQh+0DuA#LLUY00gSP>?U$Iaq>rrricYIXo_!~wvLgJYqBeo;pS?lZbmi{pqe6)
I$viAA08mB@Jpcdz

diff --git a/chapter1/crawler/taobao/settings.py b/chapter1/crawler/taobao/settings.py
new file mode 100644
index 0000000..d73d1b6
--- /dev/null
+++ b/chapter1/crawler/taobao/settings.py
@@ -0,0 +1,123 @@
+# -*- coding: utf-8 -*-
+
+# Scrapy settings for taobao project
+#
+# For simplicity, this file contains only settings considered important or
+# commonly used. You can find more settings consulting the documentation:
+#
+#     https://doc.scrapy.org/en/latest/topics/settings.html
+#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
+#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html
+
+BOT_NAME = 'taobao'
+
+SPIDER_MODULES = ['taobao.spiders']
+NEWSPIDER_MODULE = 'taobao.spiders'
+
+
+
+
+# NOTE: this user-agent presents the crawler as a desktop Chrome browser instead of identifying the bot
+USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36'
+
+# Do not obey robots.txt rules (ROBOTSTXT_OBEY is turned off here)
+ROBOTSTXT_OBEY = False
+
+LOG_LEVEL='DEBUG'
+
+
+SPLASH_URL = 'http://127.0.0.1:8050'
+
+# Configure maximum concurrent requests performed by Scrapy (default: 16)
+#CONCURRENT_REQUESTS = 32
+
+# Configure a delay for requests for the same website (default: 0)
+# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
+# See also autothrottle settings and docs
+#DOWNLOAD_DELAY = 3
+# The download delay setting will honor only one of:
+#CONCURRENT_REQUESTS_PER_DOMAIN = 16
+#CONCURRENT_REQUESTS_PER_IP = 16
+
+# Keep cookies enabled (they are enabled by default)
+COOKIES_ENABLED = True
+
+# Keep the Telnet Console enabled (it is enabled by default)
+TELNETCONSOLE_ENABLED = True
+
+# Override the default request headers:
+#DEFAULT_REQUEST_HEADERS = {
+#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+#   'Accept-Language': 'en',
+#}
+
+
+# Enable or disable spider middlewares
+# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
+#SPIDER_MIDDLEWARES = {
+#    'taobao.middlewares.ZhihuSpiderMiddleware': 543,
+#}
+
+# Enable or disable downloader middlewares
+# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
+#DOWNLOADER_MIDDLEWARES = {
+#    'taobao.middlewares.ZhihuDownloaderMiddleware': 543,
+#}
+
+DOWNLOADER_MIDDLEWARES = {
+    'scrapy_splash.SplashCookiesMiddleware': 723,
+    'scrapy_splash.SplashMiddleware': 725,
+    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
+}
+# Splash's own duplicate-request filter
+DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
+
+
+
+# Enable or disable extensions
+# See https://doc.scrapy.org/en/latest/topics/extensions.html
+#EXTENSIONS = {
+#    'scrapy.extensions.telnet.TelnetConsole': None,
+#}
+
+# Configure item pipelines
+# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
+#ITEM_PIPELINES = {
+#    'taobao.pipelines.ZhihuPipeline': 300,
+#}
+
+ITEM_PIPELINES = {'taobao.pipelines.MyImagesPipeline': 300}
+
+IMAGES_STORE = '/tmp/taobao'
+
+# Enable and configure the AutoThrottle extension (disabled by default)
+# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
+# Enable rate limiting
+AUTOTHROTTLE_ENABLED = True
+# The initial download delay
+AUTOTHROTTLE_START_DELAY = 30
+# The maximum download delay to be set in case of high latencies
+AUTOTHROTTLE_MAX_DELAY = 60
+# The average number of requests Scrapy should be sending in parallel to
+# each remote server
+AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+# Enable showing throttling stats for every response received:
+#AUTOTHROTTLE_DEBUG = False
+
+# Enable and configure HTTP caching (disabled by default)
+# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
+#HTTPCACHE_ENABLED = True
+#HTTPCACHE_EXPIRATION_SECS = 0
+#HTTPCACHE_DIR = 'httpcache'
+#HTTPCACHE_IGNORE_HTTP_CODES = []
+#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
+
+FEED_EXPORT_ENCODING='utf-8'
+
+
+#HBASE_HOST = '106.75.85.84'
+HBASE_HOST = '106.75.120.249'
+HBASE_PORT = 40009
+HBASE_TABLE = 'jd'
+
+COMMENT_MAX_PAGE = 10
diff --git a/chapter1/crawler/taobao/spiders/jd.py b/chapter1/crawler/taobao/spiders/jd.py
index 65d4fb9..5ddef03 100644
--- a/chapter1/crawler/taobao/spiders/jd.py
+++ b/chapter1/crawler/taobao/spiders/jd.py
@@ -114,7 +114,7 @@ class JdSpider(scrapy.Spider):
             print(shop)
 
             yield JDProductItem(
-                type=0,
+                type='0',
                 name=name,
                 price=price,
                 url=url,
diff --git a/chapter1/crawler/taobao/spiders/jd1.py b/chapter1/crawler/taobao/spiders/jd1.py
index 605159b..97dca08 100644
--- a/chapter1/crawler/taobao/spiders/jd1.py
+++ b/chapter1/crawler/taobao/spiders/jd1.py
@@ -114,7 +114,7 @@ class Jd1Spider(scrapy.Spider):
             print(shop)
 
             yield JDProductItem(
-                type=1,
+                type='1',
                 name=name,
                 price=price,
                 url=url,