导入hbase

This commit is contained in:
guange 2019-01-14 22:34:53 +08:00
parent 1a2e31cd14
commit e38ba68191
4 changed files with 123 additions and 60 deletions

View File

@ -1,7 +1,12 @@
<?xml version="1.0" encoding="UTF-8"?> <?xml version="1.0" encoding="UTF-8"?>
<project version="4"> <project version="4">
<component name="ChangeListManager"> <component name="ChangeListManager">
<list default="true" id="a5fbb387-9969-4874-8a7f-c9dd40d5225d" name="Default Changelist" comment="" /> <list default="true" id="a5fbb387-9969-4874-8a7f-c9dd40d5225d" name="Default Changelist" comment="">
<change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" />
<change beforePath="$PROJECT_DIR$/crawler/taobao/pipelines.py" beforeDir="false" afterPath="$PROJECT_DIR$/crawler/taobao/pipelines.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/crawler/taobao/settings.py" beforeDir="false" afterPath="$PROJECT_DIR$/crawler/taobao/settings.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/crawler/taobao/spiders/jd.py" beforeDir="false" afterPath="$PROJECT_DIR$/crawler/taobao/spiders/jd.py" afterDir="false" />
</list>
<option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" /> <option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
<option name="SHOW_DIALOG" value="false" /> <option name="SHOW_DIALOG" value="false" />
<option name="HIGHLIGHT_CONFLICTS" value="true" /> <option name="HIGHLIGHT_CONFLICTS" value="true" />
@ -18,17 +23,8 @@
<file pinned="false" current-in-tab="false"> <file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/crawler/taobao/items.py"> <entry file="file://$PROJECT_DIR$/crawler/taobao/items.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="269"> <state relative-caret-position="374">
<caret line="40" column="28" lean-forward="true" selection-start-line="40" selection-start-column="28" selection-end-line="40" selection-end-column="28" /> <caret line="47" column="15" selection-start-line="47" selection-start-column="4" selection-end-line="47" selection-end-column="15" />
</state>
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/scrapy/item.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="390">
<caret line="68" column="21" lean-forward="true" selection-start-line="68" selection-start-column="21" selection-end-line="68" selection-end-column="21" />
</state> </state>
</provider> </provider>
</entry> </entry>
@ -36,8 +32,8 @@
<file pinned="false" current-in-tab="false"> <file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/crawler/taobao/pipelines.py"> <entry file="file://$PROJECT_DIR$/crawler/taobao/pipelines.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="282"> <state relative-caret-position="1155">
<caret line="60" column="21" selection-start-line="60" selection-start-column="6" selection-end-line="60" selection-end-column="21" /> <caret line="77" column="31" lean-forward="true" selection-start-line="77" selection-start-column="31" selection-end-line="77" selection-end-column="31" />
<folding> <folding>
<element signature="e#193#204#0" expanded="true" /> <element signature="e#193#204#0" expanded="true" />
</folding> </folding>
@ -45,11 +41,29 @@
</provider> </provider>
</entry> </entry>
</file> </file>
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/scrapy/conf.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="105">
<caret line="7" column="4" selection-start-line="7" selection-start-column="4" selection-end-line="7" selection-end-column="4" />
</state>
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/happybase/connection.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="270">
<caret line="107" column="8" selection-start-line="107" selection-start-column="8" selection-end-line="107" selection-end-column="8" />
</state>
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="false"> <file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/comment.py"> <entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/comment.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="-271"> <state relative-caret-position="-271">
<caret line="36" column="53" lean-forward="true" selection-start-line="36" selection-start-column="53" selection-end-line="36" selection-end-column="53" /> <caret line="36" column="53" selection-start-line="36" selection-start-column="53" selection-end-line="36" selection-end-column="53" />
<folding> <folding>
<element signature="e#24#35#0" expanded="true" /> <element signature="e#24#35#0" expanded="true" />
</folding> </folding>
@ -69,23 +83,11 @@
</provider> </provider>
</entry> </entry>
</file> </file>
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/category.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="251">
<caret line="47" column="59" selection-start-line="47" selection-start-column="54" selection-end-line="47" selection-end-column="59" />
<folding>
<element signature="e#24#35#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="false"> <file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/crawler/taobao/settings.py"> <entry file="file://$PROJECT_DIR$/crawler/taobao/settings.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="196"> <state relative-caret-position="426">
<caret line="88" selection-start-line="88" selection-end-line="88" /> <caret line="119" column="12" selection-start-line="119" selection-start-column="12" selection-end-line="119" selection-end-column="12" />
</state> </state>
</provider> </provider>
</entry> </entry>
@ -93,10 +95,10 @@
<file pinned="false" current-in-tab="true"> <file pinned="false" current-in-tab="true">
<entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/jd.py"> <entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/jd.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="241"> <state relative-caret-position="63">
<caret line="67" column="24" lean-forward="true" selection-start-line="67" selection-start-column="24" selection-end-line="67" selection-end-column="24" /> <caret line="72" column="40" lean-forward="true" selection-start-line="72" selection-start-column="40" selection-end-line="72" selection-end-column="40" />
<folding> <folding>
<element signature="e#14#27#0" expanded="true" /> <element signature="e#14#25#0" expanded="true" />
</folding> </folding>
</state> </state>
</provider> </provider>
@ -115,7 +117,7 @@
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/parsel/selector.py"> <entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/parsel/selector.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="245"> <state relative-caret-position="245">
<caret line="238" column="39" lean-forward="true" selection-start-line="238" selection-start-column="39" selection-end-line="238" selection-end-column="39" /> <caret line="238" column="39" selection-start-line="238" selection-start-column="39" selection-end-line="238" selection-end-column="39" />
</state> </state>
</provider> </provider>
</entry> </entry>
@ -157,8 +159,8 @@
<option value="$PROJECT_DIR$/crawler/taobao/merge.py" /> <option value="$PROJECT_DIR$/crawler/taobao/merge.py" />
<option value="$PROJECT_DIR$/crawler/taobao/spiders/comment.py" /> <option value="$PROJECT_DIR$/crawler/taobao/spiders/comment.py" />
<option value="$PROJECT_DIR$/crawler/taobao/spiders/category.py" /> <option value="$PROJECT_DIR$/crawler/taobao/spiders/category.py" />
<option value="$PROJECT_DIR$/crawler/taobao/settings.py" />
<option value="$PROJECT_DIR$/crawler/taobao/items.py" /> <option value="$PROJECT_DIR$/crawler/taobao/items.py" />
<option value="$PROJECT_DIR$/crawler/taobao/settings.py" />
<option value="$PROJECT_DIR$/crawler/taobao/pipelines.py" /> <option value="$PROJECT_DIR$/crawler/taobao/pipelines.py" />
<option value="$PROJECT_DIR$/crawler/taobao/spiders/jd.py" /> <option value="$PROJECT_DIR$/crawler/taobao/spiders/jd.py" />
</list> </list>
@ -187,6 +189,19 @@
<item name="chapter1" type="462c0819:PsiDirectoryNode" /> <item name="chapter1" type="462c0819:PsiDirectoryNode" />
<item name="crawler" type="462c0819:PsiDirectoryNode" /> <item name="crawler" type="462c0819:PsiDirectoryNode" />
</path> </path>
<path>
<item name="chapter1" type="b2602c69:ProjectViewProjectNode" />
<item name="chapter1" type="462c0819:PsiDirectoryNode" />
<item name="crawler" type="462c0819:PsiDirectoryNode" />
<item name="taobao" type="462c0819:PsiDirectoryNode" />
</path>
<path>
<item name="chapter1" type="b2602c69:ProjectViewProjectNode" />
<item name="chapter1" type="462c0819:PsiDirectoryNode" />
<item name="crawler" type="462c0819:PsiDirectoryNode" />
<item name="taobao" type="462c0819:PsiDirectoryNode" />
<item name="spiders" type="462c0819:PsiDirectoryNode" />
</path>
</expand> </expand>
<select /> <select />
</subPane> </subPane>
@ -336,7 +351,6 @@
</component> </component>
<component name="ToolWindowManager"> <component name="ToolWindowManager">
<frame x="0" y="23" width="1280" height="777" extended-state="0" /> <frame x="0" y="23" width="1280" height="777" extended-state="0" />
<editor active="true" />
<layout> <layout>
<window_info content_ui="combo" id="Project" order="0" visible="true" weight="0.25767368" /> <window_info content_ui="combo" id="Project" order="0" visible="true" weight="0.25767368" />
<window_info id="Structure" order="1" side_tool="true" weight="0.25" /> <window_info id="Structure" order="1" side_tool="true" weight="0.25" />
@ -349,7 +363,7 @@
<window_info anchor="bottom" id="Inspection" order="5" weight="0.4" /> <window_info anchor="bottom" id="Inspection" order="5" weight="0.4" />
<window_info anchor="bottom" id="TODO" order="6" weight="0.329927" /> <window_info anchor="bottom" id="TODO" order="6" weight="0.329927" />
<window_info anchor="bottom" id="Version Control" order="7" /> <window_info anchor="bottom" id="Version Control" order="7" />
<window_info active="true" anchor="bottom" id="Terminal" order="8" visible="true" weight="0.33430657" /> <window_info active="true" anchor="bottom" id="Terminal" order="8" visible="true" weight="0.649635" />
<window_info anchor="bottom" id="Event Log" order="9" side_tool="true" /> <window_info anchor="bottom" id="Event Log" order="9" side_tool="true" />
<window_info anchor="bottom" id="Python Console" order="10" weight="0.329927" /> <window_info anchor="bottom" id="Python Console" order="10" weight="0.329927" />
<window_info anchor="right" id="Commander" internal_type="SLIDING" order="0" type="SLIDING" weight="0.4" /> <window_info anchor="right" id="Commander" internal_type="SLIDING" order="0" type="SLIDING" weight="0.4" />
@ -523,7 +537,7 @@
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/parsel/selector.py"> <entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/parsel/selector.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="245"> <state relative-caret-position="245">
<caret line="238" column="39" lean-forward="true" selection-start-line="238" selection-start-column="39" selection-end-line="238" selection-end-column="39" /> <caret line="238" column="39" selection-start-line="238" selection-start-column="39" selection-end-line="238" selection-end-column="39" />
</state> </state>
</provider> </provider>
</entry> </entry>
@ -544,24 +558,48 @@
</state> </state>
</provider> </provider>
</entry> </entry>
<entry file="file://$PROJECT_DIR$/crawler/taobao/items.py"> <entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/comment.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="269"> <state relative-caret-position="-271">
<caret line="40" column="28" lean-forward="true" selection-start-line="40" selection-start-column="28" selection-end-line="40" selection-end-column="28" /> <caret line="36" column="53" selection-start-line="36" selection-start-column="53" selection-end-line="36" selection-end-column="53" />
<folding>
<element signature="e#24#35#0" expanded="true" />
</folding>
</state> </state>
</provider> </provider>
</entry> </entry>
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/scrapy/item.py"> <entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/scrapy/item.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="390"> <state relative-caret-position="390">
<caret line="68" column="21" lean-forward="true" selection-start-line="68" selection-start-column="21" selection-end-line="68" selection-end-column="21" /> <caret line="68" column="21" selection-start-line="68" selection-start-column="21" selection-end-line="68" selection-end-column="21" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/happybase/connection.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="270">
<caret line="107" column="8" selection-start-line="107" selection-start-column="8" selection-end-line="107" selection-end-column="8" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/scrapy/conf.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="105">
<caret line="7" column="4" selection-start-line="7" selection-start-column="4" selection-end-line="7" selection-end-column="4" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/crawler/taobao/items.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="374">
<caret line="47" column="15" selection-start-line="47" selection-start-column="4" selection-end-line="47" selection-end-column="15" />
</state> </state>
</provider> </provider>
</entry> </entry>
<entry file="file://$PROJECT_DIR$/crawler/taobao/pipelines.py"> <entry file="file://$PROJECT_DIR$/crawler/taobao/pipelines.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="282"> <state relative-caret-position="1155">
<caret line="60" column="21" selection-start-line="60" selection-start-column="6" selection-end-line="60" selection-end-column="21" /> <caret line="77" column="31" lean-forward="true" selection-start-line="77" selection-start-column="31" selection-end-line="77" selection-end-column="31" />
<folding> <folding>
<element signature="e#193#204#0" expanded="true" /> <element signature="e#193#204#0" expanded="true" />
</folding> </folding>
@ -570,27 +608,17 @@
</entry> </entry>
<entry file="file://$PROJECT_DIR$/crawler/taobao/settings.py"> <entry file="file://$PROJECT_DIR$/crawler/taobao/settings.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="196"> <state relative-caret-position="426">
<caret line="88" selection-start-line="88" selection-end-line="88" /> <caret line="119" column="12" selection-start-line="119" selection-start-column="12" selection-end-line="119" selection-end-column="12" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/comment.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="-271">
<caret line="36" column="53" lean-forward="true" selection-start-line="36" selection-start-column="53" selection-end-line="36" selection-end-column="53" />
<folding>
<element signature="e#24#35#0" expanded="true" />
</folding>
</state> </state>
</provider> </provider>
</entry> </entry>
<entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/jd.py"> <entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/jd.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="241"> <state relative-caret-position="63">
<caret line="67" column="24" lean-forward="true" selection-start-line="67" selection-start-column="24" selection-end-line="67" selection-end-column="24" /> <caret line="72" column="40" lean-forward="true" selection-start-line="72" selection-start-column="40" selection-end-line="72" selection-end-column="40" />
<folding> <folding>
<element signature="e#14#27#0" expanded="true" /> <element signature="e#14#25#0" expanded="true" />
</folding> </folding>
</state> </state>
</provider> </provider>

View File

@ -8,6 +8,8 @@ import json
import pdb import pdb
import re import re
import happybase
from scrapy.conf import settings
from scrapy.pipelines.images import ImagesPipeline from scrapy.pipelines.images import ImagesPipeline
@ -23,20 +25,25 @@ class ZhihuPipeline(object):
class JsonWithEncodingCnblogsPipeline(object): class JsonWithEncodingCnblogsPipeline(object):
def open_spider(self, spider): def open_spider(self, spider):
self.file = open('../../datas/%s.json' % spider.keyword, 'w', encoding='utf-8') self.file = open('../../datas/%s.json' % spider.keyword, 'w', encoding='utf-8')
def process_item(self, item, spider): def process_item(self, item, spider):
line = json.dumps(dict(item), ensure_ascii=False) + "\n" line = json.dumps(dict(item), ensure_ascii=False) + "\n"
self.file.write(line) self.file.write(line)
return item return item
def spider_closed(self, spider): def spider_closed(self, spider):
self.file.close() self.file.close()
class JsonWithEncodingCommentsPipeline(object): class JsonWithEncodingCommentsPipeline(object):
def open_spider(self, spider): def open_spider(self, spider):
self.file = open('../../datas/comments/%s.json' % spider.product_id, 'w', encoding='utf-8') self.file = open('../../datas/comments/%s.json' % spider.product_id, 'w', encoding='utf-8')
def process_item(self, item, spider): def process_item(self, item, spider):
line = json.dumps(dict(item), ensure_ascii=False) + "\n" line = json.dumps(dict(item), ensure_ascii=False) + "\n"
self.file.write(line) self.file.write(line)
return item return item
def spider_closed(self, spider): def spider_closed(self, spider):
self.file.close() self.file.close()
@ -45,9 +52,10 @@ class JDCleanDataPipeline(object):
""" """
京东数据清先 京东数据清先
""" """
def process_item(self, item, spider): def process_item(self, item, spider):
if isinstance(item["name"], str): if isinstance(item["name"], str):
item["name"] = item["name"].strip().replace('\n','').split() item["name"] = item["name"].strip().replace('\n', '')
m = re.match(r'.+\/(\d+)\.html', item['url']) m = re.match(r'.+\/(\d+)\.html', item['url'])
if m: if m:
@ -58,8 +66,25 @@ class JDCleanDataPipeline(object):
return item return item
class JDHbasePipeline(object): class JDHbasePipeline(object):
""" """
数据存到hbase中 数据存到hbase中
""" """
pass
def __init__(self):
host = settings['HBASE_HOST']
table_name = settings['HBASE_TABLE']
connection = happybase.Connection(host, port=settings["HBASE_PORT"])
table = connection.table(table_name)
self.table = table
def process_item(self, item, spider):
self.table.put(item["id"],
{"product:name": item["name"],
"product:price": item["price"],
"product:url": item["url"],
"product:img": item["img"],
"product:shop": item["shop"],
"product:comment_num": item["comment_num"]})
return item

View File

@ -112,3 +112,9 @@ IMAGES_STORE = '/tmp/taobao'
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
FEED_EXPORT_ENCODING='utf-8' FEED_EXPORT_ENCODING='utf-8'
HBASE_HOST = '106.75.85.84'
HBASE_PORT = 40009
HBASE_TABLE = 'jd'

View File

@ -1,4 +1,6 @@
#coding=utf-8 #coding=utf-8
import time
import scrapy import scrapy
from scrapy_splash import SplashRequest from scrapy_splash import SplashRequest
import pdb import pdb
@ -65,6 +67,8 @@ class JdSpider(scrapy.Spider):
page = m.group(1) page = m.group(1)
print("开始抓取:{0}".format(page)) print("开始抓取:{0}".format(page))
time.sleep(10)
yield SplashRequest( yield SplashRequest(
url= 'https://list.jd.com'+next_page url= 'https://list.jd.com'+next_page
) )