导入hbase
This commit is contained in:
parent
1a2e31cd14
commit
e38ba68191
|
@ -1,7 +1,12 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="ChangeListManager">
|
||||
<list default="true" id="a5fbb387-9969-4874-8a7f-c9dd40d5225d" name="Default Changelist" comment="" />
|
||||
<list default="true" id="a5fbb387-9969-4874-8a7f-c9dd40d5225d" name="Default Changelist" comment="">
|
||||
<change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" />
|
||||
<change beforePath="$PROJECT_DIR$/crawler/taobao/pipelines.py" beforeDir="false" afterPath="$PROJECT_DIR$/crawler/taobao/pipelines.py" afterDir="false" />
|
||||
<change beforePath="$PROJECT_DIR$/crawler/taobao/settings.py" beforeDir="false" afterPath="$PROJECT_DIR$/crawler/taobao/settings.py" afterDir="false" />
|
||||
<change beforePath="$PROJECT_DIR$/crawler/taobao/spiders/jd.py" beforeDir="false" afterPath="$PROJECT_DIR$/crawler/taobao/spiders/jd.py" afterDir="false" />
|
||||
</list>
|
||||
<option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
|
||||
<option name="SHOW_DIALOG" value="false" />
|
||||
<option name="HIGHLIGHT_CONFLICTS" value="true" />
|
||||
|
@ -18,17 +23,8 @@
|
|||
<file pinned="false" current-in-tab="false">
|
||||
<entry file="file://$PROJECT_DIR$/crawler/taobao/items.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="269">
|
||||
<caret line="40" column="28" lean-forward="true" selection-start-line="40" selection-start-column="28" selection-end-line="40" selection-end-column="28" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
</file>
|
||||
<file pinned="false" current-in-tab="false">
|
||||
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/scrapy/item.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="390">
|
||||
<caret line="68" column="21" lean-forward="true" selection-start-line="68" selection-start-column="21" selection-end-line="68" selection-end-column="21" />
|
||||
<state relative-caret-position="374">
|
||||
<caret line="47" column="15" selection-start-line="47" selection-start-column="4" selection-end-line="47" selection-end-column="15" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
|
@ -36,8 +32,8 @@
|
|||
<file pinned="false" current-in-tab="false">
|
||||
<entry file="file://$PROJECT_DIR$/crawler/taobao/pipelines.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="282">
|
||||
<caret line="60" column="21" selection-start-line="60" selection-start-column="6" selection-end-line="60" selection-end-column="21" />
|
||||
<state relative-caret-position="1155">
|
||||
<caret line="77" column="31" lean-forward="true" selection-start-line="77" selection-start-column="31" selection-end-line="77" selection-end-column="31" />
|
||||
<folding>
|
||||
<element signature="e#193#204#0" expanded="true" />
|
||||
</folding>
|
||||
|
@ -45,11 +41,29 @@
|
|||
</provider>
|
||||
</entry>
|
||||
</file>
|
||||
<file pinned="false" current-in-tab="false">
|
||||
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/scrapy/conf.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="105">
|
||||
<caret line="7" column="4" selection-start-line="7" selection-start-column="4" selection-end-line="7" selection-end-column="4" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
</file>
|
||||
<file pinned="false" current-in-tab="false">
|
||||
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/happybase/connection.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="270">
|
||||
<caret line="107" column="8" selection-start-line="107" selection-start-column="8" selection-end-line="107" selection-end-column="8" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
</file>
|
||||
<file pinned="false" current-in-tab="false">
|
||||
<entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/comment.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="-271">
|
||||
<caret line="36" column="53" lean-forward="true" selection-start-line="36" selection-start-column="53" selection-end-line="36" selection-end-column="53" />
|
||||
<caret line="36" column="53" selection-start-line="36" selection-start-column="53" selection-end-line="36" selection-end-column="53" />
|
||||
<folding>
|
||||
<element signature="e#24#35#0" expanded="true" />
|
||||
</folding>
|
||||
|
@ -69,23 +83,11 @@
|
|||
</provider>
|
||||
</entry>
|
||||
</file>
|
||||
<file pinned="false" current-in-tab="false">
|
||||
<entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/category.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="251">
|
||||
<caret line="47" column="59" selection-start-line="47" selection-start-column="54" selection-end-line="47" selection-end-column="59" />
|
||||
<folding>
|
||||
<element signature="e#24#35#0" expanded="true" />
|
||||
</folding>
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
</file>
|
||||
<file pinned="false" current-in-tab="false">
|
||||
<entry file="file://$PROJECT_DIR$/crawler/taobao/settings.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="196">
|
||||
<caret line="88" selection-start-line="88" selection-end-line="88" />
|
||||
<state relative-caret-position="426">
|
||||
<caret line="119" column="12" selection-start-line="119" selection-start-column="12" selection-end-line="119" selection-end-column="12" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
|
@ -93,10 +95,10 @@
|
|||
<file pinned="false" current-in-tab="true">
|
||||
<entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/jd.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="241">
|
||||
<caret line="67" column="24" lean-forward="true" selection-start-line="67" selection-start-column="24" selection-end-line="67" selection-end-column="24" />
|
||||
<state relative-caret-position="63">
|
||||
<caret line="72" column="40" lean-forward="true" selection-start-line="72" selection-start-column="40" selection-end-line="72" selection-end-column="40" />
|
||||
<folding>
|
||||
<element signature="e#14#27#0" expanded="true" />
|
||||
<element signature="e#14#25#0" expanded="true" />
|
||||
</folding>
|
||||
</state>
|
||||
</provider>
|
||||
|
@ -115,7 +117,7 @@
|
|||
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/parsel/selector.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="245">
|
||||
<caret line="238" column="39" lean-forward="true" selection-start-line="238" selection-start-column="39" selection-end-line="238" selection-end-column="39" />
|
||||
<caret line="238" column="39" selection-start-line="238" selection-start-column="39" selection-end-line="238" selection-end-column="39" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
|
@ -157,8 +159,8 @@
|
|||
<option value="$PROJECT_DIR$/crawler/taobao/merge.py" />
|
||||
<option value="$PROJECT_DIR$/crawler/taobao/spiders/comment.py" />
|
||||
<option value="$PROJECT_DIR$/crawler/taobao/spiders/category.py" />
|
||||
<option value="$PROJECT_DIR$/crawler/taobao/settings.py" />
|
||||
<option value="$PROJECT_DIR$/crawler/taobao/items.py" />
|
||||
<option value="$PROJECT_DIR$/crawler/taobao/settings.py" />
|
||||
<option value="$PROJECT_DIR$/crawler/taobao/pipelines.py" />
|
||||
<option value="$PROJECT_DIR$/crawler/taobao/spiders/jd.py" />
|
||||
</list>
|
||||
|
@ -187,6 +189,19 @@
|
|||
<item name="chapter1" type="462c0819:PsiDirectoryNode" />
|
||||
<item name="crawler" type="462c0819:PsiDirectoryNode" />
|
||||
</path>
|
||||
<path>
|
||||
<item name="chapter1" type="b2602c69:ProjectViewProjectNode" />
|
||||
<item name="chapter1" type="462c0819:PsiDirectoryNode" />
|
||||
<item name="crawler" type="462c0819:PsiDirectoryNode" />
|
||||
<item name="taobao" type="462c0819:PsiDirectoryNode" />
|
||||
</path>
|
||||
<path>
|
||||
<item name="chapter1" type="b2602c69:ProjectViewProjectNode" />
|
||||
<item name="chapter1" type="462c0819:PsiDirectoryNode" />
|
||||
<item name="crawler" type="462c0819:PsiDirectoryNode" />
|
||||
<item name="taobao" type="462c0819:PsiDirectoryNode" />
|
||||
<item name="spiders" type="462c0819:PsiDirectoryNode" />
|
||||
</path>
|
||||
</expand>
|
||||
<select />
|
||||
</subPane>
|
||||
|
@ -336,7 +351,6 @@
|
|||
</component>
|
||||
<component name="ToolWindowManager">
|
||||
<frame x="0" y="23" width="1280" height="777" extended-state="0" />
|
||||
<editor active="true" />
|
||||
<layout>
|
||||
<window_info content_ui="combo" id="Project" order="0" visible="true" weight="0.25767368" />
|
||||
<window_info id="Structure" order="1" side_tool="true" weight="0.25" />
|
||||
|
@ -349,7 +363,7 @@
|
|||
<window_info anchor="bottom" id="Inspection" order="5" weight="0.4" />
|
||||
<window_info anchor="bottom" id="TODO" order="6" weight="0.329927" />
|
||||
<window_info anchor="bottom" id="Version Control" order="7" />
|
||||
<window_info active="true" anchor="bottom" id="Terminal" order="8" visible="true" weight="0.33430657" />
|
||||
<window_info active="true" anchor="bottom" id="Terminal" order="8" visible="true" weight="0.649635" />
|
||||
<window_info anchor="bottom" id="Event Log" order="9" side_tool="true" />
|
||||
<window_info anchor="bottom" id="Python Console" order="10" weight="0.329927" />
|
||||
<window_info anchor="right" id="Commander" internal_type="SLIDING" order="0" type="SLIDING" weight="0.4" />
|
||||
|
@ -523,7 +537,7 @@
|
|||
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/parsel/selector.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="245">
|
||||
<caret line="238" column="39" lean-forward="true" selection-start-line="238" selection-start-column="39" selection-end-line="238" selection-end-column="39" />
|
||||
<caret line="238" column="39" selection-start-line="238" selection-start-column="39" selection-end-line="238" selection-end-column="39" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
|
@ -544,24 +558,48 @@
|
|||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/crawler/taobao/items.py">
|
||||
<entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/comment.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="269">
|
||||
<caret line="40" column="28" lean-forward="true" selection-start-line="40" selection-start-column="28" selection-end-line="40" selection-end-column="28" />
|
||||
<state relative-caret-position="-271">
|
||||
<caret line="36" column="53" selection-start-line="36" selection-start-column="53" selection-end-line="36" selection-end-column="53" />
|
||||
<folding>
|
||||
<element signature="e#24#35#0" expanded="true" />
|
||||
</folding>
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/scrapy/item.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="390">
|
||||
<caret line="68" column="21" lean-forward="true" selection-start-line="68" selection-start-column="21" selection-end-line="68" selection-end-column="21" />
|
||||
<caret line="68" column="21" selection-start-line="68" selection-start-column="21" selection-end-line="68" selection-end-column="21" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/happybase/connection.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="270">
|
||||
<caret line="107" column="8" selection-start-line="107" selection-start-column="8" selection-end-line="107" selection-end-column="8" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/scrapy/conf.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="105">
|
||||
<caret line="7" column="4" selection-start-line="7" selection-start-column="4" selection-end-line="7" selection-end-column="4" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/crawler/taobao/items.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="374">
|
||||
<caret line="47" column="15" selection-start-line="47" selection-start-column="4" selection-end-line="47" selection-end-column="15" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/crawler/taobao/pipelines.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="282">
|
||||
<caret line="60" column="21" selection-start-line="60" selection-start-column="6" selection-end-line="60" selection-end-column="21" />
|
||||
<state relative-caret-position="1155">
|
||||
<caret line="77" column="31" lean-forward="true" selection-start-line="77" selection-start-column="31" selection-end-line="77" selection-end-column="31" />
|
||||
<folding>
|
||||
<element signature="e#193#204#0" expanded="true" />
|
||||
</folding>
|
||||
|
@ -570,27 +608,17 @@
|
|||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/crawler/taobao/settings.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="196">
|
||||
<caret line="88" selection-start-line="88" selection-end-line="88" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/comment.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="-271">
|
||||
<caret line="36" column="53" lean-forward="true" selection-start-line="36" selection-start-column="53" selection-end-line="36" selection-end-column="53" />
|
||||
<folding>
|
||||
<element signature="e#24#35#0" expanded="true" />
|
||||
</folding>
|
||||
<state relative-caret-position="426">
|
||||
<caret line="119" column="12" selection-start-line="119" selection-start-column="12" selection-end-line="119" selection-end-column="12" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/jd.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="241">
|
||||
<caret line="67" column="24" lean-forward="true" selection-start-line="67" selection-start-column="24" selection-end-line="67" selection-end-column="24" />
|
||||
<state relative-caret-position="63">
|
||||
<caret line="72" column="40" lean-forward="true" selection-start-line="72" selection-start-column="40" selection-end-line="72" selection-end-column="40" />
|
||||
<folding>
|
||||
<element signature="e#14#27#0" expanded="true" />
|
||||
<element signature="e#14#25#0" expanded="true" />
|
||||
</folding>
|
||||
</state>
|
||||
</provider>
|
||||
|
|
|
@ -8,6 +8,8 @@ import json
|
|||
import pdb
|
||||
import re
|
||||
|
||||
import happybase
|
||||
from scrapy.conf import settings
|
||||
from scrapy.pipelines.images import ImagesPipeline
|
||||
|
||||
|
||||
|
@ -23,20 +25,25 @@ class ZhihuPipeline(object):
|
|||
class JsonWithEncodingCnblogsPipeline(object):
    """Write every scraped item as one UTF-8 JSON line to a per-keyword file.

    The output path is ``../../datas/<spider.keyword>.json``, relative to the
    working directory of the crawl process.
    """

    def open_spider(self, spider):
        """Open the output file named after ``spider.keyword`` for writing."""
        self.file = open('../../datas/%s.json' % spider.keyword, 'w', encoding='utf-8')

    def process_item(self, item, spider):
        """Append ``item`` as a single JSON line and return it unchanged."""
        # ensure_ascii=False keeps non-ASCII text as-is instead of \uXXXX escapes.
        line = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(line)
        return item

    def close_spider(self, spider):
        """Close the output file when the spider finishes.

        Scrapy invokes ``close_spider`` on item pipelines; the former name
        ``spider_closed`` is not a pipeline hook, so the file was never
        closed explicitly.
        """
        handle = getattr(self, 'file', None)
        if handle is not None:
            handle.close()

    # Backward-compatible alias for callers that used the old method name.
    spider_closed = close_spider
|
||||
|
||||
|
||||
class JsonWithEncodingCommentsPipeline(object):
    """Write every scraped comment item as one UTF-8 JSON line per product.

    The output path is ``../../datas/comments/<spider.product_id>.json``,
    relative to the working directory of the crawl process.
    """

    def open_spider(self, spider):
        """Open the output file named after ``spider.product_id`` for writing."""
        self.file = open('../../datas/comments/%s.json' % spider.product_id, 'w', encoding='utf-8')

    def process_item(self, item, spider):
        """Append ``item`` as a single JSON line and return it unchanged."""
        # ensure_ascii=False keeps non-ASCII text as-is instead of \uXXXX escapes.
        line = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(line)
        return item

    def close_spider(self, spider):
        """Close the output file when the spider finishes.

        Scrapy invokes ``close_spider`` on item pipelines; the former name
        ``spider_closed`` is not a pipeline hook, so the file was never
        closed explicitly.
        """
        handle = getattr(self, 'file', None)
        if handle is not None:
            handle.close()

    # Backward-compatible alias for callers that used the old method name.
    spider_closed = close_spider
|
||||
|
||||
|
@ -45,9 +52,10 @@ class JDCleanDataPipeline(object):
|
|||
"""
|
||||
京东数据清先
|
||||
"""
|
||||
|
||||
def process_item(self, item, spider):
|
||||
if isinstance(item["name"], str):
|
||||
item["name"] = item["name"].strip().replace('\n','').split()
|
||||
item["name"] = item["name"].strip().replace('\n', '')
|
||||
|
||||
m = re.match(r'.+\/(\d+)\.html', item['url'])
|
||||
if m:
|
||||
|
@ -58,8 +66,25 @@ class JDCleanDataPipeline(object):
|
|||
|
||||
return item
|
||||
|
||||
|
||||
class JDHbasePipeline(object):
    """Store scraped product items in an HBase table through happybase.

    Connection parameters are read from the Scrapy settings keys
    ``HBASE_HOST``, ``HBASE_PORT`` and ``HBASE_TABLE``.
    """

    def __init__(self):
        """Open the HBase connection and look up the target table."""
        host = settings['HBASE_HOST']
        table_name = settings['HBASE_TABLE']
        # Keep the connection on the instance so close_spider can release it;
        # previously it was a local that was never closed.
        self.connection = happybase.Connection(host, port=settings["HBASE_PORT"])
        self.table = self.connection.table(table_name)

    def process_item(self, item, spider):
        """Write ``item`` under row key ``item["id"]`` and return it unchanged.

        All fields are stored in the ``product`` column family.
        """
        self.table.put(item["id"],
                       {"product:name": item["name"],
                        "product:price": item["price"],
                        "product:url": item["url"],
                        "product:img": item["img"],
                        "product:shop": item["shop"],
                        "product:comment_num": item["comment_num"]})
        return item

    def close_spider(self, spider):
        """Close the HBase connection when the spider finishes."""
        connection = getattr(self, 'connection', None)
        if connection is not None:
            connection.close()
|
||||
|
|
|
@ -112,3 +112,9 @@ IMAGES_STORE = '/tmp/taobao'
|
|||
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
|
||||
|
||||
FEED_EXPORT_ENCODING='utf-8'
|
||||
|
||||
|
||||
|
||||
HBASE_HOST = '106.75.85.84'
|
||||
HBASE_PORT = 40009
|
||||
HBASE_TABLE = 'jd'
|
|
@ -1,4 +1,6 @@
|
|||
#coding=utf-8
|
||||
import time
|
||||
|
||||
import scrapy
|
||||
from scrapy_splash import SplashRequest
|
||||
import pdb
|
||||
|
@ -65,6 +67,8 @@ class JdSpider(scrapy.Spider):
|
|||
page = m.group(1)
|
||||
print("开始抓取:{0}页".format(page))
|
||||
|
||||
time.sleep(10)
|
||||
|
||||
yield SplashRequest(
|
||||
url= 'https://list.jd.com'+next_page
|
||||
)
|
||||
|
|
Loading…
Reference in New Issue