导入hbase
This commit is contained in:
parent
1a2e31cd14
commit
e38ba68191
|
@ -1,7 +1,12 @@
|
||||||
<?xml version="1.0" encoding="UTF-8"?>
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
<project version="4">
|
<project version="4">
|
||||||
<component name="ChangeListManager">
|
<component name="ChangeListManager">
|
||||||
<list default="true" id="a5fbb387-9969-4874-8a7f-c9dd40d5225d" name="Default Changelist" comment="" />
|
<list default="true" id="a5fbb387-9969-4874-8a7f-c9dd40d5225d" name="Default Changelist" comment="">
|
||||||
|
<change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" />
|
||||||
|
<change beforePath="$PROJECT_DIR$/crawler/taobao/pipelines.py" beforeDir="false" afterPath="$PROJECT_DIR$/crawler/taobao/pipelines.py" afterDir="false" />
|
||||||
|
<change beforePath="$PROJECT_DIR$/crawler/taobao/settings.py" beforeDir="false" afterPath="$PROJECT_DIR$/crawler/taobao/settings.py" afterDir="false" />
|
||||||
|
<change beforePath="$PROJECT_DIR$/crawler/taobao/spiders/jd.py" beforeDir="false" afterPath="$PROJECT_DIR$/crawler/taobao/spiders/jd.py" afterDir="false" />
|
||||||
|
</list>
|
||||||
<option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
|
<option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
|
||||||
<option name="SHOW_DIALOG" value="false" />
|
<option name="SHOW_DIALOG" value="false" />
|
||||||
<option name="HIGHLIGHT_CONFLICTS" value="true" />
|
<option name="HIGHLIGHT_CONFLICTS" value="true" />
|
||||||
|
@ -18,17 +23,8 @@
|
||||||
<file pinned="false" current-in-tab="false">
|
<file pinned="false" current-in-tab="false">
|
||||||
<entry file="file://$PROJECT_DIR$/crawler/taobao/items.py">
|
<entry file="file://$PROJECT_DIR$/crawler/taobao/items.py">
|
||||||
<provider selected="true" editor-type-id="text-editor">
|
<provider selected="true" editor-type-id="text-editor">
|
||||||
<state relative-caret-position="269">
|
<state relative-caret-position="374">
|
||||||
<caret line="40" column="28" lean-forward="true" selection-start-line="40" selection-start-column="28" selection-end-line="40" selection-end-column="28" />
|
<caret line="47" column="15" selection-start-line="47" selection-start-column="4" selection-end-line="47" selection-end-column="15" />
|
||||||
</state>
|
|
||||||
</provider>
|
|
||||||
</entry>
|
|
||||||
</file>
|
|
||||||
<file pinned="false" current-in-tab="false">
|
|
||||||
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/scrapy/item.py">
|
|
||||||
<provider selected="true" editor-type-id="text-editor">
|
|
||||||
<state relative-caret-position="390">
|
|
||||||
<caret line="68" column="21" lean-forward="true" selection-start-line="68" selection-start-column="21" selection-end-line="68" selection-end-column="21" />
|
|
||||||
</state>
|
</state>
|
||||||
</provider>
|
</provider>
|
||||||
</entry>
|
</entry>
|
||||||
|
@ -36,8 +32,8 @@
|
||||||
<file pinned="false" current-in-tab="false">
|
<file pinned="false" current-in-tab="false">
|
||||||
<entry file="file://$PROJECT_DIR$/crawler/taobao/pipelines.py">
|
<entry file="file://$PROJECT_DIR$/crawler/taobao/pipelines.py">
|
||||||
<provider selected="true" editor-type-id="text-editor">
|
<provider selected="true" editor-type-id="text-editor">
|
||||||
<state relative-caret-position="282">
|
<state relative-caret-position="1155">
|
||||||
<caret line="60" column="21" selection-start-line="60" selection-start-column="6" selection-end-line="60" selection-end-column="21" />
|
<caret line="77" column="31" lean-forward="true" selection-start-line="77" selection-start-column="31" selection-end-line="77" selection-end-column="31" />
|
||||||
<folding>
|
<folding>
|
||||||
<element signature="e#193#204#0" expanded="true" />
|
<element signature="e#193#204#0" expanded="true" />
|
||||||
</folding>
|
</folding>
|
||||||
|
@ -45,11 +41,29 @@
|
||||||
</provider>
|
</provider>
|
||||||
</entry>
|
</entry>
|
||||||
</file>
|
</file>
|
||||||
|
<file pinned="false" current-in-tab="false">
|
||||||
|
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/scrapy/conf.py">
|
||||||
|
<provider selected="true" editor-type-id="text-editor">
|
||||||
|
<state relative-caret-position="105">
|
||||||
|
<caret line="7" column="4" selection-start-line="7" selection-start-column="4" selection-end-line="7" selection-end-column="4" />
|
||||||
|
</state>
|
||||||
|
</provider>
|
||||||
|
</entry>
|
||||||
|
</file>
|
||||||
|
<file pinned="false" current-in-tab="false">
|
||||||
|
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/happybase/connection.py">
|
||||||
|
<provider selected="true" editor-type-id="text-editor">
|
||||||
|
<state relative-caret-position="270">
|
||||||
|
<caret line="107" column="8" selection-start-line="107" selection-start-column="8" selection-end-line="107" selection-end-column="8" />
|
||||||
|
</state>
|
||||||
|
</provider>
|
||||||
|
</entry>
|
||||||
|
</file>
|
||||||
<file pinned="false" current-in-tab="false">
|
<file pinned="false" current-in-tab="false">
|
||||||
<entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/comment.py">
|
<entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/comment.py">
|
||||||
<provider selected="true" editor-type-id="text-editor">
|
<provider selected="true" editor-type-id="text-editor">
|
||||||
<state relative-caret-position="-271">
|
<state relative-caret-position="-271">
|
||||||
<caret line="36" column="53" lean-forward="true" selection-start-line="36" selection-start-column="53" selection-end-line="36" selection-end-column="53" />
|
<caret line="36" column="53" selection-start-line="36" selection-start-column="53" selection-end-line="36" selection-end-column="53" />
|
||||||
<folding>
|
<folding>
|
||||||
<element signature="e#24#35#0" expanded="true" />
|
<element signature="e#24#35#0" expanded="true" />
|
||||||
</folding>
|
</folding>
|
||||||
|
@ -69,23 +83,11 @@
|
||||||
</provider>
|
</provider>
|
||||||
</entry>
|
</entry>
|
||||||
</file>
|
</file>
|
||||||
<file pinned="false" current-in-tab="false">
|
|
||||||
<entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/category.py">
|
|
||||||
<provider selected="true" editor-type-id="text-editor">
|
|
||||||
<state relative-caret-position="251">
|
|
||||||
<caret line="47" column="59" selection-start-line="47" selection-start-column="54" selection-end-line="47" selection-end-column="59" />
|
|
||||||
<folding>
|
|
||||||
<element signature="e#24#35#0" expanded="true" />
|
|
||||||
</folding>
|
|
||||||
</state>
|
|
||||||
</provider>
|
|
||||||
</entry>
|
|
||||||
</file>
|
|
||||||
<file pinned="false" current-in-tab="false">
|
<file pinned="false" current-in-tab="false">
|
||||||
<entry file="file://$PROJECT_DIR$/crawler/taobao/settings.py">
|
<entry file="file://$PROJECT_DIR$/crawler/taobao/settings.py">
|
||||||
<provider selected="true" editor-type-id="text-editor">
|
<provider selected="true" editor-type-id="text-editor">
|
||||||
<state relative-caret-position="196">
|
<state relative-caret-position="426">
|
||||||
<caret line="88" selection-start-line="88" selection-end-line="88" />
|
<caret line="119" column="12" selection-start-line="119" selection-start-column="12" selection-end-line="119" selection-end-column="12" />
|
||||||
</state>
|
</state>
|
||||||
</provider>
|
</provider>
|
||||||
</entry>
|
</entry>
|
||||||
|
@ -93,10 +95,10 @@
|
||||||
<file pinned="false" current-in-tab="true">
|
<file pinned="false" current-in-tab="true">
|
||||||
<entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/jd.py">
|
<entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/jd.py">
|
||||||
<provider selected="true" editor-type-id="text-editor">
|
<provider selected="true" editor-type-id="text-editor">
|
||||||
<state relative-caret-position="241">
|
<state relative-caret-position="63">
|
||||||
<caret line="67" column="24" lean-forward="true" selection-start-line="67" selection-start-column="24" selection-end-line="67" selection-end-column="24" />
|
<caret line="72" column="40" lean-forward="true" selection-start-line="72" selection-start-column="40" selection-end-line="72" selection-end-column="40" />
|
||||||
<folding>
|
<folding>
|
||||||
<element signature="e#14#27#0" expanded="true" />
|
<element signature="e#14#25#0" expanded="true" />
|
||||||
</folding>
|
</folding>
|
||||||
</state>
|
</state>
|
||||||
</provider>
|
</provider>
|
||||||
|
@ -115,7 +117,7 @@
|
||||||
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/parsel/selector.py">
|
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/parsel/selector.py">
|
||||||
<provider selected="true" editor-type-id="text-editor">
|
<provider selected="true" editor-type-id="text-editor">
|
||||||
<state relative-caret-position="245">
|
<state relative-caret-position="245">
|
||||||
<caret line="238" column="39" lean-forward="true" selection-start-line="238" selection-start-column="39" selection-end-line="238" selection-end-column="39" />
|
<caret line="238" column="39" selection-start-line="238" selection-start-column="39" selection-end-line="238" selection-end-column="39" />
|
||||||
</state>
|
</state>
|
||||||
</provider>
|
</provider>
|
||||||
</entry>
|
</entry>
|
||||||
|
@ -157,8 +159,8 @@
|
||||||
<option value="$PROJECT_DIR$/crawler/taobao/merge.py" />
|
<option value="$PROJECT_DIR$/crawler/taobao/merge.py" />
|
||||||
<option value="$PROJECT_DIR$/crawler/taobao/spiders/comment.py" />
|
<option value="$PROJECT_DIR$/crawler/taobao/spiders/comment.py" />
|
||||||
<option value="$PROJECT_DIR$/crawler/taobao/spiders/category.py" />
|
<option value="$PROJECT_DIR$/crawler/taobao/spiders/category.py" />
|
||||||
<option value="$PROJECT_DIR$/crawler/taobao/settings.py" />
|
|
||||||
<option value="$PROJECT_DIR$/crawler/taobao/items.py" />
|
<option value="$PROJECT_DIR$/crawler/taobao/items.py" />
|
||||||
|
<option value="$PROJECT_DIR$/crawler/taobao/settings.py" />
|
||||||
<option value="$PROJECT_DIR$/crawler/taobao/pipelines.py" />
|
<option value="$PROJECT_DIR$/crawler/taobao/pipelines.py" />
|
||||||
<option value="$PROJECT_DIR$/crawler/taobao/spiders/jd.py" />
|
<option value="$PROJECT_DIR$/crawler/taobao/spiders/jd.py" />
|
||||||
</list>
|
</list>
|
||||||
|
@ -187,6 +189,19 @@
|
||||||
<item name="chapter1" type="462c0819:PsiDirectoryNode" />
|
<item name="chapter1" type="462c0819:PsiDirectoryNode" />
|
||||||
<item name="crawler" type="462c0819:PsiDirectoryNode" />
|
<item name="crawler" type="462c0819:PsiDirectoryNode" />
|
||||||
</path>
|
</path>
|
||||||
|
<path>
|
||||||
|
<item name="chapter1" type="b2602c69:ProjectViewProjectNode" />
|
||||||
|
<item name="chapter1" type="462c0819:PsiDirectoryNode" />
|
||||||
|
<item name="crawler" type="462c0819:PsiDirectoryNode" />
|
||||||
|
<item name="taobao" type="462c0819:PsiDirectoryNode" />
|
||||||
|
</path>
|
||||||
|
<path>
|
||||||
|
<item name="chapter1" type="b2602c69:ProjectViewProjectNode" />
|
||||||
|
<item name="chapter1" type="462c0819:PsiDirectoryNode" />
|
||||||
|
<item name="crawler" type="462c0819:PsiDirectoryNode" />
|
||||||
|
<item name="taobao" type="462c0819:PsiDirectoryNode" />
|
||||||
|
<item name="spiders" type="462c0819:PsiDirectoryNode" />
|
||||||
|
</path>
|
||||||
</expand>
|
</expand>
|
||||||
<select />
|
<select />
|
||||||
</subPane>
|
</subPane>
|
||||||
|
@ -336,7 +351,6 @@
|
||||||
</component>
|
</component>
|
||||||
<component name="ToolWindowManager">
|
<component name="ToolWindowManager">
|
||||||
<frame x="0" y="23" width="1280" height="777" extended-state="0" />
|
<frame x="0" y="23" width="1280" height="777" extended-state="0" />
|
||||||
<editor active="true" />
|
|
||||||
<layout>
|
<layout>
|
||||||
<window_info content_ui="combo" id="Project" order="0" visible="true" weight="0.25767368" />
|
<window_info content_ui="combo" id="Project" order="0" visible="true" weight="0.25767368" />
|
||||||
<window_info id="Structure" order="1" side_tool="true" weight="0.25" />
|
<window_info id="Structure" order="1" side_tool="true" weight="0.25" />
|
||||||
|
@ -349,7 +363,7 @@
|
||||||
<window_info anchor="bottom" id="Inspection" order="5" weight="0.4" />
|
<window_info anchor="bottom" id="Inspection" order="5" weight="0.4" />
|
||||||
<window_info anchor="bottom" id="TODO" order="6" weight="0.329927" />
|
<window_info anchor="bottom" id="TODO" order="6" weight="0.329927" />
|
||||||
<window_info anchor="bottom" id="Version Control" order="7" />
|
<window_info anchor="bottom" id="Version Control" order="7" />
|
||||||
<window_info active="true" anchor="bottom" id="Terminal" order="8" visible="true" weight="0.33430657" />
|
<window_info active="true" anchor="bottom" id="Terminal" order="8" visible="true" weight="0.649635" />
|
||||||
<window_info anchor="bottom" id="Event Log" order="9" side_tool="true" />
|
<window_info anchor="bottom" id="Event Log" order="9" side_tool="true" />
|
||||||
<window_info anchor="bottom" id="Python Console" order="10" weight="0.329927" />
|
<window_info anchor="bottom" id="Python Console" order="10" weight="0.329927" />
|
||||||
<window_info anchor="right" id="Commander" internal_type="SLIDING" order="0" type="SLIDING" weight="0.4" />
|
<window_info anchor="right" id="Commander" internal_type="SLIDING" order="0" type="SLIDING" weight="0.4" />
|
||||||
|
@ -523,7 +537,7 @@
|
||||||
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/parsel/selector.py">
|
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/parsel/selector.py">
|
||||||
<provider selected="true" editor-type-id="text-editor">
|
<provider selected="true" editor-type-id="text-editor">
|
||||||
<state relative-caret-position="245">
|
<state relative-caret-position="245">
|
||||||
<caret line="238" column="39" lean-forward="true" selection-start-line="238" selection-start-column="39" selection-end-line="238" selection-end-column="39" />
|
<caret line="238" column="39" selection-start-line="238" selection-start-column="39" selection-end-line="238" selection-end-column="39" />
|
||||||
</state>
|
</state>
|
||||||
</provider>
|
</provider>
|
||||||
</entry>
|
</entry>
|
||||||
|
@ -544,24 +558,48 @@
|
||||||
</state>
|
</state>
|
||||||
</provider>
|
</provider>
|
||||||
</entry>
|
</entry>
|
||||||
<entry file="file://$PROJECT_DIR$/crawler/taobao/items.py">
|
<entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/comment.py">
|
||||||
<provider selected="true" editor-type-id="text-editor">
|
<provider selected="true" editor-type-id="text-editor">
|
||||||
<state relative-caret-position="269">
|
<state relative-caret-position="-271">
|
||||||
<caret line="40" column="28" lean-forward="true" selection-start-line="40" selection-start-column="28" selection-end-line="40" selection-end-column="28" />
|
<caret line="36" column="53" selection-start-line="36" selection-start-column="53" selection-end-line="36" selection-end-column="53" />
|
||||||
|
<folding>
|
||||||
|
<element signature="e#24#35#0" expanded="true" />
|
||||||
|
</folding>
|
||||||
</state>
|
</state>
|
||||||
</provider>
|
</provider>
|
||||||
</entry>
|
</entry>
|
||||||
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/scrapy/item.py">
|
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/scrapy/item.py">
|
||||||
<provider selected="true" editor-type-id="text-editor">
|
<provider selected="true" editor-type-id="text-editor">
|
||||||
<state relative-caret-position="390">
|
<state relative-caret-position="390">
|
||||||
<caret line="68" column="21" lean-forward="true" selection-start-line="68" selection-start-column="21" selection-end-line="68" selection-end-column="21" />
|
<caret line="68" column="21" selection-start-line="68" selection-start-column="21" selection-end-line="68" selection-end-column="21" />
|
||||||
|
</state>
|
||||||
|
</provider>
|
||||||
|
</entry>
|
||||||
|
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/happybase/connection.py">
|
||||||
|
<provider selected="true" editor-type-id="text-editor">
|
||||||
|
<state relative-caret-position="270">
|
||||||
|
<caret line="107" column="8" selection-start-line="107" selection-start-column="8" selection-end-line="107" selection-end-column="8" />
|
||||||
|
</state>
|
||||||
|
</provider>
|
||||||
|
</entry>
|
||||||
|
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/scrapy/conf.py">
|
||||||
|
<provider selected="true" editor-type-id="text-editor">
|
||||||
|
<state relative-caret-position="105">
|
||||||
|
<caret line="7" column="4" selection-start-line="7" selection-start-column="4" selection-end-line="7" selection-end-column="4" />
|
||||||
|
</state>
|
||||||
|
</provider>
|
||||||
|
</entry>
|
||||||
|
<entry file="file://$PROJECT_DIR$/crawler/taobao/items.py">
|
||||||
|
<provider selected="true" editor-type-id="text-editor">
|
||||||
|
<state relative-caret-position="374">
|
||||||
|
<caret line="47" column="15" selection-start-line="47" selection-start-column="4" selection-end-line="47" selection-end-column="15" />
|
||||||
</state>
|
</state>
|
||||||
</provider>
|
</provider>
|
||||||
</entry>
|
</entry>
|
||||||
<entry file="file://$PROJECT_DIR$/crawler/taobao/pipelines.py">
|
<entry file="file://$PROJECT_DIR$/crawler/taobao/pipelines.py">
|
||||||
<provider selected="true" editor-type-id="text-editor">
|
<provider selected="true" editor-type-id="text-editor">
|
||||||
<state relative-caret-position="282">
|
<state relative-caret-position="1155">
|
||||||
<caret line="60" column="21" selection-start-line="60" selection-start-column="6" selection-end-line="60" selection-end-column="21" />
|
<caret line="77" column="31" lean-forward="true" selection-start-line="77" selection-start-column="31" selection-end-line="77" selection-end-column="31" />
|
||||||
<folding>
|
<folding>
|
||||||
<element signature="e#193#204#0" expanded="true" />
|
<element signature="e#193#204#0" expanded="true" />
|
||||||
</folding>
|
</folding>
|
||||||
|
@ -570,27 +608,17 @@
|
||||||
</entry>
|
</entry>
|
||||||
<entry file="file://$PROJECT_DIR$/crawler/taobao/settings.py">
|
<entry file="file://$PROJECT_DIR$/crawler/taobao/settings.py">
|
||||||
<provider selected="true" editor-type-id="text-editor">
|
<provider selected="true" editor-type-id="text-editor">
|
||||||
<state relative-caret-position="196">
|
<state relative-caret-position="426">
|
||||||
<caret line="88" selection-start-line="88" selection-end-line="88" />
|
<caret line="119" column="12" selection-start-line="119" selection-start-column="12" selection-end-line="119" selection-end-column="12" />
|
||||||
</state>
|
|
||||||
</provider>
|
|
||||||
</entry>
|
|
||||||
<entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/comment.py">
|
|
||||||
<provider selected="true" editor-type-id="text-editor">
|
|
||||||
<state relative-caret-position="-271">
|
|
||||||
<caret line="36" column="53" lean-forward="true" selection-start-line="36" selection-start-column="53" selection-end-line="36" selection-end-column="53" />
|
|
||||||
<folding>
|
|
||||||
<element signature="e#24#35#0" expanded="true" />
|
|
||||||
</folding>
|
|
||||||
</state>
|
</state>
|
||||||
</provider>
|
</provider>
|
||||||
</entry>
|
</entry>
|
||||||
<entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/jd.py">
|
<entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/jd.py">
|
||||||
<provider selected="true" editor-type-id="text-editor">
|
<provider selected="true" editor-type-id="text-editor">
|
||||||
<state relative-caret-position="241">
|
<state relative-caret-position="63">
|
||||||
<caret line="67" column="24" lean-forward="true" selection-start-line="67" selection-start-column="24" selection-end-line="67" selection-end-column="24" />
|
<caret line="72" column="40" lean-forward="true" selection-start-line="72" selection-start-column="40" selection-end-line="72" selection-end-column="40" />
|
||||||
<folding>
|
<folding>
|
||||||
<element signature="e#14#27#0" expanded="true" />
|
<element signature="e#14#25#0" expanded="true" />
|
||||||
</folding>
|
</folding>
|
||||||
</state>
|
</state>
|
||||||
</provider>
|
</provider>
|
||||||
|
|
|
@ -8,6 +8,8 @@ import json
|
||||||
import pdb
|
import pdb
|
||||||
import re
|
import re
|
||||||
|
|
||||||
|
import happybase
|
||||||
|
from scrapy.conf import settings
|
||||||
from scrapy.pipelines.images import ImagesPipeline
|
from scrapy.pipelines.images import ImagesPipeline
|
||||||
|
|
||||||
|
|
||||||
|
@ -23,20 +25,25 @@ class ZhihuPipeline(object):
|
||||||
class JsonWithEncodingCnblogsPipeline(object):
|
class JsonWithEncodingCnblogsPipeline(object):
|
||||||
def open_spider(self, spider):
|
def open_spider(self, spider):
|
||||||
self.file = open('../../datas/%s.json' % spider.keyword, 'w', encoding='utf-8')
|
self.file = open('../../datas/%s.json' % spider.keyword, 'w', encoding='utf-8')
|
||||||
|
|
||||||
def process_item(self, item, spider):
|
def process_item(self, item, spider):
|
||||||
line = json.dumps(dict(item), ensure_ascii=False) + "\n"
|
line = json.dumps(dict(item), ensure_ascii=False) + "\n"
|
||||||
self.file.write(line)
|
self.file.write(line)
|
||||||
return item
|
return item
|
||||||
|
|
||||||
def spider_closed(self, spider):
|
def spider_closed(self, spider):
|
||||||
self.file.close()
|
self.file.close()
|
||||||
|
|
||||||
|
|
||||||
class JsonWithEncodingCommentsPipeline(object):
|
class JsonWithEncodingCommentsPipeline(object):
|
||||||
def open_spider(self, spider):
|
def open_spider(self, spider):
|
||||||
self.file = open('../../datas/comments/%s.json' % spider.product_id, 'w', encoding='utf-8')
|
self.file = open('../../datas/comments/%s.json' % spider.product_id, 'w', encoding='utf-8')
|
||||||
|
|
||||||
def process_item(self, item, spider):
|
def process_item(self, item, spider):
|
||||||
line = json.dumps(dict(item), ensure_ascii=False) + "\n"
|
line = json.dumps(dict(item), ensure_ascii=False) + "\n"
|
||||||
self.file.write(line)
|
self.file.write(line)
|
||||||
return item
|
return item
|
||||||
|
|
||||||
def spider_closed(self, spider):
|
def spider_closed(self, spider):
|
||||||
self.file.close()
|
self.file.close()
|
||||||
|
|
||||||
|
@ -45,9 +52,10 @@ class JDCleanDataPipeline(object):
|
||||||
"""
|
"""
|
||||||
京东数据清先
|
京东数据清先
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def process_item(self, item, spider):
|
def process_item(self, item, spider):
|
||||||
if isinstance(item["name"], str):
|
if isinstance(item["name"], str):
|
||||||
item["name"] = item["name"].strip().replace('\n','').split()
|
item["name"] = item["name"].strip().replace('\n', '')
|
||||||
|
|
||||||
m = re.match(r'.+\/(\d+)\.html', item['url'])
|
m = re.match(r'.+\/(\d+)\.html', item['url'])
|
||||||
if m:
|
if m:
|
||||||
|
@ -58,8 +66,25 @@ class JDCleanDataPipeline(object):
|
||||||
|
|
||||||
return item
|
return item
|
||||||
|
|
||||||
|
|
||||||
class JDHbasePipeline(object):
|
class JDHbasePipeline(object):
|
||||||
"""
|
"""
|
||||||
数据存到hbase中
|
数据存到hbase中
|
||||||
"""
|
"""
|
||||||
pass
|
|
||||||
|
def __init__(self):
|
||||||
|
host = settings['HBASE_HOST']
|
||||||
|
table_name = settings['HBASE_TABLE']
|
||||||
|
connection = happybase.Connection(host, port=settings["HBASE_PORT"])
|
||||||
|
table = connection.table(table_name)
|
||||||
|
self.table = table
|
||||||
|
|
||||||
|
def process_item(self, item, spider):
|
||||||
|
self.table.put(item["id"],
|
||||||
|
{"product:name": item["name"],
|
||||||
|
"product:price": item["price"],
|
||||||
|
"product:url": item["url"],
|
||||||
|
"product:img": item["img"],
|
||||||
|
"product:shop": item["shop"],
|
||||||
|
"product:comment_num": item["comment_num"]})
|
||||||
|
return item
|
||||||
|
|
|
@ -112,3 +112,9 @@ IMAGES_STORE = '/tmp/taobao'
|
||||||
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
|
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
|
||||||
|
|
||||||
FEED_EXPORT_ENCODING='utf-8'
|
FEED_EXPORT_ENCODING='utf-8'
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
HBASE_HOST = '106.75.85.84'
|
||||||
|
HBASE_PORT = 40009
|
||||||
|
HBASE_TABLE = 'jd'
|
|
@ -1,4 +1,6 @@
|
||||||
#coding=utf-8
|
#coding=utf-8
|
||||||
|
import time
|
||||||
|
|
||||||
import scrapy
|
import scrapy
|
||||||
from scrapy_splash import SplashRequest
|
from scrapy_splash import SplashRequest
|
||||||
import pdb
|
import pdb
|
||||||
|
@ -65,6 +67,8 @@ class JdSpider(scrapy.Spider):
|
||||||
page = m.group(1)
|
page = m.group(1)
|
||||||
print("开始抓取:{0}页".format(page))
|
print("开始抓取:{0}页".format(page))
|
||||||
|
|
||||||
|
time.sleep(10)
|
||||||
|
|
||||||
yield SplashRequest(
|
yield SplashRequest(
|
||||||
url= 'https://list.jd.com'+next_page
|
url= 'https://list.jd.com'+next_page
|
||||||
)
|
)
|
||||||
|
|
Loading…
Reference in New Issue