加入京东抓取

This commit is contained in:
guange 2019-01-14 16:28:26 +08:00
parent ee0aea16aa
commit 63c091e6e1
6 changed files with 264 additions and 195 deletions

View File

@ -1,11 +1,13 @@
<?xml version="1.0" encoding="UTF-8"?> <?xml version="1.0" encoding="UTF-8"?>
<project version="4"> <project version="4">
<component name="ChangeListManager"> <component name="ChangeListManager">
<list default="true" id="a5fbb387-9969-4874-8a7f-c9dd40d5225d" name="Default Changelist" comment=""> <list default="true" id="a5fbb387-9969-4874-8a7f-c9dd40d5225d" name="Default Changelist" comment="增加id字段">
<change afterPath="$PROJECT_DIR$/crawler/taobao/merge.py" afterDir="false" /> <change afterPath="$PROJECT_DIR$/crawler/taobao/spiders/jd.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/crawler/datas/kind.json" beforeDir="false" /> <change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" />
<change beforePath="$PROJECT_DIR$/crawler/taobao/items.py" beforeDir="false" afterPath="$PROJECT_DIR$/crawler/taobao/items.py" afterDir="false" /> <change beforePath="$PROJECT_DIR$/crawler/taobao/items.py" beforeDir="false" afterPath="$PROJECT_DIR$/crawler/taobao/items.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/crawler/taobao/pipelines.py" beforeDir="false" afterPath="$PROJECT_DIR$/crawler/taobao/pipelines.py" afterDir="false" /> <change beforePath="$PROJECT_DIR$/crawler/taobao/pipelines.py" beforeDir="false" afterPath="$PROJECT_DIR$/crawler/taobao/pipelines.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/crawler/taobao/settings.py" beforeDir="false" afterPath="$PROJECT_DIR$/crawler/taobao/settings.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/crawler/taobao/spiders/comment.py" beforeDir="false" afterPath="$PROJECT_DIR$/crawler/taobao/spiders/comment.py" afterDir="false" />
</list> </list>
<option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" /> <option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
<option name="SHOW_DIALOG" value="false" /> <option name="SHOW_DIALOG" value="false" />
@ -23,8 +25,17 @@
<file pinned="false" current-in-tab="false"> <file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/crawler/taobao/items.py"> <entry file="file://$PROJECT_DIR$/crawler/taobao/items.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="388"> <state relative-caret-position="269">
<caret line="32" column="38" selection-start-line="32" selection-start-column="38" selection-end-line="32" selection-end-column="38" /> <caret line="40" column="28" lean-forward="true" selection-start-line="40" selection-start-column="28" selection-end-line="40" selection-end-column="28" />
</state>
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/scrapy/item.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="390">
<caret line="68" column="21" lean-forward="true" selection-start-line="68" selection-start-column="21" selection-end-line="68" selection-end-column="21" />
</state> </state>
</provider> </provider>
</entry> </entry>
@ -32,8 +43,8 @@
<file pinned="false" current-in-tab="false"> <file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/crawler/taobao/pipelines.py"> <entry file="file://$PROJECT_DIR$/crawler/taobao/pipelines.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="388"> <state relative-caret-position="282">
<caret line="37" column="19" selection-start-line="37" selection-start-column="19" selection-end-line="37" selection-end-column="19" /> <caret line="60" column="21" selection-start-line="60" selection-start-column="6" selection-end-line="60" selection-end-column="21" />
<folding> <folding>
<element signature="e#193#204#0" expanded="true" /> <element signature="e#193#204#0" expanded="true" />
</folding> </folding>
@ -41,11 +52,11 @@
</provider> </provider>
</entry> </entry>
</file> </file>
<file pinned="false" current-in-tab="true"> <file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/comment.py"> <entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/comment.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="247"> <state relative-caret-position="-271">
<caret line="81" column="34" selection-start-line="81" selection-start-column="34" selection-end-line="81" selection-end-column="34" /> <caret line="36" column="53" lean-forward="true" selection-start-line="36" selection-start-column="53" selection-end-line="36" selection-end-column="53" />
<folding> <folding>
<element signature="e#24#35#0" expanded="true" /> <element signature="e#24#35#0" expanded="true" />
</folding> </folding>
@ -53,24 +64,6 @@
</provider> </provider>
</entry> </entry>
</file> </file>
<file pinned="false" current-in-tab="false">
<entry file="file:///usr/local/Cellar/python/3.7.0/Frameworks/Python.framework/Versions/3.7/lib/python3.7/subprocess.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="139">
<caret line="965" column="8" selection-start-line="965" selection-start-column="8" selection-end-line="965" selection-end-column="8" />
</state>
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/httpbin.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="45">
<caret line="3" column="13" selection-start-line="3" selection-start-column="5" selection-end-line="3" selection-end-column="13" />
</state>
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="false"> <file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/crawler/taobao/middlewares.py"> <entry file="file://$PROJECT_DIR$/crawler/taobao/middlewares.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
@ -83,32 +76,11 @@
</provider> </provider>
</entry> </entry>
</file> </file>
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/dangdang.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="315">
<caret line="21" column="22" selection-start-line="21" selection-start-column="22" selection-end-line="21" selection-end-column="22" />
<folding>
<element signature="e#119#129#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="false">
<entry file="file://$USER_HOME$/Library/Caches/PyCharm2018.3/python_stubs/-1583084946/builtins.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="36915">
<caret line="2461" column="42" selection-start-line="2461" selection-start-column="34" selection-end-line="2461" selection-end-column="42" />
</state>
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="false"> <file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/category.py"> <entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/category.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="1245"> <state relative-caret-position="251">
<caret line="91" column="45" selection-start-line="87" selection-start-column="4" selection-end-line="91" selection-end-column="45" /> <caret line="47" column="59" selection-start-line="47" selection-start-column="54" selection-end-line="47" selection-end-column="59" />
<folding> <folding>
<element signature="e#24#35#0" expanded="true" /> <element signature="e#24#35#0" expanded="true" />
</folding> </folding>
@ -117,10 +89,40 @@
</entry> </entry>
</file> </file>
<file pinned="false" current-in-tab="false"> <file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/crawler/taobao/merge.py"> <entry file="file://$PROJECT_DIR$/crawler/taobao/settings.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="345"> <state relative-caret-position="196">
<caret line="23" column="30" selection-start-line="23" selection-start-column="30" selection-end-line="23" selection-end-column="30" /> <caret line="88" selection-start-line="88" selection-end-line="88" />
</state>
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="true">
<entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/jd.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="136">
<caret line="60" column="21" lean-forward="true" selection-start-line="60" selection-start-column="21" selection-end-line="60" selection-end-column="21" />
<folding>
<element signature="e#14#27#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/scrapy/selector/unified.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="492">
<caret line="84" column="24" selection-start-line="84" selection-start-column="8" selection-end-line="84" selection-end-column="24" />
</state>
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/parsel/selector.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="245">
<caret line="238" column="39" lean-forward="true" selection-start-line="238" selection-start-column="39" selection-end-line="238" selection-end-column="39" />
</state> </state>
</provider> </provider>
</entry> </entry>
@ -140,6 +142,11 @@
<find>美式风</find> <find>美式风</find>
<find></find> <find></find>
<find>皮床</find> <find>皮床</find>
<find>SPLASH</find>
<find>DOWNLOADER_MIDDLEWARES</find>
<find>DUPEFILTER_CLASS</find>
<find>contains</find>
<find>scrapy.selector.unified.Selector</find>
</findStrings> </findStrings>
</component> </component>
<component name="Git.Settings"> <component name="Git.Settings">
@ -153,12 +160,14 @@
<option value="$PROJECT_DIR$/crawler/taobao/spiders/dangdang.py" /> <option value="$PROJECT_DIR$/crawler/taobao/spiders/dangdang.py" />
<option value="$PROJECT_DIR$/crawler/taobao/spiders/run.py" /> <option value="$PROJECT_DIR$/crawler/taobao/spiders/run.py" />
<option value="$PROJECT_DIR$/env/lib/python3.7/site-packages/scrapy/spiders/__init__.py" /> <option value="$PROJECT_DIR$/env/lib/python3.7/site-packages/scrapy/spiders/__init__.py" />
<option value="$PROJECT_DIR$/crawler/taobao/spiders/category.py" />
<option value="$PROJECT_DIR$/crawler/taobao/items.py" />
<option value="$PROJECT_DIR$/crawler/taobao/pipelines.py" />
<option value="$PROJECT_DIR$/crawler/datas/kind.json" /> <option value="$PROJECT_DIR$/crawler/datas/kind.json" />
<option value="$PROJECT_DIR$/crawler/taobao/merge.py" /> <option value="$PROJECT_DIR$/crawler/taobao/merge.py" />
<option value="$PROJECT_DIR$/crawler/taobao/spiders/comment.py" /> <option value="$PROJECT_DIR$/crawler/taobao/spiders/comment.py" />
<option value="$PROJECT_DIR$/crawler/taobao/spiders/category.py" />
<option value="$PROJECT_DIR$/crawler/taobao/settings.py" />
<option value="$PROJECT_DIR$/crawler/taobao/items.py" />
<option value="$PROJECT_DIR$/crawler/taobao/pipelines.py" />
<option value="$PROJECT_DIR$/crawler/taobao/spiders/jd.py" />
</list> </list>
</option> </option>
</component> </component>
@ -185,90 +194,6 @@
<item name="chapter1" type="462c0819:PsiDirectoryNode" /> <item name="chapter1" type="462c0819:PsiDirectoryNode" />
<item name="crawler" type="462c0819:PsiDirectoryNode" /> <item name="crawler" type="462c0819:PsiDirectoryNode" />
</path> </path>
<path>
<item name="chapter1" type="b2602c69:ProjectViewProjectNode" />
<item name="chapter1" type="462c0819:PsiDirectoryNode" />
<item name="crawler" type="462c0819:PsiDirectoryNode" />
<item name="datas" type="462c0819:PsiDirectoryNode" />
</path>
<path>
<item name="chapter1" type="b2602c69:ProjectViewProjectNode" />
<item name="chapter1" type="462c0819:PsiDirectoryNode" />
<item name="env" type="462c0819:PsiDirectoryNode" />
</path>
<path>
<item name="chapter1" type="b2602c69:ProjectViewProjectNode" />
<item name="chapter1" type="462c0819:PsiDirectoryNode" />
<item name="env" type="462c0819:PsiDirectoryNode" />
<item name="lib" type="462c0819:PsiDirectoryNode" />
</path>
<path>
<item name="chapter1" type="b2602c69:ProjectViewProjectNode" />
<item name="chapter1" type="462c0819:PsiDirectoryNode" />
<item name="env" type="462c0819:PsiDirectoryNode" />
<item name="lib" type="462c0819:PsiDirectoryNode" />
<item name="python3.7" type="462c0819:PsiDirectoryNode" />
</path>
<path>
<item name="chapter1" type="b2602c69:ProjectViewProjectNode" />
<item name="chapter1" type="462c0819:PsiDirectoryNode" />
<item name="env" type="462c0819:PsiDirectoryNode" />
<item name="lib" type="462c0819:PsiDirectoryNode" />
<item name="python3.7" type="462c0819:PsiDirectoryNode" />
<item name="site-packages" type="462c0819:PsiDirectoryNode" />
</path>
<path>
<item name="chapter1" type="b2602c69:ProjectViewProjectNode" />
<item name="chapter1" type="462c0819:PsiDirectoryNode" />
<item name="env" type="462c0819:PsiDirectoryNode" />
<item name="lib" type="462c0819:PsiDirectoryNode" />
<item name="python3.7" type="462c0819:PsiDirectoryNode" />
<item name="site-packages" type="462c0819:PsiDirectoryNode" />
<item name="lxml" type="462c0819:PsiDirectoryNode" />
</path>
<path>
<item name="chapter1" type="b2602c69:ProjectViewProjectNode" />
<item name="External Libraries" type="cb654da1:ExternalLibrariesNode" />
</path>
<path>
<item name="chapter1" type="b2602c69:ProjectViewProjectNode" />
<item name="External Libraries" type="cb654da1:ExternalLibrariesNode" />
<item name="&lt; Python 3.7 (chapter1) &gt;" type="70bed36:NamedLibraryElementNode" />
</path>
<path>
<item name="chapter1" type="b2602c69:ProjectViewProjectNode" />
<item name="External Libraries" type="cb654da1:ExternalLibrariesNode" />
<item name="&lt; Python 3.7 (chapter1) &gt;" type="70bed36:NamedLibraryElementNode" />
<item name="python3.7" type="462c0819:PsiDirectoryNode" />
</path>
<path>
<item name="chapter1" type="b2602c69:ProjectViewProjectNode" />
<item name="External Libraries" type="cb654da1:ExternalLibrariesNode" />
<item name="&lt; Python 3.7 (chapter1) &gt;" type="70bed36:NamedLibraryElementNode" />
<item name="python3.7" type="462c0819:PsiDirectoryNode" />
<item name="site-packages" type="462c0819:PsiDirectoryNode" />
</path>
<path>
<item name="chapter1" type="b2602c69:ProjectViewProjectNode" />
<item name="External Libraries" type="cb654da1:ExternalLibrariesNode" />
<item name="&lt; Python 3.7 (chapter1) &gt;" type="70bed36:NamedLibraryElementNode" />
<item name="python3.7" type="462c0819:PsiDirectoryNode" />
<item name="site-packages" type="462c0819:PsiDirectoryNode" />
<item name="lxml" type="462c0819:PsiDirectoryNode" />
</path>
<path>
<item name="chapter1" type="b2602c69:ProjectViewProjectNode" />
<item name="External Libraries" type="cb654da1:ExternalLibrariesNode" />
<item name="&lt; Python 3.7 (chapter1) &gt;" type="70bed36:NamedLibraryElementNode" />
<item name="site-packages" type="462c0819:PsiDirectoryNode" />
</path>
<path>
<item name="chapter1" type="b2602c69:ProjectViewProjectNode" />
<item name="External Libraries" type="cb654da1:ExternalLibrariesNode" />
<item name="&lt; Python 3.7 (chapter1) &gt;" type="70bed36:NamedLibraryElementNode" />
<item name="site-packages" type="462c0819:PsiDirectoryNode" />
<item name="lxml" type="462c0819:PsiDirectoryNode" />
</path>
</expand> </expand>
<select /> <select />
</subPane> </subPane>
@ -281,7 +206,7 @@
<property name="last_opened_file_path" value="$PROJECT_DIR$" /> <property name="last_opened_file_path" value="$PROJECT_DIR$" />
<property name="nodejs_interpreter_path.stuck_in_default_project" value="undefined stuck path" /> <property name="nodejs_interpreter_path.stuck_in_default_project" value="undefined stuck path" />
<property name="nodejs_npm_path_reset_for_default_project" value="true" /> <property name="nodejs_npm_path_reset_for_default_project" value="true" />
<property name="settings.editor.selected.configurable" value="configurable.group.project" /> <property name="settings.editor.selected.configurable" value="com.jetbrains.python.configuration.PyActiveSdkModuleConfigurable" />
</component> </component>
<component name="RecentsManager"> <component name="RecentsManager">
<key name="MoveFile.RECENT_KEYS"> <key name="MoveFile.RECENT_KEYS">
@ -412,7 +337,7 @@
<component name="ToolWindowManager"> <component name="ToolWindowManager">
<frame x="0" y="23" width="1280" height="777" extended-state="0" /> <frame x="0" y="23" width="1280" height="777" extended-state="0" />
<layout> <layout>
<window_info active="true" content_ui="combo" id="Project" order="0" visible="true" weight="0.25767368" /> <window_info content_ui="combo" id="Project" order="0" visible="true" weight="0.25767368" />
<window_info id="Structure" order="1" side_tool="true" weight="0.25" /> <window_info id="Structure" order="1" side_tool="true" weight="0.25" />
<window_info id="Favorites" order="2" side_tool="true" /> <window_info id="Favorites" order="2" side_tool="true" />
<window_info anchor="bottom" id="Message" order="0" /> <window_info anchor="bottom" id="Message" order="0" />
@ -423,7 +348,7 @@
<window_info anchor="bottom" id="Inspection" order="5" weight="0.4" /> <window_info anchor="bottom" id="Inspection" order="5" weight="0.4" />
<window_info anchor="bottom" id="TODO" order="6" weight="0.329927" /> <window_info anchor="bottom" id="TODO" order="6" weight="0.329927" />
<window_info anchor="bottom" id="Version Control" order="7" /> <window_info anchor="bottom" id="Version Control" order="7" />
<window_info anchor="bottom" id="Terminal" order="8" visible="true" weight="0.31678832" /> <window_info active="true" anchor="bottom" id="Terminal" order="8" visible="true" weight="0.33430657" />
<window_info anchor="bottom" id="Event Log" order="9" side_tool="true" /> <window_info anchor="bottom" id="Event Log" order="9" side_tool="true" />
<window_info anchor="bottom" id="Python Console" order="10" weight="0.329927" /> <window_info anchor="bottom" id="Python Console" order="10" weight="0.329927" />
<window_info anchor="right" id="Commander" internal_type="SLIDING" order="0" type="SLIDING" weight="0.4" /> <window_info anchor="right" id="Commander" internal_type="SLIDING" order="0" type="SLIDING" weight="0.4" />
@ -455,11 +380,6 @@
</state> </state>
</provider> </provider>
</entry> </entry>
<entry file="file://$PROJECT_DIR$/crawler/taobao/settings.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="-1210" />
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/crawler/taobao/__init__.py"> <entry file="file://$PROJECT_DIR$/crawler/taobao/__init__.py">
<provider selected="true" editor-type-id="text-editor" /> <provider selected="true" editor-type-id="text-editor" />
</entry> </entry>
@ -502,13 +422,6 @@
</state> </state>
</provider> </provider>
</entry> </entry>
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/parsel/selector.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="900">
<caret line="66" column="8" selection-start-line="66" selection-start-column="8" selection-end-line="66" selection-end-column="8" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/scrapy/http/request/__init__.py"> <entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/scrapy/http/request/__init__.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="118"> <state relative-caret-position="118">
@ -554,16 +467,6 @@
</state> </state>
</provider> </provider>
</entry> </entry>
<entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/category.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="1245">
<caret line="91" column="45" selection-start-line="87" selection-start-column="4" selection-end-line="91" selection-end-column="45" />
<folding>
<element signature="e#24#35#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/crawler/taobao/merge.py"> <entry file="file://$PROJECT_DIR$/crawler/taobao/merge.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="345"> <state relative-caret-position="345">
@ -574,33 +477,30 @@
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/jupyter.py"> <entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/jupyter.py">
<provider selected="true" editor-type-id="text-editor" /> <provider selected="true" editor-type-id="text-editor" />
</entry> </entry>
<entry file="file://$PROJECT_DIR$/crawler/taobao/items.py"> <entry file="file:///usr/local/Cellar/python/3.7.0/Frameworks/Python.framework/Versions/3.7/lib/python3.7/subprocess.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="388"> <state relative-caret-position="139">
<caret line="32" column="38" selection-start-line="32" selection-start-column="38" selection-end-line="32" selection-end-column="38" /> <caret line="965" column="8" selection-start-line="965" selection-start-column="8" selection-end-line="965" selection-end-column="8" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/crawler/taobao/pipelines.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="388">
<caret line="37" column="19" selection-start-line="37" selection-start-column="19" selection-end-line="37" selection-end-column="19" />
<folding>
<element signature="e#193#204#0" expanded="true" />
</folding>
</state> </state>
</provider> </provider>
</entry> </entry>
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/lxml/__init__.py"> <entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/lxml/__init__.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="150"> <state relative-caret-position="300">
<caret line="10" column="33" selection-start-line="10" selection-start-column="33" selection-end-line="10" selection-end-column="33" /> <caret line="20" lean-forward="true" selection-start-line="20" selection-end-line="20" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/httpbin.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="45">
<caret line="3" column="13" selection-start-line="3" selection-start-column="5" selection-end-line="3" selection-end-column="13" />
</state> </state>
</provider> </provider>
</entry> </entry>
<entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/dangdang.py"> <entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/dangdang.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="315"> <state relative-caret-position="-318">
<caret line="21" column="22" selection-start-line="21" selection-start-column="22" selection-end-line="21" selection-end-column="22" /> <caret line="21" column="22" selection-start-line="21" selection-start-column="22" selection-end-line="21" selection-end-column="22" />
<folding> <folding>
<element signature="e#119#129#0" expanded="true" /> <element signature="e#119#129#0" expanded="true" />
@ -608,6 +508,30 @@
</state> </state>
</provider> </provider>
</entry> </entry>
<entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/category.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="251">
<caret line="47" column="59" selection-start-line="47" selection-start-column="54" selection-end-line="47" selection-end-column="59" />
<folding>
<element signature="e#24#35#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/parsel/selector.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="245">
<caret line="238" column="39" lean-forward="true" selection-start-line="238" selection-start-column="39" selection-end-line="238" selection-end-column="39" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/scrapy/selector/unified.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="492">
<caret line="84" column="24" selection-start-line="84" selection-start-column="8" selection-end-line="84" selection-end-column="24" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/crawler/taobao/middlewares.py"> <entry file="file://$PROJECT_DIR$/crawler/taobao/middlewares.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="216"> <state relative-caret-position="216">
@ -618,30 +542,57 @@
</state> </state>
</provider> </provider>
</entry> </entry>
<entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/httpbin.py"> <entry file="file://$PROJECT_DIR$/crawler/taobao/items.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="45"> <state relative-caret-position="269">
<caret line="3" column="13" selection-start-line="3" selection-start-column="5" selection-end-line="3" selection-end-column="13" /> <caret line="40" column="28" lean-forward="true" selection-start-line="40" selection-start-column="28" selection-end-line="40" selection-end-column="28" />
</state> </state>
</provider> </provider>
</entry> </entry>
<entry file="file:///usr/local/Cellar/python/3.7.0/Frameworks/Python.framework/Versions/3.7/lib/python3.7/subprocess.py"> <entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/scrapy/item.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="139"> <state relative-caret-position="390">
<caret line="965" column="8" selection-start-line="965" selection-start-column="8" selection-end-line="965" selection-end-column="8" /> <caret line="68" column="21" lean-forward="true" selection-start-line="68" selection-start-column="21" selection-end-line="68" selection-end-column="21" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/crawler/taobao/pipelines.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="282">
<caret line="60" column="21" selection-start-line="60" selection-start-column="6" selection-end-line="60" selection-end-column="21" />
<folding>
<element signature="e#193#204#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/crawler/taobao/settings.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="196">
<caret line="88" selection-start-line="88" selection-end-line="88" />
</state> </state>
</provider> </provider>
</entry> </entry>
<entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/comment.py"> <entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/comment.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="247"> <state relative-caret-position="-271">
<caret line="81" column="34" selection-start-line="81" selection-start-column="34" selection-end-line="81" selection-end-column="34" /> <caret line="36" column="53" lean-forward="true" selection-start-line="36" selection-start-column="53" selection-end-line="36" selection-end-column="53" />
<folding> <folding>
<element signature="e#24#35#0" expanded="true" /> <element signature="e#24#35#0" expanded="true" />
</folding> </folding>
</state> </state>
</provider> </provider>
</entry> </entry>
<entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/jd.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="136">
<caret line="60" column="21" lean-forward="true" selection-start-line="60" selection-start-column="21" selection-end-line="60" selection-end-column="21" />
<folding>
<element signature="e#14#27#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
</component> </component>
<component name="masterDetails"> <component name="masterDetails">
<states> <states>

View File

@ -36,3 +36,14 @@ class CommentItem(scrapy.Item):
total_detest_count = scrapy.Field() total_detest_count = scrapy.Field()
page_count = scrapy.Field() page_count = scrapy.Field()
comments = scrapy.Field() comments = scrapy.Field()
class JDProductItem(scrapy.Item):
    # Container for one product entry scraped from a JD listing page.

    # Product id; presumably filled in after scraping from the digits in
    # ``url`` rather than by the spider itself — verify against the pipeline.
    id = scrapy.Field()
    # Product title text.
    name = scrapy.Field()
    # Listed price text.
    price = scrapy.Field()
    # Link to the product detail page.
    url = scrapy.Field()
    # Product image address.
    img = scrapy.Field()
    # Name of the shop selling the product.
    shop = scrapy.Field()
    # Displayed number of comments.
    comment_num = scrapy.Field()

View File

@ -6,6 +6,7 @@
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json import json
import pdb import pdb
import re
from scrapy.pipelines.images import ImagesPipeline from scrapy.pipelines.images import ImagesPipeline
@ -38,3 +39,27 @@ class JsonWithEncodingCommentsPipeline(object):
return item return item
def spider_closed(self, spider): def spider_closed(self, spider):
self.file.close() self.file.close()
class JDCleanDataPipeline(object):
    """Clean JD product items before they are passed further down the pipeline.

    The product name is stripped of surrounding whitespace and embedded
    newlines and split into whitespace-separated tokens, and the numeric
    product id is extracted from the item URL.
    """

    def process_item(self, item, spider):
        """Normalise ``item`` in place and return it.

        Args:
            item: Mapping-like item carrying ``name`` and ``url`` values.
            spider: The spider that produced the item (unused).

        Returns:
            The same item with ``name`` tokenised (when it was a string)
            and ``id`` set to the digits found in the URL.

        Raises:
            scrapy.exceptions.DropItem: If no string id is available for
                the item, i.e. the URL is missing or does not end in
                ``/<digits>.html`` and no id was set beforehand.
        """
        name = item.get("name")
        if isinstance(name, str):
            # NOTE(review): the result is a list of tokens, not a string —
            # kept as in the original; confirm this is what consumers expect.
            item["name"] = name.strip().replace('\n', '').split()

        # A missing URL must not crash the pipeline: only match real strings.
        url = item.get("url")
        m = re.match(r'.+\/(\d+)\.html', url) if isinstance(url, str) else None
        if m:
            item["id"] = m.group(1)

        # ``.get`` is used because an unset field raises KeyError on access.
        if not isinstance(item.get("id"), str):
            # Returning None does not discard an item in Scrapy; raising
            # DropItem is the documented way to stop further processing.
            from scrapy.exceptions import DropItem
            raise DropItem("no product id could be derived from url: {0!r}".format(url))
        return item
class JDHbasePipeline(object):
    """Placeholder for a pipeline that stores the data in HBase.

    No behaviour is implemented yet; the class defines no methods.
    """

View File

@ -25,6 +25,9 @@ ROBOTSTXT_OBEY = False
LOG_LEVEL='DEBUG' LOG_LEVEL='DEBUG'
# Address of the Splash HTTP endpoint; points at a local instance on port 8050.
SPLASH_URL = 'http://127.0.0.1:8050'
# Configure maximum concurrent requests performed by Scrapy (default: 16) # Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32 #CONCURRENT_REQUESTS = 32
@ -61,6 +64,16 @@ COOKIES_ENABLED = True
# 'taobao.middlewares.ZhihuDownloaderMiddleware': 543, # 'taobao.middlewares.ZhihuDownloaderMiddleware': 543,
#} #}
# Downloader middlewares enabled for Splash-rendered requests; the numbers
# are the middleware order values (presumably the ones recommended by the
# scrapy-splash README — confirm when upgrading).
DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}
# Splash's own duplicate-request filter.
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
# Enable or disable extensions # Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html # See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = { #EXTENSIONS = {

View File

@ -1,17 +1,14 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
import json import json
import pdb
import subprocess import subprocess
import pandas as pd
import scrapy import scrapy
from scrapy import Request
from scrapy.http import Response
from lxml import etree from lxml import etree
from scrapy import Request
from taobao.items import CommentItem from taobao.items import CommentItem
import pandas as pd
""" """
商品评论数据抓取 商品评论数据抓取
""" """

View File

@ -0,0 +1,72 @@
#coding=utf-8
import scrapy
from scrapy_splash import SplashRequest
import pdb
from taobao.items import JDProductItem
import re
class JdSpider(scrapy.Spider):
    """Crawl JD product listing pages rendered through Splash.

    Starts from the listing in ``start_urls``, yields one ``JDProductItem``
    per product entry and follows the "next page" link until none is left.
    """

    name = 'jd'
    allowed_domains = ['jd.com']
    start_urls = [
        "https://list.jd.com/list.html?cat=9987,653,655"
    ]

    custom_settings = {
        'LOG_LEVEL': "WARN",
        'ITEM_PIPELINES': {
            'taobao.pipelines.JDCleanDataPipeline': 300,
            'taobao.pipelines.JDHbasePipeline': 400,
        }
    }

    def start_requests(self):
        """Issue a Splash request for every start URL, handled by ``parse``."""
        for url in self.start_urls:
            yield SplashRequest(url,
                                self.parse,
                                args={})

    def parse(self, response):
        """Extract the products of one listing page and follow pagination.

        Args:
            response: The rendered listing page.

        Yields:
            ``JDProductItem`` objects for every list entry that has a
            product link, then a ``SplashRequest`` for the next page when
            a "next" link is present.
        """
        items = response.xpath('//div[@id="plist"]//li[@class="gl-item"]')
        for item in items:
            name = item.xpath('.//div[contains(@class, "p-name")]/a/em/text()').extract_first()
            price = item.xpath('.//div[contains(@class, "p-price")]/strong/i/text()').extract_first()
            url = item.xpath('.//div[contains(@class, "p-name")]/a/@href').extract_first()
            if url is None:
                # 'https:' + None used to raise TypeError here, aborting the
                # whole page (remaining items and the next-page request).
                self.logger.warning("Skipping list entry without a product link on %s", response.url)
                continue

            # Prefer the lazy-load attribute and fall back to the plain src.
            img = item.xpath('.//div[@class="p-img"]/a/img/@data-lazy-img').extract_first()
            if not img:
                img = item.xpath('.//div[@class="p-img"]/a/img/@src').extract_first()

            comment_num = item.xpath('.//div[@class="p-commit"]/strong/a/text()').extract_first()
            shop = item.xpath('.//div[@class="p-shop"]/span/a/@title').extract_first()

            # Console trace of the scraped values, kept from the original.
            print(name)
            print(price)
            print('https:' + url)
            print(comment_num)
            print(shop)

            yield JDProductItem(
                name=name,
                price=price,
                url=url,
                img=img,
                comment_num=comment_num,
                shop=shop,
            )

        # Follow the next page, if any.
        next_page = response.xpath('//a[@class="pn-next"]/@href').extract_first()
        if not next_page:
            print("抓取结束")
            return

        page = 1
        # search() with an anchored parameter name reads the full number even
        # when "page=N" is the last thing in the href; the previous pattern
        # needed trailing characters and could cut off the final digit.
        m = re.search(r'[?&]page=(\d+)', next_page)
        if m:
            page = m.group(1)
        print("开始抓取:{0}".format(page))
        yield SplashRequest(
            url='https://list.jd.com' + next_page,
            callback=self.parse,
        )