加入京东抓取

This commit is contained in:
guange 2019-01-14 16:28:26 +08:00
parent ee0aea16aa
commit 63c091e6e1
6 changed files with 264 additions and 195 deletions

View File

@ -1,11 +1,13 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ChangeListManager">
<list default="true" id="a5fbb387-9969-4874-8a7f-c9dd40d5225d" name="Default Changelist" comment="">
<change afterPath="$PROJECT_DIR$/crawler/taobao/merge.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/crawler/datas/kind.json" beforeDir="false" />
<list default="true" id="a5fbb387-9969-4874-8a7f-c9dd40d5225d" name="Default Changelist" comment="增加id字段">
<change afterPath="$PROJECT_DIR$/crawler/taobao/spiders/jd.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" />
<change beforePath="$PROJECT_DIR$/crawler/taobao/items.py" beforeDir="false" afterPath="$PROJECT_DIR$/crawler/taobao/items.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/crawler/taobao/pipelines.py" beforeDir="false" afterPath="$PROJECT_DIR$/crawler/taobao/pipelines.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/crawler/taobao/settings.py" beforeDir="false" afterPath="$PROJECT_DIR$/crawler/taobao/settings.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/crawler/taobao/spiders/comment.py" beforeDir="false" afterPath="$PROJECT_DIR$/crawler/taobao/spiders/comment.py" afterDir="false" />
</list>
<option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
<option name="SHOW_DIALOG" value="false" />
@ -23,8 +25,17 @@
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/crawler/taobao/items.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="388">
<caret line="32" column="38" selection-start-line="32" selection-start-column="38" selection-end-line="32" selection-end-column="38" />
<state relative-caret-position="269">
<caret line="40" column="28" lean-forward="true" selection-start-line="40" selection-start-column="28" selection-end-line="40" selection-end-column="28" />
</state>
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/scrapy/item.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="390">
<caret line="68" column="21" lean-forward="true" selection-start-line="68" selection-start-column="21" selection-end-line="68" selection-end-column="21" />
</state>
</provider>
</entry>
@ -32,8 +43,8 @@
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/crawler/taobao/pipelines.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="388">
<caret line="37" column="19" selection-start-line="37" selection-start-column="19" selection-end-line="37" selection-end-column="19" />
<state relative-caret-position="282">
<caret line="60" column="21" selection-start-line="60" selection-start-column="6" selection-end-line="60" selection-end-column="21" />
<folding>
<element signature="e#193#204#0" expanded="true" />
</folding>
@ -41,11 +52,11 @@
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="true">
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/comment.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="247">
<caret line="81" column="34" selection-start-line="81" selection-start-column="34" selection-end-line="81" selection-end-column="34" />
<state relative-caret-position="-271">
<caret line="36" column="53" lean-forward="true" selection-start-line="36" selection-start-column="53" selection-end-line="36" selection-end-column="53" />
<folding>
<element signature="e#24#35#0" expanded="true" />
</folding>
@ -53,24 +64,6 @@
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="false">
<entry file="file:///usr/local/Cellar/python/3.7.0/Frameworks/Python.framework/Versions/3.7/lib/python3.7/subprocess.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="139">
<caret line="965" column="8" selection-start-line="965" selection-start-column="8" selection-end-line="965" selection-end-column="8" />
</state>
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/httpbin.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="45">
<caret line="3" column="13" selection-start-line="3" selection-start-column="5" selection-end-line="3" selection-end-column="13" />
</state>
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/crawler/taobao/middlewares.py">
<provider selected="true" editor-type-id="text-editor">
@ -83,32 +76,11 @@
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/dangdang.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="315">
<caret line="21" column="22" selection-start-line="21" selection-start-column="22" selection-end-line="21" selection-end-column="22" />
<folding>
<element signature="e#119#129#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="false">
<entry file="file://$USER_HOME$/Library/Caches/PyCharm2018.3/python_stubs/-1583084946/builtins.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="36915">
<caret line="2461" column="42" selection-start-line="2461" selection-start-column="34" selection-end-line="2461" selection-end-column="42" />
</state>
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/category.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="1245">
<caret line="91" column="45" selection-start-line="87" selection-start-column="4" selection-end-line="91" selection-end-column="45" />
<state relative-caret-position="251">
<caret line="47" column="59" selection-start-line="47" selection-start-column="54" selection-end-line="47" selection-end-column="59" />
<folding>
<element signature="e#24#35#0" expanded="true" />
</folding>
@ -117,10 +89,40 @@
</entry>
</file>
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/crawler/taobao/merge.py">
<entry file="file://$PROJECT_DIR$/crawler/taobao/settings.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="345">
<caret line="23" column="30" selection-start-line="23" selection-start-column="30" selection-end-line="23" selection-end-column="30" />
<state relative-caret-position="196">
<caret line="88" selection-start-line="88" selection-end-line="88" />
</state>
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="true">
<entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/jd.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="136">
<caret line="60" column="21" lean-forward="true" selection-start-line="60" selection-start-column="21" selection-end-line="60" selection-end-column="21" />
<folding>
<element signature="e#14#27#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/scrapy/selector/unified.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="492">
<caret line="84" column="24" selection-start-line="84" selection-start-column="8" selection-end-line="84" selection-end-column="24" />
</state>
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/parsel/selector.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="245">
<caret line="238" column="39" lean-forward="true" selection-start-line="238" selection-start-column="39" selection-end-line="238" selection-end-column="39" />
</state>
</provider>
</entry>
@ -140,6 +142,11 @@
<find>美式风</find>
<find></find>
<find>皮床</find>
<find>SPLASH</find>
<find>DOWNLOADER_MIDDLEWARES</find>
<find>DUPEFILTER_CLASS</find>
<find>contains</find>
<find>scrapy.selector.unified.Selector</find>
</findStrings>
</component>
<component name="Git.Settings">
@ -153,12 +160,14 @@
<option value="$PROJECT_DIR$/crawler/taobao/spiders/dangdang.py" />
<option value="$PROJECT_DIR$/crawler/taobao/spiders/run.py" />
<option value="$PROJECT_DIR$/env/lib/python3.7/site-packages/scrapy/spiders/__init__.py" />
<option value="$PROJECT_DIR$/crawler/taobao/spiders/category.py" />
<option value="$PROJECT_DIR$/crawler/taobao/items.py" />
<option value="$PROJECT_DIR$/crawler/taobao/pipelines.py" />
<option value="$PROJECT_DIR$/crawler/datas/kind.json" />
<option value="$PROJECT_DIR$/crawler/taobao/merge.py" />
<option value="$PROJECT_DIR$/crawler/taobao/spiders/comment.py" />
<option value="$PROJECT_DIR$/crawler/taobao/spiders/category.py" />
<option value="$PROJECT_DIR$/crawler/taobao/settings.py" />
<option value="$PROJECT_DIR$/crawler/taobao/items.py" />
<option value="$PROJECT_DIR$/crawler/taobao/pipelines.py" />
<option value="$PROJECT_DIR$/crawler/taobao/spiders/jd.py" />
</list>
</option>
</component>
@ -185,90 +194,6 @@
<item name="chapter1" type="462c0819:PsiDirectoryNode" />
<item name="crawler" type="462c0819:PsiDirectoryNode" />
</path>
<path>
<item name="chapter1" type="b2602c69:ProjectViewProjectNode" />
<item name="chapter1" type="462c0819:PsiDirectoryNode" />
<item name="crawler" type="462c0819:PsiDirectoryNode" />
<item name="datas" type="462c0819:PsiDirectoryNode" />
</path>
<path>
<item name="chapter1" type="b2602c69:ProjectViewProjectNode" />
<item name="chapter1" type="462c0819:PsiDirectoryNode" />
<item name="env" type="462c0819:PsiDirectoryNode" />
</path>
<path>
<item name="chapter1" type="b2602c69:ProjectViewProjectNode" />
<item name="chapter1" type="462c0819:PsiDirectoryNode" />
<item name="env" type="462c0819:PsiDirectoryNode" />
<item name="lib" type="462c0819:PsiDirectoryNode" />
</path>
<path>
<item name="chapter1" type="b2602c69:ProjectViewProjectNode" />
<item name="chapter1" type="462c0819:PsiDirectoryNode" />
<item name="env" type="462c0819:PsiDirectoryNode" />
<item name="lib" type="462c0819:PsiDirectoryNode" />
<item name="python3.7" type="462c0819:PsiDirectoryNode" />
</path>
<path>
<item name="chapter1" type="b2602c69:ProjectViewProjectNode" />
<item name="chapter1" type="462c0819:PsiDirectoryNode" />
<item name="env" type="462c0819:PsiDirectoryNode" />
<item name="lib" type="462c0819:PsiDirectoryNode" />
<item name="python3.7" type="462c0819:PsiDirectoryNode" />
<item name="site-packages" type="462c0819:PsiDirectoryNode" />
</path>
<path>
<item name="chapter1" type="b2602c69:ProjectViewProjectNode" />
<item name="chapter1" type="462c0819:PsiDirectoryNode" />
<item name="env" type="462c0819:PsiDirectoryNode" />
<item name="lib" type="462c0819:PsiDirectoryNode" />
<item name="python3.7" type="462c0819:PsiDirectoryNode" />
<item name="site-packages" type="462c0819:PsiDirectoryNode" />
<item name="lxml" type="462c0819:PsiDirectoryNode" />
</path>
<path>
<item name="chapter1" type="b2602c69:ProjectViewProjectNode" />
<item name="External Libraries" type="cb654da1:ExternalLibrariesNode" />
</path>
<path>
<item name="chapter1" type="b2602c69:ProjectViewProjectNode" />
<item name="External Libraries" type="cb654da1:ExternalLibrariesNode" />
<item name="&lt; Python 3.7 (chapter1) &gt;" type="70bed36:NamedLibraryElementNode" />
</path>
<path>
<item name="chapter1" type="b2602c69:ProjectViewProjectNode" />
<item name="External Libraries" type="cb654da1:ExternalLibrariesNode" />
<item name="&lt; Python 3.7 (chapter1) &gt;" type="70bed36:NamedLibraryElementNode" />
<item name="python3.7" type="462c0819:PsiDirectoryNode" />
</path>
<path>
<item name="chapter1" type="b2602c69:ProjectViewProjectNode" />
<item name="External Libraries" type="cb654da1:ExternalLibrariesNode" />
<item name="&lt; Python 3.7 (chapter1) &gt;" type="70bed36:NamedLibraryElementNode" />
<item name="python3.7" type="462c0819:PsiDirectoryNode" />
<item name="site-packages" type="462c0819:PsiDirectoryNode" />
</path>
<path>
<item name="chapter1" type="b2602c69:ProjectViewProjectNode" />
<item name="External Libraries" type="cb654da1:ExternalLibrariesNode" />
<item name="&lt; Python 3.7 (chapter1) &gt;" type="70bed36:NamedLibraryElementNode" />
<item name="python3.7" type="462c0819:PsiDirectoryNode" />
<item name="site-packages" type="462c0819:PsiDirectoryNode" />
<item name="lxml" type="462c0819:PsiDirectoryNode" />
</path>
<path>
<item name="chapter1" type="b2602c69:ProjectViewProjectNode" />
<item name="External Libraries" type="cb654da1:ExternalLibrariesNode" />
<item name="&lt; Python 3.7 (chapter1) &gt;" type="70bed36:NamedLibraryElementNode" />
<item name="site-packages" type="462c0819:PsiDirectoryNode" />
</path>
<path>
<item name="chapter1" type="b2602c69:ProjectViewProjectNode" />
<item name="External Libraries" type="cb654da1:ExternalLibrariesNode" />
<item name="&lt; Python 3.7 (chapter1) &gt;" type="70bed36:NamedLibraryElementNode" />
<item name="site-packages" type="462c0819:PsiDirectoryNode" />
<item name="lxml" type="462c0819:PsiDirectoryNode" />
</path>
</expand>
<select />
</subPane>
@ -281,7 +206,7 @@
<property name="last_opened_file_path" value="$PROJECT_DIR$" />
<property name="nodejs_interpreter_path.stuck_in_default_project" value="undefined stuck path" />
<property name="nodejs_npm_path_reset_for_default_project" value="true" />
<property name="settings.editor.selected.configurable" value="configurable.group.project" />
<property name="settings.editor.selected.configurable" value="com.jetbrains.python.configuration.PyActiveSdkModuleConfigurable" />
</component>
<component name="RecentsManager">
<key name="MoveFile.RECENT_KEYS">
@ -412,7 +337,7 @@
<component name="ToolWindowManager">
<frame x="0" y="23" width="1280" height="777" extended-state="0" />
<layout>
<window_info active="true" content_ui="combo" id="Project" order="0" visible="true" weight="0.25767368" />
<window_info content_ui="combo" id="Project" order="0" visible="true" weight="0.25767368" />
<window_info id="Structure" order="1" side_tool="true" weight="0.25" />
<window_info id="Favorites" order="2" side_tool="true" />
<window_info anchor="bottom" id="Message" order="0" />
@ -423,7 +348,7 @@
<window_info anchor="bottom" id="Inspection" order="5" weight="0.4" />
<window_info anchor="bottom" id="TODO" order="6" weight="0.329927" />
<window_info anchor="bottom" id="Version Control" order="7" />
<window_info anchor="bottom" id="Terminal" order="8" visible="true" weight="0.31678832" />
<window_info active="true" anchor="bottom" id="Terminal" order="8" visible="true" weight="0.33430657" />
<window_info anchor="bottom" id="Event Log" order="9" side_tool="true" />
<window_info anchor="bottom" id="Python Console" order="10" weight="0.329927" />
<window_info anchor="right" id="Commander" internal_type="SLIDING" order="0" type="SLIDING" weight="0.4" />
@ -455,11 +380,6 @@
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/crawler/taobao/settings.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="-1210" />
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/crawler/taobao/__init__.py">
<provider selected="true" editor-type-id="text-editor" />
</entry>
@ -502,13 +422,6 @@
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/parsel/selector.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="900">
<caret line="66" column="8" selection-start-line="66" selection-start-column="8" selection-end-line="66" selection-end-column="8" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/scrapy/http/request/__init__.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="118">
@ -554,16 +467,6 @@
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/category.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="1245">
<caret line="91" column="45" selection-start-line="87" selection-start-column="4" selection-end-line="91" selection-end-column="45" />
<folding>
<element signature="e#24#35#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/crawler/taobao/merge.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="345">
@ -574,33 +477,30 @@
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/jupyter.py">
<provider selected="true" editor-type-id="text-editor" />
</entry>
<entry file="file://$PROJECT_DIR$/crawler/taobao/items.py">
<entry file="file:///usr/local/Cellar/python/3.7.0/Frameworks/Python.framework/Versions/3.7/lib/python3.7/subprocess.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="388">
<caret line="32" column="38" selection-start-line="32" selection-start-column="38" selection-end-line="32" selection-end-column="38" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/crawler/taobao/pipelines.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="388">
<caret line="37" column="19" selection-start-line="37" selection-start-column="19" selection-end-line="37" selection-end-column="19" />
<folding>
<element signature="e#193#204#0" expanded="true" />
</folding>
<state relative-caret-position="139">
<caret line="965" column="8" selection-start-line="965" selection-start-column="8" selection-end-line="965" selection-end-column="8" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/lxml/__init__.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="150">
<caret line="10" column="33" selection-start-line="10" selection-start-column="33" selection-end-line="10" selection-end-column="33" />
<state relative-caret-position="300">
<caret line="20" lean-forward="true" selection-start-line="20" selection-end-line="20" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/httpbin.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="45">
<caret line="3" column="13" selection-start-line="3" selection-start-column="5" selection-end-line="3" selection-end-column="13" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/dangdang.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="315">
<state relative-caret-position="-318">
<caret line="21" column="22" selection-start-line="21" selection-start-column="22" selection-end-line="21" selection-end-column="22" />
<folding>
<element signature="e#119#129#0" expanded="true" />
@ -608,6 +508,30 @@
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/category.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="251">
<caret line="47" column="59" selection-start-line="47" selection-start-column="54" selection-end-line="47" selection-end-column="59" />
<folding>
<element signature="e#24#35#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/parsel/selector.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="245">
<caret line="238" column="39" lean-forward="true" selection-start-line="238" selection-start-column="39" selection-end-line="238" selection-end-column="39" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/scrapy/selector/unified.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="492">
<caret line="84" column="24" selection-start-line="84" selection-start-column="8" selection-end-line="84" selection-end-column="24" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/crawler/taobao/middlewares.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="216">
@ -618,30 +542,57 @@
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/httpbin.py">
<entry file="file://$PROJECT_DIR$/crawler/taobao/items.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="45">
<caret line="3" column="13" selection-start-line="3" selection-start-column="5" selection-end-line="3" selection-end-column="13" />
<state relative-caret-position="269">
<caret line="40" column="28" lean-forward="true" selection-start-line="40" selection-start-column="28" selection-end-line="40" selection-end-column="28" />
</state>
</provider>
</entry>
<entry file="file:///usr/local/Cellar/python/3.7.0/Frameworks/Python.framework/Versions/3.7/lib/python3.7/subprocess.py">
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/scrapy/item.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="139">
<caret line="965" column="8" selection-start-line="965" selection-start-column="8" selection-end-line="965" selection-end-column="8" />
<state relative-caret-position="390">
<caret line="68" column="21" lean-forward="true" selection-start-line="68" selection-start-column="21" selection-end-line="68" selection-end-column="21" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/crawler/taobao/pipelines.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="282">
<caret line="60" column="21" selection-start-line="60" selection-start-column="6" selection-end-line="60" selection-end-column="21" />
<folding>
<element signature="e#193#204#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/crawler/taobao/settings.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="196">
<caret line="88" selection-start-line="88" selection-end-line="88" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/comment.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="247">
<caret line="81" column="34" selection-start-line="81" selection-start-column="34" selection-end-line="81" selection-end-column="34" />
<state relative-caret-position="-271">
<caret line="36" column="53" lean-forward="true" selection-start-line="36" selection-start-column="53" selection-end-line="36" selection-end-column="53" />
<folding>
<element signature="e#24#35#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/jd.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="136">
<caret line="60" column="21" lean-forward="true" selection-start-line="60" selection-start-column="21" selection-end-line="60" selection-end-column="21" />
<folding>
<element signature="e#14#27#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
</component>
<component name="masterDetails">
<states>

View File

@ -36,3 +36,14 @@ class CommentItem(scrapy.Item):
total_detest_count = scrapy.Field()
page_count = scrapy.Field()
comments = scrapy.Field()
class JDProductItem(scrapy.Item):
    """Scrapy item holding the fields recorded for one JD product."""

    # Product identifier.
    id = scrapy.Field()
    # Product name.
    name = scrapy.Field()
    # Product price.
    price = scrapy.Field()
    # Link to the product page.
    url = scrapy.Field()
    # Link to the product image.
    img = scrapy.Field()
    # Shop offering the product.
    shop = scrapy.Field()
    # Comment count — presumably kept as the raw text shown on the page; TODO confirm.
    comment_num = scrapy.Field()

View File

@ -6,6 +6,7 @@
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
import pdb
import re
from scrapy.pipelines.images import ImagesPipeline
@ -38,3 +39,27 @@ class JsonWithEncodingCommentsPipeline(object):
return item
def spider_closed(self, spider):
self.file.close()
class JDCleanDataPipeline(object):
    """Clean JD product items.

    The product name is stripped of surrounding whitespace and embedded
    newlines, and the numeric product id is extracted from the product URL.
    Items for which no id can be determined are dropped.
    """

    # Captures the digits that directly precede ".html" in a product URL.
    _ID_PATTERN = re.compile(r'.+\/(\d+)\.html')

    def process_item(self, item, spider):
        """Normalise ``name`` and derive ``id`` from ``url``.

        Args:
            item: Mapping-like item carrying ``name`` and ``url`` entries;
                either may be missing or ``None``.
            spider: The spider that produced the item (unused).

        Returns:
            The same item object, updated in place.

        Raises:
            scrapy.exceptions.DropItem: If the item ends up without a
                string ``id``.
        """
        name = item.get("name")
        if isinstance(name, str):
            # NOTE(review): split() turns the name into a list of
            # whitespace-separated tokens; confirm that consumers expect a
            # list rather than a single string.
            item["name"] = name.strip().replace('\n', '').split()

        # Guard against a missing/None url: re.match would raise TypeError.
        url = item.get("url")
        if isinstance(url, str):
            m = self._ID_PATTERN.match(url)
            if m:
                item["id"] = m.group(1)

        # Use .get(): "id" is never set when the URL did not match, and a
        # plain item["id"] lookup would raise KeyError in that case.
        if not isinstance(item.get("id"), str):
            # Returning None does not discard an item in Scrapy; raising
            # DropItem is the way to stop further pipeline processing.
            from scrapy.exceptions import DropItem
            raise DropItem("missing product id: url=%r" % (url,))
        return item
class JDHbasePipeline(object):
    """
    Store the data in HBase.

    NOTE(review): placeholder — the class body is empty, so no storage
    logic exists yet.
    """
    pass

View File

@ -25,6 +25,9 @@ ROBOTSTXT_OBEY = False
# Project-wide log level.
LOG_LEVEL='DEBUG'
# Address of the Splash HTTP service used by scrapy_splash (a local instance
# on port 8050).
SPLASH_URL = 'http://127.0.0.1:8050'
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
@ -61,6 +64,16 @@ COOKIES_ENABLED = True
# 'taobao.middlewares.ZhihuDownloaderMiddleware': 543,
#}
# Downloader middlewares enabled for the project: the two scrapy_splash
# middlewares plus Scrapy's HTTP compression middleware, each mapped to its
# order value.
DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}
# Splash's own duplicate filter
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {

View File

@ -1,17 +1,14 @@
# -*- coding: utf-8 -*-
import json
import pdb
import subprocess
import pandas as pd
import scrapy
from scrapy import Request
from scrapy.http import Response
from lxml import etree
from scrapy import Request
from taobao.items import CommentItem
import pandas as pd
"""
商品评论数据抓取
"""

View File

@ -0,0 +1,72 @@
#coding=utf-8
import scrapy
from scrapy_splash import SplashRequest
import pdb
from taobao.items import JDProductItem
import re
class JdSpider(scrapy.Spider):
    """Spider that scrapes product listings from a JD category list page.

    Pages are requested through Splash (``SplashRequest``). Every product
    entry found on a list page is emitted as a ``JDProductItem``, and the
    "next page" link is followed until the page no longer provides one.
    """

    name = 'jd'
    allowed_domains = ['jd.com']
    start_urls = [
        "https://list.jd.com/list.html?cat=9987,653,655"
    ]

    # Per-spider overrides: quieter logging and the JD item pipelines.
    custom_settings = {
        'LOG_LEVEL': "WARN",
        'ITEM_PIPELINES': {
            'taobao.pipelines.JDCleanDataPipeline': 300,
            'taobao.pipelines.JDHbasePipeline': 400,
        }
    }

    def start_requests(self):
        """Yield one Splash request per start URL, handled by ``parse``."""
        for url in self.start_urls:
            yield SplashRequest(url,
                                self.parse,
                                args={})

    def parse(self, response):
        """Yield the products of one list page, then the next-page request.

        Args:
            response: Response for a JD list page.

        Yields:
            ``JDProductItem`` objects, one per product entry, followed by a
            ``SplashRequest`` for the next list page when one is linked.
        """
        for entry in response.xpath('//div[@id="plist"]//li[@class="gl-item"]'):
            yield self._parse_product(entry)

        # Follow the next page, if any.
        next_page = response.xpath('//a[@class="pn-next"]/@href').extract_first()
        if not next_page:
            print("抓取结束")
            return

        page = 1
        # search() instead of an anchored ".+page=(\d+).+" match: the page
        # number is found wherever "page=N" sits in the href, including at
        # its very end, and is never truncated by a trailing ".+".
        m = re.search(r'page=(\d+)', next_page)
        if m:
            page = m.group(1)
        print("开始抓取:{0}".format(page))
        # urljoin resolves relative hrefs against the current page and leaves
        # absolute ones intact; callback and args mirror start_requests.
        yield SplashRequest(response.urljoin(next_page),
                            self.parse,
                            args={})

    def _parse_product(self, entry):
        """Build a ``JDProductItem`` from one product entry of the list page."""
        name = entry.xpath('.//div[contains(@class, "p-name")]/a/em/text()').extract_first()
        print(name)
        price = entry.xpath('.//div[contains(@class, "p-price")]/strong/i/text()').extract_first()
        print(price)
        url = entry.xpath('.//div[contains(@class, "p-name")]/a/@href').extract_first()
        # extract_first() returns None when the entry has no link; only
        # prefix the scheme when there is a URL to show.
        print(('https:' + url) if url else url)
        # Prefer the lazy-load attribute and fall back to the plain src.
        img = entry.xpath('.//div[@class="p-img"]/a/img/@data-lazy-img').extract_first()
        if not img:
            img = entry.xpath('.//div[@class="p-img"]/a/img/@src').extract_first()
        comment_num = entry.xpath('.//div[@class="p-commit"]/strong/a/text()').extract_first()
        print(comment_num)
        shop = entry.xpath('.//div[@class="p-shop"]/span/a/@title').extract_first()
        print(shop)
        return JDProductItem(
            name=name,
            price=price,
            url=url,
            img=img,
            comment_num=comment_num,
            shop=shop,
        )