抓取京东评论

This commit is contained in:
guange 2019-01-15 16:07:01 +08:00
parent e38ba68191
commit e8b54aa51f
11 changed files with 414 additions and 182 deletions

View File

@ -2,7 +2,11 @@
<project version="4"> <project version="4">
<component name="ChangeListManager"> <component name="ChangeListManager">
<list default="true" id="a5fbb387-9969-4874-8a7f-c9dd40d5225d" name="Default Changelist" comment=""> <list default="true" id="a5fbb387-9969-4874-8a7f-c9dd40d5225d" name="Default Changelist" comment="">
<change afterPath="$PROJECT_DIR$/crawler/client.py" afterDir="false" />
<change afterPath="$PROJECT_DIR$/crawler/demo.py" afterDir="false" />
<change afterPath="$PROJECT_DIR$/crawler/taobao/utils.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" /> <change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" />
<change beforePath="$PROJECT_DIR$/crawler/taobao/items.py" beforeDir="false" afterPath="$PROJECT_DIR$/crawler/taobao/items.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/crawler/taobao/pipelines.py" beforeDir="false" afterPath="$PROJECT_DIR$/crawler/taobao/pipelines.py" afterDir="false" /> <change beforePath="$PROJECT_DIR$/crawler/taobao/pipelines.py" beforeDir="false" afterPath="$PROJECT_DIR$/crawler/taobao/pipelines.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/crawler/taobao/settings.py" beforeDir="false" afterPath="$PROJECT_DIR$/crawler/taobao/settings.py" afterDir="false" /> <change beforePath="$PROJECT_DIR$/crawler/taobao/settings.py" beforeDir="false" afterPath="$PROJECT_DIR$/crawler/taobao/settings.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/crawler/taobao/spiders/jd.py" beforeDir="false" afterPath="$PROJECT_DIR$/crawler/taobao/spiders/jd.py" afterDir="false" /> <change beforePath="$PROJECT_DIR$/crawler/taobao/spiders/jd.py" beforeDir="false" afterPath="$PROJECT_DIR$/crawler/taobao/spiders/jd.py" afterDir="false" />
@ -16,6 +20,7 @@
<component name="CoverageDataManager"> <component name="CoverageDataManager">
<SUITE FILE_PATH="coverage/chapter1$category.coverage" NAME="category Coverage Results" MODIFIED="1546938251328" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/crawler/taobao/spiders" /> <SUITE FILE_PATH="coverage/chapter1$category.coverage" NAME="category Coverage Results" MODIFIED="1546938251328" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/crawler/taobao/spiders" />
<SUITE FILE_PATH="coverage/chapter1$merge.coverage" NAME="merge Coverage Results" MODIFIED="1547014157363" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/crawler/taobao" /> <SUITE FILE_PATH="coverage/chapter1$merge.coverage" NAME="merge Coverage Results" MODIFIED="1547014157363" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/crawler/taobao" />
<SUITE FILE_PATH="coverage/chapter1$demo.coverage" NAME="demo Coverage Results" MODIFIED="1547519456164" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/crawler" />
<SUITE FILE_PATH="coverage/chapter1$comment.coverage" NAME="comment Coverage Results" MODIFIED="1547166377597" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/crawler/taobao/spiders" /> <SUITE FILE_PATH="coverage/chapter1$comment.coverage" NAME="comment Coverage Results" MODIFIED="1547166377597" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/crawler/taobao/spiders" />
</component> </component>
<component name="FileEditorManager"> <component name="FileEditorManager">
@ -23,8 +28,8 @@
<file pinned="false" current-in-tab="false"> <file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/crawler/taobao/items.py"> <entry file="file://$PROJECT_DIR$/crawler/taobao/items.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="374"> <state relative-caret-position="-545">
<caret line="47" column="15" selection-start-line="47" selection-start-column="4" selection-end-line="47" selection-end-column="15" /> <caret line="14" selection-start-line="14" selection-end-line="14" />
</state> </state>
</provider> </provider>
</entry> </entry>
@ -32,8 +37,8 @@
<file pinned="false" current-in-tab="false"> <file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/crawler/taobao/pipelines.py"> <entry file="file://$PROJECT_DIR$/crawler/taobao/pipelines.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="1155"> <state relative-caret-position="205">
<caret line="77" column="31" lean-forward="true" selection-start-line="77" selection-start-column="31" selection-end-line="77" selection-end-column="31" /> <caret line="105" column="20" selection-start-line="104" selection-start-column="20" selection-end-line="105" selection-end-column="20" />
<folding> <folding>
<element signature="e#193#204#0" expanded="true" /> <element signature="e#193#204#0" expanded="true" />
</folding> </folding>
@ -42,43 +47,10 @@
</entry> </entry>
</file> </file>
<file pinned="false" current-in-tab="false"> <file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/scrapy/conf.py"> <entry file="file://$PROJECT_DIR$/crawler/taobao/utils.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="105"> <state relative-caret-position="60">
<caret line="7" column="4" selection-start-line="7" selection-start-column="4" selection-end-line="7" selection-end-column="4" /> <caret line="4" column="12" selection-start-line="4" selection-start-column="4" selection-end-line="4" selection-end-column="12" />
</state>
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/happybase/connection.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="270">
<caret line="107" column="8" selection-start-line="107" selection-start-column="8" selection-end-line="107" selection-end-column="8" />
</state>
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/comment.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="-271">
<caret line="36" column="53" selection-start-line="36" selection-start-column="53" selection-end-line="36" selection-end-column="53" />
<folding>
<element signature="e#24#35#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/crawler/taobao/middlewares.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="216">
<caret line="137" column="32" selection-start-line="137" selection-start-column="32" selection-end-line="137" selection-end-column="32" />
<folding>
<element signature="e#168#178#0" expanded="true" />
</folding>
</state> </state>
</provider> </provider>
</entry> </entry>
@ -86,38 +58,38 @@
<file pinned="false" current-in-tab="false"> <file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/crawler/taobao/settings.py"> <entry file="file://$PROJECT_DIR$/crawler/taobao/settings.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="426"> <state relative-caret-position="176">
<caret line="119" column="12" selection-start-line="119" selection-start-column="12" selection-end-line="119" selection-end-column="12" /> <caret line="115" selection-start-line="115" selection-end-line="115" />
</state> </state>
</provider> </provider>
</entry> </entry>
</file> </file>
<file pinned="false" current-in-tab="true"> <file pinned="false" current-in-tab="true">
<entry file="file://$PROJECT_DIR$/crawler/client.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="235">
<caret line="36" column="10" lean-forward="true" selection-start-line="36" selection-start-column="10" selection-end-line="36" selection-end-column="10" />
</state>
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/jd.py"> <entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/jd.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="63"> <state relative-caret-position="-680">
<caret line="72" column="40" lean-forward="true" selection-start-line="72" selection-start-column="40" selection-end-line="72" selection-end-column="40" /> <caret line="85" selection-start-line="85" selection-end-line="85" />
<folding> <folding>
<element signature="e#14#25#0" expanded="true" /> <element signature="e#15#26#0" expanded="true" />
</folding> </folding>
</state> </state>
</provider> </provider>
</entry> </entry>
</file> </file>
<file pinned="false" current-in-tab="false"> <file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/scrapy/selector/unified.py"> <entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/scrapy/http/request/__init__.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="492"> <state relative-caret-position="180">
<caret line="84" column="24" selection-start-line="84" selection-start-column="8" selection-end-line="84" selection-end-column="24" /> <caret line="19" column="68" selection-start-line="19" selection-start-column="60" selection-end-line="19" selection-end-column="68" />
</state>
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/parsel/selector.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="245">
<caret line="238" column="39" selection-start-line="238" selection-start-column="39" selection-end-line="238" selection-end-column="39" />
</state> </state>
</provider> </provider>
</entry> </entry>
@ -142,6 +114,12 @@
<find>DUPEFILTER_CLASS</find> <find>DUPEFILTER_CLASS</find>
<find>contains</find> <find>contains</find>
<find>scrapy.selector.unified.Selector</find> <find>scrapy.selector.unified.Selector</find>
<find>AUTOTHROTTLE_ENABLED</find>
<find>WEBSERVICE_PORT</find>
<find>WEBSERVICE_ENABLED</find>
<find>TELNETCONSOLE_ENABLED</find>
<find>EXTENSIONS_BASE</find>
<find>TELNET</find>
</findStrings> </findStrings>
</component> </component>
<component name="Git.Settings"> <component name="Git.Settings">
@ -159,10 +137,13 @@
<option value="$PROJECT_DIR$/crawler/taobao/merge.py" /> <option value="$PROJECT_DIR$/crawler/taobao/merge.py" />
<option value="$PROJECT_DIR$/crawler/taobao/spiders/comment.py" /> <option value="$PROJECT_DIR$/crawler/taobao/spiders/comment.py" />
<option value="$PROJECT_DIR$/crawler/taobao/spiders/category.py" /> <option value="$PROJECT_DIR$/crawler/taobao/spiders/category.py" />
<option value="$PROJECT_DIR$/crawler/demo.py" />
<option value="$PROJECT_DIR$/crawler/taobao/utils.py" />
<option value="$PROJECT_DIR$/crawler/taobao/items.py" /> <option value="$PROJECT_DIR$/crawler/taobao/items.py" />
<option value="$PROJECT_DIR$/crawler/taobao/settings.py" />
<option value="$PROJECT_DIR$/crawler/taobao/pipelines.py" /> <option value="$PROJECT_DIR$/crawler/taobao/pipelines.py" />
<option value="$PROJECT_DIR$/crawler/taobao/spiders/jd.py" /> <option value="$PROJECT_DIR$/crawler/taobao/spiders/jd.py" />
<option value="$PROJECT_DIR$/crawler/taobao/settings.py" />
<option value="$PROJECT_DIR$/crawler/client.py" />
</list> </list>
</option> </option>
</component> </component>
@ -177,6 +158,7 @@
<foldersAlwaysOnTop value="true" /> <foldersAlwaysOnTop value="true" />
</navigator> </navigator>
<panes> <panes>
<pane id="Scope" />
<pane id="ProjectPane"> <pane id="ProjectPane">
<subPane> <subPane>
<expand> <expand>
@ -189,24 +171,10 @@
<item name="chapter1" type="462c0819:PsiDirectoryNode" /> <item name="chapter1" type="462c0819:PsiDirectoryNode" />
<item name="crawler" type="462c0819:PsiDirectoryNode" /> <item name="crawler" type="462c0819:PsiDirectoryNode" />
</path> </path>
<path>
<item name="chapter1" type="b2602c69:ProjectViewProjectNode" />
<item name="chapter1" type="462c0819:PsiDirectoryNode" />
<item name="crawler" type="462c0819:PsiDirectoryNode" />
<item name="taobao" type="462c0819:PsiDirectoryNode" />
</path>
<path>
<item name="chapter1" type="b2602c69:ProjectViewProjectNode" />
<item name="chapter1" type="462c0819:PsiDirectoryNode" />
<item name="crawler" type="462c0819:PsiDirectoryNode" />
<item name="taobao" type="462c0819:PsiDirectoryNode" />
<item name="spiders" type="462c0819:PsiDirectoryNode" />
</path>
</expand> </expand>
<select /> <select />
</subPane> </subPane>
</pane> </pane>
<pane id="Scope" />
</panes> </panes>
</component> </component>
<component name="PropertiesComponent"> <component name="PropertiesComponent">
@ -233,7 +201,7 @@
</list> </list>
</option> </option>
</component> </component>
<component name="RunManager" selected="Python.comment"> <component name="RunManager" selected="Python.demo">
<configuration name="category" type="PythonConfigurationType" factoryName="Python" temporary="true"> <configuration name="category" type="PythonConfigurationType" factoryName="Python" temporary="true">
<module name="chapter1" /> <module name="chapter1" />
<option name="INTERPRETER_OPTIONS" value="" /> <option name="INTERPRETER_OPTIONS" value="" />
@ -278,6 +246,28 @@
<option name="INPUT_FILE" value="" /> <option name="INPUT_FILE" value="" />
<method v="2" /> <method v="2" />
</configuration> </configuration>
<configuration name="demo" type="PythonConfigurationType" factoryName="Python" temporary="true">
<module name="chapter1" />
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<envs>
<env name="PYTHONUNBUFFERED" value="1" />
</envs>
<option name="SDK_HOME" value="" />
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/crawler" />
<option name="IS_MODULE_SDK" value="true" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/crawler/demo.py" />
<option name="PARAMETERS" value="" />
<option name="SHOW_COMMAND_LINE" value="false" />
<option name="EMULATE_TERMINAL" value="false" />
<option name="MODULE_MODE" value="false" />
<option name="REDIRECT_INPUT" value="false" />
<option name="INPUT_FILE" value="" />
<method v="2" />
</configuration>
<configuration name="merge" type="PythonConfigurationType" factoryName="Python" temporary="true"> <configuration name="merge" type="PythonConfigurationType" factoryName="Python" temporary="true">
<module name="chapter1" /> <module name="chapter1" />
<option name="INTERPRETER_OPTIONS" value="" /> <option name="INTERPRETER_OPTIONS" value="" />
@ -302,6 +292,7 @@
</configuration> </configuration>
<recent_temporary> <recent_temporary>
<list> <list>
<item itemvalue="Python.demo" />
<item itemvalue="Python.comment" /> <item itemvalue="Python.comment" />
<item itemvalue="Python.merge" /> <item itemvalue="Python.merge" />
<item itemvalue="Python.category" /> <item itemvalue="Python.category" />
@ -337,7 +328,14 @@
<option name="project" value="LOCAL" /> <option name="project" value="LOCAL" />
<updated>1547454506424</updated> <updated>1547454506424</updated>
</task> </task>
<option name="localTasksCounter" value="4" /> <task id="LOCAL-00004" summary="导入hbase">
<created>1547476493838</created>
<option name="number" value="00004" />
<option name="presentableId" value="LOCAL-00004" />
<option name="project" value="LOCAL" />
<updated>1547476493838</updated>
</task>
<option name="localTasksCounter" value="5" />
<servers /> <servers />
</component> </component>
<component name="TodoView"> <component name="TodoView">
@ -351,19 +349,20 @@
</component> </component>
<component name="ToolWindowManager"> <component name="ToolWindowManager">
<frame x="0" y="23" width="1280" height="777" extended-state="0" /> <frame x="0" y="23" width="1280" height="777" extended-state="0" />
<editor active="true" />
<layout> <layout>
<window_info content_ui="combo" id="Project" order="0" visible="true" weight="0.25767368" /> <window_info active="true" content_ui="combo" id="Project" order="0" visible="true" weight="0.26171243" />
<window_info id="Structure" order="1" side_tool="true" weight="0.25" /> <window_info id="Structure" order="1" side_tool="true" weight="0.25" />
<window_info id="Favorites" order="2" side_tool="true" /> <window_info id="Favorites" order="2" side_tool="true" />
<window_info anchor="bottom" id="Message" order="0" /> <window_info anchor="bottom" id="Message" order="0" />
<window_info anchor="bottom" id="Find" order="1" /> <window_info anchor="bottom" id="Find" order="1" />
<window_info anchor="bottom" id="Run" order="2" weight="0.3970803" /> <window_info anchor="bottom" id="Run" order="2" weight="0.2729927" />
<window_info anchor="bottom" id="Debug" order="3" weight="0.4" /> <window_info anchor="bottom" id="Debug" order="3" weight="0.4" />
<window_info anchor="bottom" id="Cvs" order="4" weight="0.25" /> <window_info anchor="bottom" id="Cvs" order="4" weight="0.25" />
<window_info anchor="bottom" id="Inspection" order="5" weight="0.4" /> <window_info anchor="bottom" id="Inspection" order="5" weight="0.4" />
<window_info anchor="bottom" id="TODO" order="6" weight="0.329927" /> <window_info anchor="bottom" id="TODO" order="6" weight="0.329927" />
<window_info anchor="bottom" id="Version Control" order="7" /> <window_info anchor="bottom" id="Version Control" order="7" />
<window_info active="true" anchor="bottom" id="Terminal" order="8" visible="true" weight="0.649635" /> <window_info anchor="bottom" id="Terminal" order="8" visible="true" weight="0.38686132" />
<window_info anchor="bottom" id="Event Log" order="9" side_tool="true" /> <window_info anchor="bottom" id="Event Log" order="9" side_tool="true" />
<window_info anchor="bottom" id="Python Console" order="10" weight="0.329927" /> <window_info anchor="bottom" id="Python Console" order="10" weight="0.329927" />
<window_info anchor="right" id="Commander" internal_type="SLIDING" order="0" type="SLIDING" weight="0.4" /> <window_info anchor="right" id="Commander" internal_type="SLIDING" order="0" type="SLIDING" weight="0.4" />
@ -376,7 +375,8 @@
<MESSAGE value="数据抓取" /> <MESSAGE value="数据抓取" />
<MESSAGE value="增加id字段" /> <MESSAGE value="增加id字段" />
<MESSAGE value="加入京东抓取" /> <MESSAGE value="加入京东抓取" />
<option name="LAST_COMMIT_MESSAGE" value="加入京东抓取" /> <MESSAGE value="导入hbase" />
<option name="LAST_COMMIT_MESSAGE" value="导入hbase" />
</component> </component>
<component name="editorHistoryManager"> <component name="editorHistoryManager">
<entry file="file://$PROJECT_DIR$/crawler/scrapy.cfg"> <entry file="file://$PROJECT_DIR$/crawler/scrapy.cfg">
@ -396,9 +396,6 @@
</state> </state>
</provider> </provider>
</entry> </entry>
<entry file="file://$PROJECT_DIR$/crawler/taobao/__init__.py">
<provider selected="true" editor-type-id="text-editor" />
</entry>
<entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/run.py" /> <entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/run.py" />
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/scrapy/spiders/__init__.py"> <entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/scrapy/spiders/__init__.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
@ -438,13 +435,6 @@
</state> </state>
</provider> </provider>
</entry> </entry>
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/scrapy/http/request/__init__.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="118">
<caret line="16" column="6" selection-start-line="16" selection-start-column="6" selection-end-line="16" selection-end-column="6" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/scrapy/http/response/__init__.py"> <entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/scrapy/http/response/__init__.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="110"> <state relative-caret-position="110">
@ -469,9 +459,7 @@
</entry> </entry>
<entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/hah.json" /> <entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/hah.json" />
<entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/douban.json" /> <entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/douban.json" />
<entry file="file://$PROJECT_DIR$/crawler/datas/comments/23761145.json"> <entry file="file://$PROJECT_DIR$/crawler/datas/comments/23761145.json" />
<provider selected="true" editor-type-id="text-editor" />
</entry>
<entry file="file://$PROJECT_DIR$/crawler/datas/2小时送货服务.json" /> <entry file="file://$PROJECT_DIR$/crawler/datas/2小时送货服务.json" />
<entry file="file://$PROJECT_DIR$/crawler/datas/3万以下.json" /> <entry file="file://$PROJECT_DIR$/crawler/datas/3万以下.json" />
<entry file="file://$PROJECT_DIR$/crawler/datas/kind.json" /> <entry file="file://$PROJECT_DIR$/crawler/datas/kind.json" />
@ -483,13 +471,6 @@
</state> </state>
</provider> </provider>
</entry> </entry>
<entry file="file://$PROJECT_DIR$/crawler/taobao/merge.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="345">
<caret line="23" column="30" selection-start-line="23" selection-start-column="30" selection-end-line="23" selection-end-column="30" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/jupyter.py"> <entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/jupyter.py">
<provider selected="true" editor-type-id="text-editor" /> <provider selected="true" editor-type-id="text-editor" />
</entry> </entry>
@ -548,40 +529,6 @@
</state> </state>
</provider> </provider>
</entry> </entry>
<entry file="file://$PROJECT_DIR$/crawler/taobao/middlewares.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="216">
<caret line="137" column="32" selection-start-line="137" selection-start-column="32" selection-end-line="137" selection-end-column="32" />
<folding>
<element signature="e#168#178#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/comment.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="-271">
<caret line="36" column="53" selection-start-line="36" selection-start-column="53" selection-end-line="36" selection-end-column="53" />
<folding>
<element signature="e#24#35#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/scrapy/item.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="390">
<caret line="68" column="21" selection-start-line="68" selection-start-column="21" selection-end-line="68" selection-end-column="21" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/happybase/connection.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="270">
<caret line="107" column="8" selection-start-line="107" selection-start-column="8" selection-end-line="107" selection-end-column="8" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/scrapy/conf.py"> <entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/scrapy/conf.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="105"> <state relative-caret-position="105">
@ -589,37 +536,123 @@
</state> </state>
</provider> </provider>
</entry> </entry>
<entry file="file://$PROJECT_DIR$/crawler/taobao/items.py"> <entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/happybase/connection.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="374"> <state relative-caret-position="1455">
<caret line="47" column="15" selection-start-line="47" selection-start-column="4" selection-end-line="47" selection-end-column="15" /> <caret line="107" column="8" selection-start-line="107" selection-start-column="8" selection-end-line="107" selection-end-column="8" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/comment.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="420">
<caret line="36" column="53" selection-start-line="36" selection-start-column="53" selection-end-line="36" selection-end-column="53" />
<folding>
<element signature="e#24#35#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/crawler/taobao/middlewares.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="1920">
<caret line="137" column="32" selection-start-line="137" selection-start-column="32" selection-end-line="137" selection-end-column="32" />
<folding>
<element signature="e#168#178#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/scrapy/item.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="162">
<caret line="58" selection-start-line="58" selection-end-line="58" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/crawler/taobao/__init__.py">
<provider selected="true" editor-type-id="text-editor" />
</entry>
<entry file="file://$PROJECT_DIR$/crawler/taobao/merge.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="345">
<caret line="23" column="30" selection-start-line="23" selection-start-column="30" selection-end-line="23" selection-end-column="30" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/scrapy_splash/request.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="-632">
<caret line="20" column="8" selection-start-line="20" selection-start-column="8" selection-end-line="20" selection-end-column="8" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/crawler/demo.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="311">
<caret line="26" column="36" lean-forward="true" selection-start-line="26" selection-start-column="36" selection-end-line="26" selection-end-column="36" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/scrapy/http/request/__init__.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="180">
<caret line="19" column="68" selection-start-line="19" selection-start-column="60" selection-end-line="19" selection-end-column="68" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/jd.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="-680">
<caret line="85" selection-start-line="85" selection-end-line="85" />
<folding>
<element signature="e#15#26#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/crawler/taobao/utils.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="60">
<caret line="4" column="12" selection-start-line="4" selection-start-column="4" selection-end-line="4" selection-end-column="12" />
</state> </state>
</provider> </provider>
</entry> </entry>
<entry file="file://$PROJECT_DIR$/crawler/taobao/pipelines.py"> <entry file="file://$PROJECT_DIR$/crawler/taobao/pipelines.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="1155"> <state relative-caret-position="205">
<caret line="77" column="31" lean-forward="true" selection-start-line="77" selection-start-column="31" selection-end-line="77" selection-end-column="31" /> <caret line="105" column="20" selection-start-line="104" selection-start-column="20" selection-end-line="105" selection-end-column="20" />
<folding> <folding>
<element signature="e#193#204#0" expanded="true" /> <element signature="e#193#204#0" expanded="true" />
</folding> </folding>
</state> </state>
</provider> </provider>
</entry> </entry>
<entry file="file://$PROJECT_DIR$/crawler/taobao/settings.py"> <entry file="file://$PROJECT_DIR$/crawler/taobao/items.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="426"> <state relative-caret-position="-545">
<caret line="119" column="12" selection-start-line="119" selection-start-column="12" selection-end-line="119" selection-end-column="12" /> <caret line="14" selection-start-line="14" selection-end-line="14" />
</state> </state>
</provider> </provider>
</entry> </entry>
<entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/jd.py"> <entry file="file://$PROJECT_DIR$/crawler/taobao/settings.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="63"> <state relative-caret-position="176">
<caret line="72" column="40" lean-forward="true" selection-start-line="72" selection-start-column="40" selection-end-line="72" selection-end-column="40" /> <caret line="115" selection-start-line="115" selection-end-line="115" />
<folding> </state>
<element signature="e#14#25#0" expanded="true" /> </provider>
</folding> </entry>
<entry file="file:///usr/local/Cellar/python/3.7.0/Frameworks/Python.framework/Versions/3.7/lib/python3.7/telnetlib.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="291">
<caret line="289" column="33" lean-forward="true" selection-start-line="289" selection-start-column="33" selection-end-line="289" selection-end-column="33" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/crawler/client.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="235">
<caret line="36" column="10" lean-forward="true" selection-start-line="36" selection-start-column="10" selection-end-line="36" selection-end-column="10" />
</state> </state>
</provider> </provider>
</entry> </entry>

View File

@ -0,0 +1,40 @@
#!/usr/bin/env python
"""
Example script to control a Scrapy server using its JSON-RPC web service.
It only provides a reduced functionality as its main purpose is to illustrate
how to write a web service client. Feel free to improve or write your own.
Also, keep in mind that the JSON-RPC API is not stable. The recommended way for
controlling a Scrapy server is through the execution queue (see the "queue"
command).
"""
import telnetlib
def do_telnet(Host, finish):
    """Connect to a telnet console, invoke est(), and print the output.

    Host   -- hostname/IP of the telnet server
    finish -- byte string marking the console prompt to wait for (e.g. b'>>> ')
    """
    # Connect to the telnet server; 6023 is Scrapy's default telnet-console
    # port (see the module docstring: this script talks to a Scrapy server).
    tn = telnetlib.Telnet(Host, port=6023, timeout=10)
    tn.set_debuglevel(2)
    # Wait for the console prompt, then send est() — presumably the Scrapy
    # console's engine-status helper; TODO confirm against the console docs.
    out = tn.read_until(finish)
    tn.write(b'est()\n')
    print('-----------------1')
    print(out.decode('utf8'))
    print('-----------------11')
    # Wait for the next prompt so the output of est() is fully captured.
    out = tn.read_until(finish)
    print('-----------------2')
    print(out.decode('utf8'))
    print('-----------------3')
    tn.close()  # tn.write('exit\n')
if __name__ == '__main__':
    # Configuration
    Host = '127.0.0.1'  # telnet server IP
    finish = b'>>> '    # console prompt marker
    do_telnet(Host,finish)

31
chapter1/crawler/demo.py Normal file
View File

@ -0,0 +1,31 @@
"""Demo: fetch JD product comments page by page via the JSONP endpoint."""
import re
import requests
import json

# One session so the TCP connection and cookies are reused across pages.
s = requests.session()
url = 'https://club.jd.com/comment/productPageComments.action'
# Query parameters for JD's JSONP comment endpoint; 'page' advances below.
data = {
    'callback': 'fetchJSON_comment98vv61',
    'productId': '3888284',
    'score': 0,
    'sortType': 5,
    'pageSize': 10,
    'isShadowSku': 0,
    'page': 0,
}

while True:
    t = s.get(url, params=data).text
    # The payload is JSONP: fetchJSON_comment98vv61({...}); strip the wrapper.
    # Check the match explicitly instead of catching AttributeError with a
    # broad `except Exception` (which also hid the unused binding `e`).
    m = re.search(r'(?<=fetchJSON_comment98vv61\().*(?=\);)', t)
    if m is None:
        # No JSONP wrapper — blocked or malformed response; stop.
        break
    j = json.loads(m.group(0))
    commentSummary = j['comments']
    if not commentSummary:
        # Past the last page JD still returns valid JSONP with an empty
        # comment list; without this check the loop would request forever.
        break
    for comment in commentSummary:
        c_content = comment['content']
        c_time = comment['referenceTime']
        c_name = comment['nickname']
        c_client = comment['userClientShow']
        print('{} {} {}\n{}\n'.format(c_name, c_time, c_client, c_content))
    data['page'] += 1

View File

@ -46,4 +46,24 @@ class JDProductItem(scrapy.Item):
img = scrapy.Field() img = scrapy.Field()
shop = scrapy.Field() shop = scrapy.Field()
comment_num = scrapy.Field() comment_num = scrapy.Field()
created_at = scrapy.Field()
class JDCommentItem(scrapy.Item):
    """A single user comment scraped from a JD product page."""
    id = scrapy.Field()            # "<product_id>_<comment_id>" composite key
    user_name = scrapy.Field()     # commenter's nickname
    from_type = scrapy.Field()     # client the comment was posted from (userClientShow)
    content = scrapy.Field()       # comment body text
    comment_time = scrapy.Field()  # comment timestamp (JD's referenceTime field)
    created_at = scrapy.Field()    # local time the item was scraped
class JDCommentSummary(scrapy.Item):
    """Aggregate comment statistics for one product (emitted once, from page 0)."""
    id = scrapy.Field()                  # product id (row key)
    good_count = scrapy.Field()          # positive reviews (goodCount)
    general_count = scrapy.Field()       # neutral reviews (generalCount)
    poor_count = scrapy.Field()          # negative reviews (poorCount)
    comment_count = scrapy.Field()       # total comments (commentCount)
    default_good_count = scrapy.Field()  # default no-text positives (defaultGoodCount)
    good_rate = scrapy.Field()           # positive-review ratio (goodRate)

View File

@ -12,6 +12,8 @@ import happybase
from scrapy.conf import settings from scrapy.conf import settings
from scrapy.pipelines.images import ImagesPipeline from scrapy.pipelines.images import ImagesPipeline
from taobao.items import JDProductItem, JDCommentItem, JDCommentSummary
class MyImagesPipeline(ImagesPipeline): class MyImagesPipeline(ImagesPipeline):
pass pass
@ -52,17 +54,19 @@ class JDCleanDataPipeline(object):
""" """
京东数据清先 京东数据清先
""" """
def process_item(self, item, spider): def process_item(self, item, spider):
if isinstance(item["name"], str): if isinstance(item, JDProductItem):
item["name"] = item["name"].strip().replace('\n', '') if isinstance(item["name"], str):
item["name"] = item["name"].strip().replace('\n', '')
m = re.match(r'.+\/(\d+)\.html', item['url']) m = re.match(r'.+\/(\d+)\.html', item['url'])
if m: if m:
item["id"] = m.group(1) item["id"] = m.group(1)
if not isinstance(item["id"], str): if not isinstance(item["id"], str):
return None return None
elif isinstance(item, JDCommentItem):
pass
return item return item
@ -71,20 +75,43 @@ class JDHbasePipeline(object):
""" """
数据存到hbase中 数据存到hbase中
""" """
def __init__(self): def __init__(self):
host = settings['HBASE_HOST'] host = settings['HBASE_HOST']
table_name = settings['HBASE_TABLE'] table_name = settings['HBASE_TABLE']
connection = happybase.Connection(host, port=settings["HBASE_PORT"]) connection = happybase.Connection(host, port=settings["HBASE_PORT"])
table = connection.table(table_name) table = connection.table(table_name)
self.table = table self.product_table = table
self.comment_table = connection.table('jd_comment')
def process_item(self, item, spider): def process_item(self, item, spider):
self.table.put(item["id"], if isinstance(item, JDProductItem):
{"product:name": item["name"], self.product_table.put(item["id"],
"product:price": item["price"], {"product:name": item["name"],
"product:url": item["url"], "product:price": item["price"],
"product:img": item["img"], "product:url": item["url"],
"product:shop": item["shop"], "product:img": item["img"],
"product:comment_num": item["comment_num"]}) "product:shop": item["shop"],
"product:comment_num": item["comment_num"],
"product:created_at": item["created_at"]
})
elif isinstance(item, JDCommentItem):
self.comment_table.put(item["id"],
{
"comment:user_name": item["user_name"],
"comment:from_type": item["from_type"],
"comment:content": item["content"],
"comment:comment_time": item["comment_time"],
"comment:created_at": item["created_at"]
})
elif isinstance(item, JDCommentSummary):
self.product_table.put(item["id"],
{"comment:good_count": str(item["good_count"]),
"comment:general_count": str(item["general_count"]),
"comment:poor_count": str(item["poor_count"]),
"comment:comment_count": str(item["comment_count"]),
"comment:default_good_count": str(item["default_good_count"]),
"comment:good_rate": str(item["good_rate"])
})
return item return item

View File

@ -43,7 +43,7 @@ SPLASH_URL = 'http://127.0.0.1:8050'
COOKIES_ENABLED = True COOKIES_ENABLED = True
# Disable Telnet Console (enabled by default) # Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False TELNETCONSOLE_ENABLED = True
# Override the default request headers: # Override the default request headers:
#DEFAULT_REQUEST_HEADERS = { #DEFAULT_REQUEST_HEADERS = {
@ -92,14 +92,15 @@ IMAGES_STORE = '/tmp/taobao'
# Enable and configure the AutoThrottle extension (disabled by default) # Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True # 启用限速
AUTOTHROTTLE_ENABLED = True
# The initial download delay # The initial download delay
#AUTOTHROTTLE_START_DELAY = 5 AUTOTHROTTLE_START_DELAY = 10
# The maximum download delay to be set in case of high latencies # The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60 AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to # The average number of requests Scrapy should be sending in parallel to
# each remote server # each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received: # Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False #AUTOTHROTTLE_DEBUG = False
@ -114,7 +115,8 @@ IMAGES_STORE = '/tmp/taobao'
FEED_EXPORT_ENCODING='utf-8' FEED_EXPORT_ENCODING='utf-8'
HBASE_HOST = '106.75.85.84' HBASE_HOST = '106.75.85.84'
HBASE_PORT = 40009 HBASE_PORT = 40009
HBASE_TABLE = 'jd' HBASE_TABLE = 'jd'
COMMENT_MAX_PAGE = 10

View File

@ -1,12 +1,17 @@
#coding=utf-8 # coding=utf-8
import json
import time import time
import scrapy import scrapy
from scrapy import Request
from scrapy.conf import settings
from scrapy_splash import SplashRequest from scrapy_splash import SplashRequest
import pdb import pdb
from taobao.items import JDProductItem from taobao.items import JDProductItem, JDCommentItem, JDCommentSummary
import re import re
from taobao.utils import now_time
class JdSpider(scrapy.Spider): class JdSpider(scrapy.Spider):
name = 'jd' name = 'jd'
@ -23,12 +28,70 @@ class JdSpider(scrapy.Spider):
} }
} }
def make_comment_url(self, product_id, page):
comment_url = "https://club.jd.com/comment/productPageComments.action?callback=fetchJSON" \
"_comment98vv33164&productId=%s&score=0&sortType=5&page=%d&pageSize=10" \
"&isShadowSku=0&rid=0&fold=1" % (product_id, page)
return comment_url
def start_requests(self): def start_requests(self):
for url in self.start_urls: for url in self.start_urls:
yield SplashRequest(url, yield SplashRequest(url,
self.parse, self.parse,
args={}) args={})
def parse_comment(self, response):
# pdb.set_trace()
product_id = response.meta["product_id"]
page = response.meta["page"]
print("抓取评论, %s, 当前 %d" % (product_id, page))
m = re.match(r'^fetchJSON_comment98vv33164\((.+)\);$', response.text)
if m:
t = m.group(1)
j = json.loads(t)
commentSummary = j['comments']
for comment in commentSummary:
c_content = comment['content']
c_time = comment['referenceTime']
c_name = comment['nickname']
c_client = comment['userClientShow']
c_id = "%s_%s" % (product_id, comment['id'])
print('{}, {} {} {}\n{}\n'.format(c_id, c_name, c_time, c_client, c_content))
yield JDCommentItem(
id=c_id,
user_name=c_name,
from_type=c_client,
content=c_content,
comment_time=c_time,
created_at=now_time()
)
if page == 0:
productCommentSummary = j['productCommentSummary']
yield JDCommentSummary(
id = product_id,
good_count = productCommentSummary['goodCount'],
general_count= productCommentSummary['generalCount'],
poor_count= productCommentSummary['poorCount'],
comment_count= productCommentSummary['commentCount'],
default_good_count= productCommentSummary['defaultGoodCount'],
good_rate= productCommentSummary['goodRate']
)
page = page + 1
next_comment_url = self.make_comment_url(product_id, page)
# time.sleep(10)
if page>settings['COMMENT_MAX_PAGE']:
print("评论抓取达到最大深度, %s, 页数: %d", product_id, page)
else:
yield Request(next_comment_url, self.parse_comment,
meta={'product_id': product_id, 'page': page})
else:
print("评论抓取完成, %s, 页数: %d", product_id, page)
def parse(self, response): def parse(self, response):
items = response.xpath('//div[@id="plist"]//li[@class="gl-item"]') items = response.xpath('//div[@id="plist"]//li[@class="gl-item"]')
@ -36,10 +99,11 @@ class JdSpider(scrapy.Spider):
# pdb.set_trace() # pdb.set_trace()
name = item.xpath('.//div[contains(@class, "p-name")]/a/em/text()').extract_first() name = item.xpath('.//div[contains(@class, "p-name")]/a/em/text()').extract_first()
print(name) print(name)
price = item.xpath('.//div[contains(@class, "p-price")]/strong/i/text()').extract_first() price = item.xpath(
'.//div[contains(@class, "p-price")]/strong/i/text()').extract_first()
print(price) print(price)
url = item.xpath('.//div[contains(@class, "p-name")]/a/@href').extract_first() url = item.xpath('.//div[contains(@class, "p-name")]/a/@href').extract_first()
print('https:'+url) print('https:' + url)
img = item.xpath('.//div[@class="p-img"]/a/img/@data-lazy-img').extract_first() img = item.xpath('.//div[@class="p-img"]/a/img/@data-lazy-img').extract_first()
if not img: if not img:
img = item.xpath('.//div[@class="p-img"]/a/img/@src').extract_first() img = item.xpath('.//div[@class="p-img"]/a/img/@src').extract_first()
@ -50,14 +114,23 @@ class JdSpider(scrapy.Spider):
print(shop) print(shop)
yield JDProductItem( yield JDProductItem(
name = name, name=name,
price = price, price=price,
url = url, url=url,
img = img, img=img,
comment_num = comment_num, comment_num=comment_num,
shop = shop shop=shop,
created_at=now_time()
) )
m = re.match(r'.+\/(\d+)\.html', url)
if m:
product_id = m.group(1)
comment_url = self.make_comment_url(product_id, 0)
yield Request(comment_url, self.parse_comment,
meta={'product_id': product_id, 'page': 0},
priority=100)
# 获取下一页 # 获取下一页
next_page = response.xpath('//a[@class="pn-next"]/@href').extract_first() next_page = response.xpath('//a[@class="pn-next"]/@href').extract_first()
if next_page: if next_page:
@ -67,10 +140,10 @@ class JdSpider(scrapy.Spider):
page = m.group(1) page = m.group(1)
print("开始抓取:{0}".format(page)) print("开始抓取:{0}".format(page))
time.sleep(10) # time.sleep(10)
yield SplashRequest( yield SplashRequest(
url= 'https://list.jd.com'+next_page url='https://list.jd.com' + next_page
) )
else: else:
print("抓取结束") print("抓取结束")

View File

@ -0,0 +1,6 @@
# coding=utf-8
import time
def now_time():
    """Return the current local time formatted as 'YYYY-MM-DD HH:MM:SS'."""
    # strftime without an explicit struct_time formats the current local
    # time, which is exactly what localtime(time.time()) produced.
    return time.strftime('%Y-%m-%d %H:%M:%S')