抓取京东评论

This commit is contained in:
guange 2019-01-15 16:07:01 +08:00
parent e38ba68191
commit e8b54aa51f
11 changed files with 414 additions and 182 deletions

View File

@ -2,7 +2,11 @@
<project version="4">
<component name="ChangeListManager">
<list default="true" id="a5fbb387-9969-4874-8a7f-c9dd40d5225d" name="Default Changelist" comment="">
<change afterPath="$PROJECT_DIR$/crawler/client.py" afterDir="false" />
<change afterPath="$PROJECT_DIR$/crawler/demo.py" afterDir="false" />
<change afterPath="$PROJECT_DIR$/crawler/taobao/utils.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" />
<change beforePath="$PROJECT_DIR$/crawler/taobao/items.py" beforeDir="false" afterPath="$PROJECT_DIR$/crawler/taobao/items.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/crawler/taobao/pipelines.py" beforeDir="false" afterPath="$PROJECT_DIR$/crawler/taobao/pipelines.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/crawler/taobao/settings.py" beforeDir="false" afterPath="$PROJECT_DIR$/crawler/taobao/settings.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/crawler/taobao/spiders/jd.py" beforeDir="false" afterPath="$PROJECT_DIR$/crawler/taobao/spiders/jd.py" afterDir="false" />
@ -16,6 +20,7 @@
<component name="CoverageDataManager">
<SUITE FILE_PATH="coverage/chapter1$category.coverage" NAME="category Coverage Results" MODIFIED="1546938251328" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/crawler/taobao/spiders" />
<SUITE FILE_PATH="coverage/chapter1$merge.coverage" NAME="merge Coverage Results" MODIFIED="1547014157363" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/crawler/taobao" />
<SUITE FILE_PATH="coverage/chapter1$demo.coverage" NAME="demo Coverage Results" MODIFIED="1547519456164" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/crawler" />
<SUITE FILE_PATH="coverage/chapter1$comment.coverage" NAME="comment Coverage Results" MODIFIED="1547166377597" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/crawler/taobao/spiders" />
</component>
<component name="FileEditorManager">
@ -23,8 +28,8 @@
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/crawler/taobao/items.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="374">
<caret line="47" column="15" selection-start-line="47" selection-start-column="4" selection-end-line="47" selection-end-column="15" />
<state relative-caret-position="-545">
<caret line="14" selection-start-line="14" selection-end-line="14" />
</state>
</provider>
</entry>
@ -32,8 +37,8 @@
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/crawler/taobao/pipelines.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="1155">
<caret line="77" column="31" lean-forward="true" selection-start-line="77" selection-start-column="31" selection-end-line="77" selection-end-column="31" />
<state relative-caret-position="205">
<caret line="105" column="20" selection-start-line="104" selection-start-column="20" selection-end-line="105" selection-end-column="20" />
<folding>
<element signature="e#193#204#0" expanded="true" />
</folding>
@ -42,43 +47,10 @@
</entry>
</file>
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/scrapy/conf.py">
<entry file="file://$PROJECT_DIR$/crawler/taobao/utils.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="105">
<caret line="7" column="4" selection-start-line="7" selection-start-column="4" selection-end-line="7" selection-end-column="4" />
</state>
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/happybase/connection.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="270">
<caret line="107" column="8" selection-start-line="107" selection-start-column="8" selection-end-line="107" selection-end-column="8" />
</state>
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/comment.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="-271">
<caret line="36" column="53" selection-start-line="36" selection-start-column="53" selection-end-line="36" selection-end-column="53" />
<folding>
<element signature="e#24#35#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/crawler/taobao/middlewares.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="216">
<caret line="137" column="32" selection-start-line="137" selection-start-column="32" selection-end-line="137" selection-end-column="32" />
<folding>
<element signature="e#168#178#0" expanded="true" />
</folding>
<state relative-caret-position="60">
<caret line="4" column="12" selection-start-line="4" selection-start-column="4" selection-end-line="4" selection-end-column="12" />
</state>
</provider>
</entry>
@ -86,38 +58,38 @@
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/crawler/taobao/settings.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="426">
<caret line="119" column="12" selection-start-line="119" selection-start-column="12" selection-end-line="119" selection-end-column="12" />
<state relative-caret-position="176">
<caret line="115" selection-start-line="115" selection-end-line="115" />
</state>
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="true">
<entry file="file://$PROJECT_DIR$/crawler/client.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="235">
<caret line="36" column="10" lean-forward="true" selection-start-line="36" selection-start-column="10" selection-end-line="36" selection-end-column="10" />
</state>
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/jd.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="63">
<caret line="72" column="40" lean-forward="true" selection-start-line="72" selection-start-column="40" selection-end-line="72" selection-end-column="40" />
<state relative-caret-position="-680">
<caret line="85" selection-start-line="85" selection-end-line="85" />
<folding>
<element signature="e#14#25#0" expanded="true" />
<element signature="e#15#26#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/scrapy/selector/unified.py">
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/scrapy/http/request/__init__.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="492">
<caret line="84" column="24" selection-start-line="84" selection-start-column="8" selection-end-line="84" selection-end-column="24" />
</state>
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/parsel/selector.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="245">
<caret line="238" column="39" selection-start-line="238" selection-start-column="39" selection-end-line="238" selection-end-column="39" />
<state relative-caret-position="180">
<caret line="19" column="68" selection-start-line="19" selection-start-column="60" selection-end-line="19" selection-end-column="68" />
</state>
</provider>
</entry>
@ -142,6 +114,12 @@
<find>DUPEFILTER_CLASS</find>
<find>contains</find>
<find>scrapy.selector.unified.Selector</find>
<find>AUTOTHROTTLE_ENABLED</find>
<find>WEBSERVICE_PORT</find>
<find>WEBSERVICE_ENABLED</find>
<find>TELNETCONSOLE_ENABLED</find>
<find>EXTENSIONS_BASE</find>
<find>TELNET</find>
</findStrings>
</component>
<component name="Git.Settings">
@ -159,10 +137,13 @@
<option value="$PROJECT_DIR$/crawler/taobao/merge.py" />
<option value="$PROJECT_DIR$/crawler/taobao/spiders/comment.py" />
<option value="$PROJECT_DIR$/crawler/taobao/spiders/category.py" />
<option value="$PROJECT_DIR$/crawler/demo.py" />
<option value="$PROJECT_DIR$/crawler/taobao/utils.py" />
<option value="$PROJECT_DIR$/crawler/taobao/items.py" />
<option value="$PROJECT_DIR$/crawler/taobao/settings.py" />
<option value="$PROJECT_DIR$/crawler/taobao/pipelines.py" />
<option value="$PROJECT_DIR$/crawler/taobao/spiders/jd.py" />
<option value="$PROJECT_DIR$/crawler/taobao/settings.py" />
<option value="$PROJECT_DIR$/crawler/client.py" />
</list>
</option>
</component>
@ -177,6 +158,7 @@
<foldersAlwaysOnTop value="true" />
</navigator>
<panes>
<pane id="Scope" />
<pane id="ProjectPane">
<subPane>
<expand>
@ -189,24 +171,10 @@
<item name="chapter1" type="462c0819:PsiDirectoryNode" />
<item name="crawler" type="462c0819:PsiDirectoryNode" />
</path>
<path>
<item name="chapter1" type="b2602c69:ProjectViewProjectNode" />
<item name="chapter1" type="462c0819:PsiDirectoryNode" />
<item name="crawler" type="462c0819:PsiDirectoryNode" />
<item name="taobao" type="462c0819:PsiDirectoryNode" />
</path>
<path>
<item name="chapter1" type="b2602c69:ProjectViewProjectNode" />
<item name="chapter1" type="462c0819:PsiDirectoryNode" />
<item name="crawler" type="462c0819:PsiDirectoryNode" />
<item name="taobao" type="462c0819:PsiDirectoryNode" />
<item name="spiders" type="462c0819:PsiDirectoryNode" />
</path>
</expand>
<select />
</subPane>
</pane>
<pane id="Scope" />
</panes>
</component>
<component name="PropertiesComponent">
@ -233,7 +201,7 @@
</list>
</option>
</component>
<component name="RunManager" selected="Python.comment">
<component name="RunManager" selected="Python.demo">
<configuration name="category" type="PythonConfigurationType" factoryName="Python" temporary="true">
<module name="chapter1" />
<option name="INTERPRETER_OPTIONS" value="" />
@ -278,6 +246,28 @@
<option name="INPUT_FILE" value="" />
<method v="2" />
</configuration>
<configuration name="demo" type="PythonConfigurationType" factoryName="Python" temporary="true">
<module name="chapter1" />
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<envs>
<env name="PYTHONUNBUFFERED" value="1" />
</envs>
<option name="SDK_HOME" value="" />
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/crawler" />
<option name="IS_MODULE_SDK" value="true" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/crawler/demo.py" />
<option name="PARAMETERS" value="" />
<option name="SHOW_COMMAND_LINE" value="false" />
<option name="EMULATE_TERMINAL" value="false" />
<option name="MODULE_MODE" value="false" />
<option name="REDIRECT_INPUT" value="false" />
<option name="INPUT_FILE" value="" />
<method v="2" />
</configuration>
<configuration name="merge" type="PythonConfigurationType" factoryName="Python" temporary="true">
<module name="chapter1" />
<option name="INTERPRETER_OPTIONS" value="" />
@ -302,6 +292,7 @@
</configuration>
<recent_temporary>
<list>
<item itemvalue="Python.demo" />
<item itemvalue="Python.comment" />
<item itemvalue="Python.merge" />
<item itemvalue="Python.category" />
@ -337,7 +328,14 @@
<option name="project" value="LOCAL" />
<updated>1547454506424</updated>
</task>
<option name="localTasksCounter" value="4" />
<task id="LOCAL-00004" summary="导入hbase">
<created>1547476493838</created>
<option name="number" value="00004" />
<option name="presentableId" value="LOCAL-00004" />
<option name="project" value="LOCAL" />
<updated>1547476493838</updated>
</task>
<option name="localTasksCounter" value="5" />
<servers />
</component>
<component name="TodoView">
@ -351,19 +349,20 @@
</component>
<component name="ToolWindowManager">
<frame x="0" y="23" width="1280" height="777" extended-state="0" />
<editor active="true" />
<layout>
<window_info content_ui="combo" id="Project" order="0" visible="true" weight="0.25767368" />
<window_info active="true" content_ui="combo" id="Project" order="0" visible="true" weight="0.26171243" />
<window_info id="Structure" order="1" side_tool="true" weight="0.25" />
<window_info id="Favorites" order="2" side_tool="true" />
<window_info anchor="bottom" id="Message" order="0" />
<window_info anchor="bottom" id="Find" order="1" />
<window_info anchor="bottom" id="Run" order="2" weight="0.3970803" />
<window_info anchor="bottom" id="Run" order="2" weight="0.2729927" />
<window_info anchor="bottom" id="Debug" order="3" weight="0.4" />
<window_info anchor="bottom" id="Cvs" order="4" weight="0.25" />
<window_info anchor="bottom" id="Inspection" order="5" weight="0.4" />
<window_info anchor="bottom" id="TODO" order="6" weight="0.329927" />
<window_info anchor="bottom" id="Version Control" order="7" />
<window_info active="true" anchor="bottom" id="Terminal" order="8" visible="true" weight="0.649635" />
<window_info anchor="bottom" id="Terminal" order="8" visible="true" weight="0.38686132" />
<window_info anchor="bottom" id="Event Log" order="9" side_tool="true" />
<window_info anchor="bottom" id="Python Console" order="10" weight="0.329927" />
<window_info anchor="right" id="Commander" internal_type="SLIDING" order="0" type="SLIDING" weight="0.4" />
@ -376,7 +375,8 @@
<MESSAGE value="数据抓取" />
<MESSAGE value="增加id字段" />
<MESSAGE value="加入京东抓取" />
<option name="LAST_COMMIT_MESSAGE" value="加入京东抓取" />
<MESSAGE value="导入hbase" />
<option name="LAST_COMMIT_MESSAGE" value="导入hbase" />
</component>
<component name="editorHistoryManager">
<entry file="file://$PROJECT_DIR$/crawler/scrapy.cfg">
@ -396,9 +396,6 @@
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/crawler/taobao/__init__.py">
<provider selected="true" editor-type-id="text-editor" />
</entry>
<entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/run.py" />
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/scrapy/spiders/__init__.py">
<provider selected="true" editor-type-id="text-editor">
@ -438,13 +435,6 @@
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/scrapy/http/request/__init__.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="118">
<caret line="16" column="6" selection-start-line="16" selection-start-column="6" selection-end-line="16" selection-end-column="6" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/scrapy/http/response/__init__.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="110">
@ -469,9 +459,7 @@
</entry>
<entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/hah.json" />
<entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/douban.json" />
<entry file="file://$PROJECT_DIR$/crawler/datas/comments/23761145.json">
<provider selected="true" editor-type-id="text-editor" />
</entry>
<entry file="file://$PROJECT_DIR$/crawler/datas/comments/23761145.json" />
<entry file="file://$PROJECT_DIR$/crawler/datas/2小时送货服务.json" />
<entry file="file://$PROJECT_DIR$/crawler/datas/3万以下.json" />
<entry file="file://$PROJECT_DIR$/crawler/datas/kind.json" />
@ -483,13 +471,6 @@
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/crawler/taobao/merge.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="345">
<caret line="23" column="30" selection-start-line="23" selection-start-column="30" selection-end-line="23" selection-end-column="30" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/jupyter.py">
<provider selected="true" editor-type-id="text-editor" />
</entry>
@ -548,40 +529,6 @@
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/crawler/taobao/middlewares.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="216">
<caret line="137" column="32" selection-start-line="137" selection-start-column="32" selection-end-line="137" selection-end-column="32" />
<folding>
<element signature="e#168#178#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/comment.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="-271">
<caret line="36" column="53" selection-start-line="36" selection-start-column="53" selection-end-line="36" selection-end-column="53" />
<folding>
<element signature="e#24#35#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/scrapy/item.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="390">
<caret line="68" column="21" selection-start-line="68" selection-start-column="21" selection-end-line="68" selection-end-column="21" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/happybase/connection.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="270">
<caret line="107" column="8" selection-start-line="107" selection-start-column="8" selection-end-line="107" selection-end-column="8" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/scrapy/conf.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="105">
@ -589,37 +536,123 @@
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/crawler/taobao/items.py">
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/happybase/connection.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="374">
<caret line="47" column="15" selection-start-line="47" selection-start-column="4" selection-end-line="47" selection-end-column="15" />
<state relative-caret-position="1455">
<caret line="107" column="8" selection-start-line="107" selection-start-column="8" selection-end-line="107" selection-end-column="8" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/comment.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="420">
<caret line="36" column="53" selection-start-line="36" selection-start-column="53" selection-end-line="36" selection-end-column="53" />
<folding>
<element signature="e#24#35#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/crawler/taobao/middlewares.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="1920">
<caret line="137" column="32" selection-start-line="137" selection-start-column="32" selection-end-line="137" selection-end-column="32" />
<folding>
<element signature="e#168#178#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/scrapy/item.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="162">
<caret line="58" selection-start-line="58" selection-end-line="58" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/crawler/taobao/__init__.py">
<provider selected="true" editor-type-id="text-editor" />
</entry>
<entry file="file://$PROJECT_DIR$/crawler/taobao/merge.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="345">
<caret line="23" column="30" selection-start-line="23" selection-start-column="30" selection-end-line="23" selection-end-column="30" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/scrapy_splash/request.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="-632">
<caret line="20" column="8" selection-start-line="20" selection-start-column="8" selection-end-line="20" selection-end-column="8" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/crawler/demo.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="311">
<caret line="26" column="36" lean-forward="true" selection-start-line="26" selection-start-column="36" selection-end-line="26" selection-end-column="36" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/scrapy/http/request/__init__.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="180">
<caret line="19" column="68" selection-start-line="19" selection-start-column="60" selection-end-line="19" selection-end-column="68" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/jd.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="-680">
<caret line="85" selection-start-line="85" selection-end-line="85" />
<folding>
<element signature="e#15#26#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/crawler/taobao/utils.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="60">
<caret line="4" column="12" selection-start-line="4" selection-start-column="4" selection-end-line="4" selection-end-column="12" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/crawler/taobao/pipelines.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="1155">
<caret line="77" column="31" lean-forward="true" selection-start-line="77" selection-start-column="31" selection-end-line="77" selection-end-column="31" />
<state relative-caret-position="205">
<caret line="105" column="20" selection-start-line="104" selection-start-column="20" selection-end-line="105" selection-end-column="20" />
<folding>
<element signature="e#193#204#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/crawler/taobao/settings.py">
<entry file="file://$PROJECT_DIR$/crawler/taobao/items.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="426">
<caret line="119" column="12" selection-start-line="119" selection-start-column="12" selection-end-line="119" selection-end-column="12" />
<state relative-caret-position="-545">
<caret line="14" selection-start-line="14" selection-end-line="14" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/jd.py">
<entry file="file://$PROJECT_DIR$/crawler/taobao/settings.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="63">
<caret line="72" column="40" lean-forward="true" selection-start-line="72" selection-start-column="40" selection-end-line="72" selection-end-column="40" />
<folding>
<element signature="e#14#25#0" expanded="true" />
</folding>
<state relative-caret-position="176">
<caret line="115" selection-start-line="115" selection-end-line="115" />
</state>
</provider>
</entry>
<entry file="file:///usr/local/Cellar/python/3.7.0/Frameworks/Python.framework/Versions/3.7/lib/python3.7/telnetlib.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="291">
<caret line="289" column="33" lean-forward="true" selection-start-line="289" selection-start-column="33" selection-end-line="289" selection-end-column="33" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/crawler/client.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="235">
<caret line="36" column="10" lean-forward="true" selection-start-line="36" selection-start-column="10" selection-end-line="36" selection-end-column="10" />
</state>
</provider>
</entry>

View File

@ -0,0 +1,40 @@
#!/usr/bin/env python
"""
Example script to control a Scrapy server using its JSON-RPC web service.
It only provides a reduced functionality as its main purpose is to illustrate
how to write a web service client. Feel free to improve or write your own.
Also, keep in mind that the JSON-RPC API is not stable. The recommended way for
controlling a Scrapy server is through the execution queue (see the "queue"
command).
"""
import telnetlib
def do_telnet(Host, finish, port=6023, timeout=10):
    """Send one command to a Telnet console and print what it answers.

    Connects to ``Host``, waits for the ``finish`` prompt, sends the
    ``est()`` command, waits for the next prompt, and prints both chunks of
    received output between separator lines.

    Args:
        Host: Host name or IP address of the Telnet server.
        finish: Prompt (bytes) that marks the end of each chunk of output.
        port: TCP port to connect to.
        timeout: Timeout in seconds passed to ``telnetlib.Telnet``.
    """
    # Connect to the Telnet server.
    tn = telnetlib.Telnet(Host, port=port, timeout=timeout)
    try:
        tn.set_debuglevel(2)
        # Wait for the first prompt.
        out = tn.read_until(finish)
        tn.write(b'est()\n')
        print('-----------------1')
        print(out.decode('utf8'))
        print('-----------------11')
        # Wait for the prompt that follows the command's output.
        out = tn.read_until(finish)
        print('-----------------2')
        print(out.decode('utf8'))
        print('-----------------3')
    finally:
        # Always release the socket, even if a read/write/decode fails.
        tn.close()
if __name__ == '__main__':
    # Configuration options
    Host = '127.0.0.1' # IP address of the Telnet server
    finish = b'>>> ' # command prompt that ends each chunk of output
    do_telnet(Host,finish)

31
chapter1/crawler/demo.py Normal file
View File

@ -0,0 +1,31 @@
import re,requests,json
# Page through JD's JSONP comment endpoint for one product and print every
# comment (nickname, time, client, content) until no more comments come back.
url = 'https://club.jd.com/comment/productPageComments.action'
data = {
    'callback': 'fetchJSON_comment98vv61',
    'productId': '3888284',
    'score': 0,
    'sortType': 5,
    'pageSize': 10,
    'isShadowSku': 0,
    'page': 0,
}
# Extracts the JSON payload wrapped by the JSONP callback named in
# data['callback']; compiled once instead of on every page.
jsonp_payload = re.compile(r'(?<=fetchJSON_comment98vv61\().*(?=\);)')

with requests.session() as s:
    while True:
        # A timeout keeps the script from hanging forever on a stalled
        # connection.
        t = s.get(url, params=data, timeout=10).text
        m = jsonp_payload.search(t)
        if m is None:
            # The response is not the expected JSONP wrapper: stop paging.
            break
        j = json.loads(m.group(0))
        comments = j['comments']
        if not comments:
            # An empty page means there is nothing left; without this check
            # the loop would keep requesting pages indefinitely.
            break
        for comment in comments:
            c_content = comment['content']
            c_time = comment['referenceTime']
            c_name = comment['nickname']
            c_client = comment['userClientShow']
            print('{} {} {}\n{}\n'.format(c_name, c_time, c_client, c_content))
        data['page'] += 1

View File

@ -46,4 +46,24 @@ class JDProductItem(scrapy.Item):
img = scrapy.Field()
shop = scrapy.Field()
comment_num = scrapy.Field()
created_at = scrapy.Field()
class JDCommentItem(scrapy.Item):
    """A single JD product comment, as built by the spider's ``parse_comment``."""
    # Composite key "<product_id>_<comment id>".
    id = scrapy.Field()
    # Commenter's nickname (the feed's 'nickname' value).
    user_name = scrapy.Field()
    # Client the comment was posted from (the feed's 'userClientShow' value).
    from_type = scrapy.Field()
    # Comment text.
    content = scrapy.Field()
    # The feed's 'referenceTime' value for the comment.
    comment_time = scrapy.Field()
    # Time the item was scraped, produced by now_time().
    created_at = scrapy.Field()
class JDCommentSummary(scrapy.Item):
    """Per-product comment statistics taken from the feed's 'productCommentSummary'."""
    # Product id the statistics belong to.
    id = scrapy.Field()
    # The feed's 'goodCount' value.
    good_count = scrapy.Field()
    # The feed's 'generalCount' value.
    general_count = scrapy.Field()
    # The feed's 'poorCount' value.
    poor_count = scrapy.Field()
    # The feed's 'commentCount' value.
    comment_count = scrapy.Field()
    # The feed's 'defaultGoodCount' value.
    default_good_count = scrapy.Field()
    # The feed's 'goodRate' value.
    good_rate = scrapy.Field()

View File

@ -12,6 +12,8 @@ import happybase
from scrapy.conf import settings
from scrapy.pipelines.images import ImagesPipeline
from taobao.items import JDProductItem, JDCommentItem, JDCommentSummary
class MyImagesPipeline(ImagesPipeline):
    """Project image pipeline; the visible body adds nothing to ``ImagesPipeline``."""
    pass
@ -52,17 +54,19 @@ class JDCleanDataPipeline(object):
"""
京东数据清洗
"""
def process_item(self, item, spider):
    """Normalise a scraped item before it reaches the storage pipeline.

    For a ``JDProductItem`` the name is stripped of surrounding whitespace
    and newlines, and ``id`` is filled in from the numeric part of the
    product URL (``.../<digits>.html``). Items of any other type are
    returned unchanged.

    Args:
        item: The scraped item.
        spider: The spider that produced the item (unused).

    Returns:
        The (possibly modified) item.

    Raises:
        scrapy.exceptions.DropItem: If a product item ends up without a
            string ``id``.
    """
    # Only product items carry name/url; touching those fields on comment
    # items would raise KeyError.
    if isinstance(item, JDProductItem):
        name = item.get("name")
        if isinstance(name, str):
            item["name"] = name.strip().replace('\n', '')

        url = item.get("url")
        m = re.match(r'.+\/(\d+)\.html', url) if isinstance(url, str) else None
        if m:
            item["id"] = m.group(1)

        # .get() avoids a KeyError when the id was never set because the URL
        # did not match.
        if not isinstance(item.get("id"), str):
            # Scrapy pipelines must return an item or raise DropItem;
            # returning None would hand None to the next pipeline.
            from scrapy.exceptions import DropItem
            raise DropItem("JD product without a usable id: %r" % (url,))

    return item
@ -71,20 +75,43 @@ class JDHbasePipeline(object):
"""
数据存到hbase中
"""
def __init__(self):
    """Open the HBase connection and look up the tables items are written to.

    Reads ``HBASE_HOST``, ``HBASE_PORT`` and ``HBASE_TABLE`` from the Scrapy
    settings. The comment table name comes from ``HBASE_COMMENT_TABLE`` and
    falls back to ``'jd_comment'`` when that setting is absent.
    """
    host = settings['HBASE_HOST']
    table_name = settings['HBASE_TABLE']
    # The comment table used to be hard-coded; make it configurable like the
    # product table while keeping the old name as the default.
    comment_table_name = settings.get('HBASE_COMMENT_TABLE', 'jd_comment')

    connection = happybase.Connection(host, port=settings["HBASE_PORT"])
    table = connection.table(table_name)
    # `table` is kept as an alias of `product_table` for code that still
    # uses the old attribute name.
    self.table = table
    self.product_table = table
    self.comment_table = connection.table(comment_table_name)
def process_item(self, item, spider):
    """Write a scraped item to HBase, routed by item type.

    * ``JDProductItem``: row ``item["id"]`` in the product table, columns
      in the ``product:`` family.
    * ``JDCommentItem``: row ``item["id"]`` in the comment table, columns
      in the ``comment:`` family.
    * ``JDCommentSummary``: row ``item["id"]`` in the product table,
      columns in the ``comment:`` family, values converted with ``str``.

    Items of any other type are returned without being written.

    Args:
        item: The scraped item.
        spider: The spider that produced the item (unused).

    Returns:
        The item, unchanged.
    """
    # NOTE: the former unconditional write of product columns was removed;
    # it indexed product-only fields on every item type, which raises
    # KeyError for comment items and wrote product rows twice.
    if isinstance(item, JDProductItem):
        product_fields = ("name", "price", "url", "img", "shop",
                          "comment_num", "created_at")
        self.product_table.put(
            item["id"],
            {"product:" + field: item[field] for field in product_fields})
    elif isinstance(item, JDCommentItem):
        comment_fields = ("user_name", "from_type", "content",
                          "comment_time", "created_at")
        self.comment_table.put(
            item["id"],
            {"comment:" + field: item[field] for field in comment_fields})
    elif isinstance(item, JDCommentSummary):
        summary_fields = ("good_count", "general_count", "poor_count",
                          "comment_count", "default_good_count", "good_rate")
        # These values are converted with str() before being stored.
        self.product_table.put(
            item["id"],
            {"comment:" + field: str(item[field]) for field in summary_fields})

    return item

View File

@ -43,7 +43,7 @@ SPLASH_URL = 'http://127.0.0.1:8050'
COOKIES_ENABLED = True
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
TELNETCONSOLE_ENABLED = True
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
@ -92,14 +92,15 @@ IMAGES_STORE = '/tmp/taobao'
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# 启用限速
AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
AUTOTHROTTLE_START_DELAY = 10
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
@ -114,7 +115,8 @@ IMAGES_STORE = '/tmp/taobao'
FEED_EXPORT_ENCODING='utf-8'
HBASE_HOST = '106.75.85.84'
HBASE_PORT = 40009
HBASE_TABLE = 'jd'
HBASE_TABLE = 'jd'
COMMENT_MAX_PAGE = 10

View File

@ -1,12 +1,17 @@
#coding=utf-8
# coding=utf-8
import json
import time
import scrapy
from scrapy import Request
from scrapy.conf import settings
from scrapy_splash import SplashRequest
import pdb
from taobao.items import JDProductItem
from taobao.items import JDProductItem, JDCommentItem, JDCommentSummary
import re
from taobao.utils import now_time
class JdSpider(scrapy.Spider):
name = 'jd'
@ -23,12 +28,70 @@ class JdSpider(scrapy.Spider):
}
}
def make_comment_url(self, product_id, page, page_size=10):
    """Build the JD comment-feed URL for one page of a product's comments.

    Args:
        product_id: JD product id, inserted as the ``productId`` parameter.
        page: Zero-based page number (must be an integer).
        page_size: Number of comments requested per page.

    Returns:
        The URL string. The JSONP callback name in it is the one that
        ``parse_comment`` strips from the response.
    """
    return (
        "https://club.jd.com/comment/productPageComments.action"
        "?callback=fetchJSON_comment98vv33164"
        "&productId=%s&score=0&sortType=5&page=%d&pageSize=%d"
        "&isShadowSku=0&rid=0&fold=1" % (product_id, page, page_size)
    )
def start_requests(self):
    """Yield one Splash request per start URL, handled by ``parse``."""
    yield from (
        SplashRequest(start_url, self.parse, args={})
        for start_url in self.start_urls
    )
def parse_comment(self, response):
    """Parse one page of JD's JSONP comment feed for a product.

    Reads ``product_id`` and ``page`` from ``response.meta``.

    Yields:
        A ``JDCommentItem`` for every comment on the page, a
        ``JDCommentSummary`` when ``page`` is 0, and a ``Request`` for the
        next page while the page had comments and the next page number
        does not exceed the ``COMMENT_MAX_PAGE`` setting.
    """
    product_id = response.meta["product_id"]
    page = response.meta["page"]
    print("抓取评论, %s, 当前 %d" % (product_id, page))

    # Strip the JSONP wrapper; the callback name matches make_comment_url.
    m = re.match(r'^fetchJSON_comment98vv33164\((.+)\);$', response.text)
    if not m:
        # Fixed: the arguments used to be passed to print() logging-style,
        # so the placeholders were never filled in.
        print("评论抓取完成, %s, 页数: %d" % (product_id, page))
        return

    j = json.loads(m.group(1))
    comments = j['comments']
    for comment in comments:
        c_content = comment['content']
        c_time = comment['referenceTime']
        c_name = comment['nickname']
        c_client = comment['userClientShow']
        # Prefix with the product id to form the composite row key.
        c_id = "%s_%s" % (product_id, comment['id'])
        print('{}, {} {} {}\n{}\n'.format(c_id, c_name, c_time, c_client, c_content))
        yield JDCommentItem(
            id=c_id,
            user_name=c_name,
            from_type=c_client,
            content=c_content,
            comment_time=c_time,
            created_at=now_time()
        )

    # The per-product summary is emitted once, from the first page only.
    if page == 0:
        summary = j['productCommentSummary']
        yield JDCommentSummary(
            id=product_id,
            good_count=summary['goodCount'],
            general_count=summary['generalCount'],
            poor_count=summary['poorCount'],
            comment_count=summary['commentCount'],
            default_good_count=summary['defaultGoodCount'],
            good_rate=summary['goodRate']
        )

    if not comments:
        # An empty page means there is nothing further to fetch; do not keep
        # requesting pages up to the maximum.
        print("评论抓取完成, %s, 页数: %d" % (product_id, page))
        return

    next_page = page + 1
    if next_page > settings['COMMENT_MAX_PAGE']:
        print("评论抓取达到最大深度, %s, 页数: %d" % (product_id, next_page))
    else:
        yield Request(self.make_comment_url(product_id, next_page),
                      self.parse_comment,
                      meta={'product_id': product_id, 'page': next_page})
def parse(self, response):
items = response.xpath('//div[@id="plist"]//li[@class="gl-item"]')
@ -36,10 +99,11 @@ class JdSpider(scrapy.Spider):
# pdb.set_trace()
name = item.xpath('.//div[contains(@class, "p-name")]/a/em/text()').extract_first()
print(name)
price = item.xpath('.//div[contains(@class, "p-price")]/strong/i/text()').extract_first()
price = item.xpath(
'.//div[contains(@class, "p-price")]/strong/i/text()').extract_first()
print(price)
url = item.xpath('.//div[contains(@class, "p-name")]/a/@href').extract_first()
print('https:'+url)
print('https:' + url)
img = item.xpath('.//div[@class="p-img"]/a/img/@data-lazy-img').extract_first()
if not img:
img = item.xpath('.//div[@class="p-img"]/a/img/@src').extract_first()
@ -50,14 +114,23 @@ class JdSpider(scrapy.Spider):
print(shop)
yield JDProductItem(
name = name,
price = price,
url = url,
img = img,
comment_num = comment_num,
shop = shop
name=name,
price=price,
url=url,
img=img,
comment_num=comment_num,
shop=shop,
created_at=now_time()
)
m = re.match(r'.+\/(\d+)\.html', url)
if m:
product_id = m.group(1)
comment_url = self.make_comment_url(product_id, 0)
yield Request(comment_url, self.parse_comment,
meta={'product_id': product_id, 'page': 0},
priority=100)
# 获取下一页
next_page = response.xpath('//a[@class="pn-next"]/@href').extract_first()
if next_page:
@ -67,10 +140,10 @@ class JdSpider(scrapy.Spider):
page = m.group(1)
print("开始抓取:{0}".format(page))
time.sleep(10)
# time.sleep(10)
yield SplashRequest(
url= 'https://list.jd.com'+next_page
url='https://list.jd.com' + next_page
)
else:
print("抓取结束")

View File

@ -0,0 +1,6 @@
# coding=utf-8
import time
def now_time():
    """Return the current local time formatted as ``YYYY-MM-DD HH:MM:SS``."""
    # With no time tuple, strftime formats the current local time.
    return time.strftime('%Y-%m-%d %H:%M:%S')