抓取京东评论
This commit is contained in:
parent
e38ba68191
commit
e8b54aa51f
|
@ -2,7 +2,11 @@
|
|||
<project version="4">
|
||||
<component name="ChangeListManager">
|
||||
<list default="true" id="a5fbb387-9969-4874-8a7f-c9dd40d5225d" name="Default Changelist" comment="">
|
||||
<change afterPath="$PROJECT_DIR$/crawler/client.py" afterDir="false" />
|
||||
<change afterPath="$PROJECT_DIR$/crawler/demo.py" afterDir="false" />
|
||||
<change afterPath="$PROJECT_DIR$/crawler/taobao/utils.py" afterDir="false" />
|
||||
<change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" />
|
||||
<change beforePath="$PROJECT_DIR$/crawler/taobao/items.py" beforeDir="false" afterPath="$PROJECT_DIR$/crawler/taobao/items.py" afterDir="false" />
|
||||
<change beforePath="$PROJECT_DIR$/crawler/taobao/pipelines.py" beforeDir="false" afterPath="$PROJECT_DIR$/crawler/taobao/pipelines.py" afterDir="false" />
|
||||
<change beforePath="$PROJECT_DIR$/crawler/taobao/settings.py" beforeDir="false" afterPath="$PROJECT_DIR$/crawler/taobao/settings.py" afterDir="false" />
|
||||
<change beforePath="$PROJECT_DIR$/crawler/taobao/spiders/jd.py" beforeDir="false" afterPath="$PROJECT_DIR$/crawler/taobao/spiders/jd.py" afterDir="false" />
|
||||
|
@ -16,6 +20,7 @@
|
|||
<component name="CoverageDataManager">
|
||||
<SUITE FILE_PATH="coverage/chapter1$category.coverage" NAME="category Coverage Results" MODIFIED="1546938251328" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/crawler/taobao/spiders" />
|
||||
<SUITE FILE_PATH="coverage/chapter1$merge.coverage" NAME="merge Coverage Results" MODIFIED="1547014157363" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/crawler/taobao" />
|
||||
<SUITE FILE_PATH="coverage/chapter1$demo.coverage" NAME="demo Coverage Results" MODIFIED="1547519456164" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/crawler" />
|
||||
<SUITE FILE_PATH="coverage/chapter1$comment.coverage" NAME="comment Coverage Results" MODIFIED="1547166377597" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/crawler/taobao/spiders" />
|
||||
</component>
|
||||
<component name="FileEditorManager">
|
||||
|
@ -23,8 +28,8 @@
|
|||
<file pinned="false" current-in-tab="false">
|
||||
<entry file="file://$PROJECT_DIR$/crawler/taobao/items.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="374">
|
||||
<caret line="47" column="15" selection-start-line="47" selection-start-column="4" selection-end-line="47" selection-end-column="15" />
|
||||
<state relative-caret-position="-545">
|
||||
<caret line="14" selection-start-line="14" selection-end-line="14" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
|
@ -32,8 +37,8 @@
|
|||
<file pinned="false" current-in-tab="false">
|
||||
<entry file="file://$PROJECT_DIR$/crawler/taobao/pipelines.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="1155">
|
||||
<caret line="77" column="31" lean-forward="true" selection-start-line="77" selection-start-column="31" selection-end-line="77" selection-end-column="31" />
|
||||
<state relative-caret-position="205">
|
||||
<caret line="105" column="20" selection-start-line="104" selection-start-column="20" selection-end-line="105" selection-end-column="20" />
|
||||
<folding>
|
||||
<element signature="e#193#204#0" expanded="true" />
|
||||
</folding>
|
||||
|
@ -42,43 +47,10 @@
|
|||
</entry>
|
||||
</file>
|
||||
<file pinned="false" current-in-tab="false">
|
||||
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/scrapy/conf.py">
|
||||
<entry file="file://$PROJECT_DIR$/crawler/taobao/utils.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="105">
|
||||
<caret line="7" column="4" selection-start-line="7" selection-start-column="4" selection-end-line="7" selection-end-column="4" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
</file>
|
||||
<file pinned="false" current-in-tab="false">
|
||||
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/happybase/connection.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="270">
|
||||
<caret line="107" column="8" selection-start-line="107" selection-start-column="8" selection-end-line="107" selection-end-column="8" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
</file>
|
||||
<file pinned="false" current-in-tab="false">
|
||||
<entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/comment.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="-271">
|
||||
<caret line="36" column="53" selection-start-line="36" selection-start-column="53" selection-end-line="36" selection-end-column="53" />
|
||||
<folding>
|
||||
<element signature="e#24#35#0" expanded="true" />
|
||||
</folding>
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
</file>
|
||||
<file pinned="false" current-in-tab="false">
|
||||
<entry file="file://$PROJECT_DIR$/crawler/taobao/middlewares.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="216">
|
||||
<caret line="137" column="32" selection-start-line="137" selection-start-column="32" selection-end-line="137" selection-end-column="32" />
|
||||
<folding>
|
||||
<element signature="e#168#178#0" expanded="true" />
|
||||
</folding>
|
||||
<state relative-caret-position="60">
|
||||
<caret line="4" column="12" selection-start-line="4" selection-start-column="4" selection-end-line="4" selection-end-column="12" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
|
@ -86,38 +58,38 @@
|
|||
<file pinned="false" current-in-tab="false">
|
||||
<entry file="file://$PROJECT_DIR$/crawler/taobao/settings.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="426">
|
||||
<caret line="119" column="12" selection-start-line="119" selection-start-column="12" selection-end-line="119" selection-end-column="12" />
|
||||
<state relative-caret-position="176">
|
||||
<caret line="115" selection-start-line="115" selection-end-line="115" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
</file>
|
||||
<file pinned="false" current-in-tab="true">
|
||||
<entry file="file://$PROJECT_DIR$/crawler/client.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="235">
|
||||
<caret line="36" column="10" lean-forward="true" selection-start-line="36" selection-start-column="10" selection-end-line="36" selection-end-column="10" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
</file>
|
||||
<file pinned="false" current-in-tab="false">
|
||||
<entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/jd.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="63">
|
||||
<caret line="72" column="40" lean-forward="true" selection-start-line="72" selection-start-column="40" selection-end-line="72" selection-end-column="40" />
|
||||
<state relative-caret-position="-680">
|
||||
<caret line="85" selection-start-line="85" selection-end-line="85" />
|
||||
<folding>
|
||||
<element signature="e#14#25#0" expanded="true" />
|
||||
<element signature="e#15#26#0" expanded="true" />
|
||||
</folding>
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
</file>
|
||||
<file pinned="false" current-in-tab="false">
|
||||
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/scrapy/selector/unified.py">
|
||||
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/scrapy/http/request/__init__.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="492">
|
||||
<caret line="84" column="24" selection-start-line="84" selection-start-column="8" selection-end-line="84" selection-end-column="24" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
</file>
|
||||
<file pinned="false" current-in-tab="false">
|
||||
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/parsel/selector.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="245">
|
||||
<caret line="238" column="39" selection-start-line="238" selection-start-column="39" selection-end-line="238" selection-end-column="39" />
|
||||
<state relative-caret-position="180">
|
||||
<caret line="19" column="68" selection-start-line="19" selection-start-column="60" selection-end-line="19" selection-end-column="68" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
|
@ -142,6 +114,12 @@
|
|||
<find>DUPEFILTER_CLASS</find>
|
||||
<find>contains</find>
|
||||
<find>scrapy.selector.unified.Selector</find>
|
||||
<find>AUTOTHROTTLE_ENABLED</find>
|
||||
<find>WEBSERVICE_PORT</find>
|
||||
<find>WEBSERVICE_ENABLED</find>
|
||||
<find>TELNETCONSOLE_ENABLED</find>
|
||||
<find>EXTENSIONS_BASE</find>
|
||||
<find>TELNET</find>
|
||||
</findStrings>
|
||||
</component>
|
||||
<component name="Git.Settings">
|
||||
|
@ -159,10 +137,13 @@
|
|||
<option value="$PROJECT_DIR$/crawler/taobao/merge.py" />
|
||||
<option value="$PROJECT_DIR$/crawler/taobao/spiders/comment.py" />
|
||||
<option value="$PROJECT_DIR$/crawler/taobao/spiders/category.py" />
|
||||
<option value="$PROJECT_DIR$/crawler/demo.py" />
|
||||
<option value="$PROJECT_DIR$/crawler/taobao/utils.py" />
|
||||
<option value="$PROJECT_DIR$/crawler/taobao/items.py" />
|
||||
<option value="$PROJECT_DIR$/crawler/taobao/settings.py" />
|
||||
<option value="$PROJECT_DIR$/crawler/taobao/pipelines.py" />
|
||||
<option value="$PROJECT_DIR$/crawler/taobao/spiders/jd.py" />
|
||||
<option value="$PROJECT_DIR$/crawler/taobao/settings.py" />
|
||||
<option value="$PROJECT_DIR$/crawler/client.py" />
|
||||
</list>
|
||||
</option>
|
||||
</component>
|
||||
|
@ -177,6 +158,7 @@
|
|||
<foldersAlwaysOnTop value="true" />
|
||||
</navigator>
|
||||
<panes>
|
||||
<pane id="Scope" />
|
||||
<pane id="ProjectPane">
|
||||
<subPane>
|
||||
<expand>
|
||||
|
@ -189,24 +171,10 @@
|
|||
<item name="chapter1" type="462c0819:PsiDirectoryNode" />
|
||||
<item name="crawler" type="462c0819:PsiDirectoryNode" />
|
||||
</path>
|
||||
<path>
|
||||
<item name="chapter1" type="b2602c69:ProjectViewProjectNode" />
|
||||
<item name="chapter1" type="462c0819:PsiDirectoryNode" />
|
||||
<item name="crawler" type="462c0819:PsiDirectoryNode" />
|
||||
<item name="taobao" type="462c0819:PsiDirectoryNode" />
|
||||
</path>
|
||||
<path>
|
||||
<item name="chapter1" type="b2602c69:ProjectViewProjectNode" />
|
||||
<item name="chapter1" type="462c0819:PsiDirectoryNode" />
|
||||
<item name="crawler" type="462c0819:PsiDirectoryNode" />
|
||||
<item name="taobao" type="462c0819:PsiDirectoryNode" />
|
||||
<item name="spiders" type="462c0819:PsiDirectoryNode" />
|
||||
</path>
|
||||
</expand>
|
||||
<select />
|
||||
</subPane>
|
||||
</pane>
|
||||
<pane id="Scope" />
|
||||
</panes>
|
||||
</component>
|
||||
<component name="PropertiesComponent">
|
||||
|
@ -233,7 +201,7 @@
|
|||
</list>
|
||||
</option>
|
||||
</component>
|
||||
<component name="RunManager" selected="Python.comment">
|
||||
<component name="RunManager" selected="Python.demo">
|
||||
<configuration name="category" type="PythonConfigurationType" factoryName="Python" temporary="true">
|
||||
<module name="chapter1" />
|
||||
<option name="INTERPRETER_OPTIONS" value="" />
|
||||
|
@ -278,6 +246,28 @@
|
|||
<option name="INPUT_FILE" value="" />
|
||||
<method v="2" />
|
||||
</configuration>
|
||||
<configuration name="demo" type="PythonConfigurationType" factoryName="Python" temporary="true">
|
||||
<module name="chapter1" />
|
||||
<option name="INTERPRETER_OPTIONS" value="" />
|
||||
<option name="PARENT_ENVS" value="true" />
|
||||
<envs>
|
||||
<env name="PYTHONUNBUFFERED" value="1" />
|
||||
</envs>
|
||||
<option name="SDK_HOME" value="" />
|
||||
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/crawler" />
|
||||
<option name="IS_MODULE_SDK" value="true" />
|
||||
<option name="ADD_CONTENT_ROOTS" value="true" />
|
||||
<option name="ADD_SOURCE_ROOTS" value="true" />
|
||||
<EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
|
||||
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/crawler/demo.py" />
|
||||
<option name="PARAMETERS" value="" />
|
||||
<option name="SHOW_COMMAND_LINE" value="false" />
|
||||
<option name="EMULATE_TERMINAL" value="false" />
|
||||
<option name="MODULE_MODE" value="false" />
|
||||
<option name="REDIRECT_INPUT" value="false" />
|
||||
<option name="INPUT_FILE" value="" />
|
||||
<method v="2" />
|
||||
</configuration>
|
||||
<configuration name="merge" type="PythonConfigurationType" factoryName="Python" temporary="true">
|
||||
<module name="chapter1" />
|
||||
<option name="INTERPRETER_OPTIONS" value="" />
|
||||
|
@ -302,6 +292,7 @@
|
|||
</configuration>
|
||||
<recent_temporary>
|
||||
<list>
|
||||
<item itemvalue="Python.demo" />
|
||||
<item itemvalue="Python.comment" />
|
||||
<item itemvalue="Python.merge" />
|
||||
<item itemvalue="Python.category" />
|
||||
|
@ -337,7 +328,14 @@
|
|||
<option name="project" value="LOCAL" />
|
||||
<updated>1547454506424</updated>
|
||||
</task>
|
||||
<option name="localTasksCounter" value="4" />
|
||||
<task id="LOCAL-00004" summary="导入hbase">
|
||||
<created>1547476493838</created>
|
||||
<option name="number" value="00004" />
|
||||
<option name="presentableId" value="LOCAL-00004" />
|
||||
<option name="project" value="LOCAL" />
|
||||
<updated>1547476493838</updated>
|
||||
</task>
|
||||
<option name="localTasksCounter" value="5" />
|
||||
<servers />
|
||||
</component>
|
||||
<component name="TodoView">
|
||||
|
@ -351,19 +349,20 @@
|
|||
</component>
|
||||
<component name="ToolWindowManager">
|
||||
<frame x="0" y="23" width="1280" height="777" extended-state="0" />
|
||||
<editor active="true" />
|
||||
<layout>
|
||||
<window_info content_ui="combo" id="Project" order="0" visible="true" weight="0.25767368" />
|
||||
<window_info active="true" content_ui="combo" id="Project" order="0" visible="true" weight="0.26171243" />
|
||||
<window_info id="Structure" order="1" side_tool="true" weight="0.25" />
|
||||
<window_info id="Favorites" order="2" side_tool="true" />
|
||||
<window_info anchor="bottom" id="Message" order="0" />
|
||||
<window_info anchor="bottom" id="Find" order="1" />
|
||||
<window_info anchor="bottom" id="Run" order="2" weight="0.3970803" />
|
||||
<window_info anchor="bottom" id="Run" order="2" weight="0.2729927" />
|
||||
<window_info anchor="bottom" id="Debug" order="3" weight="0.4" />
|
||||
<window_info anchor="bottom" id="Cvs" order="4" weight="0.25" />
|
||||
<window_info anchor="bottom" id="Inspection" order="5" weight="0.4" />
|
||||
<window_info anchor="bottom" id="TODO" order="6" weight="0.329927" />
|
||||
<window_info anchor="bottom" id="Version Control" order="7" />
|
||||
<window_info active="true" anchor="bottom" id="Terminal" order="8" visible="true" weight="0.649635" />
|
||||
<window_info anchor="bottom" id="Terminal" order="8" visible="true" weight="0.38686132" />
|
||||
<window_info anchor="bottom" id="Event Log" order="9" side_tool="true" />
|
||||
<window_info anchor="bottom" id="Python Console" order="10" weight="0.329927" />
|
||||
<window_info anchor="right" id="Commander" internal_type="SLIDING" order="0" type="SLIDING" weight="0.4" />
|
||||
|
@ -376,7 +375,8 @@
|
|||
<MESSAGE value="数据抓取" />
|
||||
<MESSAGE value="增加id字段" />
|
||||
<MESSAGE value="加入京东抓取" />
|
||||
<option name="LAST_COMMIT_MESSAGE" value="加入京东抓取" />
|
||||
<MESSAGE value="导入hbase" />
|
||||
<option name="LAST_COMMIT_MESSAGE" value="导入hbase" />
|
||||
</component>
|
||||
<component name="editorHistoryManager">
|
||||
<entry file="file://$PROJECT_DIR$/crawler/scrapy.cfg">
|
||||
|
@ -396,9 +396,6 @@
|
|||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/crawler/taobao/__init__.py">
|
||||
<provider selected="true" editor-type-id="text-editor" />
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/run.py" />
|
||||
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/scrapy/spiders/__init__.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
|
@ -438,13 +435,6 @@
|
|||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/scrapy/http/request/__init__.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="118">
|
||||
<caret line="16" column="6" selection-start-line="16" selection-start-column="6" selection-end-line="16" selection-end-column="6" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/scrapy/http/response/__init__.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="110">
|
||||
|
@ -469,9 +459,7 @@
|
|||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/hah.json" />
|
||||
<entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/douban.json" />
|
||||
<entry file="file://$PROJECT_DIR$/crawler/datas/comments/23761145.json">
|
||||
<provider selected="true" editor-type-id="text-editor" />
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/crawler/datas/comments/23761145.json" />
|
||||
<entry file="file://$PROJECT_DIR$/crawler/datas/2小时送货服务.json" />
|
||||
<entry file="file://$PROJECT_DIR$/crawler/datas/3万以下.json" />
|
||||
<entry file="file://$PROJECT_DIR$/crawler/datas/kind.json" />
|
||||
|
@ -483,13 +471,6 @@
|
|||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/crawler/taobao/merge.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="345">
|
||||
<caret line="23" column="30" selection-start-line="23" selection-start-column="30" selection-end-line="23" selection-end-column="30" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/jupyter.py">
|
||||
<provider selected="true" editor-type-id="text-editor" />
|
||||
</entry>
|
||||
|
@ -548,40 +529,6 @@
|
|||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/crawler/taobao/middlewares.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="216">
|
||||
<caret line="137" column="32" selection-start-line="137" selection-start-column="32" selection-end-line="137" selection-end-column="32" />
|
||||
<folding>
|
||||
<element signature="e#168#178#0" expanded="true" />
|
||||
</folding>
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/comment.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="-271">
|
||||
<caret line="36" column="53" selection-start-line="36" selection-start-column="53" selection-end-line="36" selection-end-column="53" />
|
||||
<folding>
|
||||
<element signature="e#24#35#0" expanded="true" />
|
||||
</folding>
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/scrapy/item.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="390">
|
||||
<caret line="68" column="21" selection-start-line="68" selection-start-column="21" selection-end-line="68" selection-end-column="21" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/happybase/connection.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="270">
|
||||
<caret line="107" column="8" selection-start-line="107" selection-start-column="8" selection-end-line="107" selection-end-column="8" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/scrapy/conf.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="105">
|
||||
|
@ -589,37 +536,123 @@
|
|||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/crawler/taobao/items.py">
|
||||
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/happybase/connection.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="374">
|
||||
<caret line="47" column="15" selection-start-line="47" selection-start-column="4" selection-end-line="47" selection-end-column="15" />
|
||||
<state relative-caret-position="1455">
|
||||
<caret line="107" column="8" selection-start-line="107" selection-start-column="8" selection-end-line="107" selection-end-column="8" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/comment.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="420">
|
||||
<caret line="36" column="53" selection-start-line="36" selection-start-column="53" selection-end-line="36" selection-end-column="53" />
|
||||
<folding>
|
||||
<element signature="e#24#35#0" expanded="true" />
|
||||
</folding>
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/crawler/taobao/middlewares.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="1920">
|
||||
<caret line="137" column="32" selection-start-line="137" selection-start-column="32" selection-end-line="137" selection-end-column="32" />
|
||||
<folding>
|
||||
<element signature="e#168#178#0" expanded="true" />
|
||||
</folding>
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/scrapy/item.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="162">
|
||||
<caret line="58" selection-start-line="58" selection-end-line="58" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/crawler/taobao/__init__.py">
|
||||
<provider selected="true" editor-type-id="text-editor" />
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/crawler/taobao/merge.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="345">
|
||||
<caret line="23" column="30" selection-start-line="23" selection-start-column="30" selection-end-line="23" selection-end-column="30" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/scrapy_splash/request.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="-632">
|
||||
<caret line="20" column="8" selection-start-line="20" selection-start-column="8" selection-end-line="20" selection-end-column="8" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/crawler/demo.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="311">
|
||||
<caret line="26" column="36" lean-forward="true" selection-start-line="26" selection-start-column="36" selection-end-line="26" selection-end-column="36" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/scrapy/http/request/__init__.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="180">
|
||||
<caret line="19" column="68" selection-start-line="19" selection-start-column="60" selection-end-line="19" selection-end-column="68" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/jd.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="-680">
|
||||
<caret line="85" selection-start-line="85" selection-end-line="85" />
|
||||
<folding>
|
||||
<element signature="e#15#26#0" expanded="true" />
|
||||
</folding>
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/crawler/taobao/utils.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="60">
|
||||
<caret line="4" column="12" selection-start-line="4" selection-start-column="4" selection-end-line="4" selection-end-column="12" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/crawler/taobao/pipelines.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="1155">
|
||||
<caret line="77" column="31" lean-forward="true" selection-start-line="77" selection-start-column="31" selection-end-line="77" selection-end-column="31" />
|
||||
<state relative-caret-position="205">
|
||||
<caret line="105" column="20" selection-start-line="104" selection-start-column="20" selection-end-line="105" selection-end-column="20" />
|
||||
<folding>
|
||||
<element signature="e#193#204#0" expanded="true" />
|
||||
</folding>
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/crawler/taobao/settings.py">
|
||||
<entry file="file://$PROJECT_DIR$/crawler/taobao/items.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="426">
|
||||
<caret line="119" column="12" selection-start-line="119" selection-start-column="12" selection-end-line="119" selection-end-column="12" />
|
||||
<state relative-caret-position="-545">
|
||||
<caret line="14" selection-start-line="14" selection-end-line="14" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/jd.py">
|
||||
<entry file="file://$PROJECT_DIR$/crawler/taobao/settings.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="63">
|
||||
<caret line="72" column="40" lean-forward="true" selection-start-line="72" selection-start-column="40" selection-end-line="72" selection-end-column="40" />
|
||||
<folding>
|
||||
<element signature="e#14#25#0" expanded="true" />
|
||||
</folding>
|
||||
<state relative-caret-position="176">
|
||||
<caret line="115" selection-start-line="115" selection-end-line="115" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file:///usr/local/Cellar/python/3.7.0/Frameworks/Python.framework/Versions/3.7/lib/python3.7/telnetlib.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="291">
|
||||
<caret line="289" column="33" lean-forward="true" selection-start-line="289" selection-start-column="33" selection-end-line="289" selection-end-column="33" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/crawler/client.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="235">
|
||||
<caret line="36" column="10" lean-forward="true" selection-start-line="36" selection-start-column="10" selection-end-line="36" selection-end-column="10" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
|
|
|
@ -0,0 +1,40 @@
|
|||
#!/usr/bin/env python
|
||||
"""
|
||||
Example script to control a Scrapy server using its JSON-RPC web service.
|
||||
It only provides a reduced functionality as its main purpose is to illustrate
|
||||
how to write a web service client. Feel free to improve or write your own.
|
||||
Also, keep in mind that the JSON-RPC API is not stable. The recommended way for
|
||||
controlling a Scrapy server is through the execution queue (see the "queue"
|
||||
command).
|
||||
"""
|
||||
|
||||
import telnetlib
|
||||
|
||||
|
||||
def do_telnet(Host, finish, port=6023, timeout=10):
    """Connect to a Telnet console, send ``est()`` once and print the output.

    Waits for the prompt ``finish``, sends the command ``est()``, then waits
    for the next prompt. Everything read before each prompt is decoded as
    UTF-8 and printed between separator lines.

    NOTE(review): port 6023 and ``est()`` look like a Scrapy telnet console;
    confirm against the server this is pointed at.

    Args:
        Host: Host name or IP address of the Telnet server.
        finish: Prompt to wait for, as bytes (e.g. ``b'>>> '``).
        port: TCP port to connect to. Defaults to 6023, the value that was
            previously hard-coded.
        timeout: Connection timeout in seconds. Defaults to 10, the value
            that was previously hard-coded.
    """
    # The context manager closes the connection even when a read, write or
    # decode step raises; the earlier straight-line code leaked it then.
    with telnetlib.Telnet(Host, port=port, timeout=timeout) as tn:
        tn.set_debuglevel(2)

        # Wait for the first prompt, then send the command.
        out = tn.read_until(finish)
        tn.write(b'est()\n')
        print('-----------------1')
        print(out.decode('utf8'))
        print('-----------------11')

        # Wait for the prompt that follows the command's output.
        out = tn.read_until(finish)
        print('-----------------2')
        print(out.decode('utf8'))
        print('-----------------3')
|
||||
|
||||
if __name__ == '__main__':
    # Connection settings: the Telnet server address and the prompt bytes
    # that mark the end of each response.
    Host, finish = '127.0.0.1', b'>>> '
    do_telnet(Host, finish)
|
|
@ -0,0 +1,31 @@
|
|||
import re,requests,json
|
||||
|
||||
# One HTTP session is reused for every page request.
s = requests.session()
url = 'https://club.jd.com/comment/productPageComments.action'
# Query parameters sent with each request; 'page' is advanced by the loop.
data = {
    'callback': 'fetchJSON_comment98vv61',
    'productId': '3888284',
    'score': 0,
    'sortType': 5,
    'pageSize': 10,
    'isShadowSku': 0,
    'page': 0,
}

# Extracts the JSON payload from the JSONP wrapper "<callback>(...);".
# Built from data['callback'] so the callback name is declared only once.
jsonp_payload = re.compile(
    r'(?<=' + re.escape(data['callback']) + r'\().*(?=\);)')

while True:
    # A timeout keeps a stalled connection from hanging the script forever.
    t = s.get(url, params=data, timeout=10).text

    m = jsonp_payload.search(t)
    if m is None:
        # The response is not wrapped in the expected callback: stop paging.
        break

    j = json.loads(m.group(0))
    commentSummary = j.get('comments')
    if not commentSummary:
        # An empty (or missing) comment list means there are no more pages.
        # Without this check the loop keeps requesting pages forever as long
        # as the server answers with a well-formed but empty payload.
        break

    for comment in commentSummary:
        c_content = comment['content']
        c_time = comment['referenceTime']
        c_name = comment['nickname']
        c_client = comment['userClientShow']
        print('{} {} {}\n{}\n'.format(c_name,c_time,c_client,c_content))

    data['page'] += 1
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
@ -46,4 +46,24 @@ class JDProductItem(scrapy.Item):
|
|||
img = scrapy.Field()
|
||||
shop = scrapy.Field()
|
||||
comment_num = scrapy.Field()
|
||||
created_at = scrapy.Field()
|
||||
|
||||
|
||||
class JDCommentItem(scrapy.Item):
    """Scrapy item describing a single JD comment."""

    # Identifier of the record.
    id = scrapy.Field()
    # Comment author and, presumably, the client the comment was posted
    # from -- TODO confirm against the spider that fills these in.
    user_name = scrapy.Field()
    from_type = scrapy.Field()
    # Comment text and the time attached to the comment.
    content = scrapy.Field()
    comment_time = scrapy.Field()
    # Presumably the time this record was scraped -- TODO confirm.
    created_at = scrapy.Field()
|
||||
|
||||
|
||||
class JDCommentSummary(scrapy.Item):
    """Scrapy item holding aggregate comment counters for one record."""

    # Identifier of the record the counters belong to.
    id = scrapy.Field()

    # Counters by rating bucket; exact semantics come from the upstream
    # payload -- TODO confirm against the spider that populates them.
    good_count = scrapy.Field()
    general_count = scrapy.Field()
    poor_count = scrapy.Field()

    # Overall totals and ratio.
    comment_count = scrapy.Field()
    default_good_count = scrapy.Field()
    good_rate = scrapy.Field()
|
||||
|
||||
|
|
|
@ -12,6 +12,8 @@ import happybase
|
|||
from scrapy.conf import settings
|
||||
from scrapy.pipelines.images import ImagesPipeline
|
||||
|
||||
from taobao.items import JDProductItem, JDCommentItem, JDCommentSummary
|
||||
|
||||
|
||||
class MyImagesPipeline(ImagesPipeline):
|
||||
pass
|
||||
|
@ -52,17 +54,19 @@ class JDCleanDataPipeline(object):
|
|||
"""
|
||||
京东数据清先
|
||||
"""
|
||||
|
||||
def process_item(self, item, spider):
    """Normalise a scraped item before it reaches the storage pipeline.

    Product items get their name stripped of surrounding whitespace and
    embedded newlines, and their ``id`` derived from the numeric part of
    the product URL (``.../<digits>.html``). Comment items pass through
    untouched, as does any other item type.

    Args:
        item: The scraped item.
        spider: The spider that produced the item (unused).

    Returns:
        The (possibly modified) item.

    Raises:
        DropItem: If a product item ends up without a string ``id``.
    """
    # Imported locally so this method does not depend on a module-level
    # import being present.
    from scrapy.exceptions import DropItem

    if isinstance(item, JDProductItem):
        if isinstance(item["name"], str):
            item["name"] = item["name"].strip().replace('\n', '')

        # A missing/None url simply yields no id instead of a TypeError.
        m = re.match(r'.+\/(\d+)\.html', item.get('url') or '')
        if m:
            item["id"] = m.group(1)

        # .get() because reading an unset field with item["id"] raises
        # KeyError; an item without an id cannot be stored, so discard it
        # the way Scrapy expects rather than handing None downstream.
        if not isinstance(item.get("id"), str):
            raise DropItem("JD product without a numeric id in its url")
    elif isinstance(item, JDCommentItem):
        pass

    return item
|
||||
|
||||
|
@ -71,20 +75,43 @@ class JDHbasePipeline(object):
|
|||
"""
|
||||
数据存到hbase中
|
||||
"""
|
||||
|
||||
def __init__(self):
    """Connect to HBase and bind the tables this pipeline writes to.

    Host, port and the product table name come from the Scrapy settings
    (HBASE_HOST, HBASE_PORT, HBASE_TABLE); comments go to 'jd_comment'.
    """
    connection = happybase.Connection(settings['HBASE_HOST'],
                                      port=settings["HBASE_PORT"])
    products = connection.table(settings['HBASE_TABLE'])

    # `table` and `product_table` refer to the same HBase table.
    self.table = products
    self.product_table = products
    self.comment_table = connection.table('jd_comment')
|
||||
|
||||
def process_item(self, item, spider):
    """Write an item to the HBase table that matches its type.

    * ``JDProductItem``    -> product table, ``product:*`` columns.
    * ``JDCommentItem``    -> comment table, ``comment:*`` columns.
    * ``JDCommentSummary`` -> product table, ``comment:*`` columns, with
      every value converted to ``str``.

    Items of any other type are not stored.

    Args:
        item: The item to persist; its ``id`` is used as the row key.
        spider: The spider that produced the item (unused).

    Returns:
        The item, unchanged.
    """
    # Dispatch on the item type: each type carries a different field set,
    # so reading product fields from every item would raise KeyError.
    if isinstance(item, JDProductItem):
        self.product_table.put(item["id"], {
            "product:name": item["name"],
            "product:price": item["price"],
            "product:url": item["url"],
            "product:img": item["img"],
            "product:shop": item["shop"],
            "product:comment_num": item["comment_num"],
            "product:created_at": item["created_at"],
        })
    elif isinstance(item, JDCommentItem):
        self.comment_table.put(item["id"], {
            "comment:user_name": item["user_name"],
            "comment:from_type": item["from_type"],
            "comment:content": item["content"],
            "comment:comment_time": item["comment_time"],
            "comment:created_at": item["created_at"],
        })
    elif isinstance(item, JDCommentSummary):
        # The summary values are passed through str() before being stored.
        self.product_table.put(item["id"], {
            "comment:good_count": str(item["good_count"]),
            "comment:general_count": str(item["general_count"]),
            "comment:poor_count": str(item["poor_count"]),
            "comment:comment_count": str(item["comment_count"]),
            "comment:default_good_count": str(item["default_good_count"]),
            "comment:good_rate": str(item["good_rate"]),
        })

    return item
|
||||
|
|
|
@ -43,7 +43,7 @@ SPLASH_URL = 'http://127.0.0.1:8050'
|
|||
COOKIES_ENABLED = True
|
||||
|
||||
# Disable Telnet Console (enabled by default)
|
||||
#TELNETCONSOLE_ENABLED = False
|
||||
TELNETCONSOLE_ENABLED = True
|
||||
|
||||
# Override the default request headers:
|
||||
#DEFAULT_REQUEST_HEADERS = {
|
||||
|
@ -92,14 +92,15 @@ IMAGES_STORE = '/tmp/taobao'
|
|||
|
||||
# Enable and configure the AutoThrottle extension (disabled by default)
|
||||
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
|
||||
#AUTOTHROTTLE_ENABLED = True
|
||||
# 启用限速
|
||||
AUTOTHROTTLE_ENABLED = True
|
||||
# The initial download delay
|
||||
#AUTOTHROTTLE_START_DELAY = 5
|
||||
AUTOTHROTTLE_START_DELAY = 10
|
||||
# The maximum download delay to be set in case of high latencies
|
||||
#AUTOTHROTTLE_MAX_DELAY = 60
|
||||
AUTOTHROTTLE_MAX_DELAY = 60
|
||||
# The average number of requests Scrapy should be sending in parallel to
|
||||
# each remote server
|
||||
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
|
||||
AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
|
||||
# Enable showing throttling stats for every response received:
|
||||
#AUTOTHROTTLE_DEBUG = False
|
||||
|
||||
|
@ -114,7 +115,8 @@ IMAGES_STORE = '/tmp/taobao'
|
|||
FEED_EXPORT_ENCODING='utf-8'
|
||||
|
||||
|
||||
|
||||
HBASE_HOST = '106.75.85.84'
|
||||
HBASE_PORT = 40009
|
||||
HBASE_TABLE = 'jd'
|
||||
|
||||
COMMENT_MAX_PAGE = 10
|
|
@ -1,12 +1,17 @@
|
|||
#coding=utf-8
|
||||
# coding=utf-8
|
||||
import json
|
||||
import time
|
||||
|
||||
import scrapy
|
||||
from scrapy import Request
|
||||
from scrapy.conf import settings
|
||||
from scrapy_splash import SplashRequest
|
||||
import pdb
|
||||
from taobao.items import JDProductItem
|
||||
from taobao.items import JDProductItem, JDCommentItem, JDCommentSummary
|
||||
import re
|
||||
|
||||
from taobao.utils import now_time
|
||||
|
||||
|
||||
class JdSpider(scrapy.Spider):
|
||||
name = 'jd'
|
||||
|
@ -23,12 +28,70 @@ class JdSpider(scrapy.Spider):
|
|||
}
|
||||
}
|
||||
|
||||
def make_comment_url(self, product_id, page):
    """Build the URL of one page of a product's JD comment feed.

    Args:
        product_id: Product id, substituted into ``productId``.
        page: Zero-based page number (formatted with ``%d``).

    Returns:
        The comment URL, requesting the JSONP callback
        ``fetchJSON_comment98vv33164`` and 10 comments per page.
    """
    template = (
        "https://club.jd.com/comment/productPageComments.action"
        "?callback=fetchJSON_comment98vv33164"
        "&productId=%s&score=0&sortType=5&page=%d&pageSize=10"
        "&isShadowSku=0&rid=0&fold=1"
    )
    return template % (product_id, page)
|
||||
|
||||
def start_requests(self):
    """Yield a SplashRequest for every start URL, handled by ``parse``."""
    yield from (
        SplashRequest(start_url, self.parse, args={})
        for start_url in self.start_urls
    )
|
||||
|
||||
def parse_comment(self, response):
    """Parse one page of a product's comment feed and queue the next page.

    ``response.meta`` must carry ``product_id`` and the zero-based ``page``
    that was requested.

    Yields:
        A ``JDCommentItem`` per comment on the page; on page 0 also one
        ``JDCommentSummary``; and, while the page number does not exceed
        the ``COMMENT_MAX_PAGE`` setting, a ``Request`` for the next page.
        Nothing is yielded when the body is not the expected JSONP.
    """
    product_id = response.meta["product_id"]
    page = response.meta["page"]
    print("抓取评论, %s, 当前 %d页" % (product_id, page))

    # The body is JSONP: fetchJSON_comment98vv33164(<json>); -- the callback
    # name has to match the one requested by make_comment_url.
    m = re.match(r'^fetchJSON_comment98vv33164\((.+)\);$', response.text)
    if not m:
        print("评论抓取完成, %s, 页数: %d" % (product_id, page))
        return

    j = json.loads(m.group(1))
    for comment in j['comments']:
        c_content = comment['content']
        c_time = comment['referenceTime']
        c_name = comment['nickname']
        c_client = comment['userClientShow']
        # Prefix with the product id so ids are distinct across products.
        c_id = "%s_%s" % (product_id, comment['id'])
        print('{}, {} {} {}\n{}\n'.format(c_id, c_name, c_time, c_client, c_content))
        yield JDCommentItem(
            id=c_id,
            user_name=c_name,
            from_type=c_client,
            content=c_content,
            comment_time=c_time,
            created_at=now_time()
        )

    if page == 0:
        # The summary is only emitted once per product, from the first page.
        summary = j['productCommentSummary']
        yield JDCommentSummary(
            id=product_id,
            good_count=summary['goodCount'],
            general_count=summary['generalCount'],
            poor_count=summary['poorCount'],
            comment_count=summary['commentCount'],
            default_good_count=summary['defaultGoodCount'],
            good_rate=summary['goodRate']
        )

    page = page + 1
    if page > settings['COMMENT_MAX_PAGE']:
        print("评论抓取达到最大深度, %s, 页数: %d" % (product_id, page))
    else:
        yield Request(self.make_comment_url(product_id, page), self.parse_comment,
                      meta={'product_id': product_id, 'page': page})
|
||||
|
||||
def parse(self, response):
|
||||
items = response.xpath('//div[@id="plist"]//li[@class="gl-item"]')
|
||||
|
||||
|
@ -36,10 +99,11 @@ class JdSpider(scrapy.Spider):
|
|||
# pdb.set_trace()
|
||||
name = item.xpath('.//div[contains(@class, "p-name")]/a/em/text()').extract_first()
|
||||
print(name)
|
||||
price = item.xpath('.//div[contains(@class, "p-price")]/strong/i/text()').extract_first()
|
||||
price = item.xpath(
|
||||
'.//div[contains(@class, "p-price")]/strong/i/text()').extract_first()
|
||||
print(price)
|
||||
url = item.xpath('.//div[contains(@class, "p-name")]/a/@href').extract_first()
|
||||
print('https:'+url)
|
||||
print('https:' + url)
|
||||
img = item.xpath('.//div[@class="p-img"]/a/img/@data-lazy-img').extract_first()
|
||||
if not img:
|
||||
img = item.xpath('.//div[@class="p-img"]/a/img/@src').extract_first()
|
||||
|
@ -50,14 +114,23 @@ class JdSpider(scrapy.Spider):
|
|||
print(shop)
|
||||
|
||||
yield JDProductItem(
|
||||
name = name,
|
||||
price = price,
|
||||
url = url,
|
||||
img = img,
|
||||
comment_num = comment_num,
|
||||
shop = shop
|
||||
name=name,
|
||||
price=price,
|
||||
url=url,
|
||||
img=img,
|
||||
comment_num=comment_num,
|
||||
shop=shop,
|
||||
created_at=now_time()
|
||||
)
|
||||
|
||||
m = re.match(r'.+\/(\d+)\.html', url)
|
||||
if m:
|
||||
product_id = m.group(1)
|
||||
comment_url = self.make_comment_url(product_id, 0)
|
||||
yield Request(comment_url, self.parse_comment,
|
||||
meta={'product_id': product_id, 'page': 0},
|
||||
priority=100)
|
||||
|
||||
# 获取下一页
|
||||
next_page = response.xpath('//a[@class="pn-next"]/@href').extract_first()
|
||||
if next_page:
|
||||
|
@ -67,10 +140,10 @@ class JdSpider(scrapy.Spider):
|
|||
page = m.group(1)
|
||||
print("开始抓取:{0}页".format(page))
|
||||
|
||||
time.sleep(10)
|
||||
# time.sleep(10)
|
||||
|
||||
yield SplashRequest(
|
||||
url= 'https://list.jd.com'+next_page
|
||||
url='https://list.jd.com' + next_page
|
||||
)
|
||||
else:
|
||||
print("抓取结束")
|
||||
|
|
|
@ -0,0 +1,6 @@
|
|||
# coding=utf-8
|
||||
import time
|
||||
|
||||
|
||||
def now_time():
    """Return the current local time as a 'YYYY-MM-DD HH:MM:SS' string."""
    # With no time tuple given, strftime formats the current local time.
    return time.strftime('%Y-%m-%d %H:%M:%S')
|
Loading…
Reference in New Issue