抓取评论

This commit is contained in:
guange 2019-01-13 11:34:47 +08:00
parent f73fbff4c8
commit ee0aea16aa
24 changed files with 997 additions and 1508 deletions

4
.gitignore vendored
View File

@ -1,3 +1,7 @@
SprakProject/
chapter1/env
*.ipynb
*.pyc
chapter1/crawler/datas/comments/
chapter1/crawler/datas/comments1/
chapter1/crawler/datas/products/

View File

@ -0,0 +1,22 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$">
<sourceFolder url="file://$MODULE_DIR$/crawler" isTestSource="false" />
<excludeFolder url="file://$MODULE_DIR$/crawler/datas" />
</content>
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="TemplatesService">
<option name="TEMPLATE_FOLDERS">
<list>
<option value="$MODULE_DIR$/env/lib/python3.7/site-packages/scrapy/templates" />
</list>
</option>
</component>
<component name="TestRunnerService">
<option name="projectConfiguration" value="Twisted Trial" />
<option name="PROJECT_TEST_RUNNER" value="Twisted Trial" />
</component>
</module>

View File

@ -0,0 +1,4 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="Encoding" addBOMForNewFiles="with NO BOM" />
</project>

7
chapter1/.idea/misc.xml Normal file
View File

@ -0,0 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.7 (chapter1)" project-jdk-type="Python SDK" />
<component name="PyPackaging">
<option name="earlyReleasesAsUpgrades" value="true" />
</component>
</project>

View File

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/chapter1.iml" filepath="$PROJECT_DIR$/.idea/chapter1.iml" />
</modules>
</component>
</project>

6
chapter1/.idea/vcs.xml Normal file
View File

@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$/.." vcs="Git" />
</component>
</project>

View File

@ -0,0 +1,661 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ChangeListManager">
<list default="true" id="a5fbb387-9969-4874-8a7f-c9dd40d5225d" name="Default Changelist" comment="">
<change afterPath="$PROJECT_DIR$/crawler/taobao/merge.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/crawler/datas/kind.json" beforeDir="false" />
<change beforePath="$PROJECT_DIR$/crawler/taobao/items.py" beforeDir="false" afterPath="$PROJECT_DIR$/crawler/taobao/items.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/crawler/taobao/pipelines.py" beforeDir="false" afterPath="$PROJECT_DIR$/crawler/taobao/pipelines.py" afterDir="false" />
</list>
<option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
<option name="SHOW_DIALOG" value="false" />
<option name="HIGHLIGHT_CONFLICTS" value="true" />
<option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
<option name="LAST_RESOLUTION" value="IGNORE" />
</component>
<component name="CoverageDataManager">
<SUITE FILE_PATH="coverage/chapter1$category.coverage" NAME="category Coverage Results" MODIFIED="1546938251328" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/crawler/taobao/spiders" />
<SUITE FILE_PATH="coverage/chapter1$merge.coverage" NAME="merge Coverage Results" MODIFIED="1547014157363" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/crawler/taobao" />
<SUITE FILE_PATH="coverage/chapter1$comment.coverage" NAME="comment Coverage Results" MODIFIED="1547166377597" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/crawler/taobao/spiders" />
</component>
<component name="FileEditorManager">
<leaf SIDE_TABS_SIZE_LIMIT_KEY="300">
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/crawler/taobao/items.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="388">
<caret line="32" column="38" selection-start-line="32" selection-start-column="38" selection-end-line="32" selection-end-column="38" />
</state>
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/crawler/taobao/pipelines.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="388">
<caret line="37" column="19" selection-start-line="37" selection-start-column="19" selection-end-line="37" selection-end-column="19" />
<folding>
<element signature="e#193#204#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="true">
<entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/comment.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="247">
<caret line="81" column="34" selection-start-line="81" selection-start-column="34" selection-end-line="81" selection-end-column="34" />
<folding>
<element signature="e#24#35#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="false">
<entry file="file:///usr/local/Cellar/python/3.7.0/Frameworks/Python.framework/Versions/3.7/lib/python3.7/subprocess.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="139">
<caret line="965" column="8" selection-start-line="965" selection-start-column="8" selection-end-line="965" selection-end-column="8" />
</state>
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/httpbin.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="45">
<caret line="3" column="13" selection-start-line="3" selection-start-column="5" selection-end-line="3" selection-end-column="13" />
</state>
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/crawler/taobao/middlewares.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="216">
<caret line="137" column="32" selection-start-line="137" selection-start-column="32" selection-end-line="137" selection-end-column="32" />
<folding>
<element signature="e#168#178#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/dangdang.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="315">
<caret line="21" column="22" selection-start-line="21" selection-start-column="22" selection-end-line="21" selection-end-column="22" />
<folding>
<element signature="e#119#129#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="false">
<entry file="file://$USER_HOME$/Library/Caches/PyCharm2018.3/python_stubs/-1583084946/builtins.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="36915">
<caret line="2461" column="42" selection-start-line="2461" selection-start-column="34" selection-end-line="2461" selection-end-column="42" />
</state>
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/category.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="1245">
<caret line="91" column="45" selection-start-line="87" selection-start-column="4" selection-end-line="91" selection-end-column="45" />
<folding>
<element signature="e#24#35#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/crawler/taobao/merge.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="345">
<caret line="23" column="30" selection-start-line="23" selection-start-column="30" selection-end-line="23" selection-end-column="30" />
</state>
</provider>
</entry>
</file>
</leaf>
</component>
<component name="FileTemplateManagerImpl">
<option name="RECENT_TEMPLATES">
<list>
<option value="Python Script" />
</list>
</option>
</component>
<component name="FindInProjectRecents">
<findStrings>
<find>日本可爱毛球熊猫束发带</find>
<find>美式风</find>
<find></find>
<find>皮床</find>
</findStrings>
</component>
<component name="Git.Settings">
<option name="RECENT_GIT_ROOT_PATH" value="$PROJECT_DIR$/.." />
</component>
<component name="IdeDocumentHistory">
<option name="CHANGED_PATHS">
<list>
<option value="$PROJECT_DIR$/crawler/taobao/middlewares.py" />
<option value="$PROJECT_DIR$/crawler/taobao/run.py" />
<option value="$PROJECT_DIR$/crawler/taobao/spiders/dangdang.py" />
<option value="$PROJECT_DIR$/crawler/taobao/spiders/run.py" />
<option value="$PROJECT_DIR$/env/lib/python3.7/site-packages/scrapy/spiders/__init__.py" />
<option value="$PROJECT_DIR$/crawler/taobao/spiders/category.py" />
<option value="$PROJECT_DIR$/crawler/taobao/items.py" />
<option value="$PROJECT_DIR$/crawler/taobao/pipelines.py" />
<option value="$PROJECT_DIR$/crawler/datas/kind.json" />
<option value="$PROJECT_DIR$/crawler/taobao/merge.py" />
<option value="$PROJECT_DIR$/crawler/taobao/spiders/comment.py" />
</list>
</option>
</component>
<component name="ProjectFrameBounds">
<option name="y" value="23" />
<option name="width" value="1280" />
<option name="height" value="777" />
</component>
<component name="ProjectLevelVcsManager" settingsEditedManually="true" />
<component name="ProjectView">
<navigator proportions="" version="1">
<foldersAlwaysOnTop value="true" />
</navigator>
<panes>
<pane id="ProjectPane">
<subPane>
<expand>
<path>
<item name="chapter1" type="b2602c69:ProjectViewProjectNode" />
<item name="chapter1" type="462c0819:PsiDirectoryNode" />
</path>
<path>
<item name="chapter1" type="b2602c69:ProjectViewProjectNode" />
<item name="chapter1" type="462c0819:PsiDirectoryNode" />
<item name="crawler" type="462c0819:PsiDirectoryNode" />
</path>
<path>
<item name="chapter1" type="b2602c69:ProjectViewProjectNode" />
<item name="chapter1" type="462c0819:PsiDirectoryNode" />
<item name="crawler" type="462c0819:PsiDirectoryNode" />
<item name="datas" type="462c0819:PsiDirectoryNode" />
</path>
<path>
<item name="chapter1" type="b2602c69:ProjectViewProjectNode" />
<item name="chapter1" type="462c0819:PsiDirectoryNode" />
<item name="env" type="462c0819:PsiDirectoryNode" />
</path>
<path>
<item name="chapter1" type="b2602c69:ProjectViewProjectNode" />
<item name="chapter1" type="462c0819:PsiDirectoryNode" />
<item name="env" type="462c0819:PsiDirectoryNode" />
<item name="lib" type="462c0819:PsiDirectoryNode" />
</path>
<path>
<item name="chapter1" type="b2602c69:ProjectViewProjectNode" />
<item name="chapter1" type="462c0819:PsiDirectoryNode" />
<item name="env" type="462c0819:PsiDirectoryNode" />
<item name="lib" type="462c0819:PsiDirectoryNode" />
<item name="python3.7" type="462c0819:PsiDirectoryNode" />
</path>
<path>
<item name="chapter1" type="b2602c69:ProjectViewProjectNode" />
<item name="chapter1" type="462c0819:PsiDirectoryNode" />
<item name="env" type="462c0819:PsiDirectoryNode" />
<item name="lib" type="462c0819:PsiDirectoryNode" />
<item name="python3.7" type="462c0819:PsiDirectoryNode" />
<item name="site-packages" type="462c0819:PsiDirectoryNode" />
</path>
<path>
<item name="chapter1" type="b2602c69:ProjectViewProjectNode" />
<item name="chapter1" type="462c0819:PsiDirectoryNode" />
<item name="env" type="462c0819:PsiDirectoryNode" />
<item name="lib" type="462c0819:PsiDirectoryNode" />
<item name="python3.7" type="462c0819:PsiDirectoryNode" />
<item name="site-packages" type="462c0819:PsiDirectoryNode" />
<item name="lxml" type="462c0819:PsiDirectoryNode" />
</path>
<path>
<item name="chapter1" type="b2602c69:ProjectViewProjectNode" />
<item name="External Libraries" type="cb654da1:ExternalLibrariesNode" />
</path>
<path>
<item name="chapter1" type="b2602c69:ProjectViewProjectNode" />
<item name="External Libraries" type="cb654da1:ExternalLibrariesNode" />
<item name="&lt; Python 3.7 (chapter1) &gt;" type="70bed36:NamedLibraryElementNode" />
</path>
<path>
<item name="chapter1" type="b2602c69:ProjectViewProjectNode" />
<item name="External Libraries" type="cb654da1:ExternalLibrariesNode" />
<item name="&lt; Python 3.7 (chapter1) &gt;" type="70bed36:NamedLibraryElementNode" />
<item name="python3.7" type="462c0819:PsiDirectoryNode" />
</path>
<path>
<item name="chapter1" type="b2602c69:ProjectViewProjectNode" />
<item name="External Libraries" type="cb654da1:ExternalLibrariesNode" />
<item name="&lt; Python 3.7 (chapter1) &gt;" type="70bed36:NamedLibraryElementNode" />
<item name="python3.7" type="462c0819:PsiDirectoryNode" />
<item name="site-packages" type="462c0819:PsiDirectoryNode" />
</path>
<path>
<item name="chapter1" type="b2602c69:ProjectViewProjectNode" />
<item name="External Libraries" type="cb654da1:ExternalLibrariesNode" />
<item name="&lt; Python 3.7 (chapter1) &gt;" type="70bed36:NamedLibraryElementNode" />
<item name="python3.7" type="462c0819:PsiDirectoryNode" />
<item name="site-packages" type="462c0819:PsiDirectoryNode" />
<item name="lxml" type="462c0819:PsiDirectoryNode" />
</path>
<path>
<item name="chapter1" type="b2602c69:ProjectViewProjectNode" />
<item name="External Libraries" type="cb654da1:ExternalLibrariesNode" />
<item name="&lt; Python 3.7 (chapter1) &gt;" type="70bed36:NamedLibraryElementNode" />
<item name="site-packages" type="462c0819:PsiDirectoryNode" />
</path>
<path>
<item name="chapter1" type="b2602c69:ProjectViewProjectNode" />
<item name="External Libraries" type="cb654da1:ExternalLibrariesNode" />
<item name="&lt; Python 3.7 (chapter1) &gt;" type="70bed36:NamedLibraryElementNode" />
<item name="site-packages" type="462c0819:PsiDirectoryNode" />
<item name="lxml" type="462c0819:PsiDirectoryNode" />
</path>
</expand>
<select />
</subPane>
</pane>
<pane id="Scope" />
</panes>
</component>
<component name="PropertiesComponent">
<property name="WebServerToolWindowFactoryState" value="false" />
<property name="last_opened_file_path" value="$PROJECT_DIR$" />
<property name="nodejs_interpreter_path.stuck_in_default_project" value="undefined stuck path" />
<property name="nodejs_npm_path_reset_for_default_project" value="true" />
<property name="settings.editor.selected.configurable" value="configurable.group.project" />
</component>
<component name="RecentsManager">
<key name="MoveFile.RECENT_KEYS">
<recent name="$PROJECT_DIR$/crawler/taobao/spiders" />
</key>
</component>
<component name="RunDashboard">
<option name="ruleStates">
<list>
<RuleState>
<option name="name" value="ConfigurationTypeDashboardGroupingRule" />
</RuleState>
<RuleState>
<option name="name" value="StatusDashboardGroupingRule" />
</RuleState>
</list>
</option>
</component>
<component name="RunManager" selected="Python.comment">
<configuration name="category" type="PythonConfigurationType" factoryName="Python" temporary="true">
<module name="chapter1" />
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<envs>
<env name="PYTHONUNBUFFERED" value="1" />
</envs>
<option name="SDK_HOME" value="" />
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/crawler/taobao/spiders" />
<option name="IS_MODULE_SDK" value="true" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/crawler/taobao/spiders/category.py" />
<option name="PARAMETERS" value="" />
<option name="SHOW_COMMAND_LINE" value="false" />
<option name="EMULATE_TERMINAL" value="false" />
<option name="MODULE_MODE" value="false" />
<option name="REDIRECT_INPUT" value="false" />
<option name="INPUT_FILE" value="" />
<method v="2" />
</configuration>
<configuration name="comment" type="PythonConfigurationType" factoryName="Python" temporary="true">
<module name="chapter1" />
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<envs>
<env name="PYTHONUNBUFFERED" value="1" />
</envs>
<option name="SDK_HOME" value="" />
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/crawler/taobao/spiders" />
<option name="IS_MODULE_SDK" value="true" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/crawler/taobao/spiders/comment.py" />
<option name="PARAMETERS" value="" />
<option name="SHOW_COMMAND_LINE" value="false" />
<option name="EMULATE_TERMINAL" value="false" />
<option name="MODULE_MODE" value="false" />
<option name="REDIRECT_INPUT" value="false" />
<option name="INPUT_FILE" value="" />
<method v="2" />
</configuration>
<configuration name="merge" type="PythonConfigurationType" factoryName="Python" temporary="true">
<module name="chapter1" />
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<envs>
<env name="PYTHONUNBUFFERED" value="1" />
</envs>
<option name="SDK_HOME" value="" />
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/crawler/taobao" />
<option name="IS_MODULE_SDK" value="true" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/crawler/taobao/merge.py" />
<option name="PARAMETERS" value="" />
<option name="SHOW_COMMAND_LINE" value="false" />
<option name="EMULATE_TERMINAL" value="false" />
<option name="MODULE_MODE" value="false" />
<option name="REDIRECT_INPUT" value="false" />
<option name="INPUT_FILE" value="" />
<method v="2" />
</configuration>
<recent_temporary>
<list>
<item itemvalue="Python.comment" />
<item itemvalue="Python.merge" />
<item itemvalue="Python.category" />
</list>
</recent_temporary>
</component>
<component name="TaskManager">
<task active="true" id="Default" summary="Default task">
<changelist id="a5fbb387-9969-4874-8a7f-c9dd40d5225d" name="Default Changelist" comment="" />
<created>1546928387088</created>
<option name="number" value="Default" />
<option name="presentableId" value="Default" />
<updated>1546928387088</updated>
</task>
<task id="LOCAL-00001" summary="数据抓取">
<created>1546936657398</created>
<option name="number" value="00001" />
<option name="presentableId" value="LOCAL-00001" />
<option name="project" value="LOCAL" />
<updated>1546936657399</updated>
</task>
<task id="LOCAL-00002" summary="增加id字段">
<created>1546938314632</created>
<option name="number" value="00002" />
<option name="presentableId" value="LOCAL-00002" />
<option name="project" value="LOCAL" />
<updated>1546938314632</updated>
</task>
<option name="localTasksCounter" value="3" />
<servers />
</component>
<component name="TodoView">
<todo-panel id="selected-file">
<is-autoscroll-to-source value="true" />
</todo-panel>
<todo-panel id="all">
<are-packages-shown value="true" />
<is-autoscroll-to-source value="true" />
</todo-panel>
</component>
<component name="ToolWindowManager">
<frame x="0" y="23" width="1280" height="777" extended-state="0" />
<layout>
<window_info active="true" content_ui="combo" id="Project" order="0" visible="true" weight="0.25767368" />
<window_info id="Structure" order="1" side_tool="true" weight="0.25" />
<window_info id="Favorites" order="2" side_tool="true" />
<window_info anchor="bottom" id="Message" order="0" />
<window_info anchor="bottom" id="Find" order="1" />
<window_info anchor="bottom" id="Run" order="2" weight="0.3970803" />
<window_info anchor="bottom" id="Debug" order="3" weight="0.4" />
<window_info anchor="bottom" id="Cvs" order="4" weight="0.25" />
<window_info anchor="bottom" id="Inspection" order="5" weight="0.4" />
<window_info anchor="bottom" id="TODO" order="6" weight="0.329927" />
<window_info anchor="bottom" id="Version Control" order="7" />
<window_info anchor="bottom" id="Terminal" order="8" visible="true" weight="0.31678832" />
<window_info anchor="bottom" id="Event Log" order="9" side_tool="true" />
<window_info anchor="bottom" id="Python Console" order="10" weight="0.329927" />
<window_info anchor="right" id="Commander" internal_type="SLIDING" order="0" type="SLIDING" weight="0.4" />
<window_info anchor="right" id="Ant Build" order="1" weight="0.25" />
<window_info anchor="right" content_ui="combo" id="Hierarchy" order="2" weight="0.25" />
<window_info anchor="right" id="SciView" order="3" />
</layout>
</component>
<component name="VcsManagerConfiguration">
<MESSAGE value="数据抓取" />
<MESSAGE value="增加id字段" />
<option name="LAST_COMMIT_MESSAGE" value="增加id字段" />
</component>
<component name="editorHistoryManager">
<entry file="file://$PROJECT_DIR$/crawler/scrapy.cfg">
<provider selected="true" editor-type-id="text-editor" />
</entry>
<entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/users.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="255">
<caret line="18" column="52" selection-start-line="18" selection-start-column="52" selection-end-line="18" selection-end-column="52" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/scrapy/utils/project.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="140">
<caret line="59" column="4" selection-start-line="59" selection-start-column="4" selection-end-line="59" selection-end-column="4" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/crawler/taobao/settings.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="-1210" />
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/crawler/taobao/__init__.py">
<provider selected="true" editor-type-id="text-editor" />
</entry>
<entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/run.py" />
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/scrapy/spiders/__init__.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="238">
<caret line="32" column="32" selection-start-line="32" selection-start-column="32" selection-end-line="32" selection-end-column="32" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/crawler/datas/大数据.json" />
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/scrapy/cmdline.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="1365">
<caret line="105" column="36" selection-start-line="105" selection-start-column="28" selection-end-line="105" selection-end-column="36" />
</state>
</provider>
</entry>
<entry file="file:///usr/local/Cellar/python/3.7.0/Frameworks/Python.framework/Versions/3.7/lib/python3.7/json/__init__.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="146">
<caret line="356" column="29" selection-start-line="356" selection-start-column="29" selection-end-line="356" selection-end-column="29" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/crawler/datas/T恤.json" />
<entry file="file://$PROJECT_DIR$/crawler/datas/冬上新.json" />
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/scrapy/crawler.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="-93">
<caret line="147" column="8" selection-start-line="147" selection-start-column="8" selection-end-line="147" selection-end-column="8" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/cssselect/xpath.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="118">
<caret line="247" column="8" selection-start-line="247" selection-start-column="8" selection-end-line="247" selection-end-column="8" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/parsel/selector.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="900">
<caret line="66" column="8" selection-start-line="66" selection-start-column="8" selection-end-line="66" selection-end-column="8" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/scrapy/http/request/__init__.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="118">
<caret line="16" column="6" selection-start-line="16" selection-start-column="6" selection-end-line="16" selection-end-column="6" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/scrapy/http/response/__init__.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="110">
<caret line="18" column="26" selection-start-line="18" selection-start-column="23" selection-end-line="18" selection-end-column="26" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/../../python/crawler/env/lib/python3.7/site-packages/scrapy/http/response/__init__.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="130">
<caret line="54" column="35" lean-forward="true" selection-start-line="54" selection-start-column="35" selection-end-line="54" selection-end-column="35" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/crawler/datas/内衣套装.json" />
<entry file="file://$PROJECT_DIR$/../../python/crawler/env/lib/python3.7/site-packages/scrapy/item.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="361">
<caret line="72" selection-start-line="72" selection-end-line="72" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/hah.json" />
<entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/douban.json" />
<entry file="file://$PROJECT_DIR$/crawler/datas/comments/23761145.json">
<provider selected="true" editor-type-id="text-editor" />
</entry>
<entry file="file://$PROJECT_DIR$/crawler/datas/2小时送货服务.json" />
<entry file="file://$PROJECT_DIR$/crawler/datas/3万以下.json" />
<entry file="file://$PROJECT_DIR$/crawler/datas/kind.json" />
<entry file="file://$PROJECT_DIR$/crawler/datas/products/data.json" />
<entry file="file://$USER_HOME$/Library/Caches/PyCharm2018.3/python_stubs/-1583084946/builtins.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="36915">
<caret line="2461" column="42" selection-start-line="2461" selection-start-column="34" selection-end-line="2461" selection-end-column="42" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/category.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="1245">
<caret line="91" column="45" selection-start-line="87" selection-start-column="4" selection-end-line="91" selection-end-column="45" />
<folding>
<element signature="e#24#35#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/crawler/taobao/merge.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="345">
<caret line="23" column="30" selection-start-line="23" selection-start-column="30" selection-end-line="23" selection-end-column="30" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/jupyter.py">
<provider selected="true" editor-type-id="text-editor" />
</entry>
<entry file="file://$PROJECT_DIR$/crawler/taobao/items.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="388">
<caret line="32" column="38" selection-start-line="32" selection-start-column="38" selection-end-line="32" selection-end-column="38" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/crawler/taobao/pipelines.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="388">
<caret line="37" column="19" selection-start-line="37" selection-start-column="19" selection-end-line="37" selection-end-column="19" />
<folding>
<element signature="e#193#204#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/lxml/__init__.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="150">
<caret line="10" column="33" selection-start-line="10" selection-start-column="33" selection-end-line="10" selection-end-column="33" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/dangdang.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="315">
<caret line="21" column="22" selection-start-line="21" selection-start-column="22" selection-end-line="21" selection-end-column="22" />
<folding>
<element signature="e#119#129#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/crawler/taobao/middlewares.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="216">
<caret line="137" column="32" selection-start-line="137" selection-start-column="32" selection-end-line="137" selection-end-column="32" />
<folding>
<element signature="e#168#178#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/httpbin.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="45">
<caret line="3" column="13" selection-start-line="3" selection-start-column="5" selection-end-line="3" selection-end-column="13" />
</state>
</provider>
</entry>
<entry file="file:///usr/local/Cellar/python/3.7.0/Frameworks/Python.framework/Versions/3.7/lib/python3.7/subprocess.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="139">
<caret line="965" column="8" selection-start-line="965" selection-start-column="8" selection-end-line="965" selection-end-column="8" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/comment.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="247">
<caret line="81" column="34" selection-start-line="81" selection-start-column="34" selection-end-line="81" selection-end-column="34" />
<folding>
<element signature="e#24#35#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
</component>
<component name="masterDetails">
<states>
<state key="ScopeChooserConfigurable.UI">
<settings>
<splitter-proportions>
<option name="proportions">
<list>
<option value="0.2" />
</list>
</option>
</splitter-proportions>
</settings>
</state>
</states>
</component>
</project>

File diff suppressed because it is too large Load Diff

View File

@ -26,3 +26,13 @@ class ProductItem(scrapy.Item):
url = scrapy.Field()
detail = scrapy.Field()
comment_num = scrapy.Field()
class CommentItem(scrapy.Item):
    """Scrapy item describing scraped comment data.

    NOTE(review): the per-field notes below are inferred from the field
    names only -- confirm them against the spider that fills this item.
    """
    # Identifier the comment data belongs to (presumably the product id).
    id = scrapy.Field()
    # Presumably the overall number of comments.
    total_comment_num = scrapy.Field()
    # Presumably the counts of positive / neutral / negative comments.
    total_crazy_count = scrapy.Field()
    total_indifferent_count = scrapy.Field()
    total_detest_count = scrapy.Field()
    # Presumably the number of comment pages available.
    page_count = scrapy.Field()
    # Presumably the collected comment entries themselves.
    comments = scrapy.Field()

View File

@ -0,0 +1,48 @@
"""
商品数据合并
"""
import json
def pretty(word):
    """
    Strip every double-quote character from ``word``.

    ``main`` uses this on text fields before placing them in a CSV row.

    :param word: the text to clean, or ``None``
    :return: ``word`` with all ``"`` characters removed, or ``None`` when
        ``word`` is ``None``
    """
    if word is None:
        return None
    cleaned = word.replace('"', "")
    return cleaned
def main():
    """
    Merge the per-category product dumps into one CSV file.

    Reads the category index ``../datas/kind.json`` (a list of objects with
    a ``kind`` name and a ``category`` list of keywords). For every keyword
    it reads ``../datas/<keyword>.json`` (one JSON object per line) and
    writes one comma-separated line per product to
    ``../datas/products/data.csv``.

    Handling of a category file is best-effort: if it cannot be opened or a
    line cannot be parsed, the error is printed and processing continues
    with the next keyword. Lines already written are kept.

    :return: None
    """
    # Load the index first so that a missing/broken index does not truncate
    # an existing output file.
    # The data is Chinese text; UTF-8 is given explicitly so the result does
    # not depend on the locale's default encoding.
    with open('../datas/kind.json', 'r', encoding='utf-8') as f:
        kinds = json.load(f)

    # ``with`` guarantees the output file is flushed and closed even when an
    # unexpected error propagates.
    with open('../datas/products/data.csv', 'w', encoding='utf-8') as wf:
        for kind in kinds:
            for keyword in kind["category"]:
                print("小类 %s" % keyword)
                try:
                    with open('../datas/%s.json' % keyword, 'r', encoding='utf-8') as ff:
                        for line in ff:
                            data = json.loads(line)
                            fline = '%s,%s,%s,"%s","%s","%s",%s,"%s",%s,%s' % (
                                data["id"],
                                kind["kind"],
                                keyword,
                                pretty(data["title"]),
                                data["price"],
                                pretty(data["shop"]),
                                data["pubtime"],
                                pretty(data["publisher"]),
                                pretty(data["url"]),
                                data["comment_num"])
                            wf.write(fline + "\n")
                except Exception as e:
                    # Deliberately broad: one bad category file must not
                    # stop the merge of the remaining categories.
                    print(e)


if __name__ == '__main__':
    main()

View File

@ -28,3 +28,13 @@ class JsonWithEncodingCnblogsPipeline(object):
return item
def spider_closed(self, spider):
self.file.close()
class JsonWithEncodingCommentsPipeline(object):
    """
    Item pipeline that writes every item as one UTF-8 JSON line.

    The output file is ``../../datas/comments/<product_id>.json``, where
    ``product_id`` is read from the running spider.
    """

    def open_spider(self, spider):
        """Open the output file for the spider's ``product_id``."""
        self.file = open('../../datas/comments/%s.json' % spider.product_id, 'w', encoding='utf-8')

    def process_item(self, item, spider):
        """
        Serialize ``item`` as a single JSON line and append it to the file.

        :param item: the scraped item (anything ``dict()`` accepts)
        :param spider: the spider that produced the item (unused)
        :return: ``item`` unchanged, so later pipelines can process it
        """
        # ensure_ascii=False keeps non-ASCII text readable in the output.
        line = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(line)
        return item

    def close_spider(self, spider):
        """
        Close the output file.

        ``close_spider`` is the hook name Scrapy invokes on item pipelines
        when the spider finishes; the counterpart of ``open_spider``.
        """
        self.file.close()

    def spider_closed(self, spider):
        """Backward-compatible alias of :meth:`close_spider`."""
        self.close_spider(spider)

View File

@ -0,0 +1,100 @@
# -*- coding: utf-8 -*-
import json
import re
import subprocess
import time
import scrapy
from scrapy import Request
from taobao.items import ProductItem
"""
按分类采集电商商品数据
"""
class CategorySpider(scrapy.Spider):
    """
    Spider that walks the search result pages for one keyword and yields a
    ``ProductItem`` per listed product.

    The keyword is passed as a spider argument, e.g.
    ``scrapy crawl category -a keyword=<keyword>``.
    """
    name = 'category'
    allowed_domains = ['dangdang.com']
    start_urls = []
    custom_settings = {
        'LOG_LEVEL': "WARN",
        'ITEM_PIPELINES': {
            'taobao.pipelines.JsonWithEncodingCnblogsPipeline': 300,
        }
    }

    # Class-level page counter; only used for the progress message.
    page = 1

    def __init__(self, name=None, **kwargs):
        """
        :param name: optional spider name, forwarded to ``scrapy.Spider``
        :param kwargs: spider arguments; ``keyword`` is required
        :raises KeyError: when ``keyword`` is not supplied
        """
        self.keyword = kwargs['keyword']
        self.start_urls = [
            'http://search.dangdang.com/?key=%s&act=input&show=list&show_shop=0#J_tab'
            % self.keyword]
        super(CategorySpider, self).__init__(name, **kwargs)

    def safe_fetch(self, node):
        """
        Return the extracted value of the first match in ``node``.

        :param node: selector list returned by an ``xpath`` call
        :return: the first extracted value, or ``None`` when there is no
                 match (or extraction fails for any other reason)
        """
        try:
            return node[0].extract()
        except Exception:
            return None

    def parse(self, response):
        """
        Extract all products of one result page, then follow the
        "next page" link if there is one.

        Products whose link is missing or does not contain a numeric id are
        skipped instead of aborting the whole page.

        :param response: the search result page
        :return: generator of ``ProductItem`` objects and at most one
                 follow-up ``Request``
        """
        products = response.xpath(
            '//div[@class="con shoplist"]//div[@id="search_nature_rg"]//ul/li')
        for product in products:
            title = self.safe_fetch(product.xpath('p[@class="name"]/a/@title'))
            price = self.safe_fetch(
                product.xpath('p[@class="price"]/span[@class="search_now_price"]/text()'))
            shop = self.safe_fetch(product.xpath('p[@class="search_book_author"]/span/a/@title'))
            if shop is None:
                # Fallback location of the shop name.
                shop = self.safe_fetch(product.xpath('p[@class="link"]/a/text()'))
            pubtime = self.safe_fetch(
                product.xpath('p[@class="search_book_author"]/span[2]/text()'))
            publisher = self.safe_fetch(
                product.xpath('p[@class="search_book_author"]/span[3]/a/text()'))
            url = self.safe_fetch(product.xpath('p[@class="name"]/a/@href'))
            comment_num = self.safe_fetch(product.xpath(
                'p[@class="search_star_line"]//a[@class="search_comment_num"]/text()'))

            # The product id is the numeric file name of the product link.
            # ``url`` may be None (no link found) and the pattern may not
            # match, so both cases are checked before calling ``group``.
            match_obj = re.match(r'http.+/(\d+).html', url) if url else None
            if match_obj is None:
                print("跳过无法解析商品ID的链接: %s" % url)
                continue
            product_id = match_obj.group(1)

            # NOTE(review): the ``detail`` field is not populated here; the
            # previously extracted value was never passed to the item.
            yield ProductItem(
                id=product_id,
                title=title,
                price=price,
                shop=shop,
                pubtime=pubtime,
                publisher=publisher,
                url=url,
                comment_num=comment_num
            )

        next_page_url = response.xpath(
            '//div[@class="paging"]//a[text()="下一页"]/@href').extract()
        if len(next_page_url) > 0:
            print("开始抓取 %s%d" % (self.keyword, CategorySpider.page))
            CategorySpider.page += 1
            yield Request("http://search.dangdang.com%s" % next_page_url[0])
if __name__ == '__main__':
    # Driver: run the ``category`` spider once per keyword listed in
    # ../../datas/kind.json, one crawl at a time.
    # UTF-8 is given explicitly because the index holds Chinese text and the
    # locale default encoding is not guaranteed to be UTF-8.
    with open('../../datas/kind.json', 'r', encoding='utf-8') as f:
        kinds = json.load(f)

    for kind in kinds:
        for keyword in kind["category"]:
            print("开始抓取 %s 分类" % keyword)
            # Pass the command as an argument list (no shell) so a keyword
            # containing spaces or shell metacharacters is handed to scrapy
            # verbatim instead of being interpreted by the shell.
            p = subprocess.Popen(
                ["scrapy", "crawl", "category", "-a", "keyword=%s" % keyword])
            p.wait()
            if p.returncode != 0:
                print("抓取 %s 出现错误" % keyword)
            # Short pause between two category crawls.
            time.sleep(3)
            print("抓取 %s 分类结束" % keyword)

View File

@ -0,0 +1,117 @@
# -*- coding: utf-8 -*-
import json
import pdb
import subprocess
import scrapy
from scrapy import Request
from scrapy.http import Response
from lxml import etree
from taobao.items import CommentItem
import pandas as pd
"""
商品评论数据抓取
"""
class CommentSpider(scrapy.Spider):
    """
    Spider that collects all review pages of one product.

    The product id is passed as a spider argument, e.g.
    ``scrapy crawl comment -a id=<product id>``. Reviews of all pages are
    accumulated in a single ``CommentItem`` which is yielded once the last
    page has been parsed.
    """
    name = 'comment'
    allowed_domains = ['dangdang.com']
    start_urls = []
    custom_settings = {
        'LOG_LEVEL': "WARN",
        'ITEM_PIPELINES': {
            'taobao.pipelines.JsonWithEncodingCommentsPipeline': 300,
        }
    }

    # Review-list endpoint. Placeholders: product id, main product id, page
    # index. "%%2F" renders as a literal "%2F" after %-formatting.
    COMMENT_URL = ('http://product.dangdang.com/index.php?r=comment%%2Flist&productId=%s&'
                   'mainProductId=%s&mediumId=0&pageIndex=%d&sortType=1&filterType=1'
                   '&isSystem=1&tagId=0&tagFilterCount=0')

    def __init__(self, name=None, **kwargs):
        """
        :param name: optional spider name, forwarded to ``scrapy.Spider``
        :param kwargs: spider arguments; ``id`` (the product id) is required
        :raises KeyError: when ``id`` is not supplied
        """
        self.product_id = kwargs['id']
        self.start_urls = [self._page_url(1)]
        # Created lazily from the summary of the first parsed page.
        self.comments = None
        super(CommentSpider, self).__init__(name, **kwargs)

    def _page_url(self, page_index):
        """Build the review-list URL of page ``page_index`` for this product."""
        return self.COMMENT_URL % (self.product_id, self.product_id, page_index)

    @staticmethod
    def _first(values):
        """Return the first element of ``values``, or ``None`` when the
        list is empty or its first element is falsy."""
        return (values[0] or None) if values else None

    @staticmethod
    def _comment_nodes(html):
        """
        Parse the review markup and return the review container nodes.

        :param html: HTML fragment taken from the JSON response
        :return: list of matching elements; empty when there is no markup
        """
        # Empty markup cannot be turned into a tree; guard both the input
        # and a ``None`` parse result so the page is treated as "no reviews"
        # and the spider can still emit its item.
        if not html or not html.strip():
            return []
        el = etree.HTML(html)
        if el is None:
            return []
        return el.xpath('//div[contains(@class,"comment_items")]')

    def parse(self, response):
        """
        Parse one review page.

        Appends the reviews of the page to the accumulated item, then either
        requests the next page or, on the last page, yields the item.

        :param response: JSON response of the review-list endpoint
        :return: generator yielding one ``Request`` or the ``CommentItem``
        """
        data = json.loads(response.body)
        summary = data["data"]["list"]["summary"]
        html = data["data"]["list"]["html"]
        print("评论总数: %s" % summary["total_comment_num"])
        print("好评: %s" % summary["total_crazy_count"])
        print("中评: %s" % summary["total_indifferent_count"])
        print("差评: %s" % summary["total_detest_count"])
        print("评论页数: %s" % summary["pageCount"])
        print("当前页面: %s" % summary["pageIndex"])

        if self.comments is None:
            self.comments = CommentItem(
                id=self.product_id,
                total_comment_num=summary["total_comment_num"],
                total_crazy_count=summary["total_crazy_count"],
                total_indifferent_count=summary["total_indifferent_count"],
                total_detest_count=summary["total_detest_count"],
                page_count=summary["pageCount"],
                comments=[]
            )

        for item in self._comment_nodes(html):
            score = item.xpath('div[@class="items_right"]//div[@class="pinglun"]//em/text()')
            desc = item.xpath(
                'div[@class="items_right"]//div[@class="describe_detail"]/span/text()')
            date = item.xpath(
                'div[@class="items_right"]//div[@class="starline clearfix"]/span[1]/text()')
            headimg = item.xpath('div[@class="items_left_pic"]/a/img/@src')
            username = item.xpath('div[@class="items_left_pic"]/span[@class="name"]/text()')
            userlevel = item.xpath(
                'div[@class="items_left_pic"]/span[contains(@class,"level")]/text()')
            self.comments["comments"].append({
                "score": self._first(score),
                "desc": self._first(desc),
                "date": self._first(date),
                "headimg": self._first(headimg),
                "username": self._first(username),
                "userlevel": self._first(userlevel),
            })

        page_index = int(summary["pageIndex"])
        if page_index < int(summary["pageCount"]):
            yield Request(self._page_url(page_index + 1))
        else:
            # Last page reached: emit everything collected so far.
            yield self.comments
if __name__ == '__main__':
    # Driver: run the ``comment`` spider for every product that has at
    # least one comment, launching the crawls in batches.
    BATCH_SIZE = 10      # number of crawls started together
    WAIT_SECONDS = 20    # how long to wait for each crawl of a batch

    # Relative to the spiders directory, like the other data paths used by
    # this project, instead of an absolute path on one machine.
    df = pd.read_csv('../../datas/products/clean_data.csv')
    df = df[df.comment_num > 0]

    # len(df) is the number of rows; df.size would be rows * columns and
    # makes the loop index run past the end of the frame.
    total = len(df)

    for i in range(0, total, BATCH_SIZE):
        # Slicing clips at the end of the frame, so the last (possibly
        # shorter) batch needs no special handling.
        running = []
        for product_id in df["id"].iloc[i:i + BATCH_SIZE]:
            # Argument list instead of a shell command string: the id is
            # passed to scrapy verbatim.
            proc = subprocess.Popen(
                ["scrapy", "crawl", "comment", "-a", "id=%s" % product_id])
            running.append((product_id, proc))

        # Keep each process paired with its product id so a failure is
        # reported for the product that actually failed.
        for product_id, proc in running:
            try:
                proc.wait(WAIT_SECONDS)
                if proc.returncode != 0:
                    print("抓取 %s 出现错误" % product_id)
            except Exception as e:
                # Typically a timeout: the crawl is left running in the
                # background and the driver moves on.
                print(e)
        print("总数: %d, 当前行: %d" % (total, i))