抓取评论
This commit is contained in:
parent
f73fbff4c8
commit
ee0aea16aa
|
@ -1,3 +1,7 @@
|
|||
SprakProject/
|
||||
chapter1/env
|
||||
*.ipynb
|
||||
*.pyc
|
||||
chapter1/crawler/datas/comments/
|
||||
chapter1/crawler/datas/comments1/
|
||||
chapter1/crawler/datas/products/
|
||||
|
|
|
@ -0,0 +1,22 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<module type="PYTHON_MODULE" version="4">
|
||||
<component name="NewModuleRootManager">
|
||||
<content url="file://$MODULE_DIR$">
|
||||
<sourceFolder url="file://$MODULE_DIR$/crawler" isTestSource="false" />
|
||||
<excludeFolder url="file://$MODULE_DIR$/crawler/datas" />
|
||||
</content>
|
||||
<orderEntry type="inheritedJdk" />
|
||||
<orderEntry type="sourceFolder" forTests="false" />
|
||||
</component>
|
||||
<component name="TemplatesService">
|
||||
<option name="TEMPLATE_FOLDERS">
|
||||
<list>
|
||||
<option value="$MODULE_DIR$/env/lib/python3.7/site-packages/scrapy/templates" />
|
||||
</list>
|
||||
</option>
|
||||
</component>
|
||||
<component name="TestRunnerService">
|
||||
<option name="projectConfiguration" value="Twisted Trial" />
|
||||
<option name="PROJECT_TEST_RUNNER" value="Twisted Trial" />
|
||||
</component>
|
||||
</module>
|
|
@ -0,0 +1,4 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="Encoding" addBOMForNewFiles="with NO BOM" />
|
||||
</project>
|
|
@ -0,0 +1,7 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.7 (chapter1)" project-jdk-type="Python SDK" />
|
||||
<component name="PyPackaging">
|
||||
<option name="earlyReleasesAsUpgrades" value="true" />
|
||||
</component>
|
||||
</project>
|
|
@ -0,0 +1,8 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="ProjectModuleManager">
|
||||
<modules>
|
||||
<module fileurl="file://$PROJECT_DIR$/.idea/chapter1.iml" filepath="$PROJECT_DIR$/.idea/chapter1.iml" />
|
||||
</modules>
|
||||
</component>
|
||||
</project>
|
|
@ -0,0 +1,6 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="VcsDirectoryMappings">
|
||||
<mapping directory="$PROJECT_DIR$/.." vcs="Git" />
|
||||
</component>
|
||||
</project>
|
|
@ -0,0 +1,661 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="ChangeListManager">
|
||||
<list default="true" id="a5fbb387-9969-4874-8a7f-c9dd40d5225d" name="Default Changelist" comment="">
|
||||
<change afterPath="$PROJECT_DIR$/crawler/taobao/merge.py" afterDir="false" />
|
||||
<change beforePath="$PROJECT_DIR$/crawler/datas/kind.json" beforeDir="false" />
|
||||
<change beforePath="$PROJECT_DIR$/crawler/taobao/items.py" beforeDir="false" afterPath="$PROJECT_DIR$/crawler/taobao/items.py" afterDir="false" />
|
||||
<change beforePath="$PROJECT_DIR$/crawler/taobao/pipelines.py" beforeDir="false" afterPath="$PROJECT_DIR$/crawler/taobao/pipelines.py" afterDir="false" />
|
||||
</list>
|
||||
<option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
|
||||
<option name="SHOW_DIALOG" value="false" />
|
||||
<option name="HIGHLIGHT_CONFLICTS" value="true" />
|
||||
<option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
|
||||
<option name="LAST_RESOLUTION" value="IGNORE" />
|
||||
</component>
|
||||
<component name="CoverageDataManager">
|
||||
<SUITE FILE_PATH="coverage/chapter1$category.coverage" NAME="category Coverage Results" MODIFIED="1546938251328" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/crawler/taobao/spiders" />
|
||||
<SUITE FILE_PATH="coverage/chapter1$merge.coverage" NAME="merge Coverage Results" MODIFIED="1547014157363" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/crawler/taobao" />
|
||||
<SUITE FILE_PATH="coverage/chapter1$comment.coverage" NAME="comment Coverage Results" MODIFIED="1547166377597" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/crawler/taobao/spiders" />
|
||||
</component>
|
||||
<component name="FileEditorManager">
|
||||
<leaf SIDE_TABS_SIZE_LIMIT_KEY="300">
|
||||
<file pinned="false" current-in-tab="false">
|
||||
<entry file="file://$PROJECT_DIR$/crawler/taobao/items.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="388">
|
||||
<caret line="32" column="38" selection-start-line="32" selection-start-column="38" selection-end-line="32" selection-end-column="38" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
</file>
|
||||
<file pinned="false" current-in-tab="false">
|
||||
<entry file="file://$PROJECT_DIR$/crawler/taobao/pipelines.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="388">
|
||||
<caret line="37" column="19" selection-start-line="37" selection-start-column="19" selection-end-line="37" selection-end-column="19" />
|
||||
<folding>
|
||||
<element signature="e#193#204#0" expanded="true" />
|
||||
</folding>
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
</file>
|
||||
<file pinned="false" current-in-tab="true">
|
||||
<entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/comment.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="247">
|
||||
<caret line="81" column="34" selection-start-line="81" selection-start-column="34" selection-end-line="81" selection-end-column="34" />
|
||||
<folding>
|
||||
<element signature="e#24#35#0" expanded="true" />
|
||||
</folding>
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
</file>
|
||||
<file pinned="false" current-in-tab="false">
|
||||
<entry file="file:///usr/local/Cellar/python/3.7.0/Frameworks/Python.framework/Versions/3.7/lib/python3.7/subprocess.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="139">
|
||||
<caret line="965" column="8" selection-start-line="965" selection-start-column="8" selection-end-line="965" selection-end-column="8" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
</file>
|
||||
<file pinned="false" current-in-tab="false">
|
||||
<entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/httpbin.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="45">
|
||||
<caret line="3" column="13" selection-start-line="3" selection-start-column="5" selection-end-line="3" selection-end-column="13" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
</file>
|
||||
<file pinned="false" current-in-tab="false">
|
||||
<entry file="file://$PROJECT_DIR$/crawler/taobao/middlewares.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="216">
|
||||
<caret line="137" column="32" selection-start-line="137" selection-start-column="32" selection-end-line="137" selection-end-column="32" />
|
||||
<folding>
|
||||
<element signature="e#168#178#0" expanded="true" />
|
||||
</folding>
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
</file>
|
||||
<file pinned="false" current-in-tab="false">
|
||||
<entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/dangdang.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="315">
|
||||
<caret line="21" column="22" selection-start-line="21" selection-start-column="22" selection-end-line="21" selection-end-column="22" />
|
||||
<folding>
|
||||
<element signature="e#119#129#0" expanded="true" />
|
||||
</folding>
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
</file>
|
||||
<file pinned="false" current-in-tab="false">
|
||||
<entry file="file://$USER_HOME$/Library/Caches/PyCharm2018.3/python_stubs/-1583084946/builtins.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="36915">
|
||||
<caret line="2461" column="42" selection-start-line="2461" selection-start-column="34" selection-end-line="2461" selection-end-column="42" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
</file>
|
||||
<file pinned="false" current-in-tab="false">
|
||||
<entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/category.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="1245">
|
||||
<caret line="91" column="45" selection-start-line="87" selection-start-column="4" selection-end-line="91" selection-end-column="45" />
|
||||
<folding>
|
||||
<element signature="e#24#35#0" expanded="true" />
|
||||
</folding>
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
</file>
|
||||
<file pinned="false" current-in-tab="false">
|
||||
<entry file="file://$PROJECT_DIR$/crawler/taobao/merge.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="345">
|
||||
<caret line="23" column="30" selection-start-line="23" selection-start-column="30" selection-end-line="23" selection-end-column="30" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
</file>
|
||||
</leaf>
|
||||
</component>
|
||||
<component name="FileTemplateManagerImpl">
|
||||
<option name="RECENT_TEMPLATES">
|
||||
<list>
|
||||
<option value="Python Script" />
|
||||
</list>
|
||||
</option>
|
||||
</component>
|
||||
<component name="FindInProjectRecents">
|
||||
<findStrings>
|
||||
<find>日本可爱毛球熊猫束发带</find>
|
||||
<find>美式风</find>
|
||||
<find>雪</find>
|
||||
<find>皮床</find>
|
||||
</findStrings>
|
||||
</component>
|
||||
<component name="Git.Settings">
|
||||
<option name="RECENT_GIT_ROOT_PATH" value="$PROJECT_DIR$/.." />
|
||||
</component>
|
||||
<component name="IdeDocumentHistory">
|
||||
<option name="CHANGED_PATHS">
|
||||
<list>
|
||||
<option value="$PROJECT_DIR$/crawler/taobao/middlewares.py" />
|
||||
<option value="$PROJECT_DIR$/crawler/taobao/run.py" />
|
||||
<option value="$PROJECT_DIR$/crawler/taobao/spiders/dangdang.py" />
|
||||
<option value="$PROJECT_DIR$/crawler/taobao/spiders/run.py" />
|
||||
<option value="$PROJECT_DIR$/env/lib/python3.7/site-packages/scrapy/spiders/__init__.py" />
|
||||
<option value="$PROJECT_DIR$/crawler/taobao/spiders/category.py" />
|
||||
<option value="$PROJECT_DIR$/crawler/taobao/items.py" />
|
||||
<option value="$PROJECT_DIR$/crawler/taobao/pipelines.py" />
|
||||
<option value="$PROJECT_DIR$/crawler/datas/kind.json" />
|
||||
<option value="$PROJECT_DIR$/crawler/taobao/merge.py" />
|
||||
<option value="$PROJECT_DIR$/crawler/taobao/spiders/comment.py" />
|
||||
</list>
|
||||
</option>
|
||||
</component>
|
||||
<component name="ProjectFrameBounds">
|
||||
<option name="y" value="23" />
|
||||
<option name="width" value="1280" />
|
||||
<option name="height" value="777" />
|
||||
</component>
|
||||
<component name="ProjectLevelVcsManager" settingsEditedManually="true" />
|
||||
<component name="ProjectView">
|
||||
<navigator proportions="" version="1">
|
||||
<foldersAlwaysOnTop value="true" />
|
||||
</navigator>
|
||||
<panes>
|
||||
<pane id="ProjectPane">
|
||||
<subPane>
|
||||
<expand>
|
||||
<path>
|
||||
<item name="chapter1" type="b2602c69:ProjectViewProjectNode" />
|
||||
<item name="chapter1" type="462c0819:PsiDirectoryNode" />
|
||||
</path>
|
||||
<path>
|
||||
<item name="chapter1" type="b2602c69:ProjectViewProjectNode" />
|
||||
<item name="chapter1" type="462c0819:PsiDirectoryNode" />
|
||||
<item name="crawler" type="462c0819:PsiDirectoryNode" />
|
||||
</path>
|
||||
<path>
|
||||
<item name="chapter1" type="b2602c69:ProjectViewProjectNode" />
|
||||
<item name="chapter1" type="462c0819:PsiDirectoryNode" />
|
||||
<item name="crawler" type="462c0819:PsiDirectoryNode" />
|
||||
<item name="datas" type="462c0819:PsiDirectoryNode" />
|
||||
</path>
|
||||
<path>
|
||||
<item name="chapter1" type="b2602c69:ProjectViewProjectNode" />
|
||||
<item name="chapter1" type="462c0819:PsiDirectoryNode" />
|
||||
<item name="env" type="462c0819:PsiDirectoryNode" />
|
||||
</path>
|
||||
<path>
|
||||
<item name="chapter1" type="b2602c69:ProjectViewProjectNode" />
|
||||
<item name="chapter1" type="462c0819:PsiDirectoryNode" />
|
||||
<item name="env" type="462c0819:PsiDirectoryNode" />
|
||||
<item name="lib" type="462c0819:PsiDirectoryNode" />
|
||||
</path>
|
||||
<path>
|
||||
<item name="chapter1" type="b2602c69:ProjectViewProjectNode" />
|
||||
<item name="chapter1" type="462c0819:PsiDirectoryNode" />
|
||||
<item name="env" type="462c0819:PsiDirectoryNode" />
|
||||
<item name="lib" type="462c0819:PsiDirectoryNode" />
|
||||
<item name="python3.7" type="462c0819:PsiDirectoryNode" />
|
||||
</path>
|
||||
<path>
|
||||
<item name="chapter1" type="b2602c69:ProjectViewProjectNode" />
|
||||
<item name="chapter1" type="462c0819:PsiDirectoryNode" />
|
||||
<item name="env" type="462c0819:PsiDirectoryNode" />
|
||||
<item name="lib" type="462c0819:PsiDirectoryNode" />
|
||||
<item name="python3.7" type="462c0819:PsiDirectoryNode" />
|
||||
<item name="site-packages" type="462c0819:PsiDirectoryNode" />
|
||||
</path>
|
||||
<path>
|
||||
<item name="chapter1" type="b2602c69:ProjectViewProjectNode" />
|
||||
<item name="chapter1" type="462c0819:PsiDirectoryNode" />
|
||||
<item name="env" type="462c0819:PsiDirectoryNode" />
|
||||
<item name="lib" type="462c0819:PsiDirectoryNode" />
|
||||
<item name="python3.7" type="462c0819:PsiDirectoryNode" />
|
||||
<item name="site-packages" type="462c0819:PsiDirectoryNode" />
|
||||
<item name="lxml" type="462c0819:PsiDirectoryNode" />
|
||||
</path>
|
||||
<path>
|
||||
<item name="chapter1" type="b2602c69:ProjectViewProjectNode" />
|
||||
<item name="External Libraries" type="cb654da1:ExternalLibrariesNode" />
|
||||
</path>
|
||||
<path>
|
||||
<item name="chapter1" type="b2602c69:ProjectViewProjectNode" />
|
||||
<item name="External Libraries" type="cb654da1:ExternalLibrariesNode" />
|
||||
<item name="< Python 3.7 (chapter1) >" type="70bed36:NamedLibraryElementNode" />
|
||||
</path>
|
||||
<path>
|
||||
<item name="chapter1" type="b2602c69:ProjectViewProjectNode" />
|
||||
<item name="External Libraries" type="cb654da1:ExternalLibrariesNode" />
|
||||
<item name="< Python 3.7 (chapter1) >" type="70bed36:NamedLibraryElementNode" />
|
||||
<item name="python3.7" type="462c0819:PsiDirectoryNode" />
|
||||
</path>
|
||||
<path>
|
||||
<item name="chapter1" type="b2602c69:ProjectViewProjectNode" />
|
||||
<item name="External Libraries" type="cb654da1:ExternalLibrariesNode" />
|
||||
<item name="< Python 3.7 (chapter1) >" type="70bed36:NamedLibraryElementNode" />
|
||||
<item name="python3.7" type="462c0819:PsiDirectoryNode" />
|
||||
<item name="site-packages" type="462c0819:PsiDirectoryNode" />
|
||||
</path>
|
||||
<path>
|
||||
<item name="chapter1" type="b2602c69:ProjectViewProjectNode" />
|
||||
<item name="External Libraries" type="cb654da1:ExternalLibrariesNode" />
|
||||
<item name="< Python 3.7 (chapter1) >" type="70bed36:NamedLibraryElementNode" />
|
||||
<item name="python3.7" type="462c0819:PsiDirectoryNode" />
|
||||
<item name="site-packages" type="462c0819:PsiDirectoryNode" />
|
||||
<item name="lxml" type="462c0819:PsiDirectoryNode" />
|
||||
</path>
|
||||
<path>
|
||||
<item name="chapter1" type="b2602c69:ProjectViewProjectNode" />
|
||||
<item name="External Libraries" type="cb654da1:ExternalLibrariesNode" />
|
||||
<item name="< Python 3.7 (chapter1) >" type="70bed36:NamedLibraryElementNode" />
|
||||
<item name="site-packages" type="462c0819:PsiDirectoryNode" />
|
||||
</path>
|
||||
<path>
|
||||
<item name="chapter1" type="b2602c69:ProjectViewProjectNode" />
|
||||
<item name="External Libraries" type="cb654da1:ExternalLibrariesNode" />
|
||||
<item name="< Python 3.7 (chapter1) >" type="70bed36:NamedLibraryElementNode" />
|
||||
<item name="site-packages" type="462c0819:PsiDirectoryNode" />
|
||||
<item name="lxml" type="462c0819:PsiDirectoryNode" />
|
||||
</path>
|
||||
</expand>
|
||||
<select />
|
||||
</subPane>
|
||||
</pane>
|
||||
<pane id="Scope" />
|
||||
</panes>
|
||||
</component>
|
||||
<component name="PropertiesComponent">
|
||||
<property name="WebServerToolWindowFactoryState" value="false" />
|
||||
<property name="last_opened_file_path" value="$PROJECT_DIR$" />
|
||||
<property name="nodejs_interpreter_path.stuck_in_default_project" value="undefined stuck path" />
|
||||
<property name="nodejs_npm_path_reset_for_default_project" value="true" />
|
||||
<property name="settings.editor.selected.configurable" value="configurable.group.project" />
|
||||
</component>
|
||||
<component name="RecentsManager">
|
||||
<key name="MoveFile.RECENT_KEYS">
|
||||
<recent name="$PROJECT_DIR$/crawler/taobao/spiders" />
|
||||
</key>
|
||||
</component>
|
||||
<component name="RunDashboard">
|
||||
<option name="ruleStates">
|
||||
<list>
|
||||
<RuleState>
|
||||
<option name="name" value="ConfigurationTypeDashboardGroupingRule" />
|
||||
</RuleState>
|
||||
<RuleState>
|
||||
<option name="name" value="StatusDashboardGroupingRule" />
|
||||
</RuleState>
|
||||
</list>
|
||||
</option>
|
||||
</component>
|
||||
<component name="RunManager" selected="Python.comment">
|
||||
<configuration name="category" type="PythonConfigurationType" factoryName="Python" temporary="true">
|
||||
<module name="chapter1" />
|
||||
<option name="INTERPRETER_OPTIONS" value="" />
|
||||
<option name="PARENT_ENVS" value="true" />
|
||||
<envs>
|
||||
<env name="PYTHONUNBUFFERED" value="1" />
|
||||
</envs>
|
||||
<option name="SDK_HOME" value="" />
|
||||
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/crawler/taobao/spiders" />
|
||||
<option name="IS_MODULE_SDK" value="true" />
|
||||
<option name="ADD_CONTENT_ROOTS" value="true" />
|
||||
<option name="ADD_SOURCE_ROOTS" value="true" />
|
||||
<EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
|
||||
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/crawler/taobao/spiders/category.py" />
|
||||
<option name="PARAMETERS" value="" />
|
||||
<option name="SHOW_COMMAND_LINE" value="false" />
|
||||
<option name="EMULATE_TERMINAL" value="false" />
|
||||
<option name="MODULE_MODE" value="false" />
|
||||
<option name="REDIRECT_INPUT" value="false" />
|
||||
<option name="INPUT_FILE" value="" />
|
||||
<method v="2" />
|
||||
</configuration>
|
||||
<configuration name="comment" type="PythonConfigurationType" factoryName="Python" temporary="true">
|
||||
<module name="chapter1" />
|
||||
<option name="INTERPRETER_OPTIONS" value="" />
|
||||
<option name="PARENT_ENVS" value="true" />
|
||||
<envs>
|
||||
<env name="PYTHONUNBUFFERED" value="1" />
|
||||
</envs>
|
||||
<option name="SDK_HOME" value="" />
|
||||
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/crawler/taobao/spiders" />
|
||||
<option name="IS_MODULE_SDK" value="true" />
|
||||
<option name="ADD_CONTENT_ROOTS" value="true" />
|
||||
<option name="ADD_SOURCE_ROOTS" value="true" />
|
||||
<EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
|
||||
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/crawler/taobao/spiders/comment.py" />
|
||||
<option name="PARAMETERS" value="" />
|
||||
<option name="SHOW_COMMAND_LINE" value="false" />
|
||||
<option name="EMULATE_TERMINAL" value="false" />
|
||||
<option name="MODULE_MODE" value="false" />
|
||||
<option name="REDIRECT_INPUT" value="false" />
|
||||
<option name="INPUT_FILE" value="" />
|
||||
<method v="2" />
|
||||
</configuration>
|
||||
<configuration name="merge" type="PythonConfigurationType" factoryName="Python" temporary="true">
|
||||
<module name="chapter1" />
|
||||
<option name="INTERPRETER_OPTIONS" value="" />
|
||||
<option name="PARENT_ENVS" value="true" />
|
||||
<envs>
|
||||
<env name="PYTHONUNBUFFERED" value="1" />
|
||||
</envs>
|
||||
<option name="SDK_HOME" value="" />
|
||||
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/crawler/taobao" />
|
||||
<option name="IS_MODULE_SDK" value="true" />
|
||||
<option name="ADD_CONTENT_ROOTS" value="true" />
|
||||
<option name="ADD_SOURCE_ROOTS" value="true" />
|
||||
<EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
|
||||
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/crawler/taobao/merge.py" />
|
||||
<option name="PARAMETERS" value="" />
|
||||
<option name="SHOW_COMMAND_LINE" value="false" />
|
||||
<option name="EMULATE_TERMINAL" value="false" />
|
||||
<option name="MODULE_MODE" value="false" />
|
||||
<option name="REDIRECT_INPUT" value="false" />
|
||||
<option name="INPUT_FILE" value="" />
|
||||
<method v="2" />
|
||||
</configuration>
|
||||
<recent_temporary>
|
||||
<list>
|
||||
<item itemvalue="Python.comment" />
|
||||
<item itemvalue="Python.merge" />
|
||||
<item itemvalue="Python.category" />
|
||||
</list>
|
||||
</recent_temporary>
|
||||
</component>
|
||||
<component name="TaskManager">
|
||||
<task active="true" id="Default" summary="Default task">
|
||||
<changelist id="a5fbb387-9969-4874-8a7f-c9dd40d5225d" name="Default Changelist" comment="" />
|
||||
<created>1546928387088</created>
|
||||
<option name="number" value="Default" />
|
||||
<option name="presentableId" value="Default" />
|
||||
<updated>1546928387088</updated>
|
||||
</task>
|
||||
<task id="LOCAL-00001" summary="数据抓取">
|
||||
<created>1546936657398</created>
|
||||
<option name="number" value="00001" />
|
||||
<option name="presentableId" value="LOCAL-00001" />
|
||||
<option name="project" value="LOCAL" />
|
||||
<updated>1546936657399</updated>
|
||||
</task>
|
||||
<task id="LOCAL-00002" summary="增加id字段">
|
||||
<created>1546938314632</created>
|
||||
<option name="number" value="00002" />
|
||||
<option name="presentableId" value="LOCAL-00002" />
|
||||
<option name="project" value="LOCAL" />
|
||||
<updated>1546938314632</updated>
|
||||
</task>
|
||||
<option name="localTasksCounter" value="3" />
|
||||
<servers />
|
||||
</component>
|
||||
<component name="TodoView">
|
||||
<todo-panel id="selected-file">
|
||||
<is-autoscroll-to-source value="true" />
|
||||
</todo-panel>
|
||||
<todo-panel id="all">
|
||||
<are-packages-shown value="true" />
|
||||
<is-autoscroll-to-source value="true" />
|
||||
</todo-panel>
|
||||
</component>
|
||||
<component name="ToolWindowManager">
|
||||
<frame x="0" y="23" width="1280" height="777" extended-state="0" />
|
||||
<layout>
|
||||
<window_info active="true" content_ui="combo" id="Project" order="0" visible="true" weight="0.25767368" />
|
||||
<window_info id="Structure" order="1" side_tool="true" weight="0.25" />
|
||||
<window_info id="Favorites" order="2" side_tool="true" />
|
||||
<window_info anchor="bottom" id="Message" order="0" />
|
||||
<window_info anchor="bottom" id="Find" order="1" />
|
||||
<window_info anchor="bottom" id="Run" order="2" weight="0.3970803" />
|
||||
<window_info anchor="bottom" id="Debug" order="3" weight="0.4" />
|
||||
<window_info anchor="bottom" id="Cvs" order="4" weight="0.25" />
|
||||
<window_info anchor="bottom" id="Inspection" order="5" weight="0.4" />
|
||||
<window_info anchor="bottom" id="TODO" order="6" weight="0.329927" />
|
||||
<window_info anchor="bottom" id="Version Control" order="7" />
|
||||
<window_info anchor="bottom" id="Terminal" order="8" visible="true" weight="0.31678832" />
|
||||
<window_info anchor="bottom" id="Event Log" order="9" side_tool="true" />
|
||||
<window_info anchor="bottom" id="Python Console" order="10" weight="0.329927" />
|
||||
<window_info anchor="right" id="Commander" internal_type="SLIDING" order="0" type="SLIDING" weight="0.4" />
|
||||
<window_info anchor="right" id="Ant Build" order="1" weight="0.25" />
|
||||
<window_info anchor="right" content_ui="combo" id="Hierarchy" order="2" weight="0.25" />
|
||||
<window_info anchor="right" id="SciView" order="3" />
|
||||
</layout>
|
||||
</component>
|
||||
<component name="VcsManagerConfiguration">
|
||||
<MESSAGE value="数据抓取" />
|
||||
<MESSAGE value="增加id字段" />
|
||||
<option name="LAST_COMMIT_MESSAGE" value="增加id字段" />
|
||||
</component>
|
||||
<component name="editorHistoryManager">
|
||||
<entry file="file://$PROJECT_DIR$/crawler/scrapy.cfg">
|
||||
<provider selected="true" editor-type-id="text-editor" />
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/users.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="255">
|
||||
<caret line="18" column="52" selection-start-line="18" selection-start-column="52" selection-end-line="18" selection-end-column="52" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/scrapy/utils/project.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="140">
|
||||
<caret line="59" column="4" selection-start-line="59" selection-start-column="4" selection-end-line="59" selection-end-column="4" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/crawler/taobao/settings.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="-1210" />
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/crawler/taobao/__init__.py">
|
||||
<provider selected="true" editor-type-id="text-editor" />
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/run.py" />
|
||||
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/scrapy/spiders/__init__.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="238">
|
||||
<caret line="32" column="32" selection-start-line="32" selection-start-column="32" selection-end-line="32" selection-end-column="32" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/crawler/datas/大数据.json" />
|
||||
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/scrapy/cmdline.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="1365">
|
||||
<caret line="105" column="36" selection-start-line="105" selection-start-column="28" selection-end-line="105" selection-end-column="36" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file:///usr/local/Cellar/python/3.7.0/Frameworks/Python.framework/Versions/3.7/lib/python3.7/json/__init__.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="146">
|
||||
<caret line="356" column="29" selection-start-line="356" selection-start-column="29" selection-end-line="356" selection-end-column="29" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/crawler/datas/T恤.json" />
|
||||
<entry file="file://$PROJECT_DIR$/crawler/datas/冬上新.json" />
|
||||
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/scrapy/crawler.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="-93">
|
||||
<caret line="147" column="8" selection-start-line="147" selection-start-column="8" selection-end-line="147" selection-end-column="8" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/cssselect/xpath.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="118">
|
||||
<caret line="247" column="8" selection-start-line="247" selection-start-column="8" selection-end-line="247" selection-end-column="8" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/parsel/selector.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="900">
|
||||
<caret line="66" column="8" selection-start-line="66" selection-start-column="8" selection-end-line="66" selection-end-column="8" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/scrapy/http/request/__init__.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="118">
|
||||
<caret line="16" column="6" selection-start-line="16" selection-start-column="6" selection-end-line="16" selection-end-column="6" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/scrapy/http/response/__init__.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="110">
|
||||
<caret line="18" column="26" selection-start-line="18" selection-start-column="23" selection-end-line="18" selection-end-column="26" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/../../python/crawler/env/lib/python3.7/site-packages/scrapy/http/response/__init__.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="130">
|
||||
<caret line="54" column="35" lean-forward="true" selection-start-line="54" selection-start-column="35" selection-end-line="54" selection-end-column="35" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/crawler/datas/内衣套装.json" />
|
||||
<entry file="file://$PROJECT_DIR$/../../python/crawler/env/lib/python3.7/site-packages/scrapy/item.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="361">
|
||||
<caret line="72" selection-start-line="72" selection-end-line="72" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/hah.json" />
|
||||
<entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/douban.json" />
|
||||
<entry file="file://$PROJECT_DIR$/crawler/datas/comments/23761145.json">
|
||||
<provider selected="true" editor-type-id="text-editor" />
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/crawler/datas/2小时送货服务.json" />
|
||||
<entry file="file://$PROJECT_DIR$/crawler/datas/3万以下.json" />
|
||||
<entry file="file://$PROJECT_DIR$/crawler/datas/kind.json" />
|
||||
<entry file="file://$PROJECT_DIR$/crawler/datas/products/data.json" />
|
||||
<entry file="file://$USER_HOME$/Library/Caches/PyCharm2018.3/python_stubs/-1583084946/builtins.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="36915">
|
||||
<caret line="2461" column="42" selection-start-line="2461" selection-start-column="34" selection-end-line="2461" selection-end-column="42" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/category.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="1245">
|
||||
<caret line="91" column="45" selection-start-line="87" selection-start-column="4" selection-end-line="91" selection-end-column="45" />
|
||||
<folding>
|
||||
<element signature="e#24#35#0" expanded="true" />
|
||||
</folding>
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/crawler/taobao/merge.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="345">
|
||||
<caret line="23" column="30" selection-start-line="23" selection-start-column="30" selection-end-line="23" selection-end-column="30" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/jupyter.py">
|
||||
<provider selected="true" editor-type-id="text-editor" />
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/crawler/taobao/items.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="388">
|
||||
<caret line="32" column="38" selection-start-line="32" selection-start-column="38" selection-end-line="32" selection-end-column="38" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/crawler/taobao/pipelines.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="388">
|
||||
<caret line="37" column="19" selection-start-line="37" selection-start-column="19" selection-end-line="37" selection-end-column="19" />
|
||||
<folding>
|
||||
<element signature="e#193#204#0" expanded="true" />
|
||||
</folding>
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/lxml/__init__.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="150">
|
||||
<caret line="10" column="33" selection-start-line="10" selection-start-column="33" selection-end-line="10" selection-end-column="33" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/dangdang.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="315">
|
||||
<caret line="21" column="22" selection-start-line="21" selection-start-column="22" selection-end-line="21" selection-end-column="22" />
|
||||
<folding>
|
||||
<element signature="e#119#129#0" expanded="true" />
|
||||
</folding>
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/crawler/taobao/middlewares.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="216">
|
||||
<caret line="137" column="32" selection-start-line="137" selection-start-column="32" selection-end-line="137" selection-end-column="32" />
|
||||
<folding>
|
||||
<element signature="e#168#178#0" expanded="true" />
|
||||
</folding>
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/httpbin.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="45">
|
||||
<caret line="3" column="13" selection-start-line="3" selection-start-column="5" selection-end-line="3" selection-end-column="13" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file:///usr/local/Cellar/python/3.7.0/Frameworks/Python.framework/Versions/3.7/lib/python3.7/subprocess.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="139">
|
||||
<caret line="965" column="8" selection-start-line="965" selection-start-column="8" selection-end-line="965" selection-end-column="8" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/comment.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="247">
|
||||
<caret line="81" column="34" selection-start-line="81" selection-start-column="34" selection-end-line="81" selection-end-column="34" />
|
||||
<folding>
|
||||
<element signature="e#24#35#0" expanded="true" />
|
||||
</folding>
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
</component>
|
||||
<component name="masterDetails">
|
||||
<states>
|
||||
<state key="ScopeChooserConfigurable.UI">
|
||||
<settings>
|
||||
<splitter-proportions>
|
||||
<option name="proportions">
|
||||
<list>
|
||||
<option value="0.2" />
|
||||
</list>
|
||||
</option>
|
||||
</splitter-proportions>
|
||||
</settings>
|
||||
</state>
|
||||
</states>
|
||||
</component>
|
||||
</project>
|
File diff suppressed because it is too large
Load Diff
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
@ -26,3 +26,13 @@ class ProductItem(scrapy.Item):
|
|||
url = scrapy.Field()
|
||||
detail = scrapy.Field()
|
||||
comment_num = scrapy.Field()
|
||||
|
||||
|
||||
class CommentItem(scrapy.Item):
    """All review data collected for a single product.

    One item holds the summary counters reported by the comment endpoint
    together with the list of individual comments gathered across pages.
    """

    # Identifier of the product the comments belong to.
    id = scrapy.Field()

    # Summary counters.
    total_comment_num = scrapy.Field()        # total number of comments
    total_crazy_count = scrapy.Field()        # positive reviews
    total_indifferent_count = scrapy.Field()  # neutral reviews
    total_detest_count = scrapy.Field()       # negative reviews
    page_count = scrapy.Field()               # number of comment pages

    # List of per-comment dicts accumulated by the spider.
    comments = scrapy.Field()
|
||||
|
|
|
@ -0,0 +1,48 @@
|
|||
"""
|
||||
商品数据合并
|
||||
"""
|
||||
import json
|
||||
|
||||
|
||||
def pretty(word):
    """Strip every double-quote character from *word*.

    The merged CSV wraps text columns in double quotes, so embedded quotes
    are removed before a value is written.

    :param word: text to clean, or ``None``
    :return: the text without ``"`` characters; ``None`` is passed through
    """
    return None if word is None else word.replace('"', "")
|
||||
|
||||
|
||||
def main():
    """Merge the per-category product dumps into one CSV file.

    Reads ``../datas/kind.json`` (a list of objects, each with a ``kind``
    name and a ``category`` list of keywords).  For every keyword the
    JSON-lines file ``../datas/<keyword>.json`` is read and one row per
    product is appended to ``../datas/products/data.csv``.

    A category whose dump is missing or malformed is reported on stdout and
    skipped; rows written before the error are kept, so a single bad file
    does not abort the whole merge.
    """
    # The dumps contain Chinese text, so the encoding is pinned instead of
    # relying on the platform default.
    with open('../datas/kind.json', 'r', encoding='utf-8') as f:
        kinds = json.load(f)

    # ``with`` guarantees the CSV is flushed and closed even if an
    # unexpected error escapes the loop below.
    with open('../datas/products/data.csv', 'w', encoding='utf-8') as wf:
        for kind in kinds:
            for keyword in kind["category"]:
                print("小类 %s" % keyword)
                try:
                    with open('../datas/%s.json' % keyword, 'r', encoding='utf-8') as ff:
                        for line in ff:
                            data = json.loads(line)
                            fline = '%s,%s,%s,"%s","%s","%s",%s,"%s",%s,%s' % (
                                data["id"],
                                kind["kind"],
                                keyword,
                                pretty(data["title"]),
                                data["price"],
                                pretty(data["shop"]),
                                data["pubtime"],
                                pretty(data["publisher"]),
                                pretty(data["url"]),
                                data["comment_num"])
                            wf.write(fline + "\n")
                except (OSError, ValueError, KeyError, TypeError) as e:
                    # Missing file, broken JSON/encoding or an incomplete
                    # record: report it and continue with the next category.
                    print(e)


if __name__ == '__main__':
    main()
|
|
@ -28,3 +28,13 @@ class JsonWithEncodingCnblogsPipeline(object):
|
|||
return item
|
||||
def spider_closed(self, spider):
|
||||
self.file.close()
|
||||
|
||||
class JsonWithEncodingCommentsPipeline(object):
    """Write every scraped item as one UTF-8 JSON line to a per-product file.

    The output path is ``../../datas/comments/<product_id>.json`` relative to
    the working directory; ``product_id`` is read from the running spider.
    """

    def open_spider(self, spider):
        """Open (and truncate) the output file for ``spider.product_id``."""
        self.file = open('../../datas/comments/%s.json' % spider.product_id, 'w', encoding='utf-8')

    def process_item(self, item, spider):
        """Append ``item`` as a JSON line and return it unchanged."""
        # ensure_ascii=False keeps non-ASCII text readable instead of
        # emitting \uXXXX escapes.
        line = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(line)
        return item

    def close_spider(self, spider):
        """Close the output file.

        ``close_spider`` is the hook Scrapy invokes on item pipelines when a
        spider finishes, so the file is now reliably flushed and closed.
        """
        self.file.close()

    # Backward-compatible alias for the previous method name, which Scrapy
    # does not call on a pipeline by itself.
    spider_closed = close_spider
|
||||
|
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
@ -0,0 +1,100 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
import json
|
||||
import re
|
||||
import subprocess
|
||||
import time
|
||||
|
||||
import scrapy
|
||||
from scrapy import Request
|
||||
|
||||
from taobao.items import ProductItem
|
||||
|
||||
"""
|
||||
按分类采集电商商品数据
|
||||
"""
|
||||
|
||||
|
||||
class CategorySpider(scrapy.Spider):
    """Crawl dangdang.com search results for one keyword.

    The keyword is passed as a spider argument
    (``scrapy crawl category -a keyword=...``).  Every product found in the
    result list is emitted as a ``ProductItem`` and the "next page" link is
    followed until there is none left.
    """

    name = 'category'
    allowed_domains = ['dangdang.com']
    start_urls = []

    custom_settings = {
        'LOG_LEVEL': "WARN",
        'ITEM_PIPELINES': {
            'taobao.pipelines.JsonWithEncodingCnblogsPipeline': 300,
        }
    }

    # Page counter, used only for the progress output in ``parse``.
    page = 1

    # Product pages look like ``http://.../<digits>.html``; the digits are
    # the product id.  The dot is escaped so only a literal ".html" matches.
    _PRODUCT_URL_RE = re.compile(r'http.+/(\d+)\.html')

    def __init__(self, name=None, **kwargs):
        """Build the start URL from the mandatory ``keyword`` argument.

        :raises KeyError: if no ``keyword`` argument was supplied
        """
        self.keyword = kwargs['keyword']
        self.start_urls = [
            'http://search.dangdang.com/?key=%s&act=input&show=list&show_shop=0#J_tab'
            % self.keyword]
        super(CategorySpider, self).__init__(name, **kwargs)

    def safe_fetch(self, node):
        """Return the extracted first match of ``node``, or ``None``.

        :param node: result of an ``xpath()`` call (may be empty)
        :return: extracted value of the first match, ``None`` if there is none
        """
        try:
            return node[0].extract()
        except (IndexError, TypeError, AttributeError):
            return None

    def _extract_product_id(self, url):
        """Return the numeric id embedded in a product URL, or ``None``."""
        if not url:
            return None
        match_obj = self._PRODUCT_URL_RE.match(url)
        return match_obj.group(1) if match_obj else None

    def parse(self, response):
        """Yield one ``ProductItem`` per listed product, then the next page."""
        products = response.xpath(
            '//div[@class="con shoplist"]//div[@id="search_nature_rg"]//ul/li')
        for product in products:
            title = self.safe_fetch(product.xpath('p[@class="name"]/a/@title'))
            price = self.safe_fetch(
                product.xpath('p[@class="price"]/span[@class="search_now_price"]/text()'))
            shop = self.safe_fetch(product.xpath('p[@class="search_book_author"]/span/a/@title'))
            if shop is None:
                # Fall back to the link paragraph when the first lookup fails.
                shop = self.safe_fetch(product.xpath('p[@class="link"]/a/text()'))
            pubtime = self.safe_fetch(
                product.xpath('p[@class="search_book_author"]/span[2]/text()'))
            publisher = self.safe_fetch(
                product.xpath('p[@class="search_book_author"]/span[3]/a/text()'))

            detail = self.safe_fetch(product.xpath('p[@class="detail"]/text()'))
            url = self.safe_fetch(product.xpath('p[@class="name"]/a/@href'))
            comment_num = self.safe_fetch(product.xpath(
                'p[@class="search_star_line"]//a[@class="search_comment_num"]/text()'))

            product_id = self._extract_product_id(url)
            if product_id is None:
                # Without an id the entry cannot be referenced later.  Skip
                # it instead of raising, which would abort the whole page
                # including the pagination request below.
                self.logger.warning("skipping entry without product id: %r", url)
                continue

            yield ProductItem(
                id=product_id,
                title=title,
                price=price,
                shop=shop,
                pubtime=pubtime,
                publisher=publisher,
                url=url,
                detail=detail,
                comment_num=comment_num
            )

        next_page_url = response.xpath(
            '//div[@class="paging"]//a[text()="下一页"]/@href').extract()
        if len(next_page_url) > 0:
            print("开始抓取 %s 第%d页" % (self.keyword, CategorySpider.page))
            CategorySpider.page += 1
            yield Request("http://search.dangdang.com%s" % next_page_url[0])
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Driver: run the ``category`` spider once per keyword listed in
    # kind.json, one crawl after another.
    with open('../../datas/kind.json', 'r', encoding='utf-8') as f:
        kinds = json.load(f)

    for kind in kinds:
        for keyword in kind["category"]:
            print("开始抓取 %s 分类" % keyword)

            # An argument list (no shell) passes the keyword through
            # verbatim, even if it contains spaces or shell metacharacters.
            cmd = ["scrapy", "crawl", "category", "-a", "keyword=%s" % keyword]
            p = subprocess.Popen(cmd)
            p.wait()
            if p.returncode != 0:
                print("抓取 %s 出现错误" % keyword)
            # Short pause between two consecutive crawls.
            time.sleep(3)
            print("抓取 %s 分类结束" % keyword)
|
|
@ -0,0 +1,117 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
import json
|
||||
import pdb
|
||||
import subprocess
|
||||
|
||||
import scrapy
|
||||
from scrapy import Request
|
||||
from scrapy.http import Response
|
||||
from lxml import etree
|
||||
|
||||
from taobao.items import CommentItem
|
||||
|
||||
import pandas as pd
|
||||
|
||||
"""
|
||||
商品评论数据抓取
|
||||
"""
|
||||
|
||||
|
||||
class CommentSpider(scrapy.Spider):
    """Collect all comments of one dangdang.com product.

    The product id is passed as a spider argument
    (``scrapy crawl comment -a id=...``).  The comment endpoint is requested
    page by page; every page contributes its comments to a single
    ``CommentItem`` which is yielded once the last page has been processed.
    """

    name = 'comment'
    allowed_domains = ['dangdang.com']
    start_urls = []

    custom_settings = {
        'LOG_LEVEL': "WARN",
        'ITEM_PIPELINES': {
            'taobao.pipelines.JsonWithEncodingCommentsPipeline': 300,
        }
    }

    # Comment-list endpoint.  Placeholders: product id, main product id and
    # page index.  ``%%2F`` becomes the literal ``%2F`` after formatting.
    _PAGE_URL = ('http://product.dangdang.com/index.php?r=comment%%2Flist&productId=%s&'
                 'mainProductId=%s&mediumId=0&pageIndex=%d&sortType=1&filterType=1'
                 '&isSystem=1&tagId=0&tagFilterCount=0')

    # Output key -> XPath (relative to one comment node) of its value.
    _COMMENT_FIELDS = (
        ("score", 'div[@class="items_right"]//div[@class="pinglun"]//em/text()'),
        ("desc", 'div[@class="items_right"]//div[@class="describe_detail"]/span/text()'),
        ("date", 'div[@class="items_right"]//div[@class="starline clearfix"]/span[1]/text()'),
        ("headimg", 'div[@class="items_left_pic"]/a/img/@src'),
        ("username", 'div[@class="items_left_pic"]/span[@class="name"]/text()'),
        ("userlevel", 'div[@class="items_left_pic"]/span[contains(@class,"level")]/text()'),
    )

    def __init__(self, name=None, **kwargs):
        """Build the first-page URL from the mandatory ``id`` argument.

        :raises KeyError: if no ``id`` argument was supplied
        """
        self.product_id = kwargs['id']
        self.start_urls = [self._page_url(1)]
        # Accumulator item; created when the first page is parsed.
        self.comments = None
        super(CommentSpider, self).__init__(name, **kwargs)

    def _page_url(self, page_index):
        """Return the comment-list URL of page ``page_index`` for this product."""
        return self._PAGE_URL % (self.product_id, self.product_id, page_index)

    @staticmethod
    def _first_or_none(values):
        """Return the first XPath result, or ``None`` if it is missing or empty."""
        if values and values[0]:
            return values[0]
        return None

    def parse(self, response):
        """Add the comments of one page to the item and request the next page.

        Yields a ``Request`` for the following page while there is one, and
        the accumulated ``CommentItem`` after the last page.
        """
        data = json.loads(response.body)
        summary = data["data"]["list"]["summary"]
        html = data["data"]["list"]["html"]

        print("评论总数: %s" % summary["total_comment_num"])
        print("好评: %s" % summary["total_crazy_count"])
        print("中评: %s" % summary["total_indifferent_count"])
        print("差评: %s" % summary["total_detest_count"])
        print("评论页数: %s" % summary["pageCount"])
        print("当前页面: %s" % summary["pageIndex"])

        if self.comments is None:
            self.comments = CommentItem(
                id=self.product_id,
                total_comment_num=summary["total_comment_num"],
                total_crazy_count=summary["total_crazy_count"],
                total_indifferent_count=summary["total_indifferent_count"],
                total_detest_count=summary["total_detest_count"],
                page_count=summary["pageCount"],
                comments=[]
            )

        # A page without markup has nothing to parse; guarding here keeps an
        # empty fragment from crashing the callback before the item is
        # yielded.
        root = etree.HTML(html) if html else None
        if root is not None:
            for node in root.xpath('//div[contains(@class,"comment_items")]'):
                comment = {
                    key: self._first_or_none(node.xpath(path))
                    for key, path in self._COMMENT_FIELDS
                }
                self.comments["comments"].append(comment)

        page_index = int(summary["pageIndex"])
        if page_index < int(summary["pageCount"]):
            yield Request(self._page_url(page_index + 1))
        else:
            # Last page reached: emit everything collected so far.
            yield self.comments
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Driver: start one ``comment`` crawl per product that has comments,
    # in batches of parallel processes.
    df = pd.read_csv('/Users/guange/work/bigdata/chapter1/crawler/datas/products/clean_data.csv')
    df = df[df.comment_num > 0]
    print(type(df))

    # len(df) is the number of rows; DataFrame.size is rows * columns and
    # would run the loop far past the last row.
    total = len(df)
    batch_size = 10
    for start in range(0, total, batch_size):
        # Slicing tolerates a final batch with fewer than batch_size rows.
        batch = df.iloc[start:start + batch_size]

        # Keep each process together with its product id so a failure is
        # reported for the right product.
        running = []
        for product_id in batch["id"]:
            # An argument list (no shell) passes the id through verbatim.
            cmd = ["scrapy", "crawl", "comment", "-a", "id=%s" % product_id]
            running.append((product_id, subprocess.Popen(cmd)))

        for product_id, p in running:
            try:
                p.wait(20)
            except subprocess.TimeoutExpired as e:
                # The crawl is left running in the background; only the
                # wait is given up so the next batch can start.
                print(e)
                continue
            if p.returncode != 0:
                print("抓取 %s 出现错误" % product_id)
        print("总数: %d, 当前行: %d" % (total, start))
|
Loading…
Reference in New Issue