开始第二章

This commit is contained in:
guange 2019-01-15 19:06:36 +08:00
parent 35d6572833
commit d614fe908b
7 changed files with 189 additions and 60 deletions

View File

@ -1,15 +1,9 @@
<?xml version="1.0" encoding="UTF-8"?> <?xml version="1.0" encoding="UTF-8"?>
<project version="4"> <project version="4">
<component name="ChangeListManager"> <component name="ChangeListManager">
<list default="true" id="a5fbb387-9969-4874-8a7f-c9dd40d5225d" name="Default Changelist" comment=""> <list default="true" id="a5fbb387-9969-4874-8a7f-c9dd40d5225d" name="Default Changelist" comment="开始第二章">
<change afterPath="$PROJECT_DIR$/crawler/client.py" afterDir="false" />
<change afterPath="$PROJECT_DIR$/crawler/demo.py" afterDir="false" />
<change afterPath="$PROJECT_DIR$/crawler/taobao/utils.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" /> <change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" />
<change beforePath="$PROJECT_DIR$/crawler/taobao/items.py" beforeDir="false" afterPath="$PROJECT_DIR$/crawler/taobao/items.py" afterDir="false" /> <change beforePath="$PROJECT_DIR$/requirements.txt" beforeDir="false" afterPath="$PROJECT_DIR$/requirements.txt" afterDir="false" />
<change beforePath="$PROJECT_DIR$/crawler/taobao/pipelines.py" beforeDir="false" afterPath="$PROJECT_DIR$/crawler/taobao/pipelines.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/crawler/taobao/settings.py" beforeDir="false" afterPath="$PROJECT_DIR$/crawler/taobao/settings.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/crawler/taobao/spiders/jd.py" beforeDir="false" afterPath="$PROJECT_DIR$/crawler/taobao/spiders/jd.py" afterDir="false" />
</list> </list>
<option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" /> <option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
<option name="SHOW_DIALOG" value="false" /> <option name="SHOW_DIALOG" value="false" />
@ -28,8 +22,8 @@
<file pinned="false" current-in-tab="false"> <file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/crawler/taobao/items.py"> <entry file="file://$PROJECT_DIR$/crawler/taobao/items.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="-545"> <state relative-caret-position="238">
<caret line="14" selection-start-line="14" selection-end-line="14" /> <caret line="64" column="14" selection-start-line="64" selection-start-column="4" selection-end-line="64" selection-end-column="14" />
</state> </state>
</provider> </provider>
</entry> </entry>
@ -37,8 +31,8 @@
<file pinned="false" current-in-tab="false"> <file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/crawler/taobao/pipelines.py"> <entry file="file://$PROJECT_DIR$/crawler/taobao/pipelines.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="205"> <state relative-caret-position="-1250">
<caret line="105" column="20" selection-start-line="104" selection-start-column="20" selection-end-line="105" selection-end-column="20" /> <caret line="15" lean-forward="true" selection-start-line="15" selection-end-line="15" />
<folding> <folding>
<element signature="e#193#204#0" expanded="true" /> <element signature="e#193#204#0" expanded="true" />
</folding> </folding>
@ -58,26 +52,26 @@
<file pinned="false" current-in-tab="false"> <file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/crawler/taobao/settings.py"> <entry file="file://$PROJECT_DIR$/crawler/taobao/settings.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="176"> <state relative-caret-position="101">
<caret line="115" selection-start-line="115" selection-end-line="115" /> <caret line="110" column="28" lean-forward="true" selection-start-line="110" selection-start-column="28" selection-end-line="110" selection-end-column="28" />
</state>
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="true">
<entry file="file://$PROJECT_DIR$/crawler/client.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="235">
<caret line="36" column="10" lean-forward="true" selection-start-line="36" selection-start-column="10" selection-end-line="36" selection-end-column="10" />
</state> </state>
</provider> </provider>
</entry> </entry>
</file> </file>
<file pinned="false" current-in-tab="false"> <file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/crawler/client.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="235">
<caret line="31" column="31" lean-forward="true" selection-start-line="31" selection-start-column="31" selection-end-line="31" selection-end-column="31" />
</state>
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="true">
<entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/jd.py"> <entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/jd.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="-680"> <state relative-caret-position="153">
<caret line="85" selection-start-line="85" selection-end-line="85" /> <caret line="149" lean-forward="true" selection-start-line="149" selection-end-line="149" />
<folding> <folding>
<element signature="e#15#26#0" expanded="true" /> <element signature="e#15#26#0" expanded="true" />
</folding> </folding>
@ -120,6 +114,7 @@
<find>TELNETCONSOLE_ENABLED</find> <find>TELNETCONSOLE_ENABLED</find>
<find>EXTENSIONS_BASE</find> <find>EXTENSIONS_BASE</find>
<find>TELNET</find> <find>TELNET</find>
<find>HBASE_PORT</find>
</findStrings> </findStrings>
</component> </component>
<component name="Git.Settings"> <component name="Git.Settings">
@ -171,6 +166,12 @@
<item name="chapter1" type="462c0819:PsiDirectoryNode" /> <item name="chapter1" type="462c0819:PsiDirectoryNode" />
<item name="crawler" type="462c0819:PsiDirectoryNode" /> <item name="crawler" type="462c0819:PsiDirectoryNode" />
</path> </path>
<path>
<item name="chapter1" type="b2602c69:ProjectViewProjectNode" />
<item name="chapter1" type="462c0819:PsiDirectoryNode" />
<item name="crawler" type="462c0819:PsiDirectoryNode" />
<item name="taobao" type="462c0819:PsiDirectoryNode" />
</path>
</expand> </expand>
<select /> <select />
</subPane> </subPane>
@ -335,7 +336,14 @@
<option name="project" value="LOCAL" /> <option name="project" value="LOCAL" />
<updated>1547476493838</updated> <updated>1547476493838</updated>
</task> </task>
<option name="localTasksCounter" value="5" /> <task id="LOCAL-00005" summary="开始第二章">
<created>1547550283596</created>
<option name="number" value="00005" />
<option name="presentableId" value="LOCAL-00005" />
<option name="project" value="LOCAL" />
<updated>1547550283596</updated>
</task>
<option name="localTasksCounter" value="6" />
<servers /> <servers />
</component> </component>
<component name="TodoView"> <component name="TodoView">
@ -351,7 +359,7 @@
<frame x="0" y="23" width="1280" height="777" extended-state="0" /> <frame x="0" y="23" width="1280" height="777" extended-state="0" />
<editor active="true" /> <editor active="true" />
<layout> <layout>
<window_info active="true" content_ui="combo" id="Project" order="0" visible="true" weight="0.26171243" /> <window_info content_ui="combo" id="Project" order="0" visible="true" weight="0.26171243" />
<window_info id="Structure" order="1" side_tool="true" weight="0.25" /> <window_info id="Structure" order="1" side_tool="true" weight="0.25" />
<window_info id="Favorites" order="2" side_tool="true" /> <window_info id="Favorites" order="2" side_tool="true" />
<window_info anchor="bottom" id="Message" order="0" /> <window_info anchor="bottom" id="Message" order="0" />
@ -362,7 +370,7 @@
<window_info anchor="bottom" id="Inspection" order="5" weight="0.4" /> <window_info anchor="bottom" id="Inspection" order="5" weight="0.4" />
<window_info anchor="bottom" id="TODO" order="6" weight="0.329927" /> <window_info anchor="bottom" id="TODO" order="6" weight="0.329927" />
<window_info anchor="bottom" id="Version Control" order="7" /> <window_info anchor="bottom" id="Version Control" order="7" />
<window_info anchor="bottom" id="Terminal" order="8" visible="true" weight="0.38686132" /> <window_info active="true" anchor="bottom" id="Terminal" order="8" visible="true" weight="0.62919706" />
<window_info anchor="bottom" id="Event Log" order="9" side_tool="true" /> <window_info anchor="bottom" id="Event Log" order="9" side_tool="true" />
<window_info anchor="bottom" id="Python Console" order="10" weight="0.329927" /> <window_info anchor="bottom" id="Python Console" order="10" weight="0.329927" />
<window_info anchor="right" id="Commander" internal_type="SLIDING" order="0" type="SLIDING" weight="0.4" /> <window_info anchor="right" id="Commander" internal_type="SLIDING" order="0" type="SLIDING" weight="0.4" />
@ -376,7 +384,8 @@
<MESSAGE value="增加id字段" /> <MESSAGE value="增加id字段" />
<MESSAGE value="加入京东抓取" /> <MESSAGE value="加入京东抓取" />
<MESSAGE value="导入hbase" /> <MESSAGE value="导入hbase" />
<option name="LAST_COMMIT_MESSAGE" value="导入hbase" /> <MESSAGE value="开始第二章" />
<option name="LAST_COMMIT_MESSAGE" value="开始第二章" />
</component> </component>
<component name="editorHistoryManager"> <component name="editorHistoryManager">
<entry file="file://$PROJECT_DIR$/crawler/scrapy.cfg"> <entry file="file://$PROJECT_DIR$/crawler/scrapy.cfg">
@ -594,6 +603,13 @@
</state> </state>
</provider> </provider>
</entry> </entry>
<entry file="file:///usr/local/Cellar/python/3.7.0/Frameworks/Python.framework/Versions/3.7/lib/python3.7/telnetlib.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="291">
<caret line="289" column="33" lean-forward="true" selection-start-line="289" selection-start-column="33" selection-end-line="289" selection-end-column="33" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/scrapy/http/request/__init__.py"> <entry file="file://$PROJECT_DIR$/env/lib/python3.7/site-packages/scrapy/http/request/__init__.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="180"> <state relative-caret-position="180">
@ -601,12 +617,19 @@
</state> </state>
</provider> </provider>
</entry> </entry>
<entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/jd.py"> <entry file="file://$PROJECT_DIR$/crawler/taobao/items.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="-680"> <state relative-caret-position="238">
<caret line="85" selection-start-line="85" selection-end-line="85" /> <caret line="64" column="14" selection-start-line="64" selection-start-column="4" selection-end-line="64" selection-end-column="14" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/crawler/taobao/pipelines.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="-1250">
<caret line="15" lean-forward="true" selection-start-line="15" selection-end-line="15" />
<folding> <folding>
<element signature="e#15#26#0" expanded="true" /> <element signature="e#193#204#0" expanded="true" />
</folding> </folding>
</state> </state>
</provider> </provider>
@ -618,41 +641,27 @@
</state> </state>
</provider> </provider>
</entry> </entry>
<entry file="file://$PROJECT_DIR$/crawler/taobao/pipelines.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="205">
<caret line="105" column="20" selection-start-line="104" selection-start-column="20" selection-end-line="105" selection-end-column="20" />
<folding>
<element signature="e#193#204#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/crawler/taobao/items.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="-545">
<caret line="14" selection-start-line="14" selection-end-line="14" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/crawler/taobao/settings.py"> <entry file="file://$PROJECT_DIR$/crawler/taobao/settings.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="176"> <state relative-caret-position="101">
<caret line="115" selection-start-line="115" selection-end-line="115" /> <caret line="110" column="28" lean-forward="true" selection-start-line="110" selection-start-column="28" selection-end-line="110" selection-end-column="28" />
</state>
</provider>
</entry>
<entry file="file:///usr/local/Cellar/python/3.7.0/Frameworks/Python.framework/Versions/3.7/lib/python3.7/telnetlib.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="291">
<caret line="289" column="33" lean-forward="true" selection-start-line="289" selection-start-column="33" selection-end-line="289" selection-end-column="33" />
</state> </state>
</provider> </provider>
</entry> </entry>
<entry file="file://$PROJECT_DIR$/crawler/client.py"> <entry file="file://$PROJECT_DIR$/crawler/client.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="235"> <state relative-caret-position="235">
<caret line="36" column="10" lean-forward="true" selection-start-line="36" selection-start-column="10" selection-end-line="36" selection-end-column="10" /> <caret line="31" column="31" lean-forward="true" selection-start-line="31" selection-start-column="31" selection-end-line="31" selection-end-column="31" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/crawler/taobao/spiders/jd.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="153">
<caret line="149" lean-forward="true" selection-start-line="149" selection-end-line="149" />
<folding>
<element signature="e#15#26#0" expanded="true" />
</folding>
</state> </state>
</provider> </provider>
</entry> </entry>

View File

@ -4,13 +4,18 @@ attrs==18.2.0
Automat==0.7.0 Automat==0.7.0
backcall==0.1.0 backcall==0.1.0
bleach==3.1.0 bleach==3.1.0
certifi==2018.11.29
cffi==1.11.5 cffi==1.11.5
chardet==3.0.4
constantly==15.1.0 constantly==15.1.0
cryptography==2.4.2 cryptography==2.4.2
cssselect==1.0.3 cssselect==1.0.3
Cython==0.29.2
decorator==4.3.0 decorator==4.3.0
defusedxml==0.5.0 defusedxml==0.5.0
Django==2.1.5
entrypoints==0.3 entrypoints==0.3
happybase==1.1.0
hyperlink==18.0.0 hyperlink==18.0.0
idna==2.8 idna==2.8
incremental==17.5.0 incremental==17.5.0
@ -39,6 +44,7 @@ parso==0.3.1
pexpect==4.6.0 pexpect==4.6.0
pickleshare==0.7.5 pickleshare==0.7.5
Pillow==5.4.1 Pillow==5.4.1
ply==3.11
prometheus-client==0.5.0 prometheus-client==0.5.0
prompt-toolkit==2.0.7 prompt-toolkit==2.0.7
ptyprocess==0.6.0 ptyprocess==0.6.0
@ -54,7 +60,9 @@ pytz==2018.9
pyzmq==17.1.2 pyzmq==17.1.2
qtconsole==4.4.3 qtconsole==4.4.3
queuelib==1.5.0 queuelib==1.5.0
requests==2.21.0
Scrapy==1.5.1 Scrapy==1.5.1
scrapy-jsonrpc==0.3.0
scrapy-splash==0.7.2 scrapy-splash==0.7.2
selenium==3.141.0 selenium==3.141.0
Send2Trash==1.5.0 Send2Trash==1.5.0
@ -62,6 +70,7 @@ service-identity==18.1.0
six==1.12.0 six==1.12.0
terminado==0.8.1 terminado==0.8.1
testpath==0.4.2 testpath==0.4.2
thriftpy==0.3.9
tornado==5.1.1 tornado==5.1.1
traitlets==4.3.2 traitlets==4.3.2
Twisted==18.9.0 Twisted==18.9.0

View File

View File

@ -0,0 +1,45 @@
#coding=utf-8
import telnetlib
import re
import pdb
import happybase
def get_crawl_data_info(host='106.75.85.84', port=40009, table_name='jd'):
    """Count the rows currently stored in a crawl table.

    Opens a happybase connection, scans the whole table and counts the
    rows the scan yields. The connection is always closed afterwards,
    including when the scan raises.

    Args:
        host: Address of the server happybase connects to.
        port: Port of that server.
        table_name: Name of the table to scan.

    Returns:
        The number of rows yielded by the scan, as an int.
    """
    connection = happybase.Connection(host, port=port)
    try:
        table = connection.table(table_name)
        # Only the row count matters, so the row contents are discarded.
        return sum(1 for _ in table.scan(scan_batching=True))
    finally:
        # Release the underlying socket even if the scan fails midway.
        connection.close()
def do_telnet(Host, finish, port=6023, timeout=10):
    """Run ``est()`` on a telnet console and return the reply as text.

    Connects to ``Host``, waits for the prompt ``finish``, sends the
    ``est()`` command and reads up to the next prompt. No login is
    performed. The connection is closed on both success and failure.

    NOTE(review): port 6023 and the ``est()`` command look like Scrapy's
    telnet console -- confirm against the crawler settings.

    Args:
        Host: Host name or IP address of the telnet server.
        finish: Prompt bytes that mark the end of a server response.
        port: TCP port of the telnet server.
        timeout: Timeout in seconds handed to ``telnetlib.Telnet``.

    Returns:
        The bytes read after sending ``est()``, up to and including the
        prompt, decoded as UTF-8.
    """
    tn = telnetlib.Telnet(Host, port=port, timeout=timeout)
    try:
        tn.set_debuglevel(2)
        # Wait for the initial prompt before sending the command.
        tn.read_until(finish)
        tn.write(b'est()\n')
        # Everything up to the next prompt is the command's reply.
        out = tn.read_until(finish)
    finally:
        tn.close()
    return out.decode('utf8')
def get_scrapy_info():
    """Return the local telnet console's ``est()`` report as a dict.

    Calls ``do_telnet`` against 127.0.0.1 with the ``>>> `` prompt and
    parses every ``name : value`` pair found in the reply.

    Returns:
        A dict mapping each matched name to its matched value, both as
        strings. If a name occurs more than once, the last value wins.
    """
    report = do_telnet('127.0.0.1', b'>>> ')
    pairs = re.findall(r'(.+?)\s+?:\s+?(.+?)\s+', report)
    return {key: value for key, value in pairs}
if __name__ == "__main__":
    # Manual check when run as a script: print the stored row count.
    row_total = get_crawl_data_info()
    print(row_total)

View File

@ -0,0 +1,44 @@
<!DOCTYPE html>
<html>
<head>
<title>数据抓取</title>
</head>
<body>
<h2>爬虫运行状态</h2>
<hr/>
<table>
{% for k, v in info.items %}
<tr>
<td>{{ k }}</td>
<td>{{ v }}</td>
</tr>
{% endfor %}
</table>
<h2>数据抓取状态</h2>
<hr/>
<table>
<tr>
<td>表名</td>
<td>抓取条数</td>
</tr>
<tr>
<td>商品表</td>
<td></td>
</tr>
<tr>
<td>评论表</td>
<td></td>
</tr>
</table>
</body>
</html>

View File

@ -0,0 +1,14 @@
<!DOCTYPE html>
<html>
<head>
<title>Spark电商大数据</title>
</head>
<body>
<ul>
<li><a href="{% url 'crawl' %}">数据抓取</a></li>
<li><a href="">数据分析</a></li>
</ul>
</body>
</html>

View File

@ -0,0 +1,8 @@
from django.urls import path
from . import views
# Routes for this app. The empty route is served by views.index and
# 'crawl' by views.crawl; the names allow reverse URL lookups.
urlpatterns = [
    path(route='', view=views.index, name='index'),
    path(route='crawl', view=views.crawl, name='crawl'),
]