From ee0aea16aa6aec9e141b3258df01850f086ac143 Mon Sep 17 00:00:00 2001
From: guange <8863824@gmail.com>
Date: Sun, 13 Jan 2019 11:34:47 +0800
Subject: [PATCH] =?UTF-8?q?=E6=8A=93=E5=8F=96=E8=AF=84=E8=AE=BA?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
.gitignore | 4 +
chapter1/.idea/chapter1.iml | 22 +
chapter1/.idea/encodings.xml | 4 +
chapter1/.idea/misc.xml | 7 +
chapter1/.idea/modules.xml | 8 +
chapter1/.idea/vcs.xml | 6 +
chapter1/.idea/workspace.xml | 661 ++++++++
chapter1/crawler/datas/kind.json | 1508 -----------------
.../__pycache__/__init__.cpython-37.pyc | Bin 144 -> 149 bytes
.../taobao/__pycache__/items.cpython-37.pyc | Bin 631 -> 991 bytes
.../__pycache__/middlewares.cpython-37.pyc | Bin 4345 -> 4326 bytes
.../__pycache__/pipelines.cpython-37.pyc | Bin 704 -> 2258 bytes
.../__pycache__/settings.cpython-37.pyc | Bin 569 -> 578 bytes
chapter1/crawler/taobao/items.py | 10 +
chapter1/crawler/taobao/merge.py | 48 +
chapter1/crawler/taobao/pipelines.py | 10 +
.../__pycache__/__init__.cpython-37.pyc | Bin 152 -> 157 bytes
.../__pycache__/dangdang.cpython-37.pyc | Bin 2374 -> 2381 bytes
.../__pycache__/httpbin.cpython-37.pyc | Bin 1950 -> 1955 bytes
.../spiders/__pycache__/meitu.cpython-37.pyc | Bin 4891 -> 0 bytes
.../spiders/__pycache__/porn.cpython-37.pyc | Bin 701 -> 0 bytes
.../spiders/__pycache__/users.cpython-37.pyc | Bin 1397 -> 1402 bytes
chapter1/crawler/taobao/spiders/category.py | 100 ++
chapter1/crawler/taobao/spiders/comment.py | 117 ++
24 files changed, 997 insertions(+), 1508 deletions(-)
create mode 100644 chapter1/.idea/chapter1.iml
create mode 100644 chapter1/.idea/encodings.xml
create mode 100644 chapter1/.idea/misc.xml
create mode 100644 chapter1/.idea/modules.xml
create mode 100644 chapter1/.idea/vcs.xml
create mode 100644 chapter1/.idea/workspace.xml
delete mode 100644 chapter1/crawler/datas/kind.json
create mode 100644 chapter1/crawler/taobao/merge.py
delete mode 100644 chapter1/crawler/taobao/spiders/__pycache__/meitu.cpython-37.pyc
delete mode 100644 chapter1/crawler/taobao/spiders/__pycache__/porn.cpython-37.pyc
create mode 100644 chapter1/crawler/taobao/spiders/category.py
create mode 100644 chapter1/crawler/taobao/spiders/comment.py
diff --git a/.gitignore b/.gitignore
index ee26f5d..59daf2a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,7 @@
SprakProject/
chapter1/env
*.ipynb
+*.pyc
+chapter1/crawler/datas/comments/
+chapter1/crawler/datas/comments1/
+chapter1/crawler/datas/products/
diff --git a/chapter1/.idea/chapter1.iml b/chapter1/.idea/chapter1.iml
new file mode 100644
index 0000000..ee1e345
--- /dev/null
+++ b/chapter1/.idea/chapter1.iml
@@ -0,0 +1,22 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/chapter1/.idea/encodings.xml b/chapter1/.idea/encodings.xml
new file mode 100644
index 0000000..15a15b2
--- /dev/null
+++ b/chapter1/.idea/encodings.xml
@@ -0,0 +1,4 @@
+
+
+
+
\ No newline at end of file
diff --git a/chapter1/.idea/misc.xml b/chapter1/.idea/misc.xml
new file mode 100644
index 0000000..92d7ed4
--- /dev/null
+++ b/chapter1/.idea/misc.xml
@@ -0,0 +1,7 @@
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/chapter1/.idea/modules.xml b/chapter1/.idea/modules.xml
new file mode 100644
index 0000000..daf6423
--- /dev/null
+++ b/chapter1/.idea/modules.xml
@@ -0,0 +1,8 @@
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/chapter1/.idea/vcs.xml b/chapter1/.idea/vcs.xml
new file mode 100644
index 0000000..6c0b863
--- /dev/null
+++ b/chapter1/.idea/vcs.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/chapter1/.idea/workspace.xml b/chapter1/.idea/workspace.xml
new file mode 100644
index 0000000..238f824
--- /dev/null
+++ b/chapter1/.idea/workspace.xml
@@ -0,0 +1,661 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 日本可爱毛球熊猫束发带
+ 美式风
+ 雪
+ 皮床
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 1546928387088
+
+
+ 1546928387088
+
+
+ 1546936657398
+
+
+
+ 1546936657399
+
+
+ 1546938314632
+
+
+
+ 1546938314632
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/chapter1/crawler/datas/kind.json b/chapter1/crawler/datas/kind.json
deleted file mode 100644
index 7d187d1..0000000
--- a/chapter1/crawler/datas/kind.json
+++ /dev/null
@@ -1,1508 +0,0 @@
-[
- {
- "kind": "女装",
- "category": [
- "冬上新",
- "毛呢外套",
- "羽绒服",
- "毛衣",
- "裤子",
- "棉衣棉服",
- "毛针织衫",
- "连衣裙",
- "皮草",
- "短外套",
- "卫衣绒衫",
- "半身裙",
- "时尚套装",
- "牛仔裤",
- "大码女装",
- "T恤",
- "衬衫",
- "私服名媛",
- "OL丽人",
- "街头潮流",
- "日韩清新",
- "文艺复古",
- "原创设计",
- "休闲裤",
- "风衣",
- "牛仔外套",
- "长袖连衣裙",
- "阔腿裤",
- "西装",
- "长风衣",
- "蕾丝衫/雪纺衫",
- "大码外套",
- "婚纱礼服",
- "民族服装"
- ]
- },
- {
- "kind": "男装",
- "category": [
- "外套",
- "羽绒服",
- "棉衣",
- "品质好物",
- "夹克",
- "卫衣",
- "皮衣",
- "衬衫",
- "针织衫/毛衣",
- "毛呢大衣",
- "T恤",
- "POLO衫",
- "休闲裤",
- "牛仔裤",
- "风衣",
- "西装",
- "牛仔外套",
- "棒球服",
- "运动裤",
- "九分裤",
- "套装",
- "开衫",
- "马甲",
- "中老年",
- "情侣装",
- "大码",
- "民族风",
- "专柜大牌",
- "明星网红",
- "原创设计"
- ]
- },
- {
- "kind": "内衣",
- "category": [
- "法式内衣",
- "无钢圈内衣",
- "内裤女",
- "文胸",
- "内裤男",
- "长袖睡衣",
- "睡裙",
- "真丝睡衣",
- "丝袜",
- "船袜",
- "情侣睡衣",
- "抹胸",
- "背心",
- "睡袍",
- "男士睡衣",
- "塑身衣",
- "内衣套装",
- "打底裤",
- "连体睡衣",
- "聚拢文胸",
- "男士袜子",
- "棉袜女",
- "卡通睡衣",
- "无痕内裤",
- "少女文胸"
- ]
- },
- {
- "kind": "鞋靴",
- "category": [
- "流行女鞋",
- "雪地靴",
- "短靴",
- "长靴",
- "运动风",
- "加绒鞋",
- "毛毛鞋",
- "厚底鞋",
- "冬季热销",
- "高跟鞋",
- "中跟鞋",
- "平底鞋",
- "马丁靴",
- "切尔西靴",
- "袜靴",
- "圆头鞋",
- "尖头鞋",
- "方跟鞋",
- "低帮鞋",
- "高帮鞋",
- "瘦瘦靴",
- "一脚蹬",
- "懒人鞋",
- "男鞋",
- "休闲鞋",
- "板鞋",
- "帆布鞋",
- "运动风",
- "豆豆鞋",
- "乐福鞋",
- "雕花布洛克",
- "船鞋",
- "增高鞋",
- "正装商务",
- "户外休闲",
- "爸爸鞋",
- "德比鞋",
- "孟克鞋",
- "布鞋"
- ]
- },
- {
- "kind": "箱包",
- "category": [
- "女包",
- "骚包",
- "双肩包",
- "男包",
- "旅行箱",
- "钱包",
- "真皮包",
- "大牌",
- "宽肩带",
- "小方包",
- "水桶包",
- "迷你包",
- "链条包",
- "贝壳包",
- "波士顿包",
- "手拿包",
- "单肩包",
- "手提包",
- "斜挎包",
- "零钱包",
- "妈妈包",
- "欧美潮搭",
- "日韩流行",
- "青春学院",
- "男士商务",
- "雅痞休闲",
- "拉杆箱",
- "腰包",
- "胸包",
- "手工皮具",
- "红人优品"
- ]
- },
- {
- "kind": "配件配饰",
- "category": [
- "帽子",
- "贝雷帽",
- "渔夫帽",
- "鸭舌帽",
- "礼帽",
- "草帽",
- "爵士帽",
- "盆帽",
- "八角帽",
- "丝巾",
- "披肩",
- "真丝围巾",
- "棉麻围巾",
- "方巾",
- "手套",
- "真皮手套",
- "触屏手套",
- "半指手套",
- "全指手套",
- "真皮腰带",
- "腰带",
- "手工皮带"
- ]
- },
- {
- "kind": "童装玩具",
- "category": [
- "连衣裙",
- "保暖连体",
- "裤子",
- "羽绒",
- "居家睡衣",
- "针织",
- "帽子",
- "亲子装",
- "童鞋",
- "学步鞋",
- "女童运动鞋",
- "男童运动鞋",
- "毛毛虫童鞋",
- "雪地靴",
- "马丁靴",
- "长靴",
- "玩具",
- "积木",
- "毛绒玩具",
- "早教",
- "儿童自行车",
- "电动童车",
- "遥控模型",
- "户外玩具",
- "亲子玩具",
- "学习用品",
- "描红本"
- ]
- },
- {
- "kind": "孕产用品",
- "category": [
- "美妈大衣",
- "孕妇裤",
- "月子服",
- "哺乳文胸",
- "吸奶器",
- "防辐射",
- "孕妇内裤",
- "连衣裙",
- "待产包",
- "孕妇牛仔裤",
- "孕妇营养品",
- "防溢乳垫",
- "美德乐",
- "十月妈咪",
- "三洋",
- "Bravado",
- "新生儿",
- "婴儿床",
- "婴儿推车",
- "睡袋",
- "抱被",
- "隔尿垫",
- "学步车",
- "安抚奶嘴",
- "体温计",
- "纸尿裤",
- "花王",
- "洗衣液",
- "湿巾"
- ]
- },
- {
- "kind": "奶粉辅食",
- "category": [
- "爱他美",
- "羊奶粉",
- "特殊配方奶粉",
- "喜宝",
- "惠氏",
- "启赋",
- "牛栏",
- "美素佳儿",
- "贝因美",
- "雅培",
- "美赞臣",
- "可瑞康",
- "a2",
- "嘉宝",
- "美林",
- "米粉",
- "泡芙",
- "溶溶豆",
- "肉肠",
- "果肉条",
- "奶片",
- "益生菌",
- "维生素",
- "钙铁锌",
- "DHA",
- "宝宝食用油",
- "核桃油",
- "葡萄糖",
- "宝宝调料",
- "奶瓶",
- "餐具",
- "餐椅",
- "暖奶器"
- ]
- },
- {
- "kind": "家电",
- "category": [
- "淘宝速达",
- "实体商场服务",
- "淘火炬品牌",
- "生活电器",
- "厨房电器",
- "个人护理",
- "空气净化器",
- "扫地机器人",
- "吸尘器",
- "取暖器",
- "烤箱",
- "豆浆机",
- "榨汁料理",
- "电饭煲",
- "吹风机",
- "足浴盆",
- "剃须刀",
- "卷发器",
- "按摩器材",
- "冬季火锅",
- "蓝牙耳机",
- "电暖桌",
- "蓝牙音箱",
- "电热毯",
- "加湿器",
- "暖风机"
- ]
- },
- {
- "kind": "数码",
- "category": [
- "淘宝速达",
- "淘宝火炬品牌",
- "实体商场服务",
- "2小时送货服务",
- "surface平板电脑",
- "苹果/Apple",
- "iPad",
- "Pro",
- "电脑主机",
- "数码相机",
- "电玩动漫",
- "单反相机",
- "华为",
- "MateBook",
- "IPAD",
- "mini4",
- "游戏主机",
- "鼠标键盘",
- "无人机",
- "二手数码",
- "二手手机",
- "二手笔记本",
- "二手平板电脑"
- ]
- },
- {
- "kind": "手机",
- "category": [
- "iPhone",
- "xs",
- "iPhone",
- "xs",
- "max",
- "iPhone",
- "xr",
- "华为Mate20P",
- "小米MIX3",
- "荣耀Magic2",
- "一加6T",
- "黑鲨2代",
- "努比亚X",
- "iPhone",
- "X",
- "iPhone",
- "8",
- "OPPO",
- "vivo",
- "华为P20",
- "小米",
- "魅族",
- "二手手机",
- "手机以旧换新"
- ]
- },
- {
- "kind": "美妆",
- "category": [
- "面膜",
- "洁面",
- "防晒",
- "爽肤水",
- "眼霜",
- "乳液",
- "面霜",
- "精华",
- "卸妆",
- "男士护肤",
- "眼线",
- "粉底液",
- "BB霜",
- "隔离",
- "睫毛膏",
- "彩妆盘",
- "唇膏",
- "腮红",
- "香水",
- "精油",
- "身体护理",
- "丰胸",
- "纤体",
- "脱毛",
- "海外直邮"
- ]
- },
- {
- "kind": "个人护理",
- "category": [
- "洗发水",
- "护发素",
- "发膜",
- "头发造型",
- "染发膏",
- "烫发水",
- "假发",
- "沐浴露",
- "私处护理",
- "身体乳液",
- "牙膏",
- "牙刷",
- "漱口水",
- "足浴",
- "足贴",
- "洗手液",
- "卫生巾",
- "成人纸尿裤",
- "抽纸",
- "卷纸",
- "洗衣液",
- "清洁剂",
- "厨房清洁",
- "家私/皮具护理",
- "香薰"
- ]
- },
- {
- "kind": "营养保健",
- "category": [
- "B族维生素",
- "葡萄籽",
- "辅酶Q10",
- "消化酶",
- "软骨素",
- "维生素C",
- "钙",
- "大豆异黄酮",
- "益生菌",
- "鱼油",
- "氨基葡萄糖",
- "葡萄籽",
- "生物素",
- "玛咖(玛卡)",
- "酵素",
- "螺旋藻",
- "胶原蛋白",
- "月见草油",
- "DHA",
- "蔓越莓",
- "左旋肉碱",
- "褪黑素",
- "锯棕榈"
- ]
- },
- {
- "kind": "珠宝",
- "category": [
- "琥珀蜜蜡",
- "翡翠手镯",
- "钻戒",
- "铂金",
- "黄金首饰",
- "高端定制",
- "彩色宝石",
- "珍珠",
- "金镶玉",
- "钻石",
- "K金首饰",
- "岫岩玉雕",
- "和田籽料拍卖",
- "裸石",
- "翡翠玉石",
- "一元起拍",
- "设计师",
- "珠宝首饰",
- "金条",
- "情侣对戒",
- "琥珀原石",
- "老坑冰种拍卖"
- ]
- },
- {
- "kind": "眼镜",
- "category": [
- "眼镜架",
- "3D眼镜",
- "司机镜",
- "防辐射眼镜",
- "老花镜",
- "儿童镜",
- "色盲眼镜",
- "无框眼镜",
- "眼镜片",
- "依视路",
- "雷朋",
- "复古眼镜",
- "超轻眼镜",
- "护目镜",
- "眼镜配件",
- "滑雪镜",
- "超耐磨",
- "GM眼镜",
- "配镜服务"
- ]
- },
- {
- "kind": "手表",
- "category": [
- "运动表",
- "卡西欧",
- "国表",
- "时尚表",
- "女表",
- "儿童表",
- "学生表",
- "浪琴",
- "斯沃琪表",
- "镂空机械表",
- "皮带表",
- "钢带表",
- "欧米茄",
- "电子表",
- "陶瓷表",
- "瑞士表",
- "手表放心淘",
- "日韩腕表",
- "情侣表",
- "光能表",
- "怀表",
- "表带",
- "手表配件",
- "休闲",
- "精钢",
- "复古手表",
- "中性手表",
- "帆布表带",
- "深度防水"
- ]
- },
- {
- "kind": "运动",
- "category": [
- "Yeezy",
- "350",
- "Alpha",
- "Bounce",
- "AJ30",
- "Stan",
- "Smith",
- "大Air皮蓬",
- "KD9",
- "Kayano23",
- "Sock",
- "Dart",
- "Hyperdunk",
- "耐克",
- "阿迪达斯",
- "New",
- "Balance",
- "亚瑟士",
- "Under",
- "Armour",
- "匡威",
- "彪马",
- "VANS",
- "锐步",
- "斯凯奇",
- "美津浓",
- "李宁",
- "跑鞋",
- "篮球鞋",
- "复古休闲",
- "健身",
- "足球",
- "羽毛球"
- ]
- },
- {
- "kind": "户外健身",
- "category": [
- "鱼线",
- "鱼线轮",
- "户外鞋",
- "登山包",
- "帐篷",
- "睡袋",
- "望远镜",
- "皮肤衣",
- "速干衣",
- "速干裤",
- "手电筒",
- "山地车",
- "公路车",
- "骑行服",
- "护具",
- "军迷用品",
- "舞蹈体操",
- "羽毛球",
- "游泳",
- "瑜伽",
- "跑步机",
- "健身器",
- "烧烤架",
- "休闲鞋",
- "冲锋裤",
- "单车零件",
- "骑行装备",
- "遮阳棚",
- "户外手表",
- "户外风衣",
- "军迷套装",
- "战术鞋"
- ]
- },
- {
- "kind": "乐器",
- "category": [
- "全新钢琴",
- "二手钢琴",
- "电钢琴",
- "电子琴",
- "萨克斯",
- "尤克里里",
- "架子鼓",
- "小提琴",
- "口琴",
- "手卷钢琴",
- "古筝",
- "古琴",
- "二胡",
- "葫芦丝",
- "陶笛",
- "琵琶",
- "笛子",
- "非洲鼓",
- "贝斯",
- "调音器",
- "节拍器",
- "电吉他",
- "电箱吉他",
- "乐器音箱",
- "电子鼓",
- "手风琴",
- "大提琴",
- "合成器",
- "乐器租赁"
- ]
- },
- {
- "kind": "游戏",
- "category": [
- "DNF",
- "梦幻西游",
- "魔兽",
- "LOL",
- "坦克世界",
- "剑网3",
- "魔域",
- "DOTA2",
- "街头篮球",
- "CF",
- "天龙八部",
- "大话西游2",
- "三国争霸",
- "YY",
- "劲舞团",
- "倩女幽魂",
- "天下3",
- "反恐精英",
- "冒险岛",
- "问道",
- "逆战",
- "大唐无双",
- "征途2",
- "九阴真经",
- "龙之谷",
- "热血江湖",
- "剑灵"
- ]
- },
- {
- "kind": "动漫周边",
- "category": [
- "手办",
- "高达模型",
- "秦时明月",
- "大圣归来",
- "海贼王",
- "圣斗士",
- "火影忍者",
- "LOVELIVE!",
- "银魂",
- "进击的巨人",
- "从零开始的异世界生活",
- "最终幻想",
- "复仇者联盟",
- "钢铁侠",
- "魔兽世界",
- "守望先锋",
- "英雄联盟",
- "剑三",
- "全职高手",
- "盗墓笔记",
- "COS女装",
- "COS男装",
- "漫展门票",
- "在下坂本有何贵干"
- ]
- },
- {
- "kind": "热门影视周边",
- "category": [
- "太阳的后裔",
- "琅琊榜",
- "太子妃升职记",
- "哈利波特",
- "星球大战",
- "蝙蝠侠",
- "盗墓笔记",
- "神探夏洛克",
- "古剑奇谭",
- "花千骨",
- "EXO",
- "欢乐颂",
- "少女时代",
- "BIGBANG",
- "TFBOYS",
- "东方神起",
- "AKB48",
- "岚ARASHI",
- "SHINHWA神话"
- ]
- },
- {
- "kind": "美食",
- "category": [
- "牛奶",
- "柚子茶",
- "酸梅汤",
- "矿泉水",
- "酵素",
- "藕粉",
- "大米",
- "小米",
- "黄豆",
- "火腿",
- "香肠",
- "木耳",
- "枸杞",
- "人参",
- "石斛",
- "雪蛤",
- "蜂蜜",
- "天麻花粉",
- "铁观音",
- "红茶",
- "花草茶",
- "龙井",
- "黑茶",
- "绿茶",
- "鸡尾酒",
- "精酿啤酒",
- "洋酒",
- "红酒"
- ]
- },
- {
- "kind": "生鲜",
- "category": [
- "荔枝",
- "水果",
- "百香果",
- "芒果",
- "小龙虾",
- "樱桃",
- "榴莲",
- "杨梅",
- "牛排",
- "柠檬",
- "海参",
- "水蜜桃",
- "咸鸭蛋",
- "李子",
- "桃子",
- "龙虾",
- "苹果",
- "黄桃",
- "火龙果",
- "波罗蜜",
- "山竹",
- "蓝莓",
- "鸡胸肉",
- "猕猴桃",
- "三文鱼",
- "红薯",
- "车厘子",
- "海鲜"
- ]
- },
- {
- "kind": "零食",
- "category": [
- "冰皮月饼",
- "零食大礼包",
- "牛肉干",
- "面包",
- "辣条",
- "红枣",
- "核桃",
- "饼干",
- "巧克力",
- "葡萄干",
- "芒果干",
- "绿豆糕",
- "薯片",
- "锅巴",
- "海苔",
- "月饼",
- "蛋黄酥",
- "猪肉脯",
- "花生",
- "长沙臭豆腐",
- "瓜子",
- "棒棒糖",
- "糖果",
- "手撕面包",
- "豆干",
- "罗汉果",
- "开心果",
- "山楂",
- "全麦面包",
- "早餐",
- "腰果",
- "压缩饼干"
- ]
- },
- {
- "kind": "工具",
- "category": [
- "电钻",
- "气钉枪",
- "电锯",
- "抛光机",
- "磨砂机",
- "修边机",
- "焊接设备",
- "扳手",
- "钳子",
- "锉、刨",
- "打磨机",
- "角磨机",
- "工具组套",
- "电锯片",
- "电锤",
- "云石片",
- "电动螺丝批",
- "测电笔",
- "电子钳",
- "电烙铁",
- "电子焊接工具套装",
- "接线板/插头",
- "断路器",
- "开关",
- "布线箱",
- "转换器",
- "断路器",
- "插座",
- "底盒",
- "电线",
- "电工套管",
- "监控器材及系统",
- "防盗报警器材及系统",
- "楼宇智能系统"
- ]
- },
- {
- "kind": "装修",
- "category": [
- "全包",
- "半包",
- "免费设计",
- "全套设计",
- "优质装修公司",
- "样板",
- "小户型",
- "美式风",
- "宜家风",
- "集成吊顶"
- ]
- },
- {
- "kind": "建材",
- "category": [
- "建材优品",
- "智能马桶",
- "客厅灯",
- "LED灯泡",
- "实木地板",
- "强化地板",
- "仿古砖",
- "花砖",
- "马赛克",
- "玻化砖",
- "浴室柜",
- "花洒",
- "浴室柜套装",
- "水槽",
- "台上盆",
- "毛巾杆",
- "普通马桶",
- "龙头",
- "浴缸",
- "墙纸",
- "壁纸",
- "墙布",
- "背景墙",
- "指纹锁",
- "防盗锁",
- "监控摄像头",
- "开关插座",
- "无线摄像头",
- "门",
- "榻榻米"
- ]
- },
- {
- "kind": "家具",
- "category": [
- "沙发",
- "床",
- "高低床",
- "餐桌",
- "床垫",
- "茶几",
- "电视柜",
- "衣柜",
- "鞋柜",
- "椅凳",
- "书桌",
- "电脑桌",
- "坐具",
- "现代简约",
- "美式家具",
- "北欧家具",
- "中式家具",
- "儿童家具",
- "真皮沙发",
- "布艺沙发",
- "皮床",
- "实木床",
- "儿童床",
- "乳胶床垫",
- "儿童学习桌",
- "书架",
- "花架",
- "椅子",
- "电脑椅",
- "佛山家具"
- ]
- },
- {
- "kind": "布艺软饰",
- "category": [
- "窗帘",
- "地毯",
- "沙发垫",
- "十字绣",
- "桌布",
- "地垫",
- "抱枕",
- "坐垫",
- "飘窗垫",
- "门帘",
- "缝纫机",
- "洗衣机罩",
- "卷帘",
- "珠帘",
- "沙发巾",
- "靠垫",
- "空调罩",
- "餐桌布",
- "门垫",
- "浴室防滑垫",
- "茶几桌布",
- "桌垫",
- "装饰画",
- "摆件",
- "照片墙",
- "相框",
- "墙贴",
- "花瓶",
- "壁纸",
- "挂钟",
- "仿真花",
- "油画",
- "客厅装饰画",
- "玻璃贴纸",
- "香炉",
- "玻璃花瓶",
- "相框挂墙",
- "数字油画",
- "假花",
- "画框",
- "干花",
- "挂画",
- "钟",
- "音乐盒",
- "倒流香炉"
- ]
- },
- {
- "kind": "床上用品",
- "category": [
- "夏凉被",
- "草席",
- "床褥",
- "U型枕",
- "蚊帐",
- "凉席",
- "天丝套件",
- "贡缎套件",
- "提花套件",
- "婚庆套件",
- "儿童套件",
- "空调被",
- "儿童床品",
- "麻将凉席",
- "四件套",
- "毛巾被",
- "记忆枕",
- "老粗布",
- "床垫",
- "婚庆床品",
- "床笠",
- "蒙古包蚊帐",
- "空调毯",
- "枕头",
- "宫廷蚊帐",
- "牛皮席",
- "冰丝席",
- "竹席",
- "藤席",
- "床单",
- "四件套",
- "全棉套件",
- "磨毛四件套",
- "保暖套件",
- "婚庆",
- "婚庆套件",
- "儿童套件",
- "儿童床品",
- "贡缎套件",
- "提花套件",
- "被子",
- "蚕丝被",
- "羽绒被",
- "厚被",
- "冬被",
- "七孔被",
- "保暖被",
- "枕头",
- "乳胶枕",
- "羽绒枕",
- "记忆枕",
- "床褥",
- "床单被罩",
- "毛毯",
- "毯子",
- "绒毯"
- ]
- },
- {
- "kind": "品质汽车",
- "category": [
- "买车送油卡",
- "v60一口价39.99万",
- "首付一成开新车",
- "超级试驾",
- "新能源车",
- "轿车",
- "SUV",
- "小型车",
- "玛莎拉蒂",
- "沃尔沃",
- "荣威",
- "别克",
- "雪佛兰",
- "日产",
- "本田",
- "起亚",
- "标致",
- "奇瑞",
- "海马",
- "宝马新1系",
- "smart",
- "Mini",
- "荣威",
- "本田",
- "天猫养车",
- "4S保养",
- "上门保养",
- "镀晶服务",
- "打蜡服务",
- "空调清洗"
- ]
- },
- {
- "kind": "二手车",
- "category": [
- "司法车拍卖",
- "公车拍卖",
- "二手车卖场",
- "二手车拍卖",
- "汽车估价",
- "车秒拍",
- "大众",
- "宝马",
- "奥迪",
- "丰田",
- "奔驰",
- "本田",
- "别克",
- "福特",
- "马自达",
- "雪佛兰",
- "3万以下",
- "3-5万",
- "5-10万",
- "10-20万",
- "20-30万",
- "30-40万",
- "40万以上",
- "SUV",
- "MPV",
- "跑车",
- "越野车",
- "玛莎拉蒂特价车"
- ]
- },
- {
- "kind": "汽车用品",
- "category": [
- "车载空气净化器",
- "脚垫",
- "夏季坐垫",
- "后备箱垫",
- "座套",
- "安全座椅",
- "香水",
- "记录仪",
- "手机支架",
- "车载导航",
- "安全预警仪",
- "后视镜导航",
- "机油",
- "燃油宝",
- "轮胎",
- "贴膜",
- "车载吸尘器",
- "镀晶",
- "车蜡",
- "洗车机",
- "轮胎报警器",
- "车充",
- "氙气灯",
- "雨刮",
- "空调滤芯",
- "大灯总成",
- "车挂",
- "安全锤",
- "应急工具"
- ]
- },
- {
- "kind": "办公",
- "category": [
- "打印机",
- "一体机",
- "复合机",
- "3D打印机",
- "投影机",
- "收银机",
- "收银纸",
- "电子面单机",
- "考勤门禁",
- "保险箱",
- "会议白板",
- "安防摄像",
- "无线网卡",
- "WiFi放大器",
- "无线呼叫器",
- "格子间",
- "电脑桌",
- "办公椅",
- "理线器",
- "计算器",
- "荧光告示贴",
- "翻译笔",
- "毛笔",
- "马克笔",
- "文件收纳",
- "本册",
- "书写工具",
- "文具",
- "画具画材",
- "钢笔",
- "中性笔",
- "财会用品",
- "碎纸机",
- "包装设备"
- ]
- },
- {
- "kind": "DIY",
- "category": [
- "定制T恤",
- "文化衫",
- "工作服",
- "卫衣定制",
- "LOGO设计",
- "VI设计",
- "海报定制",
- "3D效果图制作",
- "广告扇",
- "水晶奖杯",
- "胸牌工牌",
- "奖杯",
- "徽章",
- "洗照片",
- "照片冲印",
- "相册/照片书",
- "软陶人偶",
- "手绘漫画",
- "纸箱",
- "搬家纸箱",
- "胶带",
- "标签贴纸",
- "二维码贴纸",
- "塑料袋",
- "自封袋",
- "快递袋",
- "气泡膜",
- "编织袋",
- "飞机盒",
- "泡沫箱",
- "气柱袋",
- "纸手提袋",
- "打包绳带",
- "气泡信封",
- "缠绕膜"
- ]
- },
- {
- "kind": "五金/电子",
- "category": [
- "物联网市场",
- "万用表",
- "电动螺丝刀",
- "管钳子",
- "电钻",
- "无尘锯",
- "电焊机",
- "角磨机",
- "切割机",
- "发电机",
- "快排阀",
- "增压泵",
- "钢珠",
- "测距仪",
- "水平仪",
- "传感器",
- "电容器",
- "变压器",
- "单片机开发板",
- "智能小车",
- "机器人套件",
- "3D打印耗材",
- "GPS",
- "蓝牙",
- "LED灯珠",
- "树莓派"
- ]
- },
- {
- "kind": "百货",
- "category": [
- "保温杯",
- "饭盒",
- "玻璃杯",
- "马克杯",
- "餐具套装",
- "碗",
- "盘",
- "碟",
- "茶具套装",
- "茶杯",
- "茶壶",
- "茶具",
- "陶瓷餐具",
- "拖鞋",
- "雨伞雨具",
- "口罩",
- "垃圾桶",
- "居家鞋",
- "省力拖把",
- "家务清洁",
- "垃圾袋",
- "梳子",
- "抹布",
- "围裙",
- "拖把",
- "浴帘",
- "浴室置物架",
- "拖把桶旋转",
- "镜子",
- "铸铁锅",
- "炒锅",
- "饮具",
- "心机小物",
- "厨房置物架",
- "密封罐",
- "潮州陶瓷",
- "景德镇陶瓷",
- "厨用小工具",
- "刀具砧板",
- "烧烤烘培"
- ]
- },
- {
- "kind": "餐厨",
- "category": [
- "收纳整理",
- "收纳箱",
- "儿童收纳柜",
- "压缩袋",
- "衣柜整理",
- "鞋柜",
- "布艺软收纳",
- "浴室收纳",
- "置物架",
- "强力不粘钩",
- "厨房收纳",
- "桌面收纳",
- "壁挂收纳",
- "旅行收纳",
- "化妆包",
- "购物车",
- "环保袋",
- "野餐蓝",
- "药箱药盒",
- "衣物洗晒",
- "粘毛剪球",
- "脏衣篮",
- "木制衣架",
- "大型晾晒架",
- "裤架",
- "儿童衣架",
- "柳编",
- "ZAKKA风",
- "原生态",
- "棉麻风",
- "纸质收纳"
- ]
- },
- {
- "kind": "家庭保健",
- "category": [
- "创可贴",
- "消毒用品",
- "体温计",
- "冷敷降温",
- "急救箱",
- "医用口罩",
- "绷带纱布",
- "血压监测",
- "血糖监测",
- "心率监测",
- "呼吸制氧",
- "拐杖",
- "轮椅",
- "助行器",
- "矫正牵引",
- "医用床上护理",
- "拔罐"
- ]
- },
- {
- "kind": "学习",
- "category": [
- "英语四级",
- "2018考研",
- "成人学历",
- "小学教学",
- "雅思托福",
- "学历提升",
- "会计提升",
- "ps美工技能",
- "考研辅导",
- "外教口语课",
- "建造师",
- "口语一对一",
- "驾照报名",
- "汽车维修",
- "化妆课程",
- "电商培训",
- "少儿英语",
- "公务员考试",
- "中小学辅导",
- "宝宝早教",
- "健身减肥",
- "DIY手工",
- "微信小程序",
- "JAVA",
- "CAD教程",
- "驾校学车"
- ]
- },
- {
- "kind": "卡券票",
- "category": [
- "劳动节福利",
- "超市卡",
- "沃尔玛",
- "家乐福",
- "银泰卡",
- "面包券",
- "来伊份券",
- "粽子券",
- "熟食/半成品",
- "星巴克",
- "咖啡",
- "哈根达斯",
- "冰淇淋",
- "网站建设",
- "云服务器",
- "财务管理",
- "网页设计",
- "软件"
- ]
- },
- {
- "kind": "本地服务",
- "category": [
- "婚纱摄影",
- "青岛婚拍",
- "丽江婚拍",
- "三亚婚拍",
- "厦门婚拍",
- "新娘跟妆",
- "婚礼司仪",
- "婚车租赁",
- "婚礼策划",
- "婚宴预订",
- "婚纱礼服",
- "礼服租赁",
- "家电清洗",
- "家庭保洁",
- "搬家搬运",
- "在线洗衣",
- "上门养车",
- "跑腿代办",
- "名企招聘",
- "上门美甲",
- "入职体检",
- "法律咨询",
- "上门按摩",
- "专业翻译"
- ]
- }
-]
\ No newline at end of file
diff --git a/chapter1/crawler/taobao/__pycache__/__init__.cpython-37.pyc b/chapter1/crawler/taobao/__pycache__/__init__.cpython-37.pyc
index 05cca0864fc5a3b1cd87ba5d2ba527d3237c32e1..3d5983580d6ee8625648a0110c9548857f043323 100644
GIT binary patch
delta 75
zcmbQhIF*sdiIEkvFBpM5Ok&}VuYSRhobq6P{>d?Mq#?_
z0h}F!VVx-N>W5@ot9vI~fF`JCZ=iE1dpK7--_14Vu^Gv=huWX3%7tn3t<(rXO0IT2!o`UYeMf
zo~mD-UzDw1P+5|ZpQoQ(lvtjVTBKi9<%*GD~t&Z?P5>WhSTIVoNLqnp-saKckPn5YSB=
mjOeh411PM?Qp5?QZn1!jD`EmNi#R5)XIA520qJ2DVgmr=4mnW(
diff --git a/chapter1/crawler/taobao/__pycache__/middlewares.cpython-37.pyc b/chapter1/crawler/taobao/__pycache__/middlewares.cpython-37.pyc
index 56dd606a13c015ee6cca12491759ff1ebae96624..a0323b1ab08474ed549198bc4f6f30ecfff7493e 100644
GIT binary patch
delta 168
zcmeyV_)L-4iIPs&VBNi0d!
zPtHgzC`m0c)K4x-EYC?T(l1HOPfE<+EYGx>lQClQX>NJ@6y_9`6xMX+TBZ_)Ean=<
z8m1bCES40uUgjExcvc{rbpcxn`$9%Wh7xuVFNLESs3M*tg&~+hlhbdrJx?MdhXBY)
Nj2ujxSMmBY0|4awE%E>W
delta 187
zcmaE+_*0SBiI1hIm#Wn{@$O3I|AK2|I|F!r2T|5zmpr5X_*-U-l&;@z~YDj4ld=8ea1-uJ!ldG=nf*CEi}{Qk|e
zKNul@qOoXTlU?ZQ7Z^C(RcZ}_F`j~i|f59na;8A_%hegTi+S(=Lk*B6b#WEZ;X!XOC7
z5nASbOZR^|JpSguv$Y#Vc{~wOq&rbGDfl$Qc{hq)OyjJ6@*9zD@Mea;zYm@$AyqJ%
z#`#DD2Zekd4AT*ht2jvZmZTwK_+BS#lJ0DWS^Uu57ubcN}eBXe|MLI04*`;Gg`pspU~lLz~|~~KdK^snpXP{@}%HtKDwU|vtqQ8h`|8<@E3zL
z1YLqj1+tyfG2H^WqDWuca}rscfLmuUyWsl){9$rca9#OhI7ZWhnKP~S?tBYUbj8tu
zNN!PzT(_T#Ll6bmY%k4(XGyFdiE3=nVVK^;{ZXBTZFs&Q`qn96{Kk6;W*i&RkKF*f
z0h5Z1X=TmnOUfy?UIVyo@+*KF+i%(2en)Ao4K{cPFuEdFQz@cYC26W@$8$6@SRFo{
zlu9$6CY8JZZnbkDK^c&6gKHC(3^>iO6i+}jD!2$+6l#LdE!w5C+o!>6Dl2Iw4(A5n
z8JGYP@|L~>xtnCs0Z?5Bm}G@|FYa^T=%EpM$dn^>!a*$^s%w<8y
zK3JYDvBGr+GRdb&6{Xyy<~_wAk+?6QFEX*Nk`rJ)eig`1;e3gy@f?W-DL=yj^=rbH
z_`O&YzPyajm-mDDh#Cca3F}SWuWRchq7M_kyn^BV_xDDkB5z_)rl#o3KTR+GM^O7!
zIB*9KZKxgBybVLKwo5(?w-sEN<)Od2CsT97ON`W9kh)b2#{#}VcwLxsk?5tt`?A2a
Xd$+#Ezr!W-e*jil*Y2~v)9?HPwJhJI
delta 403
zcmZvXPfEi;6vi`6nnX=%yQ)`+NV5{86c^&gjT;f62of{Tb~MQhZ&GXAc!t(ncnZ(p
za&F)iTsf~+yK!KC@cw?^`{=#5J123x?bzo<@%0;9boRO<8*zuT!YSF9F=sJ6==0#h
zNy1NROqdBzQL3{V%Ej{WDzrHyd=1k}4XD$}EUPAv&Luvis;Q^4N^{KS1z`F-71P<8
zN+>8q1$v+w^z4Ei`*q7AWazze#*b)i;u=pwac4YX6^*Zz;D9T|A8gqs6DgEAXnjLX
zyUW)8OWUT=Cj^8J;m_-466ty>{wh;^o&=_+t!dQ^;3)@H3t4F-Z{z+tzp*dMYyuix
jTV35@e0h6+2zkBQ{hz1_N2TPm0*=X>CYKS5+^+i#UGrYK
diff --git a/chapter1/crawler/taobao/__pycache__/settings.cpython-37.pyc b/chapter1/crawler/taobao/__pycache__/settings.cpython-37.pyc
index 48b993e4af838a53f6cf6a52cede88d3e98d48c8..8ab276f7071bb5ce96154e2d865c0b5be6c2bee6 100644
GIT binary patch
delta 124
zcmdnVa)^c3iIRl!-1;TC1^N*F
zWNpUv#&-Il#i>Qb`st;KdFiS8<@rU~`bnATDTyVC`pFrI1tqCPhWg1xiRC$|MKFCF
GOojlfkSXy1
delta 144
zcmX@avXh0^iIQb`st;KdFiS8<@rU~`URCG
W8Tono$wi6fIjKbu#ZWDROojk4b}=CU
diff --git a/chapter1/crawler/taobao/items.py b/chapter1/crawler/taobao/items.py
index 3dc9563..fba9542 100644
--- a/chapter1/crawler/taobao/items.py
+++ b/chapter1/crawler/taobao/items.py
@@ -26,3 +26,13 @@ class ProductItem(scrapy.Item):
url = scrapy.Field()
detail = scrapy.Field()
comment_num = scrapy.Field()
+
+
+class CommentItem(scrapy.Item):
+ # Aggregated comment data for one product: summary counters plus the
+ # list of individual comments collected by the comment spider.
+ # Product id the comments belong to.
+ id = scrapy.Field()
+ # Total number of comments reported by the comment-list summary.
+ total_comment_num = scrapy.Field()
+ # Summary counter "total_crazy_count" (the spider labels it as positive reviews).
+ total_crazy_count = scrapy.Field()
+ # Summary counter "total_indifferent_count" (labelled as neutral reviews).
+ total_indifferent_count = scrapy.Field()
+ # Summary counter "total_detest_count" (labelled as negative reviews).
+ total_detest_count = scrapy.Field()
+ # Number of comment pages reported by the summary ("pageCount").
+ page_count = scrapy.Field()
+ # List of per-comment dicts (score, desc, date, headimg, username, userlevel).
+ comments = scrapy.Field()
diff --git a/chapter1/crawler/taobao/merge.py b/chapter1/crawler/taobao/merge.py
new file mode 100644
index 0000000..5dc8d3a
--- /dev/null
+++ b/chapter1/crawler/taobao/merge.py
@@ -0,0 +1,48 @@
+"""
+商品数据合并
+"""
+import json
+
+
+def pretty(word):
+ """
+ 去掉问题之类的
+ :param word:
+ :return:
+ """
+ if word is not None:
+ return word.replace('"', "")
+ return None
+
+
+def main():
+ wf = open('../datas/products/data.csv', 'w')
+ with open('../datas/kind.json', 'r') as f:
+ kinds = json.load(f)
+ for kind in kinds:
+ for keyword in kind["category"]:
+ # print("大类 %s" % kind["kind"])
+ print("小类 %s" % keyword)
+ try:
+ ff = open('../datas/%s.json' % keyword, 'r')
+ for line in ff:
+ data = json.loads(line)
+ fline = '%s,%s,%s,"%s","%s","%s",%s,"%s",%s,%s' % (data["id"],
+ kind["kind"],
+ keyword,
+ pretty(data["title"]),
+ data["price"],
+ pretty(data["shop"]),
+ data["pubtime"],
+ pretty(data["publisher"]),
+ pretty(data["url"]),
+ data["comment_num"])
+ wf.write(fline + "\n")
+ ff.close()
+ except Exception as e:
+ print(e)
+ wf.close()
+
+
+if __name__ == '__main__':
+ main()
diff --git a/chapter1/crawler/taobao/pipelines.py b/chapter1/crawler/taobao/pipelines.py
index 3ef6e34..75a0ce2 100644
--- a/chapter1/crawler/taobao/pipelines.py
+++ b/chapter1/crawler/taobao/pipelines.py
@@ -28,3 +28,13 @@ class JsonWithEncodingCnblogsPipeline(object):
return item
def spider_closed(self, spider):
self.file.close()
+
+class JsonWithEncodingCommentsPipeline(object):
+ def open_spider(self, spider):
+ self.file = open('../../datas/comments/%s.json' % spider.product_id, 'w', encoding='utf-8')
+ def process_item(self, item, spider):
+ line = json.dumps(dict(item), ensure_ascii=False) + "\n"
+ self.file.write(line)
+ return item
+ def spider_closed(self, spider):
+ self.file.close()
diff --git a/chapter1/crawler/taobao/spiders/__pycache__/__init__.cpython-37.pyc b/chapter1/crawler/taobao/spiders/__pycache__/__init__.cpython-37.pyc
index 6daed034fa1308c27bb13ad91dec2471790e6d01..abd3b1fe103da8d000ca3ea1a2dd6a8f5657b896 100644
GIT binary patch
delta 75
zcmbQiIG2&fiI7@YtB
delta 70
zcmbQsID?VLiImbXJJliIS+E7p##RD--FS8^yx41|Oq>^beJExu?3y@jN0VFt>
KI2bt?d6)s&vprY<
delta 215
zcmX>rbWDibiI7|Kz>8bkV`9<0K1(hWk`FZ-uMTzA(sYUu#8JQWS5OOj<
zvjU^yW&`Hu%xtV+@yT-RZj4crQ`zl-
diff --git a/chapter1/crawler/taobao/spiders/__pycache__/httpbin.cpython-37.pyc b/chapter1/crawler/taobao/spiders/__pycache__/httpbin.cpython-37.pyc
index 0a0d2a6d72283e0d2cb2c2350f03e42abdbb9430..be4b2b62507248a7e1d0aefb81e520582371e118 100644
GIT binary patch
delta 77
zcmbQoznGufiIAn7^5qWg;^GkS7^c
delta 72
zcmZ3?KaZc=iI$WCj2UhZmOs
diff --git a/chapter1/crawler/taobao/spiders/__pycache__/meitu.cpython-37.pyc b/chapter1/crawler/taobao/spiders/__pycache__/meitu.cpython-37.pyc
deleted file mode 100644
index 7997bc07f1fb693f563d5266184f2213bec679a7..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001
literal 4891
zcmd5=NpBm;74D5>lUlux9Vb>CI~h*gT(oHAyYwWabheP1Nh!`>kKS;@yFP
zoCMF;^}(ay2o_T86AhUl*)qEmUZ|CMSE5c#o`&+fzbq?NoJXIAO+
z7UdyS`yYf2Zb*8^z-G_I9DyU33D^qYjE$oSH6Ajb5LgjJ3#eO3~j(+vOXjcDZz;
zJTX32o+yt_PPNBx;8^?m=#6D}e8;)h++42TyZiamq^v(B{`&n1-<~%Y?DZAwpI?1?
zkFV7yw&)it&Gm(Q3rp+Eyy(?X=7q^dc0H`wM&&sWnyBqa(%+yn55HF&c@jE__Xaf&tq8pgKklVIE6ZbWCwW0RBDrY5JOD~3_=T)%E`
z8iuZ4u2R0>+ZB(JnGM_HG&*G%USJa=8S8YOFbv}Iy5ZPPjn0IuMWc@l!>#%O
zqXy@mK^Y4e7bAH!9!ykY1tI`=+z-0javf^ftjgmdxK4w%$9^)}*6kH^Ogt80sM3)Y?0`;Y%3u?sNw{q3V>-ws&4*nAPz
zz}+2YZ+ny#MJ#kE7tzCuB6YqmzgEOD6Q39XUgTA&Bv5AYJZuK0*HGrDr2AW$07d
z@>vPF4xt7kDpB7mYry;S;q`=nZ6zAwq0K^zCCJK0qG8c7(&-qv$^kqxP7u^x%JUlg
zDSR)^C5u|j;|#fBKL^qkQ-$v1G7cX7N*<aIpanD4B0#zm;VC_oFs`5WmQc|*a#e=P$0Lwl@C&qp)}`Ff5u
zWzd%_J(9kJzAvDxOQDWyHnW1}@GN+%f-mSR(Pc)%7V}3);E13pR08s%z#Rr~{eZgF
zT6l9Gg@}p?Qgnu%1Rs~K{g*}gn@3cD;-5klpkp7f|3<27iA(+nHo#WGK?f61pcXLc
zHsN3u-FYSM2gGseR!wj%-1m^j>bE?*Q6cv2DP#kR6qS$IPe^RF{Lu9T6i`yQN_vSW63$Jv
zm9vsKiEv(8oMYWF2!OiAt~`>SUjg^Tb4%Jx3!1@<$|MW+E{`TonD>4jh_6d-@e$0OC*Enq?X@VJ&q2d>{aC|u
zKYCXz5_jvXS;wgVb!%p74Ih_rbTpk*3NE^oo
z<^n&2Xk3J$kz^079-p=9IBV06uzZZBy$qxZ(f8~EW`;znXPE{{hr@kO#@Zj0=Lv?!8Y&x3
zzT~9i+iq9`R8$J3SaloKWA6Drcgq7MK*HX)zUI1gzhCJO@Ilha16
z(lE|pTAAb%B5hzLm=vdW&^O}@tYo$m=2HR`Vx4uac%}ww=W(jW=ms}4wqS=JvqL<1
zRLNh$&K-cr$^FVjc~I7sK?Nv4#Ne@{quvfv5
z1)qY;BLp(`s?aL}Jxmj|4UlnOI)}U9jY21>w`wQ!RRI)PSfRm+gV4{cL|+|shNvZi
z)o_vkJ037mqZf{~(Vmeyy+-JU9)%@#$4rs=KiPa1E%hHTYabN~;iN-G;8kp=E*zot
z40@09QVAH`iw!)ci;W?-FLs)-}@^_E7-~B;fPJ>DeiDcTUWk6EK~kYB7d7z16W_y$mHY|3#UO}VR-Ch}MnGC{akJun3oyj?
zk`XqJpB@4&2HTWa3Cwd-M<_=aVF-AM979fvhe5;Y-a;9sknAFo5hPcTU?{RtBtik7
zVGWa@Lj*G>hG*pf6prh4S%bVtQ_`}o>Z+#bIbGA!`U(BKep1ZL4~oMd?muvQYrYUS
zhPW?j6cA@cMqplmn<4exRs;X}XaGdg?teL4o{NeCsFPA7
z-d7`bz=hEbiTA)m&VA*bSE5|i8E_F>HOBp<>$>1Wl0h4!RngzyG)*m^hT(tdP+9Zk
z@mc@4-(;IMC^CSYcZuLKh;?Smc=|V;Jv0dM4K-Svov=mpHd?5|)%koDU9i*nY_^Er
zy}XE4DP9^(pW0ue%P~TO+FVnL8sXH&vcFd+Lh?u0udYhTeX8tQWMzXbEXnha%H5T|
z5ivr7-_OH0k_#EG+oW9cu&cynSYI`psti++bU7E{u*)sVK@u5iNrL(c>26`a`%8xO
V9TKgL&`ipGww?EdrW}@Z%|GUAn7>(tC6EyS#h4l6
delta 72
zcmeyx^_7d;iInpw|IKeRZts8~O}G%+tdRlhvHC|kdvvLquvPd~XR
Wu{ 0:
+ # print(next_page_url[0])
+ print("开始抓取 %s 第%d页" % (self.keyword, CategorySpider.page))
+ CategorySpider.page += 1
+ yield Request("http://search.dangdang.com%s" % next_page_url[0])
+
+
+if __name__ == '__main__':
+ with open('../../datas/kind.json', 'r') as f:
+ kinds = json.load(f)
+ for kind in kinds:
+ for keyword in kind["category"]:
+ print("开始抓取 %s 分类" % keyword)
+
+ cmd = "scrapy crawl category -a keyword=%s" % keyword
+ p = subprocess.Popen(cmd, shell=True)
+ p.wait()
+ if p.returncode != 0:
+ print("抓取 %s 出现错误" % keyword)
+ time.sleep(3)
+ print("抓取 %s 分类结束" % keyword)
diff --git a/chapter1/crawler/taobao/spiders/comment.py b/chapter1/crawler/taobao/spiders/comment.py
new file mode 100644
index 0000000..fac0f20
--- /dev/null
+++ b/chapter1/crawler/taobao/spiders/comment.py
@@ -0,0 +1,117 @@
+# -*- coding: utf-8 -*-
+import json
+import pdb
+import subprocess
+
+import scrapy
+from scrapy import Request
+from scrapy.http import Response
+from lxml import etree
+
+from taobao.items import CommentItem
+
+import pandas as pd
+
+"""
+商品评论数据抓取
+"""
+
+
+class CommentSpider(scrapy.Spider):
+ name = 'comment'
+ allowed_domains = ['dangdang.com']
+ start_urls = []
+
+ custom_settings = {
+ 'LOG_LEVEL': "WARN",
+ 'ITEM_PIPELINES': {
+ 'taobao.pipelines.JsonWithEncodingCommentsPipeline': 300,
+ }
+ }
+
+ def __init__(self, name=None, **kwargs):
+ self.product_id = kwargs['id']
+ self.start_urls = [
+ 'http://product.dangdang.com/index.php?r=comment%%2Flist&productId=%s&'
+ 'mainProductId=%s&mediumId=0&pageIndex=1&sortType=1&filterType=1'
+ '&isSystem=1&tagId=0&tagFilterCount=0' % (self.product_id, self.product_id)
+ ]
+ self.comments = None
+ super(CommentSpider, self).__init__(name, **kwargs)
+
+ def parse(self, response):
+ data = json.loads(response.body)
+ summary = data["data"]["list"]["summary"]
+ html = data["data"]["list"]["html"]
+
+ print("评论总数: %s" % summary["total_comment_num"])
+ print("好评: %s" % summary["total_crazy_count"])
+ print("中评: %s" % summary["total_indifferent_count"])
+ print("差评: %s" % summary["total_detest_count"])
+ print("评论页数: %s" % summary["pageCount"])
+ print("当前页面: %s" % summary["pageIndex"])
+
+ if self.comments is None:
+ self.comments = CommentItem(
+ id=self.product_id,
+ total_comment_num=summary["total_comment_num"],
+ total_crazy_count=summary["total_crazy_count"],
+ total_indifferent_count=summary["total_indifferent_count"],
+ total_detest_count=summary["total_detest_count"],
+ page_count = summary["pageCount"],
+ comments=[]
+ )
+
+ el = etree.HTML(html)
+ items = el.xpath('//div[contains(@class,"comment_items")]')
+ for item in items:
+ score = item.xpath('div[@class="items_right"]//div[@class="pinglun"]//em/text()')
+ desc = item.xpath(
+ 'div[@class="items_right"]//div[@class="describe_detail"]/span/text()')
+ date = item.xpath(
+ 'div[@class="items_right"]//div[@class="starline clearfix"]/span[1]/text()')
+ headimg = item.xpath('div[@class="items_left_pic"]/a/img/@src')
+ username = item.xpath('div[@class="items_left_pic"]/span[@class="name"]/text()')
+ userlevel = item.xpath(
+ 'div[@class="items_left_pic"]/span[contains(@class,"level")]/text()')
+
+ comment = {
+ "score": score and score[0] or None,
+ "desc": desc and desc[0] or None ,
+ "date": date and date[0] or None,
+ "headimg": headimg and headimg[0] or None,
+ "username": username and username[0] or None,
+ "userlevel": userlevel and userlevel[0] or None
+ }
+
+ self.comments["comments"].append(comment)
+
+ if int(summary["pageIndex"])0]
+ print(type(df))
+ for i in range(0, df.size, 10):
+ pp = []
+ for x in range(10):
+ product = df.iloc[i+x]
+ if product["comment_num"]>0:
+ cmd = "scrapy crawl comment -a id=%s" % product["id"]
+ pp.append(subprocess.Popen(cmd, shell=True))
+
+ for p in pp:
+ try:
+ p.wait(20)
+ if p.returncode != 0:
+ print("抓取 %s 出现错误" % product["id"])
+ except Exception as e:
+ print(e)
+ print("总数: %d, 当前行: %d" % (df.size, i))