diff --git a/.gitignore b/.gitignore
index ee26f5d..59daf2a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,7 @@
SprakProject/
chapter1/env
*.ipynb
+*.pyc
+chapter1/crawler/datas/comments/
+chapter1/crawler/datas/comments1/
+chapter1/crawler/datas/products/
diff --git a/chapter1/.idea/chapter1.iml b/chapter1/.idea/chapter1.iml
new file mode 100644
index 0000000..ee1e345
--- /dev/null
+++ b/chapter1/.idea/chapter1.iml
@@ -0,0 +1,22 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/chapter1/.idea/encodings.xml b/chapter1/.idea/encodings.xml
new file mode 100644
index 0000000..15a15b2
--- /dev/null
+++ b/chapter1/.idea/encodings.xml
@@ -0,0 +1,4 @@
+
+
+
+
\ No newline at end of file
diff --git a/chapter1/.idea/misc.xml b/chapter1/.idea/misc.xml
new file mode 100644
index 0000000..92d7ed4
--- /dev/null
+++ b/chapter1/.idea/misc.xml
@@ -0,0 +1,7 @@
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/chapter1/.idea/modules.xml b/chapter1/.idea/modules.xml
new file mode 100644
index 0000000..daf6423
--- /dev/null
+++ b/chapter1/.idea/modules.xml
@@ -0,0 +1,8 @@
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/chapter1/.idea/vcs.xml b/chapter1/.idea/vcs.xml
new file mode 100644
index 0000000..6c0b863
--- /dev/null
+++ b/chapter1/.idea/vcs.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/chapter1/.idea/workspace.xml b/chapter1/.idea/workspace.xml
new file mode 100644
index 0000000..238f824
--- /dev/null
+++ b/chapter1/.idea/workspace.xml
@@ -0,0 +1,661 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 日本可爱毛球熊猫束发带
+ 美式风
+ 雪
+ 皮床
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 1546928387088
+
+
+ 1546928387088
+
+
+ 1546936657398
+
+
+
+ 1546936657399
+
+
+ 1546938314632
+
+
+
+ 1546938314632
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/chapter1/crawler/datas/kind.json b/chapter1/crawler/datas/kind.json
deleted file mode 100644
index 7d187d1..0000000
--- a/chapter1/crawler/datas/kind.json
+++ /dev/null
@@ -1,1508 +0,0 @@
-[
- {
- "kind": "女装",
- "category": [
- "冬上新",
- "毛呢外套",
- "羽绒服",
- "毛衣",
- "裤子",
- "棉衣棉服",
- "毛针织衫",
- "连衣裙",
- "皮草",
- "短外套",
- "卫衣绒衫",
- "半身裙",
- "时尚套装",
- "牛仔裤",
- "大码女装",
- "T恤",
- "衬衫",
- "私服名媛",
- "OL丽人",
- "街头潮流",
- "日韩清新",
- "文艺复古",
- "原创设计",
- "休闲裤",
- "风衣",
- "牛仔外套",
- "长袖连衣裙",
- "阔腿裤",
- "西装",
- "长风衣",
- "蕾丝衫/雪纺衫",
- "大码外套",
- "婚纱礼服",
- "民族服装"
- ]
- },
- {
- "kind": "男装",
- "category": [
- "外套",
- "羽绒服",
- "棉衣",
- "品质好物",
- "夹克",
- "卫衣",
- "皮衣",
- "衬衫",
- "针织衫/毛衣",
- "毛呢大衣",
- "T恤",
- "POLO衫",
- "休闲裤",
- "牛仔裤",
- "风衣",
- "西装",
- "牛仔外套",
- "棒球服",
- "运动裤",
- "九分裤",
- "套装",
- "开衫",
- "马甲",
- "中老年",
- "情侣装",
- "大码",
- "民族风",
- "专柜大牌",
- "明星网红",
- "原创设计"
- ]
- },
- {
- "kind": "内衣",
- "category": [
- "法式内衣",
- "无钢圈内衣",
- "内裤女",
- "文胸",
- "内裤男",
- "长袖睡衣",
- "睡裙",
- "真丝睡衣",
- "丝袜",
- "船袜",
- "情侣睡衣",
- "抹胸",
- "背心",
- "睡袍",
- "男士睡衣",
- "塑身衣",
- "内衣套装",
- "打底裤",
- "连体睡衣",
- "聚拢文胸",
- "男士袜子",
- "棉袜女",
- "卡通睡衣",
- "无痕内裤",
- "少女文胸"
- ]
- },
- {
- "kind": "鞋靴",
- "category": [
- "流行女鞋",
- "雪地靴",
- "短靴",
- "长靴",
- "运动风",
- "加绒鞋",
- "毛毛鞋",
- "厚底鞋",
- "冬季热销",
- "高跟鞋",
- "中跟鞋",
- "平底鞋",
- "马丁靴",
- "切尔西靴",
- "袜靴",
- "圆头鞋",
- "尖头鞋",
- "方跟鞋",
- "低帮鞋",
- "高帮鞋",
- "瘦瘦靴",
- "一脚蹬",
- "懒人鞋",
- "男鞋",
- "休闲鞋",
- "板鞋",
- "帆布鞋",
- "运动风",
- "豆豆鞋",
- "乐福鞋",
- "雕花布洛克",
- "船鞋",
- "增高鞋",
- "正装商务",
- "户外休闲",
- "爸爸鞋",
- "德比鞋",
- "孟克鞋",
- "布鞋"
- ]
- },
- {
- "kind": "箱包",
- "category": [
- "女包",
- "骚包",
- "双肩包",
- "男包",
- "旅行箱",
- "钱包",
- "真皮包",
- "大牌",
- "宽肩带",
- "小方包",
- "水桶包",
- "迷你包",
- "链条包",
- "贝壳包",
- "波士顿包",
- "手拿包",
- "单肩包",
- "手提包",
- "斜挎包",
- "零钱包",
- "妈妈包",
- "欧美潮搭",
- "日韩流行",
- "青春学院",
- "男士商务",
- "雅痞休闲",
- "拉杆箱",
- "腰包",
- "胸包",
- "手工皮具",
- "红人优品"
- ]
- },
- {
- "kind": "配件配饰",
- "category": [
- "帽子",
- "贝雷帽",
- "渔夫帽",
- "鸭舌帽",
- "礼帽",
- "草帽",
- "爵士帽",
- "盆帽",
- "八角帽",
- "丝巾",
- "披肩",
- "真丝围巾",
- "棉麻围巾",
- "方巾",
- "手套",
- "真皮手套",
- "触屏手套",
- "半指手套",
- "全指手套",
- "真皮腰带",
- "腰带",
- "手工皮带"
- ]
- },
- {
- "kind": "童装玩具",
- "category": [
- "连衣裙",
- "保暖连体",
- "裤子",
- "羽绒",
- "居家睡衣",
- "针织",
- "帽子",
- "亲子装",
- "童鞋",
- "学步鞋",
- "女童运动鞋",
- "男童运动鞋",
- "毛毛虫童鞋",
- "雪地靴",
- "马丁靴",
- "长靴",
- "玩具",
- "积木",
- "毛绒玩具",
- "早教",
- "儿童自行车",
- "电动童车",
- "遥控模型",
- "户外玩具",
- "亲子玩具",
- "学习用品",
- "描红本"
- ]
- },
- {
- "kind": "孕产用品",
- "category": [
- "美妈大衣",
- "孕妇裤",
- "月子服",
- "哺乳文胸",
- "吸奶器",
- "防辐射",
- "孕妇内裤",
- "连衣裙",
- "待产包",
- "孕妇牛仔裤",
- "孕妇营养品",
- "防溢乳垫",
- "美德乐",
- "十月妈咪",
- "三洋",
- "Bravado",
- "新生儿",
- "婴儿床",
- "婴儿推车",
- "睡袋",
- "抱被",
- "隔尿垫",
- "学步车",
- "安抚奶嘴",
- "体温计",
- "纸尿裤",
- "花王",
- "洗衣液",
- "湿巾"
- ]
- },
- {
- "kind": "奶粉辅食",
- "category": [
- "爱他美",
- "羊奶粉",
- "特殊配方奶粉",
- "喜宝",
- "惠氏",
- "启赋",
- "牛栏",
- "美素佳儿",
- "贝因美",
- "雅培",
- "美赞臣",
- "可瑞康",
- "a2",
- "嘉宝",
- "美林",
- "米粉",
- "泡芙",
- "溶溶豆",
- "肉肠",
- "果肉条",
- "奶片",
- "益生菌",
- "维生素",
- "钙铁锌",
- "DHA",
- "宝宝食用油",
- "核桃油",
- "葡萄糖",
- "宝宝调料",
- "奶瓶",
- "餐具",
- "餐椅",
- "暖奶器"
- ]
- },
- {
- "kind": "家电",
- "category": [
- "淘宝速达",
- "实体商场服务",
- "淘火炬品牌",
- "生活电器",
- "厨房电器",
- "个人护理",
- "空气净化器",
- "扫地机器人",
- "吸尘器",
- "取暖器",
- "烤箱",
- "豆浆机",
- "榨汁料理",
- "电饭煲",
- "吹风机",
- "足浴盆",
- "剃须刀",
- "卷发器",
- "按摩器材",
- "冬季火锅",
- "蓝牙耳机",
- "电暖桌",
- "蓝牙音箱",
- "电热毯",
- "加湿器",
- "暖风机"
- ]
- },
- {
- "kind": "数码",
- "category": [
- "淘宝速达",
- "淘宝火炬品牌",
- "实体商场服务",
- "2小时送货服务",
- "surface平板电脑",
- "苹果/Apple",
- "iPad",
- "Pro",
- "电脑主机",
- "数码相机",
- "电玩动漫",
- "单反相机",
- "华为",
- "MateBook",
- "IPAD",
- "mini4",
- "游戏主机",
- "鼠标键盘",
- "无人机",
- "二手数码",
- "二手手机",
- "二手笔记本",
- "二手平板电脑"
- ]
- },
- {
- "kind": "手机",
- "category": [
- "iPhone",
- "xs",
- "iPhone",
- "xs",
- "max",
- "iPhone",
- "xr",
- "华为Mate20P",
- "小米MIX3",
- "荣耀Magic2",
- "一加6T",
- "黑鲨2代",
- "努比亚X",
- "iPhone",
- "X",
- "iPhone",
- "8",
- "OPPO",
- "vivo",
- "华为P20",
- "小米",
- "魅族",
- "二手手机",
- "手机以旧换新"
- ]
- },
- {
- "kind": "美妆",
- "category": [
- "面膜",
- "洁面",
- "防晒",
- "爽肤水",
- "眼霜",
- "乳液",
- "面霜",
- "精华",
- "卸妆",
- "男士护肤",
- "眼线",
- "粉底液",
- "BB霜",
- "隔离",
- "睫毛膏",
- "彩妆盘",
- "唇膏",
- "腮红",
- "香水",
- "精油",
- "身体护理",
- "丰胸",
- "纤体",
- "脱毛",
- "海外直邮"
- ]
- },
- {
- "kind": "个人护理",
- "category": [
- "洗发水",
- "护发素",
- "发膜",
- "头发造型",
- "染发膏",
- "烫发水",
- "假发",
- "沐浴露",
- "私处护理",
- "身体乳液",
- "牙膏",
- "牙刷",
- "漱口水",
- "足浴",
- "足贴",
- "洗手液",
- "卫生巾",
- "成人纸尿裤",
- "抽纸",
- "卷纸",
- "洗衣液",
- "清洁剂",
- "厨房清洁",
- "家私/皮具护理",
- "香薰"
- ]
- },
- {
- "kind": "营养保健",
- "category": [
- "B族维生素",
- "葡萄籽",
- "辅酶Q10",
- "消化酶",
- "软骨素",
- "维生素C",
- "钙",
- "大豆异黄酮",
- "益生菌",
- "鱼油",
- "氨基葡萄糖",
- "葡萄籽",
- "生物素",
- "玛咖(玛卡)",
- "酵素",
- "螺旋藻",
- "胶原蛋白",
- "月见草油",
- "DHA",
- "蔓越莓",
- "左旋肉碱",
- "褪黑素",
- "锯棕榈"
- ]
- },
- {
- "kind": "珠宝",
- "category": [
- "琥珀蜜蜡",
- "翡翠手镯",
- "钻戒",
- "铂金",
- "黄金首饰",
- "高端定制",
- "彩色宝石",
- "珍珠",
- "金镶玉",
- "钻石",
- "K金首饰",
- "岫岩玉雕",
- "和田籽料拍卖",
- "裸石",
- "翡翠玉石",
- "一元起拍",
- "设计师",
- "珠宝首饰",
- "金条",
- "情侣对戒",
- "琥珀原石",
- "老坑冰种拍卖"
- ]
- },
- {
- "kind": "眼镜",
- "category": [
- "眼镜架",
- "3D眼镜",
- "司机镜",
- "防辐射眼镜",
- "老花镜",
- "儿童镜",
- "色盲眼镜",
- "无框眼镜",
- "眼镜片",
- "依视路",
- "雷朋",
- "复古眼镜",
- "超轻眼镜",
- "护目镜",
- "眼镜配件",
- "滑雪镜",
- "超耐磨",
- "GM眼镜",
- "配镜服务"
- ]
- },
- {
- "kind": "手表",
- "category": [
- "运动表",
- "卡西欧",
- "国表",
- "时尚表",
- "女表",
- "儿童表",
- "学生表",
- "浪琴",
- "斯沃琪表",
- "镂空机械表",
- "皮带表",
- "钢带表",
- "欧米茄",
- "电子表",
- "陶瓷表",
- "瑞士表",
- "手表放心淘",
- "日韩腕表",
- "情侣表",
- "光能表",
- "怀表",
- "表带",
- "手表配件",
- "休闲",
- "精钢",
- "复古手表",
- "中性手表",
- "帆布表带",
- "深度防水"
- ]
- },
- {
- "kind": "运动",
- "category": [
- "Yeezy",
- "350",
- "Alpha",
- "Bounce",
- "AJ30",
- "Stan",
- "Smith",
- "大Air皮蓬",
- "KD9",
- "Kayano23",
- "Sock",
- "Dart",
- "Hyperdunk",
- "耐克",
- "阿迪达斯",
- "New",
- "Balance",
- "亚瑟士",
- "Under",
- "Armour",
- "匡威",
- "彪马",
- "VANS",
- "锐步",
- "斯凯奇",
- "美津浓",
- "李宁",
- "跑鞋",
- "篮球鞋",
- "复古休闲",
- "健身",
- "足球",
- "羽毛球"
- ]
- },
- {
- "kind": "户外健身",
- "category": [
- "鱼线",
- "鱼线轮",
- "户外鞋",
- "登山包",
- "帐篷",
- "睡袋",
- "望远镜",
- "皮肤衣",
- "速干衣",
- "速干裤",
- "手电筒",
- "山地车",
- "公路车",
- "骑行服",
- "护具",
- "军迷用品",
- "舞蹈体操",
- "羽毛球",
- "游泳",
- "瑜伽",
- "跑步机",
- "健身器",
- "烧烤架",
- "休闲鞋",
- "冲锋裤",
- "单车零件",
- "骑行装备",
- "遮阳棚",
- "户外手表",
- "户外风衣",
- "军迷套装",
- "战术鞋"
- ]
- },
- {
- "kind": "乐器",
- "category": [
- "全新钢琴",
- "二手钢琴",
- "电钢琴",
- "电子琴",
- "萨克斯",
- "尤克里里",
- "架子鼓",
- "小提琴",
- "口琴",
- "手卷钢琴",
- "古筝",
- "古琴",
- "二胡",
- "葫芦丝",
- "陶笛",
- "琵琶",
- "笛子",
- "非洲鼓",
- "贝斯",
- "调音器",
- "节拍器",
- "电吉他",
- "电箱吉他",
- "乐器音箱",
- "电子鼓",
- "手风琴",
- "大提琴",
- "合成器",
- "乐器租赁"
- ]
- },
- {
- "kind": "游戏",
- "category": [
- "DNF",
- "梦幻西游",
- "魔兽",
- "LOL",
- "坦克世界",
- "剑网3",
- "魔域",
- "DOTA2",
- "街头篮球",
- "CF",
- "天龙八部",
- "大话西游2",
- "三国争霸",
- "YY",
- "劲舞团",
- "倩女幽魂",
- "天下3",
- "反恐精英",
- "冒险岛",
- "问道",
- "逆战",
- "大唐无双",
- "征途2",
- "九阴真经",
- "龙之谷",
- "热血江湖",
- "剑灵"
- ]
- },
- {
- "kind": "动漫周边",
- "category": [
- "手办",
- "高达模型",
- "秦时明月",
- "大圣归来",
- "海贼王",
- "圣斗士",
- "火影忍者",
- "LOVELIVE!",
- "银魂",
- "进击的巨人",
- "从零开始的异世界生活",
- "最终幻想",
- "复仇者联盟",
- "钢铁侠",
- "魔兽世界",
- "守望先锋",
- "英雄联盟",
- "剑三",
- "全职高手",
- "盗墓笔记",
- "COS女装",
- "COS男装",
- "漫展门票",
- "在下坂本有何贵干"
- ]
- },
- {
- "kind": "热门影视周边",
- "category": [
- "太阳的后裔",
- "琅琊榜",
- "太子妃升职记",
- "哈利波特",
- "星球大战",
- "蝙蝠侠",
- "盗墓笔记",
- "神探夏洛克",
- "古剑奇谭",
- "花千骨",
- "EXO",
- "欢乐颂",
- "少女时代",
- "BIGBANG",
- "TFBOYS",
- "东方神起",
- "AKB48",
- "岚ARASHI",
- "SHINHWA神话"
- ]
- },
- {
- "kind": "美食",
- "category": [
- "牛奶",
- "柚子茶",
- "酸梅汤",
- "矿泉水",
- "酵素",
- "藕粉",
- "大米",
- "小米",
- "黄豆",
- "火腿",
- "香肠",
- "木耳",
- "枸杞",
- "人参",
- "石斛",
- "雪蛤",
- "蜂蜜",
- "天麻花粉",
- "铁观音",
- "红茶",
- "花草茶",
- "龙井",
- "黑茶",
- "绿茶",
- "鸡尾酒",
- "精酿啤酒",
- "洋酒",
- "红酒"
- ]
- },
- {
- "kind": "生鲜",
- "category": [
- "荔枝",
- "水果",
- "百香果",
- "芒果",
- "小龙虾",
- "樱桃",
- "榴莲",
- "杨梅",
- "牛排",
- "柠檬",
- "海参",
- "水蜜桃",
- "咸鸭蛋",
- "李子",
- "桃子",
- "龙虾",
- "苹果",
- "黄桃",
- "火龙果",
- "波罗蜜",
- "山竹",
- "蓝莓",
- "鸡胸肉",
- "猕猴桃",
- "三文鱼",
- "红薯",
- "车厘子",
- "海鲜"
- ]
- },
- {
- "kind": "零食",
- "category": [
- "冰皮月饼",
- "零食大礼包",
- "牛肉干",
- "面包",
- "辣条",
- "红枣",
- "核桃",
- "饼干",
- "巧克力",
- "葡萄干",
- "芒果干",
- "绿豆糕",
- "薯片",
- "锅巴",
- "海苔",
- "月饼",
- "蛋黄酥",
- "猪肉脯",
- "花生",
- "长沙臭豆腐",
- "瓜子",
- "棒棒糖",
- "糖果",
- "手撕面包",
- "豆干",
- "罗汉果",
- "开心果",
- "山楂",
- "全麦面包",
- "早餐",
- "腰果",
- "压缩饼干"
- ]
- },
- {
- "kind": "工具",
- "category": [
- "电钻",
- "气钉枪",
- "电锯",
- "抛光机",
- "磨砂机",
- "修边机",
- "焊接设备",
- "扳手",
- "钳子",
- "锉、刨",
- "打磨机",
- "角磨机",
- "工具组套",
- "电锯片",
- "电锤",
- "云石片",
- "电动螺丝批",
- "测电笔",
- "电子钳",
- "电烙铁",
- "电子焊接工具套装",
- "接线板/插头",
- "断路器",
- "开关",
- "布线箱",
- "转换器",
- "断路器",
- "插座",
- "底盒",
- "电线",
- "电工套管",
- "监控器材及系统",
- "防盗报警器材及系统",
- "楼宇智能系统"
- ]
- },
- {
- "kind": "装修",
- "category": [
- "全包",
- "半包",
- "免费设计",
- "全套设计",
- "优质装修公司",
- "样板",
- "小户型",
- "美式风",
- "宜家风",
- "集成吊顶"
- ]
- },
- {
- "kind": "建材",
- "category": [
- "建材优品",
- "智能马桶",
- "客厅灯",
- "LED灯泡",
- "实木地板",
- "强化地板",
- "仿古砖",
- "花砖",
- "马赛克",
- "玻化砖",
- "浴室柜",
- "花洒",
- "浴室柜套装",
- "水槽",
- "台上盆",
- "毛巾杆",
- "普通马桶",
- "龙头",
- "浴缸",
- "墙纸",
- "壁纸",
- "墙布",
- "背景墙",
- "指纹锁",
- "防盗锁",
- "监控摄像头",
- "开关插座",
- "无线摄像头",
- "门",
- "榻榻米"
- ]
- },
- {
- "kind": "家具",
- "category": [
- "沙发",
- "床",
- "高低床",
- "餐桌",
- "床垫",
- "茶几",
- "电视柜",
- "衣柜",
- "鞋柜",
- "椅凳",
- "书桌",
- "电脑桌",
- "坐具",
- "现代简约",
- "美式家具",
- "北欧家具",
- "中式家具",
- "儿童家具",
- "真皮沙发",
- "布艺沙发",
- "皮床",
- "实木床",
- "儿童床",
- "乳胶床垫",
- "儿童学习桌",
- "书架",
- "花架",
- "椅子",
- "电脑椅",
- "佛山家具"
- ]
- },
- {
- "kind": "布艺软饰",
- "category": [
- "窗帘",
- "地毯",
- "沙发垫",
- "十字绣",
- "桌布",
- "地垫",
- "抱枕",
- "坐垫",
- "飘窗垫",
- "门帘",
- "缝纫机",
- "洗衣机罩",
- "卷帘",
- "珠帘",
- "沙发巾",
- "靠垫",
- "空调罩",
- "餐桌布",
- "门垫",
- "浴室防滑垫",
- "茶几桌布",
- "桌垫",
- "装饰画",
- "摆件",
- "照片墙",
- "相框",
- "墙贴",
- "花瓶",
- "壁纸",
- "挂钟",
- "仿真花",
- "油画",
- "客厅装饰画",
- "玻璃贴纸",
- "香炉",
- "玻璃花瓶",
- "相框挂墙",
- "数字油画",
- "假花",
- "画框",
- "干花",
- "挂画",
- "钟",
- "音乐盒",
- "倒流香炉"
- ]
- },
- {
- "kind": "床上用品",
- "category": [
- "夏凉被",
- "草席",
- "床褥",
- "U型枕",
- "蚊帐",
- "凉席",
- "天丝套件",
- "贡缎套件",
- "提花套件",
- "婚庆套件",
- "儿童套件",
- "空调被",
- "儿童床品",
- "麻将凉席",
- "四件套",
- "毛巾被",
- "记忆枕",
- "老粗布",
- "床垫",
- "婚庆床品",
- "床笠",
- "蒙古包蚊帐",
- "空调毯",
- "枕头",
- "宫廷蚊帐",
- "牛皮席",
- "冰丝席",
- "竹席",
- "藤席",
- "床单",
- "四件套",
- "全棉套件",
- "磨毛四件套",
- "保暖套件",
- "婚庆",
- "婚庆套件",
- "儿童套件",
- "儿童床品",
- "贡缎套件",
- "提花套件",
- "被子",
- "蚕丝被",
- "羽绒被",
- "厚被",
- "冬被",
- "七孔被",
- "保暖被",
- "枕头",
- "乳胶枕",
- "羽绒枕",
- "记忆枕",
- "床褥",
- "床单被罩",
- "毛毯",
- "毯子",
- "绒毯"
- ]
- },
- {
- "kind": "品质汽车",
- "category": [
- "买车送油卡",
- "v60一口价39.99万",
- "首付一成开新车",
- "超级试驾",
- "新能源车",
- "轿车",
- "SUV",
- "小型车",
- "玛莎拉蒂",
- "沃尔沃",
- "荣威",
- "别克",
- "雪佛兰",
- "日产",
- "本田",
- "起亚",
- "标致",
- "奇瑞",
- "海马",
- "宝马新1系",
- "smart",
- "Mini",
- "荣威",
- "本田",
- "天猫养车",
- "4S保养",
- "上门保养",
- "镀晶服务",
- "打蜡服务",
- "空调清洗"
- ]
- },
- {
- "kind": "二手车",
- "category": [
- "司法车拍卖",
- "公车拍卖",
- "二手车卖场",
- "二手车拍卖",
- "汽车估价",
- "车秒拍",
- "大众",
- "宝马",
- "奥迪",
- "丰田",
- "奔驰",
- "本田",
- "别克",
- "福特",
- "马自达",
- "雪佛兰",
- "3万以下",
- "3-5万",
- "5-10万",
- "10-20万",
- "20-30万",
- "30-40万",
- "40万以上",
- "SUV",
- "MPV",
- "跑车",
- "越野车",
- "玛莎拉蒂特价车"
- ]
- },
- {
- "kind": "汽车用品",
- "category": [
- "车载空气净化器",
- "脚垫",
- "夏季坐垫",
- "后备箱垫",
- "座套",
- "安全座椅",
- "香水",
- "记录仪",
- "手机支架",
- "车载导航",
- "安全预警仪",
- "后视镜导航",
- "机油",
- "燃油宝",
- "轮胎",
- "贴膜",
- "车载吸尘器",
- "镀晶",
- "车蜡",
- "洗车机",
- "轮胎报警器",
- "车充",
- "氙气灯",
- "雨刮",
- "空调滤芯",
- "大灯总成",
- "车挂",
- "安全锤",
- "应急工具"
- ]
- },
- {
- "kind": "办公",
- "category": [
- "打印机",
- "一体机",
- "复合机",
- "3D打印机",
- "投影机",
- "收银机",
- "收银纸",
- "电子面单机",
- "考勤门禁",
- "保险箱",
- "会议白板",
- "安防摄像",
- "无线网卡",
- "WiFi放大器",
- "无线呼叫器",
- "格子间",
- "电脑桌",
- "办公椅",
- "理线器",
- "计算器",
- "荧光告示贴",
- "翻译笔",
- "毛笔",
- "马克笔",
- "文件收纳",
- "本册",
- "书写工具",
- "文具",
- "画具画材",
- "钢笔",
- "中性笔",
- "财会用品",
- "碎纸机",
- "包装设备"
- ]
- },
- {
- "kind": "DIY",
- "category": [
- "定制T恤",
- "文化衫",
- "工作服",
- "卫衣定制",
- "LOGO设计",
- "VI设计",
- "海报定制",
- "3D效果图制作",
- "广告扇",
- "水晶奖杯",
- "胸牌工牌",
- "奖杯",
- "徽章",
- "洗照片",
- "照片冲印",
- "相册/照片书",
- "软陶人偶",
- "手绘漫画",
- "纸箱",
- "搬家纸箱",
- "胶带",
- "标签贴纸",
- "二维码贴纸",
- "塑料袋",
- "自封袋",
- "快递袋",
- "气泡膜",
- "编织袋",
- "飞机盒",
- "泡沫箱",
- "气柱袋",
- "纸手提袋",
- "打包绳带",
- "气泡信封",
- "缠绕膜"
- ]
- },
- {
- "kind": "五金/电子",
- "category": [
- "物联网市场",
- "万用表",
- "电动螺丝刀",
- "管钳子",
- "电钻",
- "无尘锯",
- "电焊机",
- "角磨机",
- "切割机",
- "发电机",
- "快排阀",
- "增压泵",
- "钢珠",
- "测距仪",
- "水平仪",
- "传感器",
- "电容器",
- "变压器",
- "单片机开发板",
- "智能小车",
- "机器人套件",
- "3D打印耗材",
- "GPS",
- "蓝牙",
- "LED灯珠",
- "树莓派"
- ]
- },
- {
- "kind": "百货",
- "category": [
- "保温杯",
- "饭盒",
- "玻璃杯",
- "马克杯",
- "餐具套装",
- "碗",
- "盘",
- "碟",
- "茶具套装",
- "茶杯",
- "茶壶",
- "茶具",
- "陶瓷餐具",
- "拖鞋",
- "雨伞雨具",
- "口罩",
- "垃圾桶",
- "居家鞋",
- "省力拖把",
- "家务清洁",
- "垃圾袋",
- "梳子",
- "抹布",
- "围裙",
- "拖把",
- "浴帘",
- "浴室置物架",
- "拖把桶旋转",
- "镜子",
- "铸铁锅",
- "炒锅",
- "饮具",
- "心机小物",
- "厨房置物架",
- "密封罐",
- "潮州陶瓷",
- "景德镇陶瓷",
- "厨用小工具",
- "刀具砧板",
- "烧烤烘培"
- ]
- },
- {
- "kind": "餐厨",
- "category": [
- "收纳整理",
- "收纳箱",
- "儿童收纳柜",
- "压缩袋",
- "衣柜整理",
- "鞋柜",
- "布艺软收纳",
- "浴室收纳",
- "置物架",
- "强力不粘钩",
- "厨房收纳",
- "桌面收纳",
- "壁挂收纳",
- "旅行收纳",
- "化妆包",
- "购物车",
- "环保袋",
- "野餐蓝",
- "药箱药盒",
- "衣物洗晒",
- "粘毛剪球",
- "脏衣篮",
- "木制衣架",
- "大型晾晒架",
- "裤架",
- "儿童衣架",
- "柳编",
- "ZAKKA风",
- "原生态",
- "棉麻风",
- "纸质收纳"
- ]
- },
- {
- "kind": "家庭保健",
- "category": [
- "创可贴",
- "消毒用品",
- "体温计",
- "冷敷降温",
- "急救箱",
- "医用口罩",
- "绷带纱布",
- "血压监测",
- "血糖监测",
- "心率监测",
- "呼吸制氧",
- "拐杖",
- "轮椅",
- "助行器",
- "矫正牵引",
- "医用床上护理",
- "拔罐"
- ]
- },
- {
- "kind": "学习",
- "category": [
- "英语四级",
- "2018考研",
- "成人学历",
- "小学教学",
- "雅思托福",
- "学历提升",
- "会计提升",
- "ps美工技能",
- "考研辅导",
- "外教口语课",
- "建造师",
- "口语一对一",
- "驾照报名",
- "汽车维修",
- "化妆课程",
- "电商培训",
- "少儿英语",
- "公务员考试",
- "中小学辅导",
- "宝宝早教",
- "健身减肥",
- "DIY手工",
- "微信小程序",
- "JAVA",
- "CAD教程",
- "驾校学车"
- ]
- },
- {
- "kind": "卡券票",
- "category": [
- "劳动节福利",
- "超市卡",
- "沃尔玛",
- "家乐福",
- "银泰卡",
- "面包券",
- "来伊份券",
- "粽子券",
- "熟食/半成品",
- "星巴克",
- "咖啡",
- "哈根达斯",
- "冰淇淋",
- "网站建设",
- "云服务器",
- "财务管理",
- "网页设计",
- "软件"
- ]
- },
- {
- "kind": "本地服务",
- "category": [
- "婚纱摄影",
- "青岛婚拍",
- "丽江婚拍",
- "三亚婚拍",
- "厦门婚拍",
- "新娘跟妆",
- "婚礼司仪",
- "婚车租赁",
- "婚礼策划",
- "婚宴预订",
- "婚纱礼服",
- "礼服租赁",
- "家电清洗",
- "家庭保洁",
- "搬家搬运",
- "在线洗衣",
- "上门养车",
- "跑腿代办",
- "名企招聘",
- "上门美甲",
- "入职体检",
- "法律咨询",
- "上门按摩",
- "专业翻译"
- ]
- }
-]
\ No newline at end of file
diff --git a/chapter1/crawler/taobao/__pycache__/__init__.cpython-37.pyc b/chapter1/crawler/taobao/__pycache__/__init__.cpython-37.pyc
index 05cca08..3d59835 100644
Binary files a/chapter1/crawler/taobao/__pycache__/__init__.cpython-37.pyc and b/chapter1/crawler/taobao/__pycache__/__init__.cpython-37.pyc differ
diff --git a/chapter1/crawler/taobao/__pycache__/items.cpython-37.pyc b/chapter1/crawler/taobao/__pycache__/items.cpython-37.pyc
index 5e194c3..8c12716 100644
Binary files a/chapter1/crawler/taobao/__pycache__/items.cpython-37.pyc and b/chapter1/crawler/taobao/__pycache__/items.cpython-37.pyc differ
diff --git a/chapter1/crawler/taobao/__pycache__/middlewares.cpython-37.pyc b/chapter1/crawler/taobao/__pycache__/middlewares.cpython-37.pyc
index 56dd606..a0323b1 100644
Binary files a/chapter1/crawler/taobao/__pycache__/middlewares.cpython-37.pyc and b/chapter1/crawler/taobao/__pycache__/middlewares.cpython-37.pyc differ
diff --git a/chapter1/crawler/taobao/__pycache__/pipelines.cpython-37.pyc b/chapter1/crawler/taobao/__pycache__/pipelines.cpython-37.pyc
index 5a15be6..6701ea5 100644
Binary files a/chapter1/crawler/taobao/__pycache__/pipelines.cpython-37.pyc and b/chapter1/crawler/taobao/__pycache__/pipelines.cpython-37.pyc differ
diff --git a/chapter1/crawler/taobao/__pycache__/settings.cpython-37.pyc b/chapter1/crawler/taobao/__pycache__/settings.cpython-37.pyc
index 48b993e..8ab276f 100644
Binary files a/chapter1/crawler/taobao/__pycache__/settings.cpython-37.pyc and b/chapter1/crawler/taobao/__pycache__/settings.cpython-37.pyc differ
diff --git a/chapter1/crawler/taobao/items.py b/chapter1/crawler/taobao/items.py
index 3dc9563..fba9542 100644
--- a/chapter1/crawler/taobao/items.py
+++ b/chapter1/crawler/taobao/items.py
@@ -26,3 +26,28 @@ class ProductItem(scrapy.Item):
     url = scrapy.Field()
     detail = scrapy.Field()
     comment_num = scrapy.Field()
+
+
+class CommentItem(scrapy.Item):
+    """Comment summary and collected comments of a single product.
+
+    The ``comment`` spider fills the counters from the ``summary`` object of
+    the comment-list JSON response and appends one dict per comment to
+    ``comments``.
+    """
+
+    # Product id the comments belong to.
+    id = scrapy.Field()
+    # Total number of comments reported by the summary.
+    total_comment_num = scrapy.Field()
+    # Count the spider prints as positive reviews.
+    total_crazy_count = scrapy.Field()
+    # Count the spider prints as neutral reviews.
+    total_indifferent_count = scrapy.Field()
+    # Count the spider prints as negative reviews.
+    total_detest_count = scrapy.Field()
+    # Number of comment pages (``pageCount`` in the summary).
+    page_count = scrapy.Field()
+    # List of dicts with the keys score, desc, date, headimg, username and
+    # userlevel.
+    comments = scrapy.Field()
diff --git a/chapter1/crawler/taobao/merge.py b/chapter1/crawler/taobao/merge.py
new file mode 100644
index 0000000..5dc8d3a
--- /dev/null
+++ b/chapter1/crawler/taobao/merge.py
@@ -0,0 +1,65 @@
+"""
+Merge the per-category product files into one CSV file.
+"""
+import json
+
+
+def pretty(word):
+    """
+    Remove double quotes so the value can sit inside a quoted CSV field.
+
+    :param word: text to clean, or None
+    :return: word without any '"' characters; None when word is None
+    """
+    if word is not None:
+        return word.replace('"', "")
+    return None
+
+
+def main():
+    """
+    Write one CSV row per product of every category listed in kind.json.
+
+    The category list is read from ``../datas/kind.json``. For each category
+    name the products are read from ``../datas/<name>.json`` (one JSON object
+    per line) and appended to ``../datas/products/data.csv``. Paths are
+    relative to the current working directory. When a category file cannot
+    be read or parsed, the error is printed and the next category is tried.
+    """
+    # Load the category list before opening the output for writing, so a
+    # missing kind.json does not truncate an existing data.csv.
+    # UTF-8 is passed explicitly because the platform default encoding
+    # differs between systems; assumes the inputs are UTF-8 - TODO confirm.
+    with open('../datas/kind.json', 'r', encoding='utf-8') as f:
+        kinds = json.load(f)
+
+    with open('../datas/products/data.csv', 'w', encoding='utf-8') as wf:
+        for kind in kinds:
+            for keyword in kind["category"]:
+                print("小类 %s" % keyword)
+                try:
+                    # "with" closes the input even if a line fails to parse.
+                    with open('../datas/%s.json' % keyword, 'r',
+                              encoding='utf-8') as ff:
+                        for line in ff:
+                            data = json.loads(line)
+                            fline = '%s,%s,%s,"%s","%s","%s",%s,"%s",%s,%s' % (
+                                data["id"],
+                                kind["kind"],
+                                keyword,
+                                pretty(data["title"]),
+                                data["price"],
+                                pretty(data["shop"]),
+                                data["pubtime"],
+                                pretty(data["publisher"]),
+                                pretty(data["url"]),
+                                data["comment_num"],
+                            )
+                            wf.write(fline + "\n")
+                except Exception as e:
+                    # Best-effort merge: report and move on to the next one.
+                    print(e)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/chapter1/crawler/taobao/pipelines.py b/chapter1/crawler/taobao/pipelines.py
index 3ef6e34..75a0ce2 100644
--- a/chapter1/crawler/taobao/pipelines.py
+++ b/chapter1/crawler/taobao/pipelines.py
@@ -28,3 +28,33 @@ class JsonWithEncodingCnblogsPipeline(object):
         return item
     def spider_closed(self, spider):
         self.file.close()
+
+class JsonWithEncodingCommentsPipeline(object):
+    """Write each item as one JSON line to a per-product comments file.
+
+    The file is ``../../datas/comments/<spider.product_id>.json``, relative
+    to the current working directory.
+    """
+
+    def open_spider(self, spider):
+        """Open the output file for the spider's product id, truncating it."""
+        path = '../../datas/comments/%s.json' % spider.product_id
+        self.file = open(path, 'w', encoding='utf-8')
+
+    def process_item(self, item, spider):
+        """Append ``item`` as a single JSON line and return it unchanged."""
+        # ensure_ascii=False writes non-ASCII text as-is instead of \u escapes.
+        line = json.dumps(dict(item), ensure_ascii=False) + "\n"
+        self.file.write(line)
+        return item
+
+    def close_spider(self, spider):
+        """Close the output file when the spider finishes.
+
+        ``close_spider`` is the hook Scrapy calls on item pipelines; a method
+        named ``spider_closed`` only runs if it is connected to the signal.
+        """
+        self.file.close()
+
+    # Keep the previous method name available for existing callers.
+    spider_closed = close_spider
diff --git a/chapter1/crawler/taobao/spiders/__pycache__/__init__.cpython-37.pyc b/chapter1/crawler/taobao/spiders/__pycache__/__init__.cpython-37.pyc
index 6daed03..abd3b1f 100644
Binary files a/chapter1/crawler/taobao/spiders/__pycache__/__init__.cpython-37.pyc and b/chapter1/crawler/taobao/spiders/__pycache__/__init__.cpython-37.pyc differ
diff --git a/chapter1/crawler/taobao/spiders/__pycache__/dangdang.cpython-37.pyc b/chapter1/crawler/taobao/spiders/__pycache__/dangdang.cpython-37.pyc
index 2614fa4..b2592ec 100644
Binary files a/chapter1/crawler/taobao/spiders/__pycache__/dangdang.cpython-37.pyc and b/chapter1/crawler/taobao/spiders/__pycache__/dangdang.cpython-37.pyc differ
diff --git a/chapter1/crawler/taobao/spiders/__pycache__/httpbin.cpython-37.pyc b/chapter1/crawler/taobao/spiders/__pycache__/httpbin.cpython-37.pyc
index 0a0d2a6..be4b2b6 100644
Binary files a/chapter1/crawler/taobao/spiders/__pycache__/httpbin.cpython-37.pyc and b/chapter1/crawler/taobao/spiders/__pycache__/httpbin.cpython-37.pyc differ
diff --git a/chapter1/crawler/taobao/spiders/__pycache__/meitu.cpython-37.pyc b/chapter1/crawler/taobao/spiders/__pycache__/meitu.cpython-37.pyc
deleted file mode 100644
index 7997bc0..0000000
Binary files a/chapter1/crawler/taobao/spiders/__pycache__/meitu.cpython-37.pyc and /dev/null differ
diff --git a/chapter1/crawler/taobao/spiders/__pycache__/porn.cpython-37.pyc b/chapter1/crawler/taobao/spiders/__pycache__/porn.cpython-37.pyc
deleted file mode 100644
index 5ca4cb3..0000000
Binary files a/chapter1/crawler/taobao/spiders/__pycache__/porn.cpython-37.pyc and /dev/null differ
diff --git a/chapter1/crawler/taobao/spiders/__pycache__/users.cpython-37.pyc b/chapter1/crawler/taobao/spiders/__pycache__/users.cpython-37.pyc
index 1f9c4aa..12d9cf2 100644
Binary files a/chapter1/crawler/taobao/spiders/__pycache__/users.cpython-37.pyc and b/chapter1/crawler/taobao/spiders/__pycache__/users.cpython-37.pyc differ
diff --git a/chapter1/crawler/taobao/spiders/category.py b/chapter1/crawler/taobao/spiders/category.py
new file mode 100644
index 0000000..82a579f
--- /dev/null
+++ b/chapter1/crawler/taobao/spiders/category.py
@@ -0,0 +1,127 @@
+# -*- coding: utf-8 -*-
+import json
+import re
+import subprocess
+import time
+
+import scrapy
+from scrapy import Request
+
+from taobao.items import ProductItem
+
+"""
+Crawl e-commerce product data category by category.
+"""
+
+# A product URL ends in "<numeric id>.html"; group 1 captures the id.
+_PRODUCT_ID_RE = re.compile(r'http.+/(\d+)\.html')
+
+
+class CategorySpider(scrapy.Spider):
+    """Crawl the dangdang.com search result pages for one keyword.
+
+    Started as ``scrapy crawl category -a keyword=<text>``. Every product of
+    a result page is yielded as a ProductItem and the "next page" link is
+    followed for as long as the page has one.
+    """
+
+    name = 'category'
+    allowed_domains = ['dangdang.com']
+    start_urls = []
+
+    custom_settings = {
+        'LOG_LEVEL': "WARN",
+        'ITEM_PIPELINES': {
+            'taobao.pipelines.JsonWithEncodingCnblogsPipeline': 300,
+        }
+    }
+
+    # Page number printed in the progress message; shared at class level.
+    page = 1
+
+    def __init__(self, name=None, **kwargs):
+        """Build the search URL from the required ``keyword`` argument.
+
+        :raises KeyError: if no ``keyword`` argument was passed
+        """
+        self.keyword = kwargs['keyword']
+        self.start_urls = [
+            'http://search.dangdang.com/?key=%s&act=input&show=list&show_shop=0#J_tab'
+            % self.keyword]
+        super(CategorySpider, self).__init__(name, **kwargs)
+
+    def safe_fetch(self, node):
+        """Return the first extracted value of ``node``, or None if it is empty."""
+        try:
+            return node[0].extract()
+        except IndexError:
+            return None
+
+    def parse(self, response):
+        """Yield one ProductItem per listed product, then the next-page request.
+
+        A product whose link is missing or carries no numeric id is skipped
+        with a warning. The item's ``detail`` field is left unset.
+        """
+        products = response.xpath(
+            '//div[@class="con shoplist"]//div[@id="search_nature_rg"]//ul/li')
+        for product in products:
+            url = self.safe_fetch(product.xpath('p[@class="name"]/a/@href'))
+            # re.match() raises TypeError for None and returns None for a link
+            # without an id; skip the entry so the rest of the page and the
+            # next-page request are still processed.
+            match_obj = _PRODUCT_ID_RE.match(url) if url is not None else None
+            if match_obj is None:
+                self.logger.warning("跳过无法解析商品ID的链接: %r", url)
+                continue
+
+            title = self.safe_fetch(product.xpath('p[@class="name"]/a/@title'))
+            price = self.safe_fetch(
+                product.xpath('p[@class="price"]/span[@class="search_now_price"]/text()'))
+            shop = self.safe_fetch(product.xpath('p[@class="search_book_author"]/span/a/@title'))
+            if shop is None:
+                # Fall back to the text of the "link" paragraph.
+                shop = self.safe_fetch(product.xpath('p[@class="link"]/a/text()'))
+            pubtime = self.safe_fetch(
+                product.xpath('p[@class="search_book_author"]/span[2]/text()'))
+            publisher = self.safe_fetch(
+                product.xpath('p[@class="search_book_author"]/span[3]/a/text()'))
+            comment_num = self.safe_fetch(product.xpath(
+                'p[@class="search_star_line"]//a[@class="search_comment_num"]/text()'))
+
+            yield ProductItem(
+                id=match_obj.group(1),
+                title=title,
+                price=price,
+                shop=shop,
+                pubtime=pubtime,
+                publisher=publisher,
+                url=url,
+                comment_num=comment_num
+            )
+
+        next_page_url = response.xpath(
+            '//div[@class="paging"]//a[text()="下一页"]/@href').extract()
+        if next_page_url:
+            print("开始抓取 %s 第%d页" % (self.keyword, CategorySpider.page))
+            CategorySpider.page += 1
+            yield Request("http://search.dangdang.com%s" % next_page_url[0])
+
+
+if __name__ == '__main__':
+    # Run one crawl per category name, one after another.
+    with open('../../datas/kind.json', 'r', encoding='utf-8') as f:
+        kinds = json.load(f)
+    for kind in kinds:
+        for keyword in kind["category"]:
+            print("开始抓取 %s 分类" % keyword)
+
+            # Argument list without a shell: the keyword is passed verbatim,
+            # even if it contains shell metacharacters such as parentheses.
+            cmd = ["scrapy", "crawl", "category", "-a", "keyword=%s" % keyword]
+            p = subprocess.Popen(cmd)
+            p.wait()
+            if p.returncode != 0:
+                print("抓取 %s 出现错误" % keyword)
+            time.sleep(3)
+            print("抓取 %s 分类结束" % keyword)
diff --git a/chapter1/crawler/taobao/spiders/comment.py b/chapter1/crawler/taobao/spiders/comment.py
new file mode 100644
index 0000000..fac0f20
--- /dev/null
+++ b/chapter1/crawler/taobao/spiders/comment.py
@@ -0,0 +1,117 @@
+# -*- coding: utf-8 -*-
+import json
+import pdb
+import subprocess
+
+import scrapy
+from scrapy import Request
+from scrapy.http import Response
+from lxml import etree
+
+from taobao.items import CommentItem
+
+import pandas as pd
+
+"""
+商品评论数据抓取
+"""
+
+
+class CommentSpider(scrapy.Spider):
+ name = 'comment'
+ allowed_domains = ['dangdang.com']
+ start_urls = []
+
+ custom_settings = {
+ 'LOG_LEVEL': "WARN",
+ 'ITEM_PIPELINES': {
+ 'taobao.pipelines.JsonWithEncodingCommentsPipeline': 300,
+ }
+ }
+
+ def __init__(self, name=None, **kwargs):
+ self.product_id = kwargs['id']
+ self.start_urls = [
+ 'http://product.dangdang.com/index.php?r=comment%%2Flist&productId=%s&'
+ 'mainProductId=%s&mediumId=0&pageIndex=1&sortType=1&filterType=1'
+ '&isSystem=1&tagId=0&tagFilterCount=0' % (self.product_id, self.product_id)
+ ]
+ self.comments = None
+ super(CommentSpider, self).__init__(name, **kwargs)
+
+ def parse(self, response):
+ data = json.loads(response.body)
+ summary = data["data"]["list"]["summary"]
+ html = data["data"]["list"]["html"]
+
+ print("评论总数: %s" % summary["total_comment_num"])
+ print("好评: %s" % summary["total_crazy_count"])
+ print("中评: %s" % summary["total_indifferent_count"])
+ print("差评: %s" % summary["total_detest_count"])
+ print("评论页数: %s" % summary["pageCount"])
+ print("当前页面: %s" % summary["pageIndex"])
+
+ if self.comments is None:
+ self.comments = CommentItem(
+ id=self.product_id,
+ total_comment_num=summary["total_comment_num"],
+ total_crazy_count=summary["total_crazy_count"],
+ total_indifferent_count=summary["total_indifferent_count"],
+ total_detest_count=summary["total_detest_count"],
+ page_count = summary["pageCount"],
+ comments=[]
+ )
+
+ el = etree.HTML(html)
+ items = el.xpath('//div[contains(@class,"comment_items")]')
+ for item in items:
+ score = item.xpath('div[@class="items_right"]//div[@class="pinglun"]//em/text()')
+ desc = item.xpath(
+ 'div[@class="items_right"]//div[@class="describe_detail"]/span/text()')
+ date = item.xpath(
+ 'div[@class="items_right"]//div[@class="starline clearfix"]/span[1]/text()')
+ headimg = item.xpath('div[@class="items_left_pic"]/a/img/@src')
+ username = item.xpath('div[@class="items_left_pic"]/span[@class="name"]/text()')
+ userlevel = item.xpath(
+ 'div[@class="items_left_pic"]/span[contains(@class,"level")]/text()')
+
+ comment = {
+ "score": score and score[0] or None,
+ "desc": desc and desc[0] or None ,
+ "date": date and date[0] or None,
+ "headimg": headimg and headimg[0] or None,
+ "username": username and username[0] or None,
+ "userlevel": userlevel and userlevel[0] or None
+ }
+
+ self.comments["comments"].append(comment)
+
+ if int(summary["pageIndex"])0]
+ print(type(df))
+ for i in range(0, df.size, 10):
+ pp = []
+ for x in range(10):
+ product = df.iloc[i+x]
+ if product["comment_num"]>0:
+ cmd = "scrapy crawl comment -a id=%s" % product["id"]
+ pp.append(subprocess.Popen(cmd, shell=True))
+
+ for p in pp:
+ try:
+ p.wait(20)
+ if p.returncode != 0:
+ print("抓取 %s 出现错误" % product["id"])
+ except Exception as e:
+ print(e)
+ print("总数: %d, 当前行: %d" % (df.size, i))