From 3712b433437b6055e8d09fdc81025fa4ee5de293 Mon Sep 17 00:00:00 2001 From: guange <8863824@gmail.com> Date: Fri, 18 Jan 2019 19:34:11 +0800 Subject: [PATCH] . --- chapter3/spark.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/chapter3/spark.py b/chapter3/spark.py index 6f1bda8..b6e0d7e 100644 --- a/chapter3/spark.py +++ b/chapter3/spark.py @@ -90,11 +90,13 @@ def collect_crawl_info(spark): def collect_news(spark): df = spark.sql("select * from jd_comment order by created_at desc limit 20") - mysql_execute("delete from news") + for row in df.rdd.collect(): mysql_execute("insert into news (comment_time, content, comment_id) values ('{}', '{}', '{}')".format( row["comment_time"], row["content"], row["id"])) + mysql_execute("delete from news where id not in ( select x.id from (select id from news order by id desc limit 20) as x);") + def get_last_day_count(spark): """获取过去几天的采集量""" for i in range(5):