.

2019-01-18 10:23:18 +08:00 · 2019-01-18 10:23:18 +08:00 · 411e14684f
parent da3f9324b6
commit 411e14684f
2 changed files with 30 additions and 19 deletions
--- a/chapter2/mysite/myapp/views.py
+++ b/chapter2/mysite/myapp/views.py
@ -13,7 +13,7 @@ def crawl(request):
 	# info = scrapy_client.get_scrapy_info()
 	crawl_info = CrawlInfos.objects.order_by('-id').first()
 	platform_info = get_platform_info()
-  news = News.objects.all()
+  news = News.objects.order_by('-id')[0:20]
 	return render(request, 'myapp/crawl.html', {"crawl_info": crawl_info, 
    "platform_info":json.dumps(platform_info),
    "news": news})	
--- a/chapter3/spark.py
+++ b/chapter3/spark.py
@ -30,19 +30,7 @@ def today():
    return time.strftime('%Y-%m-%d')


-if __name__ == "__main__":
-    # $example on:spark_hive$
-    # warehouse_location points to the default location for managed databases and tables
-    warehouse_location = os.path.abspath('spark-warehouse')
-
-    spark = SparkSession \
-        .builder \
-        .appName("Python Spark SQL Hive integration example") \
-        .config("spark.sql.warehouse.dir", warehouse_location) \
-        .enableHiveSupport() \
-        .getOrCreate()
-
-    while True:
+def collect_crawl_info(spark):
    df = spark.sql("select count(*) as N from jd_comment")
    jd_comment_count = df.rdd.collect()[0]["N"]

@ -62,6 +50,29 @@ if __name__ == "__main__":
        total_count, today_total_count, jd_count,jd_today_count, jd_comment_count, jd_comment_today_count) )
    

+def collect_news(spark):
+    df = spark.sql("select * from jd_comment order by created_at desc limit 20")
+    for row in df.rdd.collect():
+        mysql_execute("insert into news (comment_time, content, comment_id) values ('{}', '{}', '{}'')".format(
+            row["comment_time"], row["content"], row["id"]))
+
+
+if __name__ == "__main__":
+    # $example on:spark_hive$
+    # warehouse_location points to the default location for managed databases and tables
+    warehouse_location = os.path.abspath('spark-warehouse')
+
+    spark = SparkSession \
+        .builder \
+        .appName("Python Spark SQL Hive integration example") \
+        .config("spark.sql.warehouse.dir", warehouse_location) \
+        .enableHiveSupport() \
+        .getOrCreate()
+
+    while True:
+        collect_crawl_info(spark)
+        collect_news(spark)
+
        time.sleep(10)

    spark.stop()