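"""Collect crawl statistics for a dashboard.

Roughly every 10 seconds this script counts the rows in the Hive tables `jd`
(products) and `jd_comment` (comments) with Spark SQL, writes the totals into
the MySQL table `crawl_infos`, and copies the 20 most recent comments into the
MySQL table `news`.
"""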
from pyspark.sql import SparkSession
from pyspark.sql import Row
# $example off:spark_hive$
import os
import pymysql
import time


def mysql_query(sql):
    """Run a query against the local MySQL database and return the first result row."""
    db = pymysql.connect(host="localhost", user="root",
                         password="123456789", database="sparkproject")
    cursor = db.cursor()
    cursor.execute(sql)
    data = cursor.fetchone()
    db.close()
    return data


def mysql_execute(sql, args=None):
    """Execute a write statement; commit on success, roll back on error."""
    db = pymysql.connect(host="localhost", user="root",
                         password="123456789", database="sparkproject")
    cursor = db.cursor()
    try:
        # Values passed via `args` are escaped by pymysql before execution.
        cursor.execute(sql, args)
        db.commit()
    except Exception as e:
        print(e)
        db.rollback()
    finally:
        db.close()
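
# The two MySQL tables written by this script are assumed to look roughly like
# the sketch below. The table and column names come from the insert statements
# in this file; the column types are only a guess.
#
#   create table crawl_infos (
#       total_count int, today_total_count int,
#       product_count int, today_product_count int,
#       comment_count int, today_comment_count int
#   );
#
#   create table news (comment_time varchar(64), content text, comment_id varchar(64));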


def today():
    # Date prefix (e.g. '2019-01-18') used to filter today's rows on created_at.
    return time.strftime('%Y-%m-%d')


def collect_crawl_info(spark):
    # Overall and per-day row counts for the product (jd) and comment (jd_comment) tables.
    df = spark.sql("select count(*) as N from jd_comment")
    jd_comment_count = df.rdd.collect()[0]["N"]

    df = spark.sql("select count(*) as N from jd_comment where created_at like '" + today() + "%'")
    jd_comment_today_count = df.rdd.collect()[0]["N"]

    df = spark.sql("select count(*) as N from jd")
    jd_count = df.rdd.collect()[0]["N"]

    df = spark.sql("select count(*) as N from jd where created_at like '" + today() + "%'")
    jd_today_count = df.rdd.collect()[0]["N"]

    total_count = jd_comment_count + jd_count
    today_total_count = jd_comment_today_count + jd_today_count

    mysql_execute("insert into crawl_infos (total_count, today_total_count, product_count, today_product_count, comment_count, today_comment_count) values ({},{},{},{},{},{})".format(
        total_count, today_total_count, jd_count, jd_today_count, jd_comment_count, jd_comment_today_count))


def collect_news(spark):
    # Refresh the news table with the 20 most recent comments.
    df = spark.sql("select * from jd_comment order by created_at desc limit 20")

    mysql_execute("delete from news")

    for row in df.rdd.collect():
        # Parameterized values keep quotes in the comment text from breaking the SQL.
        mysql_execute("insert into news (comment_time, content, comment_id) values (%s, %s, %s)",
                      (row["comment_time"], row["content"], row["id"]))


if __name__ == "__main__":
    # $example on:spark_hive$
    # warehouse_location points to the default location for managed databases and tables
    warehouse_location = os.path.abspath('spark-warehouse')

    spark = SparkSession \
        .builder \
        .appName("Python Spark SQL Hive integration example") \
        .config("spark.sql.warehouse.dir", warehouse_location) \
        .enableHiveSupport() \
        .getOrCreate()

    # Poll every 10 seconds; the try/finally ensures spark.stop() runs when the
    # loop is interrupted (e.g. with Ctrl-C).
    try:
        while True:
            collect_crawl_info(spark)
            collect_news(spark)

            time.sleep(10)
    finally:
        spark.stop()