# OrderVis/PredictCovid.py

import pandas as pd
import numpy as np
import tensorflow as tf
import keras
from sklearn.preprocessing import MinMaxScaler
import re
import copy
from random import shuffle
maxSeqLength=250
batchSize = 24
lstmUnits = 64
numClasses = 2
numDimensions = 50 #Dimensions for each word vector
# ---- Helper functions ----
from random import randint
# Split a whole comment into sub-clauses; returns a list
def InputToSenList(senten, mark=' mark! '):
    stripSpecialChars = re.compile("[^A-Za-z0-9 ]+")
    senten = senten.lower().replace('<br />', '')
    #print(senten)
    myinput = re.sub(stripSpecialChars, mark, senten)
    wordVec = myinput.split()
    markLoc = []
    markLoc.append(0)
    subSenList = []
    shiftNum = 0
    # Remove every 'mark!' token, recording where each mark sat in the shrinking list
    for i in range(len(wordVec)):
        if wordVec[i-shiftNum] == 'mark!':
            markLoc.append(i-shiftNum)
            wordVec.pop(i-shiftNum)
            shiftNum += 1
    # Join the words between consecutive marks back into sub-clauses
    for i in range(len(markLoc)-1):
        subSenList.append(" ".join(wordVec[markLoc[i]:markLoc[i+1]]))
    return subSenList
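# Illustrative example (not in the original file): punctuation becomes clause boundaries.
#   InputToSenList("Cases rose today. Hospitals are full!")
#   -> ['cases rose today', 'hospitals are full']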
# Convert a clause list into one comment encoded as word-dictionary indices
def ListToVecComment(tempSenList):
    wordsList = np.load('./VectorList/wordsList.npy')
    wordsList = wordsList.tolist()  # Originally loaded as a numpy array
    wordsList = [word.decode('UTF-8') for word in wordsList]  # Decode words as UTF-8
    comment = np.zeros([batchSize, maxSeqLength])
    # Each word in the stored comment is represented by its index in the dictionary
    fullSent = ' '.join(tempSenList)
    counter = 0
    for word in fullSent.split():
        try:
            comment[0][counter] = wordsList.index(word)
        except Exception:
            comment[0][counter] = 399999  # Index reserved for unknown words
        counter += 1
        if counter == maxSeqLength:
            break
    return comment
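# Illustrative sketch (not in the original; assumes ./VectorList/wordsList.npy exists):
#   mat = ListToVecComment(['cases rose today'])
#   mat.shape == (24, 250); only row 0 is filled, unknown words map to 399999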
# Same as above, but encodes several comments at once, one per row
def ListToVecCommentMulti(tempSenList):
    wordsList = np.load('./VectorList/wordsList.npy')
    wordsList = wordsList.tolist()  # Originally loaded as a numpy array
    wordsList = [word.decode('UTF-8') for word in wordsList]  # Decode words as UTF-8
    comment = np.zeros([batchSize, maxSeqLength])
    # Each row encodes one comment's clause list as dictionary indices
    for i in range(len(tempSenList)):
        fullSent = ' '.join(tempSenList[i])
        counter = 0
        for word in fullSent.split():
            try:
                comment[i][counter] = wordsList.index(word)
            except Exception:
                comment[i][counter] = 399999  # Index reserved for unknown words
            counter += 1
            if counter == maxSeqLength:
                break
    return comment
# ---- Model section ----
# Build the graph variables
def DefVar(rnnType):
    wordVectors = np.load('./VectorList/wordVectors.npy')
    tf.reset_default_graph()
    # Batch of 24 samples, binary classification (2 labels)
    # 24 comments of maxSeqLength words each; at this point every word is still an id
    inputData = tf.placeholder(tf.int32, [batchSize, maxSeqLength])
    # data = tf.Variable(tf.zeros([batchSize, maxSeqLength, numDimensions]), dtype=tf.float32)  # unused: immediately replaced below
    # Use the embedding table to map the id-encoded words to their embedding vectors
    data = tf.nn.embedding_lookup(wordVectors, inputData)
    # lstmUnits sets the size of each hidden output: 64-dim output, 50-dim input, so the weight matrix is 64x114
    lstmCell = tf.contrib.rnn.BasicLSTMCell(lstmUnits)
    if rnnType == 'GRU':
        lstmCell = tf.contrib.rnn.GRUCell(lstmUnits)
    elif rnnType == 'vanilla':
        lstmCell = tf.contrib.rnn.BasicRNNCell(lstmUnits)
    # Each comment unrolls into maxSeqLength timesteps; every step emits a 64-dim output
    initial_state = lstmCell.zero_state(batchSize, tf.float32)
    value, _ = tf.nn.dynamic_rnn(lstmCell, data, initial_state=initial_state, dtype=tf.float32)
    # _ has two parts: the 24x64 cell state and the 24x64 hidden state of the final step
    # Create a random 64x2 projection matrix
    weight = tf.Variable(tf.truncated_normal([lstmUnits, numClasses]))
    # Create a 2-element bias [0.1, 0.1]
    bias = tf.Variable(tf.constant(0.1, shape=[numClasses]))
    # Transposing value to maxSeqLength x 24 x 64 happens in DefPreFun instead
    # value = tf.transpose(value, [1, 0, 2])
    return inputData, value, weight, bias
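# Illustrative wiring (a sketch; assumes ./VectorList/wordVectors.npy and a TF 1.x runtime):
#   inputData, value, weight, bias = DefVar('GRU')
#   value has shape [batchSize, maxSeqLength, lstmUnits] = [24, 250, 64]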
# Build the prediction op
def DefPreFun(value, weight, bias):
    # Take the output of the last timestep: 24x64
    value = tf.transpose(value, [1, 0, 2])
    last = tf.gather(value, int(value.get_shape()[0]) - 1)
    # Project the 64-dim vector onto the two classes and add the bias
    prediction = tf.matmul(last, weight) + bias
    # prediction = tf.argmax(prediction, 1)
    output = tf.nn.softmax(prediction)
    return output
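# The returned op yields a [batchSize, numClasses] = [24, 2] matrix of softmax
# scores, one probability pair per comment in the batch.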
def GetSenList(myinput, model='clause'):
    senList = []
    # Predicting a single day only needs the last fifteen days of data
    tempList = myinput.split()[-15:]
    if model == 'word':
        senList = tempList
    else:
        # Group the numbers three at a time into "clauses"
        senten = ''
        count = 0
        for number in tempList:
            senten += str(number) + ' '
            count += 1
            if count >= 3:
                senList.append(senten)
                senten = ''
                count = 0
        if senten:
            senList.append(senten)
    return senList
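# Illustrative example (not in the original file):
#   GetSenList('1 2 3 4 5 6 7', 'clause') -> ['1 2 3 ', '4 5 6 ', '7 ']
#   GetSenList('1 2 3 4 5 6 7', 'word')   -> ['1', '2', '3', '4', '5', '6', '7']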
class PredictRNN():
    def __init__(self):
        keras.backend.clear_session()
        tf.reset_default_graph()
        # savePath (the Keras model path) is expected to be defined elsewhere in this module
        self.model1 = keras.models.load_model(savePath)
        data = pd.read_csv("Aamerica.csv")
        date = data["date"].values
        cases = data["comfirmed"].values
        temp1 = []
        # Turn cumulative confirmed counts into daily new cases
        for i in range(len(date)):
            date[i] = i
            if i == 0:
                temp1.append(cases[0])
            if i > 0:
                temp1.append(cases[i] - cases[i-1])
        # Drop the raw first-day value so cases holds daily increments only
        cases = temp1[1:]
        # Extract the training set (everything but the last 25 days)
        date_train = date[0:len(cases)-25]
        cases_train = cases[0:len(cases)-25]
        # Pack the training data
        cases_train = list(zip(date_train, cases_train))
        train1 = pd.DataFrame(cases_train, columns=['date', 'comfirmed'])
        train1['date'] = train1['date'].astype(float)
        train1['comfirmed'] = train1['comfirmed'].astype(float)
        x_train1 = train1.iloc[:, 1:2].values
        # Fit the scaler on the training range so predictions can be inverse-transformed
        self.scaler1 = MinMaxScaler(feature_range=(0, 1))
        x_train1 = self.scaler1.fit_transform(x_train1)
    def __del__(self):
        # Guard: self.sess only exists if a TF session was attached elsewhere
        if hasattr(self, 'sess'):
            self.sess.close()
    def Predict(self, test_inputs):
        fullSent = ' '.join(test_inputs)
        test_inputs = fullSent.split()
        # Ensure a numeric dtype before scaling (the tokens are numeric strings)
        test_inputs = np.array(test_inputs, dtype=float)
        test_inputs = test_inputs.reshape(-1, 1)
        test_inputs = self.scaler1.transform(test_inputs)
        test_features = []
        test_features.append(test_inputs[-15:])
        test_features = np.array(test_features)
        # test_features = np.reshape(test_features, (test_features.shape[0], test_features.shape[1], 1))
        prediction = self.model1.predict(test_features)
        prediction = self.scaler1.inverse_transform(prediction)
        return prediction[0][0]
    def GetRes(self, reorder_inputs):
        predictions = []
        for order in reorder_inputs:
            orderInput = np.array(order, dtype=float)  # ensure a numeric dtype before scaling
            orderInput = orderInput.reshape(-1, 1)
            orderInput = self.scaler1.transform(orderInput)
            test_features = []
            test_features.append(orderInput[-15:])
            test_features = np.array(test_features)
            # test_features = np.reshape(test_features, (test_features.shape[0], test_features.shape[1], 1))
            print('test_features', test_features)
            prediction = self.model1.predict(test_features)
            prediction = self.scaler1.inverse_transform(prediction)
            predictions.append(prediction[0][0])
        return predictions
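# Usage sketch (illustrative; assumes savePath and "Aamerica.csv" are available):
#   rnn = PredictRNN()
#   clauses = GetSenList('100 110 120 130 140 150 160 170 180 190 200 210 220 230 240')
#   rnn.Predict(clauses)  # -> predicted next-day new cases as a float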
class ForPredictCovid():
    def __init__(self, myinput, modelType, judgeType):
        # List of split clauses
        self.senList = GetSenList(myinput, 'clause')
        # List of split words
        self.wordList = GetSenList(myinput, 'word')
        # The list selected by the clause/word model choice
        self.chosenList = self.senList
        # Length of the computed (clause) list
        self.sentenSize = len(self.chosenList)
        if modelType == 'word':
            self.chosenList = self.wordList
        self.judgeType = judgeType
        # self.rnnType = rnnType
        # The prepared RNN predictor
        self.PreRNN = PredictRNN()
        # Prediction for the original sequence
        self.oriRes = self.PreRNN.Predict(self.chosenList)
    # Accepts only list input; rnnType is kept for interface compatibility but unused here
    def Predict(self, tempSenList, rnnType):
        res = self.PreRNN.Predict(tempSenList)
        return res
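# Usage sketch (illustrative; the 'order' judgeType value is a placeholder, real
# values come from the caller):
#   fpc = ForPredictCovid('100 110 120 130 140 150 160 170 180 190 200 210 220 230 240',
#                         'clause', 'order')
#   fpc.oriRes  # prediction for the unmodified sequence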
def PredictMulti(tempSenList, rnnType):
    inputData, value, weight, bias = DefVar(rnnType)
    predictOp = DefPreFun(value, weight, bias)
    sess = tf.InteractiveSession()
    saver = tf.train.Saver()
    # Restore the checkpoint matching the requested cell type
    if rnnType == 'GRU':
        saver.restore(sess, './modelsMoreGRU/pretrained_gru.ckpt-130000')
    elif rnnType == 'vanilla':
        saver.restore(sess, './modelsMoreVanilla/pretrained_gru.ckpt-500000')
    else:
        saver.restore(sess, './modelsMoreMid/pretrained_lstm.ckpt-290000')
    comment = ListToVecCommentMulti(tempSenList)
    res = sess.run(predictOp, {inputData: comment})
    return res
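# Note: ListToVecCommentMulti fills one row per input, so PredictMulti can score
# at most batchSize (24) comment lists per call.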
# Encode up to batchSize sub-clauses starting at offset loc, one per row
def ListToVecCommentBySomeSub(tempSenList, loc):
    wordsList = np.load('./VectorList/wordsList.npy')
    wordsList = wordsList.tolist()  # Originally loaded as a numpy array
    wordsList = [word.decode('UTF-8') for word in wordsList]  # Decode words as UTF-8
    comment = np.zeros([batchSize, maxSeqLength])
    # Each row encodes one sub-clause as dictionary indices
    listSize = len(tempSenList)
    for i in range(batchSize):
        if loc + i >= listSize:
            break
        counter = 0
        for word in tempSenList[loc+i].split():
            try:
                comment[i][counter] = wordsList.index(word)
            except Exception:
                comment[i][counter] = 399999  # Index reserved for unknown words
            counter += 1
            if counter == maxSeqLength:
                break
    return comment
def GetDeatail(tempSenList, rnnType):
    inputData, value, weight, bias = DefVar(rnnType)
    predictOp = DefPreFun(value, weight, bias)
    sess = tf.InteractiveSession()
    saver = tf.train.Saver()
    if rnnType == 'GRU':
        saver.restore(sess, './modelsMoreGRU/pretrained_gru.ckpt-130000')
    elif rnnType == 'vanilla':
        saver.restore(sess, './modelsMoreVanilla/pretrained_gru.ckpt-500000')
    else:
        saver.restore(sess, './modelsMoreMid/pretrained_lstm.ckpt-290000')
    subSenRes = []
    listSize = len(tempSenList)
    loc = 0
    # Score the sub-clauses batchSize at a time until the list is exhausted
    while True:
        comment = ListToVecCommentBySomeSub(tempSenList, loc)
        res = sess.run(predictOp, {inputData: comment})
        for i in range(batchSize):
            subSenRes.append(res[i].tolist())
            loc += 1
            if loc == listSize:
                break
        if loc == listSize:
            break
    return subSenRes
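# Usage sketch (illustrative; assumes the ./modelsMore* checkpoints exist):
#   scores = GetDeatail(InputToSenList('good movie. bad ending!'), 'GRU')
#   Each entry of scores is a two-element softmax pair for one sub-clause.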