# OrderVis/GetPartnerCovid.py

import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
# from Predict import GetSenList,PredictMulti
import os
import re
import copy
from random import shuffle
class TrainData():
    def __init__(self):
        data = pd.read_csv("./CovidInfo/Aamerica.csv")
        date = data["date"].values
        cases = data["comfirmed"].values  # column name follows the CSV header spelling
        temp1 = []
        for i in range(len(date)):
            date[i] = i
            if(i == 0):
                temp1.append(cases[0])
            if(i > 0):
                temp1.append(cases[i] - cases[i-1])
        cases = temp1[1:]
        # Extract the training set
        date_train = date[0:len(cases)-25]
        cases_train = cases[0:len(cases)-25]
        # Pack up the training data
        cases_train = list(zip(date_train, cases_train))
        train1 = pd.DataFrame(cases_train, columns=['date', 'comfirmed'])
        train1['date'] = train1['date'].astype(float)
        train1['comfirmed'] = train1['comfirmed'].astype(float)
        x_train1 = train1.iloc[:, 1:2].values
        self.scaler1 = MinMaxScaler(feature_range=(0, 1))
        # Fit the scaler for later use; the raw (unscaled) values are kept as training data
        self.scaler1.fit_transform(x_train1)
        self.trainData = x_train1
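# A minimal usage sketch (assuming ./CovidInfo/Aamerica.csv is present with 'date'
# and 'comfirmed' columns): TrainData keeps the raw daily new-case counts in
# trainData and a fitted MinMaxScaler in scaler1 for later normalization.
#
#   td = TrainData()
#   print(td.trainData[:3])                       # first three daily new-case values
#   scaled = td.scaler1.transform(td.trainData)   # same values rescaled into [0, 1]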
def CastToTrainData(emotion, theDifferCount):
    # Note: negNP/negDiffer and posNP/posDiffer are expected to be module-level
    # globals (loaded elsewhere in this project) holding the review vectors and
    # their per-sentence importance differences.
    oriSenListZip = []
    differCountZip = []
    colorCountZip = []
    path = None
    myNP = None
    myDiffer = None
    if(emotion == 'neg'):
        path = "./VectorList/negativeReviews/"
        myNP = negNP
        myDiffer = negDiffer
    else:
        path = "./VectorList/positiveReviews/"
        myNP = posNP
        myDiffer = posDiffer
    maxDiffer = 0
    CriticakDifferCount = []
    for num in theDifferCount:
        if(abs(num) > maxDiffer):
            maxDiffer = abs(num)
    for num in theDifferCount:
        if(abs(num) > (maxDiffer*0.5)):
            CriticakDifferCount.append(num)
    counter = 0
    for i in range(len(myNP)):
        trainDataNp = myNP[i]
        # Take one sample from the training set
        if i % 1000 == 0:
            print('i:', i)
        # Compute the difference part by part
        gap = 0
        compTimes = 10
        if len(CriticakDifferCount) < 10:
            compTimes = len(CriticakDifferCount)
        if trainDataNp[compTimes-1] == 0:
            continue
        for j in (range(compTimes)):
            # Adding 1 here effectively rejects the sample: gap can no longer stay small
            if trainDataNp[j] == 0:
                gap += 1
            else:
                gap += abs(CriticakDifferCount[j] - trainDataNp[j])
        # print(gap)
        # If the gap is small enough, open the review for inspection
        if(gap < 0.002):
            print(i)
            name = ''
            if(emotion == 'neg'):
                for l in range(5):
                    name = path + str(i) + '_' + str(l) + '.txt'
                    if(os.path.exists(name)):
                        break
            else:
                for l in range(7, 11):
                    name = path + str(i+1000) + '_' + str(l) + '.txt'
                    if(os.path.exists(name)):
                        break
            print(name)
            with open(name, 'r', encoding='utf-8') as f:
                content = ''
                for line in f.readlines():
                    content += line
                    content += '. '
            inputList = InputToSenList(content)
            if(len(inputList) >= 50):
                continue
            oriSenListZip.append(inputList)
            theDiffer = myDiffer[i].tolist()
            theDiffer = theDiffer[:len(inputList)]
            print('last:', theDiffer[len(inputList)-1])
            differCountZip.append(theDiffer)
            colorCountZip.append(DifferToColor(theDiffer))
            counter += 1
            if(counter == 5):
                print('end:', i)
                break
    return oriSenListZip, differCountZip, colorCountZip
def CastToTrainWithOrderCovid(orderLine1, orderLine2, emotion, theDifferCount):
    # oriSenListZip=[]
    # differCountZip=[]
    MyTrainData = TrainData()
    trainDataZip = []
    myDiffer = np.load('./CovidInfo/TrainDataDifferCovid.npy')
    for line in orderLine1:
        oriSenListsmallZip = []
        differCountsmallZip = []
        counter = 0
        for i in range(len(myDiffer)):
            trainDataNp = myDiffer[i]
            # Take one sample from the training set
            # and compute the difference part by part
            lineLenth = len(line)
            # print(line)
            # print('linelenth', lineLenth)
            for j in (range(10-lineLenth)):
                if trainDataNp[j] == 0:
                    break
                gap = 0
                for k in range(lineLenth):
                    # print(trainDataNp)
                    # print('k', k)
                    # print(theDifferCount[int(line[k])])
                    gap += abs(trainDataNp[j+k] - theDifferCount[int(line[k])])
                # If the gap is small enough, record this segment
                if(gap < 3000):
                    content = MyTrainData.trainData[i:i+15]
                    inputList = content.tolist()
                    subName = ''
                    for num in range(i+j, i+j+lineLenth+1):
                        subName = subName + ' ' + str(num)
                    oriSenListsmallZip.append({'name': subName, 'content': inputList[j:j+lineLenth]})
                    theDiffer = trainDataNp.tolist()
                    theDiffer = theDiffer[j:j+lineLenth]
                    print('last:', theDiffer)
                    differCountsmallZip.append(theDiffer)
                    # colorCountZip.append(DifferToColor(theDiffer))
                    counter += 1
                    break
            # print(counter)
            if(counter >= 5):
                print('end:', i)
                break
        trainDataZip.append({'emotion': 'pos', 'orderLine': line, 'oriSenListsmallZip': oriSenListsmallZip, 'differCountsmallZip': differCountsmallZip})
        # oriSenListZip.append(oriSenListsmallZip)
        # differCountZip.append(differCountsmallZip)
    for line in orderLine2:
        oriSenListsmallZip = []
        differCountsmallZip = []
        counter = 0
        for i in range(len(myDiffer)):
            trainDataNp = myDiffer[i]
            # Take one sample from the training set
            # and compute the difference part by part
            lineLenth = len(line)
            # print(line)
            # print('linelenth', lineLenth)
            for j in (range(10-lineLenth)):
                if trainDataNp[j] == 0:
                    break
                gap = 0
                for k in range(lineLenth):
                    # print(trainDataNp)
                    # print('k', k)
                    # print(theDifferCount[int(line[k])])
                    gap += abs(trainDataNp[j+k] - theDifferCount[int(line[k])])
                # print(gap)
                # If the gap is small enough, record this segment
                if(gap < 3000):
                    content = MyTrainData.trainData[i:i+15]
                    inputList = content.tolist()
                    subName = ''
                    for num in range(i+j, i+j+lineLenth+1):
                        subName = subName + ' ' + str(num)
                    oriSenListsmallZip.append({'name': subName, 'content': inputList[j:j+lineLenth]})
                    theDiffer = trainDataNp.tolist()
                    theDiffer = theDiffer[j:j+lineLenth]
                    print('last:', theDiffer)
                    differCountsmallZip.append(theDiffer)
                    # colorCountZip.append(DifferToColor(theDiffer))
                    counter += 1
                    break
            # print(counter)
            if(counter >= 5):
                print('end:', i)
                break
        trainDataZip.append({'emotion': 'neg', 'orderLine': line, 'oriSenListsmallZip': oriSenListsmallZip, 'differCountsmallZip': differCountsmallZip})
        # oriSenListZip.append({oriSenListsmallZip})
        # differCountZip.append(differCountsmallZip)
    return trainDataZip
    # return oriSenListZip, differCountZip  # ,colorCountZip
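# Shape of the returned list, for reference (field names taken from the code above;
# the concrete values depend on TrainDataDifferCovid.npy and are illustrative only):
#
#   [{'emotion': 'pos',                       # 'neg' for entries built from orderLine2
#     'orderLine': [...],                     # the query order line
#     'oriSenListsmallZip': [{'name': ' <matched indices>', 'content': [...]}, ...],
#     'differCountsmallZip': [[...], ...]},
#    ...]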
def GetOrderLineDetailCovid(orderLine1, orderLine2, oriSenList, differCount, localDifferCount, RNNModel):
    orderLineInfoZip = []
    meanImpZip = []
    allImpZip = []
    allLocImpZip = []
    meanLocImpZip = []
    res = []
    orderLines = orderLine1 + orderLine2
    for line in orderLines:
        count = 0
        impor = 0
        locImpor = 0
        ablationInd = []
        allImp = []
        allLocImp = []
        for ind in line:
            count += 1
            impor += differCount[int(ind)]
            locImpor += localDifferCount[int(ind)]
            allImp.append(differCount[int(ind)])
            allLocImp.append((-1)*localDifferCount[int(ind)])
            ablationInd.append(int(ind))
        allImpZip.append(allImp)
        allLocImpZip.append(allLocImp)
        meanImpZip.append(format(impor/count, '.4f'))
        meanLocImpZip.append(format((-1)*locImpor/count, '.4f'))
        senten = []
        for i in range(len(oriSenList)):
            if i not in ablationInd:
                senten.extend(oriSenList[i].split())
        # Pad the word list up to 15 tokens with '0'
        for i in range(15-len(senten)):
            senten.append('0')
        # res=MyPredict.GetRes(senten)!!
        res.append(RNNModel.Predict(' '.join(senten)))
    tableReorderValue = GetTableReorderValue(orderLines, oriSenList, RNNModel)
    print('tableReorderValue', tableReorderValue)
    for i in range(len(orderLines)):
        orderLineInfoZip.append({'id': i, 'order': orderLines[i], 'allImp': allImpZip[i], 'allLocImp': allLocImpZip[i], 'importance': meanImpZip[i], 'locImportance': meanLocImpZip[i], 'value': res[i], 'resValue': tableReorderValue[i]})
    return orderLineInfoZip
def GetTableReorderValue(orderLines, oriSenList, RNNModel):
    # Record, for each order line, the difference between the re-ordered result and the original one
    differCount = np.zeros(len(orderLines), dtype=float)
    senNum = len(oriSenList)
    batchSize = 24
    subNumCount1 = 0  # track which order line is being processed on each pass
    subNumCount2 = 0
    iterations = 10
    orderNums = len(orderLines)
    print('orderLines', orderLines)
    print('orderNums', orderNums)
    calTimes = 1
    for i in range(iterations+15):
        theWordCom = []
        for j in range(batchSize):
            theOrder = orderLines[subNumCount1]
            shufOrder = copy.deepcopy(theOrder)
            shuffle(shufOrder)
            # print('theOrder', theOrder)
            # print('shufOrder', shufOrder)
            theInd = []
            wordCom = []
            for k in range(senNum):
                theInd.append(k)
            # print('theInd', theInd)
            for k in range(len(theOrder)):
                # print('theOrder', theOrder[k])
                # print('shufOrder', shufOrder[k])
                theInd[int(theOrder[k])] = shufOrder[k]
            # print('theInd', theInd)
            for ind in theInd:
                wordCom.extend(oriSenList[int(ind)].split())
                # wordCom = wordCom + ' ' + oriSenList[int(ind)]
            # Pad the word list up to 15 tokens with '0'
            # (a separate loop variable is used so the outer i is not overwritten)
            for pad in range(15-len(wordCom)):
                wordCom.append('0')
            theWordCom.append(' '.join(wordCom))
            subNumCount1 += 1
            if(subNumCount1 >= orderNums):
                subNumCount1 = 0
                calTimes += 1
                if(i > iterations):
                    break
        res = []
        # res=sess.run(PreRNN.predicr,{PreRNN.inputData: comment})
        for com in theWordCom:
            res.append(RNNModel.Predict(com))
        # print(res)
        # Accumulate the differences
        for j in range(batchSize):
            differCount[subNumCount2] += res[j]
            subNumCount2 += 1
            if(subNumCount2 >= orderNums):
                subNumCount2 = 0
                calTimes += 1
                if(i > iterations):
                    break
        if(i > iterations and subNumCount2 == 0):
            break
    differCount = differCount/calTimes
    # Return the differences
    return differCount
def DifferToColor(differCount):
    maxProb = np.max(differCount)
    minProb = np.min(differCount)
    colorCount = []
    for i in range(len(differCount)):
        if(differCount[i] > 0):
            # Positive differences are mapped onto the blue channel (#8888xx)
            decNum = int(differCount[i]*127/maxProb)+127
            color = '#8888'+hex(decNum)[2:].zfill(2)
            colorCount.append(color)
        elif (differCount[i] < 0):
            # Negative differences are mapped onto the red channel (#xx8888)
            decNum = int(differCount[i]*127/minProb)+127
            color = '#'+hex(decNum)[2:].zfill(2)+'8888'
            colorCount.append(color)
        else:
            colorCount.append('#888888')
    return colorCount
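# Example (values chosen for illustration): positive differences shade towards blue,
# negative ones towards red, and zero stays grey.
#
#   DifferToColor([0.5, -0.25, 0])   # -> ['#8888fe', '#fe8888', '#888888']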
def InputToSenList(senten, mark=' mark! '):
    stripSpecialChars = re.compile("[^A-Za-z0-9 ]+")
    senten = senten.lower().replace('<br />', '')
    # print(senten)
    # Replace every punctuation run with the marker token, then split into words
    myinput = re.sub(stripSpecialChars, mark, senten)
    wordVec = myinput.split()
    markLoc = []
    markLoc.append(0)
    subSenList = []
    shiftNum = 0
    # Remove the marker tokens while recording where each sub-sentence ends
    for i in range(len(wordVec)):
        if wordVec[i-shiftNum] == 'mark!':
            markLoc.append(i-shiftNum)
            wordVec.pop(i-shiftNum)
            shiftNum += 1
    for i in range(len(markLoc)-1):
        subSenList.append(" ".join(wordVec[markLoc[i]:markLoc[i+1]]))
    return subSenList
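# Example: runs of punctuation delimit the sub-sentences.
#
#   InputToSenList("This is bad. Really bad!")   # -> ['this is bad', 'really bad']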
if __name__ == "__main__":
    myinput = "This is the worst movie ever made. Ever. It beats everything. I have never seen worse. Retire the trophy and give it to these people.....there's just no comparison.<br /><br />Even three days after watching this (for some reason I still don't know why) I cannot believe how insanely horrific this movie is/was. Its so bad. So far from anything that could be considered a movie, a story or anything that should have ever been created and brought into our existence.<br /><br />This made me question whether or not humans are truly put on this earth to do good. It made me feel disgusted with ourselves and our progress as a species in this universe. This type of movie sincerely hurts us as a society."
    # myinput=input("Input")
    # Note: CastToTrainData expects a list of importance differences as its second
    # argument, so passing the raw review text here looks like leftover test code.
    CastToTrainData('neg', myinput)