OrderVis/ReorderByGroupCovid.py

647 lines
16 KiB
Python
Raw Normal View History

2021-09-20 13:28:30 +08:00
import pandas as pd
import numpy as np
import tensorflow as tf
import keras
from sklearn.preprocessing import MinMaxScaler
import re
import copy
from random import shuffle
import math
#辅助函数
#把整个评论切割成子句 输出list
#myinput必须是string类型
def InputToSenList(senten,model):
mark=' mark! '
#使用正则表达式确定是否要切分
stripSpecialChars=re.compile("[^A-Za-z0-9 ]+")
#把大写字母改成小写字母
senten=senten.lower().replace('<br />','')
#print(senten)
#把所有的标点符号更换为mark
subSenList=[]
if model=='clause':
myinput=re.sub(stripSpecialChars,mark,senten)
#wordVec保存的是token即单词
wordVec=myinput.split()
#markLoc保存mark的位置这就是标点符号的位置作为切分子句的依据
markLoc=[]
markLoc.append(0)
shiftNum=0
for i in range(len(wordVec)):
if wordVec[i-shiftNum]=='mark!':
markLoc.append(i-shiftNum)
wordVec.pop(i-shiftNum)
shiftNum+=1
#按照标点符号划分子句把每个子句放入subSenList
for i in range(len(markLoc)-1):
subSenList.append(" ".join(wordVec[markLoc[i]:markLoc[i+1]]))
else:
myinput=re.sub(stripSpecialChars,' ',senten)
#wordVec保存的是token即单词
subSenList=myinput.split()
return subSenList
# def GetSenList(myinput,model='clause'):
# senList=[]
# #将string类型切割开
# tempList=myinput.split()
# if model=='word':
# senList=tempList
# else:
# senten = ''
# count = 0
# for number in tempList:
# senten += str(number)+' '
# count += 1
# if(count>=3):
# senList.append(senten)
# senten=''
# count=0
# if senten:
# senList.append(senten)
# return senList
#辅助函数
from random import randint
class ReorderByGroupCovid():
def __init__(self,myinput,RNNModel):
#拆分好的句子的list
self.senList=InputToSenList(myinput,'clause')
#拆分好的单词的list
self.wordList=InputToSenList(myinput,'word')
#根据选择的句子/单词 model 确定的list
#计算到的list长度
self.sentenSize=len(self.senList)
print('senlist',self.senList)
#定义好的RNN
# self.PreRNN=PredictRNN()
self.PreRNN=RNNModel
#原序列的预测值
#使用者的模型中必须有一个predict的方法
self.oriRes=self.PreRNN.Predict(' '.join(self.senList))
self.iterations=100
#设置循环的次数,建议是总的子句数量*整体循环次数
#按照output计算reorder 的影响大小
def ReorderByOutGlobal(self):
#记录每次重新排序以后和原来结果的差值
differCount=np.zeros(self.sentenSize,dtype=float)
subNumCount=0 #每次处理都记录这是第几个字句
iterations=self.iterations
calTimes=0
counter=0
while True:
comment=self.ChanInpSubByOne(subNumCount)
#res=sess.run(PreRNN.predicr,{PreRNN.inputData: comment})
res=self.PreRNN.Predict(comment)
#print(res)
differCount[subNumCount]+=(res-self.oriRes)
subNumCount+=1
counter+=1
if subNumCount>=self.sentenSize:
subNumCount=0
calTimes+=1
if counter>iterations:
break
differCount=differCount/calTimes
return differCount
def ChanInpSubByOne(self,subNumCount):
#记录所有的评论,这些评论是原来的文字
subSenList=[]
smallSenNum=subNumCount
bigSenNum=randint(0,self.sentenSize-1)
if(smallSenNum>bigSenNum):
temp=smallSenNum
smallSenNum=bigSenNum
bigSenNum=temp
for j in range(self.sentenSize):
if j==bigSenNum:
subSenList.append(self.senList[smallSenNum])
elif j>=smallSenNum and j<bigSenNum:
subSenList.append(self.senList[j+1])
else:
subSenList.append(self.senList[j])
fullSent=' '.join(subSenList)
return fullSent #,sentenList
def ChanTokenInDefinitedPart(self,wodNumCount,tokenLocInSub):
#记录所有的评论,这些评论是原来的文字
subSenList=copy.deepcopy(self.senList)
senLoc=tokenLocInSub[wodNumCount][0]
wordLoc=tokenLocInSub[wodNumCount][1]
subSenInLoc=subSenList[senLoc]
senInLocList=subSenInLoc.split()
ranWordLoc=randint(0,len(senInLocList)-1)
temp=senInLocList[wordLoc]
senInLocList[wordLoc]=senInLocList[ranWordLoc]
senInLocList[ranWordLoc]=temp
subSenList[senLoc]=' '.join(senInLocList)
fullSent=' '.join(subSenList)
return fullSent #,sentenList
def GetTokenImportance(self):
wordSize=len(self.wordList)
senDifferCount=np.zeros(self.sentenSize,dtype=float)
tokenDifferCount=np.zeros(wordSize,dtype=float)
tokenLocInSub=np.zeros([wordSize,2],dtype=int)
wordLoc=0
for i in range (len(self.senList)):
for j in range (len(self.senList[i].split())):
tokenLocInSub[wordLoc][0]=i
tokenLocInSub[wordLoc][1]=j
wordLoc+=1
iterations=self.iterations*4
wodNumCount=0
calTimes=0
counter=0
while True:
comment=self.ChanTokenInDefinitedPart(wodNumCount,tokenLocInSub)
res=self.PreRNN.Predict(comment)
tokenDifferCount[wodNumCount]+=(res-self.oriRes)
wodNumCount+=1
counter+=1
if wodNumCount>=wordSize:
wodNumCount=0
calTimes+=1
if counter>iterations:
break
tokenDifferCount=tokenDifferCount/calTimes
wordLoc=0
for i in range (len(self.senList)):
for j in range (len(self.senList[i].split())):
senDifferCount[i]+=tokenDifferCount[wordLoc]
wordLoc+=1
senDifferCount[i]/=len(self.senList[i].split())
return senDifferCount,tokenDifferCount,tokenLocInSub
def CalColor(self,percent,color):
theColor="#"
gray='c4d7d6'
blue='baccd9'
red='eeb8c3'
print('colorPercent',percent)
if color=='blue':
for i in range(3):
blueR=int(blue[i*2:i*2+2],16)
grayR=int(gray[i*2:i*2+2],16)
R=int((blueR-grayR)*percent)+grayR
theColor+=hex(R)[2:].zfill(2)
elif color=='red':
for i in range(3):
redR=int(red[i*2:i*2+2],16)
grayR=int(gray[i*2:i*2+2],16)
R=int((redR-grayR)*percent)+grayR
theColor+=hex(R)[2:].zfill(2)
return theColor
def GetImportanceByColor(self):
#计算global 的重要性
differCount=self.ReorderByOutGlobal()
differCount=differCount.tolist()
maxProb=np.max(differCount)
minProb=np.min(differCount)
maxMmin=abs(maxProb) if abs(maxProb)>abs(minProb) else abs(minProb)
if maxMmin==0:
maxMmin=1
colorCount=[]
for i in range(len(differCount)):
differCount[i]=(differCount[i])/maxMmin
if(differCount[i]>0):
colorCount.append(self.CalColor(differCount[i],'blue'))
elif (differCount[i]<0):
colorCount.append(self.CalColor(-differCount[i],'red'))
else:
colorCount.append('#b2b9b4')
self.colorCount=colorCount
self.differCount=differCount
globalDataZip={'differCount':differCount,'colorCount':colorCount}
#计算local重要性
senDifferCount,tokenDifferCount,tokenLocInSub=self.GetTokenImportance()
senDifferCount=senDifferCount.tolist()
localColorCount=[]
maxProb=np.max(tokenDifferCount)
minProb=np.min(tokenDifferCount)
maxMmin=abs(maxProb) if abs(maxProb)>abs(minProb) else abs(minProb)
if maxMmin==0:
maxMmin=1
for i in range(len(tokenDifferCount)):
tokenDifferCount[i]=(tokenDifferCount[i])/maxMmin
maxProb=np.max(senDifferCount)
minProb=np.min(senDifferCount)
maxMmin=abs(maxProb) if abs(maxProb)>abs(minProb) else abs(minProb)
if maxMmin==0:
maxMmin=1
for i in range(len(senDifferCount)):
senDifferCount[i]=(senDifferCount[i])/maxMmin
# senDifferCount[i]=(senDifferCount[i]-minProb)/maxMmin
# localColorCount.append(self.CalColor(senDifferCount[i],'red'))
if(senDifferCount[i]>0):
localColorCount.append(self.CalColor(senDifferCount[i],'blue'))
elif (senDifferCount[i]<0):
localColorCount.append(self.CalColor(-senDifferCount[i],'red'))
else:
localColorCount.append('#b2b9b4')
tokenColorZip,tokenDifferCountZip = self.GetTokenColorZip(tokenDifferCount,tokenLocInSub)
localDataZip={'localColorCount':localColorCount,'senDifferCount':senDifferCount,'tokenColorZip':tokenColorZip,'tokenDifferCountZip':tokenDifferCountZip}
maxProb=np.max(senDifferCount)
minProb=np.min(senDifferCount)
threshold = abs(maxProb) if abs(maxProb)>abs(minProb) else abs(minProb)
threshold*=0.8
localOrderLine,lineDiffer=self.GetLocalOrderLine(senDifferCount,threshold)
print('lineDiffer',lineDiffer)
lineDifferColor=[]
maxProb=np.max(lineDiffer)
minProb=np.min(lineDiffer)
for i in range(len(lineDiffer)):
lineDiffer[i]=(lineDiffer[i])/maxMmin
if(lineDiffer[i]>0):
lineDifferColor.append(self.CalColor(lineDiffer[i],'blue'))
elif (lineDiffer[i]<0):
lineDifferColor.append(self.CalColor(-lineDiffer[i],'red'))
else:
lineDifferColor.append('#b2b9b4')
orderLineZip={'localOrderLine':localOrderLine,'lineDiffer':lineDiffer,'lineDifferColor':lineDifferColor}
return globalDataZip,localDataZip,orderLineZip
def ShufflePatter(self,start,end):
#记录所有的评论,这些评论是原来的文字
#comment保存的评论每个字都是用字典中的数字组成的
subSenList=copy.deepcopy(self.senList)
subSenInLoc=' '.join(subSenList[start:end+1])
senInLocList=subSenInLoc.split()
shuffle(senInLocList)
fullSent=' '.join(subSenList[:start])+' '+' '.join(senInLocList)+' '+' '.join(subSenList[end+1:])
return fullSent
def GetLocalOrderLine(self,differCount,threshold):
critLoc=[]
orderLine=[]
lineDiffer=[]
for i in range(len(differCount)):
if(abs(differCount[i])>threshold):
critLoc.append(i)
print(critLoc)
for loc in range(len(critLoc)):
index=critLoc[loc]
front=-1
if loc>0:
front=critLoc[loc-1]
back=self.sentenSize
if loc<len(critLoc)-1:
back=critLoc[loc+1]
start=index
end=index
oneRes=None
for i in range(10):
comment=self.ShufflePatter(index,index)
if oneRes:
oneRes+=self.PreRNN.Predict(comment)
else:
oneRes=self.PreRNN.Predict(comment)
oneRes = oneRes/10
print('oneRes',oneRes)
theRes=oneRes
for froSen in range(index-1,front,-1):
res=None
for i in range(10):
comment=self.ShufflePatter(index,index)
if res:
res+=self.PreRNN.Predict(comment)
else:
res=self.PreRNN.Predict(comment)
res = res/10
if(abs(res-self.oriRes)<abs(oneRes-self.oriRes)):
break
start=froSen
for backSen in range(index+1,back):
res=None
for i in range(10):
comment=self.ShufflePatter(index,index)
if res:
res+=self.PreRNN.Predict(comment)
else:
res=self.PreRNN.Predict(comment)
res = res/10
if(abs(res-self.oriRes)<abs(oneRes-self.oriRes)):
break
end=backSen
theRes=res
theList=[]
for i in range(start,end+1):
theList.append(i)
orderLine.append(theList)
lineDiffer.append(theRes-self.oriRes)
print(orderLine)
return orderLine,lineDiffer
def GetTokenColorZip(self,tokenDifferCount,tokenLocInSub):
tokenColorZip=[]
differCountZip=[]
for i in range(self.sentenSize):
differCount=[]
for j in range(len(tokenLocInSub)):
if tokenLocInSub[j][0]==i:
differCount.append(tokenDifferCount[j])
tokenColor=[]
maxProb=np.max(differCount)
minProb=np.min(differCount)
for i in range(len(differCount)):
if(differCount[i]>0):
decNum=int(differCount[i]*127/maxProb)+127
color='#8888'+hex(decNum)[2:].zfill(2)
tokenColor.append(color)
elif (differCount[i]<0):
decNum=int(differCount[i]*127/minProb)+127
color='#'+hex(decNum)[2:].zfill(2)+'8888'
tokenColor.append(color)
else:
tokenColor.append('#888888')
tokenColorZip.append(tokenColor)
differCountZip.append(differCount)
return tokenColorZip,differCountZip
def DealDataZip(self,DataZip):
oriSenListZip=[]
colorCountZip=[]
differCountZip=[]
for data in DataZip:
oriSenList=InputToSenList(data)
oriSenListZip.append(oriSenList)
#另一方面传递给机器模型,让其进行预测
#res=Predict(oriSenList,rnnType)
colorCount,differCount=self.GetImportanceByColor()
colorCountZip.append(colorCount)
differCount=differCount.tolist()
differCountZip.append(differCount)
return oriSenListZip,colorCountZip,differCountZip
def GetDeatail(self):
comment=[]
for i in range(self.sentenSize):
subSenList=copy.deepcopy(self.senList)
theData=[]
for j in range(self.sentenSize):
if j==i:
theData.append(subSenList[i])
else:
theData.append('0 0 0')
fullSent=' '.join(theData)
# sentenList.append(fullSent)
comment.append(fullSent.split())
res=self.PreRNN.GetRes(comment)
return res
if __name__ == "__main__":
myinput="44210 50393 43088 31169 23567 27393 34057 36073 47778 41062 34351 34428 39507 39018 45137 49284 42159 38415 51972 "
# myinput=input("输入")
MyReorder=ReorderByGroupCovid(myinput,'clause','judge')
oriSenList=MyReorder.senList
#先将当前的输入转化为list一方面作为list传递给前端
#另一方面传递给机器模型,让其进行预测
res=MyReorder.oriRes
globalDataZip,localDataZip,orderLineZip = MyReorder.GetImportanceByColor()