OrderVis/ReorderByGroupCovid.py

647 lines
16 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import pandas as pd
import numpy as np
import tensorflow as tf
import keras
from sklearn.preprocessing import MinMaxScaler
import re
import copy
from random import shuffle
import math
#辅助函数
#把整个评论切割成子句 输出list
#myinput必须是string类型
def InputToSenList(senten,model):
mark=' mark! '
#使用正则表达式确定是否要切分
stripSpecialChars=re.compile("[^A-Za-z0-9 ]+")
#把大写字母改成小写字母
senten=senten.lower().replace('<br />','')
#print(senten)
#把所有的标点符号更换为mark
subSenList=[]
if model=='clause':
myinput=re.sub(stripSpecialChars,mark,senten)
#wordVec保存的是token即单词
wordVec=myinput.split()
#markLoc保存mark的位置这就是标点符号的位置作为切分子句的依据
markLoc=[]
markLoc.append(0)
shiftNum=0
for i in range(len(wordVec)):
if wordVec[i-shiftNum]=='mark!':
markLoc.append(i-shiftNum)
wordVec.pop(i-shiftNum)
shiftNum+=1
#按照标点符号划分子句把每个子句放入subSenList
for i in range(len(markLoc)-1):
subSenList.append(" ".join(wordVec[markLoc[i]:markLoc[i+1]]))
else:
myinput=re.sub(stripSpecialChars,' ',senten)
#wordVec保存的是token即单词
subSenList=myinput.split()
return subSenList
# def GetSenList(myinput,model='clause'):
# senList=[]
# #将string类型切割开
# tempList=myinput.split()
# if model=='word':
# senList=tempList
# else:
# senten = ''
# count = 0
# for number in tempList:
# senten += str(number)+' '
# count += 1
# if(count>=3):
# senList.append(senten)
# senten=''
# count=0
# if senten:
# senList.append(senten)
# return senList
#辅助函数
from random import randint
class ReorderByGroupCovid():
def __init__(self,myinput,RNNModel):
#拆分好的句子的list
self.senList=InputToSenList(myinput,'clause')
#拆分好的单词的list
self.wordList=InputToSenList(myinput,'word')
#根据选择的句子/单词 model 确定的list
#计算到的list长度
self.sentenSize=len(self.senList)
print('senlist',self.senList)
#定义好的RNN
# self.PreRNN=PredictRNN()
self.PreRNN=RNNModel
#原序列的预测值
#使用者的模型中必须有一个predict的方法
self.oriRes=self.PreRNN.Predict(' '.join(self.senList))
self.iterations=100
#设置循环的次数,建议是总的子句数量*整体循环次数
#按照output计算reorder 的影响大小
def ReorderByOutGlobal(self):
#记录每次重新排序以后和原来结果的差值
differCount=np.zeros(self.sentenSize,dtype=float)
subNumCount=0 #每次处理都记录这是第几个字句
iterations=self.iterations
calTimes=0
counter=0
while True:
comment=self.ChanInpSubByOne(subNumCount)
#res=sess.run(PreRNN.predicr,{PreRNN.inputData: comment})
res=self.PreRNN.Predict(comment)
#print(res)
differCount[subNumCount]+=(res-self.oriRes)
subNumCount+=1
counter+=1
if subNumCount>=self.sentenSize:
subNumCount=0
calTimes+=1
if counter>iterations:
break
differCount=differCount/calTimes
return differCount
def ChanInpSubByOne(self,subNumCount):
#记录所有的评论,这些评论是原来的文字
subSenList=[]
smallSenNum=subNumCount
bigSenNum=randint(0,self.sentenSize-1)
if(smallSenNum>bigSenNum):
temp=smallSenNum
smallSenNum=bigSenNum
bigSenNum=temp
for j in range(self.sentenSize):
if j==bigSenNum:
subSenList.append(self.senList[smallSenNum])
elif j>=smallSenNum and j<bigSenNum:
subSenList.append(self.senList[j+1])
else:
subSenList.append(self.senList[j])
fullSent=' '.join(subSenList)
return fullSent #,sentenList
def ChanTokenInDefinitedPart(self,wodNumCount,tokenLocInSub):
#记录所有的评论,这些评论是原来的文字
subSenList=copy.deepcopy(self.senList)
senLoc=tokenLocInSub[wodNumCount][0]
wordLoc=tokenLocInSub[wodNumCount][1]
subSenInLoc=subSenList[senLoc]
senInLocList=subSenInLoc.split()
ranWordLoc=randint(0,len(senInLocList)-1)
temp=senInLocList[wordLoc]
senInLocList[wordLoc]=senInLocList[ranWordLoc]
senInLocList[ranWordLoc]=temp
subSenList[senLoc]=' '.join(senInLocList)
fullSent=' '.join(subSenList)
return fullSent #,sentenList
def GetTokenImportance(self):
wordSize=len(self.wordList)
senDifferCount=np.zeros(self.sentenSize,dtype=float)
tokenDifferCount=np.zeros(wordSize,dtype=float)
tokenLocInSub=np.zeros([wordSize,2],dtype=int)
wordLoc=0
for i in range (len(self.senList)):
for j in range (len(self.senList[i].split())):
tokenLocInSub[wordLoc][0]=i
tokenLocInSub[wordLoc][1]=j
wordLoc+=1
iterations=self.iterations*4
wodNumCount=0
calTimes=0
counter=0
while True:
comment=self.ChanTokenInDefinitedPart(wodNumCount,tokenLocInSub)
res=self.PreRNN.Predict(comment)
tokenDifferCount[wodNumCount]+=(res-self.oriRes)
wodNumCount+=1
counter+=1
if wodNumCount>=wordSize:
wodNumCount=0
calTimes+=1
if counter>iterations:
break
tokenDifferCount=tokenDifferCount/calTimes
wordLoc=0
for i in range (len(self.senList)):
for j in range (len(self.senList[i].split())):
senDifferCount[i]+=tokenDifferCount[wordLoc]
wordLoc+=1
senDifferCount[i]/=len(self.senList[i].split())
return senDifferCount,tokenDifferCount,tokenLocInSub
def CalColor(self,percent,color):
theColor="#"
gray='c4d7d6'
blue='baccd9'
red='eeb8c3'
print('colorPercent',percent)
if color=='blue':
for i in range(3):
blueR=int(blue[i*2:i*2+2],16)
grayR=int(gray[i*2:i*2+2],16)
R=int((blueR-grayR)*percent)+grayR
theColor+=hex(R)[2:].zfill(2)
elif color=='red':
for i in range(3):
redR=int(red[i*2:i*2+2],16)
grayR=int(gray[i*2:i*2+2],16)
R=int((redR-grayR)*percent)+grayR
theColor+=hex(R)[2:].zfill(2)
return theColor
def GetImportanceByColor(self):
#计算global 的重要性
differCount=self.ReorderByOutGlobal()
differCount=differCount.tolist()
maxProb=np.max(differCount)
minProb=np.min(differCount)
maxMmin=abs(maxProb) if abs(maxProb)>abs(minProb) else abs(minProb)
if maxMmin==0:
maxMmin=1
colorCount=[]
for i in range(len(differCount)):
differCount[i]=(differCount[i])/maxMmin
if(differCount[i]>0):
colorCount.append(self.CalColor(differCount[i],'blue'))
elif (differCount[i]<0):
colorCount.append(self.CalColor(-differCount[i],'red'))
else:
colorCount.append('#b2b9b4')
self.colorCount=colorCount
self.differCount=differCount
globalDataZip={'differCount':differCount,'colorCount':colorCount}
#计算local重要性
senDifferCount,tokenDifferCount,tokenLocInSub=self.GetTokenImportance()
senDifferCount=senDifferCount.tolist()
localColorCount=[]
maxProb=np.max(tokenDifferCount)
minProb=np.min(tokenDifferCount)
maxMmin=abs(maxProb) if abs(maxProb)>abs(minProb) else abs(minProb)
if maxMmin==0:
maxMmin=1
for i in range(len(tokenDifferCount)):
tokenDifferCount[i]=(tokenDifferCount[i])/maxMmin
maxProb=np.max(senDifferCount)
minProb=np.min(senDifferCount)
maxMmin=abs(maxProb) if abs(maxProb)>abs(minProb) else abs(minProb)
if maxMmin==0:
maxMmin=1
for i in range(len(senDifferCount)):
senDifferCount[i]=(senDifferCount[i])/maxMmin
# senDifferCount[i]=(senDifferCount[i]-minProb)/maxMmin
# localColorCount.append(self.CalColor(senDifferCount[i],'red'))
if(senDifferCount[i]>0):
localColorCount.append(self.CalColor(senDifferCount[i],'blue'))
elif (senDifferCount[i]<0):
localColorCount.append(self.CalColor(-senDifferCount[i],'red'))
else:
localColorCount.append('#b2b9b4')
tokenColorZip,tokenDifferCountZip = self.GetTokenColorZip(tokenDifferCount,tokenLocInSub)
localDataZip={'localColorCount':localColorCount,'senDifferCount':senDifferCount,'tokenColorZip':tokenColorZip,'tokenDifferCountZip':tokenDifferCountZip}
maxProb=np.max(senDifferCount)
minProb=np.min(senDifferCount)
threshold = abs(maxProb) if abs(maxProb)>abs(minProb) else abs(minProb)
threshold*=0.8
localOrderLine,lineDiffer=self.GetLocalOrderLine(senDifferCount,threshold)
print('lineDiffer',lineDiffer)
lineDifferColor=[]
maxProb=np.max(lineDiffer)
minProb=np.min(lineDiffer)
for i in range(len(lineDiffer)):
lineDiffer[i]=(lineDiffer[i])/maxMmin
if(lineDiffer[i]>0):
lineDifferColor.append(self.CalColor(lineDiffer[i],'blue'))
elif (lineDiffer[i]<0):
lineDifferColor.append(self.CalColor(-lineDiffer[i],'red'))
else:
lineDifferColor.append('#b2b9b4')
orderLineZip={'localOrderLine':localOrderLine,'lineDiffer':lineDiffer,'lineDifferColor':lineDifferColor}
return globalDataZip,localDataZip,orderLineZip
def ShufflePatter(self,start,end):
#记录所有的评论,这些评论是原来的文字
#comment保存的评论每个字都是用字典中的数字组成的
subSenList=copy.deepcopy(self.senList)
subSenInLoc=' '.join(subSenList[start:end+1])
senInLocList=subSenInLoc.split()
shuffle(senInLocList)
fullSent=' '.join(subSenList[:start])+' '+' '.join(senInLocList)+' '+' '.join(subSenList[end+1:])
return fullSent
def GetLocalOrderLine(self,differCount,threshold):
critLoc=[]
orderLine=[]
lineDiffer=[]
for i in range(len(differCount)):
if(abs(differCount[i])>threshold):
critLoc.append(i)
print(critLoc)
for loc in range(len(critLoc)):
index=critLoc[loc]
front=-1
if loc>0:
front=critLoc[loc-1]
back=self.sentenSize
if loc<len(critLoc)-1:
back=critLoc[loc+1]
start=index
end=index
oneRes=None
for i in range(10):
comment=self.ShufflePatter(index,index)
if oneRes:
oneRes+=self.PreRNN.Predict(comment)
else:
oneRes=self.PreRNN.Predict(comment)
oneRes = oneRes/10
print('oneRes',oneRes)
theRes=oneRes
for froSen in range(index-1,front,-1):
res=None
for i in range(10):
comment=self.ShufflePatter(index,index)
if res:
res+=self.PreRNN.Predict(comment)
else:
res=self.PreRNN.Predict(comment)
res = res/10
if(abs(res-self.oriRes)<abs(oneRes-self.oriRes)):
break
start=froSen
for backSen in range(index+1,back):
res=None
for i in range(10):
comment=self.ShufflePatter(index,index)
if res:
res+=self.PreRNN.Predict(comment)
else:
res=self.PreRNN.Predict(comment)
res = res/10
if(abs(res-self.oriRes)<abs(oneRes-self.oriRes)):
break
end=backSen
theRes=res
theList=[]
for i in range(start,end+1):
theList.append(i)
orderLine.append(theList)
lineDiffer.append(theRes-self.oriRes)
print(orderLine)
return orderLine,lineDiffer
def GetTokenColorZip(self,tokenDifferCount,tokenLocInSub):
tokenColorZip=[]
differCountZip=[]
for i in range(self.sentenSize):
differCount=[]
for j in range(len(tokenLocInSub)):
if tokenLocInSub[j][0]==i:
differCount.append(tokenDifferCount[j])
tokenColor=[]
maxProb=np.max(differCount)
minProb=np.min(differCount)
for i in range(len(differCount)):
if(differCount[i]>0):
decNum=int(differCount[i]*127/maxProb)+127
color='#8888'+hex(decNum)[2:].zfill(2)
tokenColor.append(color)
elif (differCount[i]<0):
decNum=int(differCount[i]*127/minProb)+127
color='#'+hex(decNum)[2:].zfill(2)+'8888'
tokenColor.append(color)
else:
tokenColor.append('#888888')
tokenColorZip.append(tokenColor)
differCountZip.append(differCount)
return tokenColorZip,differCountZip
def DealDataZip(self,DataZip):
oriSenListZip=[]
colorCountZip=[]
differCountZip=[]
for data in DataZip:
oriSenList=InputToSenList(data)
oriSenListZip.append(oriSenList)
#另一方面传递给机器模型,让其进行预测
#res=Predict(oriSenList,rnnType)
colorCount,differCount=self.GetImportanceByColor()
colorCountZip.append(colorCount)
differCount=differCount.tolist()
differCountZip.append(differCount)
return oriSenListZip,colorCountZip,differCountZip
def GetDeatail(self):
comment=[]
for i in range(self.sentenSize):
subSenList=copy.deepcopy(self.senList)
theData=[]
for j in range(self.sentenSize):
if j==i:
theData.append(subSenList[i])
else:
theData.append('0 0 0')
fullSent=' '.join(theData)
# sentenList.append(fullSent)
comment.append(fullSent.split())
res=self.PreRNN.GetRes(comment)
return res
if __name__ == "__main__":
myinput="44210 50393 43088 31169 23567 27393 34057 36073 47778 41062 34351 34428 39507 39018 45137 49284 42159 38415 51972 "
# myinput=input("输入")
MyReorder=ReorderByGroupCovid(myinput,'clause','judge')
oriSenList=MyReorder.senList
#先将当前的输入转化为list一方面作为list传递给前端
#另一方面传递给机器模型,让其进行预测
res=MyReorder.oriRes
globalDataZip,localDataZip,orderLineZip = MyReorder.GetImportanceByColor()