# OrderVis/ReorderByGroup.py
# NOTE: GitHub page chrome ("822 lines / 22 KiB / Python / Raw Blame History"
# and the ambiguous-Unicode banner) was a scraping artifact, not source code,
# and has been removed so the file parses.
import numpy as np
import tensorflow as tf
import re
import copy
from random import shuffle
import math
# wordsList=np.load('./VectorList/wordsList.npy')
# wordsList = wordsList.tolist() #Originally loaded as numpy array
# wordsList = [word.decode('UTF-8') for word in wordsList] #Encode words as UTF-8
#Helper functions
#Split a whole review into sub-sentences (clauses); returns a list
def InputToSenList(senten,model):
    """Tokenize a review into clauses or words.

    With model=='clause', every run of punctuation becomes a clause boundary
    and a list of clause strings is returned (text after the final boundary
    is dropped, so input should end with punctuation). With any other model,
    punctuation is stripped and a flat list of lowercase words is returned.
    """
    mark=' mark! '
    # Any run of characters that is not a letter, digit, or space.
    stripSpecialChars=re.compile("[^A-Za-z0-9 ]+")
    # Lowercase and strip HTML line breaks.
    senten=senten.lower().replace('<br />','')
    if model=='clause':
        # Replace every punctuation run with a sentinel token, then record
        # where each sentinel lands (as an index into the cleaned stream)
        # while filtering the sentinels out in a single pass.
        tokens=re.sub(stripSpecialChars,mark,senten).split()
        boundaries=[0]
        cleaned=[]
        for tok in tokens:
            if tok=='mark!':
                boundaries.append(len(cleaned))
            else:
                cleaned.append(tok)
        # Each clause is the span between two consecutive boundaries; the
        # tail after the last boundary is intentionally not emitted.
        return [' '.join(cleaned[a:b]) for a,b in zip(boundaries,boundaries[1:])]
    # Word mode: punctuation becomes whitespace, then split on whitespace.
    return re.sub(stripSpecialChars,' ',senten).split()
#Convert a list of sub-sentences into a word-index matrix for the model
def ListToVecComment(tempSenList):
    """Encode the joined sub-sentences as a (batchSize, maxSeqLength) matrix
    of vocabulary indices; only row 0 is filled, the rest stay zero.
    Unknown words map to index 399999; encoding stops after 250 tokens.
    """
    global wordsList
    comment=np.zeros([batchSize,maxSeqLength])
    # Walk the words of the full re-joined text, writing each word's
    # dictionary index into row 0 of the matrix.
    for counter,word in enumerate(' '.join(tempSenList).split()):
        if counter==250:
            break
        try:
            comment[0][counter]=wordsList.index(word)
        except Exception:
            # Out-of-vocabulary sentinel index.
            comment[0][counter]=399999
    return comment
def CountWordNum(strs):
    """Return the number of space-separated words in `strs`.

    A non-empty string has one more word than it has space characters.
    Bug fix: the original scanned characters starting from count=1 and so
    returned 1 for an empty string; an empty string now yields 0.
    """
    if not strs:
        return 0
    # One C-level pass instead of a per-character Python loop.
    return strs.count(' ')+1
def GetSenList(myinput,model='clause'):
    """Split `myinput` into clauses (model='clause') or words (model='word').

    Bug fix: the original called InputToSenList(myinput) without its required
    second argument, raising TypeError on every call; the clause mode is now
    passed explicitly.
    """
    # Append punctuation so the trailing clause is terminated (InputToSenList
    # drops any text after the last punctuation mark).
    myinput+='. '
    # Split into clauses first.
    senList=InputToSenList(myinput,'clause')
    # For word mode, re-join the clauses and split on whitespace.
    if(model=='word'):
        fullSent=' '.join(senList)
        senList=fullSent.split()
    return senList
maxSeqLength=250        # max tokens encoded per review (encoding loops stop at 250)
batchSize = 24          # rows per batch fed to the model
lstmUnits = 64          # LSTM hidden units (model hyperparameter)
numClasses = 2          # binary sentiment classes
numDimensions = 50 #Dimensions for each word vector
iterations = 1000 #100000
learnRate=0
#Helper imports
from random import randint
import keras
class ReorderByGroup():
    """Interpretability by reordering.

    Computes local and global interpretability scores for an RNN text model
    by perturbing the order of clauses/words and measuring how far the
    prediction drifts from the original, and extracts "order lines" (runs of
    order-sensitive clauses) from the local scores.
    """
    def __init__(self,myinput,RNNModel):
        # List of clauses split from the raw review text.
        self.senList=InputToSenList(myinput,'clause')
        print('senlist',self.senList)
        # Flat list of words from the same text.
        self.wordList=InputToSenList(myinput,'word')
        print('wordList',self.wordList)
        # Number of clauses.
        self.sentenSize=len(self.senList)
        # User-supplied RNN wrapper; it must expose a Predict() method
        # (and, for GetDeatail, a GetRes() method).
        self.PreRNN=RNNModel
        # Baseline prediction for the original clause order.
        self.oriRes=self.PreRNN.Predict(' '.join(self.senList))
        # Loop budget; suggested value is clause count * full passes.
        self.iterations=100
    def ChanCompo(self,loc):
        """Build a batch of reviews where the clause at `loc` is swapped with
        a random other clause; `loc` cycles forward once per batch row.

        NOTE(review): reads self.chosenList, which is never assigned in this
        file — presumably set by a caller before use; verify.
        """
        global wordsList
        comment=np.zeros([batchSize,maxSeqLength])
        # Each row of `comment` is one perturbed review encoded as
        # dictionary word indices.
        for i in range(batchSize):
            counter=0
            # Deep copy: a shallow copy would alias and mutate the source list.
            subSenList=copy.deepcopy(self.chosenList)
            subSenSize=len(subSenList)-1
            smallSenNum=loc
            bigSenNum=randint(0,subSenSize)
            # Swap the clause at `loc` with a randomly chosen clause.
            try:
                temp=subSenList[smallSenNum]
                subSenList[smallSenNum]=subSenList[bigSenNum]
                subSenList[bigSenNum]=temp
            except:
                # NOTE(review): bare except; logs the failing index only.
                print(smallSenNum,' !!! ',subSenList[smallSenNum])
            loc+=1
            if(loc>subSenSize):
                loc=0
            # Re-join the reordered clauses and encode word by word.
            fullSent=' '.join(subSenList)
            for word in fullSent.split():
                try:
                    comment[i][counter]=wordsList.index(word)
                except Exception:
                    # Out-of-vocabulary sentinel index.
                    comment[i][counter]=399999
                counter+=1
                if counter==250:
                    break
        return comment
    def ChanCompoInTokenRandom(self,loc):
        """Build a batch where the internal word order of the clause at `loc`
        is shuffled (to measure that clause's order importance); `loc`
        cycles forward once per batch row.

        NOTE(review): also reads self.chosenList (set elsewhere) — verify.
        """
        global wordsList
        comment=np.zeros([batchSize,maxSeqLength])
        # Each row is one perturbed review encoded as dictionary indices.
        for i in range(batchSize):
            counter=0
            subSenList=copy.deepcopy(self.chosenList)
            subSenSize=len(subSenList)-1
            # Select the clause and destroy its internal word order.
            subSenInLoc=subSenList[loc]
            senInLocList=subSenInLoc.split()
            shuffle(senInLocList)
            subSenList[loc]=' '.join(senInLocList)
            loc+=1
            if(loc>subSenSize):
                loc=0
            fullSent=' '.join(subSenList)
            for word in fullSent.split():
                try:
                    comment[i][counter]=wordsList.index(word)
                except Exception:
                    comment[i][counter]=399999
                counter+=1
                if counter==250:
                    break
        return comment
    def ChanInpSubByOne(self,subNumCount):
        """Return the review text with clause `subNumCount` moved to a random
        position (the clauses in between shift by one to fill the gap)."""
        subSenList=[]
        smallSenNum=subNumCount
        bigSenNum=randint(0,self.sentenSize-1)
        # Normalize so smallSenNum <= bigSenNum.
        if(smallSenNum>bigSenNum):
            temp=smallSenNum
            smallSenNum=bigSenNum
            bigSenNum=temp
        # Rebuild the clause list: the clause at smallSenNum is re-inserted
        # at bigSenNum, and everything between shifts left by one.
        for j in range(self.sentenSize):
            if j==bigSenNum:
                subSenList.append(self.senList[smallSenNum])
            elif j>=smallSenNum and j<bigSenNum:
                subSenList.append(self.senList[j+1])
            else:
                subSenList.append(self.senList[j])
        fullSent=' '.join(subSenList)
        return fullSent
    def ChanTokenInDefinitedPart(self,wodNumCount,tokenLocInSub):
        """Return the review text with word number `wodNumCount` swapped with
        a random word inside its own clause.

        tokenLocInSub[w] = (clause index, word index within that clause).
        """
        subSenList=copy.deepcopy(self.senList)
        senLoc=tokenLocInSub[wodNumCount][0]
        wordLoc=tokenLocInSub[wodNumCount][1]
        subSenInLoc=subSenList[senLoc]
        senInLocList=subSenInLoc.split()
        # Pick a random partner position inside the same clause and swap.
        ranWordLoc=randint(0,len(senInLocList)-1)
        temp=senInLocList[wordLoc]
        senInLocList[wordLoc]=senInLocList[ranWordLoc]
        senInLocList[ranWordLoc]=temp
        subSenList[senLoc]=' '.join(senInLocList)
        fullSent=' '.join(subSenList)
        return fullSent
    def GetTokenImportance(self):
        """Compute the local importance of every word, then each clause's
        local importance as the mean of its words' importances.

        Returns (senDifferCount, tokenDifferCount, tokenLocInSub).
        """
        wordSize=len(self.wordList)
        # Per-clause local importance.
        senDifferCount=np.zeros(self.sentenSize,dtype=float)
        # Per-word local importance.
        tokenDifferCount=np.zeros(wordSize,dtype=float)
        # tokenLocInSub[w] = (clause index, position inside that clause).
        tokenLocInSub=np.zeros([wordSize,2],dtype=int)
        wordLoc=0
        # Map every word to its clause and its position within it.
        for i in range (len(self.senList)):
            for j in range (len(self.senList[i].split())):
                tokenLocInSub[wordLoc][0]=i
                tokenLocInSub[wordLoc][1]=j
                wordLoc+=1
        # Words are finer-grained than clauses, so use a 4x loop budget.
        iterations=self.iterations*4
        wodNumCount=0
        calTimes=0
        counter=0
        while True:
            # Perturb word `wodNumCount` and measure the mean absolute
            # prediction drift from the baseline.
            comment=self.ChanTokenInDefinitedPart(wodNumCount,tokenLocInSub)
            res=self.PreRNN.Predict(comment)
            calDif=0
            for i in range(len(res)):
                calDif+=abs(res[i]-self.oriRes[i])
            calDif/=len(res)
            tokenDifferCount[wodNumCount]+=calDif
            wodNumCount+=1
            counter+=1
            # Completed one full pass over all words.
            if wodNumCount>=wordSize:
                wodNumCount=0
                calTimes+=1
            if counter>iterations:
                break
        # Average the accumulated drift over the completed passes.
        # NOTE(review): if iterations < wordSize, calTimes stays 0 and this
        # divides by zero — confirm callers keep iterations large enough.
        tokenDifferCount=tokenDifferCount/calTimes
        # Sum each clause's word importances, then average per clause.
        wordLoc=0
        for i in range (len(self.senList)):
            for j in range (len(self.senList[i].split())):
                senDifferCount[i]+=tokenDifferCount[wordLoc]
                wordLoc+=1
            senDifferCount[i]/=len(self.senList[i].split())
        return senDifferCount,tokenDifferCount,tokenLocInSub
    #Measure the global effect of reordering via the model output
    def ReorderByOutGlobal(self):
        """Global importance: for each clause, repeatedly move it to a random
        position and average the absolute prediction drift."""
        # Accumulated drift per clause across passes.
        differCount=np.zeros(self.sentenSize,dtype=float)
        subNumCount=0  # which clause is being perturbed this step
        iterations=self.iterations
        calTimes=0
        counter=0
        while True:
            comment=self.ChanInpSubByOne(subNumCount)
            res=self.PreRNN.Predict(comment)
            calDif=0
            print('res',res)
            print('oriRes',self.oriRes)
            # Mean absolute drift from the baseline prediction.
            for i in range(len(res)):
                calDif+=abs(res[i]-self.oriRes[i])
            calDif/=len(res)
            print('differcount',differCount[subNumCount])
            print('caldif',calDif)
            differCount[subNumCount]+=calDif
            subNumCount+=1
            counter+=1
            # One full pass over all clauses completed.
            if subNumCount>=self.sentenSize:
                subNumCount=0
                calTimes+=1
            if counter>iterations:
                break
        # Average over completed passes.
        differCount=differCount/calTimes
        return differCount
    def CalColor(self,percent,color):
        """Return a hex color interpolated `percent` of the way from a neutral
        gray toward blue or red (per-channel linear interpolation)."""
        theColor="#"
        gray='c4d7d6'
        blue='baccd9'
        red='eeb8c3'
        print('colorPercent',percent)
        if color=='blue':
            # Blend gray -> blue.
            for i in range(3):
                # Extract the channel's hex pair and convert to decimal.
                blueR=int(blue[i*2:i*2+2],16)
                grayR=int(gray[i*2:i*2+2],16)
                # Channel value = gray + percent of the (blue - gray) gap.
                R=int((blueR-grayR)*percent)+grayR
                theColor+=hex(R)[2:].zfill(2)
        elif color=='red':
            # Blend gray -> red.
            for i in range(3):
                redR=int(red[i*2:i*2+2],16)
                grayR=int(gray[i*2:i*2+2],16)
                R=int((redR-grayR)*percent)+grayR
                theColor+=hex(R)[2:].zfill(2)
        return theColor
    def GetImportanceByColor(self):
        """Compute global and local importances, map them to colors, and
        extract order lines.

        Returns (globalDataZip, localDataZip, orderLineZip) dictionaries.
        """
        # --- Global importance ---
        differCount=self.ReorderByOutGlobal()
        differCount=differCount.tolist()
        maxProb=np.max(differCount)
        minProb=np.min(differCount)
        colorCount=[]
        for i in range(len(differCount)):
            # Sign selects the color family; magnitude (as a fraction of the
            # extreme) selects the blend percentage.
            # NOTE(review): differCount is a mean of absolute values, so the
            # negative branch looks unreachable — confirm intent.
            if(differCount[i]>0):
                colorCount.append(self.CalColor(differCount[i]/maxProb,'blue'))
            elif (differCount[i]<0):
                colorCount.append(self.CalColor(differCount[i]/minProb,'red'))
            else:
                colorCount.append('#b2b9b4')
        self.colorCount=colorCount
        self.differCount=differCount
        print(colorCount)
        globalDataZip={'differCount':differCount,'colorCount':colorCount}
        # --- Local importance ---
        senDifferCount,tokenDifferCount,tokenLocInSub=self.GetTokenImportance()
        senDifferCount=senDifferCount.tolist()
        localColorCount=[]
        maxProb=np.max(senDifferCount)
        minProb=np.min(senDifferCount)
        for i in range(len(senDifferCount)):
            if(senDifferCount[i]>0):
                localColorCount.append(self.CalColor(senDifferCount[i]/maxProb,'blue'))
            elif (senDifferCount[i]<0):
                localColorCount.append(self.CalColor(senDifferCount[i]/minProb,'red'))
            else:
                localColorCount.append('#b2b9b4')
        tokenColorZip,tokenDifferCountZip = self.GetTokenColorZip(tokenDifferCount,tokenLocInSub)
        localDataZip={'localColorCount':localColorCount,'senDifferCount':senDifferCount,'tokenColorZip':tokenColorZip,'tokenDifferCountZip':tokenDifferCountZip}
        # Threshold = half the largest absolute local importance.
        threshold = abs(maxProb) if abs(maxProb)>abs(minProb) else abs(minProb)
        threshold*=0.5
        # --- Order-line extraction from local importances ---
        localOrderLine,lineDiffer=self.GetLocalOrderLine(senDifferCount,threshold)
        print('localOrderLine',localOrderLine)
        print('lineDiffer',lineDiffer)
        lineDifferColor=[]
        maxProb=np.max(lineDiffer)
        minProb=np.min(lineDiffer)
        for i in range(len(lineDiffer)):
            if(lineDiffer[i]>0):
                lineDifferColor.append(self.CalColor(lineDiffer[i]/maxProb,'blue'))
            elif (lineDiffer[i]<0):
                lineDifferColor.append(self.CalColor(lineDiffer[i]/minProb,'red'))
            else:
                lineDifferColor.append('#b2b9b4')
        orderLineZip={'localOrderLine':localOrderLine,'lineDiffer':lineDiffer,'lineDifferColor':lineDifferColor}
        return globalDataZip,localDataZip,orderLineZip
    def ShufflePatter(self,start,end):
        """Return the review text with the words of clauses start..end
        (inclusive) shuffled together; clauses outside the span are kept."""
        subSenList=copy.deepcopy(self.senList)
        # Merge the span into one word list and shuffle it.
        subSenInLoc=' '.join(subSenList[start:end+1])
        senInLocList=subSenInLoc.split()
        shuffle(senInLocList)
        fullSent=' '.join(subSenList[:start])+' '+' '.join(senInLocList)+' '+' '.join(subSenList[end+1:])
        return fullSent
    def GetLocalOrderLine(self,differCount,threshold):
        """Extract order lines: around each clause whose |importance| exceeds
        `threshold`, grow a span of clauses while shuffling still causes at
        least as much drift as shuffling the seed clause alone.

        Returns (orderLine, lineDiffer): the clause-index spans and each
        span's measured drift.
        """
        critLoc=[]
        orderLine=[]
        lineDiffer=[]
        # Indices of clauses above the importance threshold (seed clauses).
        for i in range(len(differCount)):
            if(abs(differCount[i])>threshold):
                critLoc.append(i)
        print(critLoc)
        for loc in range(len(critLoc)):
            index=critLoc[loc]
            # Growth limits: the previous/next critical clause (exclusive).
            front=-1
            if loc>0:
                front=critLoc[loc-1]
            back=self.sentenSize
            if loc<len(critLoc)-1:
                back=critLoc[loc+1]
            start=index
            end=index
            # Baseline drift: shuffle only the seed clause, averaged over 10
            # trials. (Inner `for i` shadows the trial counter; harmless but
            # confusing.)
            oneGap=0
            for i in range(10):
                comment=self.ShufflePatter(index,index)
                res=self.PreRNN.Predict(comment)
                calDif=0
                for i in range(len(res)):
                    calDif+=abs(res[i]-self.oriRes[i])
                calDif/=len(res)
                oneGap+= calDif
            oneGap = oneGap/10
            theGap=oneGap
            # Grow the span backward while the shuffled drift stays >= baseline.
            # NOTE(review): ShufflePatter(index,index) here and in the forward
            # loop re-measures only the seed clause — (froSen,index) /
            # (index,backSen) were probably intended; confirm before changing.
            for froSen in range(index-1,front,-1):
                resGap=0
                for i in range(10):
                    comment=self.ShufflePatter(index,index)
                    res=self.PreRNN.Predict(comment)
                    calDif=0
                    for i in range(len(res)):
                        calDif+=abs(res[i]-self.oriRes[i])
                    calDif/=len(res)
                    resGap+= calDif
                resGap = resGap/10
                if(resGap)<abs(oneGap):
                    break
                start=froSen
            # Grow the span forward under the same stopping rule.
            for backSen in range(index+1,back):
                resGap=0
                for i in range(10):
                    comment=self.ShufflePatter(index,index)
                    res=self.PreRNN.Predict(comment)
                    calDif=0
                    for i in range(len(res)):
                        calDif+=abs(res[i]-self.oriRes[i])
                    calDif/=len(res)
                    resGap+= calDif
                resGap = resGap/10
                if(resGap)<abs(oneGap):
                    break
                end=backSen
                theGap=resGap
            # Record the grown span and its drift.
            theList=[]
            for i in range(start,end+1):
                theList.append(i)
            orderLine.append(theList)
            lineDiffer.append(theGap)
        print(orderLine)
        return orderLine,lineDiffer
    def GetTokenColorZip(self,tokenDifferCount,tokenLocInSub):
        """Group per-word importances by clause and map each group to colors.

        Returns (tokenColorZip, differCountZip): per-clause lists of colors
        and of the word importances themselves.
        """
        tokenColorZip=[]
        differCountZip=[]
        for i in range(self.sentenSize):
            # Collect the importances of the words belonging to clause i.
            differCount=[]
            for j in range(len(tokenLocInSub)):
                if tokenLocInSub[j][0]==i:
                    differCount.append(tokenDifferCount[j])
            tokenColor=[]
            maxProb=np.max(differCount)
            minProb=np.min(differCount)
            # NOTE(review): this inner loop reuses `i`, shadowing the clause
            # index; the outer `for` rebinds it each pass so iteration is
            # unaffected, but it is fragile.
            for i in range(len(differCount)):
                if(differCount[i]>0):
                    tokenColor.append(self.CalColor(differCount[i]/maxProb,'blue'))
                elif (differCount[i]<0):
                    tokenColor.append(self.CalColor(differCount[i]/minProb,'red'))
                else:
                    tokenColor.append('#b2b9b4')
            tokenColorZip.append(tokenColor)
            differCountZip.append(differCount)
        return tokenColorZip,differCountZip
    def DealDataZip(self,DataZip):
        """Process a batch of raw reviews into (sentence lists, colors, diffs).

        NOTE(review): this method looks broken/dead — InputToSenList is
        called without its required `model` argument (TypeError), and
        GetImportanceByColor returns three values but only two are unpacked
        (ValueError). Confirm whether any caller still uses it.
        """
        oriSenListZip=[]
        colorCountZip=[]
        differCountZip=[]
        for data in DataZip:
            oriSenList=InputToSenList(data)
            oriSenListZip.append(oriSenList)
            colorCount,differCount=self.GetImportanceByColor()
            colorCountZip.append(colorCount)
            differCount=differCount.tolist()
            differCountZip.append(differCount)
        return oriSenListZip,colorCountZip,differCountZip
    def ListToVecCommentBySomeSub(self,tempSenList,loc):
        """Encode up to batchSize consecutive sub-sentences, starting at
        `loc`, one per batch row, as dictionary word indices."""
        global wordsList
        comment=np.zeros([batchSize,maxSeqLength])
        listSize=len(tempSenList)
        for i in range(batchSize):
            # Stop once the sub-sentence list is exhausted.
            if(loc+i>=listSize):
                break
            counter=0
            for word in tempSenList[loc+i].split():
                try:
                    comment[i][counter]=wordsList.index(word)
                except Exception:
                    # Out-of-vocabulary sentinel index.
                    comment[i][counter]=399999
                counter+=1
                if counter==250:
                    break
        return comment
    def GetDeatail(self):
        """Return the model's per-sub-sentence raw outputs: each clause is
        encoded into its own batch row and scored via PreRNN.GetRes."""
        subSenRes=[]
        loc=0
        while True:
            comment=self.ListToVecCommentBySomeSub(self.senList,loc)
            res=self.PreRNN.GetRes(comment)
            # One result row per sub-sentence; advance loc per row consumed.
            for i in range(batchSize):
                subSenRes.append(res[i].tolist())
                loc+=1
                if(loc==self.sentenSize):
                    break
            if(loc==self.sentenSize):
                break
        return subSenRes
if __name__ == "__main__":
    # Demo: a strongly negative movie review used as sample input.
    myinput="This is the worst movie ever made. Ever. It beats everything. I have never seen worse. Retire the trophy and give it to these people.....there's just no comparison.<br /><br />Even three days after watching this (for some reason I still don't know why) I cannot believe how insanely horrific this movie is/was. Its so bad. So far from anything that could be considered a movie, a story or anything that should have ever been created and brought into our existence.<br /><br />This made me question whether or not humans are truly put on this earth to do good. It made me feel disgusted with ourselves and our progress as a species in this universe. This type of movie sincerely hurts us as a society."
    # Bug fix: the original called InputToSenListToReplace, a name that is
    # not defined anywhere in this file (NameError). Use the existing
    # clause splitter instead.
    oriSenList=InputToSenList(myinput,'clause')
    print(oriSenList)