# OrderVis/ReorderByGroupCovid.py

import pandas as pd
import numpy as np
import tensorflow as tf
import keras
from sklearn.preprocessing import MinMaxScaler
import re
import copy
from random import shuffle
import math


#Helper functions

#Split a whole review into clauses (or words) and return them as a list


#The input text (parameter `senten`) must be a str
def InputToSenList(senten,model='clause'):
    """Normalise a text and split it into clauses or into single words.

    The text is lower-cased and every literal ``<br />`` is removed. Each run
    of characters other than ASCII letters, digits and spaces is treated as
    punctuation.

    Args:
        senten: The raw text; must be a ``str``.
        model: ``'clause'`` (the default) splits at punctuation and returns
            one string per clause, with its words joined by single spaces.
            Any other value returns the individual words.

    Returns:
        A list of strings. In clause mode the words after the last
        punctuation mark form a final clause, and clauses that contain no
        word (leading or adjacent punctuation) are omitted, so the clauses
        together hold exactly the words returned in word mode.
    """
    #Anything that is not a letter, a digit or a space counts as punctuation
    stripSpecialChars=re.compile("[^A-Za-z0-9 ]+")
    senten=senten.lower().replace('<br />','')

    if model!='clause':
        return stripSpecialChars.sub(' ',senten).split()

    #The marker contains '!', which the pattern above strips from the text
    #itself, so a real word can never be mistaken for the marker
    mark='mark!'
    tokens=stripSpecialChars.sub(' '+mark+' ',senten).split()

    subSenList=[]
    clause=[]
    for token in tokens:
        if token!=mark:
            clause.append(token)
        elif clause:
            subSenList.append(' '.join(clause))
            clause=[]

    #Keep the trailing words when the text does not end with punctuation
    if clause:
        subSenList.append(' '.join(clause))

    return subSenList


# def GetSenList(myinput,model='clause'):

#     senList=[]
#     #将string类型切割开
#     tempList=myinput.split()

#     if model=='word':
#         senList=tempList
#     else:
#         senten = ''
#         count = 0
#         for number in tempList:
#             senten += str(number)+' '
#             count += 1
#             if(count>=3):
#                 senList.append(senten)
#                 senten=''
#                 count=0

#         if senten:
#             senList.append(senten)


#     return senList


#Helper import used by the reordering methods below
from random import randint


class ReorderByGroupCovid():
    def __init__(self,myinput,RNNModel):
        """Tokenise the input text and record the model's baseline output.

        Args:
            myinput: The text to analyse, as a ``str``.
            RNNModel: The model object. It must provide a ``Predict`` method
                that accepts a text string; the other methods of this class
                call it through ``self.PreRNN``.
        """
        clauses=InputToSenList(myinput,'clause')

        #Word-level split of the same text
        self.wordList=InputToSenList(myinput,'word')
        #Clause-level split of the text and the number of clauses
        self.senList=clauses
        self.sentenSize=len(clauses)

        print('senlist',clauses)

        #The user-supplied model
        self.PreRNN=RNNModel
        #Prediction for the clauses in their original order
        self.oriRes=RNNModel.Predict(' '.join(clauses))
        #Evaluation budget used by the importance loops
        self.iterations=100


    #Measure the impact of reordering from the change in the model output
    def ReorderByOutGlobal(self):

        #记录每次重新排序以后和原来结果的差值
        differCount=np.zeros(self.sentenSize,dtype=float)


        subNumCount=0 #每次处理都记录这是第几个字句
        iterations=self.iterations


        calTimes=0
        counter=0
        while True:

            comment=self.ChanInpSubByOne(subNumCount)

            #res=sess.run(PreRNN.predicr,{PreRNN.inputData: comment})
            res=self.PreRNN.Predict(comment)
            #print(res)
            

            differCount[subNumCount]+=(res-self.oriRes)
            subNumCount+=1
            counter+=1

            if subNumCount>=self.sentenSize:
                subNumCount=0
                calTimes+=1
                if counter>iterations:
                    break
 
        
        differCount=differCount/calTimes

            
        return differCount


    def ChanInpSubByOne(self,subNumCount):    
        #记录所有的评论，这些评论是原来的文字


        subSenList=[]

        smallSenNum=subNumCount
        bigSenNum=randint(0,self.sentenSize-1)

        if(smallSenNum>bigSenNum):
            temp=smallSenNum
            smallSenNum=bigSenNum
            bigSenNum=temp

        for j in range(self.sentenSize):
            if j==bigSenNum:

                subSenList.append(self.senList[smallSenNum])
            elif j>=smallSenNum and j<bigSenNum:
                subSenList.append(self.senList[j+1])
            else:
                subSenList.append(self.senList[j])


        fullSent=' '.join(subSenList)


        return fullSent  #,sentenList


    def ChanTokenInDefinitedPart(self,wodNumCount,tokenLocInSub):
        #记录所有的评论，这些评论是原来的文字

             
        subSenList=copy.deepcopy(self.senList)


        senLoc=tokenLocInSub[wodNumCount][0]
        wordLoc=tokenLocInSub[wodNumCount][1]


        subSenInLoc=subSenList[senLoc]


        senInLocList=subSenInLoc.split()


        ranWordLoc=randint(0,len(senInLocList)-1)

    
        temp=senInLocList[wordLoc]
        senInLocList[wordLoc]=senInLocList[ranWordLoc]
        senInLocList[ranWordLoc]=temp


        subSenList[senLoc]=' '.join(senInLocList)


        fullSent=' '.join(subSenList)
        

        return fullSent  #,sentenList


    def GetTokenImportance(self):
        wordSize=len(self.wordList)

        senDifferCount=np.zeros(self.sentenSize,dtype=float)

        tokenDifferCount=np.zeros(wordSize,dtype=float)

        tokenLocInSub=np.zeros([wordSize,2],dtype=int)

        wordLoc=0


        for i in range (len(self.senList)):
            for j in range (len(self.senList[i].split())):
                tokenLocInSub[wordLoc][0]=i
                tokenLocInSub[wordLoc][1]=j
                wordLoc+=1
                

        iterations=self.iterations*4

        wodNumCount=0
        calTimes=0
        counter=0

        while True:
            comment=self.ChanTokenInDefinitedPart(wodNumCount,tokenLocInSub)
            

            res=self.PreRNN.Predict(comment)


            tokenDifferCount[wodNumCount]+=(res-self.oriRes)   
            wodNumCount+=1
            counter+=1

            if wodNumCount>=wordSize:
                wodNumCount=0
                calTimes+=1
                if counter>iterations:
                    break

                    
        tokenDifferCount=tokenDifferCount/calTimes

        wordLoc=0
        for i in range (len(self.senList)):
            for j in range (len(self.senList[i].split())):
                senDifferCount[i]+=tokenDifferCount[wordLoc]
                wordLoc+=1
            
            senDifferCount[i]/=len(self.senList[i].split())

        return senDifferCount,tokenDifferCount,tokenLocInSub


    def CalColor(self,percent,color):
        """Blend the neutral gray towards blue or red.

        Args:
            percent: Blend factor; 0 gives the gray ``c4d7d6`` and 1 gives
                the full target colour.
            color: ``'blue'`` (target ``baccd9``) or ``'red'`` (target
                ``eeb8c3``).

        Returns:
            A ``'#rrggbb'`` string. For any other ``color`` value only
            ``'#'`` is returned.
        """
        gray='c4d7d6'
        print('colorPercent',percent)

        if color=='blue':
            target='baccd9'
        elif color=='red':
            target='eeb8c3'
        else:
            return "#"

        channels=[]
        #Interpolate each of the three two-digit hex channels separately
        for offset in (0,2,4):
            base=int(gray[offset:offset+2],16)
            goal=int(target[offset:offset+2],16)
            value=int((goal-base)*percent)+base
            channels.append(hex(value)[2:].zfill(2))

        return "#"+"".join(channels)

    def GetImportanceByColor(self):

        #计算global 的重要性
        differCount=self.ReorderByOutGlobal()
        differCount=differCount.tolist()


        maxProb=np.max(differCount)
        minProb=np.min(differCount)   
        maxMmin=abs(maxProb) if abs(maxProb)>abs(minProb) else abs(minProb)
        if maxMmin==0:
            maxMmin=1 

        colorCount=[]
        
        for i in range(len(differCount)):
            
                differCount[i]=(differCount[i])/maxMmin
                if(differCount[i]>0):            
                    colorCount.append(self.CalColor(differCount[i],'blue'))
                elif (differCount[i]<0):

                    colorCount.append(self.CalColor(-differCount[i],'red'))
                else:
                    colorCount.append('#b2b9b4')

        self.colorCount=colorCount
        self.differCount=differCount


        globalDataZip={'differCount':differCount,'colorCount':colorCount}


        #计算local重要性
        senDifferCount,tokenDifferCount,tokenLocInSub=self.GetTokenImportance()


        senDifferCount=senDifferCount.tolist()            
        localColorCount=[]


        maxProb=np.max(tokenDifferCount)
        minProb=np.min(tokenDifferCount)   
        maxMmin=abs(maxProb) if abs(maxProb)>abs(minProb) else abs(minProb)
        if maxMmin==0:
            maxMmin=1 
        
        for i in range(len(tokenDifferCount)):

            tokenDifferCount[i]=(tokenDifferCount[i])/maxMmin


        maxProb=np.max(senDifferCount)
        minProb=np.min(senDifferCount)   

        maxMmin=abs(maxProb) if abs(maxProb)>abs(minProb) else abs(minProb)
        if maxMmin==0:
            maxMmin=1 

        
        for i in range(len(senDifferCount)):
            senDifferCount[i]=(senDifferCount[i])/maxMmin
            # senDifferCount[i]=(senDifferCount[i]-minProb)/maxMmin
            # localColorCount.append(self.CalColor(senDifferCount[i],'red'))
            if(senDifferCount[i]>0):     
                localColorCount.append(self.CalColor(senDifferCount[i],'blue'))
            elif (senDifferCount[i]<0):
                localColorCount.append(self.CalColor(-senDifferCount[i],'red'))
            else:
                localColorCount.append('#b2b9b4')


        tokenColorZip,tokenDifferCountZip = self.GetTokenColorZip(tokenDifferCount,tokenLocInSub)
        

        localDataZip={'localColorCount':localColorCount,'senDifferCount':senDifferCount,'tokenColorZip':tokenColorZip,'tokenDifferCountZip':tokenDifferCountZip}


        maxProb=np.max(senDifferCount)
        minProb=np.min(senDifferCount)  
        threshold = abs(maxProb) if abs(maxProb)>abs(minProb) else abs(minProb)
        threshold*=0.8
        localOrderLine,lineDiffer=self.GetLocalOrderLine(senDifferCount,threshold)

        print('lineDiffer',lineDiffer)

        lineDifferColor=[]
        maxProb=np.max(lineDiffer)
        minProb=np.min(lineDiffer)   
        for i in range(len(lineDiffer)):
            lineDiffer[i]=(lineDiffer[i])/maxMmin
            if(lineDiffer[i]>0):     
                lineDifferColor.append(self.CalColor(lineDiffer[i],'blue'))
            elif (lineDiffer[i]<0):
                lineDifferColor.append(self.CalColor(-lineDiffer[i],'red'))
            else:
                lineDifferColor.append('#b2b9b4')
    
        orderLineZip={'localOrderLine':localOrderLine,'lineDiffer':lineDiffer,'lineDifferColor':lineDifferColor}

        return globalDataZip,localDataZip,orderLineZip


    def ShufflePatter(self,start,end):
        #记录所有的评论，这些评论是原来的文字
        #comment保存的评论，每个字都是用字典中的数字组成的

        
        subSenList=copy.deepcopy(self.senList)


        subSenInLoc=' '.join(subSenList[start:end+1])

        senInLocList=subSenInLoc.split()
        shuffle(senInLocList)


        fullSent=' '.join(subSenList[:start])+' '+' '.join(senInLocList)+' '+' '.join(subSenList[end+1:])


        return fullSent         


    def GetLocalOrderLine(self,differCount,threshold):
        critLoc=[]
        orderLine=[]
        lineDiffer=[]
        for i in range(len(differCount)):
            if(abs(differCount[i])>threshold):
                critLoc.append(i)

        print(critLoc)
        for loc in range(len(critLoc)):

            index=critLoc[loc]
            front=-1
            if loc>0:
                front=critLoc[loc-1]
            
            back=self.sentenSize
            if loc<len(critLoc)-1:
                back=critLoc[loc+1]
        
            start=index
            end=index

            oneRes=None
            for i in range(10):
                comment=self.ShufflePatter(index,index)
                if oneRes:
                    oneRes+=self.PreRNN.Predict(comment)
                else:
                    oneRes=self.PreRNN.Predict(comment)
            oneRes = oneRes/10

            print('oneRes',oneRes)

            theRes=oneRes
            for froSen in range(index-1,front,-1):
                res=None
                for i in range(10):
                    comment=self.ShufflePatter(index,index)
                    if res:
                        res+=self.PreRNN.Predict(comment)
                    else:
                        res=self.PreRNN.Predict(comment)
                res = res/10

                if(abs(res-self.oriRes)<abs(oneRes-self.oriRes)):
                    break
                start=froSen

            for backSen in range(index+1,back):
                res=None
                for i in range(10):
                    comment=self.ShufflePatter(index,index)
                    if res:
                        res+=self.PreRNN.Predict(comment)
                    else:
                        res=self.PreRNN.Predict(comment)
                res = res/10
                if(abs(res-self.oriRes)<abs(oneRes-self.oriRes)):
                    break
                end=backSen
                theRes=res

            theList=[]
            for i in range(start,end+1):
                theList.append(i)
            orderLine.append(theList)
            lineDiffer.append(theRes-self.oriRes)

        print(orderLine)
        return orderLine,lineDiffer


    def GetTokenColorZip(self,tokenDifferCount,tokenLocInSub):
        tokenColorZip=[]
        differCountZip=[]


        for i in range(self.sentenSize):
            differCount=[]

            for j in range(len(tokenLocInSub)):
                if tokenLocInSub[j][0]==i:
                    differCount.append(tokenDifferCount[j])

            tokenColor=[]
            maxProb=np.max(differCount)
            minProb=np.min(differCount)   

            
            for i in range(len(differCount)):
                
                if(differCount[i]>0):
                    decNum=int(differCount[i]*127/maxProb)+127
                    color='#8888'+hex(decNum)[2:].zfill(2)
                    tokenColor.append(color)
                elif (differCount[i]<0):
                    decNum=int(differCount[i]*127/minProb)+127
                    color='#'+hex(decNum)[2:].zfill(2)+'8888'
                    tokenColor.append(color)
                else:
                    tokenColor.append('#888888')

            tokenColorZip.append(tokenColor)
            differCountZip.append(differCount)

        return tokenColorZip,differCountZip


    def DealDataZip(self,DataZip):
        """Analyse several texts with the model of this instance.

        Every text is analysed through its own ``ReorderByGroupCovid``
        object built with ``self.PreRNN``, so the clause list, colours and
        scores of an entry all describe the same text.

        NOTE(review): the previous body called ``InputToSenList`` without
        its ``model`` argument and unpacked the three dicts returned by
        ``GetImportanceByColor`` into two names, so it could not run. The
        per-text analysis below is the presumed intent - confirm with the
        callers.

        Args:
            DataZip: Iterable of text strings.

        Returns:
            A tuple ``(oriSenListZip, colorCountZip, differCountZip)`` with
            one entry per text: its clause list, and the ``colorCount`` and
            ``differCount`` lists of its global analysis.
        """
        oriSenListZip=[]
        colorCountZip=[]
        differCountZip=[]

        for data in DataZip:
            #A fresh analysis object splits the text into clauses and
            #records the model's baseline prediction for it
            analysis=type(self)(data,self.PreRNN)
            oriSenListZip.append(analysis.senList)

            #Only the global part of the analysis is reported here
            globalDataZip,_,_=analysis.GetImportanceByColor()
            colorCountZip.append(globalDataZip['colorCount'])
            #differCount is already a plain list
            differCountZip.append(globalDataZip['differCount'])

        return oriSenListZip,colorCountZip,differCountZip


    def GetDeatail(self):
        comment=[]

        for i in range(self.sentenSize):
            subSenList=copy.deepcopy(self.senList)

            theData=[]

            for j in range(self.sentenSize):
                if j==i:
                    theData.append(subSenList[i])
                else:
                    theData.append('0 0 0')

            fullSent=' '.join(theData)

            # sentenList.append(fullSent)   
            comment.append(fullSent.split())

        
        res=self.PreRNN.GetRes(comment)

        return res


if __name__ == "__main__":
    # Demo input: a whitespace-separated series of numbers given as one string.
    myinput="44210 50393 43088 31169 23567 27393  34057  36073  47778  41062  34351  34428  39507  39018  45137  49284  42159  38415  51972 "
    # Alternative: read the text interactively with input().

    # NOTE(review): ReorderByGroupCovid.__init__ takes (myinput, RNNModel), but this
    # call passes three positional arguments and no model object, so it looks stale.
    # Presumably an object exposing Predict() should be passed instead - confirm.
    MyReorder=ReorderByGroupCovid(myinput,'clause','judge')
    oriSenList=MyReorder.senList
    #First convert the input into a list of clauses, which can be handed to the front end

    #The text is also passed to the model, which predicts on it
    res=MyReorder.oriRes
    globalDataZip,localDataZip,orderLineZip = MyReorder.GetImportanceByColor()