# OrderVis/GetPartnerCovid.py

import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
# from Predict import GetSenList,PredictMulti
import os
import re
import copy
from random import shuffle
class TrainData():
    def __init__(self):
        data = pd.read_csv("./CovidInfo/Aamerica.csv")
        date = data["date"].values
        cases = data["comfirmed"].values  # column name follows the CSV header spelling
        temp1 = []
        for i in range(len(date)):
            date[i] = i
            if(i == 0):
                temp1.append(cases[0])
            if(i > 0):
                temp1.append(cases[i] - cases[i-1])
        cases = temp1[1:]
        # Extract the training set
        date_train = date[0:len(cases)-25]
        cases_train = cases[0:len(cases)-25]
        # Pack up the training data
        cases_train = list(zip(date_train, cases_train))
        train1 = pd.DataFrame(cases_train, columns=['date', 'comfirmed'])
        train1['date'] = train1['date'].astype(float)
        train1['comfirmed'] = train1['comfirmed'].astype(float)
        x_train1 = train1.iloc[:, 1:2].values
        self.scaler1 = MinMaxScaler(feature_range=(0, 1))
        # Fit the scaler for later use; the raw (unscaled) values are kept as training data
        self.scaler1.fit_transform(x_train1)
        self.trainData = x_train1
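# A minimal usage sketch (assuming ./CovidInfo/Aamerica.csv is present with 'date'
# and 'comfirmed' columns): TrainData keeps the raw daily new-case counts in
# trainData and a fitted MinMaxScaler in scaler1 for later normalization.
#
#   td = TrainData()
#   print(td.trainData[:3])                       # first three daily new-case values
#   scaled = td.scaler1.transform(td.trainData)   # same values rescaled into [0, 1]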
def CastToTrainData(emotion, theDifferCount):
    # Note: negNP/negDiffer and posNP/posDiffer are expected to be module-level
    # globals (loaded elsewhere in this project) holding the review vectors and
    # their per-sentence importance differences.
    oriSenListZip = []
    differCountZip = []
    colorCountZip = []
    path = None
    myNP = None
    myDiffer = None
    if(emotion == 'neg'):
        path = "./VectorList/negativeReviews/"
        myNP = negNP
        myDiffer = negDiffer
    else:
        path = "./VectorList/positiveReviews/"
        myNP = posNP
        myDiffer = posDiffer
    maxDiffer = 0
    CriticakDifferCount = []
    for num in theDifferCount:
        if(abs(num) > maxDiffer):
            maxDiffer = abs(num)
    for num in theDifferCount:
        if(abs(num) > (maxDiffer*0.5)):
            CriticakDifferCount.append(num)
    counter = 0
    for i in range(len(myNP)):
        trainDataNp = myNP[i]
        # Take one sample from the training set
        if i % 1000 == 0:
            print('i:', i)
        # Compute the difference part by part
        gap = 0
        compTimes = 10
        if len(CriticakDifferCount) < 10:
            compTimes = len(CriticakDifferCount)
        if trainDataNp[compTimes-1] == 0:
            continue
        for j in (range(compTimes)):
            # Adding 1 here effectively rejects the sample: gap can no longer stay small
            if trainDataNp[j] == 0:
                gap += 1
            else:
                gap += abs(CriticakDifferCount[j] - trainDataNp[j])
        # print(gap)
        # If the gap is small enough, open the review for inspection
        if(gap < 0.002):
            print(i)
            name = ''
            if(emotion == 'neg'):
                for l in range(5):
                    name = path + str(i) + '_' + str(l) + '.txt'
                    if(os.path.exists(name)):
                        break
            else:
                for l in range(7, 11):
                    name = path + str(i+1000) + '_' + str(l) + '.txt'
                    if(os.path.exists(name)):
                        break
            print(name)
            with open(name, 'r', encoding='utf-8') as f:
                content = ''
                for line in f.readlines():
                    content += line
                    content += '. '
            inputList = InputToSenList(content)
            if(len(inputList) >= 50):
                continue
            oriSenListZip.append(inputList)
            theDiffer = myDiffer[i].tolist()
            theDiffer = theDiffer[:len(inputList)]
            print('last:', theDiffer[len(inputList)-1])
            differCountZip.append(theDiffer)
            colorCountZip.append(DifferToColor(theDiffer))
            counter += 1
            if(counter == 5):
                print('end:', i)
                break
    return oriSenListZip, differCountZip, colorCountZip
def CastToTrainWithOrderCovid(orderLine1, orderLine2, emotion, theDifferCount):
    # oriSenListZip=[]
    # differCountZip=[]
    MyTrainData = TrainData()
    trainDataZip = []
    myDiffer = np.load('./CovidInfo/TrainDataDifferCovid.npy')
    for line in orderLine1:
        oriSenListsmallZip = []
        differCountsmallZip = []
        counter = 0
        for i in range(len(myDiffer)):
            trainDataNp = myDiffer[i]
            # Take one sample from the training set
            # and compute the difference part by part
            lineLenth = len(line)
            # print(line)
            # print('linelenth', lineLenth)
            for j in (range(10-lineLenth)):
                if trainDataNp[j] == 0:
                    break
                gap = 0
                for k in range(lineLenth):
                    # print(trainDataNp)
                    # print('k', k)
                    # print(theDifferCount[int(line[k])])
                    gap += abs(trainDataNp[j+k] - theDifferCount[int(line[k])])
                # If the gap is small enough, record this segment
                if(gap < 3000):
                    content = MyTrainData.trainData[i:i+15]
                    inputList = content.tolist()
                    subName = ''
                    for num in range(i+j, i+j+lineLenth+1):
                        subName = subName + ' ' + str(num)
                    oriSenListsmallZip.append({'name': subName, 'content': inputList[j:j+lineLenth]})
                    theDiffer = trainDataNp.tolist()
                    theDiffer = theDiffer[j:j+lineLenth]
                    print('last:', theDiffer)
                    differCountsmallZip.append(theDiffer)
                    # colorCountZip.append(DifferToColor(theDiffer))
                    counter += 1
                    break
            # print(counter)
            if(counter >= 5):
                print('end:', i)
                break
        trainDataZip.append({'emotion': 'pos', 'orderLine': line, 'oriSenListsmallZip': oriSenListsmallZip, 'differCountsmallZip': differCountsmallZip})
        # oriSenListZip.append(oriSenListsmallZip)
        # differCountZip.append(differCountsmallZip)
    for line in orderLine2:
        oriSenListsmallZip = []
        differCountsmallZip = []
        counter = 0
        for i in range(len(myDiffer)):
            trainDataNp = myDiffer[i]
            # Take one sample from the training set
            # and compute the difference part by part
            lineLenth = len(line)
            # print(line)
            # print('linelenth', lineLenth)
            for j in (range(10-lineLenth)):
                if trainDataNp[j] == 0:
                    break
                gap = 0
                for k in range(lineLenth):
                    # print(trainDataNp)
                    # print('k', k)
                    # print(theDifferCount[int(line[k])])
                    gap += abs(trainDataNp[j+k] - theDifferCount[int(line[k])])
                # print(gap)
                # If the gap is small enough, record this segment
                if(gap < 3000):
                    content = MyTrainData.trainData[i:i+15]
                    inputList = content.tolist()
                    subName = ''
                    for num in range(i+j, i+j+lineLenth+1):
                        subName = subName + ' ' + str(num)
                    oriSenListsmallZip.append({'name': subName, 'content': inputList[j:j+lineLenth]})
                    theDiffer = trainDataNp.tolist()
                    theDiffer = theDiffer[j:j+lineLenth]
                    print('last:', theDiffer)
                    differCountsmallZip.append(theDiffer)
                    # colorCountZip.append(DifferToColor(theDiffer))
                    counter += 1
                    break
            # print(counter)
            if(counter >= 5):
                print('end:', i)
                break
        trainDataZip.append({'emotion': 'neg', 'orderLine': line, 'oriSenListsmallZip': oriSenListsmallZip, 'differCountsmallZip': differCountsmallZip})
        # oriSenListZip.append({oriSenListsmallZip})
        # differCountZip.append(differCountsmallZip)
    return trainDataZip
    # return oriSenListZip, differCountZip  # ,colorCountZip
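# Shape of the returned list, for reference (field names taken from the code above;
# the concrete values depend on TrainDataDifferCovid.npy and are illustrative only):
#
#   [{'emotion': 'pos',                       # 'neg' for entries built from orderLine2
#     'orderLine': [...],                     # the query order line
#     'oriSenListsmallZip': [{'name': ' <matched indices>', 'content': [...]}, ...],
#     'differCountsmallZip': [[...], ...]},
#    ...]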
def GetOrderLineDetailCovid(orderLine1, orderLine2, oriSenList, differCount, localDifferCount, RNNModel):
    orderLineInfoZip = []
    meanImpZip = []
    allImpZip = []
    allLocImpZip = []
    meanLocImpZip = []
    res = []
    orderLines = orderLine1 + orderLine2
    for line in orderLines:
        count = 0
        impor = 0
        locImpor = 0
        ablationInd = []
        allImp = []
        allLocImp = []
        for ind in line:
            count += 1
            impor += differCount[int(ind)]
            locImpor += localDifferCount[int(ind)]
            allImp.append(differCount[int(ind)])
            allLocImp.append((-1)*localDifferCount[int(ind)])
            ablationInd.append(int(ind))
        allImpZip.append(allImp)
        allLocImpZip.append(allLocImp)
        meanImpZip.append(format(impor/count, '.4f'))
        meanLocImpZip.append(format((-1)*locImpor/count, '.4f'))
        senten = []
        for i in range(len(oriSenList)):
            if i not in ablationInd:
                senten.extend(oriSenList[i].split())
        # Pad the word list up to 15 tokens with '0'
        for i in range(15-len(senten)):
            senten.append('0')
        # res=MyPredict.GetRes(senten)!!
        res.append(RNNModel.Predict(' '.join(senten)))
    tableReorderValue = GetTableReorderValue(orderLines, oriSenList, RNNModel)
    print('tableReorderValue', tableReorderValue)
    for i in range(len(orderLines)):
        orderLineInfoZip.append({'id': i, 'order': orderLines[i], 'allImp': allImpZip[i], 'allLocImp': allLocImpZip[i], 'importance': meanImpZip[i], 'locImportance': meanLocImpZip[i], 'value': res[i], 'resValue': tableReorderValue[i]})
    return orderLineInfoZip
def GetTableReorderValue(orderLines, oriSenList, RNNModel):
    # Record, for each order line, the difference between the re-ordered result and the original one
    differCount = np.zeros(len(orderLines), dtype=float)
    senNum = len(oriSenList)
    batchSize = 24
    subNumCount1 = 0  # track which order line is being processed on each pass
    subNumCount2 = 0
    iterations = 10
    orderNums = len(orderLines)
    print('orderLines', orderLines)
    print('orderNums', orderNums)
    calTimes = 1
    for i in range(iterations+15):
        theWordCom = []
        for j in range(batchSize):
            theOrder = orderLines[subNumCount1]
            shufOrder = copy.deepcopy(theOrder)
            shuffle(shufOrder)
            # print('theOrder', theOrder)
            # print('shufOrder', shufOrder)
            theInd = []
            wordCom = []
            for k in range(senNum):
                theInd.append(k)
            # print('theInd', theInd)
            for k in range(len(theOrder)):
                # print('theOrder', theOrder[k])
                # print('shufOrder', shufOrder[k])
                theInd[int(theOrder[k])] = shufOrder[k]
            # print('theInd', theInd)
            for ind in theInd:
                wordCom.extend(oriSenList[int(ind)].split())
                # wordCom = wordCom + ' ' + oriSenList[int(ind)]
            # Pad the word list up to 15 tokens with '0'
            # (a separate loop variable is used so the outer i is not overwritten)
            for pad in range(15-len(wordCom)):
                wordCom.append('0')
            theWordCom.append(' '.join(wordCom))
            subNumCount1 += 1
            if(subNumCount1 >= orderNums):
                subNumCount1 = 0
                calTimes += 1
                if(i > iterations):
                    break
        res = []
        # res=sess.run(PreRNN.predicr,{PreRNN.inputData: comment})
        for com in theWordCom:
            res.append(RNNModel.Predict(com))
        # print(res)
        # Accumulate the differences
        for j in range(batchSize):
            differCount[subNumCount2] += res[j]
            subNumCount2 += 1
            if(subNumCount2 >= orderNums):
                subNumCount2 = 0
                calTimes += 1
                if(i > iterations):
                    break
        if(i > iterations and subNumCount2 == 0):
            break
    differCount = differCount/calTimes
    # Return the differences
    return differCount
def DifferToColor(differCount):
    maxProb = np.max(differCount)
    minProb = np.min(differCount)
    colorCount = []
    for i in range(len(differCount)):
        if(differCount[i] > 0):
            # Positive differences are mapped onto the blue channel (#8888xx)
            decNum = int(differCount[i]*127/maxProb)+127
            color = '#8888'+hex(decNum)[2:].zfill(2)
            colorCount.append(color)
        elif (differCount[i] < 0):
            # Negative differences are mapped onto the red channel (#xx8888)
            decNum = int(differCount[i]*127/minProb)+127
            color = '#'+hex(decNum)[2:].zfill(2)+'8888'
            colorCount.append(color)
        else:
            colorCount.append('#888888')
    return colorCount
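# Example (values chosen for illustration): positive differences shade towards blue,
# negative ones towards red, and zero stays grey.
#
#   DifferToColor([0.5, -0.25, 0])   # -> ['#8888fe', '#fe8888', '#888888']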
def InputToSenList(senten, mark=' mark! '):
    stripSpecialChars = re.compile("[^A-Za-z0-9 ]+")
    senten = senten.lower().replace('<br />', '')
    # print(senten)
    # Replace every punctuation run with the marker token, then split into words
    myinput = re.sub(stripSpecialChars, mark, senten)
    wordVec = myinput.split()
    markLoc = []
    markLoc.append(0)
    subSenList = []
    shiftNum = 0
    # Remove the marker tokens while recording where each sub-sentence ends
    for i in range(len(wordVec)):
        if wordVec[i-shiftNum] == 'mark!':
            markLoc.append(i-shiftNum)
            wordVec.pop(i-shiftNum)
            shiftNum += 1
    for i in range(len(markLoc)-1):
        subSenList.append(" ".join(wordVec[markLoc[i]:markLoc[i+1]]))
    return subSenList
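# Example: runs of punctuation delimit the sub-sentences.
#
#   InputToSenList("This is bad. Really bad!")   # -> ['this is bad', 'really bad']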
if __name__ == "__main__":
    myinput = "This is the worst movie ever made. Ever. It beats everything. I have never seen worse. Retire the trophy and give it to these people.....there's just no comparison.<br /><br />Even three days after watching this (for some reason I still don't know why) I cannot believe how insanely horrific this movie is/was. Its so bad. So far from anything that could be considered a movie, a story or anything that should have ever been created and brought into our existence.<br /><br />This made me question whether or not humans are truly put on this earth to do good. It made me feel disgusted with ourselves and our progress as a species in this universe. This type of movie sincerely hurts us as a society."
    # myinput=input("Input")
    # Note: CastToTrainData expects a list of importance differences as its second
    # argument, so passing the raw review text here looks like leftover test code.
    CastToTrainData('neg', myinput)