190 lines
3.9 KiB
Python
190 lines
3.9 KiB
Python
from collections import Counter
|
||
import pandas as pd
|
||
import numpy as np
|
||
import tensorflow as tf
|
||
import keras
|
||
from sklearn.preprocessing import MinMaxScaler
|
||
import re
|
||
import copy
|
||
from random import shuffle
|
||
import math
|
||
|
||
|
||
|
||
|
||
|
||
|
||
def InputToSenList(senten,model):
|
||
mark=' mark! '
|
||
#使用正则表达式确定是否要切分
|
||
stripSpecialChars=re.compile("[^A-Za-z0-9 ]+")
|
||
#把大写字母改成小写字母
|
||
senten=senten.lower().replace('<br />','')
|
||
#print(senten)
|
||
#把所有的标点符号更换为mark
|
||
subSenList=[]
|
||
|
||
if model=='clause':
|
||
myinput=re.sub(stripSpecialChars,mark,senten)
|
||
#wordVec保存的是token,即单词
|
||
wordVec=myinput.split()
|
||
|
||
#markLoc保存mark!的位置,这就是标点符号的位置,作为切分子句的依据
|
||
markLoc=[]
|
||
markLoc.append(0)
|
||
|
||
shiftNum=0
|
||
for i in range(len(wordVec)):
|
||
if wordVec[i-shiftNum]=='mark!':
|
||
markLoc.append(i-shiftNum)
|
||
wordVec.pop(i-shiftNum)
|
||
shiftNum+=1
|
||
|
||
#按照标点符号划分子句,把每个子句放入subSenList
|
||
for i in range(len(markLoc)-1):
|
||
subSenList.append(" ".join(wordVec[markLoc[i]:markLoc[i+1]]))
|
||
else:
|
||
myinput=re.sub(stripSpecialChars,' ',senten)
|
||
#wordVec保存的是token,即单词
|
||
subSenList=myinput.split()
|
||
|
||
return subSenList
|
||
|
||
|
||
|
||
|
||
from random import randint
|
||
|
||
|
||
|
||
def ChanInpSubByOne(senList,subNumCount):
|
||
sentenSize=len(senList)
|
||
subSenList=[]
|
||
|
||
smallSenNum=subNumCount
|
||
bigSenNum=randint(0,sentenSize-1)
|
||
|
||
if(smallSenNum>bigSenNum):
|
||
temp=smallSenNum
|
||
smallSenNum=bigSenNum
|
||
bigSenNum=temp
|
||
|
||
for j in range(sentenSize):
|
||
if j==bigSenNum:
|
||
|
||
subSenList.append(senList[smallSenNum])
|
||
elif j>=smallSenNum and j<bigSenNum:
|
||
subSenList.append(senList[j+1])
|
||
else:
|
||
subSenList.append(senList[j])
|
||
|
||
|
||
|
||
fullSent=' '.join(subSenList)
|
||
|
||
|
||
|
||
|
||
return fullSent
|
||
|
||
|
||
|
||
|
||
|
||
|
||
def Rrtreat(trainData,PreRNN):
|
||
print('preRNNOk')
|
||
# PreRNN=aRNNModel()
|
||
#定义好的RNN
|
||
|
||
#原序列的预测值
|
||
|
||
|
||
|
||
|
||
# trainData=PreRNN.trainData
|
||
|
||
|
||
|
||
|
||
TrainDataDiffer=np.zeros([len(trainData),10])
|
||
# startNum=10900
|
||
# TrainDataDetail=np.load('./TrainDateForXplain_'+str(startNum)+'.npy')
|
||
|
||
for i in range(len(trainData)):
|
||
|
||
|
||
theDifferCount=[]
|
||
|
||
comment=trainData[i]
|
||
# comment=comment.tolist()
|
||
# print('comment',comment)
|
||
# comment=' '.join(comment)
|
||
|
||
|
||
# comment=' '.join(commentList)
|
||
|
||
senList=InputToSenList(comment,'clause')
|
||
|
||
sentenSize=len(senList)
|
||
|
||
# if(sentenSize<=2):
|
||
# continue
|
||
#记录每次重新排序以后和原来结果的差值
|
||
differCount=np.zeros(sentenSize,dtype=float)
|
||
|
||
|
||
|
||
iterations=15
|
||
|
||
# print('senlist',senList)
|
||
oriRes=PreRNN.Predict(' '.join(senList))
|
||
|
||
|
||
counter=0
|
||
for l in range(sentenSize):
|
||
|
||
counter+=1
|
||
for k in range(iterations):
|
||
thecomment=ChanInpSubByOne(senList,l)
|
||
|
||
|
||
res=PreRNN.Predict(' '.join(thecomment))
|
||
|
||
calGap=0
|
||
for m in range(len(res)):
|
||
calGap+=abs(res[m]-oriRes[m])
|
||
|
||
calGap/=len(res)
|
||
|
||
differCount[l]+=calGap
|
||
|
||
|
||
if(counter==10):
|
||
print('!!error')
|
||
break
|
||
|
||
|
||
|
||
theDifferCount=differCount/iterations
|
||
counter=0
|
||
for num in theDifferCount:
|
||
TrainDataDiffer[i][counter]=num
|
||
counter+=1
|
||
if(counter>=10):
|
||
break
|
||
|
||
|
||
if i%1000 == 0:
|
||
print('i',i)
|
||
thePath='./TrainDataDifferRandomByOne_'+str(i)+'.npy'
|
||
np.save(thePath,TrainDataDiffer)
|
||
|
||
|
||
np.save('./TrainDataDiffer',TrainDataDiffer)
|
||
|
||
|
||
|
||
if __name__ == "__main__":
|
||
Rrtreat()
|