Commit 3ea2506fb6 by ZhiWang, 2023-11-22 11:30:27 +08:00 (parent 645c43c9cb)
12 changed files with 1381 additions and 0 deletions

代码/VennABERS.py  Normal file (183 lines)
@@ -0,0 +1,183 @@
# Straight-forward implementation of IVAP algorithm described in:
# Large-scale probabilistic prediction with and without validity guarantees, Vovk et al.
# https://arxiv.org/pdf/1511.00213.pdf
#
# Paolo Toccaceli
#
# https://github.com/ptocca/VennABERS
#
# 2020-07-09: Fixed bug in p0 calculation
import numpy as np
# Some elementary functions to speak the same language as the paper
# (at some point we'll just replace the occurrence of the calls with the function body itself)
def push(x, stack):
    stack.append(x)
def pop(stack):
    return stack.pop()
def top(stack):
    return stack[-1]
def nextToTop(stack):
    return stack[-2]
# perhaps inefficient but clear implementation
def nonleftTurn(a, b, c):
    d1 = b - a
    d2 = c - b
    return np.cross(d1, d2) <= 0
def nonrightTurn(a, b, c):
    d1 = b - a
    d2 = c - b
    return np.cross(d1, d2) >= 0
def slope(a, b):
    ax, ay = a
    bx, by = b
    return (by - ay) / (bx - ax)
def notBelow(t, p1, p2):
    p1x, p1y = p1
    p2x, p2y = p2
    tx, ty = t
    m = (p2y - p1y) / (p2x - p1x)
    b = (p2x * p1y - p1x * p2y) / (p2x - p1x)
    return (ty >= tx * m + b)
kPrime = None
# Because we cannot have negative indices in Python (they have another meaning), I use a dictionary
def algorithm1(P):
    global kPrime
    S = []
    P[-1] = np.array((-1, -1))
    push(P[-1], S)
    push(P[0], S)
    for i in range(1, kPrime + 1):
        while len(S) > 1 and nonleftTurn(nextToTop(S), top(S), P[i]):
            pop(S)
        push(P[i], S)
    return S
def algorithm2(P, S):
    global kPrime
    Sprime = S[::-1]  # reverse the stack
    F1 = np.zeros((kPrime + 1,))
    for i in range(1, kPrime + 1):
        F1[i] = slope(top(Sprime), nextToTop(Sprime))
        P[i - 1] = P[i - 2] + P[i] - P[i - 1]
        if notBelow(P[i - 1], top(Sprime), nextToTop(Sprime)):
            continue
        pop(Sprime)
        while len(Sprime) > 1 and nonleftTurn(P[i - 1], top(Sprime), nextToTop(Sprime)):
            pop(Sprime)
        push(P[i - 1], Sprime)
    return F1
def algorithm3(P):
    global kPrime
    S = []
    push(P[kPrime + 1], S)
    push(P[kPrime], S)
    for i in range(kPrime - 1, 0 - 1, -1):  # k'-1,k'-2,...,0
        while len(S) > 1 and nonrightTurn(nextToTop(S), top(S), P[i]):
            pop(S)
        push(P[i], S)
    return S
def algorithm4(P, S):
    global kPrime
    Sprime = S[::-1]  # reverse the stack
    F0 = np.zeros((kPrime + 1,))
    for i in range(kPrime, 1 - 1, -1):  # k',k'-1,...,1
        F0[i] = slope(top(Sprime), nextToTop(Sprime))
        P[i] = P[i - 1] + P[i + 1] - P[i]
        if notBelow(P[i], top(Sprime), nextToTop(Sprime)):
            continue
        pop(Sprime)
        while len(Sprime) > 1 and nonrightTurn(P[i], top(Sprime), nextToTop(Sprime)):
            pop(Sprime)
        push(P[i], Sprime)
    return F0
def prepareData(calibrPoints):
    global kPrime
    ptsSorted = sorted(calibrPoints)
    xs = np.fromiter((p[0] for p in ptsSorted), float)
    ys = np.fromiter((p[1] for p in ptsSorted), float)
    ptsUnique, ptsIndex, ptsInverse, ptsCounts = np.unique(xs,
                                                           return_index=True,
                                                           return_counts=True,
                                                           return_inverse=True)
    a = np.zeros(ptsUnique.shape)
    np.add.at(a, ptsInverse, ys)
    # now a contains the sums of ys for each unique value of the objects
    w = ptsCounts
    yPrime = a / w
    yCsd = np.cumsum(w * yPrime)  # Might as well do just np.cumsum(a)
    xPrime = np.cumsum(w)
    kPrime = len(xPrime)
    return yPrime, yCsd, xPrime, ptsUnique
def computeF(xPrime, yCsd):
    global kPrime
    P = {0: np.array((0, 0))}
    P.update({i + 1: np.array((k, v)) for i, (k, v) in enumerate(zip(xPrime, yCsd))})
    S = algorithm1(P)
    F1 = algorithm2(P, S)
    P = {0: np.array((0, 0))}
    P.update({i + 1: np.array((k, v)) for i, (k, v) in enumerate(zip(xPrime, yCsd))})
    P[kPrime + 1] = P[kPrime] + np.array((1.0, 0.0))  # The paper says (1,1)
    S = algorithm3(P)
    F0 = algorithm4(P, S)
    return F0, F1
def getFVal(F0, F1, ptsUnique, testObjects):
    pos0 = np.searchsorted(ptsUnique, testObjects, side='left')
    pos1 = np.searchsorted(ptsUnique[:-1], testObjects, side='right') + 1
    return F0[pos0], F1[pos1]
def ScoresToMultiProbs(calibrPoints, testObjects):
    # sort the points, transform into unique objects, with weights and updated values
    yPrime, yCsd, xPrime, ptsUnique = prepareData(calibrPoints)
    # compute the F0 and F1 functions from the CSD
    F0, F1 = computeF(xPrime, yCsd)
    # compute the values for the given test objects
    p0, p1 = getFVal(F0, F1, ptsUnique, testObjects)
    return p0, p1
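For orientation, here is a minimal usage sketch of the module above (synthetic, purely illustrative data, not from this repository): calibration points are (score, label) pairs, test inputs are raw scores, and ScoresToMultiProbs returns the lower/upper Venn-ABERS probabilities p0 and p1 for each test score.

# Illustrative sketch only: synthetic calibration scores and 0/1 labels.
import numpy as np
import VennABERS

rng = np.random.default_rng(0)
scores = rng.normal(size=200)
labels = (scores + rng.normal(scale=0.5, size=200) > 0).astype(int)
calibr_points = list(zip(scores.tolist(), labels.tolist()))   # (score, label) pairs

test_scores = [-2.0, 0.0, 2.0]
p0, p1 = VennABERS.ScoresToMultiProbs(calibr_points, test_scores)
print(p0, p1)   # the two arrays bracket the calibrated probability of class 1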

代码/casestudy.py  Normal file (107 lines)
@@ -0,0 +1,107 @@
# -*- coding: utf-8 -*-
import sklearn
import pandas as pd
import numpy as np
import VennABERS
from sklearn import metrics
import matplotlib
import matplotlib.pyplot as plt
import os
# look for example rows
'''
data=pd.read_csv("./predict1.csv")
#data.columns=["label","PCA","MCD","IForest","LODA","LOF","KNN","CBLOF","HBOS","VAE","OCSVM"]
print(data.head())
for indexs in data.index:
data1=data.loc[indexs].values[0:-1]
data1=data.loc[indexs].tolist()
count0=data1[0]
count1=data1[1:].count(1)
count2=data1[1:].count(2)
count3=data1[1:].count(0)
if count1==3 and count0==1 and count3==1:
#if count3==count1 and count2>0:
print(indexs)
'''
def plot(f,t):
plt.plot(f, t, "r", marker='*', ms=1, label="a")
plt.xlabel("p1-p0")
plt.ylabel("f1")
plt.show()
global t
def accuracy_score(y, y_hat):
return sum(yi == yi_hat for yi, yi_hat in zip(y, y_hat)) / len(y)
def precision_score(y, y_hat):
true_positive = sum(yi and yi_hat for yi, yi_hat in zip(y, y_hat))
predicted_positive = sum(y_hat)
return true_positive / predicted_positive
def get_tpr(y, y_hat):
true_positive = sum(yi and yi_hat for yi, yi_hat in zip(y, y_hat))
actual_positive = sum(y)
return true_positive / actual_positive
def count_p(p0,p1):
'''
global t
global yz
if p1-p0>=yz-0.000000001:
return 2
'''
#if p1-p0>=t:
#return 2
if p1/(1-p0+p1)>(0.5-t) and p1/(1-p0+p1)<(0.50+0.5*t):
return 2
if p1/(1-p0+p1)>0.5: # this ratio can be used to tune the ROC trade-off
return 1
else:
return 0
data=pd.read_csv("./p01.csv")
xx=[]
yy=[]
for i in range(1,50):
t=0.005*i
data['venn_pre']=data.apply(lambda x:count_p(x["PCA_p0"],x["PCA_p1"]),axis=1)
print(len(data))
data1=data[data['venn_pre']!=2]
print(len(data1))
try:
accuracy=accuracy_score(np.array(data1['label_number']),np.array(data1['venn_pre']))
precision=precision_score(np.array(data1['label_number']),np.array(data1['venn_pre']))
tpr=get_tpr(np.array(data1['label_number']),np.array(data1['venn_pre']))
f1=2 * precision * tpr / (precision + tpr)
except ZeroDivisionError:
continue
else:
xx.append(t)
yy.append(f1)
'''
print("Accuracy: {}".format(accuracy))
print("Precision: {}".format(precision))
print("Recall: {}".format(tpr))
'''
print("F1: {}".format(f1))
plot(xx,yy)
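The expression p1/(1-p0+p1), used in count_p here and in the other scripts, is the standard way of merging the two Venn-ABERS bounds into a single probability (the merge recommended for log loss in Vovk et al.). A tiny helper, shown only as a sketch of that formula:

# Sketch: merge the Venn-ABERS bounds (p0, p1) into one probability, as done inline in count_p above.
def merge_p(p0, p1):
    return p1 / (1.0 - p0 + p1)

# e.g. merge_p(0.10, 0.20) == 0.20 / 1.10, roughly 0.182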

代码/ivap.py  Normal file (116 lines)
@@ -0,0 +1,116 @@
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
import VennABERS
import os
from sklearn.preprocessing import normalize
def accuracy_score(y, y_hat):
return sum(yi == yi_hat for yi, yi_hat in zip(y, y_hat)) / len(y)
def precision_score(y, y_hat):
true_positive = sum(yi and yi_hat for yi, yi_hat in zip(y, y_hat))
predicted_positive = sum(y_hat)
return true_positive / predicted_positive
def get_tpr(y, y_hat):
true_positive = sum(yi and yi_hat for yi, yi_hat in zip(y, y_hat))
actual_positive = sum(y)
return true_positive / actual_positive
def read_csv(filepath,testname,calname):
name=calname.split('校准集')[0] # model name is the prefix of e.g. "PCA校准集分数.csv"
test_data=pd.read_csv(filepath+testname)
cal_data=pd.read_csv(filepath+calname)
cal_data['new_col']= list(zip(cal_data.score,cal_data.label_number))
tac=cal_data.new_col.tolist()
tec=test_data.score.tolist()
p0,p1=VennABERS.ScoresToMultiProbs(tac,tec)
test_data[name+'_p0']=p0.tolist()
test_data[name+'_p1']=p1.tolist()
#p_data=pd.merge(test_data[name+'_p0'],test_data[name+'_p1'],left_index=True,right_index=True)
p01="/home/shaoleshi/民航/数据/NSL_KDD-master/NSL_KDD-master/多模型协同/p01.csv"
if os.path.exists(p01):
p_data=pd.read_csv(p01)
p_data[name+'_p0']=test_data[name+'_p0']
p_data[name+'_p1']=test_data[name+'_p1']
p_data.to_csv('p01.csv',index=None)
else:
p_data=pd.merge(test_data[name+'_p0'],test_data[name+'_p1'],left_index=True,right_index=True)
p_data.to_csv('p01.csv',index=None)
def count_p(p0,p1):
if p1-p0>=0.02:
return 2
if p1/(1-p0+p1)>0.5: # this ratio can be used to tune the ROC trade-off
return 1
else:
return 0
def count_re(a,b,c,d,e,f,g,h):
T=(a,b,c,d,e,f,g,h)
r0=T.count(0)
r1=T.count(1)
if r1>r0:
return 1
else:
return 0
def ivap(p01,model):
data=pd.read_csv(p01)
data[model]=data.apply(lambda x:count_p(x[model+"_p0"],x[model+"_p1"]),axis=1)
pre="/home/shaoleshi/民航/数据/kddcup.data/多模型协同/predict.csv"
if os.path.exists(pre):
p_data=pd.read_csv(pre)
p_data[model]=data[model]
p_data.to_csv('predict.csv',index=None)
else:
p_data=pd.merge(data['label_number'],data[model],left_index=True,right_index=True)
p_data.to_csv('predict.csv',index=None)
def result(predict):
data=pd.read_csv(predict)
data['venn_pre']=data.apply(lambda x:count_re(x["PCA"],x["MCD"],x["IForest"],x["LODA"],x["AutoEncoder"],x["LOF"],x["KNN"],x["OCSVM"]),axis=1)
accuracy=accuracy_score(np.array(data['label_number']),np.array(data['venn_pre']))
precision=precision_score(np.array(data['label_number']),np.array(data['venn_pre']))
tpr=get_tpr(np.array(data['label_number']),np.array(data['venn_pre']))
print("准确率为{}".format(accuracy))
print("精确率为{}".format(precision))
print("召回率为{}".format(tpr))
if __name__=="__main__":
Model=["PCA","MCD","IForest","LODA","LOF","KNN","OCSVM","CBLOF","HBOS","VAE"]
p01="/home/shaoleshi/民航/数据/NSL_KDD-master/NSL_KDD-master/多模型协同/p01.csv"
predict="/home/shaoleshi/民航/数据/NSL_KDD-master/NSL_KDD-master/多模型协同/predict.csv"
filepath='/home/shaoleshi/民航/数据/NSL_KDD-master/NSL_KDD-master/多模型协同/score/'
for i in Model:
testname=i+'测试集分数.csv'
calname=i+'校准集分数.csv'
read_csv(filepath,testname,calname)
'''
for i in Model:
ivap(p01,i)
print(i)
#result(predict)
'''
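count_re above hard-codes eight positional columns; for readability, the same majority vote can be written over an arbitrary list of model columns. This is only a sketch (the helper name and usage are illustrative, not part of the original scripts):

# Sketch only: the same majority vote over any list of model columns.
def vote(row, models):
    votes = [row[m] for m in models]
    return 1 if votes.count(1) > votes.count(0) else 0

# e.g., assuming a DataFrame `data` with one prediction column per model:
# data['venn_pre'] = data.apply(lambda r: vote(r, Model), axis=1)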

代码/noreject.py  Normal file (107 lines)
@@ -0,0 +1,107 @@
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
import VennABERS
import os
def accuracy_score(y, y_hat):
return sum(yi == yi_hat for yi, yi_hat in zip(y, y_hat)) / len(y)
def precision_score(y, y_hat):
true_positive = sum(yi and yi_hat for yi, yi_hat in zip(y, y_hat))
predicted_positive = sum(y_hat)
return true_positive / predicted_positive
def get_tpr(y, y_hat):
true_positive = sum(yi and yi_hat for yi, yi_hat in zip(y, y_hat))
actual_positive = sum(y)
return true_positive / actual_positive
def count_re(a,b,c,d,e,f,g,h,i,j,k):
T=(a,b,c,d,e,f,g,h,i,j,k)
r0=T.count(0)
r1=T.count(1)
if r1>r0:
return 1
else:
return 0
def count_p(p0,p1):
#if p1-p0>=0.02:
#return 2
if p1/(1-p0+p1)>0.18: # this ratio can be used to tune the ROC trade-off
return 1
else:
return 0
def read_csv(filepath,testname,calname):
name=calname.split('校准集')[0] # model name is the prefix of e.g. "PCA校准集分数.csv"
test_data=pd.read_csv(filepath+testname)
cal_data=pd.read_csv(filepath+calname)
cal_data['new_col']= list(zip(cal_data.score,cal_data.label_number))
tac=cal_data.new_col.tolist()
tec=test_data.score.tolist()
p0,p1=VennABERS.ScoresToMultiProbs(tac,tec)
test_data[name+'_p0']=p0.tolist()
test_data[name+'_p1']=p1.tolist()
#p_data=pd.merge(test_data[name+'_p0'],test_data[name+'_p1'],left_index=True,right_index=True)
p01="/home/shaoleshi/毕设/NSL_KDD-master/多模型协同/p01.csv"
if os.path.exists(p01):
p_data=pd.read_csv(p01)
p_data[name+'_p0']=test_data[name+'_p0']
p_data[name+'_p1']=test_data[name+'_p1']
p_data.to_csv('p01.csv',index=None)
else:
p_data=pd.merge(test_data[name+'_p0'],test_data[name+'_p1'],left_index=True,right_index=True)
p_data.to_csv('p01.csv',index=None)
def ivap(p01,model):
data=pd.read_csv(p01)
data[model]=data.apply(lambda x:count_p(x[model+"_p0"],x[model+"_p1"]),axis=1)
pre="/home/shaoleshi/毕设/NSL_KDD-master/多模型协同/predict.csv"
if os.path.exists(pre):
p_data=pd.read_csv(pre)
p_data[model]=data[model]
p_data.to_csv('predict.csv',index=None)
else:
p_data=pd.merge(data['label_number'],data[model],left_index=True,right_index=True)
p_data.to_csv('predict.csv',index=None)
def result(predict):
data=pd.read_csv(predict)
#data2=pd.read_csv("/home/shaoleshi/毕设/NSL_KDD-master/多模型协同/p01.csv")
data['venn_pre']=data.apply(lambda x:count_re(x["CBLOF"],x["HBOS"],x["VAE"],x["PCA"],x["MCD"],x["IForest"],x["LODA"],x["AutoEncoder"],x["LOF"],x["KNN"],x["OCSVM"]),axis=1)
accuracy=accuracy_score(np.array(data['label_number']),np.array(data['venn_pre']))
precision=precision_score(np.array(data['label_number']),np.array(data['venn_pre']))
tpr=get_tpr(np.array(data['label_number']),np.array(data['venn_pre']))
f1=2 * precision * tpr / (precision + tpr)
print(data.head())
print(len(data['label_number']))
print("准确率为{}".format(accuracy))
print("精确率为{}".format(precision))
print("召回率为{}".format(tpr))
print("F1值为{}".format(f1))
if __name__=="__main__":
Model=["CBLOF","HBOS","PCA","MCD","IForest","LODA","LOF","KNN","OCSVM","VAE","AutoEncoder"]
p01="/home/shaoleshi/毕设/NSL_KDD-master/多模型协同/p01.csv"
predict="/home/shaoleshi/毕设/NSL_KDD-master/多模型协同/predict.csv"
filepath='/home/shaoleshi/毕设/NSL_KDD-master/多模型协同/score/'
'''
for i in Model:
testname=i+'测试集分数.csv'
calname=i+'校准集分数.csv'
read_csv(filepath,testname,calname)
'''
for i in Model:
ivap(p01,i)
print(i)
result(predict)

代码/reject.py  Normal file (150 lines)
@@ -0,0 +1,150 @@
# -*- coding: utf-8 -*-
import sklearn
import pandas as pd
import numpy as np
import VennABERS
from sklearn import metrics
import matplotlib
import matplotlib.pyplot as plt
import os
def accuracy_score(y, y_hat):
return sum(yi == yi_hat for yi, yi_hat in zip(y, y_hat)) / len(y)
def precision_score(y, y_hat):
true_positive = sum(yi and yi_hat for yi, yi_hat in zip(y, y_hat))
predicted_positive = sum(y_hat)
return true_positive / predicted_positive
def get_tpr(y, y_hat):
true_positive = sum(yi and yi_hat for yi, yi_hat in zip(y, y_hat))
actual_positive = sum(y)
return true_positive / actual_positive
def p_yuzhi(p0,p1):
return p1-p0
def count_p(p0,p1):
#global t
global yz
if p1-p0>=yz-0.000000001:
return 2
if p1/(1-p0+p1)<0.18 and p1/(1-p0+p1)>0.06:
return 2
if p1/(1-p0+p1)>0.18: # this ratio can be used to tune the ROC trade-off
return 1
else:
return 0
def count_re(a,b,c,d,e,f,g,h,i,j,k):
T=(a,b,c,d,e,f,g,h,i,j,k)
r0=T.count(0)
r1=T.count(1)
if r0==0 and r1==0:
return 2
if r1>r0:
return 1
else:
return 0
def ivap(p01,model):
global yz
p0p1="/home/shaoleshi/毕设/NSL_KDD-master/多模型协同/阈值.csv"
data2=pd.read_csv(p0p1)
yz=data2[model+"_p"].tolist()[0]
print(yz)
data=pd.read_csv(p01)
data[model]=data.apply(lambda x:count_p(x[model+"_p0"],x[model+"_p1"]),axis=1)
pre="/home/shaoleshi/毕设/NSL_KDD-master/多模型协同/predict1.csv"
if os.path.exists(pre):
p_data=pd.read_csv(pre)
p_data[model]=data[model]
p_data.to_csv('predict1.csv',index=None)
else:
p_data=pd.merge(data['label_number'],data[model],left_index=True,right_index=True)
p_data.to_csv('predict1.csv',index=None)
# pick the p1-p0 rejection threshold for each model
def p1p0(p01,model):
i=0
data=pd.read_csv(p01)
data[model+'_p']=data.apply(lambda x:p_yuzhi(x[model+"_p0"],x[model+"_p1"]),axis=1)
#pre="/home/shaoleshi/民航/数据/kddcup.data/多模型协同/阈值.csv"
list1=data[model+'_p'].tolist()
list1=sorted(list1)
if model=="PCA":
i=0.995
if model=="MCD":
i=0.995
if model=="IForest":
i=0.995
if model=="LODA":
i=0.995
if model=="LOF":
i=0.995
if model=="KNN":
i=0.995
if model=="OCSVM":
i=0.995
if model=="CBLOF":
i=0.995
if model=="HBOS":
i=0.995
if model=="VAE":
i=0.995
if model=="AutoEncoder":
i=0.995
t=list1[int(len(list1)*i)]
'''
print(list1[int(len(list1)*0.1)])
print(list1[int(len(list1)*0.25)])
print(list1[int(len(list1)*0.5)])
print(list1[int(len(list1)*0.75)])
print(list1[int(len(list1)*0.99)])
'''
return t
# result of the multi-model vote
def result(predict):
data=pd.read_csv(predict)
data['venn_pre']=data.apply(lambda x:count_re(x["CBLOF"],x["HBOS"],x["VAE"],x["PCA"],x["MCD"],x["IForest"],x["LODA"],x["AutoEncoder"],x["LOF"],x["KNN"],x["OCSVM"]),axis=1)
print(len(data))
data=data[data['venn_pre']!=2]
print(len(data))
accuracy=accuracy_score(np.array(data['label_number']),np.array(data['venn_pre']))
precision=precision_score(np.array(data['label_number']),np.array(data['venn_pre']))
tpr=get_tpr(np.array(data['label_number']),np.array(data['venn_pre']))
f1=2 * precision * tpr / (precision + tpr)
print("ivap的结果如下所示:")
print("准确率为{}".format(accuracy))
print("精确率为{}".format(precision))
print("召回率为{}".format(tpr))
print("F1值为{}".format(f1))
#return list(fpr)[1],list(tpr1)[1]
if __name__=="__main__":
global t
Model=["CBLOF","HBOS","PCA","MCD","IForest","LODA","LOF","KNN","OCSVM","VAE","AutoEncoder"]
p01="/home/shaoleshi/毕设/NSL_KDD-master/多模型协同/p01.csv"
predict1="/home/shaoleshi/毕设/NSL_KDD-master/多模型协同/predict1.csv"
'''
dict_p={}
for i in Model:
t=p1p0(p01,i)
dict_p[i+'_p']=t
ppp=pd.DataFrame(dict_p,index=[0])
print(dict_p)
ppp.to_csv("阈值.csv",index=None)
'''
# write out each model's predictions
for i in Model:
ivap(p01,i)
#result1(predict1,i)
result(predict1)

代码/result.py  Normal file (199 lines)
@@ -0,0 +1,199 @@
# -*- coding: utf-8 -*-
import sklearn
import pandas as pd
import numpy as np
import VennABERS
from sklearn import metrics
import matplotlib
import matplotlib.pyplot as plt
def accuracy_score(y, y_hat):
return sum(yi == yi_hat for yi, yi_hat in zip(y, y_hat)) / len(y)
def precision_score(y, y_hat):
true_positive = sum(yi and yi_hat for yi, yi_hat in zip(y, y_hat))
predicted_positive = sum(y_hat)
return true_positive / predicted_positive
def get_tpr(y, y_hat):
true_positive = sum(yi and yi_hat for yi, yi_hat in zip(y, y_hat))
actual_positive = sum(y)
return true_positive / actual_positive
def plot():
data=pd.read_csv("/home/shaoleshi/毕设/NSL_KDD-master/多模型协同/多模型roc.csv")
#data2=pd.read_csv("/home/shaoleshi/民航/数据/kddcup.data/多模型协同/ivap_roc.csv")
#data2=data2.sort_values(['fpr'], ascending = True)
#print(data2.head())
x=data["AutoEncoder_fpr"].tolist()
y=data["AutoEncoder_tpr"].tolist()
x1=data["HBOS_fpr"].tolist()
y1=data["HBOS_tpr"].tolist()
x2=data["IForest_fpr"].tolist()
y2=data["IForest_tpr"].tolist()
x3=data["KNN_fpr"].tolist()
y3=data["KNN_tpr"].tolist()
x4=data["LODA_fpr"].tolist()
y4=data["LODA_tpr"].tolist()
x5=data["LOF_fpr"].tolist()
y5=data["LOF_tpr"].tolist()
x6=data["MCD_fpr"].tolist()
y6=data["MCD_tpr"].tolist()
x7=data["OCSVM_fpr"].tolist()
y7=data["OCSVM_tpr"].tolist()
x8=data["PCA_fpr"].tolist()
y8=data["PCA_tpr"].tolist()
x9=data["CBLOF_fpr"].tolist()
y9=data["CBLOF_tpr"].tolist()
x10=data["VAE_fpr"].tolist()
y10=data["VAE_tpr"].tolist()
print(metrics.auc(x1, y1))
print(metrics.auc(x2, y2))
print(metrics.auc(x3, y3))
print(metrics.auc(x4, y4))
print(metrics.auc(x5, y5))
print(metrics.auc(x6, y6))
print(metrics.auc(x7, y7))
print(metrics.auc(x8, y8))
print(metrics.auc(x9, y9))
print(metrics.auc(x10, y10))
l=plt.plot(x, y, "pink", marker='*', ms=1,label="AutoEncoder")
l1=plt.plot(x1, y1, "r", marker='*', ms=1,label="HBOS")
l2=plt.plot(x2, y2, "y", marker='*', ms=1,label="IForest")
l3=plt.plot(x3, y3, "c", marker='*', ms=1,label="KNN")
l4=plt.plot(x4, y4, "m", marker='*', ms=1,label="LODA")
l5=plt.plot(x5, y5, "g", marker='*', ms=1,label="LOF")
l6=plt.plot(x6, y6, "b", marker='*', ms=1,label="MCD")
l7=plt.plot(x7, y7, "k", marker='*', ms=1,label="OCSVM")
l8=plt.plot(x8, y8, "greenyellow", marker='*', ms=1,label='PCA')
l9=plt.plot(x9, y9, "sienna", marker='*', ms=1,label="CBLOF")
l10=plt.plot(x10, y10, "orange", marker='*', ms=1,label="VAE")
plt.legend()
plt.title('NSL_KDD ROC')
plt.xlabel("fpr")
plt.ylabel("tpr")
plt.show()
def iqr(result,i):
Percentile=np.percentile(result["score"],[0,25,50,75,100])
aa=result.score.tolist()
IQR=0
uplimit=0
IQR=Percentile[3]-Percentile[1]
uplimit=Percentile[3]+IQR*i
if np.isnan(uplimit):
aa=result.score.tolist()
aa.sort(reverse = True)
print(type(aa))
IQR=aa[int(len(aa)/4)]-aa[int(len(aa)*3/4)]
print(aa[int(len(aa)/4)])
print(aa[int(len(aa)*3/4)])
print(aa[1])
uplimit=aa[int(len(aa)/4)]+IQR*i
print(uplimit)
print(len(result[result.score>uplimit]))
return uplimit
def roc(test):
f=[]
t=[]
dd = test.sort_values(by='score',ascending=False)
print(dd.head())
for i in range(1,112):
#limit=iqr(train,0.01*i)
limit=list(dd.score)[i*100]
test['label_test']=test.score.apply(lambda x: 1 if x>limit else 0)
fpr,tpr1,thresholds=sklearn.metrics.roc_curve(test.label_number,
test.label_test,
pos_label=None,
sample_weight=None,
drop_intermediate=True)
f.append(list(fpr)[1])
t.append(list(tpr1)[1])
return f,t
def result(data,filename):
dd = data.sort_values(by='score',ascending=False)
#for i in range(6,10):
train=pd.read_csv("/home/shaoleshi/毕设/NSL_KDD-master/多模型协同/AutoEncoder训练集.csv")
limit=iqr(train,1.3)
data['label_test']=data.score.apply(lambda x: 1 if x>limit else 0)
accuracy=accuracy_score(np.array(data.label_number),np.array(data.label_test))
precision=precision_score(np.array(data.label_number),np.array(data.label_test))
tpr=get_tpr(np.array(data.label_number),np.array(data.label_test))
print("准确率为{}".format(accuracy))
print("精确率为{}".format(precision))
print("召回率为{}".format(tpr))
def read_csv(filepath,filename):
data=pd.read_csv(filepath+filename)
result(data,filename)
if __name__=='__main__':
Model=["CBLOF","HBOS","PCA","MCD","IForest","LODA","LOF","KNN","OCSVM","VAE","AutoEncoder"]
# record ROC data for every model to a CSV
dict={}
for i in Model:
data=pd.read_csv("/home/shaoleshi/毕设/NSL_KDD-master/多模型协同/score/"+i+"测试集分数.csv")
f,t=roc(data)
dict[i+"_fpr"]=f
dict[i+"_tpr"]=t
data=pd.DataFrame(dict)
data.to_csv("多模型roc.csv",index=None)
'''
#print("已完成.format{}",i)
#计算准确率召回率
filepath="/home/shaoleshi/民航/数据/kddcup.data/多模型协同/score/"
filename="AutoEncoder测试集分数.csv"
read_csv(filepath,filename)
    '''
plot()# overall ROC figure
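The roc() helper above rebuilds the curve from 111 hand-picked score cut-offs; for reference, sklearn.metrics.roc_curve can compute the full curve (and AUC) directly from the continuous scores. A sketch of that alternative, not a change to the script above:

# Sketch: full ROC curve and AUC from the raw scores in one call.
from sklearn import metrics

def roc_from_scores(test):
    fpr, tpr, thresholds = metrics.roc_curve(test.label_number, test.score)
    return fpr, tpr, metrics.auc(fpr, tpr)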

代码/run.py  Normal file (156 lines)
@@ -0,0 +1,156 @@
# -*- coding: utf-8 -*-
import sklearn
import pandas as pd
import numpy as np
from sklearn.preprocessing import normalize
'''
author:leshi
'''
from pyod.models.auto_encoder import AutoEncoder
from pyod.models.lscp import LSCP
from pyod.models.feature_bagging import FeatureBagging
from pyod.models.pca import PCA
from pyod.models.hbos import HBOS
from pyod.models.iforest import IForest
from pyod.models.cof import COF
from pyod.models.mcd import MCD
from pyod.models.lof import LOF
from pyod.models.knn import KNN
from pyod.models.cblof import CBLOF
from pyod.models.loda import LODA
from pyod.models.ocsvm import OCSVM
from pyod.models.abod import ABOD
from pyod.models.vae import VAE
from pyod.models.so_gaal import SO_GAAL
from pyod.models.mo_gaal import MO_GAAL
from pyod.models.loci import LOCI
COF_clf = COF(contamination=0.01,n_neighbors=30)
LSCP_clf = LSCP(contamination=0.01,detector_list = [LOF(), PCA()])
LOCI_clf = LOCI(contamination=0.05)
VAE_clf = VAE(contamination=0.001, epochs=50, gamma=0.8, capacity=0.2, encoder_neurons=[9, 4], decoder_neurons=[4, 9])
ABOD_clf = ABOD(contamination=0.01,n_neighbors=20,method='default')
FeatureBagging_clf = FeatureBagging(contamination=0.01,)
AutoEncoder_clf = AutoEncoder(contamination=0.001)
OCSVM_clf= OCSVM(contamination=0.001)
LODA_clf = LODA(contamination=0.001)
CBLOF_clf = CBLOF(contamination=0.001)
LOF_clf = LOF(contamination=0.001)
PCA_clf = PCA(contamination=0.001)
HBOS_clf = HBOS(contamination=0.001)
IForest_clf = IForest(contamination=0.001)
MCD_clf = MCD(contamination=0.001)
KNN_clf = KNN(contamination=0.001)
SO_GAAL_clf = SO_GAAL(contamination=0.001)
MO_GAAL_clf = MO_GAAL(contamination=0.05, stop_epochs=2) # needs parameter tuning
Path="/home/shaoleshi/毕设/NSL_KDD-master/多模型协同/"
Model_list=["CBLOF","HBOS","PCA","MCD","IForest","LODA","LOF","KNN","OCSVM","VAE","AutoEncoder"]
#Model_list=["VAE"]
def read_csv(filepath,filename):
total_data=pd.read_csv(filepath+filename,header=None)
total_data.columns=["duration","protocol_type","service","flag","src_bytes","dst_bytes","land",
"wrong_fragment","urgent","hot","num_failed_logins","logged_in","num_compromised","root_shell",
"su_attempted","num_root","num_file_creations","num_shells","num_access_files","num_outbound_cmds","is_host_login"
,"is_guest_login","count","srv_count","serror_rate","srv_serror_rate","rerror_rate","srv_rerror_rate"
,"same_srv_rate","diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count","dst_host_same_srv_rate","dst_host_diff_srv_rate"
,"dst_host_same_src_port_rate:","dst_host_srv_diff_host_rate","dst_host_serror_rate"
,"dst_host_srv_serror_rate","dst_host_rerror_rate","dst_host_srv_rerror_rate","label","biao"]
train1,cal=np.split(total_data.sample(frac=1),[int(.4*len(total_data))])
filename2="KDDTest+.csv"
test_data=pd.read_csv(filepath+filename2,header=None)
#cal=cal.sample(n=10000,random_state=2)
train=train1[train1["label"]=="normal"].sample(n=20000,random_state=1)
cal1=cal[cal["label"]=="normal"].sample(n=15000,random_state=1)
cal2=cal[cal["label"]!="normal"].sample(n=15000,random_state=10)
cal = cal1.append(cal2)
test_data.columns=total_data.columns
val,test1=np.split(test_data.sample(frac=1),[int(.5*len(test_data))])
val.to_csv(Path+"验证集.csv",index=None)
test1.to_csv(Path+"测试集.csv",index=None)
train.to_csv(Path+"训练集.csv",index=None)
cal.to_csv(Path+"校准集.csv",index=None)#训练模型得到模型
def count_data():
Path="/home/shaoleshi/毕设/NSL_KDD-master/多模型协同/"
sample_ab=pd.read_csv(Path+"测试集.csv")
sample_nor=pd.read_csv(Path+"训练集.csv")
sample_cal=pd.read_csv(Path+"校准集.csv")
sample_val=pd.read_csv(Path+"验证集.csv")
nor=0
abn=0
nor_va=len(sample_val[sample_val["label"]=="normal"]) # count normal/abnormal rows in the validation set itself
ab_va=len(sample_val[sample_val["label"]!="normal"])
print("Validation set")
print(nor_va)
print(ab_va)
nor=nor+len(sample_ab[sample_ab["label"]=="normal"])
abn=abn+len(sample_ab[sample_ab["label"]!="normal"])
nor=nor+len(sample_nor[sample_nor["label"]=="normal"])
abn=abn+len(sample_nor[sample_nor["label"]!="normal"])
nor=nor+len(sample_cal[sample_cal["label"]=="normal"])
abn=abn+len(sample_cal[sample_cal["label"]!="normal"])
print(nor)
print(abn)
# train a model and write out the calibration/test/validation score files
def model_train(model):
total_data=pd.read_csv(Path+"训练集.csv")
x_train=total_data.drop(columns=["protocol_type","service","flag","src_bytes","label","biao"],axis=1)# drop columns not used for training
x_train = pd.DataFrame(normalize(x_train.values), index=x_train.index, columns=x_train.columns)
clf=eval(model+"_clf").fit(x_train)
y_train_scores = clf.decision_scores_
total_data['score']=y_train_scores
total_data.to_csv(model+"训练集.csv",index=None)
model_cal(clf,model)
model_test(clf,model)
model_val(clf,model)
def model_cal(clf,model):
total_data=pd.read_csv(Path+"校准集.csv")
x_cal=total_data.drop(columns=["protocol_type","service","flag","src_bytes","label","biao"],axis=1)
x_cal = pd.DataFrame(normalize(x_cal.values), index=x_cal.index, columns=x_cal.columns)
y_cal_scores = clf.decision_function(x_cal)
total_data['label_number']=total_data.label.apply(lambda x: 0 if x=="normal" else 1)
total_data['score']=y_cal_scores
total_data.to_csv(Path+"score/"+model+"校准集分数.csv",index=None)#记录校准集分数
def model_test(clf,model):
total_data=pd.read_csv(Path+"测试集.csv")
x_test=total_data.drop(columns=["protocol_type","service","flag","src_bytes","label","biao"],axis=1)
x_test = pd.DataFrame(normalize(x_test.values), index=x_test.index, columns=x_test.columns)
y_test_scores = clf.decision_function(x_test)
total_data['label_number']=total_data.label.apply(lambda x: 0 if x=="normal" else 1)
total_data['score']=y_test_scores
total_data.to_csv(Path+"score/"+model+"测试集分数.csv",index=None)#记录校准集分数
def model_val(clf,model):
total_data=pd.read_csv(Path+"验证集.csv")
x_test=total_data.drop(columns=["protocol_type","service","flag","src_bytes","label","biao"],axis=1)
x_test = pd.DataFrame(normalize(x_test.values), index=x_test.index, columns=x_test.columns)
y_test_scores = clf.decision_function(x_test)
total_data['label_number']=total_data.label.apply(lambda x: 0 if x=="normal" else 1)
total_data['score']=y_test_scores
total_data.to_csv(Path+"score/"+model+"验证集分数.csv",index=None)#记录校准集分数
if __name__=="__main__":
'''
filepath='/home/shaoleshi/毕设/NSL_KDD-master/'
filename='KDDTrain+.csv'
read_csv(filepath,filename)# build the training, test, validation and calibration sets
'''
#count_data()
for i in Model_list:
model_train(i)
print("已完成.format{}",i)

代码/single.py  Normal file (45 lines)
@@ -0,0 +1,45 @@
# -*- coding: utf-8 -*-
import sklearn
import pandas as pd
import numpy as np
import VennABERS
from sklearn import metrics
import os
def accuracy_score(y, y_hat):
return sum(yi == yi_hat for yi, yi_hat in zip(y, y_hat)) / len(y)
def precision_score(y, y_hat):
true_positive = sum(yi and yi_hat for yi, yi_hat in zip(y, y_hat))
predicted_positive = sum(y_hat)
return true_positive / predicted_positive
def get_tpr(y, y_hat):
true_positive = sum(yi and yi_hat for yi, yi_hat in zip(y, y_hat))
actual_positive = sum(y)
return true_positive / actual_positive
def result(model,predict):
data=pd.read_csv(predict)
data=data[data[model]!=2]
accuracy=accuracy_score(np.array(data['label_number']),np.array(data[model]))
precision=precision_score(np.array(data['label_number']),np.array(data[model]))
tpr=get_tpr(np.array(data['label_number']),np.array(data[model]))
f1=2 * precision * tpr / (precision + tpr)
print(model+"的结果如下所示:")
print("准确率为{}".format(accuracy))
print("精确率为{}".format(precision))
print("召回率为{}".format(tpr))
print("F1值为{}".format(f1))
print("预测个数值为{}".format(len(data)))
if __name__=="__main__":
Model=["PCA","MCD","IForest","LODA","LOF","KNN","OCSVM","CBLOF","HBOS","VAE"]
predict="/home/shaoleshi/民航/数据/NSL_KDD-master/NSL_KDD-master/多模型协同/predict1.csv"
result("VAE",predict)
#for i in Model:
#result(i,predict);

代码/val.py  Normal file (60 lines)
@@ -0,0 +1,60 @@
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
import VennABERS
import os
def accuracy_score(y, y_hat):
return sum(yi == yi_hat for yi, yi_hat in zip(y, y_hat)) / len(y)
def precision_score(y, y_hat):
true_positive = sum(yi and yi_hat for yi, yi_hat in zip(y, y_hat))
predicted_positive = sum(y_hat)
return true_positive / predicted_positive
def get_tpr(y, y_hat):
true_positive = sum(yi and yi_hat for yi, yi_hat in zip(y, y_hat))
actual_positive = sum(y)
return true_positive / actual_positive
def handle(test,j):
dd = test.sort_values(by='score',ascending=False)
print(dd.head())
max=0
flag=0
for i in range(1,112):
limit=list(dd.score)[i*100]
test['label_test']=test.score.apply(lambda x: 1 if x>limit else 0)
precision=precision_score(np.array(test.label_number),np.array(test.label_test))
tpr=get_tpr(np.array(test.label_number),np.array(test.label_test))
f1=2 * precision * tpr / (precision + tpr)
if max<f1:
max=f1
flag=limit
data=pd.read_csv("/home/shaoleshi/毕设/NSL_KDD-master/多模型协同/score/"+j+"测试集分数.csv")
data['label_test']=data.score.apply(lambda x: 1 if x>flag else 0)
accuracy=accuracy_score(np.array(data.label_number),np.array(data.label_test))
precision=precision_score(np.array(data.label_number),np.array(data.label_test))
tpr=get_tpr(np.array(data.label_number),np.array(data.label_test))
f1=2 * precision * tpr / (precision + tpr)
print(j+"的结果如下所示:")
print("准确率为{}".format(accuracy))
print("精确率为{}".format(precision))
print("召回率为{}".format(tpr))
print("F1值为{}".format(f1))
if __name__=='__main__':
Model=["PCA","MCD","IForest","LODA","LOF","KNN","CBLOF","HBOS","VAE","OCSVM","AutoEncoder"]
#Model=["CBLOF"]
for i in Model:
data=pd.read_csv("/home/shaoleshi/毕设/NSL_KDD-master/多模型协同/score/"+i+"验证集分数.csv")
handle(data,i)

代码/venn.py  Normal file (36 lines)
@@ -0,0 +1,36 @@
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
import VennABERS
def read_csv(filepath,testname,trainname):
train=pd.read_csv(filepath+trainname)
train['label_01']=train.label.apply(lambda x: 0 if x=="normal." else 1)
train.to_csv("训练.csv")
#train['new_col'] = list(zip(train.score, train.label_01))
#tac=train.new_col.tolist()
test=pd.read_csv(filepath+testname)
test['new_col'] = list(zip(test.score, test.label_number))
tac=test.new_col.tolist()
tec=test.score.tolist()
p0,p1=VennABERS.ScoresToMultiProbs(tac,tec)
print(p1.tolist()[9])
test['p0']=p0.tolist()
test['p1']=p1.tolist()
test.to_csv('p01.csv')
if __name__=='__main__':
filepath='/home/shaoleshi/民航/数据/NUSW-NB15/'
testname='测试.csv'
trainname='训练.csv'
read_csv(filepath,testname,trainname)

代码/概率预测.py  Normal file (221 lines)
@@ -0,0 +1,221 @@
# -*- coding: utf-8 -*-
import sklearn
import pandas as pd
import numpy as np
import VennABERS
from sklearn import metrics
import matplotlib
import matplotlib.pyplot as plt
import os
global yz
def plot(f,t):
plt.plot(f, t, "r", marker='*', ms=1, label="a")
plt.xlabel("fpr")
plt.ylabel("tpr")
plt.show()
def accuracy_score(y, y_hat):
return sum(yi == yi_hat for yi, yi_hat in zip(y, y_hat)) / len(y)
def precision_score(y, y_hat):
true_positive = sum(yi and yi_hat for yi, yi_hat in zip(y, y_hat))
predicted_positive = sum(y_hat)
return true_positive / predicted_positive
def get_tpr(y, y_hat):
true_positive = sum(yi and yi_hat for yi, yi_hat in zip(y, y_hat))
actual_positive = sum(y)
return true_positive / actual_positive
def count_p(p0,p1):
'''
global t
global yz
if p1-p0>=yz-0.000000001:
return 2
'''
#if p1-p0>=0.3:
#return 2
if p1/(1-p0+p1)>0.25 and p1/(1-p0+p1)<0.60:
return 2
if p1/(1-p0+p1)>0.5: # this ratio can be used to tune the ROC trade-off
return 1
else:
return 0
def count_re(a,b,c,d,e,f,g,h,i,j):
T=(a,b,c,d,e,f,g,h,i,j)
r0=T.count(0)
r1=T.count(1)
if r0==0 and r1==0:
return 2
if r1>r0:
return 1
else:
return 0
def p_yuzhi(p0,p1):
return p1-p0
def ivap(p01,model):
global yz
'''
p0p1="/home/shaoleshi/民航/数据/NSL_KDD-master/NSL_KDD-master/多模型协同/阈值.csv"
data2=pd.read_csv(p0p1)
yz=data2[model+"_p"].tolist()[0]
print(yz)
'''
data=pd.read_csv(p01)
data[model]=data.apply(lambda x:count_p(x[model+"_p0"],x[model+"_p1"]),axis=1)
pre="/home/shaoleshi/民航/数据/NSL_KDD-master/NSL_KDD-master/多模型协同/predict1.csv"
if os.path.exists(pre):
p_data=pd.read_csv(pre)
p_data[model]=data[model]
p_data.to_csv('predict1.csv',index=None)
else:
p_data=pd.merge(data['label_number'],data[model],left_index=True,right_index=True)
p_data.to_csv('predict1.csv',index=None)
def result(predict):
data=pd.read_csv(predict)
data['venn_pre']=data.apply(lambda x:count_re(x["PCA"],x["MCD"],x["IForest"],x["LODA"],x["LOF"],x["KNN"],x["OCSVM"],x["CBLOF"],x["HBOS"],x["VAE"]),axis=1)
print(len(data))
data=data[data['venn_pre']!=2]
print(len(data))
accuracy=accuracy_score(np.array(data['label_number']),np.array(data['venn_pre']))
precision=precision_score(np.array(data['label_number']),np.array(data['venn_pre']))
tpr=get_tpr(np.array(data['label_number']),np.array(data['venn_pre']))
f1=2 * precision * tpr / (precision + tpr)
print("ivap的结果如下所示:")
print("准确率为{}".format(accuracy))
print("精确率为{}".format(precision))
print("召回率为{}".format(tpr))
print("F1值为{}".format(f1))
#return list(fpr)[1],list(tpr1)[1]
# pick the p1-p0 rejection threshold for each model
def p1p0(p01,model):
i=0
data=pd.read_csv(p01)
data[model+'_p']=data.apply(lambda x:p_yuzhi(x[model+"_p0"],x[model+"_p1"]),axis=1)
#pre="/home/shaoleshi/民航/数据/kddcup.data/多模型协同/阈值.csv"
list1=data[model+'_p'].tolist()
list1=sorted(list1)
if model=="PCA":
i=0.89
if model=="MCD":
i=0.95
if model=="IForest":
i=0.99
if model=="LODA":
i=0.93
if model=="LOF":
i=0.7
if model=="KNN":
i=0.7
if model=="OCSVM":
i=0.7
if model=="CBLOF":
i=0.6
if model=="HBOS":
i=0.6
if model=="VAE":
i=0.6
t=list1[int(len(list1)*i)]
'''
print(list1[int(len(list1)*0.1)])
print(list1[int(len(list1)*0.25)])
print(list1[int(len(list1)*0.5)])
print(list1[int(len(list1)*0.75)])
print(list1[int(len(list1)*0.99)])
'''
return t
# result of the multi-model vote
def result1(predict,model):
data=pd.read_csv(predict)
#print(len(data))
#data=data[data[model]!=2]
#print(len(data))
print("模型{}结果为".format(model))
accuracy=accuracy_score(np.array(data['label_number']),np.array(data[model]))
precision=precision_score(np.array(data['label_number']),np.array(data[model]))
tpr=get_tpr(np.array(data['label_number']),np.array(data[model]))
f1=2 * precision * tpr / (precision + tpr)
print("准确率为{}".format(accuracy))
print("精确率为{}".format(precision))
print("召回率为{}".format(tpr))
print("F1值为{}".format(f1))
if __name__=="__main__":
global t
f=[]
tt=[]
Model=["PCA","MCD","IForest","LODA","LOF","KNN","OCSVM","CBLOF","HBOS","VAE"]
p01="/home/shaoleshi/民航/数据/NSL_KDD-master/NSL_KDD-master/多模型协同/p01.csv"
predict="/home/shaoleshi/民航/数据/NSL_KDD-master/NSL_KDD-master/多模型协同/predict.csv"
filepath='/home/shaoleshi/民航/数据/NSL_KDD-master/NSL_KDD-master/多模型协同/score/'
predict1="/home/shaoleshi/民航/数据/NSL_KDD-master/NSL_KDD-master/多模型协同/predict1.csv"
'''
#set the p1-p0 threshold parameters
dict_p={}
for i in Model:
t=p1p0(p01,i)
dict_p[i+'_p']=t
ppp=pd.DataFrame(dict_p,index=[0])
print(dict_p)
ppp.to_csv("阈值.csv",index=None)
#ivap(p01,"IForest") #0.85
#result1(predict1,"IForest")
#ivap(p01,"PCA") #0.6
#result1(predict1,"PCA")
#ivap(p01,"LOF") #0.7
#result1(predict1,"LOF")
'''
# write out each model's predictions
for i in Model:
ivap(p01,i)
#result1(predict1,i)
result(predict1)
'''
#set the p1-p0 threshold parameters
dict_p={}
for i in Model:
t=p1p0(p01,i)
dict_p[i+'_p']=t
ppp=pd.DataFrame(dict_p,index=[0])
print(dict_p)
ppp.to_csv("阈值.csv",index=None)
'''

代码/说明  Normal file (1 line)
@@ -0,0 +1 @@
关键代码 (key code)