# Blockchain_phishing_node_id.../model.py
import pandas as pd
import numpy as np
import lightgbm as lgb
# A small part of this code was adapted from Dr. Chen's code materials;
# the borrowed sections are marked with *.
class Model:
    def __init__(self, train_data, train_label):
        self.train_data = train_data
        self.train_label = train_label
        self.lgb_params = {
            'boosting_type': 'goss',
            'objective': 'binary',
            'learning_rate': 0.2,
            'num_leaves': 40,
            'verbosity': -1,
            'max_depth': 8,
            'metric': {'l2', 'auc'}
        }
        # Fraction of each training fold held out for validation.
        self.slice = 1 / 3
    def init_train_data(self):
        # Duplicate every transaction with 'from'/'to' swapped and the value
        # negated, so each edge is seen from both endpoints.
        reverse_train_data = self.train_data.copy(deep=True)
        reverse_train_data.columns = ['blockNumber', 'to', 'from', 'value']
        reverse_train_data['value'] *= -1
        self.train_data = pd.concat([self.train_data, reverse_train_data], axis=0)
        self.train_data.sort_index(inplace=True)
        feature = self.get_feature(self.train_data)
        feature.columns = list(map(self.deal_feature_name, feature.columns))
        train_list, train_list_y = self.concat_feature(feature, self.train_label)
        train_list_fd, train_list_y_fd = self.concat_feature(feature, self.train_label, fd=1)
        return train_list, train_list_y, train_list_fd, train_list_y_fd
    def model_train(self, train_list, train_list_y, train_list_fd, train_list_y_fd):
        for i in range(0, 5):
            # Full-feature fold: hold out the last third for validation.
            x_train = train_list[i]
            y_train = train_list_y[i]
            split = int((1 - self.slice) * len(x_train))
            train = x_train[0:split]
            train_y = y_train[0:split]
            test = x_train[split:]
            test_y = y_train[split:]
            train_set = lgb.Dataset(train, label=train_y, free_raw_data=False)
            test_set = lgb.Dataset(test, label=test_y, free_raw_data=False)
            # Reduced-feature (fd) fold, split the same way.
            x_train_fd = train_list_fd[i]
            y_train_fd = train_list_y_fd[i]
            split_fd = int((1 - self.slice) * len(x_train_fd))
            train_fd = x_train_fd[0:split_fd]
            train_y_fd = y_train_fd[0:split_fd]
            test_fd = x_train_fd[split_fd:]
            test_y_fd = y_train_fd[split_fd:]
            train_set_fd = lgb.Dataset(train_fd, label=train_y_fd, free_raw_data=False)
            test_set_fd = lgb.Dataset(test_fd, label=test_y_fd, free_raw_data=False)
            if i == 0:
                # Logging and early stopping go through callbacks here;
                # `verbose_eval` / `early_stopping_rounds` were removed from
                # lgb.train() in LightGBM 4.0.
                lgb_model = lgb.train(self.lgb_params, train_set, keep_training_booster=True,
                                      num_boost_round=200, valid_sets=[test_set],
                                      callbacks=[lgb.log_evaluation(5), lgb.early_stopping(100)])
                lgb_model_fd = lgb.train(self.lgb_params, train_set_fd, keep_training_booster=True,
                                         num_boost_round=200, valid_sets=[test_set_fd],
                                         callbacks=[lgb.log_evaluation(5), lgb.early_stopping(100)])
            else:
                # Later folds continue training the boosters from the
                # previous fold via init_model.
                lgb_model = lgb.train(self.lgb_params, train_set, keep_training_booster=True, init_model=lgb_model,
                                      num_boost_round=200, valid_sets=[test_set],
                                      callbacks=[lgb.log_evaluation(5), lgb.early_stopping(100)])
                lgb_model_fd = lgb.train(self.lgb_params, train_set_fd, keep_training_booster=True, init_model=lgb_model_fd,
                                         num_boost_round=200, valid_sets=[test_set_fd],
                                         callbacks=[lgb.log_evaluation(5), lgb.early_stopping(100)])
        print("done!!!!!!!")
        lgb_model.save_model('model.txt')
        lgb_model_fd.save_model('model_fd.txt')
    def concat_feature(self, feature, train_label, fd=0):
        train_data = pd.merge(left=feature, right=train_label, left_on='address', right_on='address', how='inner')
        # Undersample: 200 phishing nodes are paired with five disjoint groups
        # of 200 common nodes to form five balanced training folds.
        fishing_node = train_data[train_data['label'] == 1].sample(n=200)
        common_node = train_data[train_data['label'] == 0].sample(n=1000)
        common_node_list = []
        for i in range(0, 5):
            common_node_list.append(common_node[200 * i:200 * (i + 1)])
        train_list = []
        train_list_y = []
        for i in range(0, 5):
            train_list.append(pd.concat([fishing_node, common_node_list[i]]))
            train_list[i] = train_list[i].sample(frac=1)  # shuffle rows
            train_list_y.append(train_list[i]['label'])
            train_list[i] = train_list[i].drop(['address', 'label'], axis=1)
            if fd == 1:
                # The fd variant keeps only the node-level block/value
                # statistics: drop all one- and two-degree network features.
                train_list[i].drop(['from_in_sum_sum',
                                    'from_in_sum_std', 'from_in_sum_median', 'from_in_sum_max',
                                    'from_in_sum_min', 'from_in_mean_sum', 'from_in_mean_std',
                                    'from_in_mean_median', 'from_in_mean_max', 'from_in_mean_min',
                                    'from_in_max_sum', 'from_in_max_std', 'from_in_max_median',
                                    'from_in_max_max', 'from_in_max_min', 'from_in_min_sum',
                                    'from_in_min_std', 'from_in_min_median', 'from_in_min_max',
                                    'from_in_min_min', 'out_count', 'out_nunique',
                                    'out_count_nunique_ratio', 'out_count_nunique_equal', 'to_out_sum_sum',
                                    'to_out_sum_std', 'to_out_sum_median', 'to_out_sum_max',
                                    'to_out_sum_min', 'to_out_mean_sum', 'to_out_mean_std',
                                    'to_out_mean_median', 'to_out_mean_max', 'to_out_mean_min',
                                    'to_out_max_sum', 'to_out_max_std', 'to_out_max_median',
                                    'to_out_max_max', 'to_out_max_min', 'to_out_min_sum', 'to_out_min_std',
                                    'to_out_min_median', 'to_out_min_max', 'to_out_min_min', 'in_count',
                                    'in_nunique', 'in_count_nunique_ratio', 'in_count_nunique_equal'], axis=1, inplace=True)
        return train_list, train_list_y
    def get_node_feature(self, train_data):
        # *
        # Node-level features: block-number spread (ptp) and value statistics
        # per address, computed separately over outgoing ('from') and incoming
        # ('to') transactions, then joined on the address.
        from_block_feature = train_data.groupby(['from'])['blockNumber'].agg([np.ptp, 'std'])
        from_block_feature = from_block_feature.add_prefix('from_block_')
        from_value_feature = train_data.groupby(['from'])['value'].agg(['sum', 'mean', 'std', 'max', 'min'])
        from_value_feature = from_value_feature.add_prefix('from_value_')
        from_feature = pd.concat([from_block_feature, from_value_feature], axis=1)
        to_block_feature = train_data.groupby(['to'])['blockNumber'].agg([np.ptp, 'std'])
        to_block_feature = to_block_feature.add_prefix('to_block_')
        to_value_feature = train_data.groupby(['to'])['value'].agg(['sum', 'mean', 'std', 'max', 'min'])
        to_value_feature = to_value_feature.add_prefix('to_value_')
        to_feature = pd.concat([to_block_feature, to_value_feature], axis=1)
        from_feature.reset_index(inplace=True)
        to_feature.reset_index(inplace=True)
        temp_feature = pd.merge(left=from_feature, right=to_feature, left_on='from', right_on='to', how='inner')
        temp_feature['address'] = temp_feature['from']
        temp_feature.drop(['from', 'to'], axis=1, inplace=True)
        return temp_feature
    def get_net_feature(self, train_data):
        # *
        # One-degree features: how many transactions an address sends and to
        # how many distinct counterparties.
        from_to_one_degree_feature = train_data.groupby(['from'])['to'].agg(['count', 'nunique'])
        from_to_one_degree_feature = from_to_one_degree_feature.add_prefix('from_to_').reset_index()
        from_to_one_degree_feature['from_to_count_nunique_ratio'] = from_to_one_degree_feature['from_to_count'] / from_to_one_degree_feature['from_to_nunique']
        from_to_one_degree_feature['from_to_count_nunique_equal'] = (from_to_one_degree_feature['from_to_count'] == from_to_one_degree_feature['from_to_nunique']).astype(int)
        # Two-degree features: per-(from, to) value statistics, aggregated
        # again per 'from'; to_records() flattens the resulting MultiIndex
        # columns into strings that deal_feature_name() cleans up later.
        from_to_two_degree_feature = train_data.groupby(['from', 'to'])['value'].agg(['sum', 'mean', 'max', 'min']).groupby(['from']).agg(
            ['sum', 'std', 'median', 'max', 'min']).add_prefix('from_to_')
        from_to_two_degree_feature = pd.DataFrame(from_to_two_degree_feature.to_records(), index=None)
        from_to_net_feature = from_to_two_degree_feature.merge(right=from_to_one_degree_feature, left_on='from', right_on='from', how='inner')
        # Mirror of the above for incoming ('to') transactions.
        to_from_one_degree_feature = train_data.groupby(['to'])['from'].agg(['count', 'nunique'])
        to_from_one_degree_feature = to_from_one_degree_feature.add_prefix('to_from_').reset_index()
        to_from_one_degree_feature['to_from_count_nunique_ratio'] = to_from_one_degree_feature['to_from_count'] / to_from_one_degree_feature['to_from_nunique']
        to_from_one_degree_feature['to_from_count_nunique_equal'] = (to_from_one_degree_feature['to_from_count'] == to_from_one_degree_feature['to_from_nunique']).astype(int)
        to_from_two_degree_feature = train_data.groupby(['to', 'from'])['value'].agg(['sum', 'mean', 'max', 'min']).groupby(['to']).agg(
            ['sum', 'std', 'median', 'max', 'min']).add_prefix('to_from_')
        to_from_two_degree_feature = pd.DataFrame(to_from_two_degree_feature.to_records(), index=None)
        to_from_net_feature = to_from_two_degree_feature.merge(right=to_from_one_degree_feature, left_on='to', right_on='to', how='inner')
        return from_to_net_feature, to_from_net_feature
    def get_feature(self, train_data):
        node_feature = self.get_node_feature(train_data)
        from_to_net_feature, to_from_net_feature = self.get_net_feature(train_data)
        feature = node_feature.merge(right=from_to_net_feature, left_on='address', right_on='from', how='left')
        feature = feature.merge(right=to_from_net_feature, left_on='address', right_on='to', how='left')
        feature.drop(['from', 'to'], inplace=True, axis=1)
        return feature
    def deal_feature_name(self, x):
        # *
        # Normalize feature names. Two-degree columns arrive as stringified
        # tuples like "('from_to_sum', 'from_to_std')"; strip the parentheses,
        # quotes and commas, then rename the prefixes to in/out terminology.
        if x.count('from_to_') == 2:
            return 'from_in_' + x[1:-1].replace('from_to_', '').replace(',', '').replace('\'', '').replace(' ', '_')
        if x.count('to_from_') == 2:
            return 'to_out_' + x[1:-1].replace('to_from_', '').replace(',', '').replace('\'', '').replace(' ', '_')
        if x.count('from_to_') == 1:
            return x.replace('from_to', 'out')
        if x.count('to_from_') == 1:
            return x.replace('to_from', 'in')
        if x.count('block') == 1 or x.count('value') == 1:
            return x.replace('from_block', 'out_block').replace('from_value', 'out_value').replace('to_block', 'in_block').replace('to_value', 'in_value')
        return x
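
# Hedged sketch (not part of the original file): one possible way to score
# addresses with the two boosters that model_train() saves. The helper name
# `predict_addresses` and the simple mean ensemble are illustrative
# assumptions; `feature` is expected to carry the renamed feature columns
# produced in init_train_data().
def predict_addresses(feature, model_path='model.txt', model_fd_path='model_fd.txt'):
    booster = lgb.Booster(model_file=model_path)
    booster_fd = lgb.Booster(model_file=model_fd_path)
    # Align the input columns with what each booster was trained on.
    x = feature[booster.feature_name()]
    x_fd = feature[booster_fd.feature_name()]
    # Average the two phishing-probability estimates.
    return (booster.predict(x) + booster_fd.predict(x_fd)) / 2
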
if __name__ == '__main__':
    train_data = pd.read_csv('./train_data.csv')
    train_label = pd.read_csv('./train_label.csv')
    model = Model(train_data, train_label)
    train_list, train_list_y, train_list_fd, train_list_y_fd = model.init_train_data()
    model.model_train(train_list, train_list_y, train_list_fd, train_list_y_fd)
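    # Hedged usage sketch: score addresses with the boosters saved above,
    # assuming the features are rebuilt exactly as in init_train_data().
    # feature = model.get_feature(model.train_data)
    # feature.columns = list(map(model.deal_feature_name, feature.columns))
    # scores = predict_addresses(feature)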