我生成了一个包含六个变量的模拟数据集,训练集和测试集各有 500 个观测值。无监督模型只使用 X 变量,Y 变量仅用于验证。异常值的比例设定为 5%,即 contamination=0.05。可以绘制前两个变量的散点图:黄色的点表示异常值,紫色的点表示正常数据点。
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pyod.utils.data import generate_data

# Settings for the simulated data set.
contamination = 0.05  # fraction of outliers (5%)
n_train = 500  # number of training points
n_test = 500  # number of testing points
n_features = 6  # number of features

# A fixed random_state keeps the simulated data reproducible between runs.
X_train, X_test, y_train, y_test = generate_data(
    n_train=n_train,
    n_test=n_test,
    n_features=n_features,
    contamination=contamination,
    random_state=123,
)
from pyod.models.iforest import IForest

# Fit an Isolation Forest on the training features only (unsupervised:
# y_train is never passed to the model).
# NOTE(review): `behaviour` is a legacy scikit-learn compatibility switch;
# confirm the installed PyOD version still accepts it.
isft = IForest(contamination=0.05, max_samples=40, behaviour='new')
isft.fit(X_train)
# Training data: raw anomaly scores and predicted labels.
y_train_scores = isft.decision_function(X_train)
y_train_pred = isft.predict(X_train)

# Test data: scored with the model fitted on the training data.
y_test_scores = isft.decision_function(X_test)
y_test_pred = isft.predict(X_test)  # outlier labels (0 or 1)
# Threshold for the defined contamination rate
print("The threshold for the defined contamination rate:", isft.threshold_)
def count_stat(vector):
    """Count how often each distinct value occurs in *vector*.

    Intended for 0/1 label vectors such as the output of ``predict``, but
    works for any array-like that ``np.unique`` accepts.

    Args:
        vector: Array-like of values to tally.

    Returns:
        A dict mapping each distinct value to its number of occurrences.
        An empty input gives an empty dict.
    """
    unique, counts = np.unique(vector, return_counts=True)
    return dict(zip(unique, counts))
# Number of predicted normal points (0) and outliers (1) in each data set.
print("The training data:", count_stat(y_train_pred))
# This line reports the test-set predictions, so it is labelled accordingly.
print("The test data:", count_stat(y_test_pred))

# Output recorded from the original run (the first line comes from the
# threshold print earlier in the script):
#   The threshold for the defined contamination rate: -3.986394547794703e-15
#   The training data: {0: 475, 1: 25}
#   The test data: {0: 470, 1: 30}
# Cut-off used to split observations into groups: the threshold stored on the
# fitted model, or another value picked from a histogram of the anomaly scores.
threshold = isft.threshold_
def descriptive_stat_threshold(df, pred_score, threshold):
    """Summarise the features for the 'Normal' and 'Outlier' groups.

    Each observation is labelled by comparing its anomaly score with
    *threshold*; the result holds, per group, the number of observations,
    their share of the total, and the mean of every numeric column.

    Args:
        df: Feature data; anything ``pd.DataFrame`` accepts (e.g. a 2-D
            array or a DataFrame). The caller's object is not modified.
        pred_score: Anomaly scores, one per row of *df*.
        threshold: Cut-off score. Scores at or below it are 'Normal',
            scores above it are 'Outlier'.

    Returns:
        A DataFrame with one row per group and the columns 'Group',
        'Count', 'Count %', followed by the per-group means (rounded to two
        decimals) of the feature columns and of 'Anomaly_Score'.
    """
    # Work on a copy so that adding the helper columns never touches the
    # caller's data.
    df = pd.DataFrame(df).copy()
    df['Anomaly_Score'] = pred_score
    # NOTE(review): the comparison was lost in the source text. `<=` is used
    # so that only scores strictly above the threshold count as outliers,
    # which is how PyOD's own predict() labels them -- confirm.
    df['Group'] = np.where(df['Anomaly_Score'] <= threshold, 'Normal', 'Outlier')

    # The count and count % of each group.
    cnt = (
        df.groupby('Group')['Anomaly_Score']
        .count()
        .reset_index()
        .rename(columns={'Anomaly_Score': 'Count'})
    )
    cnt['Count %'] = (cnt['Count'] / cnt['Count'].sum()) * 100

    # The per-group averages; numeric_only keeps a stray non-numeric column
    # from making the mean raise.
    stat = df.groupby('Group').mean(numeric_only=True).round(2).reset_index()

    # Put the count and the averages together.
    return cnt.merge(stat, left_on='Group', right_on='Group')
from pyod.models.combination import aom, moa, average, maximization
from pyod.utils.utility import standardizer
from pyod.models.iforest import IForest

# Standardize data: the training and test features are rescaled together in
# one call so both end up on the same scale.
X_train_norm, X_test_norm = standardizer(X_train, X_test)
# Test a range of the number of trees
k_list = [100, 200, 300, 400, 500]
n_clf = len(k_list)

# Score matrices with one row per observation and one column per model, so
# the results of every model can be stored side by side.
train_scores = np.zeros([X_train.shape[0], n_clf])
test_scores = np.zeros([X_test.shape[0], n_clf])

# Modeling: fit one Isolation Forest per tree count.
# (Alternative experiment: vary max_samples=k instead of n_estimators=k.)
for i, k in enumerate(k_list):
    isft = IForest(contamination=0.05, n_estimators=k)
    isft.fit(X_train_norm)

    # Store the results in each column:
    train_scores[:, i] = isft.decision_function(X_train_norm)
    test_scores[:, i] = isft.decision_function(X_test_norm)

# Decision scores have to be normalized before combination
train_scores_norm, test_scores_norm = standardizer(train_scores, test_scores)
import matplotlib.pyplot as plt

# Combination by average
# test_scores_norm has one row per test observation and one column per fitted
# model (five columns for the five tree counts in k_list). The "average"
# function averages across those columns, so "y_test_by_average" holds a
# single combined score per observation.
y_train_by_average = average(train_scores_norm)
y_test_by_average = average(test_scores_norm)

# Histogram of the combined training scores, useful for picking a threshold.
plt.hist(y_train_by_average, bins='auto')  # arguments are passed to np.histogram
plt.title("Combination by average")
plt.show()