线性变换如图(A)所示:通过旋转原始的 X 轴和 Y 轴,使新的坐标轴更好地拟合数据体(红色部分)。主成分分析中的第一个主成分(PC1)捕捉数据中最大的方差,第二个主成分(PC2)则捕捉 PC1 未能解释的剩余方差中最大的部分。后续的主成分依次捕捉前面各主成分未能解释的方差,直到所有方差都被解释。主成分的总数等于原始变量的数量。
我生成了一个包含 500 个观测值和 6 个变量的模拟数据集。异常值的百分比被设定为 5%。无监督模型只使用 X 变量,而模拟数据集中的目标变量 Y 仅用于验证。
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pyod.utils.data import generate_data

# Parameters of the simulated data set.
contamination = 0.05  # fraction of outliers (5%)
n_train = 500  # number of training points
n_test = 500  # number of testing points
n_features = 6  # number of features

# The y labels are produced alongside the features; the unsupervised model
# is fitted on X only, so y is kept purely for validating the results.
X_train, X_test, y_train, y_test = generate_data(
    n_train=n_train,
    n_test=n_test,
    n_features=n_features,
    contamination=contamination,
    random_state=123,
)
from pyod.models.pca import PCA

# Fit the PCA detector on the training features only (unsupervised).
# contamination=0.05 matches the 5% outlier share of the simulated data.
pca = PCA(contamination=0.05)
pca.fit(X_train)
# get the prediction labels and outlier scores of the training data
y_train_pred = pca.labels_  # binary labels (0: inliers, 1: outliers)
y_train_scores = pca.decision_scores_  # raw outlier scores for the training data

# The same quantities can also be obtained through the method calls:
y_train_scores = pca.decision_function(X_train)  # You also can use .decision_function()
y_train_pred = pca.predict(X_train)  # You also can use .predict()
# Score and label the test data with the fitted model.
y_test_scores = pca.decision_function(X_test)  # outlier scores
y_test_pred = pca.predict(X_test)  # outlier labels (0 or 1)
# get the prediction on the test data
y_test_pred = pca.predict(X_test)  # outlier labels (0 or 1)
y_test_scores = pca.decision_function(X_test)  # outlier scores
import matplotlib.pyplot as plt

# Plot the distribution of the training outlier scores.
plt.hist(y_train_scores, bins='auto')  # arguments are passed to np.histogram
plt.title("Histogram with 'auto' bins")
plt.xlabel('PCA outlier score')
plt.show()
def descriptive_stat_threshold(df, pred_score, threshold):
    """Summarize the normal and outlier groups defined by a score threshold.

    Rows whose score is strictly below ``threshold`` are labelled
    ``'Normal'``; all other rows are labelled ``'Outlier'``.

    Args:
        df: Feature data; anything accepted by ``pd.DataFrame``.
        pred_score: One anomaly score per row of ``df``.
        threshold: Score cut-off separating the two groups.

    Returns:
        A DataFrame with one row per group containing ``Group``, ``Count``,
        ``Count %`` and the per-group mean (rounded to 2 decimals) of every
        feature column and of ``Anomaly_Score``.
    """
    # Copy so the helper columns added below never appear on the caller's data.
    df = pd.DataFrame(df).copy()
    df['Anomaly_Score'] = pred_score
    df['Group'] = np.where(df['Anomaly_Score'] < threshold, 'Normal', 'Outlier')

    # Group sizes and each group's share of all observations.
    cnt = (
        df.groupby('Group')['Anomaly_Score']
        .count()
        .reset_index()
        .rename(columns={'Anomaly_Score': 'Count'})
    )
    cnt['Count %'] = (cnt['Count'] / cnt['Count'].sum()) * 100

    # Per-group averages of every column.
    stat = df.groupby('Group').mean().round(2).reset_index()

    # Put the counts and the averages side by side.
    return cnt.merge(stat, left_on='Group', right_on='Group')
# get the outlier scores of the test data from the fitted PCA model
y_test_pca_scores = pca.decision_function(X_test)