# Libraries for this section
import numpy as np
import pandas as pd  # pd.DataFrame is used below but pandas was never imported
import shap

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, train_test_split  # KFold is used below but was never imported
# ---------------------------------------------------------------------------
# Baseline: single train/test split.
# Split the data, fit a random forest, report held-out RMSE, then explain the
# held-out predictions with SHAP.
# NOTE(review): X and y are assumed to be a pandas DataFrame / Series created
# earlier in the pipeline — confirm against the data-loading step.
# ---------------------------------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=10
)

# Fixed random_state so results are reproducible (same results every time).
model = RandomForestRegressor(random_state=10)
fit = model.fit(X_train, y_train)
yhat = fit.predict(X_test)
result = mean_squared_error(y_test, yhat)
print('RMSE:', round(np.sqrt(result), 4))

# Use SHAP to explain the held-out predictions.
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)
# FIX: summary_plot expects the feature *matrix* (the rows the SHAP values
# were computed for) as `features`; the original passed X.columns (only the
# names), which breaks the per-feature value colouring. Names belong in
# `feature_names`.
shap.summary_plot(shap_values, features=X_test, feature_names=X.columns)
# Collect the train/test indices for every fold of the CV splitter.
# NOTE(review): `CV` is only (re)defined later in this file — confirm a
# splitter exists before this cell runs.
ix_training, ix_test = [], []
for train_ix, test_ix in CV.split(df):
    ix_training.append(train_ix)
    ix_test.append(test_ix)

# Fit one model per outer fold, explain its held-out rows with SHAP, and pool
# every per-sample SHAP row into a single flat list.
SHAP_values_per_fold = []
for fold_no, (train_outer_ix, test_outer_ix) in enumerate(zip(ix_training, ix_test)):
    print('\n------ Fold Number:', fold_no)
    X_train, X_test = X.iloc[train_outer_ix, :], X.iloc[test_outer_ix, :]
    y_train, y_test = y.iloc[train_outer_ix], y.iloc[test_outer_ix]

    # Same fixed seed in every fold so results are reproducible.
    model = RandomForestRegressor(random_state=10)
    fit = model.fit(X_train, y_train)
    yhat = fit.predict(X_test)
    result = mean_squared_error(y_test, yhat)
    print('RMSE:', round(np.sqrt(result), 4))

    # Explain this fold's held-out predictions.
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X_test)
    for SHAPs in shap_values:
        SHAP_values_per_fold.append(SHAPs)
np.random.seed(1)  # global seed -> the per-repeat seeds below are reproducible

# Draw one random seed per CV repeat from [0, 10000); each seed will drive a
# different KFold shuffle, i.e. a different data split.
CV_repeats = 10
random_states = np.random.randint(10000, size=CV_repeats)
# Track the SHAP values of every observation across CV repetitions:
#   shap_values_per_cv[sample][repeat] -> SHAP row for that sample/repeat
# (leaves start out as empty dicts and are filled in by the CV loop below).
shap_values_per_cv = {
    sample: {CV_repeat: {} for CV_repeat in range(CV_repeats)}
    for sample in X.index
}
# ---------------------------------------------------------------------------
# Repeated cross-validation: for every repeat, re-split the data with a fresh
# seed, fit one model per fold, and store each held-out sample's SHAP row
# under shap_values_per_cv[sample][repeat].
# FIX: the original reused `i` as the loop variable of all three nested loops
# (repeat index, fold index, within-fold row index) and wrote
# `enumerate(range(CV_repeats))`, whose index and value are identical; each
# loop now has its own distinct name.
# ---------------------------------------------------------------------------
for CV_repeat in range(CV_repeats):
    print('\n------------ CV Repeat number:', CV_repeat)
    # A new split scheme per repeat, seeded from the pre-drawn seed list.
    CV = KFold(n_splits=5, shuffle=True, random_state=random_states[CV_repeat])

    # Gather the train/test indices of every fold.
    ix_training, ix_test = [], []
    for fold in CV.split(df):
        ix_training.append(fold[0])
        ix_test.append(fold[1])

    for fold_no, (train_outer_ix, test_outer_ix) in enumerate(zip(ix_training, ix_test)):
        print('\n------ Fold Number:', fold_no)
        X_train, X_test = X.iloc[train_outer_ix, :], X.iloc[test_outer_ix, :]
        y_train, y_test = y.iloc[train_outer_ix], y.iloc[test_outer_ix]

        # Fixed seed for reproducibility (same results every time).
        model = RandomForestRegressor(random_state=10)
        fit = model.fit(X_train, y_train)
        yhat = fit.predict(X_test)
        result = mean_squared_error(y_test, yhat)
        print('RMSE:', round(np.sqrt(result), 4))

        # Explain this fold's held-out rows.
        explainer = shap.TreeExplainer(model)
        shap_values = explainer.shap_values(X_test)

        # Store each held-out row's SHAP values per fold per sample.
        # NOTE(review): assumes X has a 0..n-1 RangeIndex so the positional
        # test indices match the keys built from X.index — confirm upstream.
        for row_no, test_index in enumerate(test_outer_ix):
            shap_values_per_cv[test_index][CV_repeat] = shap_values[row_no]
# Per-sample summary statistics of the SHAP values gathered across repeats:
# the mean, the standard deviation, and the (max - min) range per feature.
average_shap_values, stds, ranges = [], [], []
for sample_ix in range(len(df)):
    # Frame with one column per CV repeat, one row per feature, for this sample.
    # NOTE(review): indexing shap_values_per_cv by position assumes a 0..n-1
    # RangeIndex on X — confirm upstream.
    df_per_obs = pd.DataFrame.from_dict(shap_values_per_cv[sample_ix])
    average_shap_values.append(df_per_obs.mean(axis=1).values)
    stds.append(df_per_obs.std(axis=1).values)
    ranges.append(df_per_obs.max(axis=1).values - df_per_obs.min(axis=1).values)
# ---------------------------------------------------------------------------
# Reshape the per-sample SHAP ranges to long form for plotting.
# FIX: the original built the long table with two nested Python loops using
# `ranges.T[j][i]` (re-transposing the frame on every access, quadratic) and
# produced an object-dtype frame via pd.DataFrame([values, labels]).T;
# `melt` performs the same wide->long reshape in one vectorised call and
# keeps the values numeric.
# ---------------------------------------------------------------------------
ranges = pd.DataFrame(ranges)
ranges.columns = X.columns

import seaborn as sns
from matplotlib import pyplot as plt

# One (value, feature-name) row per sample per feature.
long_df = ranges.melt(var_name='Features', value_name='Values')
long_df = long_df[['Values', 'Features']]  # match the original column order
# Strip plot of the raw per-sample SHAP ranges, one strip per feature.
title = 'Range of SHAP values per sample across all\ncross-validation repeats'
xlab, ylab = 'SHAP Value Variability', 'SHAP range per sample'
range_plot = sns.catplot(data=long_df, x='Features', y='Values')
range_plot.set(xlabel=xlab, ylabel=ylab, title=title)
plt.xticks(rotation=45)  # keep long feature names readable
# Same plot on the mean-scaled ranges.
# NOTE(review): `standardized` (the long-form frame of scaled ranges) is not
# defined anywhere in this file — confirm the cell that builds it is present.
title = 'Scaled Range of SHAP values per sample \nacross all cross-validation repeats'
scaled_plot = sns.catplot(data=standardized, x='Features', y='Values')
scaled_plot.set(xlabel='SHAP Value Variability Scaled by Mean', title=title)
plt.xticks(rotation=45)  # keep long feature names readable
# Hyperparameter-tuned variant: use fewer repeats, because every outer fold
# now runs a randomized search on top of the model fit.
CV_repeats = 2
from sklearn.model_selection import RandomizedSearchCV
# ---------------------------------------------------------------------------
# Repeated CV with per-fold hyperparameter tuning: a RandomizedSearchCV runs
# inside every outer fold, and SHAP values are taken from the tuned (best)
# estimator on that fold's held-out rows.
# NOTE(review): `param_grid` and `cv_inner` are not defined in this file —
# confirm they are created before this cell runs.
# FIXES vs. original:
#  * `i` was reused as the loop variable of all three nested loops; each loop
#    now has its own name.
#  * the explicit result.best_estimator_.fit(X_train, y_train) was removed —
#    with refit=True, search.fit already refits the best estimator on the
#    same training data, so the extra fit repeated identical work.
# ---------------------------------------------------------------------------
for CV_repeat in range(CV_repeats):
    print('\n------------ CV Repeat number:', CV_repeat)
    # New split scheme per repeat, seeded from the pre-drawn seed list.
    CV = KFold(n_splits=5, shuffle=True, random_state=random_states[CV_repeat])

    # Gather the train/test indices of every fold.
    ix_training, ix_test = [], []
    for fold in CV.split(df):
        ix_training.append(fold[0])
        ix_test.append(fold[1])

    for fold_no, (train_outer_ix, test_outer_ix) in enumerate(zip(ix_training, ix_test)):
        print('\n------ Fold Number:', fold_no)
        X_train, X_test = X.iloc[train_outer_ix, :], X.iloc[test_outer_ix, :]
        y_train, y_test = y.iloc[train_outer_ix], y.iloc[test_outer_ix]

        # Inner randomized search to optimize hyperparameters; refit=True
        # leaves best_estimator_ fitted on the whole outer-fold training set.
        model = RandomForestRegressor(random_state=10)
        search = RandomizedSearchCV(model, param_grid,
                                    scoring='neg_mean_squared_error',
                                    cv=cv_inner, refit=True)
        result = search.fit(X_train, y_train)

        # Use SHAP to explain the held-out fold with the tuned estimator.
        explainer = shap.TreeExplainer(result.best_estimator_)
        shap_values = explainer.shap_values(X_test)

        # Store each held-out row's SHAP values per fold per sample.
        # NOTE(review): assumes X has a 0..n-1 RangeIndex so positional test
        # indices match the keys of shap_values_per_cv — confirm upstream.
        for row_no, test_index in enumerate(test_outer_ix):
            shap_values_per_cv[test_index][CV_repeat] = shap_values[row_no]