# Entropy
from math import log

def calc_ent(datasets):
    data_length = len(datasets)
    label_count = {}
    for i in range(data_length):
        label = datasets[i][-1]
        if label not in label_count:
            label_count[label] = 0
        label_count[label] += 1
    ent = -sum([(p / data_length) * log(p / data_length, 2)
                for p in label_count.values()])
    return ent
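calc_ent computes the empirical entropy of the class label (the last column of each sample): H(D) = -sum_k (|C_k|/|D|) * log2(|C_k|/|D|), where C_k is the set of samples belonging to class k.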
Conditional entropy
def cond_ent(datasets, axis=0):
    data_length = len(datasets)
    feature_sets = {}
    # group the samples by the value of feature `axis`
    for i in range(data_length):
        feature = datasets[i][axis]
        if feature not in feature_sets:
            feature_sets[feature] = []
        feature_sets[feature].append(datasets[i])
    cond_ent = sum([(len(p) / data_length) * calc_ent(p)
                    for p in feature_sets.values()])
    return cond_ent
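cond_ent is the empirical conditional entropy: it weights the entropy of each group by its relative size, H(D|A) = sum_i (|D_i|/|D|) * H(D_i), where D_i holds the samples taking the i-th value of feature A.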
calc_ent(datasets)
0.9709505944546686
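The value above corresponds to a dataset with a 9-to-6 binary label split, since H(9/15, 6/15) ≈ 0.971. A minimal sketch that reproduces it and exercises cond_ent; the rows below are illustrative stand-ins, not the notebook's actual data:

toy = [['x', 'yes']] * 9 + [['x', 'no']] * 6
calc_ent(toy)            # 0.9709505944546686

toy2 = [['A', 'yes'], ['A', 'yes'], ['A', 'no'], ['B', 'no']]
cond_ent(toy2, axis=0)   # (3/4) * H(2/3, 1/3) + (1/4) * 0 ≈ 0.689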
Information gain
def info_gain(ent, cond_ent):
    return ent - cond_ent
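Information gain is the reduction in entropy obtained by conditioning on a feature, g(D, A) = H(D) - H(D|A). On the illustrative toy2 data above:

info_gain(calc_ent(toy2), cond_ent(toy2, axis=0))   # 1.0 - 0.689 ≈ 0.311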
def info_gain_train(datasets):
    # `labels` is the list of feature names, defined elsewhere in the notebook
    count = len(datasets[0]) - 1
    ent = calc_ent(datasets)
    best_feature = []
    for c in range(count):
        c_info_gain = info_gain(ent, cond_ent(datasets, axis=c))
        best_feature.append((c, c_info_gain))
        print('Information gain of feature ({}): {:.3f}'.format(labels[c], c_info_gain))
    # pick the feature with the largest gain
    best_ = max(best_feature, key=lambda x: x[-1])
    return 'Feature ({}) has the largest information gain; choose it as the root node feature'.format(labels[best_[0]])
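A hedged usage sketch on the toy data, with a one-feature stand-in for the notebook's real feature-name list:

labels = ['feature_0']          # stand-in; the notebook defines its own labels
print(info_gain_train(toy2))    # feature_0 is the only candidate, gain ≈ 0.311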
# The same computations as methods of the decision-tree class
# (class definition not shown in this excerpt):

    # Entropy
    @staticmethod
    def calc_ent(datasets):
        data_length = len(datasets)
        label_count = {}
        for i in range(data_length):
            label = datasets[i][-1]
            if label not in label_count:
                label_count[label] = 0
            label_count[label] += 1
        ent = -sum([(p / data_length) * log(p / data_length, 2)
                    for p in label_count.values()])
        return ent
    # Empirical conditional entropy
    def cond_ent(self, datasets, axis=0):
        data_length = len(datasets)
        feature_sets = {}
        for i in range(data_length):
            feature = datasets[i][axis]
            if feature not in feature_sets:
                feature_sets[feature] = []
            feature_sets[feature].append(datasets[i])
        cond_ent = sum([(len(p) / data_length) * self.calc_ent(p)
                        for p in feature_sets.values()])
        return cond_ent
    # Information gain
    @staticmethod
    def info_gain(ent, cond_ent):
        return ent - cond_ent
    def info_gain_train(self, datasets):
        count = len(datasets[0]) - 1
        ent = self.calc_ent(datasets)
        best_feature = []
        for c in range(count):
            c_info_gain = self.info_gain(ent, self.cond_ent(datasets, axis=c))
            best_feature.append((c, c_info_gain))
        # return the (feature_index, gain) pair with the largest gain,
        # so the tree's train() method can split on that feature
        best_ = max(best_feature, key=lambda x: x[-1])
        return best_
feature_list = train_data[max_feature_name].value_counts().index
for f in feature_list:
    # for each value f of the best feature, keep the matching rows and
    # drop the feature column before recursing on the subset
    sub_train_df = train_data.loc[train_data[max_feature_name] == f].drop(
        [max_feature_name], axis=1)
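The fragment above is the recursive split step of the tree's train() method: for every value f of the winning feature, keep the matching rows, drop the feature column, and train a subtree on the remainder. A self-contained sketch of that recursion, using the standalone functions defined earlier and plain dicts instead of the notebook's node class (all names here are illustrative):

def id3(datasets, labels):
    classes = [row[-1] for row in datasets]
    # leaf: the node is pure, or no features are left to split on
    if len(set(classes)) == 1 or not labels:
        return max(set(classes), key=classes.count)
    ent = calc_ent(datasets)
    gains = [info_gain(ent, cond_ent(datasets, axis=c))
             for c in range(len(labels))]
    best = gains.index(max(gains))
    tree = {labels[best]: {}}
    sub_labels = labels[:best] + labels[best + 1:]
    for value in set(row[best] for row in datasets):
        # keep the matching rows, drop the split column, and recurse
        subset = [row[:best] + row[best + 1:]
                  for row in datasets if row[best] == value]
        tree[labels[best]][value] = id3(subset, sub_labels)
    return tree

id3(toy2, ['feature_0'])   # {'feature_0': {'A': 'yes', 'B': 'no'}}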
# Plot the results
plt.figure()
plt.scatter(X, y, s=20, edgecolor="black", c="darkorange", label="data")
plt.plot(X_test, y_1, color="cornflowerblue", label="max_depth=2", linewidth=2)
plt.plot(X_test, y_2, color="yellowgreen", label="max_depth=5", linewidth=2)
plt.xlabel("data")
plt.ylabel("target")
plt.title("Decision Tree Regression")
plt.legend()
plt.show()
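The plotting cell above expects X, y, X_test, y_1 and y_2 from earlier cells not shown in this excerpt. It matches scikit-learn's classic decision-tree regression example, so a plausible reconstruction (an assumption, not the notebook's exact cell) is:

import numpy as np
from sklearn.tree import DecisionTreeRegressor

# noisy sine curve as the training data
rng = np.random.RandomState(1)
X = np.sort(5 * rng.rand(80, 1), axis=0)
y = np.sin(X).ravel()
y[::5] += 3 * (0.5 - rng.rand(16))

# fit a shallow and a deeper regression tree
regr_1 = DecisionTreeRegressor(max_depth=2).fit(X, y)
regr_2 = DecisionTreeRegressor(max_depth=5).fit(X, y)

# predict on a dense grid so the step functions plot smoothly
X_test = np.arange(0.0, 5.0, 0.01)[:, np.newaxis]
y_1 = regr_1.predict(X_test)
y_2 = regr_2.predict(X_test)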
Decision tree hyperparameter tuning
# Import libraries
from sklearn.tree import DecisionTreeClassifier
from sklearn import datasets
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn import metrics
# Load the dataset
X = datasets.load_iris()  # returns a dict-like Bunch, with keys including data, target and target_names
data = X.data
target = X.target
name = X.target_names
x, y = datasets.load_iris(return_X_y=True)  # returns data and target directly in one call
print(x.shape, y.shape)
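With the data loaded, the tuning itself is a grid search over DecisionTreeClassifier hyperparameters using the GridSearchCV imported above. The split ratio and grid values below are illustrative assumptions, not the notebook's original settings:

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=42)

param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [2, 3, 4, 5, None],
    'min_samples_leaf': [1, 2, 4],
}
grid = GridSearchCV(DecisionTreeClassifier(random_state=42), param_grid, cv=5)
grid.fit(x_train, y_train)

print(grid.best_params_)
print('test accuracy:', metrics.accuracy_score(y_test, grid.predict(x_test)))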