LaLonde (1986) studied the National Supported Work Demonstration (NSW), a job-training program, and compared estimates of its effect on earnings obtained from a randomized experiment against estimates obtained from non-experimental data. The NSW program aimed to help disadvantaged U.S. workers move into long-term employment by providing vocational training. It targeted men and women who met certain criteria, such as receiving federal welfare (AFDC) or having previously been incarcerated.
The analysis proceeds in two steps. First, estimate the program's effect by comparing the treatment group and the control group within the NSW experiment. Although a worker's decision to enter the program is not random and may be correlated with personal characteristics, treatment among participants was randomly assigned, so confounding is not a concern and the experimental estimate can serve as the benchmark for the true causal effect.
Second, re-estimate the training effect by comparing the NSW treatment group with workers drawn from other (non-experimental) datasets: the Panel Study of Income Dynamics (PSID) and the Current Population Survey (CPS). Workers in these non-experimental samples differ substantially from the NSW control group, which illustrates why estimating causal effects from observational data is difficult.
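To make the setup concrete, here is a minimal sketch of how the male samples could be assembled and how the experimental benchmark is obtained as a simple difference in means. The file paths and the exact column list are assumptions for illustration only; the actual data preparation happens earlier in the analysis.

import pandas as pd

# Hypothetical file paths; the standard LaLonde male samples contain these columns.
COLS = ["treated", "age", "education", "black", "hispanic",
        "married", "nodegree", "re75", "re78"]

nsw_treated_male = pd.read_csv("data/nsw_treated_male.csv")[COLS]
nsw_control_male = pd.read_csv("data/nsw_control_male.csv")[COLS]
psid_male = pd.read_csv("data/psid_controls_male.csv")[COLS]   # non-experimental
cps_male = pd.read_csv("data/cps_controls_male.csv")[COLS]     # non-experimental

# Experimental benchmark: difference in mean 1978 earnings under randomization.
benchmark = nsw_treated_male["re78"].mean() - nsw_control_male["re78"].mean()
print(f"Experimental difference in means: {benchmark:.0f}")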
from sklearn.linear_model import LogisticRegressionCV, \
    LinearRegression, LogisticRegression, Lasso, LassoCV
from sklearn.ensemble import RandomForestClassifier, \
    RandomForestRegressor, GradientBoostingClassifier
import lightgbm as lgb
from sklearn.preprocessing import StandardScaler
from econml.sklearn_extensions.model_selection import GridSearchCVList


def first_stage_reg(X, y, *, automl=True):
    # Select the outcome (nuisance) model: either pick the best of several candidate
    # learners via cross-validated grid search, or fall back to a plain LassoCV.
    if automl:
        model = GridSearchCVList([LassoCV(),
                                  RandomForestRegressor(n_estimators=100),
                                  lgb.LGBMRegressor()],
                                 param_grid_list=[{},
                                                  {'max_depth': [5, 10, 20],
                                                   'min_samples_leaf': [5, 10]},
                                                  {'learning_rate': [0.02, 0.05, 0.08],
                                                   'max_depth': [3, 5]}],
                                 cv=3,
                                 scoring='neg_mean_squared_error')
        best_est = model.fit(X, y).best_estimator_
        if isinstance(best_est, LassoCV):
            return Lasso(alpha=best_est.alpha_)
        return best_est
    else:
        model = LassoCV(cv=5).fit(X, y)
        return Lasso(alpha=model.alpha_)


def first_stage_clf(X, y, *, make_regressor=False, automl=True):
    # Select the treatment (propensity) model analogously, scored by log loss.
    if automl:
        model = GridSearchCVList([LogisticRegressionCV(max_iter=1000),
                                  RandomForestClassifier(n_estimators=100),
                                  lgb.LGBMClassifier()],
                                 param_grid_list=[{},
                                                  {'max_depth': [5, 10, 20],
                                                   'min_samples_leaf': [5, 10]},
                                                  {'learning_rate': [0.01, 0.05, 0.1],
                                                   'max_depth': [3, 5]}],
                                 cv=3,
                                 scoring='neg_log_loss')
        est = model.fit(X, y).best_estimator_
        if isinstance(est, LogisticRegressionCV):
            # keep only the selected penalty as a plain LogisticRegression
            est = LogisticRegression(C=est.C_[0])
    else:
        model = LogisticRegressionCV(cv=5, max_iter=1000).fit(X, y)
        est = LogisticRegression(C=model.C_[0])
    if make_regressor:
        # _RegressionWrapper (defined elsewhere) exposes the classifier's predicted
        # probabilities through a regressor interface; not needed in this workflow.
        return _RegressionWrapper(est)
    else:
        return est
After selecting the best models and hyperparameters, we use econml to build the DML estimator.
import numpy as np
from econml.dml import LinearDML
from econml.dr import LinearDRLearner


def econml_homo_model_wrapper(reg_data, control_names, outcome_name,
                              model_type, *, cols_to_scale, print_summary=False):
    # get variables
    X = None  # no heterogeneous treatment: estimate a constant (homogeneous) effect
    W = reg_data[control_names].values
    # scale the continuous controls (the first `cols_to_scale` columns of W)
    scaler = StandardScaler()
    W = np.hstack([scaler.fit_transform(W[:, :cols_to_scale]).astype(np.float32),
                   W[:, cols_to_scale:]])
    T = reg_data["treated"]
    y = reg_data[outcome_name]
    # select the best nuisance models to plug into the econml estimator
    model_y = first_stage_reg(W, y)
    model_t = first_stage_clf(W, T)

    if model_type == 'dml':
        est = LinearDML(model_y=model_y, model_t=model_t,
                        discrete_treatment=True, mc_iters=5, cv=5)
    elif model_type == 'dr':
        est = LinearDRLearner(model_regression=model_y, model_propensity=model_t,
                              mc_iters=5, cv=5)
    else:
        raise ValueError('invalid model type %s' % model_type)
    try:
        est.fit(y, T, X=X, W=W, inference="statsmodels")
    except np.linalg.LinAlgError:
        # retry once: the cross-fitting folds are re-drawn, which can avoid a singular fit
        est.fit(y, T, X=X, W=W, inference="statsmodels")

    # Get the final coefficient and intercept summary
    if model_type == "dml":
        inf = est.intercept__inference()
    else:
        inf = est.intercept__inference(T=1)
    effect = int(round(inf.point_estimate))
    se = int(round(inf.stderr))
    lb, ub = inf.conf_int(alpha=0.05)
    if print_summary:
        if model_type == 'dml':
            print(est.summary(alpha=0.05))
        else:
            print(est.summary(T=1, alpha=0.05))
    return effect, se, int(round(lb)), int(round(ub))
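A minimal usage sketch of this wrapper, reusing the illustrative DataFrames from the loading sketch above; the control list and the cols_to_scale value are assumptions (the first three controls are treated as the continuous ones to standardize):

# Hypothetical call: stack the NSW treated rows with one comparison group and
# estimate the homogeneous effect of training on 1978 earnings with DML.
controls = ["age", "education", "re75", "black", "hispanic", "married", "nodegree"]
df_all = pd.concat([nsw_treated_male, psid_male], axis=0, ignore_index=True)
effect, se, lb, ub = econml_homo_model_wrapper(
    df_all, controls, "re78", "dml", cols_to_scale=3, print_summary=True)
print(f"DML ATE estimate: {effect} (se={se}, 95% CI=[{lb}, {ub}])")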
Finally, define the function get_summ_table to collect and report the results of all the regressions.
from tqdm import tqdm


def get_summ_table(dfs, treat_df, *, df_names, basic_ols_controls,
                   complete_ols_controls, econml_controls, outcome_name, cols_to_scale):
    summ_dic = {"control_name": [], "# of obs": [], "earning_growth": [], "OLS": [],
                "OLS full controls": [], "DML full controls": []}
    summ_dic1 = {"control_name": [], "method": [], "point_estimate": [], "stderr": [],
                 "lower_bound": [], "upper_bound": []}
    for df, name in tqdm(zip(dfs, df_names)):
        summ_dic["control_name"].append(name)
        summ_dic["# of obs"].append(df.shape[0])
        summ_dic1["control_name"] += [name] * 3
        # get table 5 col 1
        growth = int(np.round((df[outcome_name] - df["re75"]).mean(), 0))
        summ_dic["earning_growth"].append(growth)
        # get table 5 col 5
        summ_dic1["method"].append("OLS")
        df_all = pd.concat([treat_df, df], axis=0, ignore_index=True)
        effect, se, lb, ub = ols_reg_wrapper(df_all, basic_ols_controls, outcome_name,
                                             print_summary=False)
        summ_dic["OLS"].append([effect, se])
        summ_dic1["point_estimate"].append(effect)
        summ_dic1["stderr"].append(se)
        summ_dic1["lower_bound"].append(lb)
        summ_dic1["upper_bound"].append(ub)
        # get table 5 col 10
        summ_dic1["method"].append("OLS full controls")
        effect, se, lb, ub = ols_reg_wrapper(df_all, complete_ols_controls, outcome_name,
                                             print_summary=False)
        summ_dic["OLS full controls"].append([effect, se])
        summ_dic1["point_estimate"].append(effect)
        summ_dic1["stderr"].append(se)
        summ_dic1["lower_bound"].append(lb)
        summ_dic1["upper_bound"].append(ub)
        # dml
        summ_dic1["method"].append("DML full controls")
        effect, se, lb, ub = econml_homo_model_wrapper(df_all, econml_controls,
                                                       outcome_name, "dml",
                                                       cols_to_scale=cols_to_scale,
                                                       print_summary=False)
        summ_dic["DML full controls"].append([effect, se])
        summ_dic1["point_estimate"].append(effect)
        summ_dic1["stderr"].append(se)
        summ_dic1["lower_bound"].append(lb)
        summ_dic1["upper_bound"].append(ub)
    return summ_dic, summ_dic1
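A hedged sketch of how this function might be invoked for the male samples; basic_controls, full_controls, df_names_male, and the DataFrames below are placeholders for objects constructed earlier in the analysis.

# Hypothetical invocation producing the two summary dictionaries used below.
summ_male, summplot_male = get_summ_table(
    [nsw_control_male, psid_male, cps_male],   # comparison groups
    nsw_treated_male,                          # NSW treated sample
    df_names=df_names_male,                    # e.g. ["NSW control", "PSID", "CPS"]
    basic_ols_controls=basic_controls,         # short control list for the simple OLS
    complete_ols_controls=full_controls,       # full control list for OLS
    econml_controls=full_controls,             # same confounders for DML
    outcome_name="re78",
    cols_to_scale=3)                           # first three controls are continuous
summ_male_df = pd.DataFrame(summ_male)         # LaLonde-style summary table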
import matplotlib.pyplot as plt
from matplotlib.transforms import ScaledTranslation
%matplotlib inline


def plot_errorbar(df, df_names):
    fig, ax = plt.subplots(figsize=(10, 6))
    for ind, control_name in enumerate(df_names):
        sub_df = df[df["control_name"] == control_name]
        method_name = sub_df["method"].values
        point = sub_df["point_estimate"].values
        yerr = np.zeros((2, point.shape[0]))
        yerr[0, :] = point - sub_df["lower_bound"].values
        yerr[1, :] = sub_df["upper_bound"].values - point
        # shift each dataset's markers horizontally so the error bars do not overlap
        trans = ax.transData + ScaledTranslation((-10 + ind * 5) / 72, 0, fig.dpi_scale_trans)
        plt.errorbar(method_name, point, yerr, fmt="o", capsize=5,
                     elinewidth=2, label=control_name, alpha=0.7, transform=trans)
    plt.axhline(y=0, color='black', linestyle='--', alpha=0.5)
    plt.legend()
    plt.xlabel("Methodology")
    plt.ylabel("ATE with CI")
    plt.title("Error bar of each method for each dataset")
    plt.show()


summplot_male_df = pd.DataFrame(summplot_male)
plot_errorbar(summplot_male_df, df_names_male)