# Upgrade pip and core build tooling, then install AutoGluon.
python3 -m pip install -U pip
python3 -m pip install -U setuptools wheel
python3 -m pip install autogluon

# CPU version
python3 -m pip install -U "mxnet<2.0.0"

# GPU version
# Here we assume CUDA 10.1 is installed. You should change the number
# according to your own CUDA version (e.g. mxnet_cu100 for CUDA 10.0).
python3 -m pip install -U "mxnet_cu101<2.0.0"
# Load the training data.
import pandas
import numpy
from autogluon.tabular import TabularDataset, TabularPredictor

train_data = TabularDataset('https://autogluon.s3.amazonaws.com/datasets/Inc/train.csv')

# Subsample a subset of the data for a faster demo; try setting this to much larger values.
subsample_size = 500
train_data = train_data.sample(n=subsample_size, random_state=0)
train_data.head()
# Load the test set.
# NOTE(review): `label` and `save_path` are defined in earlier notebook cells — confirm they are in scope.
test_data = TabularDataset('https://autogluon.s3.amazonaws.com/datasets/Inc/test.csv')
y_test = test_data[label]  # values to predict
test_data_nolab = test_data.drop(columns=[label])  # delete label column to prove we're not cheating
test_data_nolab.head()

# Unnecessary, just demonstrates how to load previously-trained predictor from file.
predictor = TabularPredictor.load(save_path)
# Load the training set.
from autogluon.tabular import TabularDataset, TabularPredictor
import numpy as np

train_data = TabularDataset('https://autogluon.s3.amazonaws.com/datasets/Inc/train.csv')

# Subsample a subset of the data for a faster demo; try setting this to much larger values.
subsample_size = 500
train_data = train_data.sample(n=subsample_size, random_state=0)
print(train_data.head())
# Load the validation and test sets.
# NOTE(review): `label` is defined in an earlier notebook cell — confirm it is in scope.
new_data = TabularDataset('https://autogluon.s3.amazonaws.com/datasets/Inc/test.csv')
test_data = new_data[5000:].copy()  # this should be separate data in your applications
y_test = test_data[label]
test_data_nolabel = test_data.drop(columns=[label])  # delete label column
val_data = new_data[:5000].copy()
# Define the evaluation metric; 'accuracy' is the default.
metric = 'accuracy'  # we specify eval-metric just for demo (unnecessary as it's the default)
# The objects below are dicts of various information (not printed here as they are quite large).
model_info = specific_model.get_info()
predictor_information = predictor.info()
# For quick demonstration only; set this to the longest time (in seconds) you are willing to wait.
time_limit = 60
metric = 'roc_auc'  # specify your evaluation metric here

predictor = TabularPredictor(label, eval_metric=metric).fit(
    train_data, time_limit=time_limit, presets='best_quality'
)
predictor.leaderboard(test_data, silent=True)
Define the hyperparameter search space
import autogluon.core as ag
# Specifies non-default hyperparameter values for neural network models.
nn_options = {
    # Number of training epochs (controls training time of NN models).
    'num_epochs': 10,
    # Learning rate used in training (real-valued hyperparameter searched on log-scale).
    'learning_rate': ag.space.Real(1e-4, 1e-2, default=5e-4, log=True),
    # Activation function used in NN (categorical hyperparameter, default = first entry).
    'activation': ag.space.Categorical('relu', 'softrelu', 'tanh'),
    # Each categorical choice is a list of sizes for each NN layer to use.
    'layers': ag.space.Categorical([100], [1000], [200, 100], [300, 200, 100]),
    # Dropout probability (real-valued hyperparameter).
    'dropout_prob': ag.space.Real(0.0, 0.5, default=0.1),
}
# Specifies non-default hyperparameter values for LightGBM gradient boosted trees.
gbm_options = {
    # Number of boosting rounds (controls training time of GBM models).
    'num_boost_round': 100,
    # Number of leaves in trees (integer hyperparameter).
    'num_leaves': ag.space.Int(lower=26, upper=66, default=36),
}
# Hyperparameters of each model type. When a key is missing from this dict,
# no models of that type are trained.
hyperparameters = {
    'GBM': gbm_options,
    'NN': nn_options,  # NOTE: comment this line out if you get errors on Mac OSX
}
time_limit = 2 * 60  # train various models for ~2 min
num_trials = 5  # try at most 5 different hyperparameter configurations for each type of model
search_strategy = 'auto'  # tune hyperparameters using Bayesian optimization routine with a local scheduler
# HPO is not performed unless hyperparameter_tune_kwargs is specified.
hyperparameter_tune_kwargs = {
    'num_trials': num_trials,
    'scheduler': 'local',
    'searcher': search_strategy,
}
# Train extra weighted ensembles along the accuracy/inference-speed Pareto frontier.
additional_ensembles = predictor.fit_weighted_ensemble(expand_pareto_frontier=True)
print("Alternative ensembles you can use for prediction:", additional_ensembles)
predictor.leaderboard(only_pareto_frontier=True, silent=True)
model_for_prediction = additional_ensembles[0]
predictions = predictor.predict(test_data, model=model_for_prediction)
# Delete these extra models so they don't affect the rest of the tutorial.
predictor.delete_models(models_to_delete=additional_ensembles, dry_run=False)
Consolidate the cross-validated (bagged) models into a single model
# Refit each bagged ensemble on the full training data.
refit_model_map = predictor.refit_full()
print("Name of each refit-full model corresponding to a previous bagged ensemble:")
print(refit_model_map)
predictor.leaderboard(test_data, silent=True)
Per the docs, doing this can greatly reduce memory/latency requirements (but may also reduce accuracy): the refit consolidates the cross-validation models into one trained on all of the data, so there is no held-out data left for model validation afterwards (the original validation set was a small slice carved out of the training data).
import pandas as pd import numpy as np from autogluon.tabular import TabularPredictor
# Directory where you have downloaded the data CSV files from the competition.
directory = '~/github/ieee-fraud-detection/'
label = 'isFraud'  # name of target variable to predict in this competition
eval_metric = 'roc_auc'  # Optional: specify that competition evaluation metric is AUC
save_path = directory + 'AutoGluonModels/'  # where to store trained models
# Apply the same join to the test files as was applied to the training files.
test_identity = pd.read_csv(directory + 'test_identity.csv')
test_transaction = pd.read_csv(directory + 'test_transaction.csv')
test_data = pd.merge(test_transaction, test_identity, on='TransactionID', how='left')
# NOTE(review): `a` is computed but never used below — presumably intended for renaming columns; confirm.
a = [i.replace("-", "_") for i in test_data.columns]

# Subsample a subset of the data for a faster demo; try setting this to much larger values.
subsample_size = 500
test_data = test_data.sample(n=subsample_size, random_state=0)
# Predicted class probabilities for each row of the test set.
y_predproba = predictor.predict_proba(test_data)
y_predproba.head(5)  # some example predicted fraud-probabilities
What is the predicted positive class?
# The class treated as "positive" for binary classification metrics like roc_auc.
predictor.positive_class
What are the predicted class labels?
# All class labels, in the column order of predict_proba() output.
predictor.class_labels # classes in this list correspond to columns of predict_proba() output