Module src.tune
Tune the model parameters.
"""Tune the model parameters."""
import json
from pathlib import Path
import ray.air as air
import yaml
from ray import tune
from ray.tune.integration.xgboost import TuneReportCallback
from ray.tune.schedulers import ASHAScheduler
from ray.tune.search.optuna import OptunaSearch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from src.utils import get_model_path, load_config, load_data


def get_tune_param_space(tune_config: dict):
    """Return the parameter space to tune.

    Parameters
    ----------
    tune_config : dict
        The configuration dict for the tune stage.
    """
    hyperparams = tune_config["hyperparams"]
    return {
        # Fixed settings taken from the stage config.
        "objective": hyperparams["objective"],
        "tree_method": hyperparams["tree_method"],
        "early_stopping_rounds": hyperparams["early_stopping_rounds"],
        "eval_metric": hyperparams["eval_metric"],
        # Search spaces sampled by Ray Tune for each trial.
        "n_estimators": tune.randint(200, 600),
        "gamma": tune.randint(1, 5),
        "max_depth": tune.randint(2, 9),
        "min_child_weight": tune.randint(1, 5),
        "subsample": tune.uniform(0.5, 1.0),
        "eta": tune.loguniform(1e-4, 1e-1),
        "colsample_bytree": tune.uniform(0.5, 1.0),
    }


def get_objective(X_train, y_train):
    """Return the objective function to be optimized."""

    def objective(config):
        """Objective to be optimized.

        Uses a simple 0.8/0.2 train/validation split and logs the
        validation logloss via the `TuneReportCallback`.

        Parameters
        ----------
        config : dict
            The config object.
        """
        X_train_sub, X_val, y_train_sub, y_val = train_test_split(
            X_train, y_train, stratify=y_train, shuffle=True, test_size=0.2
        )
        trc = TuneReportCallback({"loss": "validation_0-mlogloss"})
        XGBClassifier(**config, callbacks=[trc]).fit(
            X_train_sub, y_train_sub, eval_set=[(X_val, y_val)], verbose=False
        )

    return objective


def run_tune(config: dict):
    """Run the hyperparameter search and train the final model."""
    with open("params.yaml") as f:
        prepare_config = yaml.safe_load(f)["prepare"]
    wines = load_data(raw=False, config=prepare_config)
    X, y = wines.drop("quality", axis=1), wines["quality"]
    X = X.astype("float32")
    y = y.astype("long")
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=0
    )
    # Apply standard scaling, fitted on the training split only.
    scaler = StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)
    objective = get_objective(X_train, y_train)
    param_space = get_tune_param_space(config)
    search_alg = OptunaSearch()
    scheduler = ASHAScheduler(
        grace_period=config["scheduler"]["grace_period"],
        reduction_factor=config["scheduler"]["reduction_factor"],
    )
    tuner = tune.Tuner(
        objective,
        param_space=param_space,
        tune_config=tune.TuneConfig(
            num_samples=config["tune_config"]["num_samples"],
            metric=config["tune_config"]["metric"],
            mode=config["tune_config"]["mode"],
            scheduler=scheduler,
            search_alg=search_alg,
        ),
        run_config=air.RunConfig(local_dir="logs/", name="tune-xgboost"),
    )
    results = tuner.fit()
    best_result = results.get_best_result("loss", mode="min")
    best_params = best_result.config
    # Create the output directory before writing any artifacts into it.
    eval_dir = Path("eval")
    eval_dir.mkdir(exist_ok=True)
    best_result.metrics_dataframe[["training_iteration", "loss"]].to_csv(
        "eval/losses.csv", index=False
    )
    with open("eval/params.json", "w") as f:
        json.dump(best_params, f, indent=4)
    # Refit on the full training split with the best hyperparameters.
    clf = XGBClassifier(**best_params).fit(
        X_train, y_train, eval_set=[(X_test, y_test)], verbose=50
    )
    models_dir = Path("models")
    models_dir.mkdir(exist_ok=True)
    clf.save_model(get_model_path(config))
    acc = clf.score(X_test, y_test)
    with open("eval/metrics.json", "w") as f:
        json.dump({"accuracy": acc}, f, indent=4)


if __name__ == "__main__":
    config = load_config("tune")
    run_tune(config)
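
For orientation, `run_tune` expects the stage config to expose the keys read above. A minimal sketch of such a dict, where every concrete value is an illustrative assumption (the real values live in params.yaml):

# Hypothetical "tune" stage config; the keys mirror the lookups in the
# code above, the values are assumptions for illustration only.
tune_stage_config = {
    "hyperparams": {
        "objective": "multi:softprob",
        "tree_method": "hist",
        "early_stopping_rounds": 10,
        "eval_metric": "mlogloss",
    },
    "scheduler": {"grace_period": 1, "reduction_factor": 2},
    "tune_config": {"num_samples": 50, "metric": "loss", "mode": "min"},
}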
Functions
def get_objective(X_train, y_train)
Return the objective function to be optimized.
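
The factory closes over the training data, so Ray Tune only has to pass each sampled hyperparameter dict into the returned `objective`. A minimal sketch of the wiring, assuming `X_train`, `y_train`, and a stage config are already in scope:

# Sketch only: the closure captures the data; Tune supplies just the
# hyperparameters sampled from the search space.
objective = get_objective(X_train, y_train)
tuner = tune.Tuner(objective, param_space=get_tune_param_space(config))
results = tuner.fit()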

def get_tune_param_space(tune_config: dict)
Return the parameter space to tune.
Parameters
----------
tune_config : dict
    The configuration dict for the tune stage.
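
The returned space mixes fixed values read from the config with Ray Tune search spaces that are sampled per trial. For example (the `hyperparams` values here are illustrative assumptions):

space = get_tune_param_space(
    {
        "hyperparams": {
            "objective": "multi:softprob",
            "tree_method": "hist",
            "early_stopping_rounds": 10,
            "eval_metric": "mlogloss",
        }
    }
)
space["eval_metric"]  # fixed: taken verbatim from the config
space["max_depth"]    # sampled: tune.randint(2, 9) draws a value per trial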

def run_tune(config: dict)
Run the hyperparameter search and train the final model.
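
The module is normally executed as a script; its `__main__` block is equivalent to the following (assuming the package is importable from the project root):

from src.tune import run_tune
from src.utils import load_config

# Loads the "tune" stage config, runs the search, and writes the
# eval/ artifacts and the model under models/.
run_tune(load_config("tune"))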