From 761079517c2fcc1ada81d0e826cf928164621e73 Mon Sep 17 00:00:00 2001 From: Mike Tokic Date: Sun, 30 Jun 2024 19:15:23 -0700 Subject: [PATCH 1/2] ensemble model updates --- DESCRIPTION | 2 +- NEWS.md | 3 ++- R/ensemble_models.R | 9 +++++++-- R/models.R | 22 ++++++++++++++++++++-- R/parallel_util.R | 2 +- R/prep_models.R | 14 ++++++++++---- vignettes/models-used-in-finnts.Rmd | 6 +++--- 7 files changed, 44 insertions(+), 14 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 0f210549..67632286 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -24,7 +24,7 @@ License: MIT + file LICENSE Encoding: UTF-8 LazyData: true Roxygen: list(markdown = TRUE) -RoxygenNote: 7.1.1 +RoxygenNote: 7.3.1 Imports: cli, Cubist, diff --git a/NEWS.md b/NEWS.md index 444a0ad6..6a8c1699 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,10 +1,11 @@ -# finnts 0.4.0.9004 (DEVELOPMENT VERSION) +# finnts 0.4.0.9005 (DEVELOPMENT VERSION) ## Improvements - Added support for hierarchical forecasting with external regressors - Allow global models for hierarchical forecasts - Multistep horizon forecasts for R1 recipe, listed as `multistep_horizon` within `prep_data()` +- Updated model ensemble (stacking) process ## Bug Fixes diff --git a/R/ensemble_models.R b/R/ensemble_models.R index df4eba6b..0b777e41 100644 --- a/R/ensemble_models.R +++ b/R/ensemble_models.R @@ -81,6 +81,7 @@ ensemble_models <- function(run_info, run_local_models <- log_df$run_local_models models_to_run <- log_df$models_to_run models_not_to_run <- log_df$models_not_to_run + forecast_horizon <- as.numeric(log_df$forecast_horizon) if (log_df$run_ensemble_models == FALSE) { cli::cli_alert_info("Ensemble models have been turned off.") @@ -234,7 +235,11 @@ ensemble_models <- function(run_info, avail_arg_list <- list( "train_data" = prep_ensemble_tbl %>% dplyr::select(-Train_Test_ID), "model_type" = "ensemble", - "pca" = FALSE + "pca" = FALSE, # not used in ensemble + "multistep" = FALSE, # not used in ensemble + "horizon" = NULL, # not used in ensemble + "frequency" = NULL, # not used in ensemble + "external_regressors" = NULL # not used in ensemble ) # get specific model spec @@ -357,7 +362,7 @@ ensemble_models <- function(run_info, tune_results <- tune::tune_grid( object = workflow, - resamples = create_splits(prep_ensemble_tbl, model_train_test_tbl %>% dplyr::filter(Run_Type == "Validation")), + resamples = create_splits(prep_ensemble_tbl %>% dplyr::select(-Combo, -Train_Test_ID), model_train_test_tbl %>% dplyr::filter(Run_Type == "Validation")), grid = hyperparameters %>% dplyr::select(-Hyperparameter_Combo), control = tune::control_grid( allow_par = inner_parallel, diff --git a/R/models.R b/R/models.R index 8cc9e420..88d0bfb4 100644 --- a/R/models.R +++ b/R/models.R @@ -34,7 +34,7 @@ list_hyperparmater_models <- function() { #' @noRd list_ensemble_models <- function() { list <- c( - "cubist", "glmnet", "svm-poly", "svm-rbf", "xgboost" + "glmnet", "xgboost" ) return(list) @@ -692,6 +692,7 @@ ets <- function(train_data, #' @param horizon horizon #' @param external_regressors external regressors #' @param frequency frequency +#' @param model_type single or ensemble #' #' @return Get the GLM Net model #' @noRd @@ -700,7 +701,8 @@ glmnet <- function(train_data, multistep, horizon, external_regressors, - frequency) { + frequency, + model_type = "single") { # create model recipe and spec if (multistep) { @@ -722,6 +724,22 @@ glmnet <- function(train_data, lag_periods = get_lag_periods(NULL, get_date_type(frequency), horizon, TRUE) ) %>% parsnip::set_engine("glmnet_multistep_horizon") + } else if (model_type == "ensemble") { + recipe_spec_glmnet <- train_data %>% + get_recipe_configurable( + rm_date = "with_adj", + step_nzv = "zv", + one_hot = FALSE, + center_scale = FALSE, + pca = pca + ) + + model_spec_glmnet <- parsnip::linear_reg( + mode = "regression", + penalty = tune::tune(), + mixture = tune::tune() + ) %>% + parsnip::set_engine("glmnet", lower.limits = 0) } else { recipe_spec_glmnet <- train_data %>% get_recipe_configurable( diff --git a/R/parallel_util.R b/R/parallel_util.R index 751ccad1..3746d971 100644 --- a/R/parallel_util.R +++ b/R/parallel_util.R @@ -34,7 +34,7 @@ par_start <- function(run_info, ) parallel_packages <- c( - "gtools", "hts", "magrittr", "methods", "base", "modeltime.resample", + "gtools", "hts", "magrittr", "methods", "base", "plyr", "rsample" ) diff --git a/R/prep_models.R b/R/prep_models.R index 7b9dede0..3161047b 100644 --- a/R/prep_models.R +++ b/R/prep_models.R @@ -224,7 +224,7 @@ train_test_split <- function(run_info, if (sum(model_workflow_list %in% ensemble_model_list) == 0 & run_ensemble_models) { run_ensemble_models <- FALSE - cli::cli_alert_info("Turning ensemble models off since no multivariate models were chosen to run.") + cli::cli_alert_info("Turning ensemble models off since no ensemble models were chosen to run.") cli::cli_progress_update() } @@ -331,12 +331,14 @@ train_test_split <- function(run_info, test_tbl <- temp_tbl %>% dplyr::filter(Date <= min(back_test_date$Train_End)) - } else { + } else if (as.numeric(id) > back_test_scenarios_final & as.numeric(id) < max(back_test_scenarios_final + (forecast_horizon / back_test_spacing_final) + 1, back_test_scenarios_final * 2)){ run_type <- "Ensemble" test_tbl <- temp_tbl %>% dplyr::filter(.key == "testing") %>% dplyr::select(Date) + } else { + next } train_test_tbl <- tibble::tibble( @@ -352,9 +354,13 @@ train_test_split <- function(run_info, # check for back test and validation data if (!("Validation" %in% unique(train_test_final$Run_Type))) { - stop("No validation data produced. Add more historical data, shorten the forecast horizon, or shorten the number of back test scenarios") + stop("No validation data produced. Add more historical data, shorten the forecast horizon, or shorten the number of back test scenarios.") } else if (!("Back_Test" %in% unique(train_test_final$Run_Type))) { - stop("No back testing data produced. Shorten the forecast horizon, or shorten the number of back test scenarios or back test spacing") + stop("No back testing data produced. Shorten the forecast horizon, or shorten the number of back test scenarios or back test spacing.") + } else if(!("Ensemble" %in% unique(train_test_final$Run_Type)) & run_ensemble_models) { + run_ensemble_models <- FALSE + cli::cli_alert_info("Turning ensemble models off since no ensemble train/test splits could be created. To fix this either add more historical data, shorten the forecast horizon, or shorten the number of back test scenarios or back test spacing.") + cli::cli_progress_update() } # adjust based on models planned to run diff --git a/vignettes/models-used-in-finnts.Rmd b/vignettes/models-used-in-finnts.Rmd index 0e5eae27..9f5d0f9d 100644 --- a/vignettes/models-used-in-finnts.Rmd +++ b/vignettes/models-used-in-finnts.Rmd @@ -23,7 +23,7 @@ reactable::reactable( rbind(data.frame(Model = "arima", Type = "univariate, local", Underlying.Package = "modeltime, forecast", Description = "Regression model that is based on finding relationships between lagged values of the target variable you are trying to forecast.")) %>% rbind(data.frame(Model = "arima-boost", Type = "multivariate, local", Underlying.Package = "modeltime, forecast, xgboost", Description = "Arima model (refer to arima) that models the trend compoent of target variable, then uses xgboost model (refer to xgboost) to train on the remaining residuals.")) %>% rbind(data.frame(Model = "arimax", Type = "multivariate, local", Underlying.Package = "modeltime, forecast", Description = "ARIMA model that incorporates external regressors and other engineered features.")) %>% - rbind(data.frame(Model = "cubist", Type = "multivariate, local, global, ensemble", Underlying.Package = "rules", Description = "Hybrid of tree based and linear regression approach. Many decision trees are built, but regression coefficients are used at each terminal node instead of averging values in other tree based approaches.")) %>% + rbind(data.frame(Model = "cubist", Type = "multivariate, local, global", Underlying.Package = "rules", Description = "Hybrid of tree based and linear regression approach. Many decision trees are built, but regression coefficients are used at each terminal node instead of averging values in other tree based approaches.")) %>% rbind(data.frame(Model = "croston", Type = "univariate, local", Underlying.Package = "modeltime, forecast", Description = "Useful for intermittent demand forecasting, aka when there are a lot of periods of zero values. Involves simple exponential smoothing on non-zero values of target variable and another application of seasonal exponential smoothing on periods between non-zero elements of the target variable. Refer to ets for more details on exponential smoothing.")) %>% rbind(data.frame(Model = "ets", Type = "univariate, local", Underlying.Package = "modeltime, forecast", Description = "Forecasts produced using exponential smoothing methods are weighted averages of past observations, with the weights decaying exponentially as the observations get older. Exponential smoothing models try to forecast the components of a time series which can be broken down in to error, trend, and seasonality. These components can be forecasted separately then either added or multiplied together to get the final forecast output.")) %>% rbind(data.frame(Model = "glmnet", Type = "multivariate, local, global, ensemble", Underlying.Package = "parsnip, glmnet", Description = "Linear regression (line of best fit) with regularization to help prevent overfitting and built in variable selection.")) %>% @@ -37,8 +37,8 @@ reactable::reactable( rbind(data.frame(Model = "snaive", Type = "univariate, local", Underlying.Package = "modeltime, forecast", Description = "Simple model that takes the value from the same period in the previous year.")) %>% rbind(data.frame(Model = "stlm-arima", Type = "univariate, local", Underlying.Package = "modeltime, forecast", Description = "Applies an STL decomposition (breaks out target variable into seasonal, trend, and error/residual/remainder components), models the seasonally adjusted data, reseasonalizes, and returns the forecasts. An arima model (refer to arima) is used in forecasting the seasonaly adjusted data.")) %>% rbind(data.frame(Model = "stlm-ets", Type = "univariate, local", Underlying.Package = "modeltime, forecast", Description = "Applies an STL decomposition (breaks out target variable into seasonal, trend, and error/residual/remainder components), models the seasonally adjusted data, reseasonalizes, and returns the forecasts. An ets model (refer to ets) is used in forecasting the seasonaly adjusted data.")) %>% - rbind(data.frame(Model = "svm-poly", Type = "multivariate, local, global, ensemble", Underlying.Package = "parsnip, kernlab", Description = "Uses a nonlinear function, specifically a polynomial function, to create a regression line of the target variable.")) %>% - rbind(data.frame(Model = "svm-rbf", Type = "multivariate, local, global, ensemble", Underlying.Package = "parsnip, kernlab", Description = "Uses a nonlinear function, specifically a radial basis function, to create a regression line of the target variable.")) %>% + rbind(data.frame(Model = "svm-poly", Type = "multivariate, local, global", Underlying.Package = "parsnip, kernlab", Description = "Uses a nonlinear function, specifically a polynomial function, to create a regression line of the target variable.")) %>% + rbind(data.frame(Model = "svm-rbf", Type = "multivariate, local, global", Underlying.Package = "parsnip, kernlab", Description = "Uses a nonlinear function, specifically a radial basis function, to create a regression line of the target variable.")) %>% rbind(data.frame(Model = "tbats", Type = "univariate, local", Underlying.Package = "modeltime, forecast", Description = "A spin off of the traditional ets model (refer to ets), with some additional components to capture multiple seasonalities.")) %>% rbind(data.frame(Model = "theta", Type = "univariate, local", Underlying.Package = "modeltime, forecast", Description = "Theta is similar to exponential smoothing (refer to ets) but with another component called drift. Adding drift to exponential smoothing allows the forecast to increase or decrease over time, where the amount of change over time (called the drift) is set to be the average change seen within the historical data.")) %>% rbind(data.frame(Model = "xgboost", Type = "multivariate, local, global, ensemble", Underlying.Package = "parsnip, xgboost", Description = "Builds many decision trees (similar to random forests), but predictions that are initially inaccurate are applied more weight in subsequent training rounds to increase accuracy across all predictions.")) From 8e6194a348845f9d05291f14ef19435d2476f3ef Mon Sep 17 00:00:00 2001 From: Mike Tokic Date: Mon, 1 Jul 2024 20:05:38 -0700 Subject: [PATCH 2/2] version update --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 67632286..9354060e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: finnts Title: Microsoft Finance Time Series Forecasting Framework -Version: 0.4.0.9004 +Version: 0.4.0.9005 Authors@R: c(person(given = "Mike", family = "Tokic",