From 323c06391123e1f3a7c2d1725653ade57f3723ec Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Fri, 25 Apr 2025 14:06:07 -0700 Subject: [PATCH 1/4] use original data type in identifier column --- runtime/databricks/automl_runtime/forecast/deepar/utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/runtime/databricks/automl_runtime/forecast/deepar/utils.py b/runtime/databricks/automl_runtime/forecast/deepar/utils.py index 016de93..62afc18 100644 --- a/runtime/databricks/automl_runtime/forecast/deepar/utils.py +++ b/runtime/databricks/automl_runtime/forecast/deepar/utils.py @@ -98,9 +98,11 @@ def set_index_and_fill_missing_time_steps(df: pd.DataFrame, time_col: str, df_dict = {} for grouped_id, grouped_df in df.groupby(id_cols): if isinstance(grouped_id, tuple): + # TODO (ML-52171): Fix the DeepAR library to support multi-time series id columns + # For now, we convert and concatenate the id_cols to a string ts_id = "-".join([str(x) for x in grouped_id]) else: - ts_id = str(grouped_id) + ts_id = grouped_id df_dict[ts_id] = (grouped_df.set_index(time_col).sort_index() .reindex(valid_index).drop(id_cols, axis=1)) From 5b2279612d61c0c81607c8b70733d5c749294e2f Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Sun, 27 Apr 2025 14:25:17 -0700 Subject: [PATCH 2/4] Fix DeepAR multi time series id column issue --- .../automl_runtime/forecast/deepar/model.py | 7 +++- .../automl_runtime/forecast/deepar/utils.py | 40 +++++++++---------- 2 files changed, 26 insertions(+), 21 deletions(-) diff --git a/runtime/databricks/automl_runtime/forecast/deepar/model.py b/runtime/databricks/automl_runtime/forecast/deepar/model.py index 137c37a..c2454fc 100644 --- a/runtime/databricks/automl_runtime/forecast/deepar/model.py +++ b/runtime/databricks/automl_runtime/forecast/deepar/model.py @@ -100,7 +100,7 @@ def predict(self, pred_df = pred_df.rename(columns={'index': self._time_col}) if self._id_cols: - id_col_name = '-'.join(self._id_cols) + id_col_name = self._id_cols[0] pred_df = pred_df.rename(columns={'item_id': id_col_name}) else: pred_df = pred_df.drop(columns='item_id') @@ -121,6 +121,7 @@ def predict_samples(self, if num_samples is None: num_samples = self._num_samples + print("Debug:enter predict_samples") # Group by the time column in case there are multiple rows for each time column, # for example, the user didn't provide all the identity columns for a multi-series dataset group_cols = [self._time_col] @@ -134,11 +135,15 @@ def predict_samples(self, self._frequency_quantity, self._id_cols) + print(f"Debug model_input_transformed keys type: {type(model_input_transformed)}") test_ds = PandasDataset(model_input_transformed, target=self._target_col) forecast_iter = self._model.predict(test_ds, num_samples=num_samples) forecast_sample_list = list(forecast_iter) + for forecast in forecast_sample_list: + print(f"Debug forecast.item_id type: {type(forecast.item_id)}") + return forecast_sample_list diff --git a/runtime/databricks/automl_runtime/forecast/deepar/utils.py b/runtime/databricks/automl_runtime/forecast/deepar/utils.py index 62afc18..9eeda46 100644 --- a/runtime/databricks/automl_runtime/forecast/deepar/utils.py +++ b/runtime/databricks/automl_runtime/forecast/deepar/utils.py @@ -13,11 +13,10 @@ # See the License for the specific language governing permissions and # limitations under the License. # -from typing import List, Optional +from typing import List, Optional, Union, Dict import pandas as pd - def validate_and_generate_index(df: pd.DataFrame, time_col: str, frequency_unit: str, @@ -66,10 +65,12 @@ def validate_and_generate_index(df: pd.DataFrame, return new_index_full -def set_index_and_fill_missing_time_steps(df: pd.DataFrame, time_col: str, - frequency_unit: str, - frequency_quantity: int, - id_cols: Optional[List[str]] = None): +def set_index_and_fill_missing_time_steps( + df: pd.DataFrame, time_col: str, + frequency_unit: str, + frequency_quantity: int, + id_cols: Optional[List[str]] = None +) -> Union[pd.DataFrame, Dict[any, pd.DataFrame]]: """ Transform the input dataframe to an acceptable format for the GluonTS library. @@ -85,7 +86,7 @@ def set_index_and_fill_missing_time_steps(df: pd.DataFrame, time_col: str, multi-series - dictionary of transformed dataframes, each key is the (concatenated) id of the time series """ total_min, total_max = df[time_col].min(), df[time_col].max() - + print("Debug:linyuan") # We need to adjust the frequency_unit for pd.date_range if it is weekly, # otherwise it would always be "W-SUN" if frequency_unit.upper() == "W": @@ -95,26 +96,25 @@ def set_index_and_fill_missing_time_steps(df: pd.DataFrame, time_col: str, valid_index = validate_and_generate_index(df=df, time_col=time_col, frequency_unit=frequency_unit, frequency_quantity=frequency_quantity) if id_cols is not None: + if len(id_cols) > 1: + raise ValueError("DeepAR does not support multiple time series id columns") df_dict = {} for grouped_id, grouped_df in df.groupby(id_cols): if isinstance(grouped_id, tuple): # TODO (ML-52171): Fix the DeepAR library to support multi-time series id columns - # For now, we convert and concatenate the id_cols to a string - ts_id = "-".join([str(x) for x in grouped_id]) - else: - ts_id = grouped_id - df_dict[ts_id] = (grouped_df.set_index(time_col).sort_index() - .reindex(valid_index).drop(id_cols, axis=1)) + # For now, DeepAR is dropped for multiple id_cols + raise ValueError("DeepAR does not support multiple time series id columns") + print(f"Debug groupe_id type: {type(grouped_id)}") + df_dict[grouped_id] = (grouped_df.set_index(time_col).sort_index() + .reindex(valid_index).drop(id_cols, axis=1)) return df_dict - - df = df.set_index(time_col).sort_index() - - # Fill in missing time steps between the min and max time steps - df = df.reindex(valid_index) + else: + df = df.set_index(time_col).sort_index() + # Fill in missing time steps between the min and max time steps + df = df.reindex(valid_index) + return df if frequency_unit.upper() == "MS": # Truncate the day of month to avoid issues with pandas frequency check df = df.to_period("M") - - return df From 7731f8d8a35dabfd0f497912f44ee3d412b72008 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Sun, 27 Apr 2025 20:41:19 -0700 Subject: [PATCH 3/4] Do not concatenate multiple ids for DeepAR --- .../automl_runtime/forecast/deepar/utils.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/runtime/databricks/automl_runtime/forecast/deepar/utils.py b/runtime/databricks/automl_runtime/forecast/deepar/utils.py index 9eeda46..fbf6475 100644 --- a/runtime/databricks/automl_runtime/forecast/deepar/utils.py +++ b/runtime/databricks/automl_runtime/forecast/deepar/utils.py @@ -86,7 +86,7 @@ def set_index_and_fill_missing_time_steps( multi-series - dictionary of transformed dataframes, each key is the (concatenated) id of the time series """ total_min, total_max = df[time_col].min(), df[time_col].max() - print("Debug:linyuan") + # We need to adjust the frequency_unit for pd.date_range if it is weekly, # otherwise it would always be "W-SUN" if frequency_unit.upper() == "W": @@ -104,17 +104,18 @@ def set_index_and_fill_missing_time_steps( # TODO (ML-52171): Fix the DeepAR library to support multi-time series id columns # For now, DeepAR is dropped for multiple id_cols raise ValueError("DeepAR does not support multiple time series id columns") - print(f"Debug groupe_id type: {type(grouped_id)}") df_dict[grouped_id] = (grouped_df.set_index(time_col).sort_index() .reindex(valid_index).drop(id_cols, axis=1)) return df_dict - else: - df = df.set_index(time_col).sort_index() - # Fill in missing time steps between the min and max time steps - df = df.reindex(valid_index) - return df + + df = df.set_index(time_col).sort_index() + + # Fill in missing time steps between the min and max time steps + df = df.reindex(valid_index) if frequency_unit.upper() == "MS": # Truncate the day of month to avoid issues with pandas frequency check df = df.to_period("M") + + return df From 5ef4e7ecc3973e2238c3349e2aee286b4d524dd4 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Mon, 28 Apr 2025 00:21:06 -0700 Subject: [PATCH 4/4] remove print debug --- runtime/databricks/automl_runtime/forecast/deepar/model.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/runtime/databricks/automl_runtime/forecast/deepar/model.py b/runtime/databricks/automl_runtime/forecast/deepar/model.py index c2454fc..5c13e4a 100644 --- a/runtime/databricks/automl_runtime/forecast/deepar/model.py +++ b/runtime/databricks/automl_runtime/forecast/deepar/model.py @@ -121,7 +121,6 @@ def predict_samples(self, if num_samples is None: num_samples = self._num_samples - print("Debug:enter predict_samples") # Group by the time column in case there are multiple rows for each time column, # for example, the user didn't provide all the identity columns for a multi-series dataset group_cols = [self._time_col] @@ -135,15 +134,11 @@ def predict_samples(self, self._frequency_quantity, self._id_cols) - print(f"Debug model_input_transformed keys type: {type(model_input_transformed)}") test_ds = PandasDataset(model_input_transformed, target=self._target_col) forecast_iter = self._model.predict(test_ds, num_samples=num_samples) forecast_sample_list = list(forecast_iter) - for forecast in forecast_sample_list: - print(f"Debug forecast.item_id type: {type(forecast.item_id)}") - return forecast_sample_list