autogen/flaml/automl/task/time_series_task.py

524 lines
19 KiB
Python

import logging
import time
from typing import List
import pandas as pd
import numpy as np
from scipy.sparse import issparse
from sklearn.model_selection import (
GroupKFold,
TimeSeriesSplit,
)
from flaml.automl.ml import get_val_loss, default_cv_score_agg_func
from flaml.automl.time_series.ts_data import (
TimeSeriesDataset,
DataTransformerTS,
normalize_ts_data,
)
from flaml.automl.task.task import (
Task,
get_classification_objective,
TS_FORECAST,
TS_FORECASTPANEL,
)
logger = logging.getLogger(__name__)
class TimeSeriesTask(Task):
    @property
    def estimators(self):
        """Lazily build and cache the mapping from estimator name to estimator class.

        Core tree/boosting and statsmodels-based estimators are always registered;
        "prophet" and "orbit" are only registered when their third-party packages
        are importable, otherwise they are skipped with an info log.
        """
        if self._estimators is None:
            # put this into a function to avoid circular dependency
            from flaml.automl.time_series import (
                XGBoost_TS,
                XGBoostLimitDepth_TS,
                RF_TS,
                LGBM_TS,
                ExtraTrees_TS,
                CatBoost_TS,
                Prophet,
                Orbit,
                ARIMA,
                SARIMAX,
                TemporalFusionTransformerEstimator,
                HoltWinters,
            )

            self._estimators = {
                "xgboost": XGBoost_TS,
                "xgb_limitdepth": XGBoostLimitDepth_TS,
                "rf": RF_TS,
                "lgbm": LGBM_TS,
                "extra_tree": ExtraTrees_TS,
                "arima": ARIMA,
                "sarimax": SARIMAX,
                "holt-winters": HoltWinters,
                "catboost": CatBoost_TS,
                "tft": TemporalFusionTransformerEstimator,
            }

            # Optional estimators: probe for the backing package before registering.
            try:
                from prophet import Prophet as _prophet_probe  # noqa: F401 availability check only

                self._estimators["prophet"] = Prophet
            except ImportError:
                logger.info("Couldn't import Prophet, skipping")

            try:
                from orbit.models import DLT  # noqa: F401 availability check only

                self._estimators["orbit"] = Orbit
            except ImportError:
                # BUGFIX: this message previously said "Prophet" (copy-paste error).
                logger.info("Couldn't import Orbit, skipping")

        return self._estimators
# processed
    def validate_data(
        self,
        automl,
        state,
        X_train_all,
        y_train_all,
        dataframe,
        label,
        X_val=None,
        y_val=None,
        groups_val=None,
        groups=None,
    ):
        """Coerce the user-supplied inputs into a transformed TimeSeriesDataset.

        Accepts either (X_train_all, y_train_all), a (dataframe, label) pair,
        or an already-built TimeSeriesDataset, then fits a DataTransformerTS
        and stores the result (plus derived bookkeeping) on ``automl`` and
        ``automl._state``.  ``state``, ``groups_val`` and ``groups`` are not
        read by this implementation.

        Side effects: sets ``self.time_col`` / ``self.target_names`` and many
        private attributes on ``automl`` (``_transformer``, ``_X_train_all``,
        ``_nrow``, ``_state.X_train`` ...).
        """
        # first beat the data into a TimeSeriesDataset shape
        if isinstance(X_train_all, TimeSeriesDataset):
            # in this case, we're most likely being called by another FLAML instance
            # so all the preliminary cleaning has already been done
            pre_data = X_train_all
            val_len = len(pre_data.X_val)
        else:
            if label is None and dataframe is not None:
                raise ValueError("If data is specified via dataframe parameter, you must also specify label")
            # derive the label name from the target container when possible
            if isinstance(y_train_all, pd.Series):
                label = y_train_all.name
            elif isinstance(y_train_all, np.ndarray):
                label = "y"  # Prophet convention
            if isinstance(label, str):
                target_names = [label]
            else:
                target_names = label
            # infer the time column when the caller didn't set one:
            # first column of X/dataframe, else the Prophet default "ds"
            if self.time_col is None:
                if isinstance(X_train_all, pd.DataFrame):
                    assert dataframe is None, "One of dataframe and X arguments must be None"
                    self.time_col = X_train_all.columns[0]
                elif dataframe is not None:
                    assert X_train_all is None, "One of dataframe and X arguments must be None"
                    self.time_col = dataframe.columns[0]
                else:
                    self.time_col = "ds"
            automl._df = True
            # normalize both input styles into a single dataframe
            if X_train_all is not None:
                assert y_train_all is not None, "If X_train_all is not None, y_train_all must also be"
                assert dataframe is None, "If X_train_all is provided, dataframe must be None"
                dataframe = TimeSeriesDataset.to_dataframe(X_train_all, y_train_all, target_names, self.time_col)
            elif dataframe is not None:
                assert label is not None, "A label or list of labels must be provided."
                assert isinstance(dataframe, pd.DataFrame), "dataframe must be a pandas DataFrame"
                assert label in dataframe.columns, f"{label} must a column name in dataframe"
            else:
                raise ValueError("Must supply either X_train_all and y_train_all, or dataframe and label")
            try:
                dataframe[self.time_col] = pd.to_datetime(dataframe[self.time_col])
            except Exception:
                raise ValueError(
                    f"For '{TS_FORECAST}' task, time column {self.time_col} must contain timestamp values."
                )
            dataframe = remove_ts_duplicates(dataframe, self.time_col)
            if X_val is not None:
                assert y_val is not None, "If X_val is not None, y_val must also be"
                val_df = TimeSeriesDataset.to_dataframe(X_val, y_val, target_names, self.time_col)
                val_len = len(val_df)
            else:
                val_len = 0
                val_df = None
            pre_data = TimeSeriesDataset(
                train_data=dataframe,
                time_col=self.time_col,
                target_names=target_names,
                test_data=val_df,
            )
        # TODO: should the transformer be a property of the dataset instead?
        automl._transformer = DataTransformerTS(self.time_col, label)
        Xt, yt = automl._transformer.fit_transform(pre_data.X_all, pre_data.y_all)
        df_t = pd.concat([Xt, yt], axis=1)
        # rebuild the dataset from the transformed frame, then carve the last
        # val_len rows back out as the validation slice
        data = TimeSeriesDataset(
            train_data=df_t,
            time_col=pre_data.time_col,
            target_names=pre_data.target_names,
        ).move_validation_boundary(-val_len)
        # now setup the properties of all the other relevant objects
        # TODO: where are these used? Replace with pointers to data?
        automl._X_train_all, automl._y_train_all = Xt, yt
        # TODO: where are these used?
        automl._nrow, automl._ndim = data.X_train.shape
        # make a property instead? Or just fix the call?
        automl._label_transformer = automl._transformer.label_transformer
        automl._feature_names_in_ = (
            automl._X_train_all.columns.to_list() if hasattr(automl._X_train_all, "columns") else None
        )
        self.time_col = data.time_col
        self.target_names = data.target_names
        # for TS tasks the whole dataset is carried in X_train/X_val; the y_*
        # slots are deliberately None
        automl._state.X_val = data
        automl._state.X_train = data
        automl._state.y_train = None
        automl._state.y_val = None
        if data.test_data is not None and len(data.test_data) > 0:
            automl._state.X_train_all = data.move_validation_boundary(len(data.test_data))
        else:
            automl._state.X_train_all = data
        automl._state.y_train_all = None
        automl._state.data_size = data.train_data.shape
        automl.data_size_full = len(data.all_data)
        automl._state.groups = None
        automl._sample_weight_full = None
    def prepare_data(
        self,
        state,
        X_train_all,
        y_train_all,
        auto_argument,
        eval_method,
        split_type,
        split_ratio,
        n_splits,
        data_is_df,
        sample_weight_full,
        time_col=None,
    ):
        """Set up holdout or cross-validation splits on ``state`` for a TS task.

        For "holdout" with no pre-supplied test data, the last ``period`` rows
        (per group for panel tasks) are carved out as validation.  For any other
        eval method a TimeSeriesSplit is stored in ``state.kf``.

        Several parameters (``auto_argument``, ``split_ratio``, ``data_is_df``,
        ``sample_weight_full``, ``time_col``) are accepted for interface
        compatibility but not read here; the incoming ``X_train_all`` /
        ``y_train_all`` are only reassigned locally in the panel branch.
        """
        state.kf = None
        state.data_size_full = len(y_train_all)
        if split_type in ["uniform", "stratified"]:
            raise ValueError(f"Split type {split_type} is not valid for time series")
        state.groups = None
        state.groups_all = None
        state.groups_val = None
        ts_data = state.X_val
        no_test_data = ts_data is None or ts_data.test_data is None or len(ts_data.test_data) == 0
        if no_test_data and eval_method == "holdout":
            # NOTE: _prepare_data is before kwargs is updated to fit_kwargs_by_estimator
            period = state.fit_kwargs["period"]
            if self.name == TS_FORECASTPANEL:
                # TODO: move this into the TimeSeriesDataset class
                X_train_all = ts_data.X_train
                y_train_all = ts_data.y_train
                # re-base time_idx to start at 0, as pytorch-forecasting expects ints
                X_train_all["time_idx"] -= X_train_all["time_idx"].min()
                X_train_all["time_idx"] = X_train_all["time_idx"].astype("int")
                ids = state.fit_kwargs["group_ids"].copy()
                ids.append(ts_data.time_col)
                ids.append("time_idx")
                # attach the id columns to y so both frames can be sorted identically
                y_train_all = pd.DataFrame(y_train_all)
                y_train_all[ids] = X_train_all[ids]
                X_train_all = X_train_all.sort_values(ids)
                y_train_all = y_train_all.sort_values(ids)
                # last `period` time steps of every group become validation data
                training_cutoff = X_train_all["time_idx"].max() - period
                X_train = X_train_all[lambda x: x.time_idx <= training_cutoff]
                y_train = y_train_all[lambda x: x.time_idx <= training_cutoff].drop(columns=ids)
                X_val = X_train_all[lambda x: x.time_idx > training_cutoff]
                y_val = y_train_all[lambda x: x.time_idx > training_cutoff].drop(columns=ids)
                train_data = normalize_ts_data(
                    X_train,
                    ts_data.target_names,
                    ts_data.time_col,
                    y_train,
                )
                test_data = normalize_ts_data(
                    X_val,
                    ts_data.target_names,
                    ts_data.time_col,
                    y_val,
                )
                ts_data = TimeSeriesDataset(
                    train_data,
                    ts_data.time_col,
                    ts_data.target_names,
                    ts_data.frequency,
                    test_data,
                )
                state.X_val = ts_data
                state.X_train = ts_data
            else:
                # if eval_method = holdout, make holdout data
                num_samples = ts_data.train_data.shape[0]
                assert period < num_samples, f"period={period}>#examples={num_samples}"
                # shift the validation boundary back by `period` rows
                state.X_val = ts_data.move_validation_boundary(-period)
                state.X_train = state.X_val
        if eval_method != "holdout":
            if self.name != TS_FORECASTPANEL:
                period = state.fit_kwargs[
                    "period"
                ]  # NOTE: _prepare_data is before kwargs is updated to fit_kwargs_by_estimator
                step_size = state.fit_kwargs.get("cv_step_size", period)
                ts_data = state.X_train
                # shrink n_splits if the data can't support the requested folds
                if n_splits * step_size + 2 * period > ts_data.y_train.size:
                    n_splits = int((ts_data.y_train.size - 2 * period) / step_size)
                    assert n_splits >= 2, (
                        f"cross validation for forecasting period={period}"
                        f" requires input data with at least {2*period + 2*step_size} examples."
                    )
                    logger.info(f"Using nsplits={n_splits} due to data size limit.")
                state.kf = TimeSeriesSplit(n_splits=n_splits, test_size=period)
                # step_size is not a sklearn TimeSeriesSplit argument; it is
                # attached here for flaml's own rolling-window logic
                state.kf.step_size = step_size
            else:
                n_groups = ts_data.X_train.groupby(state.fit_kwargs.get("group_ids")).ngroups
                period = state.fit_kwargs["period"]
                # each fold must hold out `period` steps for every group
                state.kf = TimeSeriesSplit(n_splits=n_splits, test_size=period * n_groups)
# TODO: move task detection to Task.__init__!
    def decide_split_type(
        self,
        split_type,
        y_train_all,
        fit_kwargs,
        groups=None,
    ) -> str:
        """Validate and resolve the split strategy for a TS task.

        Returns the caller's custom splitter object unchanged (after checking
        its interface), or the string "time".  Side effects: may rewrite
        ``self.name`` — to the concrete classification objective, or to
        TS_FORECASTPANEL when ``group_ids`` is supplied in ``fit_kwargs``.
        Requires an integer "period" in ``fit_kwargs``.
        """
        # TODO: move into task creation!!!
        if self.name == "classification":
            self.name = get_classification_objective(len(np.unique(y_train_all)))
        # TODO: do we need this?
        if not isinstance(split_type, str):
            # custom splitter object: must quack like an sklearn splitter
            assert hasattr(split_type, "split") and hasattr(
                split_type, "get_n_splits"
            ), "split_type must be a string or a splitter object with split and get_n_splits methods."
            assert (
                not isinstance(split_type, GroupKFold) or groups is not None
            ), "GroupKFold requires groups to be provided."
            return split_type
        else:
            assert split_type in ["auto", "time"]
            assert isinstance(
                fit_kwargs.get("period"),
                int,  # NOTE: _decide_split_type is before kwargs is updated to fit_kwargs_by_estimator
            ), f"missing a required integer 'period' for '{TS_FORECAST}' task."
            if fit_kwargs.get("group_ids"):
                # TODO (MARK) This will likely not play well with the task class
                self.name = TS_FORECASTPANEL
                assert isinstance(
                    fit_kwargs.get("group_ids"), list
                ), f"missing a required List[str] 'group_ids' for '{TS_FORECASTPANEL}' task."
            return "time"
# TODO: merge with preprocess() below
    def _preprocess(self, X, transformer=None):
        """Convert raw list / sparse input into a DataFrame and apply ``transformer``.

        Accepts a (possibly nested) list of feature values, an int (returned
        unchanged), or a scipy sparse matrix; any other input is assumed to be
        DataFrame-compatible already.  Column names come from
        ``transformer._str_columns``.
        """
        if isinstance(X, List):
            try:
                if isinstance(X[0], List):
                    # transpose row-major input into per-column sequences
                    # NOTE(review): zip(*X) yields tuples, so the isinstance
                    # check in the comprehension below is False after this
                    # transpose and each column value gets wrapped in a
                    # one-element list — confirm this is intended for
                    # multi-row list input.
                    X = [x for x in zip(*X)]
                X = pd.DataFrame(
                    dict(
                        [
                            (transformer._str_columns[idx], X[idx])
                            if isinstance(X[0], List)
                            else (transformer._str_columns[idx], [X[idx]])
                            for idx in range(len(X))
                        ]
                    )
                )
            except IndexError:
                # more columns supplied than the transformer saw during training
                raise IndexError("Test data contains more columns than training data, exiting")
        elif isinstance(X, int):
            return X
        elif issparse(X):
            X = X.tocsr()
        if self.is_ts_forecast():
            X = pd.DataFrame(X)
        if transformer:
            X = transformer.transform(X)
        return X
def preprocess(self, X, transformer=None):
if isinstance(X, pd.DataFrame) or isinstance(X, np.ndarray) or isinstance(X, pd.Series):
X = X.copy()
X = normalize_ts_data(X, self.target_names, self.time_col)
return self._preprocess(X, transformer)
elif isinstance(X, int):
return X
else:
raise ValueError(f"unknown type of X, {X.__class__}")
def evaluate_model_CV(
self,
config,
estimator,
X_train_all,
y_train_all,
budget,
kf,
eval_metric,
best_val_loss,
cv_score_agg_func=None,
log_training_metric=False,
fit_kwargs={},
free_mem_ratio=0, # what is this for?
):
if cv_score_agg_func is None:
cv_score_agg_func = default_cv_score_agg_func
start_time = time.time()
val_loss_folds = []
log_metric_folds = []
metric = None
train_time = pred_time = 0
total_fold_num = 0
n = kf.get_n_splits()
if self.is_classification():
labels = np.unique(y_train_all)
else:
labels = fit_kwargs.get("label_list") # pass the label list on to compute the evaluation metric
ts_data = X_train_all
budget_per_train = budget / n
ts_data = X_train_all
for data in ts_data.cv_train_val_sets(kf.n_splits, kf.test_size, kf.step_size):
estimator.cleanup()
val_loss_i, metric_i, train_time_i, pred_time_i = get_val_loss(
config,
estimator,
X_train=data,
y_train=None,
X_val=data,
y_val=None,
eval_metric=eval_metric,
labels=labels,
budget=budget_per_train,
log_training_metric=log_training_metric,
fit_kwargs=fit_kwargs,
task=self,
weight_val=None,
groups_val=None,
free_mem_ratio=free_mem_ratio,
)
if isinstance(metric_i, dict) and "intermediate_results" in metric_i:
del metric_i["intermediate_results"]
total_fold_num += 1
val_loss_folds.append(val_loss_i)
log_metric_folds.append(metric_i)
train_time += train_time_i
pred_time += pred_time_i
if time.time() - start_time >= budget:
break
val_loss, metric = cv_score_agg_func(val_loss_folds, log_metric_folds)
n = total_fold_num
pred_time /= n
return val_loss, metric, train_time, pred_time
def default_estimator_list(self, estimator_list: List[str], is_spark_dataframe: bool) -> List[str]:
assert not is_spark_dataframe, "Spark is not yet supported for time series"
# TODO: why not do this if/then in the calling function?
if "auto" != estimator_list:
return estimator_list
if self.is_ts_forecastpanel():
return ["tft"]
estimator_list = [
"lgbm",
"rf",
"xgboost",
"extra_tree",
"xgb_limitdepth",
]
# Catboost appears to be way slower than the others, don't include it by default
# try:
# import catboost
#
# estimator_list.append("catboost")
# except ImportError:
# pass
if self.is_regression():
estimator_list += ["arima", "sarimax"]
try:
import prophet
estimator_list.append("prophet")
except ImportError:
pass
return estimator_list
def default_metric(self, metric: str) -> str:
assert self.is_ts_forecast(), "If this is not a TS forecasting task, this code should never have been called"
if metric == "auto":
return "mape"
else:
return metric
@staticmethod
def prepare_sample_train_data(automlstate, sample_size):
# we take the tail, rather than the head, for compatibility with time series
shift = sample_size - len(automlstate.X_train.train_data)
sampled_X_train = automlstate.X_train.move_validation_boundary(shift)
return sampled_X_train, None, None, None
def remove_ts_duplicates(
    X,
    time_col,
):
    """Drop fully-duplicated rows and verify remaining timestamps are unique.

    Assumes the target column(s) are included in ``X``.
    @param X: a pandas DataFrame containing ``time_col`` and the targets.
    @param time_col: name of the timestamp column.
    @return: ``X`` with exact duplicate rows removed.

    Raises AssertionError if, after dropping exact duplicates, the same
    timestamp still appears with different values in the other columns.
    """
    duplicates = X.duplicated()

    if any(duplicates):
        # BUGFIX: was `X.loc[duplicates, X][time_col]`, which passes the whole
        # DataFrame as a column indexer and raises; select the column by name.
        logger.warning("Duplicate timestamp values found in timestamp column. " f"\n{X.loc[duplicates, time_col]}")
        X = X.drop_duplicates()
        logger.warning("Removed duplicate rows based on all columns")

    # BUGFIX: the original asserted `...duplicated() is None`, which is always
    # False for a pandas Series, so the assert fired on every call. Check for
    # leftover duplicated timestamps instead.
    assert (
        not X[time_col].duplicated().any()
    ), "Duplicate timestamp values with different values for other columns."

    return X