import logging
import time
from typing import List

import pandas as pd
import numpy as np
from scipy.sparse import issparse
from sklearn.model_selection import (
    GroupKFold,
    TimeSeriesSplit,
)

from flaml.automl.ml import get_val_loss, default_cv_score_agg_func
from flaml.automl.time_series.ts_data import (
    TimeSeriesDataset,
    DataTransformerTS,
    normalize_ts_data,
)

from flaml.automl.task.task import (
    Task,
    get_classification_objective,
    TS_FORECAST,
    TS_FORECASTPANEL,
)

logger = logging.getLogger(__name__)


class TimeSeriesTask(Task):
    @property
    def estimators(self):
        if self._estimators is None:
            # put this into a function to avoid a circular dependency
            from flaml.automl.time_series import (
                XGBoost_TS,
                XGBoostLimitDepth_TS,
                RF_TS,
                LGBM_TS,
                ExtraTrees_TS,
                CatBoost_TS,
                Prophet,
                Orbit,
                ARIMA,
                SARIMAX,
                TemporalFusionTransformerEstimator,
                HoltWinters,
            )

            self._estimators = {
                "xgboost": XGBoost_TS,
                "xgb_limitdepth": XGBoostLimitDepth_TS,
                "rf": RF_TS,
                "lgbm": LGBM_TS,
                "extra_tree": ExtraTrees_TS,
                "arima": ARIMA,
                "sarimax": SARIMAX,
                "holt-winters": HoltWinters,
                "catboost": CatBoost_TS,
                "tft": TemporalFusionTransformerEstimator,
            }
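
            # Illustrative usage: with this mapping, estimator_list=["lgbm", "arima"]
            # resolves to LGBM_TS and ARIMA; the optional "prophet" and "orbit"
            # entries below are registered only when those packages import successfully.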

            try:
                # probe for the optional prophet package; the throwaway alias avoids
                # clobbering the Prophet estimator wrapper imported above
                from prophet import Prophet as _  # noqa: F401

                self._estimators["prophet"] = Prophet
            except ImportError:
                logger.info("Couldn't import Prophet, skipping")

            try:
                from orbit.models import DLT  # noqa: F401 (availability probe)

                self._estimators["orbit"] = Orbit
            except ImportError:
                logger.info("Couldn't import Orbit, skipping")

        return self._estimators

    # processed
    def validate_data(
        self,
        automl,
        state,
        X_train_all,
        y_train_all,
        dataframe,
        label,
        X_val=None,
        y_val=None,
        groups_val=None,
        groups=None,
    ):
        # first beat the data into a TimeSeriesDataset shape
        if isinstance(X_train_all, TimeSeriesDataset):
            # in this case, we're most likely being called by another FLAML instance,
            # so all the preliminary cleaning has already been done
            pre_data = X_train_all
            val_len = len(pre_data.X_val)
        else:
            if label is None and dataframe is not None:
                raise ValueError("If data is specified via the dataframe parameter, you must also specify label")

            if isinstance(y_train_all, pd.Series):
                label = y_train_all.name
            elif isinstance(y_train_all, np.ndarray):
                label = "y"  # Prophet convention

            if isinstance(label, str):
                target_names = [label]
            else:
                target_names = label

            if self.time_col is None:
                if isinstance(X_train_all, pd.DataFrame):
                    assert dataframe is None, "One of dataframe and X arguments must be None"
                    self.time_col = X_train_all.columns[0]
                elif dataframe is not None:
                    assert X_train_all is None, "One of dataframe and X arguments must be None"
                    self.time_col = dataframe.columns[0]
                else:
                    self.time_col = "ds"

            automl._df = True

            if X_train_all is not None:
                assert y_train_all is not None, "If X_train_all is not None, y_train_all must also be provided"
                assert dataframe is None, "If X_train_all is provided, dataframe must be None"
                dataframe = TimeSeriesDataset.to_dataframe(X_train_all, y_train_all, target_names, self.time_col)
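
                # Illustrative shape: to_dataframe merges X (the time column plus
                # features) and y into a single frame, e.g. columns
                # [ds, feature_1, ..., <target names>], which is what
                # TimeSeriesDataset consumes below.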

            elif dataframe is not None:
                assert label is not None, "A label or list of labels must be provided."
                assert isinstance(dataframe, pd.DataFrame), "dataframe must be a pandas DataFrame"
                assert label in dataframe.columns, f"{label} must be a column name in dataframe"
            else:
                raise ValueError("Must supply either X_train_all and y_train_all, or dataframe and label")

            try:
                dataframe[self.time_col] = pd.to_datetime(dataframe[self.time_col])
            except Exception:
                raise ValueError(
                    f"For '{TS_FORECAST}' task, time column {self.time_col} must contain timestamp values."
                )

            dataframe = remove_ts_duplicates(dataframe, self.time_col)

            if X_val is not None:
                assert y_val is not None, "If X_val is not None, y_val must also be provided"
                val_df = TimeSeriesDataset.to_dataframe(X_val, y_val, target_names, self.time_col)
                val_len = len(val_df)
            else:
                val_len = 0
                val_df = None

            pre_data = TimeSeriesDataset(
                train_data=dataframe,
                time_col=self.time_col,
                target_names=target_names,
                test_data=val_df,
            )

        # TODO: should the transformer be a property of the dataset instead?
        automl._transformer = DataTransformerTS(self.time_col, label)
        Xt, yt = automl._transformer.fit_transform(pre_data.X_all, pre_data.y_all)

        df_t = pd.concat([Xt, yt], axis=1)

        data = TimeSeriesDataset(
            train_data=df_t,
            time_col=pre_data.time_col,
            target_names=pre_data.target_names,
        ).move_validation_boundary(-val_len)
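
        # Illustrative arithmetic: with 100 transformed rows and val_len=20,
        # move_validation_boundary(-20) shifts the train/validation boundary back
        # by 20 rows, so the last 20 rows become the validation slice.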

        # now set up the properties of all the other relevant objects

        # TODO: where are these used? Replace with pointers to data?
        automl._X_train_all, automl._y_train_all = Xt, yt

        # TODO: where are these used?
        automl._nrow, automl._ndim = data.X_train.shape

        # make a property instead? Or just fix the call?
        automl._label_transformer = automl._transformer.label_transformer

        automl._feature_names_in_ = (
            automl._X_train_all.columns.to_list() if hasattr(automl._X_train_all, "columns") else None
        )

        self.time_col = data.time_col
        self.target_names = data.target_names

        automl._state.X_val = data
        automl._state.X_train = data
        automl._state.y_train = None
        automl._state.y_val = None
        if data.test_data is not None and len(data.test_data) > 0:
            automl._state.X_train_all = data.move_validation_boundary(len(data.test_data))
        else:
            automl._state.X_train_all = data
        automl._state.y_train_all = None

        automl._state.data_size = data.train_data.shape
        automl.data_size_full = len(data.all_data)
        automl._state.groups = None
        automl._sample_weight_full = None
    def prepare_data(
        self,
        state,
        X_train_all,
        y_train_all,
        auto_argument,
        eval_method,
        split_type,
        split_ratio,
        n_splits,
        data_is_df,
        sample_weight_full,
        time_col=None,
    ):
        state.kf = None
        state.data_size_full = len(y_train_all)

        if split_type in ["uniform", "stratified"]:
            raise ValueError(f"Split type {split_type} is not valid for time series")

        state.groups = None
        state.groups_all = None
        state.groups_val = None

        ts_data = state.X_val
        no_test_data = ts_data is None or ts_data.test_data is None or len(ts_data.test_data) == 0

        if no_test_data and eval_method == "holdout":
            # NOTE: _prepare_data runs before kwargs is updated to fit_kwargs_by_estimator
            period = state.fit_kwargs["period"]

            if self.name == TS_FORECASTPANEL:
                # TODO: move this into the TimeSeriesDataset class
                X_train_all = ts_data.X_train
                y_train_all = ts_data.y_train

                X_train_all["time_idx"] -= X_train_all["time_idx"].min()
                X_train_all["time_idx"] = X_train_all["time_idx"].astype("int")
                ids = state.fit_kwargs["group_ids"].copy()
                ids.append(ts_data.time_col)
                ids.append("time_idx")
                y_train_all = pd.DataFrame(y_train_all)
                y_train_all[ids] = X_train_all[ids]
                X_train_all = X_train_all.sort_values(ids)
                y_train_all = y_train_all.sort_values(ids)
                training_cutoff = X_train_all["time_idx"].max() - period
                X_train = X_train_all[lambda x: x.time_idx <= training_cutoff]
                y_train = y_train_all[lambda x: x.time_idx <= training_cutoff].drop(columns=ids)
                X_val = X_train_all[lambda x: x.time_idx > training_cutoff]
                y_val = y_train_all[lambda x: x.time_idx > training_cutoff].drop(columns=ids)
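
                # Illustrative arithmetic: with time_idx spanning 0..99 and period=10,
                # training_cutoff = 99 - 10 = 89, so rows with time_idx <= 89 train
                # and the last 10 steps of each group validate.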

                train_data = normalize_ts_data(
                    X_train,
                    ts_data.target_names,
                    ts_data.time_col,
                    y_train,
                )
                test_data = normalize_ts_data(
                    X_val,
                    ts_data.target_names,
                    ts_data.time_col,
                    y_val,
                )
                ts_data = TimeSeriesDataset(
                    train_data,
                    ts_data.time_col,
                    ts_data.target_names,
                    ts_data.frequency,
                    test_data,
                )
                state.X_val = ts_data
                state.X_train = ts_data

            else:
                # make holdout data by carving the last `period` rows out of the training set
                num_samples = ts_data.train_data.shape[0]
                assert period < num_samples, f"period={period} must be less than the number of examples, {num_samples}"
                state.X_val = ts_data.move_validation_boundary(-period)
                state.X_train = state.X_val

        if eval_method != "holdout":
            if self.name != TS_FORECASTPANEL:
                # NOTE: _prepare_data runs before kwargs is updated to fit_kwargs_by_estimator
                period = state.fit_kwargs["period"]
                step_size = state.fit_kwargs.get("cv_step_size", period)

                ts_data = state.X_train
                if n_splits * step_size + 2 * period > ts_data.y_train.size:
                    n_splits = int((ts_data.y_train.size - 2 * period) / step_size)
                    assert n_splits >= 2, (
                        f"cross validation for forecasting period={period}"
                        f" requires input data with at least {2 * period + 2 * step_size} examples."
                    )
                    logger.info(f"Using nsplits={n_splits} due to data size limit.")
                state.kf = TimeSeriesSplit(n_splits=n_splits, test_size=period)
                state.kf.step_size = step_size
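
                # Worked example (illustrative): with 100 training rows, period=12,
                # step_size=12 and n_splits=5, the check 5*12 + 2*12 = 84 <= 100
                # passes, so all 5 folds are kept; with only 60 rows, n_splits
                # shrinks to (60 - 24) // 12 = 3.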

            else:
                n_groups = ts_data.X_train.groupby(state.fit_kwargs.get("group_ids")).ngroups
                period = state.fit_kwargs["period"]
                state.kf = TimeSeriesSplit(n_splits=n_splits, test_size=period * n_groups)

    # TODO: move task detection to Task.__init__!
    def decide_split_type(
        self,
        split_type,
        y_train_all,
        fit_kwargs,
        groups=None,
    ) -> str:
        # TODO: move into task creation!!!
        if self.name == "classification":
            self.name = get_classification_objective(len(np.unique(y_train_all)))

        # TODO: do we need this?
        if not isinstance(split_type, str):
            assert hasattr(split_type, "split") and hasattr(
                split_type, "get_n_splits"
            ), "split_type must be a string or a splitter object with split and get_n_splits methods."
            assert (
                not isinstance(split_type, GroupKFold) or groups is not None
            ), "GroupKFold requires groups to be provided."
            return split_type

        else:
            assert split_type in ["auto", "time"]
            # NOTE: _decide_split_type runs before kwargs is updated to fit_kwargs_by_estimator
            assert isinstance(
                fit_kwargs.get("period"), int
            ), f"missing a required integer 'period' for '{TS_FORECAST}' task."
            if fit_kwargs.get("group_ids"):
                # TODO (MARK) This will likely not play well with the task class
                self.name = TS_FORECASTPANEL
                assert isinstance(
                    fit_kwargs.get("group_ids"), list
                ), f"missing a required List[str] 'group_ids' for '{TS_FORECASTPANEL}' task."
            return "time"

    # TODO: merge with preprocess() below
    def _preprocess(self, X, transformer=None):
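        # Illustrative example: with transformer._str_columns == ["a", "b"],
        # a flat row X = [1, 2] becomes pd.DataFrame({"a": [1], "b": [2]}),
        # while nested X = [[1, 2], [3, 4]] is transposed so column "a" holds
        # [1, 3] and column "b" holds [2, 4].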
        if isinstance(X, List):
            try:
                nested = isinstance(X[0], List)
                if nested:
                    # transpose rows into columns; zip yields tuples, so re-wrap
                    # each column as a list
                    X = [list(x) for x in zip(*X)]
                X = pd.DataFrame(
                    {
                        transformer._str_columns[idx]: X[idx] if nested else [X[idx]]
                        for idx in range(len(X))
                    }
                )
            except IndexError:
                raise IndexError("Test data contains more columns than training data, exiting")
        elif isinstance(X, int):
            return X
        elif issparse(X):
            X = X.tocsr()
        if self.is_ts_forecast():
            X = pd.DataFrame(X)
        if transformer:
            X = transformer.transform(X)
        return X

    def preprocess(self, X, transformer=None):
        if isinstance(X, (pd.DataFrame, np.ndarray, pd.Series)):
            X = X.copy()
            X = normalize_ts_data(X, self.target_names, self.time_col)
            return self._preprocess(X, transformer)
        elif isinstance(X, int):
            return X
        else:
            raise ValueError(f"unknown type of X, {X.__class__}")
    def evaluate_model_CV(
        self,
        config,
        estimator,
        X_train_all,
        y_train_all,
        budget,
        kf,
        eval_metric,
        best_val_loss,
        cv_score_agg_func=None,
        log_training_metric=False,
        fit_kwargs={},
        free_mem_ratio=0,  # what is this for?
    ):
        if cv_score_agg_func is None:
            cv_score_agg_func = default_cv_score_agg_func
        start_time = time.time()
        val_loss_folds = []
        log_metric_folds = []
        metric = None
        train_time = pred_time = 0
        total_fold_num = 0
        n = kf.get_n_splits()
        if self.is_classification():
            labels = np.unique(y_train_all)
        else:
            labels = fit_kwargs.get("label_list")  # pass the label list on to compute the evaluation metric
        ts_data = X_train_all
        budget_per_train = budget / n
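
        # Illustrative fold layout (assuming rolling splits): with kf.n_splits=3,
        # kf.test_size=12 and kf.step_size=12 over 100 rows, the folds validate on
        # rows 64..75, 76..87 and 88..99, each training on everything earlier.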

        for data in ts_data.cv_train_val_sets(kf.n_splits, kf.test_size, kf.step_size):
            estimator.cleanup()
            val_loss_i, metric_i, train_time_i, pred_time_i = get_val_loss(
                config,
                estimator,
                X_train=data,
                y_train=None,
                X_val=data,
                y_val=None,
                eval_metric=eval_metric,
                labels=labels,
                budget=budget_per_train,
                log_training_metric=log_training_metric,
                fit_kwargs=fit_kwargs,
                task=self,
                weight_val=None,
                groups_val=None,
                free_mem_ratio=free_mem_ratio,
            )
            if isinstance(metric_i, dict) and "intermediate_results" in metric_i:
                del metric_i["intermediate_results"]
            total_fold_num += 1
            val_loss_folds.append(val_loss_i)
            log_metric_folds.append(metric_i)
            train_time += train_time_i
            pred_time += pred_time_i
            if time.time() - start_time >= budget:
                break
        val_loss, metric = cv_score_agg_func(val_loss_folds, log_metric_folds)
        n = total_fold_num
        pred_time /= n
        return val_loss, metric, train_time, pred_time

    def default_estimator_list(self, estimator_list: List[str], is_spark_dataframe: bool) -> List[str]:
        assert not is_spark_dataframe, "Spark is not yet supported for time series"

        # TODO: why not do this if/then in the calling function?
        if "auto" != estimator_list:
            return estimator_list

        if self.is_ts_forecastpanel():
            return ["tft"]

        estimator_list = [
            "lgbm",
            "rf",
            "xgboost",
            "extra_tree",
            "xgb_limitdepth",
        ]

        # CatBoost appears to be way slower than the others, so don't include it by default
        # try:
        #     import catboost
        #
        #     estimator_list.append("catboost")
        # except ImportError:
        #     pass

        if self.is_regression():
            estimator_list += ["arima", "sarimax"]

        try:
            import prophet  # noqa: F401 (availability probe)

            estimator_list.append("prophet")
        except ImportError:
            pass

        return estimator_list

    def default_metric(self, metric: str) -> str:
        assert self.is_ts_forecast(), "If this is not a TS forecasting task, this code should never have been called"
        if metric == "auto":
            return "mape"
        else:
            return metric

    @staticmethod
    def prepare_sample_train_data(automlstate, sample_size):
        # we take the tail, rather than the head, for compatibility with time series

        shift = sample_size - len(automlstate.X_train.train_data)
        sampled_X_train = automlstate.X_train.move_validation_boundary(shift)
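        # Illustrative arithmetic: with 100 training rows and sample_size=30,
        # shift = 30 - 100 = -70; per the comment above, the boundary move
        # keeps the most recent 30 rows rather than the first 30.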

        return sampled_X_train, None, None, None


def remove_ts_duplicates(
    X,
    time_col,
):
    """Remove duplicate rows from a time series dataframe.

    Assumes the target columns are included in X.

    @param X: dataframe with the time column, features, and targets
    @param time_col: name of the time column
    @return: the deduplicated dataframe
    """

    duplicates = X.duplicated()

    if any(duplicates):
        logger.warning(f"Duplicate timestamp values found in timestamp column.\n{X.loc[duplicates, time_col]}")
        X = X.drop_duplicates()
        logger.warning("Removed duplicate rows based on all columns")
        assert (
            not X[[X.columns[0]]].duplicated().any()
        ), "Duplicate timestamp values with different values for other columns."

    return X
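

# Illustrative behavior: with rows [("2023-01-01", 1.0), ("2023-01-01", 1.0),
# ("2023-01-02", 2.0)], the exact-duplicate row is dropped and two rows are
# returned; if duplicated timestamps remain afterwards (same time value,
# different targets), the assertion above fires instead.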