import logging
import time
from typing import List, Optional
import numpy as np
from flaml.automl.data import TS_TIMESTAMP_COL, concat
from flaml.automl.ml import EstimatorSubclass, get_val_loss, default_cv_score_agg_func

from flaml.automl.task.task import (
    Task,
    get_classification_objective,
    TS_FORECAST,
    TS_FORECASTPANEL,
)
from flaml.config import RANDOM_SEED
from flaml.automl.spark import ps, psDataFrame, psSeries, pd
from flaml.automl.spark.utils import (
    iloc_pandas_on_spark,
    spark_kFold,
    train_test_split_pyspark,
    unique_pandas_on_spark,
    unique_value_first_index,
    len_labels,
    set_option,
)

try:
    from scipy.sparse import issparse
except ImportError:
    pass
try:
    from sklearn.utils import shuffle
    from sklearn.model_selection import (
        train_test_split,
        RepeatedStratifiedKFold,
        RepeatedKFold,
        GroupKFold,
        TimeSeriesSplit,
        GroupShuffleSplit,
        StratifiedGroupKFold,
    )
except ImportError:
    pass

logger = logging.getLogger(__name__)


class GenericTask(Task):
    @property
    def estimators(self):
        if self._estimators is None:
            # put this into a function to avoid circular dependency
            from flaml.automl.model import (
                XGBoostSklearnEstimator,
                XGBoostLimitDepthEstimator,
                RandomForestEstimator,
                LGBMEstimator,
                LRL1Classifier,
                LRL2Classifier,
                CatBoostEstimator,
                ExtraTreesEstimator,
                KNeighborsEstimator,
                TransformersEstimator,
                TransformersEstimatorModelSelection,
                SparkLGBMEstimator,
            )

            self._estimators = {
                "xgboost": XGBoostSklearnEstimator,
                "xgb_limitdepth": XGBoostLimitDepthEstimator,
                "rf": RandomForestEstimator,
                "lgbm": LGBMEstimator,
                "lgbm_spark": SparkLGBMEstimator,
                "lrl1": LRL1Classifier,
                "lrl2": LRL2Classifier,
                "catboost": CatBoostEstimator,
                "extra_tree": ExtraTreesEstimator,
                "kneighbor": KNeighborsEstimator,
                "transformer": TransformersEstimator,
                "transformer_ms": TransformersEstimatorModelSelection,
            }
        return self._estimators

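    # Illustrative note (editor's addition, not part of the upstream source): the
    # property above is a lazily-built name -> class registry, so an estimator
    # class can be resolved from its short name, e.g.
    #   estimator_cls = task.estimators["lgbm"]   # -> LGBMEstimator
    # where `task` is assumed to be an already-constructed GenericTask instance.
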
    def validate_data(
        self,
        automl,
        state,
        X_train_all,
        y_train_all,
        dataframe,
        label,
        X_val=None,
        y_val=None,
        groups_val=None,
        groups=None,
    ):
        if X_train_all is not None and y_train_all is not None:
            assert isinstance(X_train_all, (np.ndarray, pd.DataFrame, psDataFrame)) or issparse(X_train_all), (
                "X_train_all must be a numpy array, a pandas dataframe, "
                "a Scipy sparse matrix or a pyspark.pandas dataframe."
            )
            assert isinstance(
                y_train_all, (np.ndarray, pd.Series, psSeries)
            ), "y_train_all must be a numpy array, a pandas series or a pyspark.pandas series."
            assert X_train_all.size != 0 and y_train_all.size != 0, "Input data must not be empty."
            if isinstance(X_train_all, np.ndarray) and len(X_train_all.shape) == 1:
                X_train_all = np.reshape(X_train_all, (X_train_all.size, 1))
            if isinstance(y_train_all, np.ndarray):
                y_train_all = y_train_all.flatten()
            assert X_train_all.shape[0] == y_train_all.shape[0], "# rows in X_train must match length of y_train."
            if isinstance(X_train_all, psDataFrame):
                X_train_all = X_train_all.spark.cache()  # cache data to improve compute speed
                y_train_all = y_train_all.to_frame().spark.cache()[y_train_all.name]
                logger.debug(f"X_train_all and y_train_all cached, shape of X_train_all: {X_train_all.shape}")
            automl._df = isinstance(X_train_all, (pd.DataFrame, psDataFrame))
            automl._nrow, automl._ndim = X_train_all.shape
            if self.is_ts_forecast():
                X_train_all = pd.DataFrame(X_train_all) if isinstance(X_train_all, np.ndarray) else X_train_all
                X_train_all, y_train_all = self._validate_ts_data(X_train_all, y_train_all)
            X, y = X_train_all, y_train_all
        elif dataframe is not None and label is not None:
            assert isinstance(
                dataframe, (pd.DataFrame, psDataFrame)
            ), "dataframe must be a pandas DataFrame or a pyspark.pandas DataFrame."
            assert (
                label in dataframe.columns
            ), f"The provided label column name `{label}` doesn't exist in the provided dataframe."
            if isinstance(dataframe, psDataFrame):
                dataframe = dataframe.spark.cache()  # cache data to improve compute speed
                logger.debug(f"dataframe cached, shape of dataframe: {dataframe.shape}")
            automl._df = True
            if self.is_ts_forecast():
                dataframe = self._validate_ts_data(dataframe)
            # TODO: to support pyspark.sql.DataFrame and pure dataframe mode
            X = dataframe.drop(columns=label)
            automl._nrow, automl._ndim = X.shape
            y = dataframe[label]
        else:
            raise ValueError("either X_train+y_train or dataframe+label are required")

        # check the validity of input dimensions for NLP tasks, so need to check _is_nlp_task not estimator
        if self.is_nlp():
            from flaml.automl.nlp.utils import is_a_list_of_str

            is_all_str = True
            is_all_list = True
            for column in X.columns:
                assert X[column].dtype.name in (
                    "object",
                    "string",
                ), "If the task is an NLP task, X can only contain text columns"
                for _, each_cell in X[column].items():
                    if each_cell is not None:
                        is_str = isinstance(each_cell, str)
                        is_list_of_int = isinstance(each_cell, list) and all(isinstance(x, int) for x in each_cell)
                        is_list_of_str = is_a_list_of_str(each_cell)
                        if self.is_token_classification():
                            assert is_list_of_str, (
                                "For the token-classification task, the input column needs to be a list of string, "
                                "instead of string, e.g., ['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']. "
                                "For more examples, please refer to test/nlp/test_autohf_tokenclassification.py"
                            )
                        else:
                            assert is_str or is_list_of_int, (
                                "Each column of the input must either be str (untokenized) "
                                "or a list of integers (tokenized)"
                            )
                        is_all_str &= is_str
                        is_all_list &= is_list_of_int or is_list_of_str
            assert is_all_str or is_all_list, (
                "Currently FLAML only supports two modes for NLP: either all columns of X are string (non-tokenized), "
                "or all columns of X are integer ids (tokenized)"
            )
        if isinstance(X, psDataFrame):
            # TODO: support pyspark.pandas dataframe in DataTransformer
            automl._skip_transform = True
        if automl._skip_transform or issparse(X_train_all):
            automl._transformer = automl._label_transformer = False
            automl._X_train_all, automl._y_train_all = X, y
        else:
            from flaml.automl.data import DataTransformer

            automl._transformer = DataTransformer()

            (
                automl._X_train_all,
                automl._y_train_all,
            ) = automl._transformer.fit_transform(X, y, self)
            automl._label_transformer = automl._transformer.label_transformer
            if self.is_token_classification():
                if hasattr(automl._label_transformer, "label_list"):
                    state.fit_kwargs.update({"label_list": automl._label_transformer.label_list})
                elif "label_list" not in state.fit_kwargs:
                    for each_fit_kwargs in state.fit_kwargs_by_estimator.values():
                        assert "label_list" in each_fit_kwargs, (
                            "For the token-classification task, you must either (1) pass token labels; "
                            "or (2) pass id labels and the label list. Please refer to the documentation for more details: "
                            "https://microsoft.github.io/FLAML/docs/Examples/AutoML-NLP#a-simple-token-classification-example"
                        )
        automl._feature_names_in_ = (
            automl._X_train_all.columns.to_list() if hasattr(automl._X_train_all, "columns") else None
        )

        automl._sample_weight_full = state.fit_kwargs.get(
            "sample_weight"
        )  # NOTE: _validate_data is before kwargs is updated to fit_kwargs_by_estimator
        if X_val is not None and y_val is not None:
            assert isinstance(X_val, (np.ndarray, pd.DataFrame, psDataFrame)) or issparse(X_val), (
                "X_val must be None, a numpy array, a pandas dataframe, "
                "a Scipy sparse matrix or a pyspark.pandas dataframe."
            )
            assert isinstance(
                y_val, (np.ndarray, pd.Series, psSeries)
            ), "y_val must be None, a numpy array, a pandas series or a pyspark.pandas series."
            assert X_val.size != 0 and y_val.size != 0, (
                "Validation data are expected to be nonempty. Use None for X_val and y_val if no validation data."
            )
            if isinstance(y_val, np.ndarray):
                y_val = y_val.flatten()
            assert X_val.shape[0] == y_val.shape[0], "# rows in X_val must match length of y_val."
            if automl._transformer:
                state.X_val = automl._transformer.transform(X_val)
            else:
                state.X_val = X_val
            # If it's NLG_TASKS, y_val is a pandas series containing the output sequence tokens,
            # so we cannot use label_transformer.transform to process it
            if automl._label_transformer:
                state.y_val = automl._label_transformer.transform(y_val)
            else:
                state.y_val = y_val
        else:
            state.X_val = state.y_val = None

        if groups is not None and len(groups) != automl._nrow:
            # groups is given as group counts
            state.groups = np.concatenate([[i] * c for i, c in enumerate(groups)])
            assert len(state.groups) == automl._nrow, "the sum of group counts must match the number of examples"
            state.groups_val = (
                np.concatenate([[i] * c for i, c in enumerate(groups_val)]) if groups_val is not None else None
            )
        else:
            state.groups_val = groups_val
            state.groups = groups

        automl.data_size_full = len(automl._y_train_all)

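    # Illustrative example (editor's addition, not part of the upstream source):
    # when `groups` is passed as per-group counts rather than one label per row,
    # validate_data expands it to row-level labels, e.g. with 5 rows
    #   groups = [2, 3]  ->  np.concatenate([[0] * 2, [1] * 3])  ==  [0, 0, 1, 1, 1]
    # and the assert above checks that the counts sum to the number of rows.
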
    @staticmethod
    def _split_pyspark(state, X_train_all, y_train_all, split_ratio, stratify=None):
        # TODO: optimize this
        set_option("compute.ops_on_diff_frames", True)
        if not isinstance(y_train_all, (psDataFrame, psSeries)):
            raise ValueError("y_train_all must be a pyspark.pandas dataframe or series")
        df_all_in_one = X_train_all.join(y_train_all)
        stratify_column = y_train_all.name if isinstance(y_train_all, psSeries) else y_train_all.columns[0]
        ret_sample_weight = False
        if (
            "sample_weight" in state.fit_kwargs
        ):  # NOTE: _prepare_data is before kwargs is updated to fit_kwargs_by_estimator
            # fit_kwargs["sample_weight"] is a numpy array
            ps_sample_weight = ps.DataFrame(
                state.fit_kwargs["sample_weight"],
                columns=["sample_weight"],
            )
            df_all_in_one = df_all_in_one.join(ps_sample_weight)
            ret_sample_weight = True
        df_all_train, df_all_val = train_test_split_pyspark(
            df_all_in_one,
            None if stratify is None else stratify_column,
            test_fraction=split_ratio,
            seed=RANDOM_SEED,
        )
        columns_to_drop = [c for c in df_all_train.columns if c in [stratify_column, "sample_weight"]]
        X_train = df_all_train.drop(columns_to_drop)
        X_val = df_all_val.drop(columns_to_drop)
        y_train = df_all_train[stratify_column]
        y_val = df_all_val[stratify_column]

        if ret_sample_weight:
            return (
                X_train,
                X_val,
                y_train,
                y_val,
                df_all_train["sample_weight"],
                df_all_val["sample_weight"],
            )
        return X_train, X_val, y_train, y_val

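    # Illustrative note (editor's addition, not part of the upstream source):
    # _split_pyspark returns either 4 or 6 values depending on whether
    # state.fit_kwargs carries a "sample_weight"; a sketch of the calling convention:
    #   X_tr, X_va, y_tr, y_va = GenericTask._split_pyspark(state, X, y, 0.1)
    #   X_tr, X_va, y_tr, y_va, w_tr, w_va = GenericTask._split_pyspark(state, X, y, 0.1)  # with sample_weight
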
    @staticmethod
    def _train_test_split(state, X, y, first=None, rest=None, split_ratio=0.2, stratify=None):
        condition_type = isinstance(X, (psDataFrame, psSeries))
        # NOTE: _prepare_data is before kwargs is updated to fit_kwargs_by_estimator
        condition_param = "sample_weight" in state.fit_kwargs
        if not condition_type and condition_param:
            sample_weight = (
                state.fit_kwargs["sample_weight"] if rest is None else state.fit_kwargs["sample_weight"][rest]
            )
            (
                X_train,
                X_val,
                y_train,
                y_val,
                weight_train,
                weight_val,
            ) = train_test_split(
                X,
                y,
                sample_weight,
                test_size=split_ratio,
                stratify=stratify,
                random_state=RANDOM_SEED,
            )

            if first is not None:
                weight1 = state.fit_kwargs["sample_weight"][first]
                state.weight_val = concat(weight1, weight_val)
                state.fit_kwargs["sample_weight"] = concat(weight1, weight_train)
            else:
                state.weight_val = weight_val
                state.fit_kwargs["sample_weight"] = weight_train
        elif not condition_type and not condition_param:
            X_train, X_val, y_train, y_val = train_test_split(
                X,
                y,
                test_size=split_ratio,
                stratify=stratify,
                random_state=RANDOM_SEED,
            )
        elif condition_type and condition_param:
            (
                X_train,
                X_val,
                y_train,
                y_val,
                weight_train,
                weight_val,
            ) = GenericTask._split_pyspark(state, X, y, split_ratio, stratify)

            if first is not None:
                weight1 = state.fit_kwargs["sample_weight"][first]
                state.weight_val = concat(weight1, weight_val)
                state.fit_kwargs["sample_weight"] = concat(weight1, weight_train)
            else:
                state.weight_val = weight_val
                state.fit_kwargs["sample_weight"] = weight_train
        else:
            X_train, X_val, y_train, y_val = GenericTask._split_pyspark(state, X, y, split_ratio, stratify)
        return X_train, X_val, y_train, y_val

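    # Illustrative note (editor's addition, not part of the upstream source):
    # _train_test_split picks one of four code paths based on
    # (is pyspark data, has sample_weight):
    #   pandas/numpy + weights   -> sklearn train_test_split with the weight array
    #   pandas/numpy, no weights -> sklearn train_test_split
    #   pyspark + weights        -> _split_pyspark returning weights
    #   pyspark, no weights      -> _split_pyspark
    # `first`/`rest` are row indices used by prepare_data to keep one row per class
    # out of the split and to re-attach its sample weight afterwards.
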
    def prepare_data(
        self,
        state,
        X_train_all,
        y_train_all,
        auto_augment,
        eval_method,
        split_type,
        split_ratio,
        n_splits,
        data_is_df,
        sample_weight_full,
    ) -> int:
        X_val, y_val = state.X_val, state.y_val
        if issparse(X_val):
            X_val = X_val.tocsr()
        if issparse(X_train_all):
            X_train_all = X_train_all.tocsr()
        is_spark_dataframe = isinstance(X_train_all, (psDataFrame, psSeries))
        self.is_spark_dataframe = is_spark_dataframe
        if (
            self.is_classification()
            and auto_augment
            and state.fit_kwargs.get("sample_weight")
            is None  # NOTE: _prepare_data is before kwargs is updated to fit_kwargs_by_estimator
            and split_type in ["stratified", "uniform"]
            and not self.is_token_classification()
        ):
            # logger.info(f"label {pd.unique(y_train_all)}")
            if is_spark_dataframe:
                label_set, counts = unique_pandas_on_spark(y_train_all)
                # TODO: optimize this
                set_option("compute.ops_on_diff_frames", True)
            else:
                label_set, counts = np.unique(y_train_all, return_counts=True)
            # augment rare classes
            rare_threshold = 20
            rare = counts < rare_threshold
            rare_label, rare_counts = label_set[rare], counts[rare]
            for i, label in enumerate(rare_label.tolist()):
                count = rare_count = rare_counts[i]
                rare_index = y_train_all == label
                n = len(y_train_all)
                while count < rare_threshold:
                    if data_is_df:
                        X_train_all = concat(X_train_all, X_train_all.iloc[:n].loc[rare_index])
                    else:
                        X_train_all = concat(X_train_all, X_train_all[:n][rare_index, :])
                    if isinstance(y_train_all, (pd.Series, psSeries)):
                        y_train_all = concat(y_train_all, y_train_all.iloc[:n].loc[rare_index])
                    else:
                        y_train_all = np.concatenate([y_train_all, y_train_all[:n][rare_index]])
                    count += rare_count
                logger.info(f"class {label} augmented from {rare_count} to {count}")
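        # Illustrative example (editor's addition, not part of the upstream source):
        # with the threshold of 20, a class that appears 7 times is duplicated
        # whole until it reaches the threshold: 7 -> 14 -> 21, so it ends up with
        # 21 rows in the augmented training data.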
        SHUFFLE_SPLIT_TYPES = ["uniform", "stratified"]
        if is_spark_dataframe:
            # no need to shuffle pyspark dataframe
            pass
        elif split_type in SHUFFLE_SPLIT_TYPES:
            if sample_weight_full is not None:
                X_train_all, y_train_all, state.sample_weight_all = shuffle(
                    X_train_all,
                    y_train_all,
                    sample_weight_full,
                    random_state=RANDOM_SEED,
                )
                state.fit_kwargs[
                    "sample_weight"
                ] = state.sample_weight_all  # NOTE: _prepare_data is before kwargs is updated to fit_kwargs_by_estimator
                if isinstance(state.sample_weight_all, pd.Series):
                    state.sample_weight_all.reset_index(drop=True, inplace=True)
            else:
                X_train_all, y_train_all = shuffle(X_train_all, y_train_all, random_state=RANDOM_SEED)
            if data_is_df:
                X_train_all.reset_index(drop=True, inplace=True)
                if isinstance(y_train_all, pd.Series):
                    y_train_all.reset_index(drop=True, inplace=True)

        X_train, y_train = X_train_all, y_train_all
        state.groups_all = state.groups
        if X_val is None and eval_method == "holdout":
            if split_type == "time":
                assert not self.is_ts_forecast(), "For a TS forecast task, this code should never be called"

                is_sample_weight = "sample_weight" in state.fit_kwargs
                if not is_spark_dataframe and is_sample_weight:
                    (
                        X_train,
                        X_val,
                        y_train,
                        y_val,
                        state.fit_kwargs[
                            "sample_weight"
                        ],  # NOTE: _prepare_data is before kwargs is updated to fit_kwargs_by_estimator
                        state.weight_val,
                    ) = train_test_split(
                        X_train_all,
                        y_train_all,
                        state.fit_kwargs[
                            "sample_weight"
                        ],  # NOTE: _prepare_data is before kwargs is updated to fit_kwargs_by_estimator
                        test_size=split_ratio,
                        shuffle=False,
                    )
                elif not is_spark_dataframe and not is_sample_weight:
                    X_train, X_val, y_train, y_val = train_test_split(
                        X_train_all,
                        y_train_all,
                        test_size=split_ratio,
                        shuffle=False,
                    )
                elif is_spark_dataframe and is_sample_weight:
                    (
                        X_train,
                        X_val,
                        y_train,
                        y_val,
                        state.fit_kwargs[
                            "sample_weight"
                        ],  # NOTE: _prepare_data is before kwargs is updated to fit_kwargs_by_estimator
                        state.weight_val,
                    ) = self._split_pyspark(state, X_train_all, y_train_all, split_ratio)
                else:
                    X_train, X_val, y_train, y_val = self._split_pyspark(state, X_train_all, y_train_all, split_ratio)
            elif split_type == "group":
                gss = GroupShuffleSplit(n_splits=1, test_size=split_ratio, random_state=RANDOM_SEED)
                for train_idx, val_idx in gss.split(X_train_all, y_train_all, state.groups_all):
                    if data_is_df:
                        X_train = X_train_all.iloc[train_idx]
                        X_val = X_train_all.iloc[val_idx]
                    else:
                        X_train, X_val = X_train_all[train_idx], X_train_all[val_idx]
                    y_train, y_val = y_train_all[train_idx], y_train_all[val_idx]
                    state.groups = state.groups_all[train_idx]
                    state.groups_val = state.groups_all[val_idx]
            elif self.is_classification():
                # for classification, make sure the labels are complete in both
                # training and validation data
                label_set, first = unique_value_first_index(y_train_all)
                rest = []
                last = 0
                first.sort()
                for i in range(len(first)):
                    rest.extend(range(last, first[i]))
                    last = first[i] + 1
                rest.extend(range(last, len(y_train_all)))
                X_first = X_train_all.iloc[first] if data_is_df else X_train_all[first]
                X_rest = X_train_all.iloc[rest] if data_is_df else X_train_all[rest]
                y_rest = (
                    y_train_all[rest]
                    if isinstance(y_train_all, np.ndarray)
                    else iloc_pandas_on_spark(y_train_all, rest)
                    if is_spark_dataframe
                    else y_train_all.iloc[rest]
                )
                stratify = y_rest if split_type == "stratified" else None
                X_train, X_val, y_train, y_val = self._train_test_split(
                    state, X_rest, y_rest, first, rest, split_ratio, stratify
                )
                X_train = concat(X_first, X_train)
                y_train = concat(label_set, y_train) if data_is_df else np.concatenate([label_set, y_train])
                X_val = concat(X_first, X_val)
                y_val = concat(label_set, y_val) if data_is_df else np.concatenate([label_set, y_val])
            elif self.is_regression():
                X_train, X_val, y_train, y_val = self._train_test_split(
                    state, X_train_all, y_train_all, split_ratio=split_ratio
                )
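            # Illustrative note (editor's addition, not part of the upstream source):
            # in the classification branch above, `first` holds the index of the
            # first occurrence of every label and `rest` the remaining rows; only
            # `rest` is split, and the `first` rows (one per label, i.e. `label_set`)
            # are then concatenated into both the training and the validation side
            # so that every class is present in each.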
            state.data_size = X_train.shape
        state.data_size_full = len(y_train_all)
        state.X_train, state.y_train = X_train, y_train
        state.X_val, state.y_val = X_val, y_val
        state.X_train_all = X_train_all
        state.y_train_all = y_train_all
        y_train_all_size = y_train_all.size
        if eval_method == "holdout":
            state.kf = None
            return
        if split_type == "group":
            # logger.info("Using GroupKFold")
            assert len(state.groups_all) == y_train_all_size, "the length of groups must match the number of examples"
            assert (
                len_labels(state.groups_all) >= n_splits
            ), "the number of groups must be equal or larger than n_splits"
            state.kf = GroupKFold(n_splits)
        elif split_type == "stratified":
            # logger.info("Using StratifiedKFold")
            assert y_train_all_size >= n_splits, (
                f"{n_splits}-fold cross validation requires input data with at least {n_splits} examples."
            )
            assert y_train_all_size >= 2 * n_splits, (
                f"{n_splits}-fold cross validation with metric=r2 "
                f"requires input data with at least {n_splits * 2} examples."
            )
            state.kf = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=1, random_state=RANDOM_SEED)
        elif split_type == "time":
            # logger.info("Using TimeSeriesSplit")
            if self.is_ts_forecast() and not self.is_ts_forecastpanel():
                period = state.fit_kwargs[
                    "period"
                ]  # NOTE: _prepare_data is before kwargs is updated to fit_kwargs_by_estimator
                if period * (n_splits + 1) > y_train_all_size:
                    n_splits = int(y_train_all_size / period - 1)
                    assert n_splits >= 2, (
                        f"cross validation for forecasting period={period}"
                        f" requires input data with at least {3 * period} examples."
                    )
                    logger.info(f"Using nsplits={n_splits} due to data size limit.")
                state.kf = TimeSeriesSplit(n_splits=n_splits, test_size=period)
            elif self.is_ts_forecastpanel():
                n_groups = len(X_train.groupby(state.fit_kwargs.get("group_ids")).size())
                period = state.fit_kwargs.get("period")
                state.kf = TimeSeriesSplit(n_splits=n_splits, test_size=period * n_groups)
            else:
                state.kf = TimeSeriesSplit(n_splits=n_splits)
            # state.kf = TimeSeriesSplit(n_splits=n_splits)
        elif isinstance(split_type, str):
            # logger.info("Using RepeatedKFold")
            state.kf = RepeatedKFold(n_splits=n_splits, n_repeats=1, random_state=RANDOM_SEED)
        else:
            # logger.info("Using splitter object")
            state.kf = split_type
        if isinstance(state.kf, (GroupKFold, StratifiedGroupKFold)):
            # self._split_type is either "group", a GroupKFold object, or a StratifiedGroupKFold object
            state.kf.groups = state.groups_all

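    # Illustrative example (editor's addition, not part of the upstream source):
    # for a univariate forecast task with period=12 and 50 training rows,
    # 12 * (5 + 1) > 50, so a requested n_splits=5 is reduced to
    # int(50 / 12 - 1) == 3 before building TimeSeriesSplit(n_splits=3, test_size=12).
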
    def decide_split_type(
        self,
        split_type,
        y_train_all,
        fit_kwargs,
        groups=None,
    ) -> str:
        assert not self.is_ts_forecast(), "This function should never be called as part of a time-series task."
        if self.name == "classification":
            self.name = get_classification_objective(len_labels(y_train_all))
        if not isinstance(split_type, str):
            assert hasattr(split_type, "split") and hasattr(
                split_type, "get_n_splits"
            ), "split_type must be a string or a splitter object with split and get_n_splits methods."
            assert (
                not isinstance(split_type, GroupKFold) or groups is not None
            ), "GroupKFold requires groups to be provided."
            return split_type

        elif self.is_classification():
            assert split_type in ["auto", "stratified", "uniform", "time", "group"]
            return split_type if split_type != "auto" else ("stratified" if groups is None else "group")

        elif self.is_regression():
            assert split_type in ["auto", "uniform", "time", "group"]
            return split_type if split_type != "auto" else "uniform"

        elif self.is_rank():
            assert groups is not None, "groups must be specified for ranking task."
            assert split_type in ["auto", "group"]
            return "group"

        elif self.is_nlg():
            assert split_type in ["auto", "uniform", "time", "group"]
            return split_type if split_type != "auto" else "uniform"

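    # Illustrative note (editor's addition, not part of the upstream source):
    # with split_type="auto", classification resolves to "stratified" (or "group"
    # when groups are given), while regression and NLG tasks resolve to "uniform";
    # ranking always uses "group".
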
    def preprocess(self, X, transformer=None):
        if isinstance(X, List):
            try:
                if isinstance(X[0], List):
                    X = [x for x in zip(*X)]
                X = pd.DataFrame(
                    dict(
                        [
                            (transformer._str_columns[idx], X[idx])
                            if isinstance(X[0], List)
                            else (transformer._str_columns[idx], [X[idx]])
                            for idx in range(len(X))
                        ]
                    )
                )
            except IndexError:
                raise IndexError("Test data contains more columns than training data, exiting")
        elif isinstance(X, int):
            return X
        elif isinstance(X, psDataFrame):
            return X
        elif issparse(X):
            X = X.tocsr()
        if self.is_ts_forecast():
            X = pd.DataFrame(X)
        if transformer:
            X = transformer.transform(X)
        return X

    def evaluate_model_CV(
        self,
        config: dict,
        estimator: EstimatorSubclass,
        X_train_all,
        y_train_all,
        budget,
        kf,
        eval_metric,
        best_val_loss,
        cv_score_agg_func=None,
        log_training_metric=False,
        fit_kwargs: Optional[dict] = None,
        free_mem_ratio=0,
    ):
        if fit_kwargs is None:
            fit_kwargs = {}
        if cv_score_agg_func is None:
            cv_score_agg_func = default_cv_score_agg_func
        start_time = time.time()
        val_loss_folds = []
        log_metric_folds = []
        metric = None
        train_time = pred_time = 0
        total_fold_num = 0
        n = kf.get_n_splits()
        rng = np.random.RandomState(2020)
        budget_per_train = budget and budget / n
        groups = None
        if self.is_classification():
            _, labels = len_labels(y_train_all, return_labels=True)
        else:
            labels = fit_kwargs.get("label_list")  # pass the label list on to compute the evaluation metric
        if "sample_weight" in fit_kwargs:
            weight = fit_kwargs["sample_weight"]
            weight_val = None
        else:
            weight = weight_val = None

        is_spark_dataframe = isinstance(X_train_all, (psDataFrame, psSeries))
        if is_spark_dataframe:
            dataframe = X_train_all.join(y_train_all)
            if weight is not None:
                dataframe = dataframe.join(weight)
            if isinstance(kf, (GroupKFold, StratifiedGroupKFold)):
                groups = kf.groups
                dataframe = dataframe.join(groups)
            kf = spark_kFold(dataframe, nFolds=n, foldCol=groups.name if groups is not None else "")
            shuffle = False
        else:
            X_train_split, y_train_split = X_train_all, y_train_all
            shuffle = getattr(kf, "shuffle", not self.is_ts_forecast())
            if isinstance(kf, RepeatedStratifiedKFold):
                kf = kf.split(X_train_split, y_train_split)
            elif isinstance(kf, (GroupKFold, StratifiedGroupKFold)):
                groups = kf.groups
                kf = kf.split(X_train_split, y_train_split, groups)
                shuffle = False
            elif isinstance(kf, TimeSeriesSplit):
                kf = kf.split(X_train_split, y_train_split)
            else:
                kf = kf.split(X_train_split)

        for train_index, val_index in kf:
            if shuffle:
                train_index = rng.permutation(train_index)
            if is_spark_dataframe:
                # cache data to increase compute speed
                X_train = train_index.spark.cache()
                X_val = val_index.spark.cache()
                y_train = X_train.pop(y_train_all.name)
                y_val = X_val.pop(y_train_all.name)
                if weight is not None:
                    weight_val = X_val.pop(weight.name)
                    fit_kwargs["sample_weight"] = X_train.pop(weight.name)
                groups_val = None
            elif isinstance(X_train_all, pd.DataFrame):
                X_train = X_train_split.iloc[train_index]
                X_val = X_train_split.iloc[val_index]
            else:
                X_train, X_val = X_train_split[train_index], X_train_split[val_index]
            if not is_spark_dataframe:
                y_train, y_val = y_train_split[train_index], y_train_split[val_index]
                if weight is not None:
                    fit_kwargs["sample_weight"], weight_val = (
                        weight[train_index],
                        weight[val_index],
                    )
                if groups is not None:
                    fit_kwargs["groups"] = (
                        groups[train_index] if isinstance(groups, np.ndarray) else groups.iloc[train_index]
                    )
                    groups_val = groups[val_index] if isinstance(groups, np.ndarray) else groups.iloc[val_index]
                else:
                    groups_val = None

            estimator.cleanup()
            val_loss_i, metric_i, train_time_i, pred_time_i = get_val_loss(
                config,
                estimator,
                X_train,
                y_train,
                X_val,
                y_val,
                weight_val,
                groups_val,
                eval_metric,
                self,
                labels,
                budget_per_train,
                log_training_metric=log_training_metric,
                fit_kwargs=fit_kwargs,
                free_mem_ratio=free_mem_ratio,
            )
            if isinstance(metric_i, dict) and "intermediate_results" in metric_i.keys():
                del metric_i["intermediate_results"]
            if weight is not None:
                fit_kwargs["sample_weight"] = weight
            total_fold_num += 1
            val_loss_folds.append(val_loss_i)
            log_metric_folds.append(metric_i)
            train_time += train_time_i
            pred_time += pred_time_i
            if is_spark_dataframe:
                X_train.spark.unpersist()  # uncache data to free memory
                X_val.spark.unpersist()  # uncache data to free memory
            if budget and time.time() - start_time >= budget:
                break
        val_loss, metric = cv_score_agg_func(val_loss_folds, log_metric_folds)
        n = total_fold_num
        pred_time /= n
        return val_loss, metric, train_time, pred_time

    def default_estimator_list(self, estimator_list: List[str], is_spark_dataframe: bool = False) -> List[str]:
        if "auto" != estimator_list:
            n_estimators = len(estimator_list)
            if is_spark_dataframe:
                # For spark dataframe, only estimators ending with '_spark' are supported
                estimator_list = [est for est in estimator_list if est.endswith("_spark")]
                if len(estimator_list) == 0:
                    raise ValueError(
                        "Spark dataframes only support estimator names ending with `_spark`. Non-supported "
                        "estimators are removed. No estimator is left."
                    )
                elif n_estimators != len(estimator_list):
                    logger.warning(
                        "Spark dataframes only support estimator names ending with `_spark`. Non-supported "
                        "estimators are removed."
                    )
            else:
                # For non-spark dataframe, only estimators not ending with '_spark' are supported
                estimator_list = [est for est in estimator_list if not est.endswith("_spark")]
                if len(estimator_list) == 0:
                    raise ValueError(
                        "Non-spark dataframes only support estimator names not ending with `_spark`. Non-supported "
                        "estimators are removed. No estimator is left."
                    )
                elif n_estimators != len(estimator_list):
                    logger.warning(
                        "Non-spark dataframes only support estimator names not ending with `_spark`. Non-supported "
                        "estimators are removed."
                    )
            return estimator_list
        if self.is_rank():
            estimator_list = ["lgbm", "xgboost", "xgb_limitdepth", "lgbm_spark"]
        elif self.is_nlp():
            estimator_list = ["transformer"]
        elif self.is_ts_forecastpanel():
            estimator_list = ["tft"]
        else:
            try:
                import catboost

                estimator_list = [
                    "lgbm",
                    "rf",
                    "catboost",
                    "xgboost",
                    "extra_tree",
                    "xgb_limitdepth",
                    "lgbm_spark",
                ]
            except ImportError:
                estimator_list = [
                    "lgbm",
                    "rf",
                    "xgboost",
                    "extra_tree",
                    "xgb_limitdepth",
                    "lgbm_spark",
                ]
            # if self.is_ts_forecast():
            #     # catboost is removed because it has a `name` parameter, making it incompatible with hcrystalball
            #     if "catboost" in estimator_list:
            #         estimator_list.remove("catboost")
            #     if self.is_ts_forecastregression():
            #         try:
            #             import prophet
            #
            #             estimator_list += [
            #                 "prophet",
            #                 "arima",
            #                 "sarimax",
            #                 "holt-winters",
            #             ]
            #         except ImportError:
            #             estimator_list += ["arima", "sarimax", "holt-winters"]
            if not self.is_regression():
                estimator_list += ["lrl1"]

        estimator_list = [
            est
            for est in estimator_list
            if (est.endswith("_spark") if is_spark_dataframe else not est.endswith("_spark"))
        ]
        return estimator_list

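    # Illustrative example (editor's addition, not part of the upstream source):
    # with the default list and a pandas input, every name ending in "_spark" is
    # filtered out by the final list comprehension, e.g.
    #   ["lgbm", "rf", "xgboost", "lgbm_spark"] -> ["lgbm", "rf", "xgboost"]
    # whereas a pyspark.pandas input keeps only ["lgbm_spark"].
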
    def default_metric(self, metric: str) -> str:
        if "auto" != metric:
            return metric

        if self.is_nlp():
            from flaml.automl.nlp.utils import (
                load_default_huggingface_metric_for_task,
            )

            return load_default_huggingface_metric_for_task(self.name)
        elif self.is_binary():
            return "roc_auc"
        elif self.is_multiclass():
            return "log_loss"
        elif self.is_ts_forecast():
            return "mape"
        elif self.is_rank():
            return "ndcg"
        else:
            return "r2"

    @staticmethod
    def prepare_sample_train_data(automlstate, sample_size):
        return automlstate.prepare_sample_train_data(sample_size)