import logging
import time
from typing import List, Optional
import numpy as np
from flaml.automl.data import TS_TIMESTAMP_COL, concat
from flaml.automl.ml import EstimatorSubclass, get_val_loss, default_cv_score_agg_func

from flaml.automl.task.task import (
    Task,
    get_classification_objective,
    TS_FORECAST,
    TS_FORECASTPANEL,
)
from flaml.config import RANDOM_SEED
from flaml.automl.spark import ps, psDataFrame, psSeries, pd
from flaml.automl.spark.utils import (
    iloc_pandas_on_spark,
    spark_kFold,
    train_test_split_pyspark,
    unique_pandas_on_spark,
    unique_value_first_index,
    len_labels,
    set_option,
)

try:
    from scipy.sparse import issparse
except ImportError:
    pass
try:
    from sklearn.utils import shuffle
    from sklearn.model_selection import (
        train_test_split,
        RepeatedStratifiedKFold,
        RepeatedKFold,
        GroupKFold,
        TimeSeriesSplit,
        GroupShuffleSplit,
        StratifiedGroupKFold,
    )
except ImportError:
    pass

logger = logging.getLogger(__name__)


class GenericTask(Task):
    @property
    def estimators(self):
        if self._estimators is None:
            # put this into a function to avoid circular dependency
            from flaml.automl.model import (
                XGBoostSklearnEstimator,
                XGBoostLimitDepthEstimator,
                RandomForestEstimator,
                LGBMEstimator,
                LRL1Classifier,
                LRL2Classifier,
                CatBoostEstimator,
                ExtraTreesEstimator,
                KNeighborsEstimator,
                TransformersEstimator,
                TransformersEstimatorModelSelection,
                SparkLGBMEstimator,
            )

            self._estimators = {
                "xgboost": XGBoostSklearnEstimator,
                "xgb_limitdepth": XGBoostLimitDepthEstimator,
                "rf": RandomForestEstimator,
                "lgbm": LGBMEstimator,
                "lgbm_spark": SparkLGBMEstimator,
                "lrl1": LRL1Classifier,
                "lrl2": LRL2Classifier,
                "catboost": CatBoostEstimator,
                "extra_tree": ExtraTreesEstimator,
                "kneighbor": KNeighborsEstimator,
                "transformer": TransformersEstimator,
                "transformer_ms": TransformersEstimatorModelSelection,
            }
        return self._estimators

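    # Illustrative note (editor's addition, not part of the upstream source): the
    # property above is a lazily-built name -> class registry, so an estimator
    # class can be resolved from its short name, e.g.
    #   estimator_cls = task.estimators["lgbm"]   # -> LGBMEstimator
    # where `task` is assumed to be an already-constructed GenericTask instance.
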
    def validate_data(
        self,
        automl,
        state,
        X_train_all,
        y_train_all,
        dataframe,
        label,
        X_val=None,
        y_val=None,
        groups_val=None,
        groups=None,
    ):
        if X_train_all is not None and y_train_all is not None:
            assert isinstance(X_train_all, (np.ndarray, pd.DataFrame, psDataFrame)) or issparse(X_train_all), (
                "X_train_all must be a numpy array, a pandas dataframe, "
                "a Scipy sparse matrix or a pyspark.pandas dataframe."
            )
            assert isinstance(
                y_train_all, (np.ndarray, pd.Series, psSeries)
            ), "y_train_all must be a numpy array, a pandas series or a pyspark.pandas series."
            assert X_train_all.size != 0 and y_train_all.size != 0, "Input data must not be empty."
            if isinstance(X_train_all, np.ndarray) and len(X_train_all.shape) == 1:
                X_train_all = np.reshape(X_train_all, (X_train_all.size, 1))
            if isinstance(y_train_all, np.ndarray):
                y_train_all = y_train_all.flatten()
            assert X_train_all.shape[0] == y_train_all.shape[0], "# rows in X_train must match length of y_train."
            if isinstance(X_train_all, psDataFrame):
                X_train_all = X_train_all.spark.cache()  # cache data to improve compute speed
                y_train_all = y_train_all.to_frame().spark.cache()[y_train_all.name]
                logger.debug(f"X_train_all and y_train_all cached, shape of X_train_all: {X_train_all.shape}")
            automl._df = isinstance(X_train_all, (pd.DataFrame, psDataFrame))
            automl._nrow, automl._ndim = X_train_all.shape
            if self.is_ts_forecast():
                X_train_all = pd.DataFrame(X_train_all) if isinstance(X_train_all, np.ndarray) else X_train_all
                X_train_all, y_train_all = self._validate_ts_data(X_train_all, y_train_all)
            X, y = X_train_all, y_train_all
        elif dataframe is not None and label is not None:
            assert isinstance(
                dataframe, (pd.DataFrame, psDataFrame)
            ), "dataframe must be a pandas DataFrame or a pyspark.pandas DataFrame."
            assert (
                label in dataframe.columns
            ), f"The provided label column name `{label}` doesn't exist in the provided dataframe."
            if isinstance(dataframe, psDataFrame):
                dataframe = dataframe.spark.cache()  # cache data to improve compute speed
                logger.debug(f"dataframe cached, shape of dataframe: {dataframe.shape}")
            automl._df = True
            if self.is_ts_forecast():
                dataframe = self._validate_ts_data(dataframe)
            # TODO: to support pyspark.sql.DataFrame and pure dataframe mode
            X = dataframe.drop(columns=label)
            automl._nrow, automl._ndim = X.shape
            y = dataframe[label]
        else:
            raise ValueError("either X_train+y_train or dataframe+label are required")

        # check the validity of input dimensions for NLP tasks, so need to check _is_nlp_task not estimator
        if self.is_nlp():
            from flaml.automl.nlp.utils import is_a_list_of_str

            is_all_str = True
            is_all_list = True
            for column in X.columns:
                assert X[column].dtype.name in (
                    "object",
                    "string",
                ), "If the task is an NLP task, X can only contain text columns"
                for _, each_cell in X[column].items():
                    if each_cell is not None:
                        is_str = isinstance(each_cell, str)
                        is_list_of_int = isinstance(each_cell, list) and all(isinstance(x, int) for x in each_cell)
                        is_list_of_str = is_a_list_of_str(each_cell)
                        if self.is_token_classification():
                            assert is_list_of_str, (
                                "For the token-classification task, the input column needs to be a list of string, "
                                "instead of string, e.g., ['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']. "
                                "For more examples, please refer to test/nlp/test_autohf_tokenclassification.py"
                            )
                        else:
                            assert is_str or is_list_of_int, (
                                "Each column of the input must either be str (untokenized) "
                                "or a list of integers (tokenized)"
                            )
                        is_all_str &= is_str
                        is_all_list &= is_list_of_int or is_list_of_str
            assert is_all_str or is_all_list, (
                "Currently FLAML only supports two modes for NLP: either all columns of X are string (non-tokenized), "
                "or all columns of X are integer ids (tokenized)"
            )
        if isinstance(X, psDataFrame):
            # TODO: support pyspark.pandas dataframe in DataTransformer
            automl._skip_transform = True
        if automl._skip_transform or issparse(X_train_all):
            automl._transformer = automl._label_transformer = False
            automl._X_train_all, automl._y_train_all = X, y
        else:
            from flaml.automl.data import DataTransformer

            automl._transformer = DataTransformer()

            (
                automl._X_train_all,
                automl._y_train_all,
            ) = automl._transformer.fit_transform(X, y, self)
            automl._label_transformer = automl._transformer.label_transformer
            if self.is_token_classification():
                if hasattr(automl._label_transformer, "label_list"):
                    state.fit_kwargs.update({"label_list": automl._label_transformer.label_list})
                elif "label_list" not in state.fit_kwargs:
                    for each_fit_kwargs in state.fit_kwargs_by_estimator.values():
                        assert "label_list" in each_fit_kwargs, (
                            "For the token-classification task, you must either (1) pass token labels; "
                            "or (2) pass id labels and the label list. Please refer to the documentation for more details: "
                            "https://microsoft.github.io/FLAML/docs/Examples/AutoML-NLP#a-simple-token-classification-example"
                        )
        automl._feature_names_in_ = (
            automl._X_train_all.columns.to_list() if hasattr(automl._X_train_all, "columns") else None
        )

        automl._sample_weight_full = state.fit_kwargs.get(
            "sample_weight"
        )  # NOTE: _validate_data is before kwargs is updated to fit_kwargs_by_estimator
        if X_val is not None and y_val is not None:
            assert isinstance(X_val, (np.ndarray, pd.DataFrame, psDataFrame)) or issparse(X_val), (
                "X_val must be None, a numpy array, a pandas dataframe, "
                "a Scipy sparse matrix or a pyspark.pandas dataframe."
            )
            assert isinstance(
                y_val, (np.ndarray, pd.Series, psSeries)
            ), "y_val must be None, a numpy array, a pandas series or a pyspark.pandas series."
            assert X_val.size != 0 and y_val.size != 0, (
                "Validation data are expected to be nonempty. Use None for X_val and y_val if no validation data."
            )
            if isinstance(y_val, np.ndarray):
                y_val = y_val.flatten()
            assert X_val.shape[0] == y_val.shape[0], "# rows in X_val must match length of y_val."
            if automl._transformer:
                state.X_val = automl._transformer.transform(X_val)
            else:
                state.X_val = X_val
            # If it's NLG_TASKS, y_val is a pandas series containing the output sequence tokens,
            # so we cannot use label_transformer.transform to process it
            if automl._label_transformer:
                state.y_val = automl._label_transformer.transform(y_val)
            else:
                state.y_val = y_val
        else:
            state.X_val = state.y_val = None

        if groups is not None and len(groups) != automl._nrow:
            # groups is given as group counts
            state.groups = np.concatenate([[i] * c for i, c in enumerate(groups)])
            assert len(state.groups) == automl._nrow, "the sum of group counts must match the number of examples"
            state.groups_val = (
                np.concatenate([[i] * c for i, c in enumerate(groups_val)]) if groups_val is not None else None
            )
        else:
            state.groups_val = groups_val
            state.groups = groups

        automl.data_size_full = len(automl._y_train_all)

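    # Illustrative example (editor's addition, not part of the upstream source):
    # when `groups` is passed as per-group counts rather than one label per row,
    # validate_data expands it to row-level labels, e.g. with 5 rows
    #   groups = [2, 3]  ->  np.concatenate([[0] * 2, [1] * 3])  ==  [0, 0, 1, 1, 1]
    # and the assert above checks that the counts sum to the number of rows.
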
    @staticmethod
    def _split_pyspark(state, X_train_all, y_train_all, split_ratio, stratify=None):
        # TODO: optimize this
        set_option("compute.ops_on_diff_frames", True)
        if not isinstance(y_train_all, (psDataFrame, psSeries)):
            raise ValueError("y_train_all must be a pyspark.pandas dataframe or series")
        df_all_in_one = X_train_all.join(y_train_all)
        stratify_column = y_train_all.name if isinstance(y_train_all, psSeries) else y_train_all.columns[0]
        ret_sample_weight = False
        if (
            "sample_weight" in state.fit_kwargs
        ):  # NOTE: _prepare_data is before kwargs is updated to fit_kwargs_by_estimator
            # fit_kwargs["sample_weight"] is a numpy array
            ps_sample_weight = ps.DataFrame(
                state.fit_kwargs["sample_weight"],
                columns=["sample_weight"],
            )
            df_all_in_one = df_all_in_one.join(ps_sample_weight)
            ret_sample_weight = True
        df_all_train, df_all_val = train_test_split_pyspark(
            df_all_in_one,
            None if stratify is None else stratify_column,
            test_fraction=split_ratio,
            seed=RANDOM_SEED,
        )
        columns_to_drop = [c for c in df_all_train.columns if c in [stratify_column, "sample_weight"]]
        X_train = df_all_train.drop(columns_to_drop)
        X_val = df_all_val.drop(columns_to_drop)
        y_train = df_all_train[stratify_column]
        y_val = df_all_val[stratify_column]

        if ret_sample_weight:
            return (
                X_train,
                X_val,
                y_train,
                y_val,
                df_all_train["sample_weight"],
                df_all_val["sample_weight"],
            )
        return X_train, X_val, y_train, y_val

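    # Illustrative note (editor's addition, not part of the upstream source):
    # _split_pyspark returns either 4 or 6 values depending on whether
    # state.fit_kwargs carries a "sample_weight"; a sketch of the calling convention:
    #   X_tr, X_va, y_tr, y_va = GenericTask._split_pyspark(state, X, y, 0.1)
    #   X_tr, X_va, y_tr, y_va, w_tr, w_va = GenericTask._split_pyspark(state, X, y, 0.1)  # with sample_weight
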
    @staticmethod
    def _train_test_split(state, X, y, first=None, rest=None, split_ratio=0.2, stratify=None):
        condition_type = isinstance(X, (psDataFrame, psSeries))
        # NOTE: _prepare_data is before kwargs is updated to fit_kwargs_by_estimator
        condition_param = "sample_weight" in state.fit_kwargs
        if not condition_type and condition_param:
            sample_weight = (
                state.fit_kwargs["sample_weight"] if rest is None else state.fit_kwargs["sample_weight"][rest]
            )
            (
                X_train,
                X_val,
                y_train,
                y_val,
                weight_train,
                weight_val,
            ) = train_test_split(
                X,
                y,
                sample_weight,
                test_size=split_ratio,
                stratify=stratify,
                random_state=RANDOM_SEED,
            )

            if first is not None:
                weight1 = state.fit_kwargs["sample_weight"][first]
                state.weight_val = concat(weight1, weight_val)
                state.fit_kwargs["sample_weight"] = concat(weight1, weight_train)
            else:
                state.weight_val = weight_val
                state.fit_kwargs["sample_weight"] = weight_train
        elif not condition_type and not condition_param:
            X_train, X_val, y_train, y_val = train_test_split(
                X,
                y,
                test_size=split_ratio,
                stratify=stratify,
                random_state=RANDOM_SEED,
            )
        elif condition_type and condition_param:
            (
                X_train,
                X_val,
                y_train,
                y_val,
                weight_train,
                weight_val,
            ) = GenericTask._split_pyspark(state, X, y, split_ratio, stratify)

            if first is not None:
                weight1 = state.fit_kwargs["sample_weight"][first]
                state.weight_val = concat(weight1, weight_val)
                state.fit_kwargs["sample_weight"] = concat(weight1, weight_train)
            else:
                state.weight_val = weight_val
                state.fit_kwargs["sample_weight"] = weight_train
        else:
            X_train, X_val, y_train, y_val = GenericTask._split_pyspark(state, X, y, split_ratio, stratify)
        return X_train, X_val, y_train, y_val

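    # Illustrative note (editor's addition, not part of the upstream source):
    # _train_test_split picks one of four code paths based on
    # (is pyspark data, has sample_weight):
    #   pandas/numpy + weights   -> sklearn train_test_split with the weight array
    #   pandas/numpy, no weights -> sklearn train_test_split
    #   pyspark + weights        -> _split_pyspark returning weights
    #   pyspark, no weights      -> _split_pyspark
    # `first`/`rest` are row indices used by prepare_data to keep one row per class
    # out of the split and to re-attach its sample weight afterwards.
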
    def prepare_data(
        self,
        state,
        X_train_all,
        y_train_all,
        auto_augment,
        eval_method,
        split_type,
        split_ratio,
        n_splits,
        data_is_df,
        sample_weight_full,
    ) -> int:
        X_val, y_val = state.X_val, state.y_val
        if issparse(X_val):
            X_val = X_val.tocsr()
        if issparse(X_train_all):
            X_train_all = X_train_all.tocsr()
        is_spark_dataframe = isinstance(X_train_all, (psDataFrame, psSeries))
        self.is_spark_dataframe = is_spark_dataframe
        if (
            self.is_classification()
            and auto_augment
            and state.fit_kwargs.get("sample_weight")
            is None  # NOTE: _prepare_data is before kwargs is updated to fit_kwargs_by_estimator
            and split_type in ["stratified", "uniform"]
            and not self.is_token_classification()
        ):
            # logger.info(f"label {pd.unique(y_train_all)}")
            if is_spark_dataframe:
                label_set, counts = unique_pandas_on_spark(y_train_all)
                # TODO: optimize this
                set_option("compute.ops_on_diff_frames", True)
            else:
                label_set, counts = np.unique(y_train_all, return_counts=True)
            # augment rare classes
            rare_threshold = 20
            rare = counts < rare_threshold
            rare_label, rare_counts = label_set[rare], counts[rare]
            for i, label in enumerate(rare_label.tolist()):
                count = rare_count = rare_counts[i]
                rare_index = y_train_all == label
                n = len(y_train_all)
                while count < rare_threshold:
                    if data_is_df:
                        X_train_all = concat(X_train_all, X_train_all.iloc[:n].loc[rare_index])
                    else:
                        X_train_all = concat(X_train_all, X_train_all[:n][rare_index, :])
                    if isinstance(y_train_all, (pd.Series, psSeries)):
                        y_train_all = concat(y_train_all, y_train_all.iloc[:n].loc[rare_index])
                    else:
                        y_train_all = np.concatenate([y_train_all, y_train_all[:n][rare_index]])
                    count += rare_count
                logger.info(f"class {label} augmented from {rare_count} to {count}")
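        # Illustrative example (editor's addition, not part of the upstream source):
        # with the threshold of 20, a class that appears 7 times is duplicated
        # whole until it reaches the threshold: 7 -> 14 -> 21, so it ends up with
        # 21 rows in the augmented training data.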
        SHUFFLE_SPLIT_TYPES = ["uniform", "stratified"]
        if is_spark_dataframe:
            # no need to shuffle pyspark dataframe
            pass
        elif split_type in SHUFFLE_SPLIT_TYPES:
            if sample_weight_full is not None:
                X_train_all, y_train_all, state.sample_weight_all = shuffle(
                    X_train_all,
                    y_train_all,
                    sample_weight_full,
                    random_state=RANDOM_SEED,
                )
                state.fit_kwargs[
                    "sample_weight"
                ] = state.sample_weight_all  # NOTE: _prepare_data is before kwargs is updated to fit_kwargs_by_estimator
                if isinstance(state.sample_weight_all, pd.Series):
                    state.sample_weight_all.reset_index(drop=True, inplace=True)
            else:
                X_train_all, y_train_all = shuffle(X_train_all, y_train_all, random_state=RANDOM_SEED)
            if data_is_df:
                X_train_all.reset_index(drop=True, inplace=True)
                if isinstance(y_train_all, pd.Series):
                    y_train_all.reset_index(drop=True, inplace=True)

        X_train, y_train = X_train_all, y_train_all
        state.groups_all = state.groups
        if X_val is None and eval_method == "holdout":
            if split_type == "time":
                assert not self.is_ts_forecast(), "For a TS forecast task, this code should never be called"

                is_sample_weight = "sample_weight" in state.fit_kwargs
                if not is_spark_dataframe and is_sample_weight:
                    (
                        X_train,
                        X_val,
                        y_train,
                        y_val,
                        state.fit_kwargs[
                            "sample_weight"
                        ],  # NOTE: _prepare_data is before kwargs is updated to fit_kwargs_by_estimator
                        state.weight_val,
                    ) = train_test_split(
                        X_train_all,
                        y_train_all,
                        state.fit_kwargs[
                            "sample_weight"
                        ],  # NOTE: _prepare_data is before kwargs is updated to fit_kwargs_by_estimator
                        test_size=split_ratio,
                        shuffle=False,
                    )
                elif not is_spark_dataframe and not is_sample_weight:
                    X_train, X_val, y_train, y_val = train_test_split(
                        X_train_all,
                        y_train_all,
                        test_size=split_ratio,
                        shuffle=False,
                    )
                elif is_spark_dataframe and is_sample_weight:
                    (
                        X_train,
                        X_val,
                        y_train,
                        y_val,
                        state.fit_kwargs[
                            "sample_weight"
                        ],  # NOTE: _prepare_data is before kwargs is updated to fit_kwargs_by_estimator
                        state.weight_val,
                    ) = self._split_pyspark(state, X_train_all, y_train_all, split_ratio)
                else:
                    X_train, X_val, y_train, y_val = self._split_pyspark(state, X_train_all, y_train_all, split_ratio)
            elif split_type == "group":
                gss = GroupShuffleSplit(n_splits=1, test_size=split_ratio, random_state=RANDOM_SEED)
                for train_idx, val_idx in gss.split(X_train_all, y_train_all, state.groups_all):
                    if data_is_df:
                        X_train = X_train_all.iloc[train_idx]
                        X_val = X_train_all.iloc[val_idx]
                    else:
                        X_train, X_val = X_train_all[train_idx], X_train_all[val_idx]
                    y_train, y_val = y_train_all[train_idx], y_train_all[val_idx]
                    state.groups = state.groups_all[train_idx]
                    state.groups_val = state.groups_all[val_idx]
            elif self.is_classification():
                # for classification, make sure the labels are complete in both
                # training and validation data
                label_set, first = unique_value_first_index(y_train_all)
                rest = []
                last = 0
                first.sort()
                for i in range(len(first)):
                    rest.extend(range(last, first[i]))
                    last = first[i] + 1
                rest.extend(range(last, len(y_train_all)))
                X_first = X_train_all.iloc[first] if data_is_df else X_train_all[first]
                X_rest = X_train_all.iloc[rest] if data_is_df else X_train_all[rest]
                y_rest = (
                    y_train_all[rest]
                    if isinstance(y_train_all, np.ndarray)
                    else iloc_pandas_on_spark(y_train_all, rest)
                    if is_spark_dataframe
                    else y_train_all.iloc[rest]
                )
                stratify = y_rest if split_type == "stratified" else None
                X_train, X_val, y_train, y_val = self._train_test_split(
                    state, X_rest, y_rest, first, rest, split_ratio, stratify
                )
                X_train = concat(X_first, X_train)
                y_train = concat(label_set, y_train) if data_is_df else np.concatenate([label_set, y_train])
                X_val = concat(X_first, X_val)
                y_val = concat(label_set, y_val) if data_is_df else np.concatenate([label_set, y_val])
            elif self.is_regression():
                X_train, X_val, y_train, y_val = self._train_test_split(
                    state, X_train_all, y_train_all, split_ratio=split_ratio
                )
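            # Illustrative note (editor's addition, not part of the upstream source):
            # in the classification branch above, `first` holds the index of the
            # first occurrence of every label and `rest` the remaining rows; only
            # `rest` is split, and the `first` rows (one per label, i.e. `label_set`)
            # are then concatenated into both the training and the validation side
            # so that every class is present in each.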
            state.data_size = X_train.shape
        state.data_size_full = len(y_train_all)
        state.X_train, state.y_train = X_train, y_train
        state.X_val, state.y_val = X_val, y_val
        state.X_train_all = X_train_all
        state.y_train_all = y_train_all
        y_train_all_size = y_train_all.size
        if eval_method == "holdout":
            state.kf = None
            return
        if split_type == "group":
            # logger.info("Using GroupKFold")
            assert len(state.groups_all) == y_train_all_size, "the length of groups must match the number of examples"
            assert (
                len_labels(state.groups_all) >= n_splits
            ), "the number of groups must be equal or larger than n_splits"
            state.kf = GroupKFold(n_splits)
        elif split_type == "stratified":
            # logger.info("Using StratifiedKFold")
            assert y_train_all_size >= n_splits, (
                f"{n_splits}-fold cross validation requires input data with at least {n_splits} examples."
            )
            assert y_train_all_size >= 2 * n_splits, (
                f"{n_splits}-fold cross validation with metric=r2 "
                f"requires input data with at least {n_splits * 2} examples."
            )
            state.kf = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=1, random_state=RANDOM_SEED)
        elif split_type == "time":
            # logger.info("Using TimeSeriesSplit")
            if self.is_ts_forecast() and not self.is_ts_forecastpanel():
                period = state.fit_kwargs[
                    "period"
                ]  # NOTE: _prepare_data is before kwargs is updated to fit_kwargs_by_estimator
                if period * (n_splits + 1) > y_train_all_size:
                    n_splits = int(y_train_all_size / period - 1)
                    assert n_splits >= 2, (
                        f"cross validation for forecasting period={period}"
                        f" requires input data with at least {3 * period} examples."
                    )
                    logger.info(f"Using nsplits={n_splits} due to data size limit.")
                state.kf = TimeSeriesSplit(n_splits=n_splits, test_size=period)
            elif self.is_ts_forecastpanel():
                n_groups = len(X_train.groupby(state.fit_kwargs.get("group_ids")).size())
                period = state.fit_kwargs.get("period")
                state.kf = TimeSeriesSplit(n_splits=n_splits, test_size=period * n_groups)
            else:
                state.kf = TimeSeriesSplit(n_splits=n_splits)
            # state.kf = TimeSeriesSplit(n_splits=n_splits)
        elif isinstance(split_type, str):
            # logger.info("Using RepeatedKFold")
            state.kf = RepeatedKFold(n_splits=n_splits, n_repeats=1, random_state=RANDOM_SEED)
        else:
            # logger.info("Using splitter object")
            state.kf = split_type
        if isinstance(state.kf, (GroupKFold, StratifiedGroupKFold)):
            # self._split_type is either "group", a GroupKFold object, or a StratifiedGroupKFold object
            state.kf.groups = state.groups_all

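    # Illustrative example (editor's addition, not part of the upstream source):
    # for a univariate forecast task with period=12 and 50 training rows,
    # 12 * (5 + 1) > 50, so a requested n_splits=5 is reduced to
    # int(50 / 12 - 1) == 3 before building TimeSeriesSplit(n_splits=3, test_size=12).
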
    def decide_split_type(
        self,
        split_type,
        y_train_all,
        fit_kwargs,
        groups=None,
    ) -> str:
        assert not self.is_ts_forecast(), "This function should never be called as part of a time-series task."
        if self.name == "classification":
            self.name = get_classification_objective(len_labels(y_train_all))
        if not isinstance(split_type, str):
            assert hasattr(split_type, "split") and hasattr(
                split_type, "get_n_splits"
            ), "split_type must be a string or a splitter object with split and get_n_splits methods."
            assert (
                not isinstance(split_type, GroupKFold) or groups is not None
            ), "GroupKFold requires groups to be provided."
            return split_type

        elif self.is_classification():
            assert split_type in ["auto", "stratified", "uniform", "time", "group"]
            return split_type if split_type != "auto" else ("stratified" if groups is None else "group")

        elif self.is_regression():
            assert split_type in ["auto", "uniform", "time", "group"]
            return split_type if split_type != "auto" else "uniform"

        elif self.is_rank():
            assert groups is not None, "groups must be specified for ranking task."
            assert split_type in ["auto", "group"]
            return "group"

        elif self.is_nlg():
            assert split_type in ["auto", "uniform", "time", "group"]
            return split_type if split_type != "auto" else "uniform"

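    # Illustrative note (editor's addition, not part of the upstream source):
    # with split_type="auto", classification resolves to "stratified" (or "group"
    # when groups are given), while regression and NLG tasks resolve to "uniform";
    # ranking always uses "group".
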
    def preprocess(self, X, transformer=None):
        if isinstance(X, List):
            try:
                if isinstance(X[0], List):
                    X = [x for x in zip(*X)]
                X = pd.DataFrame(
                    dict(
                        [
                            (transformer._str_columns[idx], X[idx])
                            if isinstance(X[0], List)
                            else (transformer._str_columns[idx], [X[idx]])
                            for idx in range(len(X))
                        ]
                    )
                )
            except IndexError:
                raise IndexError("Test data contains more columns than training data, exiting")
        elif isinstance(X, int):
            return X
        elif isinstance(X, psDataFrame):
            return X
        elif issparse(X):
            X = X.tocsr()
        if self.is_ts_forecast():
            X = pd.DataFrame(X)
        if transformer:
            X = transformer.transform(X)
        return X

    def evaluate_model_CV(
        self,
        config: dict,
        estimator: EstimatorSubclass,
        X_train_all,
        y_train_all,
        budget,
        kf,
        eval_metric,
        best_val_loss,
        cv_score_agg_func=None,
        log_training_metric=False,
        fit_kwargs: Optional[dict] = None,
        free_mem_ratio=0,
    ):
        if fit_kwargs is None:
            fit_kwargs = {}
        if cv_score_agg_func is None:
            cv_score_agg_func = default_cv_score_agg_func
        start_time = time.time()
        val_loss_folds = []
        log_metric_folds = []
        metric = None
        train_time = pred_time = 0
        total_fold_num = 0
        n = kf.get_n_splits()
        rng = np.random.RandomState(2020)
        budget_per_train = budget and budget / n
        groups = None
        if self.is_classification():
            _, labels = len_labels(y_train_all, return_labels=True)
        else:
            labels = fit_kwargs.get("label_list")  # pass the label list on to compute the evaluation metric
        if "sample_weight" in fit_kwargs:
            weight = fit_kwargs["sample_weight"]
            weight_val = None
        else:
            weight = weight_val = None

        is_spark_dataframe = isinstance(X_train_all, (psDataFrame, psSeries))
        if is_spark_dataframe:
            dataframe = X_train_all.join(y_train_all)
            if weight is not None:
                dataframe = dataframe.join(weight)
            if isinstance(kf, (GroupKFold, StratifiedGroupKFold)):
                groups = kf.groups
                dataframe = dataframe.join(groups)
            kf = spark_kFold(dataframe, nFolds=n, foldCol=groups.name if groups is not None else "")
            shuffle = False
        else:
            X_train_split, y_train_split = X_train_all, y_train_all
            shuffle = getattr(kf, "shuffle", not self.is_ts_forecast())
            if isinstance(kf, RepeatedStratifiedKFold):
                kf = kf.split(X_train_split, y_train_split)
            elif isinstance(kf, (GroupKFold, StratifiedGroupKFold)):
                groups = kf.groups
                kf = kf.split(X_train_split, y_train_split, groups)
                shuffle = False
            elif isinstance(kf, TimeSeriesSplit):
                kf = kf.split(X_train_split, y_train_split)
            else:
                kf = kf.split(X_train_split)

        for train_index, val_index in kf:
            if shuffle:
                train_index = rng.permutation(train_index)
            if is_spark_dataframe:
                # cache data to increase compute speed
                X_train = train_index.spark.cache()
                X_val = val_index.spark.cache()
                y_train = X_train.pop(y_train_all.name)
                y_val = X_val.pop(y_train_all.name)
                if weight is not None:
                    weight_val = X_val.pop(weight.name)
                    fit_kwargs["sample_weight"] = X_train.pop(weight.name)
                groups_val = None
            elif isinstance(X_train_all, pd.DataFrame):
                X_train = X_train_split.iloc[train_index]
                X_val = X_train_split.iloc[val_index]
            else:
                X_train, X_val = X_train_split[train_index], X_train_split[val_index]
            if not is_spark_dataframe:
                y_train, y_val = y_train_split[train_index], y_train_split[val_index]
                if weight is not None:
                    fit_kwargs["sample_weight"], weight_val = (
                        weight[train_index],
                        weight[val_index],
                    )
                if groups is not None:
                    fit_kwargs["groups"] = (
                        groups[train_index] if isinstance(groups, np.ndarray) else groups.iloc[train_index]
                    )
                    groups_val = groups[val_index] if isinstance(groups, np.ndarray) else groups.iloc[val_index]
                else:
                    groups_val = None

            estimator.cleanup()
            val_loss_i, metric_i, train_time_i, pred_time_i = get_val_loss(
                config,
                estimator,
                X_train,
                y_train,
                X_val,
                y_val,
                weight_val,
                groups_val,
                eval_metric,
                self,
                labels,
                budget_per_train,
                log_training_metric=log_training_metric,
                fit_kwargs=fit_kwargs,
                free_mem_ratio=free_mem_ratio,
            )
            if isinstance(metric_i, dict) and "intermediate_results" in metric_i.keys():
                del metric_i["intermediate_results"]
            if weight is not None:
                fit_kwargs["sample_weight"] = weight
            total_fold_num += 1
            val_loss_folds.append(val_loss_i)
            log_metric_folds.append(metric_i)
            train_time += train_time_i
            pred_time += pred_time_i
            if is_spark_dataframe:
                X_train.spark.unpersist()  # uncache data to free memory
                X_val.spark.unpersist()  # uncache data to free memory
            if budget and time.time() - start_time >= budget:
                break
        val_loss, metric = cv_score_agg_func(val_loss_folds, log_metric_folds)
        n = total_fold_num
        pred_time /= n
        return val_loss, metric, train_time, pred_time

    def default_estimator_list(self, estimator_list: List[str], is_spark_dataframe: bool = False) -> List[str]:
        if "auto" != estimator_list:
            n_estimators = len(estimator_list)
            if is_spark_dataframe:
                # For spark dataframe, only estimators ending with '_spark' are supported
                estimator_list = [est for est in estimator_list if est.endswith("_spark")]
                if len(estimator_list) == 0:
                    raise ValueError(
                        "Spark dataframes only support estimator names ending with `_spark`. Non-supported "
                        "estimators are removed. No estimator is left."
                    )
                elif n_estimators != len(estimator_list):
                    logger.warning(
                        "Spark dataframes only support estimator names ending with `_spark`. Non-supported "
                        "estimators are removed."
                    )
            else:
                # For non-spark dataframe, only estimators not ending with '_spark' are supported
                estimator_list = [est for est in estimator_list if not est.endswith("_spark")]
                if len(estimator_list) == 0:
                    raise ValueError(
                        "Non-spark dataframes only support estimator names not ending with `_spark`. Non-supported "
                        "estimators are removed. No estimator is left."
                    )
                elif n_estimators != len(estimator_list):
                    logger.warning(
                        "Non-spark dataframes only support estimator names not ending with `_spark`. Non-supported "
                        "estimators are removed."
                    )
            return estimator_list
        if self.is_rank():
            estimator_list = ["lgbm", "xgboost", "xgb_limitdepth", "lgbm_spark"]
        elif self.is_nlp():
            estimator_list = ["transformer"]
        elif self.is_ts_forecastpanel():
            estimator_list = ["tft"]
        else:
            try:
                import catboost

                estimator_list = [
                    "lgbm",
                    "rf",
                    "catboost",
                    "xgboost",
                    "extra_tree",
                    "xgb_limitdepth",
                    "lgbm_spark",
                ]
            except ImportError:
                estimator_list = [
                    "lgbm",
                    "rf",
                    "xgboost",
                    "extra_tree",
                    "xgb_limitdepth",
                    "lgbm_spark",
                ]
            # if self.is_ts_forecast():
            #     # catboost is removed because it has a `name` parameter, making it incompatible with hcrystalball
            #     if "catboost" in estimator_list:
            #         estimator_list.remove("catboost")
            #     if self.is_ts_forecastregression():
            #         try:
            #             import prophet
            #
            #             estimator_list += [
            #                 "prophet",
            #                 "arima",
            #                 "sarimax",
            #                 "holt-winters",
            #             ]
            #         except ImportError:
            #             estimator_list += ["arima", "sarimax", "holt-winters"]
            if not self.is_regression():
                estimator_list += ["lrl1"]

        estimator_list = [
            est
            for est in estimator_list
            if (est.endswith("_spark") if is_spark_dataframe else not est.endswith("_spark"))
        ]
        return estimator_list

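    # Illustrative example (editor's addition, not part of the upstream source):
    # with the default list and a pandas input, every name ending in "_spark" is
    # filtered out by the final list comprehension, e.g.
    #   ["lgbm", "rf", "xgboost", "lgbm_spark"] -> ["lgbm", "rf", "xgboost"]
    # whereas a pyspark.pandas input keeps only ["lgbm_spark"].
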
    def default_metric(self, metric: str) -> str:
        if "auto" != metric:
            return metric

        if self.is_nlp():
            from flaml.automl.nlp.utils import (
                load_default_huggingface_metric_for_task,
            )

            return load_default_huggingface_metric_for_task(self.name)
        elif self.is_binary():
            return "roc_auc"
        elif self.is_multiclass():
            return "log_loss"
        elif self.is_ts_forecast():
            return "mape"
        elif self.is_rank():
            return "ndcg"
        else:
            return "r2"

    @staticmethod
    def prepare_sample_train_data(automlstate, sample_size):
        return automlstate.prepare_sample_train_data(sample_size)