mirror of https://github.com/microsoft/autogen.git
262 lines
11 KiB
Python
262 lines
11 KiB
Python
import numpy as np
|
|
import logging
|
|
import pathlib
|
|
import json
|
|
from flaml.automl.data import DataTransformer
|
|
from flaml.automl.task.task import CLASSIFICATION, get_classification_objective
|
|
from flaml.automl.task.generic_task import len_labels
|
|
from flaml.automl.task.factory import task_factory
|
|
from flaml.version import __version__
|
|
|
|
try:
|
|
from sklearn.neighbors import NearestNeighbors
|
|
except ImportError:
|
|
pass
|
|
|
|
LOCATION = pathlib.Path(__file__).parent.resolve()
|
|
logger = logging.getLogger(__name__)
|
|
CONFIG_PREDICTORS = {}
|
|
|
|
|
|
def meta_feature(task, X_train, y_train, meta_feature_names):
|
|
this_feature = []
|
|
n_row = X_train.shape[0]
|
|
n_feat = X_train.shape[1]
|
|
|
|
is_classification = task in CLASSIFICATION
|
|
for each_feature_name in meta_feature_names:
|
|
if each_feature_name == "NumberOfInstances":
|
|
this_feature.append(n_row)
|
|
elif each_feature_name == "NumberOfFeatures":
|
|
this_feature.append(n_feat)
|
|
elif each_feature_name == "NumberOfClasses":
|
|
this_feature.append(len_labels(y_train) if is_classification else 0)
|
|
elif each_feature_name == "PercentageOfNumericFeatures":
|
|
try:
|
|
# this feature is only supported for dataframe
|
|
this_feature.append(
|
|
X_train.select_dtypes(include=[np.number, "float", "int", "long"]).shape[1] / n_feat
|
|
)
|
|
except AttributeError:
|
|
# 'numpy.ndarray' object has no attribute 'select_dtypes'
|
|
this_feature.append(1) # all features are numeric
|
|
else:
|
|
raise ValueError("Feature {} not implemented. ".format(each_feature_name))
|
|
|
|
return this_feature
|
|
|
|
|
|
def load_config_predictor(estimator_name, task, location=None):
|
|
task = str(task)
|
|
key = f"{location}/{estimator_name}/{task}"
|
|
predictor = CONFIG_PREDICTORS.get(key)
|
|
if predictor:
|
|
return predictor
|
|
task = "multiclass" if task == "multi" else task # TODO: multi -> multiclass?
|
|
try:
|
|
location = location or LOCATION
|
|
with open(f"{location}/{estimator_name}/{task}.json", "r") as f:
|
|
CONFIG_PREDICTORS[key] = predictor = json.load(f)
|
|
except FileNotFoundError:
|
|
raise FileNotFoundError(f"Portfolio has not been built for {estimator_name} on {task} task.")
|
|
return predictor
|
|
|
|
|
|
def suggest_config(
|
|
task,
|
|
X,
|
|
y,
|
|
estimator_or_predictor,
|
|
location=None,
|
|
k=None,
|
|
meta_feature_fn=meta_feature,
|
|
):
|
|
"""Suggest a list of configs for the given task and training data.
|
|
|
|
The returned configs can be used as starting points for AutoML.fit().
|
|
`FLAML_sample_size` is removed from the configs.
|
|
"""
|
|
from packaging.version import parse as version_parse
|
|
|
|
task = get_classification_objective(len_labels(y)) if task == "classification" and y is not None else task
|
|
predictor = (
|
|
load_config_predictor(estimator_or_predictor, task, location)
|
|
if isinstance(estimator_or_predictor, str)
|
|
else estimator_or_predictor
|
|
)
|
|
|
|
older_version = "1.0.2"
|
|
# TODO: update older_version when the newer code can no longer handle the older version json file
|
|
assert version_parse(__version__) >= version_parse(predictor["version"]) >= version_parse(older_version)
|
|
prep = predictor["preprocessing"]
|
|
feature = meta_feature_fn(task, X_train=X, y_train=y, meta_feature_names=predictor["meta_feature_names"])
|
|
feature = (np.array(feature) - np.array(prep["center"])) / np.array(prep["scale"])
|
|
neighbors = predictor["neighbors"]
|
|
nn = NearestNeighbors(n_neighbors=1)
|
|
nn.fit([x["features"] for x in neighbors])
|
|
dist, ind = nn.kneighbors(feature.reshape(1, -1), return_distance=True)
|
|
logger.info(f"metafeature distance: {dist.item()}")
|
|
ind = int(ind.item())
|
|
choice = neighbors[ind]["choice"] if k is None else neighbors[ind]["choice"][:k]
|
|
configs = [predictor["portfolio"][x] for x in choice]
|
|
for config in configs:
|
|
if "hyperparameters" in config:
|
|
hyperparams = config["hyperparameters"]
|
|
if hyperparams and "FLAML_sample_size" in hyperparams:
|
|
hyperparams.pop("FLAML_sample_size")
|
|
return configs
|
|
|
|
|
|
def suggest_learner(task, X, y, estimator_or_predictor="all", estimator_list=None, location=None):
|
|
"""Suggest best learner within estimator_list."""
|
|
configs = suggest_config(task, X, y, estimator_or_predictor, location)
|
|
if not estimator_list:
|
|
return configs[0]["class"]
|
|
for c in configs:
|
|
if c["class"] in estimator_list:
|
|
return c["class"]
|
|
return estimator_list[0]
|
|
|
|
|
|
def suggest_hyperparams(task, X, y, estimator_or_predictor, location=None):
|
|
"""Suggest hyperparameter configurations and an estimator class.
|
|
|
|
The configurations can be used to initialize the estimator class like lightgbm.LGBMRegressor.
|
|
|
|
Example:
|
|
|
|
```python
|
|
hyperparams, estimator_class = suggest_hyperparams("regression", X_train, y_train, "lgbm")
|
|
model = estimator_class(**hyperparams) # estimator_class is LGBMRegressor
|
|
model.fit(X_train, y_train)
|
|
```
|
|
|
|
Args:
|
|
task: A string of the task type, e.g.,
|
|
'classification', 'regression', 'ts_forecast', 'rank',
|
|
'seq-classification', 'seq-regression'.
|
|
X: A dataframe of training data in shape n*m.
|
|
For 'ts_forecast' task, the first column of X_train
|
|
must be the timestamp column (datetime type). Other
|
|
columns in the dataframe are assumed to be exogenous
|
|
variables (categorical or numeric).
|
|
y: A series of labels in shape n*1.
|
|
estimator_or_predictor: A str of the learner name or a dict of the learned config predictor.
|
|
If a dict, it contains:
|
|
- "version": a str of the version number.
|
|
- "preprocessing": a dictionary containing:
|
|
* "center": a list of meta feature value offsets for normalization.
|
|
* "scale": a list of meta feature scales to normalize each dimension.
|
|
- "neighbors": a list of dictionaries. Each dictionary contains:
|
|
* "features": a list of the normalized meta features for a neighbor.
|
|
* "choice": an integer of the configuration id in the portfolio.
|
|
- "portfolio": a list of dictionaries, each corresponding to a configuration:
|
|
* "class": a str of the learner name.
|
|
* "hyperparameters": a dict of the config. The key "FLAML_sample_size" will be ignored.
|
|
location: (Optional) A str of the location containing mined portfolio file.
|
|
Only valid when the portfolio is a str, by default the location is flaml/default.
|
|
|
|
Returns:
|
|
hyperparams: A dict of the hyperparameter configurations.
|
|
estiamtor_class: A class of the underlying estimator, e.g., lightgbm.LGBMClassifier.
|
|
"""
|
|
config = suggest_config(task, X, y, estimator_or_predictor, location=location, k=1)[0]
|
|
estimator = config["class"]
|
|
task = task_factory(task)
|
|
model_class = task.estimator_class_from_str(estimator)
|
|
hyperparams = config["hyperparameters"]
|
|
model = model_class(task=task.name, **hyperparams)
|
|
estimator_class = model.estimator_class
|
|
hyperparams = hyperparams and model.params
|
|
return hyperparams, estimator_class
|
|
|
|
|
|
class AutoMLTransformer:
|
|
def __init__(self, model, data_transformer):
|
|
self._model = model
|
|
self._dt = data_transformer
|
|
|
|
def transform(self, X):
|
|
return self._model._preprocess(self._dt.transform(X))
|
|
|
|
|
|
def preprocess_and_suggest_hyperparams(
|
|
task,
|
|
X,
|
|
y,
|
|
estimator_or_predictor,
|
|
location=None,
|
|
):
|
|
"""Preprocess the data and suggest hyperparameters.
|
|
|
|
Example:
|
|
|
|
```python
|
|
hyperparams, estimator_class, X, y, feature_transformer, label_transformer = \
|
|
preprocess_and_suggest_hyperparams("classification", X_train, y_train, "xgb_limitdepth")
|
|
model = estimator_class(**hyperparams) # estimator_class is XGBClassifier
|
|
model.fit(X, y)
|
|
X_test = feature_transformer.transform(X_test)
|
|
y_pred = label_transformer.inverse_transform(pd.Series(model.predict(X_test).astype(int)))
|
|
```
|
|
|
|
Args:
|
|
task: A string of the task type, e.g.,
|
|
'classification', 'regression', 'ts_forecast', 'rank',
|
|
'seq-classification', 'seq-regression'.
|
|
X: A dataframe of training data in shape n*m.
|
|
For 'ts_forecast' task, the first column of X_train
|
|
must be the timestamp column (datetime type). Other
|
|
columns in the dataframe are assumed to be exogenous
|
|
variables (categorical or numeric).
|
|
y: A series of labels in shape n*1.
|
|
estimator_or_predictor: A str of the learner name or a dict of the learned config predictor.
|
|
"choose_xgb" means choosing between xgb_limitdepth and xgboost.
|
|
If a dict, it contains:
|
|
- "version": a str of the version number.
|
|
- "preprocessing": a dictionary containing:
|
|
* "center": a list of meta feature value offsets for normalization.
|
|
* "scale": a list of meta feature scales to normalize each dimension.
|
|
- "neighbors": a list of dictionaries. Each dictionary contains:
|
|
* "features": a list of the normalized meta features for a neighbor.
|
|
* "choice": a integer of the configuration id in the portfolio.
|
|
- "portfolio": a list of dictionaries, each corresponding to a configuration:
|
|
* "class": a str of the learner name.
|
|
* "hyperparameters": a dict of the config. They key "FLAML_sample_size" will be ignored.
|
|
location: (Optional) A str of the location containing mined portfolio file.
|
|
Only valid when the portfolio is a str, by default the location is flaml/default.
|
|
|
|
Returns:
|
|
hyperparams: A dict of the hyperparameter configurations.
|
|
estiamtor_class: A class of the underlying estimator, e.g., lightgbm.LGBMClassifier.
|
|
X: the preprocessed X.
|
|
y: the preprocessed y.
|
|
feature_transformer: a data transformer that can be applied to X_test.
|
|
label_transformer: a label transformer that can be applied to y_test.
|
|
"""
|
|
dt = DataTransformer()
|
|
X, y = dt.fit_transform(X, y, task)
|
|
if "choose_xgb" == estimator_or_predictor:
|
|
# choose between xgb_limitdepth and xgboost
|
|
estimator_or_predictor = suggest_learner(
|
|
task,
|
|
X,
|
|
y,
|
|
estimator_list=["xgb_limitdepth", "xgboost"],
|
|
location=location,
|
|
)
|
|
config = suggest_config(task, X, y, estimator_or_predictor, location=location, k=1)[0]
|
|
estimator = config["class"]
|
|
model_class = task_factory(task).estimator_class_from_str(estimator)
|
|
hyperparams = config["hyperparameters"]
|
|
model = model_class(task=task, **hyperparams)
|
|
if model.estimator_class is None:
|
|
return hyperparams, model_class, X, y, None, None
|
|
else:
|
|
estimator_class = model.estimator_class
|
|
X = model._preprocess(X)
|
|
hyperparams = hyperparams and model.params
|
|
|
|
transformer = AutoMLTransformer(model, dt)
|
|
return hyperparams, estimator_class, X, y, transformer, dt.label_transformer
|