import inspect
import copy
import time
from typing import Any, Optional

import numpy as np

from flaml import tune
from flaml.automl.logger import logger
from flaml.automl.ml import compute_estimator, train_estimator
from flaml.automl.time_series.ts_data import TimeSeriesDataset
from flaml.automl.spark import psDataFrame, psSeries, DataFrame, Series

class SearchState:
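    """Search state for a single estimator (learner).

    Holds the learner's search space, initial/low-cost configurations, the best
    configuration and loss observed so far, and timing counters used to
    estimate the cost of further improvement.
    """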
    @property
    def search_space(self):
        return self._search_space_domain

    @property
    def estimated_cost4improvement(self):
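        """Estimated cost of achieving the next improvement.

        Heuristic: the larger of the time between the last two improvements and
        the time spent since the last improvement.
        """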
        return max(
            self.time_best_found - self.time_best_found_old,
            self.total_time_used - self.time_best_found,
        )

    def valid_starting_point_one_dim(self, value_one_dim, domain_one_dim):
        """
        For each hp in the starting point, check the following three conditions:
        (1) If the type of the starting point does not match the required type in the search space, return False.
        (2) If the starting point is not in the required search space, return False.
        (3) If the search space entry is a fixed value instead of a domain, and the value is not equal to the starting point, return False.
        Note that (2) includes the case where the starting point is not in the user-specified search space custom_hp.
        """
        from flaml.tune.space import sample

        if isinstance(domain_one_dim, sample.Domain):
            renamed_type = list(inspect.signature(domain_one_dim.is_valid).parameters.values())[0].annotation
            # The value's type must match the annotation of Domain.is_valid (or the
            # annotation is Any); an int is also accepted where a float is required.
            type_match = (
                renamed_type == Any
                or isinstance(value_one_dim, renamed_type)
                or (isinstance(value_one_dim, int) and renamed_type is float)
            )
            if not (type_match and domain_one_dim.is_valid(value_one_dim)):
                return False
        elif value_one_dim != domain_one_dim:
            return False
        return True

    def valid_starting_point(self, starting_point, search_space):
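        """Return True iff every hyperparameter value in `starting_point`
        (ignoring FLAML_sample_size) is valid for its domain in `search_space`."""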
        return all(
            self.valid_starting_point_one_dim(value, search_space[name].get("domain"))
            for name, value in starting_point.items()
            if name != "FLAML_sample_size"
        )

    def __init__(
        self,
        learner_class,
        data,
        task,
        starting_point=None,
        period=None,
        custom_hp=None,
        max_iter=None,
        budget=None,
    ):
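        """Build the search space for `learner_class` on the given data/task,
        apply `custom_hp` overrides, validate and sanitize the provided
        starting point(s), and initialize the search bookkeeping."""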
        self.init_eci = learner_class.cost_relative2lgbm() if budget >= 0 else 1
        self._search_space_domain = {}
        self.init_config = None
        self.low_cost_partial_config = {}
        self.cat_hp_cost = {}

        self.ls_ever_converged = False
        self.learner_class = learner_class
        self._budget = budget

        if task.is_ts_forecast():
            data_size = data.train_data.shape
            search_space = learner_class.search_space(data=data, task=task, pred_horizon=period)
        else:
            data_size = data.shape
            search_space = learner_class.search_space(data_size=data_size, task=task)
        self.data_size = data_size

        if custom_hp is not None:
            search_space.update(custom_hp)

        if isinstance(starting_point, dict):
            starting_point = AutoMLState.sanitize(starting_point)
            if max_iter > 1 and not self.valid_starting_point(starting_point, search_space):
                # With more than one iteration available, drop an invalid starting point
                logger.warning(
                    "Starting point {} removed because it is outside of the search space".format(starting_point)
                )
                starting_point = None
        elif isinstance(starting_point, list):
            starting_point = [AutoMLState.sanitize(x) for x in starting_point]
            if max_iter > len(starting_point):
                # Validate only when there are more iterations than starting points;
                # otherwise every starting point will be tried anyway and the check is skipped
                starting_point_len = len(starting_point)
                starting_point = [x for x in starting_point if self.valid_starting_point(x, search_space)]
                if starting_point_len > len(starting_point):
                    logger.warning(
                        "Starting points outside of the search space are removed. "
                        f"Remaining starting points for {learner_class}: {starting_point}"
                    )
                starting_point = starting_point or None

        for name, space in search_space.items():
            assert "domain" in space, f"{name}'s domain is missing in the search space spec {space}"
            if space["domain"] is None:
                # don't search this hp
                continue
            self._search_space_domain[name] = space["domain"]

            if "low_cost_init_value" in space:
                self.low_cost_partial_config[name] = space["low_cost_init_value"]
            if "cat_hp_cost" in space:
                self.cat_hp_cost[name] = space["cat_hp_cost"]
            # if a starting point is provided, set the init config to be
            # the starting point provided
            if isinstance(starting_point, dict) and starting_point.get(name) is not None:
                if self.init_config is None:
                    self.init_config = {}
                self.init_config[name] = starting_point[name]
            elif (
                not isinstance(starting_point, list)
                and "init_value" in space
                and self.valid_starting_point_one_dim(space["init_value"], space["domain"])
            ):
                if self.init_config is None:
                    self.init_config = {}
                self.init_config[name] = space["init_value"]

        if isinstance(starting_point, list):
            self.init_config = starting_point
        else:
            self.init_config = [] if self.init_config is None else [self.init_config]

        self._hp_names = list(self._search_space_domain.keys())
        self.search_alg = None
        self.best_config = None
        self.best_result = None
        self.best_loss = self.best_loss_old = np.inf
        self.total_time_used = 0
        self.total_iter = 0
        self.base_eci = None
        self.time_best_found = self.time_best_found_old = 0
        self.time2eval_best = 0
        self.time2eval_best_old = 0
        self.trained_estimator = None
        self.sample_size = None
        self.trial_time = 0

    def update(self, result, time_used):
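        """Record the outcome of one trial.

        Updates the sample size, evaluation time, and running totals; when the
        reported val_loss improves on the best so far, also updates the best
        config, best loss, and the timing fields behind estimated_cost4improvement.
        """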
        if result:
            config = result["config"]
            if config and "FLAML_sample_size" in config:
                self.sample_size = config["FLAML_sample_size"]
            else:
                self.sample_size = self.data_size[0]
            obj = result["val_loss"]
            metric_for_logging = result["metric_for_logging"]
            time2eval = result["time_total_s"]
            trained_estimator = result["trained_estimator"]
            del result["trained_estimator"]  # free up RAM
            n_iter = (
                trained_estimator
                and hasattr(trained_estimator, "ITER_HP")
                and trained_estimator.params.get(trained_estimator.ITER_HP)
            )
            if n_iter:
                if "ml" in config:
                    config["ml"][trained_estimator.ITER_HP] = n_iter
                else:
                    config[trained_estimator.ITER_HP] = n_iter
        else:
            obj, time2eval, trained_estimator = np.inf, 0.0, None
            metric_for_logging = config = None
        self.trial_time = time2eval
        self.total_time_used += time_used if self._budget >= 0 else 1
        self.total_iter += 1

        if self.base_eci is None:
            self.base_eci = time_used
        if (obj is not None) and (obj < self.best_loss):
            self.best_loss_old = self.best_loss if self.best_loss < np.inf else 2 * obj
            self.best_loss = obj
            self.best_result = result
            self.time_best_found_old = self.time_best_found
            self.time_best_found = self.total_time_used
            self.iter_best_found = self.total_iter
            self.best_config = config
            self.best_config_sample_size = self.sample_size
            self.best_config_train_time = time_used
            if time2eval:
                self.time2eval_best_old = self.time2eval_best
                self.time2eval_best = time2eval
            if self.trained_estimator and trained_estimator and self.trained_estimator != trained_estimator:
                self.trained_estimator.cleanup()
            if trained_estimator:
                self.trained_estimator = trained_estimator
        elif trained_estimator:
            trained_estimator.cleanup()
        self.metric_for_logging = metric_for_logging
        self.val_loss, self.config = obj, config

    def get_hist_config_sig(self, sample_size, config):
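        """Return a signature string "<sample_size>_<hp values>" identifying this
        (sample size, config) pair in the search history."""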
        config_values = tuple([config[k] for k in self._hp_names if k in config])
        config_sig = str(sample_size) + "_" + str(config_values)
        return config_sig

    def est_retrain_time(self, retrain_sample_size):
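        """Estimate retraining time by scaling the best config's evaluation time
        linearly with retrain_sample_size / best_config_sample_size."""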
        assert self.best_config_sample_size is not None, "need to first get best_config_sample_size"
        return self.time2eval_best * retrain_sample_size / self.best_config_sample_size


class AutoMLState:
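    """Mutable state shared across one AutoML run.

    Holds the training/validation data, per-estimator fit kwargs, and the time
    budget, and provides helpers to evaluate or train a configuration for a
    given estimator.
    """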
    def prepare_sample_train_data(self, sample_size: int):
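        """Return (X, y, sample_weight, groups) truncated to `sample_size` rows.

        Time-series data keeps the most recent `sample_size` rows; other data
        keeps the first `sample_size` rows. If `sample_size` exceeds the size of
        the training split, the full training data (X_train_all / y_train_all)
        is returned instead.
        """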
        sampled_weight = groups = None
        if sample_size <= self.data_size[0]:
            if isinstance(self.X_train, TimeSeriesDataset):
                sampled_X_train = copy.copy(self.X_train)
                sampled_X_train.train_data = self.X_train.train_data.iloc[-sample_size:]
                sampled_y_train = None
            else:
                if isinstance(self.X_train, (DataFrame, psDataFrame)):
                    sampled_X_train = self.X_train.iloc[:sample_size]
                else:
                    sampled_X_train = self.X_train[:sample_size]
                if isinstance(self.y_train, (Series, psSeries)):
                    sampled_y_train = self.y_train.iloc[:sample_size]
                else:
                    sampled_y_train = self.y_train[:sample_size]
            weight = self.fit_kwargs.get(
                "sample_weight"
            )  # NOTE: _prepare_sample_train_data is before kwargs is updated to fit_kwargs_by_estimator
            if weight is not None:
                sampled_weight = (
                    weight.iloc[:sample_size] if isinstance(weight, (Series, psSeries)) else weight[:sample_size]
                )
            if self.groups is not None:
                groups = (
                    self.groups.iloc[:sample_size]
                    if isinstance(self.groups, (Series, psSeries))
                    else self.groups[:sample_size]
                )
        else:
            sampled_X_train = self.X_train_all
            sampled_y_train = self.y_train_all
            if (
                "sample_weight" in self.fit_kwargs
            ):  # NOTE: _prepare_sample_train_data is before kwargs is updated to fit_kwargs_by_estimator
                sampled_weight = self.sample_weight_all
            if self.groups is not None:
                groups = self.groups_all
        return sampled_X_train, sampled_y_train, sampled_weight, groups

    @staticmethod
    def _compute_with_config_base(
        config_w_resource: dict,
        state: "AutoMLState",
        estimator: str,
        is_report: bool = True,
    ) -> dict:
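        """Evaluate one configuration for `estimator` and return the result dict.

        Subsamples the training data to FLAML_sample_size, scales the remaining
        time budget to the sample size, runs compute_estimator, and (when
        `is_report` is True) reports val_loss, metric_for_logging, pred_time,
        wall_clock_time, and the trained estimator to tune.
        """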
if "FLAML_sample_size" in config_w_resource:
|
|
sample_size = int(config_w_resource["FLAML_sample_size"])
|
|
else:
|
|
sample_size = state.data_size[0]
|
|
|
|
this_estimator_kwargs = state.fit_kwargs_by_estimator.get(
|
|
estimator
|
|
).copy() # NOTE: _compute_with_config_base is after kwargs is updated to fit_kwargs_by_estimator
|
|
(
|
|
sampled_X_train,
|
|
sampled_y_train,
|
|
sampled_weight,
|
|
groups,
|
|
) = state.task.prepare_sample_train_data(state, sample_size)
|
|
if sampled_weight is not None:
|
|
weight = this_estimator_kwargs["sample_weight"]
|
|
this_estimator_kwargs["sample_weight"] = sampled_weight
|
|
if groups is not None:
|
|
this_estimator_kwargs["groups"] = groups
|
|
config = config_w_resource.copy()
|
|
if "FLAML_sample_size" in config:
|
|
del config["FLAML_sample_size"]
|
|
budget = (
|
|
None
|
|
if state.time_budget < 0
|
|
else state.time_budget - state.time_from_start
|
|
if sample_size == state.data_size[0]
|
|
else (state.time_budget - state.time_from_start) / 2 * sample_size / state.data_size[0]
|
|
)
|
|
|
|
(
|
|
trained_estimator,
|
|
val_loss,
|
|
metric_for_logging,
|
|
_,
|
|
pred_time,
|
|
) = compute_estimator(
|
|
sampled_X_train,
|
|
sampled_y_train,
|
|
state.X_val,
|
|
state.y_val,
|
|
state.weight_val,
|
|
state.groups_val,
|
|
state.train_time_limit if budget is None else min(budget, state.train_time_limit or np.inf),
|
|
state.kf,
|
|
config,
|
|
state.task,
|
|
estimator,
|
|
state.eval_method,
|
|
state.metric,
|
|
state.best_loss,
|
|
state.n_jobs,
|
|
state.learner_classes.get(estimator),
|
|
state.cv_score_agg_func,
|
|
state.log_training_metric,
|
|
this_estimator_kwargs,
|
|
state.free_mem_ratio,
|
|
)
|
|
if state.retrain_final and not state.model_history:
|
|
trained_estimator.cleanup()
|
|
|
|
result = {
|
|
"pred_time": pred_time,
|
|
"wall_clock_time": time.time() - state._start_time_flag,
|
|
"metric_for_logging": metric_for_logging,
|
|
"val_loss": val_loss,
|
|
"trained_estimator": trained_estimator,
|
|
}
|
|
if sampled_weight is not None:
|
|
this_estimator_kwargs["sample_weight"] = weight
|
|
if is_report is True:
|
|
tune.report(**result)
|
|
return result
|
|
|
|
    @classmethod
    def sanitize(cls, config: dict) -> dict:
        """Make a config ready for passing to estimator."""
        config = config.get("ml", config).copy()
        config.pop("FLAML_sample_size", None)
        config.pop("learner", None)
        config.pop("_choice_", None)
        return config

    def _train_with_config(
        self,
        estimator: str,
        config_w_resource: dict,
        sample_size: Optional[int] = None,
    ):
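        """Train `estimator` with `config_w_resource` on (a sample of) the
        training data and return (trained_estimator, train_time); unlike
        _compute_with_config_base, no validation loss is computed here."""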
        if not sample_size:
            sample_size = config_w_resource.get("FLAML_sample_size", len(self.y_train_all))
        config = AutoMLState.sanitize(config_w_resource)

        this_estimator_kwargs = self.fit_kwargs_by_estimator.get(
            estimator
        ).copy()  # NOTE: _train_with_config is after kwargs is updated to fit_kwargs_by_estimator
        (
            sampled_X_train,
            sampled_y_train,
            sampled_weight,
            groups,
        ) = self.task.prepare_sample_train_data(self, sample_size)
        if sampled_weight is not None:
            weight = this_estimator_kwargs["sample_weight"]
            this_estimator_kwargs["sample_weight"] = sampled_weight
        if groups is not None:
            this_estimator_kwargs["groups"] = groups

        budget = None if self.time_budget < 0 else self.time_budget - self.time_from_start

        estimator, train_time = train_estimator(
            X_train=sampled_X_train,
            y_train=sampled_y_train,
            config_dic=config,
            task=self.task,
            estimator_name=estimator,
            n_jobs=self.n_jobs,
            estimator_class=self.learner_classes.get(estimator),
            budget=budget,
            fit_kwargs=this_estimator_kwargs,  # NOTE: _train_with_config is after kwargs is updated to fit_kwargs_by_estimator
            eval_metric=self.metric if hasattr(self, "metric") else "train_time",
            free_mem_ratio=self.free_mem_ratio,
        )

        if sampled_weight is not None:
            this_estimator_kwargs["sample_weight"] = weight

        return estimator, train_time