mirror of https://github.com/microsoft/autogen.git
223 lines
8.1 KiB
Python
223 lines
8.1 KiB
Python
import pandas as pd
|
|
import numpy as np
|
|
import argparse
|
|
from pathlib import Path
|
|
import json
|
|
from sklearn.preprocessing import RobustScaler
|
|
from flaml.default import greedy
|
|
from flaml.default.regret import load_result, build_regret
|
|
from flaml.version import __version__
|
|
|
|
# Regret bound passed as the third argument to greedy.construct_portfolio
# whenever build_portfolio constructs a portfolio.
regret_bound = 0.01
|
|
|
|
|
|
def config_predictor_tuple(tasks, configs, meta_features, regret_matrix):
    """Config predictor represented in tuple.

    The returned tuple consists of (meta_features_norm, preferences, proc).

    Args:
        tasks: Labels of the tasks to use; they select rows of `meta_features`
            and columns of `regret_matrix`.
        configs: Labels of the configurations to rank; they select rows of
            `regret_matrix`. Their order is used to break ties.
        meta_features: A dataframe of meta features, one row per task.
        regret_matrix: A dataframe of the configuration(row)-task(column) regret matrix.

    Returns:
        meta_features_norm: A dataframe of normalized meta features, each row for a task.
        preferences: A dataframe of configuration indices (positions in `configs`)
            sorted by ascending regret per task (column).
        proc: A dict holding the "center" and "scale" lists of the fitted scaler.
    """
    # pre-processing
    scaler = RobustScaler()
    task_features = meta_features.loc[tasks]
    # Build a new float frame instead of assigning into the selection in place:
    # in-place assignment depends on the original column dtypes being float.
    meta_features_norm = pd.DataFrame(
        scaler.fit_transform(task_features),
        index=task_features.index,
        columns=task_features.columns,
    )

    proc = {
        "center": scaler.center_.tolist(),
        "scale": scaler.scale_.tolist(),
    }

    # Rank the configs for each task. A stable sort breaks ties using the order
    # in configs, and a float sort places NaN regrets last. Sorting
    # (regret, position) tuples instead would make NaN entries incomparable and
    # could leave the finite regrets out of order.
    regret = regret_matrix[tasks].loc[configs].reset_index(drop=True)
    print(regret)
    order = np.argsort(regret.to_numpy(dtype=float), axis=0, kind="stable")
    preferences = pd.DataFrame(order, columns=regret.columns)
    print(preferences)
    return (meta_features_norm, preferences, proc)
|
|
|
|
|
|
def build_portfolio(meta_features, regret, strategy):
    """Construct a portfolio of configurations with the chosen greedy strategy.

    Args:
        meta_features: A dataframe of metafeatures matrix.
        regret: A dataframe of regret matrix.
        strategy: A str of the strategy, one of ("greedy", "greedy-feedback").

    Returns:
        The portfolio returned by greedy.construct_portfolio, extended with
        "default" when "default" is a row of `regret` but was not selected.
    """
    assert strategy in ("greedy", "greedy-feedback")
    if strategy == "greedy-feedback":
        # the feedback variant additionally receives the meta features
        portfolio = greedy.construct_portfolio(regret, meta_features, regret_bound)
    elif strategy == "greedy":
        portfolio = greedy.construct_portfolio(regret, None, regret_bound)

    default_available = "default" in regret.index
    if default_available and "default" not in portfolio:
        portfolio += ["default"]
    return portfolio
|
|
|
|
|
|
def load_json(filename):
    """Returns the contents of json file filename.

    Args:
        filename: Path (str or path-like) of the JSON file to read.

    Returns:
        The deserialized JSON content.
    """
    # JSON is UTF-8 by specification; do not depend on the platform's default encoding.
    with open(filename, "r", encoding="utf-8") as f:
        return json.load(f)
|
|
|
|
|
|
def _filter(preference, regret):
|
|
"""Remove choices after default or have NaN regret."""
|
|
try:
|
|
last = regret.index.get_loc("default") # len(preference) - 1
|
|
preference = preference[: preference[preference == last].index[0] + 1]
|
|
except KeyError: # no "default"
|
|
pass
|
|
finally:
|
|
regret = regret.reset_index(drop=True)
|
|
preference = preference[regret[preference].notna().to_numpy()]
|
|
# regret = regret[preference].reset_index(drop=True)
|
|
# dup = regret[regret.duplicated()]
|
|
# if not dup.empty:
|
|
# # break ties using the order in configs
|
|
# unique = dup.drop_duplicates()
|
|
# for u in unique:
|
|
# subset = regret == u
|
|
# preference[subset].sort_values(inplace=True)
|
|
# # raise ValueError(preference)
|
|
return preference.tolist()
|
|
|
|
|
|
def serialize(configs, regret, meta_features, output_file, config_path):
    """Store to disk all information FLAML-metalearn needs at runtime.

    Args:
        configs: names of model configs
        regret: regret matrix
        meta_features: task metafeatures
        output_file: filename
        config_path: path containing config json files

    Returns:
        The meta predictor dict that was written to `output_file`.
    """
    output_file = Path(output_file)
    # Create the destination directory if needed; open() would otherwise fail
    # for an output location that does not exist yet.
    output_file.parent.mkdir(parents=True, exist_ok=True)
    # delete if exists
    try:
        output_file.unlink()
    except FileNotFoundError:
        pass

    meta_features_norm, preferences, proc = config_predictor_tuple(regret.columns, configs, meta_features, regret)
    portfolio = [load_json(config_path.joinpath(m + ".json")) for m in configs]
    # Restrict the regret rows to the selected configs, in the same order as
    # the positions stored in preferences.
    regret = regret.loc[configs]

    # One neighbor per task: its normalized meta features and filtered choices.
    neighbors = [
        {"features": x.tolist(), "choice": _filter(preferences[y], regret[y])}
        for x, y in zip(meta_features_norm.to_records(index=False), preferences.columns)
    ]
    meta_predictor = {
        "version": __version__,
        "meta_feature_names": list(meta_features.columns),
        "portfolio": portfolio,
        "preprocessing": proc,
        "neighbors": neighbors,
        "configsource": list(configs),
    }
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(meta_predictor, f, indent=4)
    return meta_predictor
|
|
|
|
|
|
# def analyze(regret_matrix, meta_predictor):
|
|
# tasks = regret_matrix.columns
|
|
# neighbors = meta_predictor["neighbors"]
|
|
# from sklearn.neighbors import NearestNeighbors
|
|
|
|
# nn = NearestNeighbors(n_neighbors=1)
|
|
# for i, task in enumerate(neighbors):
|
|
# other_tasks = [j for j in range(len(neighbors)) if j != i]
|
|
# # find the nn and the regret
|
|
# nn.fit([neighbors[j]["features"] for j in other_tasks])
|
|
# dist, ind = nn.kneighbors(
|
|
# np.array(task["features"]).reshape(1, -1), return_distance=True
|
|
# )
|
|
# ind = other_tasks[int(ind.item())]
|
|
# choice = int(neighbors[ind]["choice"][0])
|
|
# r = regret_matrix.iloc[choice, i]
|
|
# if r > regret_bound:
|
|
# label = "outlier"
|
|
# else:
|
|
# label = "normal"
|
|
# print(tasks[i], label, tasks[ind], "dist", dist, "regret", r)
|
|
# # find the best model and the regret
|
|
# regrets = regret_matrix.iloc[other_tasks, i]
|
|
# best = regrets.min()
|
|
# if best > regret_bound:
|
|
# print(tasks[i], "best_regret", best, "task", regrets.idxmin())
|
|
|
|
|
|
def main():
    """Build and store a portfolio per estimator, plus a merged one.

    Parses the command-line arguments, loads the task metafeatures, and for
    each estimator loads its results, builds the regret matrix, selects a
    portfolio and serializes it to ``{output}/{estimator}/{task}.json``. When
    more than one estimator is given, the selected configs of all estimators
    are merged and serialized to ``{output}/all/{task}.json``.
    """
    parser = argparse.ArgumentParser(description="Build a portfolio.")
    parser.add_argument("--strategy", help="One of {greedy, greedy-feedback}", default="greedy")
    parser.add_argument("--input", help="Input path")
    parser.add_argument("--metafeatures", help="CSV of task metafeatures")
    parser.add_argument("--exclude", help="One task name to exclude (for LOO purposes)")
    parser.add_argument("--output", help="Location to write portfolio JSON")
    parser.add_argument("--task", help="Task to merge portfolios", default="binary")
    parser.add_argument(
        "--estimator",
        help="Estimators to merge portfolios",
        default=["lgbm", "xgboost"],
        nargs="+",
    )
    args = parser.parse_args()

    # keep the first row for each duplicated task label
    meta_features = pd.read_csv(args.metafeatures, index_col=0).groupby(level=0).first()
    if args.exclude:
        meta_features.drop(args.exclude, inplace=True)

    baseline_best = None
    all_results = None
    for estimator in args.estimator:
        # produce regret
        results, baseline = load_result(f"{args.input}/{estimator}/results.csv", args.task, "result")
        regret = build_regret(results, baseline)
        # treat infinite regret as missing and drop tasks without any valid entry
        regret = regret.replace(np.inf, np.nan).dropna(axis=1, how="all")

        if args.exclude:
            # drop every config and task whose name contains the excluded task name
            regret = regret.loc[[i for i in regret.index if args.exclude not in i]]
            regret = regret[[c for c in regret.columns if args.exclude not in c]]

        print(f"Regret matrix complete: {100 * regret.count().sum() / regret.shape[0] / regret.shape[1]}%")
        print(f"Num models considered: {regret.shape[0]}")

        configs = build_portfolio(meta_features, regret, args.strategy)
        meta_predictor = serialize(
            configs,
            regret,
            meta_features,
            f"{args.output}/{estimator}/{args.task}.json",
            Path(f"{args.input}/{estimator}"),
        )
        configsource = meta_predictor["configsource"]
        # Keep only the selected configs and prefix them with the estimator
        # name so that configs of different estimators stay distinguishable.
        results = results.loc[configsource].rename({x: f"{estimator}/{x}" for x in regret.index.values})
        baseline_best = (
            baseline if baseline_best is None else pd.DataFrame({0: baseline_best, 1: baseline}).max(axis=1)
        )
        all_results = results if all_results is None else pd.concat([all_results, results])

    if len(args.estimator) > 1:
        # The merged regret matrix is only needed for the combined portfolio.
        regrets = build_regret(all_results, baseline_best)
        serialize(
            regrets.index,
            regrets,
            meta_features,
            f"{args.output}/all/{args.task}.json",
            Path(args.input),
        )
|
|
|
|
|
|
if __name__ == "__main__":
    # Execute only if run as a script; importing this module does not call main().
    main()
|