JCC-DeepOD/testbed/testbed_unsupervised_ad.py

137 lines
5.3 KiB
Python

# -*- coding: utf-8 -*-
"""
testbed of unsupervised tabular anomaly detection
@Author: Hongzuo Xu <hongzuoxu@126.com, xuhongzuo13@nudt.edu.cn>
"""
import os
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import warnings
import argparse
import getpass
import time
import numpy as np
import importlib as imp
from utils import get_data_lst, read_data
from deepod.metrics import tabular_metrics
dataset_root = f'/home/{getpass.getuser()}/dataset/1-tabular/'
parser = argparse.ArgumentParser()
parser.add_argument("--runs", type=int, default=5,
help="how many times we repeat the experiments to obtain the average performance")
parser.add_argument("--input_dir", type=str,
default='ADBench-classical',
help="the path of the data sets")
parser.add_argument("--output_dir", type=str, default='@record/',
help="the output file path")
parser.add_argument("--dataset", type=str, default='FULL',
help="FULL represents all the csv file in the folder, "
"or a list of data set names split by comma")
parser.add_argument("--model", type=str, default='DeepSVDD', help="",)
parser.add_argument("--auto_hyper", default=True, action='store_true', help="")
parser.add_argument("--normalization", type=str, default='min-max', help="",)
parser.add_argument('--silent_header', action='store_true')
parser.add_argument("--flag", type=str, default='')
args = parser.parse_args()
os.makedirs(args.output_dir, exist_ok=True)
data_lst = get_data_lst(os.path.join(dataset_root, args.input_dir), args.dataset)
print(os.path.join(dataset_root, args.input_dir))
print(data_lst)
module = imp.import_module('deepod.models.tabular')
model_class = getattr(module, args.model)
cur_time = time.strftime("%m-%d %H.%M.%S", time.localtime())
result_file = os.path.join(args.output_dir, f'{args.model}.{args.input_dir}.{args.flag}.csv')
if not args.silent_header:
f = open(result_file, 'a')
print('\n---------------------------------------------------------', file=f)
print(f'model: {args.model}, collection: {args.input_dir}, '
f'datasets: {args.dataset}, normalization: {args.normalization}, {args.runs}runs, ', file=f)
print('---------------------------------------------------------', file=f)
print('data, auc-roc, std, auc-pr, std, f1, std, time', file=f)
f.close()
for file in data_lst:
dataset_name = os.path.splitext(os.path.split(file)[1])[0]
print(f'\n-------------------------{dataset_name}-----------------------')
split = '50%-normal'
print(f'train-test split: {split}, normalization: {args.normalization}')
x_train, y_train, x_test, y_test = read_data(file=file, split=split,
normalization=args.normalization,
seed=42)
if x_train is None:
continue
auc_lst, ap_lst, f1_lst = np.zeros(args.runs), np.zeros(args.runs), np.zeros(args.runs)
t1_lst, t2_lst = [], []
runs = args.runs
model_configs = {}
if args.auto_hyper:
clf = model_class(random_state=42)
# check whether the anomaly detection model supports ray tuning
if not hasattr(clf, 'fit_auto_hyper'):
warnings.warn(f'anomaly detection model {args.model} '
f'does not support auto tuning hyper-parameters currently.')
break
print(f'\nRunning [1/{args.runs}] of [{args.model}] on Dataset [{dataset_name}] (rat tune)')
tuned_model_configs = clf.fit_auto_hyper(X=x_train,
X_test=x_test, y_test=y_test,
n_ray_samples=1, time_budget_s=None)
model_configs = tuned_model_configs
print(f'model parameter configure update to: {model_configs}')
scores = clf.decision_function(x_test)
auc, ap, f1 = tabular_metrics(y_test, scores)
print(f'{dataset_name}, {auc:.4f}, {ap:.4f}, {f1:.4f}, {args.model}')
for i in range(runs):
start_time = time.time()
print(f'\nRunning [{i+1}/{args.runs}] of [{args.model}] on Dataset [{dataset_name}]')
clf = model_class(**model_configs, random_state=42+i)
clf.fit(x_train)
train_time = time.time()
scores = clf.decision_function(x_test)
done_time = time.time()
auc, ap, f1 = tabular_metrics(y_test, scores)
auc_lst[i], ap_lst[i], f1_lst[i] = auc, ap, f1
t1_lst.append(train_time - start_time)
t2_lst.append(done_time - start_time)
print(f'{dataset_name}, {auc_lst[i]:.4f}, {ap_lst[i]:.4f}, {f1_lst[i]:.4f}, '
f'{t1_lst[i]:.1f}/{t2_lst[i]:.1f}, {args.model}, {str(model_configs)}')
avg_auc, avg_ap, avg_f1 = np.average(auc_lst), np.average(ap_lst), np.average(f1_lst)
std_auc, std_ap, std_f1 = np.std(auc_lst), np.std(ap_lst), np.std(f1_lst)
avg_time1 = np.average(t1_lst)
avg_time2 = np.average(t2_lst)
f = open(result_file, 'a')
txt = f'{dataset_name}, ' \
f'{avg_auc:.4f}, {std_auc:.4f}, ' \
f'{avg_ap:.4f}, {std_ap:.4f}, ' \
f'{avg_f1:.4f}, {std_f1:.4f}, ' \
f'{avg_time1:.1f}/{avg_time2:.1f}, {args.model}, {str(model_configs)}'
print(txt, file=f)
print(txt)
f.close()