# autogen/python/packages/agbench/benchmarks/AssistantBench/Scripts/assistantbench_evaluator.py
# This script is slightly modified from the evaluator provided by the creators of the
# AssistantBench dataset:
# https://huggingface.co/spaces/AssistantBench/leaderboard/blob/main/evaluation/evaluator.py
import json

import numpy as np

from evaluate_utils.evaluate_factory import get_evaluator


def find_isnan(samp):
    """Return True if `samp` is NaN; False for non-NaN values and non-numeric input."""
    try:
        if np.isnan(samp):
            return True
        else:
            return False
    except Exception:
        return False


def fix_ans(answer):
    """Rewrite a Python-style single-quoted dict string into JSON-parseable double quotes."""
    try:
        answer = answer.replace("{'", '{"').replace("', '", '", "').replace("': '", '": "').replace("'}", '"}')
        answer = answer.replace("': ", '": ')
        return answer
    except Exception:
        return answer


def parse_answer(answer):
    """Parse a list of gold-answer strings into a canonical value plus the name of the
    evaluator to use: "number", "json", "string", or "string list"."""
    if len(answer) == 1:
        ans, is_num = fix_number(answer[0])
        if is_num:
            return ans, "number"
        try:
            ans = json.loads(fix_ans(answer[0]))
            return [ans], "json"
        except Exception:
            ans, is_num = fix_number(answer[0])
            if is_num:
                return ans, "number"
            else:
                return answer[0], "string"
    else:
        try:
            ans = [json.loads(fix_ans(ex)) for ex in answer]
            return ans, "json"
        except Exception:
            return answer, "string list"


def fix_number(number):
    """Coerce `number` to a float where possible, stripping $/%/sqft and
    "square kilometers" units; return (value, is_number)."""
    if isinstance(number, str):
        copy_ans = number
        copy_ans = " ".join(" ".join(" ".join(copy_ans.split("$")).split("%")).split("sqft")).strip()
        copy_ans = copy_ans.replace(",", ".").replace(" square kilometers", "")
        try:
            return float(copy_ans), True
        except ValueError:
            return number, False
    elif isinstance(number, int):
        return float(number), True
    else:
        return number, True
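

# Illustrative note (an addition, not from the original script): fix_number treats
# commas as decimal separators, so fix_number("1,234") returns (1.234, True) rather
# than 1234.0, while fix_number("$3.50") strips the currency sign and returns (3.5, True).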


def fix_prediction(prediction, gold_answer, evaluator):
    """Normalize `prediction` to match the gold answer's shape; return
    (prediction, run_eval), where run_eval=False means the score is 0."""
    if (
        isinstance(prediction, list)
        and len(prediction) == 1
        and (isinstance(prediction[0], int) or (isinstance(prediction[0], str) and prediction[0].isnumeric()))
    ):
        # fix_number returns a (value, is_number) tuple, so unpack it here.
        prediction, _ = fix_number(prediction[0])

    if not isinstance(prediction, list):
        prediction, is_num = fix_number(prediction)
        if evaluator == "json":
            try:
                prediction = [json.loads(pred) for pred in prediction.split("\n")]
            except Exception:
                prediction = [prediction]

    if hasattr(type(prediction), "__len__") and len(prediction) == 0:
        return prediction, False

    if isinstance(prediction, list) and len(prediction) > 1 and isinstance(gold_answer, float):
        return prediction, False

    return prediction, True


def question_scorer(prediction, gold_answer):
    """
    Score a prediction against a gold answer.

    prediction: str or list of str
    gold_answer: str or list of str
    returns a float between 0 and 1
    """
    try:
        try:
            prediction = json.loads(prediction)
        except Exception:
            pass  # keep the raw prediction if it is not valid JSON

        answer_list = (
            [x for x in gold_answer.split("\n") if len(x.strip()) > 0]
            if not isinstance(gold_answer, list)
            else gold_answer
        )
        gold_answer, evaluator = parse_answer(answer_list)
        prediction, run_eval = fix_prediction(prediction, gold_answer, evaluator)

        # `has_ans` is kept from the upstream evaluator but is unused in this version.
        has_ans = 1.0
        if (not isinstance(prediction, float) and len(prediction) == 0) or find_isnan(prediction):
            has_ans = 0.0

        if not run_eval:
            return 0.0

        metric_eval = get_evaluator(evaluator)
        accuracy = metric_eval(prediction, gold_answer)

        # Double-check that the evaluator returned a score between 0 and 1.
        if 0 <= accuracy <= 1:
            return accuracy
        raise ValueError(f"Accuracy should be a float between 0 and 1, but got {accuracy}")
    except Exception as e:
        print(
            f"Something went wrong while evaluating prediction {prediction} vs gold answer {gold_answer} with error {e}"
        )
        return 0.0
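

if __name__ == "__main__":
    # Minimal usage sketch (an addition, not from the original script). It assumes the
    # evaluate_utils package that ships alongside this script is importable. The example
    # answers below are made up for illustration, not drawn from AssistantBench.
    examples = [
        ("42", "42"),  # single numeric answer
        ("Paris", "Paris"),  # single string answer
        ("apple\nbanana", "apple\nbanana"),  # multi-line gold answer -> string list
    ]
    for pred, gold in examples:
        print(f"question_scorer({pred!r}, {gold!r}) = {question_scorer(pred, gold)}")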