import asyncio
from abc import ABC, abstractmethod
from typing import Any, Dict, List, Optional, Tuple

from autogen_core import CancellationToken, Component, ComponentBase
from autogen_core.models import ChatCompletionClient, UserMessage
from loguru import logger
from pydantic import BaseModel
from typing_extensions import Self

from ..datamodel.eval import EvalDimensionScore, EvalJudgeCriteria, EvalRunResult, EvalScore, EvalTask


class BaseEvalJudgeConfig(BaseModel):
    """Base configuration for evaluation judges."""

    name: str = "Base Judge"
    description: str = ""
    metadata: Dict[str, Any] = {}


class BaseEvalJudge(ABC, ComponentBase[BaseEvalJudgeConfig]):
    """Abstract base class for evaluation judges."""

    component_type = "eval_judge"

    def __init__(self, name: str = "Base Judge", description: str = "", metadata: Optional[Dict[str, Any]] = None):
        self.name = name
        self.description = description
        self.metadata = metadata or {}

    @abstractmethod
    async def judge(
        self,
        task: EvalTask,
        result: EvalRunResult,
        criteria: List[EvalJudgeCriteria],
        cancellation_token: Optional[CancellationToken] = None,
    ) -> EvalScore:
        """Judge the result of an evaluation run."""
        pass

    def _to_config(self) -> BaseEvalJudgeConfig:
        """Convert the judge configuration to a configuration object for serialization."""
        return BaseEvalJudgeConfig(name=self.name, description=self.description, metadata=self.metadata)
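
# Illustrative sketch (not part of the library): a custom judge subclasses BaseEvalJudge
# and implements `judge`. The class name below is hypothetical; the EvalScore and
# EvalDimensionScore fields mirror how they are used elsewhere in this module.
#
# class KeywordEvalJudge(BaseEvalJudge):
#     """Scores each dimension at max_value if a required keyword appears in the result, else min_value."""
#
#     async def judge(
#         self,
#         task: EvalTask,
#         result: EvalRunResult,
#         criteria: List[EvalJudgeCriteria],
#         cancellation_token: Optional[CancellationToken] = None,
#     ) -> EvalScore:
#         keyword = str(self.metadata.get("keyword", ""))
#         hit = keyword.lower() in str(result.model_dump()).lower()
#         dimension_scores = [
#             EvalDimensionScore(
#                 dimension=criterion.dimension,
#                 reason=f"Keyword {'found' if hit else 'missing'}: {keyword!r}",
#                 score=criterion.max_value if hit else criterion.min_value,
#                 max_value=criterion.max_value,
#                 min_value=criterion.min_value,
#             )
#             for criterion in criteria
#         ]
#         score = EvalScore(max_value=10.0)
#         score.dimension_scores = dimension_scores
#         valid = [ds.score for ds in dimension_scores if ds.score is not None]
#         if valid:
#             score.overall_score = sum(valid) / len(valid)
#         return score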


class LLMEvalJudgeConfig(BaseEvalJudgeConfig):
    """Configuration for LLMEvalJudge."""

    model_client: Any  # ComponentModel


class LLMEvalJudge(BaseEvalJudge, Component[LLMEvalJudgeConfig]):
    """Judge that uses an LLM to evaluate results."""

    component_config_schema = LLMEvalJudgeConfig
    component_type = "eval_judge"
    component_provider_override = "autogenstudio.eval.judges.LLMEvalJudge"

    def __init__(
        self,
        model_client: ChatCompletionClient,
        name: str = "LLM Judge",
        description: str = "Evaluates results using an LLM",
        metadata: Optional[Dict[str, Any]] = None,
    ):
        super().__init__(name, description, metadata)
        self.model_client = model_client

    async def judge(
        self,
        task: EvalTask,
        result: EvalRunResult,
        criteria: List[EvalJudgeCriteria],
        cancellation_token: Optional[CancellationToken] = None,
    ) -> EvalScore:
        """Judge the result using an LLM."""
        # Create a score object
        score = EvalScore(max_value=10.0)

        # Judge each dimension in parallel
        dimension_score_tasks = []
        for criterion in criteria:
            dimension_score_tasks.append(self._judge_dimension(task, result, criterion, cancellation_token))

        dimension_scores = await asyncio.gather(*dimension_score_tasks)
        score.dimension_scores = dimension_scores

        # Calculate overall score (average of dimension scores)
        valid_scores = [ds.score for ds in dimension_scores if ds.score is not None]
        if valid_scores:
            score.overall_score = sum(valid_scores) / len(valid_scores)

        return score

    async def _judge_dimension(
        self,
        task: EvalTask,
        result: EvalRunResult,
        criterion: EvalJudgeCriteria,
        cancellation_token: Optional[CancellationToken] = None,
    ) -> EvalDimensionScore:
        """Judge a specific dimension."""
        # Format task and result for the LLM
        task_description = self._format_task(task)
        result_description = result.model_dump()

        # Create the prompt
        prompt = f"""
        You are evaluating the quality of a system response to a task.

        Task: {task_description}

        Response: {result_description}

        Evaluation criteria: {criterion.dimension}
        {criterion.prompt}

        Score the response on a scale from {criterion.min_value} to {criterion.max_value}.
        First, provide a detailed explanation of your evaluation.
        Then, give your final score as a single number between {criterion.min_value} and {criterion.max_value}.

        Format your answer as JSON matching the EvalDimensionScore class:
        {{
            "dimension": "{criterion.dimension}",
            "reason": "<explanation>",
            "score": <score>
        }}

        Please ensure the score is a number between {criterion.min_value} and {criterion.max_value}.
        If you cannot evaluate the response, please return a score of null.
        If the response is not relevant, please return a score of 0.
        If the response is perfect, please return a score of {criterion.max_value}.
        """

        # Get judgment from LLM
        model_input = []
        text_message = UserMessage(content=prompt, source="user")
        model_input.append(text_message)

        # Run with the model client in the same format as used in runners
        model_result = await self.model_client.create(
            messages=model_input,
            cancellation_token=cancellation_token,
            json_output=EvalDimensionScore,
        )

        # Extract content from the response
        model_response = model_result.content if isinstance(model_result.content, str) else str(model_result.content)

        try:
            # Validate the response string as an EvalDimensionScore
            return EvalDimensionScore.model_validate_json(model_response)
        except Exception as e:
            logger.warning(f"Failed to parse LLM response: {e}. Raw content: {model_result.content}")
            return EvalDimensionScore(
                dimension=criterion.dimension,
                reason="Failed to parse response",
                score=0.0,
                max_value=criterion.max_value,
                min_value=criterion.min_value,
            )

    def _format_task(self, task: EvalTask) -> str:
        """Format the task for the LLM."""
        task_parts = []

        if task.description:
            task_parts.append(task.description)
        if isinstance(task.input, str):
            task_parts.append(task.input)
        elif isinstance(task.input, list):
            task_parts.append("\n".join(str(x) for x in task.input if isinstance(x, str)))

        return "\n".join(task_parts)

    def _parse_judgment(self, judgment_text: str, max_value: float) -> Tuple[str, Optional[float]]:
        """Parse judgment text to extract explanation and score."""
        explanation = ""
        score = None

        # Simple parsing - could be improved with regex
        lines = judgment_text.split("\n")
        for line in lines:
            if line.strip().lower().startswith("explanation:"):
                explanation = line.split(":", 1)[1].strip()
            elif line.strip().lower().startswith("score:"):
                try:
                    score_str = line.split(":", 1)[1].strip()
                    score = float(score_str)
                    # Ensure score is within bounds
                    score = min(max(score, 0), max_value)
                except (ValueError, IndexError):
                    pass

        return explanation, score
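
    # Illustrative example of the plain-text format _parse_judgment handles
    # (this is a fallback parser; the JSON-mode path above does not use it):
    #
    #   Explanation: The answer names the correct capital and is concise.
    #   Score: 8.5
    #
    # With max_value=10.0 this would yield
    # ("The answer names the correct capital and is concise.", 8.5).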

    def _to_config(self) -> LLMEvalJudgeConfig:
        """Convert to configuration object including model client configuration."""
        base_config = super()._to_config()
        return LLMEvalJudgeConfig(
            name=base_config.name,
            description=base_config.description,
            metadata=base_config.metadata,
            model_client=self.model_client.dump_component(),
        )

    @classmethod
    def _from_config(cls, config: LLMEvalJudgeConfig) -> Self:
        """Create from configuration object with serialized model client."""
        model_client = ChatCompletionClient.load_component(config.model_client)
        return cls(
            model_client=model_client, name=config.name, description=config.description, metadata=config.metadata
        )


# # Usage example
# async def example_usage():
#     # Create a model client
#     from autogen_ext.models.openai import OpenAIChatCompletionClient
#
#     model_client = OpenAIChatCompletionClient(
#         model="gpt-4",
#         api_key="your-api-key",
#     )
#
#     # Create a judge
#     llm_judge = LLMEvalJudge(model_client=model_client)
#
#     # Serialize the judge to a ComponentModel
#     judge_config = llm_judge.dump_component()
#     print(f"Serialized judge: {judge_config}")
#
#     # Deserialize back to an LLMEvalJudge
#     deserialized_judge = LLMEvalJudge.load_component(judge_config)
#
#     # Create criteria for evaluation
#     criteria = [
#         EvalJudgeCriteria(
#             dimension="relevance",
#             prompt="Evaluate how relevant the response is to the query.",
#             min_value=0,
#             max_value=10,
#         ),
#         EvalJudgeCriteria(
#             dimension="accuracy",
#             prompt="Evaluate the factual accuracy of the response.",
#             min_value=0,
#             max_value=10,
#         ),
#     ]
#
#     # Create a mock task and result
#     task = EvalTask(
#         id="task-123",
#         name="Sample Task",
#         description="A sample task for evaluation",
#         input="What is the capital of France?",
#     )
#
#     result = EvalRunResult(
#         status=True,
#         result={
#             "messages": [{"content": "The capital of France is Paris.", "source": "model"}]
#         },
#     )
#
#     # Run the evaluation
#     score = await deserialized_judge.judge(task, result, criteria)
#     print(f"Evaluation score: {score}")