Source code for agent_inspect.models.metrics.agent_data_sample
from dataclasses import dataclass
from typing import Optional, List, Union, Any
[docs]
@dataclass
class SubGoal:
    """
    A natural-language assertion defining a success criterion for an agent within a larger task.

    Subgoals specify intermediate success criteria and can also include the final goal.
    Examples:
        "Agent should call search_messages after getting the current timestamp"
        "Agent should inform the user: Your oldest message says 'Hey kid, you want some GPU?'."
    """
    details: str
    """
    Details containing the information and criteria used by the metric to determine when the agent has achieved this subgoal.
    """
    type: Optional[str] = None
    """
    Represents the type of subgoal (e.g. grading notes).
    """
    # Fixed from the malformed `Optional[Union[int|str]]` (a one-argument Union
    # wrapping a PEP 604 union); now matches ExpectedToolCall.turn semantics.
    turn: Optional[Union[int, str]] = None
    """
    Represents which turn(s) this subgoal should be considered for. Optional; currently a placeholder for future implementation.
    """
[docs]
@dataclass
class ToolInputParameter:
    """
    A single parameter the agent is expected to pass when invoking a tool.
    """
    name: str
    """
    Expected name of the parameter (the variable name).
    """
    value: Optional[Any] = None
    """
    Expected value of the parameter at the moment the tool is invoked; converted to str during metric calculation.
    """
    check: Optional[str] = None
    """
    LLM prompt used to judge the correctness of the parameter name and value the agent supplies to the tool.
    """
[docs]
@dataclass
class ToolOutput:
    """
    The output expected from a tool once the agent has invoked it.
    """
    value: Optional[Any] = None
    """
    Expected output value of the tool invocation; converted to str during metric calculation.
    """
    check: Optional[str] = None
    """
    LLM prompt used to judge the correctness of the tool's output value.
    """
[docs]
@dataclass
class ExpectedToolCall:
    """
    The correct tool invocation an agent is expected to make for a task.

    Serves as the ground-truth tool call for evaluations.
    """
    tool: str
    """
    The tool the agent should call or use at evaluation time: a name, a description of the tool, the URL of an API call, etc.
    """
    expected_parameters: Optional[List[ToolInputParameter]] = None
    """
    Parameters the agent should pass to the tool at evaluation time.
    """
    expected_output: Optional[ToolOutput] = None
    """
    Output expected from the tool once the agent has invoked it.
    """
    turn: Optional[int | str] = None
    """
    Which turn(s) this tool call applies to. Optional; currently a placeholder for future implementation.
    """
[docs]
@dataclass
class Conversation:
    """
    One back-and-forth exchange (a single turn) between a user and an agent:
    an input message plus an optional expected response.
    """
    turn_id: int
    """
    Sequence number of this exchange within the list of conversations.
    """
    message: str
    """
    The message sent to the agent as input.
    """
    expected_response: Optional[str] = None
    """
    The agent's expected reply to the input message; may be None when evaluating with a user proxy.
    """
[docs]
@dataclass
class EvaluationSample:
    """
    A single item in the evaluation dataset.
    """
    sub_goals: List[SubGoal]
    """
    Subgoals the agent should achieve during an evaluation run.
    """
    id: Optional[int] = None
    """
    Unique identifier for this evaluation sample.
    """
    expected_tool_calls: Optional[List[ExpectedToolCall]] = None
    """
    Tools the agent is expected to call or use during an evaluation run (e.g. API calls, calculator, web search).
    """
    conversation: Optional[List[Conversation]] = None
    """
    Exchanges between the agent and a user/user proxy. Ordering matters: the metric relies on it to know which turn follows which.
    """
    user_instruction: Optional[str] = None
    """
    Instruction(s) telling the user proxy how to behave/respond when communicating with the agent during an evaluation run.
    """