Source code for agent_inspect.models.metrics.agent_data_sample

from dataclasses import dataclass
from typing import Optional, List, Union, Any



[docs]
@dataclass
class SubGoal:
    """
    A natural language assertion that defines a success criterion for an agent within a larger task.

    Subgoals specify intermediate success criteria and can also include the final goal.

    Examples:
        "Agent should call search_messages after getting the current timestamp"
        "Agent should inform the user: Your oldest message says 'Hey kid, you want some GPU?'."
    """

    details: str
    """
    The information and criteria used by the metric to determine when the agent has achieved this subgoal.
    """
    type: Optional[str] = None
    """
    The type of subgoal (e.g. grading notes).
    """
    # Spelled with typing.Union (not ``int | str``) so the annotation can be
    # evaluated at class-definition time on interpreters older than 3.10.
    turn: Optional[Union[int, str]] = None
    """
    The turn(s) in which this subgoal should be considered. Optional; this is only a placeholder for future implementation.
    """




[docs]
@dataclass
class ToolInputParameter:
    """
    Describes one parameter the agent is expected to pass when it invokes a tool.
    """

    name: str
    """
    Expected name of the parameter (the variable name).
    """
    value: Optional[Any] = None
    """
    Expected value of the parameter at the moment the tool is invoked. It is converted to str during metric calculation.
    """
    check: Optional[str] = None
    """
    LLM prompt used to check the correctness of the parameter name and value that the agent passes to the tool.
    """



[docs]
@dataclass
class ToolOutput:
    """
    Describes the output a tool is expected to produce once the agent has invoked it.
    """

    value: Optional[Any] = None
    """
    Expected output value of the tool after the agent invokes it. It is converted to str during metric calculation.
    """
    check: Optional[str] = None
    """
    LLM prompt used to check the correctness of the tool's output value after the agent invokes it.
    """




[docs]
@dataclass
class ExpectedToolCall:
    """
    The correct tool invocation an agent is expected to make for a particular task.

    It serves as the ground-truth tool call for the evaluations.
    """

    tool: str
    """
    The tool that should be called or utilized by the agent at the time of evaluation. Can be a name, a description of the tool, the URL of an API call, etc.
    """
    expected_parameters: Optional[List[ToolInputParameter]] = None
    """
    The parameters with which the agent should call the tool at the time of evaluation.
    """
    expected_output: Optional[ToolOutput] = None
    """
    The expected output from the tool after the agent invokes it.
    """
    # Spelled with typing.Union (not ``int | str``) so the annotation can be
    # evaluated at class-definition time on interpreters older than 3.10.
    turn: Optional[Union[int, str]] = None
    """
    The turn(s) in which this tool call should be considered. Optional; this is only a placeholder for future implementation.
    """




[docs]
@dataclass
class Conversation:
    """
    A single turn of dialogue between a user and an agent: one input message plus an optional expected response.
    """

    turn_id: int
    """
    Sequence number of this conversation within the list of conversations.
    """
    message: str
    """
    The input message sent to the agent.
    """
    expected_response: Optional[str] = None
    """
    The response expected from the agent for the input message. May be None when evaluating with a user proxy.
    """




[docs]
@dataclass
class EvaluationSample:
    """
    A single item of the evaluation dataset.
    """

    sub_goals: List[SubGoal]
    """
    The sub goals an agent should achieve during an evaluation run.
    """
    id: Optional[int] = None
    """
    Unique identifier of the evaluation sample.
    """
    expected_tool_calls: Optional[List[ExpectedToolCall]] = None
    """
    The tools an agent is expected to call or utilize during an evaluation run (e.g. API calls, calculator, web search, etc.).
    """
    conversation: Optional[List[Conversation]] = None
    """
    The conversations between an agent and a user/user proxy. The order of the conversations in this list matters: the metric relies on it to understand which one comes next.
    """
    user_instruction: Optional[str] = None
    """
    Instruction(s) telling the user proxy how it should behave/respond while communicating with the agent during an evaluation run.
    """