from abc import ABC, abstractmethod
from typing import Any, Dict, Optional, List
from agent_inspect.models.metrics.metric_score import NumericalScore
class MultiSampleMetric(ABC):
"""
Base abstract class for metrics that aggregate results across multiple samples
or trials.
Concrete subclasses should implement logic that combines multiple
``NumericalScore`` objects into a single aggregated score.
:param config: Optional configuration dictionary for metric initialization.
Defaults to ``None``.
"""
    def __init__(self, config: Optional[Dict[str, Any]] = None):
        self.config = config or {}

    @abstractmethod
    def compute(
        self,
        scorer_results: List[NumericalScore],
    ) -> NumericalScore:
"""
Computes an aggregated metric score from multiple scorer results.
This method is intended to be implemented by concrete subclasses that define
how multiple trial-level or sample-level ``NumericalScore`` objects should be
combined (for example, pass@k-style metrics).
:param scorer_results: A list of
:obj:`~agent_inspect.models.metrics.metric_score.NumericalScore` objects
produced by scorer metrics, one per trial or sample.
:return: A
:obj:`~agent_inspect.models.metrics.metric_score.NumericalScore` object
containing the aggregated result.
"""
...
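

# The docstring above names pass@k-style metrics as one way to combine
# trial-level scores. The sketch below is purely illustrative and not part of
# this module: it assumes ``NumericalScore`` exposes a ``value`` attribute and
# accepts ``value=`` in its constructor, that a value of 1.0 or above marks a
# passing trial, and that ``k`` is read from a hypothetical ``"k"`` config key.
import math


class PassAtKMetric(MultiSampleMetric):
    """Illustrative pass@k aggregation over per-trial scores."""

    def compute(self, scorer_results: List[NumericalScore]) -> NumericalScore:
        k = self.config.get("k", 1)
        n = len(scorer_results)
        if n < k:
            raise ValueError(f"Need at least k={k} trials, got {n}.")
        # Trials counted as passing (assumption: value >= 1.0 means pass).
        c = sum(1 for s in scorer_results if s.value >= 1.0)
        # Unbiased pass@k estimator: 1 - C(n - c, k) / C(n, k).
        # math.comb returns 0 when n - c < k, which correctly yields 1.0.
        estimate = 1.0 - math.comb(n - c, k) / math.comb(n, k)
        return NumericalScore(value=estimate)


# Usage sketch:
#   metric = PassAtKMetric(config={"k": 5})
#   aggregated = metric.compute(trial_scores)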