Source code for agent_inspect.metrics.utils.metrics_utils

import re
from typing import Any, Dict, List

from agent_inspect.exception.error_codes import ErrorCode
from agent_inspect.exception import EvaluationError
from agent_inspect.metrics.constants import DEFAULT_GRADE_PATTERN, STATUS_200



[docs]
def get_majority_voted_score(score_to_vote_count: Dict[Any, int]):
    """Return the score that collected the most votes.

    Args:
        score_to_vote_count: Mapping from a score to the number of votes it
            received.

    Returns:
        The key whose vote count is the largest. When several scores share
        the largest count, the one that comes first in the mapping's
        iteration order is returned.

    Raises:
        ValueError: If the mapping is empty (raised by ``max``).
    """
    winning_score, _ = max(score_to_vote_count.items(), key=lambda entry: entry[1])
    return winning_score



[docs]
def get_config_or_default(config: Dict[str, Any], config_key: str, default: Any):
    """Look up ``config_key`` in ``config``, falling back to ``default``.

    Args:
        config: Configuration mapping; a falsy value (``None`` or an empty
            mapping) is treated as "no configuration".
        config_key: Key to look up.
        default: Value returned when the key is not available.

    Returns:
        ``config[config_key]`` when the key is present (even if the stored
        value is ``None``), otherwise ``default``.
    """
    key_is_configured = bool(config) and config_key in config
    return config[config_key] if key_is_configured else default



[docs]
def match_to_int(completion):
    """Convert the judge grade found in ``completion`` into ``1`` or ``0``.

    The text is searched with ``DEFAULT_GRADE_PATTERN``; the first capture
    group of the match is the grade letter.

    Args:
        completion: Judge output text to search.

    Returns:
        ``1`` when the captured grade is ``"C"``, ``0`` when it is ``"I"``.

    Raises:
        EvaluationError: If the pattern does not match, or the captured grade
            is neither ``"C"`` nor ``"I"``.
    """
    found = re.search(DEFAULT_GRADE_PATTERN, completion)
    if found is None:
        raise EvaluationError(
            internal_code=ErrorCode.INVALID_JUDGE_RESPONSE_FORMAT_ERROR.value,
            message=f"Could not find the judge grade from the completion: {completion}",
        )

    grade_by_letter = {"C": 1, "I": 0}
    letter = found.group(1)
    if letter not in grade_by_letter:
        raise EvaluationError(
            internal_code=ErrorCode.INVALID_JUDGE_RESPONSE_FORMAT_ERROR.value,
            message=f"Invalid judge grade from the completion: {completion}",
        )
    return grade_by_letter[letter]



[docs]
def map_subgoal_validations_to_binary_matrix(completions: List[str]) -> List[int]:
    """Convert judge completions into a list of 0/1 grades.

    Args:
        completions: Judge output texts, each parsed with ``match_to_int``.

    Returns:
        The grades of the completions that could be parsed, in input order.
        Completions for which ``match_to_int`` raises ``EvaluationError`` are
        dropped, so the result may be shorter than ``completions``.
    """
    grades = []
    for text in completions:
        try:
            grade = match_to_int(text)
        except EvaluationError:
            # TODO: assume the completion includes the specific matching pattern
            # Invalid responses are skipped rather than recorded.
            continue
        grades.append(grade)
    return grades



[docs]
def tally_votes(complete_cnt, incomplete_cnt, invalid_cnt, completions):
    """Add the grades parsed from ``completions`` to the running counters.

    Args:
        complete_cnt: Running count of grades equal to ``1``.
        incomplete_cnt: Running count of grades equal to ``0``.
        invalid_cnt: Running count of completions that could not be graded.
        completions: Judge output texts, each parsed with ``match_to_int``.

    Returns:
        The updated ``(complete_cnt, incomplete_cnt, invalid_cnt)`` tuple.
    """
    for text in completions:
        try:
            grade = match_to_int(text)
        except EvaluationError:
            # An unparseable judge response counts as an invalid vote.
            invalid_cnt += 1
            continue

        if grade == 1:
            complete_cnt += 1
        elif grade == 0:
            incomplete_cnt += 1
    return complete_cnt, incomplete_cnt, invalid_cnt



[docs]
def tally_judge_voting(complete_cnt, incomplete_cnt, invalid_cnt, judge_responses):
    """Tally judge responses into complete / incomplete / invalid counters.

    A response whose ``status`` differs from ``STATUS_200`` is counted as
    invalid straight away. The ``completion`` of every other response is
    handed to ``tally_votes`` for grading.

    Args:
        complete_cnt: Running count of complete votes.
        incomplete_cnt: Running count of incomplete votes.
        invalid_cnt: Running count of invalid votes.
        judge_responses: Iterable of objects exposing ``status`` and
            ``completion`` attributes.

    Returns:
        The updated ``(complete_cnt, incomplete_cnt, invalid_cnt)`` tuple.
    """
    gradable_completions = []
    for response in judge_responses:
        if response.status != STATUS_200:
            invalid_cnt += 1
            continue
        gradable_completions.append(response.completion)

    return tally_votes(complete_cnt, incomplete_cnt, invalid_cnt, gradable_completions)



[docs]
def validate_inputs_for_pass_k_initialisation(k_value: int, num_trials: int):
    """Validate ``k_value`` and ``num_trials`` before a pass-k metric is set up.

    The checks run in a fixed order and the first failing one raises.

    Args:
        k_value: The ``k`` to validate; must be provided, greater than 0 and
            not greater than ``num_trials``.
        num_trials: Number of trials; must be provided and greater than 0.

    Returns:
        None. The function only raises on invalid input.

    Raises:
        EvaluationError: With ``ErrorCode.INVALID_VALUE`` when either value is
            missing, non-positive, or when ``k_value`` exceeds ``num_trials``.
    """
    # A falsy num_trials (None or 0) is reported as "not provided" before any
    # numeric comparison is attempted.
    if not num_trials:
        raise EvaluationError(ErrorCode.INVALID_VALUE.value, "num_trials is invalid and must be provided.")

    # Mirror the num_trials check: without it a missing k_value would crash the
    # comparison below with a bare TypeError instead of an EvaluationError.
    if k_value is None:
        raise EvaluationError(ErrorCode.INVALID_VALUE.value, "k_value is invalid and must be provided.")

    if k_value <= 0:
        raise EvaluationError(ErrorCode.INVALID_VALUE.value, f"k_value ({k_value}) must be greater than 0")

    if num_trials <= 0:
        raise EvaluationError(ErrorCode.INVALID_VALUE.value, f"num_trials ({num_trials}) must be greater than 0")

    if k_value > num_trials:
        raise EvaluationError(ErrorCode.INVALID_VALUE.value, f"k_value ({k_value}) cannot be greater than num_trials ({num_trials})")