File: test_aoai_alignment_missing_rows.py

# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------

import logging
from typing import List
from unittest.mock import Mock, patch

import pandas as pd
import pytest

from azure.ai.evaluation._evaluate._evaluate_aoai import (
    OAIEvalRunCreationInfo,
    _get_single_run_results,
)


class MockOutputItem:
    """Minimal stand-in for a single AOAI eval run output item."""

    def __init__(self, id: str, datasource_item_id: int, results: List[dict]):
        self.id = id
        self.datasource_item_id = datasource_item_id
        self.results = results


class MockOutputItemsList:
    """Minimal stand-in for one page of the output_items list response."""

    def __init__(self, data, has_more=False):
        self.data = data
        self.has_more = has_more


@pytest.mark.unittest
def test_aoai_results_preserve_order_with_unordered_output_items(caplog):
    """AOAI output_items can arrive unordered; results should align to row ids (0..N-1)."""
    mock_client = Mock()
    expected_rows = 5
    run_info = OAIEvalRunCreationInfo(
        client=mock_client,
        eval_group_id="grp",
        eval_run_id="run",
        grader_name_map={"grader-1": "rel"},
        expected_rows=expected_rows,
    )

    # Completed run; pass_rate comes from per_testing_criteria_results
    mock_run_results = Mock()
    mock_run_results.status = "completed"
    mock_run_results.per_testing_criteria_results = [Mock(testing_criteria="grader-1", passed=4, failed=1)]

    # Unordered items: ids [3,0,4,1,2]; score equals its id for easy checks
    unordered_items = [
        MockOutputItem(id="i3", datasource_item_id=3, results=[{"name": "grader-1", "passed": True, "score": 3.0}]),
        MockOutputItem(id="i0", datasource_item_id=0, results=[{"name": "grader-1", "passed": True, "score": 0.0}]),
        MockOutputItem(id="i4", datasource_item_id=4, results=[{"name": "grader-1", "passed": False, "score": 4.0}]),
        MockOutputItem(id="i1", datasource_item_id=1, results=[{"name": "grader-1", "passed": True, "score": 1.0}]),
        MockOutputItem(id="i2", datasource_item_id=2, results=[{"name": "grader-1", "passed": True, "score": 2.0}]),
    ]
    mock_client.evals.runs.output_items.list.return_value = MockOutputItemsList(data=unordered_items, has_more=False)

    caplog.set_level(logging.WARNING, logger="azure.ai.evaluation._evaluate._evaluate_aoai")

    with patch(
        "azure.ai.evaluation._evaluate._evaluate_aoai._wait_for_run_conclusion",
        return_value=mock_run_results,
    ):
        df, metrics = _get_single_run_results(run_info)

    # Shape and index
    assert len(df) == expected_rows
    assert list(df.index) == list(range(expected_rows))

    score_col = "outputs.rel.score"
    assert score_col in df.columns

    # Each row i should have score == float(i), proving correct alignment after sort/reindex
    for i in range(expected_rows):
        assert df.loc[i, score_col] == float(i)

    # No missing-row padding in this test; the row_missing flag should not exist
    missing_flag_col = "outputs.rel.row_missing"
    assert missing_flag_col not in df.columns

    # Pass rate surfaced from per_testing_criteria_results
    assert metrics["rel.pass_rate"] == 4 / 5

    # No warning about padding missing rows in this scenario
    assert not any(
        "missing row(s) padded with NaN for alignment" in rec.message
        for rec in caplog.records
        if rec.levelno >= logging.WARNING
    )