File: test_evaluate_mismatch.py

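"""Unit tests for target-failure handling in the azure.ai.evaluation evaluate pipeline.

These tests cover how _preprocess_data routes batch-run data (a temporary JSONL file for
ProxyClient vs. an in-memory DataFrame for other clients) and builds column mappings when
a target function fails for some rows, and how _run_callable_evaluators cleans up the
temporary files it created.
"""
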
import json
import math
import os
import pathlib
import tempfile
import pytest
import pandas as pd
from unittest.mock import Mock, patch, mock_open, MagicMock
from pandas.testing import assert_frame_equal

from azure.ai.evaluation import evaluate, F1ScoreEvaluator
from azure.ai.evaluation._evaluate._evaluate import (
    _preprocess_data,
    _run_callable_evaluators,
    __ValidatedData,  # Keep double underscore
)
from azure.ai.evaluation._evaluate._batch_run import ProxyClient, CodeClient, RunSubmitterClient
from azure.ai.evaluation._constants import Prefixes
from azure.ai.evaluation._exceptions import EvaluationException

# Create alias to avoid name mangling issues in class scope
ValidatedData = __ValidatedData


def _get_file(name):
    """Get the file from the unittest data folder."""
    data_path = os.path.join(pathlib.Path(__file__).parent.resolve(), "data")
    return os.path.join(data_path, name)


def _target_with_failures(query):
    """A target function that fails for certain inputs."""
    if "LV-426" in query:
        raise Exception("Target failure for LV-426")
    if "central heating" in query:
        raise Exception("Target failure for central heating")
    return {"response": f"Response to: {query}"}


def _successful_target(query):
    """A target function that always succeeds."""
    return {"response": f"Response to: {query}"}


def _simple_evaluator(query, response):
    """A simple evaluator for testing."""
    return {"score": len(response) if response else 0}


@pytest.fixture
def sample_questions_file():
    """Create a temporary test file with sample questions."""
    test_data = [
        {"query": "How long is flight from Earth to LV-426?"},
        {"query": "Why there is no central heating on the street?"},
        {"query": "Why these questions are so strange?"},
        {"query": "What is the weather like today?"},
    ]

    temp_file = tempfile.NamedTemporaryFile(mode="w", suffix=".jsonl", delete=False)
    for item in test_data:
        temp_file.write(json.dumps(item) + "\n")
    temp_file.close()

    yield temp_file.name

    # Cleanup
    if os.path.exists(temp_file.name):
        os.unlink(temp_file.name)


@pytest.fixture
def sample_dataframe_with_target_outputs():
    """Create a sample dataframe with target outputs including failures."""
    return pd.DataFrame(
        {
            "query": [
                "How long is flight from Earth to LV-426?",
                "Why there is no central heating on the street?",
                "Why these questions are so strange?",
                "What is the weather like today?",
            ],
            "__outputs.response": [
                None,  # Failed
                None,  # Failed
                "Response to: Why these questions are so strange?",  # Success
                "Response to: What is the weather like today?",  # Success
            ],
            "line_number": [0, 1, 2, 3],
        }
    )


@pytest.mark.unittest
class TestTargetFailureHandling:
    """Test cases for target failure handling functionality."""

    @patch("azure.ai.evaluation._evaluate._evaluate._apply_target_to_data")
    @patch("azure.ai.evaluation._evaluate._evaluate._validate_and_load_data")
    def test_preprocess_data_creates_temp_file_for_proxy_client_with_target_failures(
        self, mock_load_data, mock_apply_target, sample_dataframe_with_target_outputs
    ):
        """Test that _preprocess_data creates a temporary file for ProxyClient when target has failures."""
        # Setup mocks
        mock_load_data.return_value = pd.DataFrame({"query": ["test"]})
        mock_apply_target.return_value = (sample_dataframe_with_target_outputs, {"response"}, Mock())

        # Test data
        evaluators_and_graders = {"test_eval": _simple_evaluator}

        with patch("tempfile.NamedTemporaryFile") as mock_temp_file:
            mock_file = Mock()
            mock_file.name = "/tmp/test_temp_file.jsonl"
            mock_file.__enter__ = Mock(return_value=mock_file)
            mock_file.__exit__ = Mock(return_value=None)
            mock_temp_file.return_value = mock_file

            with patch("json.dumps") as mock_json_dumps:
                mock_json_dumps.return_value = '{"test": "data"}'

                result = _preprocess_data(
                    data="/test/path.jsonl",
                    evaluators_and_graders=evaluators_and_graders,
                    target=_target_with_failures,
                    _use_pf_client=True,
                )

                # Verify temp file was created
                mock_temp_file.assert_called_once()

                # Verify batch_run_data points to temp file
                assert result["batch_run_data"] == "/tmp/test_temp_file.jsonl"

                # Verify target_run is None (we don't use previous run)
                assert result["target_run"] is None

                # Verify column mapping uses data references instead of run outputs
                assert "response" in result["column_mapping"]["default"]
                assert result["column_mapping"]["default"]["response"] == "${data.__outputs.response}"

    @patch("azure.ai.evaluation._evaluate._evaluate._apply_target_to_data")
    @patch("azure.ai.evaluation._evaluate._evaluate._validate_and_load_data")
    def test_preprocess_data_uses_dataframe_for_non_proxy_clients_with_target_failures(
        self, mock_load_data, mock_apply_target, sample_dataframe_with_target_outputs
    ):
        """Test that _preprocess_data uses dataframe for non-ProxyClient when target has failures."""
        # Setup mocks
        mock_load_data.return_value = pd.DataFrame({"query": ["test"]})
        mock_apply_target.return_value = (sample_dataframe_with_target_outputs, {"response"}, Mock())

        # Test data
        evaluators_and_graders = {"test_eval": _simple_evaluator}

        result = _preprocess_data(
            data="/test/path.jsonl",
            evaluators_and_graders=evaluators_and_graders,
            target=_target_with_failures,
            _use_run_submitter_client=True,
        )

        # Verify batch_run_data is the dataframe
        assert isinstance(result["batch_run_data"], pd.DataFrame)
        assert_frame_equal(result["batch_run_data"], sample_dataframe_with_target_outputs)

        # Verify column mapping uses data references
        assert "response" in result["column_mapping"]["default"]
        assert result["column_mapping"]["default"]["response"] == "${data.__outputs.response}"

    @patch("azure.ai.evaluation._evaluate._evaluate.json.dumps")
    @patch("azure.ai.evaluation._evaluate._evaluate.pd.isna")
    def test_temp_file_creation_handles_nan_values(
        self, mock_isna, mock_json_dumps, sample_dataframe_with_target_outputs
    ):
        """Test that NaN values are properly converted to None in temp file creation."""
        # Setup mocks - simulate NaN detection
        mock_isna.side_effect = lambda x: x is None
        mock_json_dumps.return_value = '{"test": "data"}'

        with patch("tempfile.NamedTemporaryFile") as mock_temp_file:
            mock_file = Mock()
            mock_file.name = "/tmp/test.jsonl"
            mock_file.write = Mock()
            mock_file.close = Mock()
            mock_temp_file.return_value = mock_file

            with patch("azure.ai.evaluation._evaluate._evaluate._apply_target_to_data") as mock_apply_target:
                with patch("azure.ai.evaluation._evaluate._evaluate._validate_and_load_data") as mock_load_data:
                    mock_load_data.return_value = pd.DataFrame({"query": ["test"]})
                    mock_apply_target.return_value = (sample_dataframe_with_target_outputs, {"response"}, Mock())

                    _preprocess_data(
                        data="/test/path.jsonl",
                        evaluators_and_graders={"test_eval": _simple_evaluator},
                        target=_target_with_failures,
                        _use_pf_client=True,
                    )

                    # Verify json.dumps was called (temp file creation happened)
                    assert mock_json_dumps.call_count > 0

    def test_temp_file_cleanup_on_exception(self):
        """Test that temporary files are cleaned up when exceptions occur."""
        with patch("tempfile.NamedTemporaryFile") as mock_temp_file:
            mock_file = Mock()
            mock_file.name = "/tmp/test_temp_file.jsonl"
            mock_temp_file.return_value = mock_file

            with patch("os.path.exists") as mock_exists:
                with patch("os.unlink") as mock_unlink:
                    mock_exists.return_value = True

                    with patch("azure.ai.evaluation._evaluate._evaluate._apply_target_to_data") as mock_apply_target:
                        with patch("azure.ai.evaluation._evaluate._evaluate._validate_and_load_data") as mock_load_data:
                            mock_load_data.return_value = pd.DataFrame({"query": ["test"]})
                            mock_apply_target.return_value = (
                                pd.DataFrame({"query": ["test"], "__outputs.response": ["response"]}),
                                {"response"},
                                Mock(),
                            )

                            # Mock json.dumps to raise an exception
                            with patch("json.dumps", side_effect=Exception("JSON error")):
                                with pytest.raises(Exception):
                                    _preprocess_data(
                                        data="/test/path.jsonl",
                                        evaluators_and_graders={"test_eval": _simple_evaluator},
                                        target=_target_with_failures,
                                        _use_pf_client=True,
                                    )

                                # Verify cleanup was attempted
                                mock_unlink.assert_called_once_with("/tmp/test_temp_file.jsonl")

    @patch("azure.ai.evaluation._evaluate._evaluate.EvalRunContext")
    def test_run_callable_evaluators_temp_file_cleanup(self, mock_eval_context):
        """Test that _run_callable_evaluators cleans up temporary files."""
        # Create mock validated data with temp file
        temp_file_path = "/tmp/test_eval_temp.jsonl"
        validated_data = ValidatedData(
            evaluators={"test_eval": _simple_evaluator},
            graders={},
            input_data_df=pd.DataFrame({"query": ["test"], "__outputs.response": ["response"]}),
            column_mapping={"default": {"response": "${data.__outputs.response}"}},
            target_run=None,
            batch_run_client=Mock(spec=ProxyClient),
            batch_run_data=temp_file_path,
        )

        # Mock the batch client run methods
        mock_run = Mock()
        validated_data["batch_run_client"].run.return_value = mock_run
        validated_data["batch_run_client"].get_details.return_value = pd.DataFrame({"outputs.test_eval.score": [10]})
        validated_data["batch_run_client"].get_metrics.return_value = {}
        validated_data["batch_run_client"].get_run_summary.return_value = {"failed_lines": 0, "status": "Completed"}

        with patch("tempfile.gettempdir", return_value="/tmp"):
            with patch("os.path.exists") as mock_exists:
                with patch("os.unlink") as mock_unlink:
                    mock_exists.return_value = True

                    # Run the function
                    _run_callable_evaluators(validated_data)

                    # Verify cleanup was called
                    mock_unlink.assert_called_once_with(temp_file_path)

    @patch("azure.ai.evaluation._evaluate._evaluate.EvalRunContext")
    def test_run_callable_evaluators_no_cleanup_for_non_temp_files(self, mock_eval_context):
        """Test that _run_callable_evaluators doesn't clean up non-temp files."""
        # Create mock validated data with regular file (not in temp directory)
        regular_file_path = "/data/test_eval.jsonl"
        validated_data = ValidatedData(
            evaluators={"test_eval": _simple_evaluator},
            graders={},
            input_data_df=pd.DataFrame({"query": ["test"], "__outputs.response": ["response"]}),
            column_mapping={"default": {"response": "${data.__outputs.response}"}},
            target_run=None,
            batch_run_client=Mock(spec=ProxyClient),
            batch_run_data=regular_file_path,
        )

        # Mock the batch client run methods
        mock_run = Mock()
        validated_data["batch_run_client"].run.return_value = mock_run
        validated_data["batch_run_client"].get_details.return_value = pd.DataFrame({"outputs.test_eval.score": [10]})
        validated_data["batch_run_client"].get_metrics.return_value = {}
        validated_data["batch_run_client"].get_run_summary.return_value = {"failed_lines": 0, "status": "Completed"}

        with patch("tempfile.gettempdir", return_value="/tmp"):
            with patch("os.unlink") as mock_unlink:
                # Run the function
                _run_callable_evaluators(validated_data)

                # Verify cleanup was NOT called for non-temp file
                mock_unlink.assert_not_called()

    def test_column_mapping_uses_data_reference_for_proxy_client_with_target(self):
        """Test that column mapping uses ${data.__outputs.column} for ProxyClient with target failures."""
        with patch("azure.ai.evaluation._evaluate._evaluate._apply_target_to_data") as mock_apply_target:
            with patch("azure.ai.evaluation._evaluate._evaluate._validate_and_load_data") as mock_load_data:
                mock_load_data.return_value = pd.DataFrame({"query": ["test"]})
                mock_apply_target.return_value = (
                    pd.DataFrame({"query": ["test"], "__outputs.response": ["response"]}),
                    {"response"},
                    Mock(),
                )

                with patch("tempfile.NamedTemporaryFile") as mock_temp_file:
                    mock_file = Mock()
                    mock_file.name = "/tmp/test.jsonl"
                    mock_file.close = Mock()
                    mock_temp_file.return_value = mock_file

                    with patch("json.dumps"):
                        result = _preprocess_data(
                            data="/test/path.jsonl",
                            evaluators_and_graders={"test_eval": _simple_evaluator},
                            target=_target_with_failures,
                            _use_pf_client=True,
                        )

                        # Verify column mapping uses data reference
                        assert result["column_mapping"]["default"]["response"] == "${data.__outputs.response}"

    def test_column_mapping_uses_data_reference_for_dataframe_clients_with_target(self):
        """Test that column mapping uses ${data.__outputs.column} for DataFrame clients with target."""
        with patch("azure.ai.evaluation._evaluate._evaluate._apply_target_to_data") as mock_apply_target:
            with patch("azure.ai.evaluation._evaluate._evaluate._validate_and_load_data") as mock_load_data:
                mock_load_data.return_value = pd.DataFrame({"query": ["test"]})
                mock_apply_target.return_value = (
                    pd.DataFrame({"query": ["test"], "__outputs.response": ["response"]}),
                    {"response"},
                    Mock(),
                )

                result = _preprocess_data(
                    data="/test/path.jsonl",
                    evaluators_and_graders={"test_eval": _simple_evaluator},
                    target=_target_with_failures,
                    _use_run_submitter_client=True,
                )

                # Verify column mapping uses data reference
                assert result["column_mapping"]["default"]["response"] == "${data.__outputs.response}"

    @patch("azure.ai.evaluation._evaluate._evaluate.EvalRunContext")
    def test_run_callable_evaluators_doesnt_pass_target_run_when_using_complete_dataframe(self, mock_eval_context):
        """Test that target_run is not passed when using complete dataframe with ProxyClient."""
        validated_data = ValidatedData(
            evaluators={"test_eval": _simple_evaluator},
            graders={},
            input_data_df=pd.DataFrame({"query": ["test"], "__outputs.response": ["response"]}),
            column_mapping={"default": {"response": "${data.__outputs.response}"}},
            target_run=Mock(),  # This should not be passed to run()
            batch_run_client=Mock(spec=ProxyClient),
            batch_run_data="/tmp/test_temp.jsonl",
        )

        # Mock the batch client run methods
        mock_run = Mock()
        validated_data["batch_run_client"].run.return_value = mock_run
        validated_data["batch_run_client"].get_details.return_value = pd.DataFrame({"outputs.test_eval.score": [10]})
        validated_data["batch_run_client"].get_metrics.return_value = {}
        validated_data["batch_run_client"].get_run_summary.return_value = {"failed_lines": 0, "status": "Completed"}

        with patch("tempfile.gettempdir", return_value="/tmp"):
            with patch("os.path.exists", return_value=True):
                with patch("os.unlink"):
                    _run_callable_evaluators(validated_data)

                    # Verify run was called with target_run (the original target_run should still be passed)
                    validated_data["batch_run_client"].run.assert_called_once()
                    call_args = validated_data["batch_run_client"].run.call_args
                    assert "run" in call_args[1]  # target_run should be passed in kwargs

    @patch("azure.ai.evaluation._evaluate._evaluate.LOGGER")
    def test_temp_file_cleanup_warning_on_failure(self, mock_logger):
        """Test that warnings are logged when temp file cleanup fails."""
        validated_data = ValidatedData(
            evaluators={"test_eval": _simple_evaluator},
            graders={},
            input_data_df=pd.DataFrame({"query": ["test"], "__outputs.response": ["response"]}),
            column_mapping={"default": {"response": "${data.__outputs.response}"}},
            target_run=None,
            batch_run_client=Mock(spec=ProxyClient),
            batch_run_data="/tmp/test_temp.jsonl",
        )

        # Mock the batch client run methods
        mock_run = Mock()
        validated_data["batch_run_client"].run.return_value = mock_run
        validated_data["batch_run_client"].get_details.return_value = pd.DataFrame({"outputs.test_eval.score": [10]})
        validated_data["batch_run_client"].get_metrics.return_value = {}
        validated_data["batch_run_client"].get_run_summary.return_value = {"failed_lines": 0, "status": "Completed"}

        with patch("tempfile.gettempdir", return_value="/tmp"):
            with patch("os.path.exists", return_value=True):
                with patch("os.unlink", side_effect=Exception("Cleanup failed")):
                    with patch("azure.ai.evaluation._evaluate._evaluate.EvalRunContext"):
                        _run_callable_evaluators(validated_data)

                        # Verify warning was logged
                        mock_logger.warning.assert_called_once()
                        warning_call = mock_logger.warning.call_args[0][0]
                        assert "Failed to clean up temporary file" in warning_call
                        assert "/tmp/test_temp.jsonl" in warning_call

    @patch("azure.ai.evaluation._evaluate._evaluate._validate_columns_for_evaluators")
    @patch("azure.ai.evaluation._evaluate._evaluate._apply_target_to_data")
    @patch("azure.ai.evaluation._evaluate._evaluate._validate_and_load_data")
    def test_preprocess_data_no_temp_file_without_target(
        self, mock_load_data, mock_apply_target, mock_validate_columns
    ):
        """Test that no temp file is created when there's no target function."""
        mock_load_data.return_value = pd.DataFrame({"query": ["test"], "response": ["response"]})

        with patch("tempfile.NamedTemporaryFile") as mock_temp_file:
            result = _preprocess_data(
                data="/test/path.jsonl",
                evaluators_and_graders={"test_eval": _simple_evaluator},
                target=None,  # No target
                _use_pf_client=True,
            )

            # Verify no temp file was created
            mock_temp_file.assert_not_called()

            # Verify batch_run_data is still the original file path
            assert result["batch_run_data"] == os.path.abspath("/test/path.jsonl")

    def test_temp_file_creation_path_with_proxy_client(self):
        """Test that the temp file creation path is exercised for ProxyClient."""
        with patch("azure.ai.evaluation._evaluate._evaluate._apply_target_to_data") as mock_apply_target:
            with patch("azure.ai.evaluation._evaluate._evaluate._validate_and_load_data") as mock_load_data:
                mock_load_data.return_value = pd.DataFrame({"query": ["test"]})
                mock_apply_target.return_value = (
                    pd.DataFrame({"query": ["test"], "__outputs.response": ["response"]}),
                    {"response"},
                    Mock(),
                )

                with patch("tempfile.NamedTemporaryFile") as mock_temp_file:
                    mock_file = Mock()
                    mock_file.name = "/tmp/eval_temp.jsonl"
                    mock_file.close = Mock()
                    mock_temp_file.return_value = mock_file

                    with patch("json.dumps", return_value='{"test": "data"}') as mock_json_dumps:
                        result = _preprocess_data(
                            data="/test/path.jsonl",
                            evaluators_and_graders={"test_eval": _simple_evaluator},
                            target=_target_with_failures,
                            _use_pf_client=True,
                        )

                        # Verify that temp file was created and used
                        mock_temp_file.assert_called_once()
                        assert result["batch_run_data"] == "/tmp/eval_temp.jsonl"
                        assert result["target_run"] is None

                        # Verify JSON serialization was called
                        assert mock_json_dumps.call_count > 0

    def test_dataframe_client_preserves_all_rows_with_failures(self):
        """Test that DataFrame-based clients preserve all rows including failures."""
        sample_df = pd.DataFrame(
            {
                "query": ["test1", "test2", "test3"],
                "__outputs.response": [None, "response2", None],  # Mixed success/failure
            }
        )

        with patch("azure.ai.evaluation._evaluate._evaluate._apply_target_to_data") as mock_apply_target:
            with patch("azure.ai.evaluation._evaluate._evaluate._validate_and_load_data") as mock_load_data:
                mock_load_data.return_value = pd.DataFrame({"query": ["test1", "test2", "test3"]})
                mock_apply_target.return_value = (sample_df, {"response"}, Mock())

                result = _preprocess_data(
                    data="/test/path.jsonl",
                    evaluators_and_graders={"test_eval": _simple_evaluator},
                    target=_target_with_failures,
                    _use_run_submitter_client=True,
                )

                # Verify all rows are preserved in batch_run_data
                assert isinstance(result["batch_run_data"], pd.DataFrame)
                assert len(result["batch_run_data"]) == 3
                assert_frame_equal(result["batch_run_data"], sample_df)