File: test_analyze_text_recognize_pii_redaction_policies.py

package info (click to toggle)
python-azure 20251118%2Bgit-1
links: PTS, VCS
area: main
in suites: sid
size: 783,356 kB
sloc: python: 6,474,533; ansic: 804; javascript: 287; sh: 205; makefile: 198; xml: 109
file content (106 lines) | stat: -rw-r--r-- 4,114 bytes
import functools

from devtools_testutils import AzureRecordedTestCase, EnvironmentVariableLoader, recorded_by_proxy
from azure.core.credentials import AzureKeyCredential
from azure.ai.textanalytics import TextAnalysisClient
from azure.ai.textanalytics.models import (
    MultiLanguageTextInput,
    MultiLanguageInput,
    TextPiiEntitiesRecognitionInput,
    AnalyzeTextPiiResult,
    PiiActionContent,
    EntityMaskPolicyType,
    CharacterMaskPolicyType,
    SyntheticReplacementPolicyType,
)

# Decorator factory used as ``@TextAnalysisPreparer()`` on the tests below.
# It is ``EnvironmentVariableLoader`` pre-bound with the "text_analysis" name and
# two keyword values; the test methods receive arguments with these same names.
# NOTE(review): the values look like sanitized placeholders for recorded
# playback -- confirm against the devtools_testutils documentation.
TextAnalysisPreparer = functools.partial(
    EnvironmentVariableLoader,
    "text_analysis",
    **{
        "text_analysis_endpoint": "https://Sanitized.azure-api.net/",
        "text_analysis_key": "fake_key",
    },
)


class TestTextAnalysis(AzureRecordedTestCase):
    """Base test case providing a client factory for the text analysis tests."""

    def create_client(self, endpoint: str, key: str) -> TextAnalysisClient:
        """Return a ``TextAnalysisClient`` for *endpoint* using key authentication.

        :param endpoint: Service endpoint passed to the client as its first argument.
        :param key: API key, wrapped in an ``AzureKeyCredential``.
        :returns: A newly constructed ``TextAnalysisClient``.
        """
        credential = AzureKeyCredential(key)
        return TextAnalysisClient(endpoint, credential)


class TestTextAnalysisCase(TestTextAnalysis):
    """PII recognition test that combines several redaction policies in one request."""

    @TextAnalysisPreparer()
    @recorded_by_proxy
    def test_analyze_text_recognize_pii_redaction_policies(self, text_analysis_endpoint, text_analysis_key):
        """Analyze two identical documents and verify how each PII entity was redacted.

        The request carries three redaction policies: a default entity mask, a
        character mask restricted to ``USSocialSecurityNumber``, and a synthetic
        replacement restricted to ``Person`` and ``Email``. For every returned
        document the test checks that the raw PII values are absent from the
        redacted text, that exactly the three expected categories were detected,
        and that each entity carries a mask consistent with its policy.

        :param text_analysis_endpoint: Endpoint supplied by ``TextAnalysisPreparer``.
        :param text_analysis_key: API key supplied by ``TextAnalysisPreparer``.
        """
        client = self.create_client(text_analysis_endpoint, text_analysis_key)

        # Both documents share the same text; only their ids differ.
        sample_text = "My name is John Doe. My ssn is 123-45-6789. My email is john@example.com.."
        text_input = MultiLanguageTextInput(
            multi_language_inputs=[
                MultiLanguageInput(id=doc_id, text=sample_text, language="en") for doc_id in ("1", "2")
            ]
        )

        # Redaction policies, in the order they are sent: default, SSN, synthetic.
        redaction_policies = [
            EntityMaskPolicyType(policy_name="defaultPolicy", is_default=True),
            CharacterMaskPolicyType(
                policy_name="customMaskForSSN",
                unmask_length=4,
                unmask_from_end=False,
                entity_types=["USSocialSecurityNumber"],
            ),
            SyntheticReplacementPolicyType(
                policy_name="syntheticMaskForPerson",
                entity_types=["Person", "Email"],
            ),
        ]

        request = TextPiiEntitiesRecognitionInput(
            text_input=text_input,
            action_content=PiiActionContent(pii_categories=["All"], redaction_policies=redaction_policies),
        )

        # Single direct call (the original marks this as the sync, non-LRO path).
        response = client.analyze_text(body=request)

        # Shape of the response
        assert response is not None
        assert isinstance(response, AnalyzeTextPiiResult)
        assert response.results is not None
        analyzed_docs = response.results.documents
        assert analyzed_docs is not None
        assert len(analyzed_docs) == 2

        raw_pii_values = ("John Doe", "123-45-6789", "john@example.com")
        expected_categories = {"Person", "USSocialSecurityNumber", "Email"}

        for analyzed in analyzed_docs:
            # The redacted text must exist and contain none of the raw PII values.
            redacted_text = analyzed.redacted_text
            assert redacted_text is not None
            for raw_value in raw_pii_values:
                assert raw_value not in redacted_text

            # Exactly three entities, one per expected category; the two asserts
            # together make the category -> entity mapping lossless.
            assert len(analyzed.entities) == 3
            entity_by_category = {entity.category: entity for entity in analyzed.entities}
            assert set(entity_by_category) == expected_categories

            # Person: covered by the synthetic replacement policy, so the mask differs.
            person_entity = entity_by_category["Person"]
            assert person_entity.mask is not None
            assert person_entity.mask != person_entity.text

            # SSN: covered by the character mask policy, so asterisks are expected.
            ssn_entity = entity_by_category["USSocialSecurityNumber"]
            assert ssn_entity.mask is not None
            assert "*" in ssn_entity.mask

            # Email: covered by the synthetic replacement policy, so the mask differs.
            email_entity = entity_by_category["Email"]
            assert email_entity.mask is not None
            assert email_entity.mask != email_entity.text