File: sample_transcribe_with_diarization.py

package info (click to toggle)
python-azure 20251202%2Bgit-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 786,344 kB
  • sloc: python: 6,510,493; ansic: 804; javascript: 287; sh: 204; makefile: 198; xml: 109
file content (77 lines) | stat: -rw-r--r-- 2,857 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
# coding=utf-8
# --------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------

"""
FILE: sample_transcribe_with_diarization.py

DESCRIPTION:
    This sample demonstrates how to transcribe an audio file with speaker diarization
    (speaker separation) using the Azure AI Transcription client. This identifies
    different speakers in the audio.

USAGE:
    python sample_transcribe_with_diarization.py

    Set the environment variables with your own values before running the sample:
    1) AZURE_SPEECH_ENDPOINT - the endpoint to your Speech resource.
    2) AZURE_SPEECH_API_KEY - your Speech API key.
"""

import os


def sample_transcribe_with_diarization():
    """Transcribe a multi-speaker audio file and print per-speaker phrases.

    Reads the Speech endpoint and API key from the AZURE_SPEECH_ENDPOINT and
    AZURE_SPEECH_API_KEY environment variables, sends ``assets/audio.wav`` to
    the service with diarization enabled, and prints each phrase tagged with
    the speaker the service identified.
    """
    # [START transcribe_with_diarization]
    from azure.core.credentials import AzureKeyCredential
    from azure.ai.transcription import TranscriptionClient
    from azure.ai.transcription.models import (
        TranscriptionContent,
        TranscriptionOptions,
        TranscriptionDiarizationOptions,
    )

    # Pull the Speech resource endpoint and key from the environment.
    speech_endpoint = os.environ["AZURE_SPEECH_ENDPOINT"]
    speech_key = os.environ["AZURE_SPEECH_API_KEY"]

    # Client used to submit transcription requests to the service.
    client = TranscriptionClient(endpoint=speech_endpoint, credential=AzureKeyCredential(speech_key))

    # Locate the multi-speaker sample audio shipped next to this script.
    import pathlib

    audio_path = pathlib.Path(__file__).parent / "assets" / "audio.wav"

    with open(audio_path, "rb") as audio_stream:
        # max_speakers is a hint for the expected speaker count (2-35).
        diarization = TranscriptionDiarizationOptions(max_speakers=5)

        # Request an en-US transcription with speaker separation enabled.
        transcription_options = TranscriptionOptions(locales=["en-US"], diarization_options=diarization)

        # Bundle the options and the raw audio into a single request payload.
        payload = TranscriptionContent(definition=transcription_options, audio=audio_stream)

        # Submit the audio and wait for the transcription result.
        result = client.transcribe(payload)

        # Show each phrase tagged with the speaker the service identified.
        print("Transcription with speaker diarization:\n")
        if not result.phrases:
            print(f"Full transcription: {result.combined_phrases[0].text}")
        else:
            for phrase in result.phrases:
                speaker = "Unknown" if phrase.speaker is None else phrase.speaker
                print(f"Speaker {speaker} [{phrase.offset_milliseconds}ms]: {phrase.text}")
    # [END transcribe_with_diarization]


if __name__ == "__main__":
    # Run the sample only when executed directly, not when imported.
    sample_transcribe_with_diarization()