File: sample_transcribe_with_diarization.py

package info (click to toggle)
python-azure 20251202%2Bgit-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 786,344 kB
  • sloc: python: 6,510,493; ansic: 804; javascript: 287; sh: 204; makefile: 198; xml: 109
file content (77 lines) | stat: -rw-r--r-- 2,857 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
# coding=utf-8
# --------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------

"""
FILE: sample_transcribe_with_diarization.py

DESCRIPTION:
    This sample demonstrates how to transcribe an audio file with speaker diarization
    (speaker separation) using the Azure AI Transcription client. This identifies
    different speakers in the audio.

USAGE:
    python sample_transcribe_with_diarization.py

    Set the environment variables with your own values before running the sample:
    1) AZURE_SPEECH_ENDPOINT - the endpoint to your Speech resource.
    2) AZURE_SPEECH_API_KEY - your Speech API key.
"""

import os


def sample_transcribe_with_diarization():
    """Transcribe a multi-speaker audio file and print per-speaker phrases.

    Reads the Speech endpoint and API key from the AZURE_SPEECH_ENDPOINT and
    AZURE_SPEECH_API_KEY environment variables, sends ``assets/audio.wav`` to
    the service with diarization enabled, and prints each phrase tagged with
    the speaker the service identified.
    """
    # [START transcribe_with_diarization]
    from azure.core.credentials import AzureKeyCredential
    from azure.ai.transcription import TranscriptionClient
    from azure.ai.transcription.models import (
        TranscriptionContent,
        TranscriptionOptions,
        TranscriptionDiarizationOptions,
    )

    # Pull the Speech resource endpoint and key from the environment.
    speech_endpoint = os.environ["AZURE_SPEECH_ENDPOINT"]
    speech_key = os.environ["AZURE_SPEECH_API_KEY"]

    # Client used to submit transcription requests to the service.
    client = TranscriptionClient(endpoint=speech_endpoint, credential=AzureKeyCredential(speech_key))

    # Locate the multi-speaker sample audio shipped next to this script.
    import pathlib

    audio_path = pathlib.Path(__file__).parent / "assets" / "audio.wav"

    with open(audio_path, "rb") as audio_stream:
        # max_speakers is a hint for the expected speaker count (2-35).
        diarization = TranscriptionDiarizationOptions(max_speakers=5)

        # Request an en-US transcription with speaker separation enabled.
        transcription_options = TranscriptionOptions(locales=["en-US"], diarization_options=diarization)

        # Bundle the options and the raw audio into a single request payload.
        payload = TranscriptionContent(definition=transcription_options, audio=audio_stream)

        # Submit the audio and wait for the transcription result.
        result = client.transcribe(payload)

        # Show each phrase tagged with the speaker the service identified.
        print("Transcription with speaker diarization:\n")
        if not result.phrases:
            print(f"Full transcription: {result.combined_phrases[0].text}")
        else:
            for phrase in result.phrases:
                speaker = "Unknown" if phrase.speaker is None else phrase.speaker
                print(f"Speaker {speaker} [{phrase.offset_milliseconds}ms]: {phrase.text}")
    # [END transcribe_with_diarization]


if __name__ == "__main__":
    # Run the sample only when executed directly, not when imported.
    sample_transcribe_with_diarization()