# coding=utf-8
# --------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
"""
FILE: sample_transcribe_with_diarization.py
DESCRIPTION:
This sample demonstrates how to transcribe an audio file with speaker diarization
(speaker separation) using the Azure AI Transcription client. This identifies
different speakers in the audio.
USAGE:
python sample_transcribe_with_diarization.py
Set the environment variables with your own values before running the sample:
1) AZURE_SPEECH_ENDPOINT - the endpoint to your Speech resource.
2) AZURE_SPEECH_API_KEY - your Speech API key.
"""
import os
def sample_transcribe_with_diarization():
    """Transcribe a multi-speaker audio file and print each phrase with its speaker label.

    Reads the service endpoint and API key from the AZURE_SPEECH_ENDPOINT and
    AZURE_SPEECH_API_KEY environment variables, submits the bundled sample audio
    with diarization enabled, and prints one line per recognized phrase.
    """
    # [START transcribe_with_diarization]
    import pathlib

    from azure.core.credentials import AzureKeyCredential
    from azure.ai.transcription import TranscriptionClient
    from azure.ai.transcription.models import (
        TranscriptionContent,
        TranscriptionOptions,
        TranscriptionDiarizationOptions,
    )

    # Pull connection settings from the environment (raises KeyError if unset).
    speech_endpoint = os.environ["AZURE_SPEECH_ENDPOINT"]
    speech_key = os.environ["AZURE_SPEECH_API_KEY"]

    # Client used to submit transcription requests to the Speech resource.
    client = TranscriptionClient(
        endpoint=speech_endpoint, credential=AzureKeyCredential(speech_key)
    )

    # The sample audio file (expected to contain multiple speakers) lives next
    # to this script under ./assets.
    audio_path = pathlib.Path(__file__).parent / "assets" / "audio.wav"

    with open(audio_path, "rb") as audio_file:
        # Diarization settings: max_speakers is a hint (valid range 2-35).
        diarization = TranscriptionDiarizationOptions(max_speakers=5)

        # Request English (US) transcription with speaker separation enabled.
        transcription_options = TranscriptionOptions(
            locales=["en-US"], diarization_options=diarization
        )

        # Bundle the options and the audio stream into a single request payload.
        content = TranscriptionContent(definition=transcription_options, audio=audio_file)

        # Submit the audio and wait for the transcription result.
        result = client.transcribe(content)

        # Report each phrase with its speaker label and start offset; fall back
        # to the combined text when no per-phrase data is available.
        print("Transcription with speaker diarization:\n")
        if not result.phrases:
            print(f"Full transcription: {result.combined_phrases[0].text}")
        else:
            for phrase in result.phrases:
                speaker = "Unknown" if phrase.speaker is None else phrase.speaker
                print(f"Speaker {speaker} [{phrase.offset_milliseconds}ms]: {phrase.text}")
    # [END transcribe_with_diarization]
# Run the sample only when this file is executed directly as a script.
if __name__ == "__main__":
    sample_transcribe_with_diarization()