File size: 2,477 Bytes
6f744e3 15b48b5 6f744e3 f9ef35e 6f744e3 f9ef35e 6f744e3 15b48b5 6f744e3 a4109dd 6f744e3 a4109dd f6dd816 6f744e3 f6dd816 6f744e3 6a71189 f6dd816 6f744e3 a4109dd 6f744e3 f6dd816 6f744e3 15b48b5 6f744e3 15b48b5 6f744e3 15b48b5 a4109dd 6f744e3 6a71189 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 |
from typing import Dict
from pyannote.audio import Pipeline
import torch
import base64
import numpy as np
import os
# Sample rate (Hz) declared to the pipeline for every request. The payload
# carries no rate of its own, so callers presumably must send audio already
# sampled at this rate -- TODO confirm against the client.
SAMPLE_RATE = 16000


class EndpointHandler:
    """Callable handler that runs pyannote speaker diarization on raw audio.

    The pretrained pipeline is loaded once at construction time and reused
    for every call.
    """

    def __init__(self, path=""):
        """Load the diarization pipeline and move it to GPU when available.

        Args:
            path: Not used by this handler.

        Raises:
            ValueError: If the ``MY_KEY`` environment variable is unset or
                empty.
        """
        # The Hugging Face access token is read from the MY_KEY env variable.
        hf_token = os.getenv("MY_KEY")
        if not hf_token:
            raise ValueError("Hugging Face authentication token (MY_KEY) is missing.")

        self.pipeline = Pipeline.from_pretrained(
            "pyannote/speaker-diarization-3.1", use_auth_token=hf_token
        )
        # Prefer CUDA when present, otherwise stay on CPU.
        self.pipeline.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

    def __call__(self, data: Dict) -> Dict:
        """Diarize one base64-encoded audio payload.

        Args:
            data: Request dictionary with:
                'inputs': Base64-encoded audio bytes. The decoded bytes are
                    interpreted as a single channel of native-endian int16
                    samples.
                'parameters': Accepted but currently ignored.

        Returns:
            ``{"diarization": [...]}`` where each entry has the keys
            ``"label"``, ``"start"`` and ``"stop"`` (all converted with
            ``str``), or ``{"error": <message>}`` when the payload is
            missing/undecodable or the pipeline raises.
        """
        inputs = data.get("inputs")
        if not inputs:
            return {"error": "Missing 'inputs': expected base64-encoded 16-bit PCM audio"}

        # binascii.Error (bad base64) is a ValueError; np.frombuffer raises
        # ValueError when the byte count is not a multiple of 2; a payload
        # that is not str/bytes surfaces as TypeError.
        try:
            samples = np.frombuffer(base64.b64decode(inputs), dtype=np.int16)
        except (ValueError, TypeError):
            return {"error": "Invalid 'inputs': could not decode base64 16-bit PCM audio"}
        if samples.size == 0:
            return {"error": "Invalid 'inputs': decoded audio is empty"}

        # np.frombuffer yields a read-only 1-D view; astype() makes a writable
        # float32 copy so torch.from_numpy does not warn about non-writable
        # memory. unsqueeze(0) adds the leading axis, giving shape (1, N).
        # NOTE(review): samples keep their int16 amplitude range (no division
        # by 32768) -- confirm the pipeline does not expect [-1, 1] input.
        waveform = torch.from_numpy(samples.astype(np.float32)).unsqueeze(0)
        pyannote_input = {"waveform": waveform, "sample_rate": SAMPLE_RATE}

        # Pipeline failures are reported to the caller instead of propagating.
        try:
            diarization = self.pipeline(pyannote_input)  # No num_speakers parameter
        except Exception as e:
            print(f"An unexpected error occurred: {e}")
            return {"error": "Diarization failed unexpectedly"}

        # Build a friendly JSON response
        processed_diarization = [
            {
                "label": str(label),
                "start": str(segment.start),
                "stop": str(segment.end),
            }
            for segment, _, label in diarization.itertracks(yield_label=True)
        ]
        return {"diarization": processed_diarization}
|