File size: 2,477 Bytes
6f744e3
 
15b48b5
6f744e3
 
 
f9ef35e
6f744e3
f9ef35e
6f744e3
15b48b5
6f744e3
 
 
a4109dd
 
6f744e3
 
 
a4109dd
f6dd816
6f744e3
 
f6dd816
6f744e3
 
 
 
 
 
 
 
 
 
6a71189
f6dd816
6f744e3
 
 
a4109dd
6f744e3
 
 
 
 
 
 
 
f6dd816
6f744e3
 
 
 
 
 
 
 
 
 
15b48b5
 
6f744e3
 
 
15b48b5
6f744e3
15b48b5
a4109dd
6f744e3
6a71189
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
from typing import Dict
from pyannote.audio import Pipeline
import torch
import base64
import numpy as np
import os

SAMPLE_RATE = 16000  # Expected input rate (Hz); the payload carries no rate metadata.

class EndpointHandler():
    """Inference-endpoint wrapper around the pyannote speaker-diarization pipeline.

    Expects base64-encoded 16 kHz mono 16-bit PCM audio in ``data['inputs']``
    and returns speaker segments as JSON-serializable strings.
    """

    def __init__(self, path=""):
        # Retrieve the Hugging Face authentication token from the environment variable.
        hf_token = os.getenv("MY_KEY")
        if not hf_token:
            raise ValueError("Hugging Face authentication token (MY_KEY) is missing.")

        # Initialize the pipeline with the authentication token.
        self.pipeline = Pipeline.from_pretrained(
            "pyannote/speaker-diarization-3.1", use_auth_token=hf_token
        )

        # Move the pipeline to GPU when available, otherwise CPU.
        self.pipeline.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

    def __call__(self, data: Dict) -> Dict:
        """Run speaker diarization on one request.

        Args:
            data (Dict):
                'inputs': Base64-encoded 16-bit PCM audio bytes
                'parameters': Additional diarization parameters (currently unused)
        Return:
            Dict: {'diarization': [{'label', 'start', 'stop'}, ...]} on success,
            or {'error': <message>} on failure (matching the existing error style).
        """
        inputs = data.get("inputs")
        if not inputs:
            # Fail as a JSON error instead of crashing in b64decode on None/"".
            return {"error": "Missing 'inputs' field (base64-encoded audio expected)."}
        parameters = data.get("parameters", {})  # Currently not using them

        # Decode the base64 audio data into 16-bit PCM samples.
        try:
            audio_data = base64.b64decode(inputs)
        except (ValueError, TypeError):
            return {"error": "Invalid base64 audio data."}
        audio_nparray = np.frombuffer(audio_data, dtype=np.int16)
        if audio_nparray.size == 0:
            return {"error": "Empty audio payload."}

        # NOTE(review): np.frombuffer always yields a 1-D array, so the former
        # `ndim > 1` mono-mixdown branch was unreachable. Interleaved stereo
        # would need a reshape before averaging; mono input is assumed here —
        # TODO confirm with the caller.

        # Normalize int16 PCM to float32 in [-1, 1]: pyannote expects a
        # floating-point waveform, and raw int16 magnitudes (up to ±32767)
        # are far outside the model's expected range. The astype() copy also
        # avoids torch's warning about the read-only frombuffer array.
        waveform = torch.from_numpy(
            audio_nparray.astype(np.float32) / 32768.0
        ).unsqueeze(0)  # shape: (channel=1, num_samples)

        pyannote_input = {"waveform": waveform, "sample_rate": SAMPLE_RATE}

        # Run diarization pipeline
        try:
            diarization = self.pipeline(pyannote_input)  # No num_speakers parameter
        except Exception as e:
            print(f"An unexpected error occurred: {e}")
            return {"error": "Diarization failed unexpectedly"}

        # Build a friendly JSON response
        processed_diarization = [
            {
                "label": str(label),
                "start": str(segment.start),
                "stop": str(segment.end),
            }
            for segment, _, label in diarization.itertracks(yield_label=True)
        ]
        return {"diarization": processed_diarization}