KIFF committed
Commit 15b48b5 · verified · 1 Parent(s): f6dd816

Update handler.py

Files changed (1):
  1. handler.py +26 -100
handler.py CHANGED
@@ -1,109 +1,35 @@
-import os
-import requests
-import json
-import base64
-import soundfile as sf
-import numpy as np
-from scipy.signal import resample
-
-# --- Configuration ---
-# Replace with your actual API key/token
-HF_TOKEN = os.environ.get("HF_API_TOKEN")  # Get the token from environment variable
-# Replace with your actual endpoint URL
-STG_API_URL = "https://YOUR_ENDPOINT_URL"
-
-# --- Functions ---
-
-def query_to_hf(filename):
-    """Sends audio file to Hugging Face API using requests."""
-    try:
-        data, sr = sf.read(filename)
-    except sf.LibsndfileError as e:
-        print(f"Error reading audio file: {e}")
-        return None
-
-    # Handle multi-channel audio (convert to mono)
-    if len(data.shape) > 1:
-        data = data.mean(axis=1)  # Average channels to create mono
-
-    data = resample(data, num=int(len(data) * 16000 / sr))
-    data = (data * np.iinfo(np.int16).max).astype(np.int16)
-
-    # Prepare the data payload
-    data_payload = {
-        "inputs": base64.b64encode(data.tobytes()).decode("utf-8")
-        # No parameters needed
-    }
-    json_data = json.dumps(data_payload)
-
-    # Use requests to send the POST request
-    try:
-        response = requests.post(
-            url=STG_API_URL,
-            data=json_data,
-            headers={
-                "Content-Type": "application/json",
-                "Authorization": f"Bearer {HF_TOKEN}"
-            },
-        )
-        response.raise_for_status()
-        return response.json()
-    except requests.exceptions.RequestException as e:
-        print(f"Error during API request: {e}")
-        print(f"Response content: {response.content}")
-        return None
-
-def format_timecode(seconds):
-    """Formats seconds into HH:MM:SS:mmm format."""
-    m, s = divmod(seconds, 60)
-    h, m = divmod(m, 60)
-    return f"{int(h):02}:{int(m):02}:{int(s):02}:{int((s%1)*1000):03}"
-
-def process_and_format_output(output, input_file):
-    """Formats the API response (now a dict) and saves it to a file."""
-    if output is None:
-        print("No output received from API.")
-        return None
-
-    # Check if the output is a dictionary and has the expected key
-    if not isinstance(output, dict) or "diarization" not in output:
-        print(f"Unexpected output format: {output}")
-        return None
-
-    try:
-        formatted_output = []
-        for speaker in output["diarization"]:
-            start_time = format_timecode(float(speaker["start"]))
-            end_time = format_timecode(float(speaker["stop"]))
-            formatted_output.append(f"{speaker['label']} START: {start_time} END: {end_time}")
-
-        base_filename = os.path.splitext(os.path.basename(input_file))[0]
-        output_dir = "TMP_STG"
-        os.makedirs(output_dir, exist_ok=True)
-        output_filename = os.path.join(output_dir, base_filename + "_voicerec-output.txt")
-
-        with open(output_filename, "w", encoding="utf-8") as f:
-            for line in formatted_output:
-                f.write(line + "\n")
-
-        return output_filename
-    except (KeyError, ValueError) as e:
-        print(f"Error processing API output: {e}")
-        return None
-
-# --- Main Script ---
-
-if __name__ == "__main__":
-    # --- Configuration for Standalone Testing ---
-    SAMPLE_AUDIO_FILE = "sample.wav"  # Put your sample audio file in the same directory
-
-    # --- Main Script Logic ---
-    print(f"Sending {SAMPLE_AUDIO_FILE} to Hugging Face API...")
-    api_output = query_to_hf(SAMPLE_AUDIO_FILE)
-
-    if api_output:
-        output_file = process_and_format_output(api_output, SAMPLE_AUDIO_FILE)
-        if output_file:
-            print(f"Output saved to: {output_file}")
-    else:
-        print("API request failed.")
 
+from pyannote.audio import Pipeline, Audio
+import torch
+
+
+class EndpointHandler:
+    def __init__(self, path=""):
+        # initialize pretrained pipeline
+        self._pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization-3.1")
+
+        # send pipeline to GPU if available
+        if torch.cuda.is_available():
+            self._pipeline.to(torch.device("cuda"))
+
+        # initialize audio reader
+        self._io = Audio()
+
+    def __call__(self, data):
+        # accept either {"inputs": ...} or a bare payload
+        inputs = data.pop("inputs", data)
+        waveform, sample_rate = self._io(inputs)
+
+        parameters = data.pop("parameters", dict())
+        diarization = self._pipeline(
+            {"waveform": waveform, "sample_rate": sample_rate}, **parameters
+        )
+
+        # flatten pyannote's Annotation into a JSON-friendly list of turns
+        processed_diarization = [
+            {
+                "speaker": speaker,
+                "start": f"{turn.start:.3f}",
+                "end": f"{turn.end:.3f}",
+            }
+            for turn, _, speaker in diarization.itertracks(yield_label=True)
+        ]
+
+        return {"diarization": processed_diarization}
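
With this change the endpoint runs pyannote's diarization pipeline in-process instead of base64-encoding audio and forwarding it to another endpoint. A minimal local smoke test of the new handler might look like the sketch below; it is not part of the commit, and it assumes pyannote.audio is installed, you have accepted the terms for the gated pyannote/speaker-diarization-3.1 model and are logged in with a Hugging Face token, and a sample.wav file (an illustrative name) sits in the working directory.

# Hypothetical smoke test, not part of the commit.
from handler import EndpointHandler

handler = EndpointHandler()  # downloads the gated pipeline on first run

# pyannote's Audio() accepts a file path, so a plain path works as "inputs"
result = handler({"inputs": "sample.wav"})
for turn in result["diarization"]:
    print(f'{turn["speaker"]} START: {turn["start"]} END: {turn["end"]}')

Note that the response schema also changes: the removed client script read label/start/stop keys, while the new handler returns speaker/start/end, so any downstream consumer of the old format needs updating.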