KIFF
/

pyannote-speaker-diarization-endpoint

@@ -1,65 +1,109 @@
-from typing import Dict
-from pyannote.audio import Pipeline
-import torch
 import base64
 import numpy as np
-SAMPLE_RATE = 16000
-class EndpointHandler():
-    def __init__(self, path=""):
-        # Initialize the pipeline (no authentication needed for public models)
-        self.pipeline = Pipeline.from_pretrained(
-            "pyannote/speaker-diarization-3.1"
         )
-        # Move the pipeline to the appropriate device (CPU or GPU)
-        self.pipeline.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
-        # Instantiate the pipeline with its parameters
-        self.pipeline = self.pipeline.instantiate(self.pipeline.parameters)
-    def __call__(self, data: Dict) -> Dict:
-        """
-        Args:
-            data (Dict):
-                'inputs': Base64-encoded audio bytes
-                'parameters': Additional diarization parameters (currently unused)
-        Return:
-            Dict: Speaker diarization results
-        """
-        inputs = data.get("inputs")
-        parameters = data.get("parameters", {})  # We are not using them now
-        # Decode the base64 audio data
-        audio_data = base64.b64decode(inputs)
-        audio_nparray = np.frombuffer(audio_data, dtype=np.int16)
-        # Handle multi-channel audio (convert to mono)
-        if audio_nparray.ndim > 1:
-            audio_nparray = audio_nparray.mean(axis=0)  # Average channels to create mono
-        # Convert to PyTorch tensor
-        audio_tensor = torch.from_numpy(audio_nparray).float().unsqueeze(0)
-        if audio_tensor.dim() == 1:
-            audio_tensor = audio_tensor.unsqueeze(0)
-        pyannote_input = {"waveform": audio_tensor, "sample_rate": SAMPLE_RATE}
-        # Run diarization pipeline
-        try:
-            diarization = self.pipeline(pyannote_input)  # No num_speakers parameter
-        except Exception as e:
-            print(f"An unexpected error occurred: {e}")
-            return {"error": "Diarization failed unexpectedly"}
-        # Build a friendly JSON response
-        processed_diarization = [
-            {
-                "label": str(label),
-                "start": str(segment.start),
-                "stop": str(segment.end),
-            }
-            for segment, _, label in diarization.itertracks(yield_label=True)
-        ]
-        return {"diarization": processed_diarization}

+import os
+import requests
+import json
 import base64
+import soundfile as sf
 import numpy as np
+from scipy.signal import resample
+# --- Configuration ---
+# API token, read from the HF_API_TOKEN environment variable.
+# os.environ.get returns None when the variable is not set, so HF_TOKEN may be None.
+HF_TOKEN = os.environ.get("HF_API_TOKEN")  # Get the token from environment variable
+# Placeholder value: replace with your actual endpoint URL
+STG_API_URL = "https://YOUR_ENDPOINT_URL"
+# --- Functions ---
+def query_to_hf(filename):
+    """Send an audio file to the Hugging Face API and return its JSON reply.
+
+    The file is read with soundfile, averaged down to mono when it has more
+    than one channel, resampled to 16 kHz, converted to 16-bit integer
+    samples and posted as a base64 string under the "inputs" key. The
+    request targets STG_API_URL and carries HF_TOKEN as a bearer token.
+
+    Args:
+        filename: Path of the audio file to send.
+
+    Returns:
+        The decoded JSON response, or None when the file cannot be read,
+        contains no samples, or the request fails.
+    """
+    try:
+        data, sr = sf.read(filename)
+    except sf.LibsndfileError as e:
+        print(f"Error reading audio file: {e}")
+        return None
+    # Nothing to resample or send for an empty file
+    if len(data) == 0:
+        print(f"Error reading audio file: {filename} contains no samples")
+        return None
+    # Handle multi-channel audio (convert to mono)
+    if len(data.shape) > 1:
+        data = data.mean(axis=1)  # Average channels to create mono
+    data = resample(data, num=int(len(data) * 16000 / sr))
+    # Resampling can overshoot [-1, 1]; clip so the int16 cast cannot wrap around
+    data = np.clip(data, -1.0, 1.0)
+    data = (data * np.iinfo(np.int16).max).astype(np.int16)
+    # Prepare the data payload
+    data_payload = {
+        "inputs": base64.b64encode(data.tobytes()).decode("utf-8")
+        # No parameters needed
+    }
+    json_data = json.dumps(data_payload)
+    # Bound before the try so the except block can tell whether a reply arrived
+    response = None
+    # Use requests to send the POST request
+    try:
+        response = requests.post(
+            url=STG_API_URL,
+            data=json_data,
+            headers={
+                "Content-Type": "application/json",
+                "Authorization": f"Bearer {HF_TOKEN}"
+            },
         )
+        response.raise_for_status()
+        return response.json()
+    except requests.exceptions.RequestException as e:
+        print(f"Error during API request: {e}")
+        # No response object exists when the request itself failed (e.g. connection error)
+        if response is not None:
+            print(f"Response content: {response.content}")
+        return None
+def format_timecode(seconds):
+    """Formats seconds into HH:MM:SS:mmm format.
+
+    The value is rounded to the nearest millisecond first and then split
+    with integer arithmetic, so floating-point error cannot drop a
+    millisecond (1.001 is rendered as 00:00:01:001) and a rounded-up
+    fraction carries into the seconds field. Negative values are not
+    handled specially.
+
+    Args:
+        seconds: Time in seconds (int or float).
+
+    Returns:
+        The timecode string, e.g. "01:01:01:500" for 3661.5.
+    """
+    total_ms = int(round(seconds * 1000))
+    total_s, ms = divmod(total_ms, 1000)
+    m, s = divmod(total_s, 60)
+    h, m = divmod(m, 60)
+    return f"{h:02}:{m:02}:{s:02}:{ms:03}"
+def process_and_format_output(output, input_file, output_dir="TMP_STG"):
+    """Formats the API response (a dict) and saves it to a text file.
+
+    Each entry of output["diarization"] becomes one line of the form
+    "<label> START: <timecode> END: <timecode>", with timecodes produced by
+    format_timecode. The file is named after input_file's base name with the
+    suffix "_voicerec-output.txt" and is written inside output_dir, which is
+    created when missing.
+
+    Args:
+        output: Parsed API response; must be a dict with a "diarization"
+            key whose entries provide "label", "start" and "stop".
+        input_file: Path of the audio file the response belongs to.
+        output_dir: Directory that receives the text file.
+
+    Returns:
+        The path of the written file, or None when output is missing or
+        malformed.
+    """
+    if output is None:
+        print("No output received from API.")
+        return None
+    # Check if the output is a dictionary and has the expected key
+    if not isinstance(output, dict) or "diarization" not in output:
+        print(f"Unexpected output format: {output}")
+        return None
+    try:
+        formatted_output = []
+        for speaker in output["diarization"]:
+            start_time = format_timecode(float(speaker["start"]))
+            end_time = format_timecode(float(speaker["stop"]))
+            formatted_output.append(f"{speaker['label']} START: {start_time} END: {end_time}")
+        base_filename = os.path.splitext(os.path.basename(input_file))[0]
+        os.makedirs(output_dir, exist_ok=True)
+        output_filename = os.path.join(output_dir, base_filename + "_voicerec-output.txt")
+        with open(output_filename, "w", encoding="utf-8") as f:
+            f.writelines(line + "\n" for line in formatted_output)
+        return output_filename
+    except (KeyError, TypeError, ValueError) as e:
+        # TypeError covers entries that are not dicts or hold values float() rejects, such as None
+        print(f"Error processing API output: {e}")
+        return None
+# --- Main Script ---
+if __name__ == "__main__":
+    # --- Configuration for Standalone Testing ---
+    SAMPLE_AUDIO_FILE = "sample.wav"  # Put your sample audio file in the same directory
+    # --- Main Script Logic ---
+    print(f"Sending {SAMPLE_AUDIO_FILE} to Hugging Face API...")
+    api_output = query_to_hf(SAMPLE_AUDIO_FILE)
+    # query_to_hf signals failure with None; an empty-but-valid JSON reply is
+    # still passed on so it is reported as an unexpected format, not a failed request
+    if api_output is not None:
+        output_file = process_and_format_output(api_output, SAMPLE_AUDIO_FILE)
+        if output_file:
+            print(f"Output saved to: {output_file}")
+    else:
+        print("API request failed.")