KIFF committed
Commit 15b48b5 · verified · 1 Parent(s): f6dd816

Update handler.py

Files changed (1):
  1. handler.py +26 -100
handler.py CHANGED
@@ -1,109 +1,35 @@
-import os
-import requests
-import json
-import base64
-import soundfile as sf
-import numpy as np
-from scipy.signal import resample
-
-# --- Configuration ---
-# Replace with your actual API key/token
-HF_TOKEN = os.environ.get("HF_API_TOKEN")  # Get the token from environment variable
-# Replace with your actual endpoint URL
-STG_API_URL = "https://YOUR_ENDPOINT_URL"
-
-# --- Functions ---
-
-def query_to_hf(filename):
-    """Sends audio file to Hugging Face API using requests."""
-    try:
-        data, sr = sf.read(filename)
-    except sf.LibsndfileError as e:
-        print(f"Error reading audio file: {e}")
-        return None
-
-    # Handle multi-channel audio (convert to mono)
-    if len(data.shape) > 1:
-        data = data.mean(axis=1)  # Average channels to create mono
-
-    data = resample(data, num=int(len(data) * 16000 / sr))
-    data = (data * np.iinfo(np.int16).max).astype(np.int16)
-
-    # Prepare the data payload
-    data_payload = {
-        "inputs": base64.b64encode(data.tobytes()).decode("utf-8")
-        # No parameters needed
-    }
-    json_data = json.dumps(data_payload)
-
-    # Use requests to send the POST request
-    try:
-        response = requests.post(
-            url=STG_API_URL,
-            data=json_data,
-            headers={
-                "Content-Type": "application/json",
-                "Authorization": f"Bearer {HF_TOKEN}"
-            },
-        )
-        response.raise_for_status()
-        return response.json()
-    except requests.exceptions.RequestException as e:
-        print(f"Error during API request: {e}")
-        print(f"Response content: {response.content}")
-        return None
-
-def format_timecode(seconds):
-    """Formats seconds into HH:MM:SS:mmm format."""
-    m, s = divmod(seconds, 60)
-    h, m = divmod(m, 60)
-    return f"{int(h):02}:{int(m):02}:{int(s):02}:{int((s%1)*1000):03}"
-
-def process_and_format_output(output, input_file):
-    """Formats the API response (now a dict) and saves it to a file."""
-    if output is None:
-        print("No output received from API.")
-        return None
-
-    # Check if the output is a dictionary and has the expected key
-    if not isinstance(output, dict) or "diarization" not in output:
-        print(f"Unexpected output format: {output}")
-        return None
-
-    try:
-        formatted_output = []
-        for speaker in output["diarization"]:
-            start_time = format_timecode(float(speaker["start"]))
-            end_time = format_timecode(float(speaker["stop"]))
-            formatted_output.append(f"{speaker['label']} START: {start_time} END: {end_time}")
-
-        base_filename = os.path.splitext(os.path.basename(input_file))[0]
-        output_dir = "TMP_STG"
-        os.makedirs(output_dir, exist_ok=True)
-        output_filename = os.path.join(output_dir, base_filename + "_voicerec-output.txt")
-
-        with open(output_filename, "w", encoding="utf-8") as f:
-            for line in formatted_output:
-                f.write(line + "\n")
-
-        return output_filename
-    except (KeyError, ValueError) as e:
-        print(f"Error processing API output: {e}")
-        return None
-
-# --- Main Script ---
-
-if __name__ == "__main__":
-    # --- Configuration for Standalone Testing ---
-    SAMPLE_AUDIO_FILE = "sample.wav"  # Put your sample audio file in the same directory
-
-    # --- Main Script Logic ---
-    print(f"Sending {SAMPLE_AUDIO_FILE} to Hugging Face API...")
-    api_output = query_to_hf(SAMPLE_AUDIO_FILE)
-
-    if api_output:
-        output_file = process_and_format_output(api_output, SAMPLE_AUDIO_FILE)
-        if output_file:
-            print(f"Output saved to: {output_file}")
-    else:
-        print("API request failed.")
 
+from pyannote.audio import Pipeline, Audio
+import torch
+
+
+class EndpointHandler:
+    def __init__(self, path=""):
+        # initialize pretrained pipeline
+        self._pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization-3.1")
+
+        # send pipeline to GPU if available
+        if torch.cuda.is_available():
+            self._pipeline.to(torch.device("cuda"))
+
+        # initialize audio reader
+        self._io = Audio()
+
+    def __call__(self, data):
+        # accept either {"inputs": ...} or a bare payload
+        inputs = data.pop("inputs", data)
+        waveform, sample_rate = self._io(inputs)
+
+        parameters = data.pop("parameters", dict())
+        diarization = self._pipeline(
+            {"waveform": waveform, "sample_rate": sample_rate}, **parameters
+        )
+
+        # flatten pyannote's Annotation into a JSON-friendly list of turns
+        processed_diarization = [
+            {
+                "speaker": speaker,
+                "start": f"{turn.start:.3f}",
+                "end": f"{turn.end:.3f}",
+            }
+            for turn, _, speaker in diarization.itertracks(yield_label=True)
+        ]
+
+        return {"diarization": processed_diarization}
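
With this change the endpoint runs pyannote's diarization pipeline in-process instead of base64-encoding audio and forwarding it to another endpoint. A minimal local smoke test of the new handler might look like the sketch below; it is not part of the commit, and it assumes pyannote.audio is installed, you have accepted the terms for the gated pyannote/speaker-diarization-3.1 model and are logged in with a Hugging Face token, and a sample.wav file (an illustrative name) sits in the working directory.

# Hypothetical smoke test, not part of the commit.
from handler import EndpointHandler

handler = EndpointHandler()  # downloads the gated pipeline on first run

# pyannote's Audio() accepts a file path, so a plain path works as "inputs"
result = handler({"inputs": "sample.wav"})
for turn in result["diarization"]:
    print(f'{turn["speaker"]} START: {turn["start"]} END: {turn["end"]}')

Note that the response schema also changes: the removed client script read label/start/stop keys, while the new handler returns speaker/start/end, so any downstream consumer of the old format needs updating.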