helvekami committed
Commit b5f86ee · verified · 1 parent: 9c37c06

Update app.py

Files changed (1): app.py (+10 −26)
app.py CHANGED
@@ -15,42 +15,26 @@ def transcribe_and_respond(audio_file):
             torch_dtype=torch.bfloat16
         )
 
-        # Load the audio file, requesting a sample rate of 16000
+        # Load the audio file
         audio, sr = librosa.load(audio_file, sr=16000)
-
-        # Convert the loaded audio to a contiguous float32 array
-        audio = np.ascontiguousarray(audio, dtype=np.float32)
-
-        # If audio has more than one channel, convert to mono by averaging channels
-        if audio.ndim > 1:
-            audio = np.mean(audio, axis=-1)
-
-        # Debug: Print audio properties
+
+        # Print audio properties for debugging
         print(f"Audio dtype: {audio.dtype}, Audio shape: {audio.shape}, Sample rate: {sr}")
-
-        # Although we requested 16000 Hz, double-check the sample rate.
-        # If not 16000, force conversion:
-        if sr != 16000:
-            # Ensure the audio is float32 before resampling
-            audio = audio.astype(np.float32)
-            audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
-            sr = 16000
 
-        # Set up the transcription prompt to get exact transcription
         turns = [
-            {'role': 'system', 'content': 'Please transcribe the following audio exactly.'},
+            {'role': 'system', 'content': 'repeat the previous text exactly with no changes'},
             {'role': 'user', 'content': '<|audio|>'}
         ]
-
+
         # Debug: Print the initial turns
         print(f"Initial turns: {turns}")
-
+
         # Call the model with the audio and prompt
         output = pipe({'audio': audio, 'turns': turns, 'sampling_rate': sr}, max_new_tokens=512)
-
+
         # Debug: Print the final output from the model
         print(f"Model output: {output}")
-
+
         return output
 
     except Exception as e:
@@ -61,9 +45,9 @@ iface = gr.Interface(
     inputs=gr.Audio(sources="microphone", type="filepath"),
     outputs="text",
     title="Live Transcription and Response",
-    description="Speak into your microphone, and the model will transcribe your speech.",
+    description="Speak into your microphone, and the model will respond naturally and informatively.",
     live=True
 )
 
 if __name__ == "__main__":
-    iface.launch()
+    iface.launch()
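Note: most of the preprocessing this commit deletes was redundant. librosa.load with sr=16000 already resamples to the requested rate, downmixes multi-channel input to mono (mono=True is the default), and returns float32 samples, so the manual float32 cast, channel averaging, and second resampling pass repeated work the loader had already done. A minimal sanity check of that behavior, using a hypothetical stereo input file sample.wav:

import numpy as np
import librosa

# librosa.load resamples to the requested rate, downmixes to mono by default,
# and returns float32 samples, covering everything the removed block did by hand.
audio, sr = librosa.load("sample.wav", sr=16000)  # "sample.wav" is a stand-in path
assert sr == 16000
assert audio.dtype == np.float32
assert audio.ndim == 1  # mono, even if the source file is stereo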
 
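For context, the first hunk header elides the top of app.py, where the pipeline that owns the torch_dtype=torch.bfloat16 context line is constructed. A minimal sketch of what that setup plausibly looks like; the checkpoint id below is an assumption for illustration, since the real model name is not visible in this diff:

import torch
import transformers

# Hypothetical checkpoint: any speech-capable pipeline that accepts the
# {'audio', 'turns', 'sampling_rate'} input dict and the <|audio|> placeholder
# seen in the diff (an Ultravox-style model, for example) would fit here.
pipe = transformers.pipeline(
    model="fixie-ai/ultravox-v0_4",  # assumed; not shown in this diff
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,      # matches the context line kept by this commit
)

On the Gradio side, live=True makes the interface call transcribe_and_respond automatically whenever the microphone recording changes, rather than waiting for a submit click.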