"""Audio"""
import os
import wave
from io import BytesIO
import numpy as np
import torch
import speech_recognition as sr
import pyttsx3
from gtts import gTTS
from pydub import AudioSegment
from pydub.playback import play
from transformers import pipeline
from st_audiorec import st_audiorec # does not have audio processing
# If having trouble with ffmpeg, setting these may help
# AudioSegment.converter = "C:\\ffmpeg\\ffmpeg\\bin\\ffmpeg.exe"
# AudioSegment.ffmpeg = "C:\\ffmpeg\\ffmpeg\\bin\\ffmpeg.exe"
# AudioSegment.ffprobe = "C:\\ffmpeg\\ffmpeg\\bin\\ffprobe.exe"
class Audio:
    """Text-to-speech and speech-to-text helper.

    TTS: online gTTS voice with an offline pyttsx3 fallback.
    STT: Google Web Speech API (via SpeechRecognition) or a local
    Whisper model through the HuggingFace ``transformers`` pipeline.
    """

    def __init__(self) -> None:
        """Initialize the recognizer; the microphone stays disabled
        until ``initialize_microphone`` is called."""
        self.recognizer = sr.Recognizer()
        self.microphone = None
        # Disable mic by default
        self.mic_enabled = False

    def initialize_microphone(self, device_index):
        """Initialize microphone object with appropriate device.

        device_index: int index of the microphone, as listed by
        ``sr.Microphone.list_microphone_names()``.
        """
        self.microphone = sr.Microphone(device_index)
        self.mic_enabled = True

    def communicate(self, phrase="You forgot to pass the text"):
        """Speak ``phrase`` aloud.

        Tries the online gTTS voice first (better quality); on
        file-related errors falls back to the offline, more robotic
        pyttsx3 engine. Could be sped up by doing a sentence at a time.

        phrase: the string to convert to speech
        Raises ValueError on unexpected errors.
        """
        temp_file = "temp.mp3"
        try:  # online
            gTTS(phrase).save(temp_file)
            play(AudioSegment.from_mp3(temp_file))
        except (IOError, OSError) as e:  # offline
            # Handle specific file-related exceptions
            print(f"Error handling audio file: {e}")
            # Option without temporary mp3 but it's more robotic
            engine = pyttsx3.init()
            engine.say(phrase)
            engine.runAndWait()
        except Exception as e:
            # Catch other unexpected exceptions
            raise ValueError(f"Unexpected error: {e}") from e
        finally:
            # BUG FIX: previously the temp file was only removed on the
            # success path, so a playback failure leaked temp.mp3.
            if os.path.exists(temp_file):
                os.remove(temp_file)

    def recognize_speech_from_mic(self):
        """Transcribe speech from the configured microphone.

        Returns a dictionary with the following keys:
            "success": bool, whether the request succeeded
            "error": None if successful, else an error-message string
            "transcription": the transcribed text, or None if speech
                was unrecognizable
        Raises ValueError if the microphone was never initialized.
        """
        if self.microphone is None:
            # BUG FIX: fail fast with a clear message instead of an
            # opaque AttributeError from the `with` statement below.
            raise ValueError(
                "Microphone not initialized; call initialize_microphone() first."
            )
        # Adjust the recognizer sensitivity for ambient noise and listen
        with self.microphone as source:
            self.recognizer.adjust_for_ambient_noise(source)
            audio = self.recognizer.listen(source)
        # Initialize response object
        response = {"success": True, "error": None, "transcription": None}
        # Try to recognize the speech and handle exceptions accordingly
        try:
            response["transcription"] = self.recognizer.recognize_google(audio)
        except sr.RequestError:
            # API was unreachable or unresponsive
            response["success"] = False
            response["error"] = "API unavailable"
        except sr.UnknownValueError:
            # Speech was unintelligible
            response["success"] = False
            response["error"] = "Unable to recognize speech"
        return response

    def get_streamlit_audio(self):
        """Capture audio from the browser via the streamlit component.

        https://github.com/stefanrmmr/streamlit-audio-recorder
        Returns the captured audio converted to gradio format.
        Raises ValueError if the browser capture fails.
        """
        try:
            audio_wave_bytes = st_audiorec()
        except Exception as e:
            raise ValueError("Unable to capture audio from browser") from e
        return self.convert_streamlit_audio_to_gradio_format(audio_wave_bytes)

    def convert_streamlit_audio_to_gradio_format(self, audio_wave_bytes):
        """Convert WAV bytes to the gradio audio tuple format.

        audio_wave_bytes: complete WAV file contents as bytes
        Returns (sampling_rate, raw_audio_data) where raw_audio_data is
        an int16 ndarray, reshaped to (frames, channels) when stereo.
        Raises ValueError if no bytes were provided.
        """
        if not audio_wave_bytes:
            raise ValueError("No audio wave bytes received.")
        with wave.open(BytesIO(audio_wave_bytes), "rb") as wf:
            params = wf.getparams()
            sampling_rate = params.framerate
            num_channels = params.nchannels
            num_frames = params.nframes
            # assumes 16-bit PCM samples (sampwidth == 2) — TODO confirm
            raw_audio_data = np.frombuffer(wf.readframes(num_frames), dtype=np.int16)
        if num_channels > 1:
            raw_audio_data = raw_audio_data.reshape(-1, num_channels)
        return (sampling_rate, raw_audio_data)

    def transcribe_from_transformer(
        self, audio, model_name_and_version="openai/whisper-base.en"
    ):
        """Convert audio data to text using a transformers ASR pipeline.

        audio: (sampling_rate, raw_audio_data) tuple, gradio-style
        model_name_and_version: HuggingFace model identifier
        Returns the transcribed text.
        Raises TypeError if ``audio`` is not an unpackable pair.
        """
        device = "cuda" if torch.cuda.is_available() else "cpu"
        transcriber = pipeline(
            task="automatic-speech-recognition",
            model=model_name_and_version,
            device=device,
        )
        try:
            sampling_rate, raw_audio_data = audio
        except TypeError as e:
            raise TypeError("No audio data received. Please speak louder.") from e
        # Convert to mono if stereo
        if raw_audio_data.ndim > 1:
            raw_audio_data = raw_audio_data.mean(axis=1)
        raw_audio_data = raw_audio_data.astype(np.float32)
        # BUG FIX: normalizing silent input divided 0/0 and produced
        # NaNs (plus a runtime warning); only scale when there is signal.
        peak = np.max(np.abs(raw_audio_data))
        if peak > 0:
            raw_audio_data /= peak
        prompt = transcriber({"sampling_rate": sampling_rate, "raw": raw_audio_data})[
            "text"
        ]
        return prompt

    def get_prompt_from_gradio_audio(self, audio):
        """
        Converts audio captured from gradio to text.

        See https://www.gradio.app/guides/real-time-speech-recognition
        for more info.
        audio: object containing sampling frequency and raw audio data

        Delegates to ``transcribe_from_transformer``, which implements
        the identical normalization + Whisper pipeline that was
        previously duplicated here.
        """
        return self.transcribe_from_transformer(audio)

    def get_prompt_from_file(self, file):
        """Get prompt text from an audio file.

        file: path (or file-like object) readable by sr.AudioFile
        Returns the transcribed text.
        Raises IOError if the file cannot be read.
        """
        try:
            speech = sr.AudioFile(file)
        except Exception as e:
            raise IOError(f"Unable to read the audio file: {e}") from e
        with speech as source:
            recorded = self.recognizer.record(source)
        return self.recognizer.recognize_google(recorded)
if __name__ == "__main__":
    # Demo: list microphones, pick one, speak a phrase, then listen.
    recognized_mics = {}
    test_audio = Audio()
    for i, mic in enumerate(sr.Microphone.list_microphone_names()):
        print(f"{i}: {mic}")
        recognized_mics[mic] = i
    print(recognized_mics)
    # BUG FIX: indexing recognized_mics['Built-in Microphone'] raised
    # KeyError on any system lacking that exact device name; fall back
    # to the default device (index 0) when it is absent.
    built_in_idx = recognized_mics.get('Built-in Microphone', 0)
    test_audio.initialize_microphone(built_in_idx)
    test_audio.communicate("Hello class.")
    print(test_audio.recognize_speech_from_mic())