voldemortuk's picture
Upload 113 files
5cc1949 verified
raw
history blame contribute delete
7.49 kB
"""Audio"""
import os
import tempfile
import wave
from io import BytesIO

import numpy as np
import pyttsx3
import speech_recognition as sr
import torch
from gtts import gTTS, gTTSError
from pydub import AudioSegment
from pydub.playback import play
from st_audiorec import st_audiorec  # does not have audio processing
from transformers import pipeline
# If having trouble with ffmpeg, setting these may help
# AudioSegment.converter = "C:\\ffmpeg\\ffmpeg\\bin\\ffmpeg.exe"
# AudioSegment.ffmpeg = "C:\\ffmpeg\\ffmpeg\\bin\\ffmpeg.exe"
# AudioSegment.ffprobe = "C:\\ffmpeg\\ffmpeg\\bin\\ffprobe.exe"
class Audio:
    """Speech helper: speaks text aloud and converts recorded speech to text.

    Speech output uses gTTS with a pyttsx3 fallback. Speech input is handled
    either by a ``speech_recognition`` recognizer (microphone or audio file)
    or by a ``transformers`` automatic-speech-recognition pipeline operating
    on ``(sampling_rate, raw_audio_data)`` tuples.
    """

    def __init__(self) -> None:
        """Create the recognizer; the microphone stays disabled until initialized."""
        self.recognizer = sr.Recognizer()
        self.microphone = None
        # Disable mic by default
        self.mic_enabled = False
        # Loaded ASR pipelines keyed by model name, so each model is only
        # loaded once per instance instead of on every transcription call.
        self._transcribers = {}

    def initialize_microphone(self, device_index=None):
        """Create the microphone object for the given device and enable it.

        device_index: int index of the microphone to use; None is passed
            through to ``sr.Microphone`` unchanged.
        """
        self.microphone = sr.Microphone(device_index)
        self.mic_enabled = True

    def communicate(self, phrase="You forgot to pass the text"):
        """Speak a phrase aloud.

        The phrase is synthesized with gTTS into a temporary mp3 file which is
        then played. If synthesis or file handling fails (gTTS request error or
        an OS-level error), the phrase is spoken with pyttsx3 instead.
        Could be sped up by doing a sentence at a time.

        phrase: the string to convert to speech

        Raises ValueError for any other unexpected failure on the gTTS path.
        """
        temp_path = None
        try:  # online
            # A unique temp file avoids clobbering a "temp.mp3" in the working
            # directory and collisions between concurrent calls.
            file_descriptor, temp_path = tempfile.mkstemp(suffix=".mp3")
            os.close(file_descriptor)
            gTTS(phrase).save(temp_path)
            play(AudioSegment.from_mp3(temp_path))
        except (gTTSError, OSError) as e:  # offline
            # gTTS reports request failures as gTTSError, which is not an
            # OSError, so it must be listed explicitly for the fallback to run.
            print(f"Error handling audio file: {e}")
            # Option without temporary mp3 but it's more robotic
            engine = pyttsx3.init()
            engine.say(phrase)
            engine.runAndWait()
        except Exception as e:
            # Catch other unexpected exceptions
            raise ValueError(f"Unexpected error: {e}") from e
        finally:
            # Always clean up the temporary file, including on failure paths.
            if temp_path is not None:
                try:
                    os.remove(temp_path)
                except OSError:
                    # Best-effort cleanup: the file may already be gone.
                    pass

    def recognize_speech_from_mic(self):
        """Transcribe speech captured from the initialized microphone.

        Returns a dictionary with the following keys:
        "success": A boolean indicating whether or not the request was successful
        "error": None if successful, otherwise a string containing an error message
        "transcription": A string containing the transcribed text, or None if
            no transcription was produced
        """
        # Initialize response object
        response = {"success": True, "error": None, "transcription": None}

        # Report a missing microphone through the response instead of crashing
        # on ``with None``.
        if self.microphone is None:
            response["success"] = False
            response["error"] = "Microphone not initialized"
            return response

        # Adjust the recognizer sensitivity for ambient noise and listen to the microphone
        with self.microphone as source:
            self.recognizer.adjust_for_ambient_noise(source)
            audio = self.recognizer.listen(source)

        # Try to recognize the speech and handle exceptions accordingly
        try:
            response["transcription"] = self.recognizer.recognize_google(audio)
        except sr.RequestError:
            # API was unreachable or unresponsive
            response["success"] = False
            response["error"] = "API unavailable"
        except sr.UnknownValueError:
            # Speech was unintelligible
            response["success"] = False
            response["error"] = "Unable to recognize speech"
        return response

    def get_streamlit_audio(self):
        """Capture audio with the streamlit recorder component.

        https://github.com/stefanrmmr/streamlit-audio-recorder

        Returns the recording as a ``(sampling_rate, raw_audio_data)`` tuple.
        Raises ValueError if the component fails or returns no audio bytes.
        """
        try:
            audio_wave_bytes = st_audiorec()
        except Exception as e:
            raise ValueError("Unable to capture audio from browser") from e
        return self.convert_streamlit_audio_to_gradio_format(audio_wave_bytes)

    def convert_streamlit_audio_to_gradio_format(self, audio_wave_bytes):
        """Decode WAV bytes into a ``(sampling_rate, raw_audio_data)`` tuple.

        audio_wave_bytes: bytes of a complete WAV file

        ``raw_audio_data`` is a 1-D integer array for mono input and a
        ``(frames, channels)`` array for multi-channel input.

        Raises ValueError if no bytes were received or the sample width is
        not 16 or 32 bits.
        """
        if not audio_wave_bytes:
            raise ValueError("No audio wave bytes received.")

        # Sample width in bytes -> dtype, so the samples are decoded with the
        # width declared in the WAV header rather than an assumed 16 bits.
        sample_dtypes = {2: np.int16, 4: np.int32}

        with wave.open(BytesIO(audio_wave_bytes), "rb") as wf:
            sampling_rate = wf.getframerate()
            num_channels = wf.getnchannels()
            sample_width = wf.getsampwidth()
            frames = wf.readframes(wf.getnframes())

        if sample_width not in sample_dtypes:
            raise ValueError(f"Unsupported WAV sample width: {sample_width} bytes")

        raw_audio_data = np.frombuffer(frames, dtype=sample_dtypes[sample_width])
        if num_channels > 1:
            # Samples are interleaved per frame; one column per channel.
            raw_audio_data = raw_audio_data.reshape(-1, num_channels)
        return (sampling_rate, raw_audio_data)

    @staticmethod
    def _to_normalized_mono(raw_audio_data):
        """Return the samples as mono float32 scaled so the peak magnitude is 1."""
        samples = np.asarray(raw_audio_data)
        if samples.size == 0:
            raise ValueError("No audio samples received.")
        # Convert to mono if stereo
        if samples.ndim > 1:
            samples = samples.mean(axis=1)
        samples = samples.astype(np.float32)
        peak = np.max(np.abs(samples))
        # Skip scaling for all-zero (silent) input: dividing by a zero peak
        # would fill the array with NaNs.
        if peak > 0:
            samples /= peak
        return samples

    def _get_transcriber(self, model_name_and_version):
        """Return the ASR pipeline for a model, loading it on first use only."""
        cache = getattr(self, "_transcribers", None)
        if cache is None:
            # Instance was created without running __init__ (e.g. a subclass
            # that does not call super().__init__()).
            cache = self._transcribers = {}
        if model_name_and_version not in cache:
            device = "cuda" if torch.cuda.is_available() else "cpu"
            cache[model_name_and_version] = pipeline(
                task="automatic-speech-recognition",
                model=model_name_and_version,
                device=device,
            )
        return cache[model_name_and_version]

    def transcribe_from_transformer(
        self, audio, model_name_and_version="openai/whisper-base.en"
    ):
        """Convert audio data to text using a transformers ASR pipeline.

        audio: ``(sampling_rate, raw_audio_data)`` tuple
        model_name_and_version: model identifier passed to the pipeline

        Returns the transcribed text.
        Raises TypeError if ``audio`` cannot be unpacked (e.g. it is None) and
        ValueError if it contains no samples.
        """
        # Validate the input before paying for a model load.
        try:
            sampling_rate, raw_audio_data = audio
        except TypeError as e:
            raise TypeError("No audio data received. Please speak louder.") from e

        samples = self._to_normalized_mono(raw_audio_data)
        transcriber = self._get_transcriber(model_name_and_version)
        return transcriber({"sampling_rate": sampling_rate, "raw": samples})["text"]

    def get_prompt_from_gradio_audio(self, audio):
        """Convert audio captured from gradio to text.

        See https://www.gradio.app/guides/real-time-speech-recognition for more info.

        audio: object containing sampling frequency and raw audio data

        Uses the "openai/whisper-base.en" model; raises the same errors as
        ``transcribe_from_transformer``.
        """
        return self.transcribe_from_transformer(
            audio, model_name_and_version="openai/whisper-base.en"
        )

    def get_prompt_from_file(self, file):
        """Transcribe the speech contained in an audio file.

        file: value accepted by ``sr.AudioFile``

        Returns the recognized text.
        Raises IOError if the file cannot be read; errors from the recognition
        request itself are propagated unchanged.
        """
        try:
            # The file is opened and parsed when the context is entered, so the
            # whole read has to sit inside the try, not just the constructor.
            with sr.AudioFile(file) as source:
                recording = self.recognizer.record(source)
        except OSError:
            # Already an IOError (e.g. FileNotFoundError); keep the specific type.
            raise
        except Exception as e:
            raise IOError(f"Unable to read the audio file: {e}") from e
        return self.recognizer.recognize_google(recording)
if __name__ == "__main__":
    # Manual smoke test: list the microphones, speak a phrase, then transcribe
    # whatever is said into the selected microphone.
    test_audio = Audio()

    # Map each reported microphone name to its device index.
    recognized_mics = {}
    for i, mic in enumerate(sr.Microphone.list_microphone_names()):
        print(f"{i}: {mic}")
        recognized_mics[mic] = i
    print(recognized_mics)

    # Not every machine has a device with this exact name; fall back to None
    # (default device) instead of raising KeyError.
    built_in_idx = recognized_mics.get("Built-in Microphone")

    test_audio.initialize_microphone(built_in_idx)
    test_audio.communicate("Hello class.")
    print(test_audio.recognize_speech_from_mic())