"""Audio""" | |
import os | |
import wave | |
from io import BytesIO | |
import numpy as np | |
import torch | |
import speech_recognition as sr | |
import pyttsx3 | |
from gtts import gTTS | |
from pydub import AudioSegment | |
from pydub.playback import play | |
from transformers import pipeline | |
from st_audiorec import st_audiorec # does not have audio processing | |
# If having trouble with ffmpeg, setting these may help | |
# AudioSegment.converter = "C:\\ffmpeg\\ffmpeg\\bin\\ffmpeg.exe" | |
# AudioSegment.ffmpeg = "C:\\ffmpeg\\ffmpeg\\bin\\ffmpeg.exe" | |
# AudioSegment.ffprobe = "C:\\ffmpeg\\ffmpeg\\bin\\ffprobe.exe" | |
class Audio:
    """Audio Class"""

    def __init__(self) -> None:
        """Initialize the speech recognition object"""
        self.recognizer = sr.Recognizer()
        self.microphone = None
        # Disable mic by default
        self.mic_enabled = False

    def initialize_microphone(self, device_index):
        """Initialize the microphone object with the appropriate device

        device_index: int indicating the index of the microphone
        """
        self.microphone = sr.Microphone(device_index)
        self.mic_enabled = True
    def communicate(self, phrase="You forgot to pass the text"):
        """Audio approach that saves to a file and then plays it.

        Could be sped up by doing a sentence at a time.
        phrase: the string to convert to speech
        """
        try:  # online
            temp_file = "temp.mp3"
            gTTS(phrase).save(temp_file)
            audio_file = AudioSegment.from_mp3(temp_file)
            play(audio_file)
            os.remove(temp_file)
        except (gTTSError, IOError, OSError) as e:  # offline fallback
            # gTTS needs a network connection; fall back to local synthesis
            print(f"Error generating or playing audio: {e}")
            # Option without a temporary mp3, but the voice is more robotic
            engine = pyttsx3.init()
            engine.say(phrase)
            engine.runAndWait()
        except Exception as e:
            # Catch other unexpected exceptions
            raise ValueError(f"Unexpected error: {e}") from e
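    # The docstring above notes playback could start sooner by synthesizing one
    # sentence at a time. A minimal sketch of that idea, assuming gTTS/pydub
    # behave as in communicate(); the method name and the naive split on ". "
    # are illustrative assumptions, not part of the original API.
    def communicate_by_sentence(self, phrase="You forgot to pass the text"):
        """Hedged sketch: synthesize and play each sentence as it is ready."""
        for sentence in filter(None, (s.strip() for s in phrase.split(". "))):
            buffer = BytesIO()
            gTTS(sentence).write_to_fp(buffer)  # in-memory, no temp file on disk
            buffer.seek(0)
            play(AudioSegment.from_file(buffer, format="mp3"))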
    def recognize_speech_from_mic(self):
        """Transcribe speech from a microphone.

        Returns a dictionary with the following keys:
        "success": a boolean indicating whether or not the request was successful
        "error": None if successful, otherwise a string containing an error message
        "transcription": a string containing the transcribed text, or None if the
            speech was unrecognizable
        """
        if not self.mic_enabled:
            raise RuntimeError(
                "Microphone is not initialized; call initialize_microphone first."
            )
        # Adjust the recognizer sensitivity to ambient noise and listen to the microphone
        with self.microphone as source:
            self.recognizer.adjust_for_ambient_noise(source)
            audio = self.recognizer.listen(source)
        # Initialize the response object
        response = {"success": True, "error": None, "transcription": None}
        # Try to recognize the speech and handle exceptions accordingly
        try:
            response["transcription"] = self.recognizer.recognize_google(audio)
        except sr.RequestError:
            # API was unreachable or unresponsive
            response["success"] = False
            response["error"] = "API unavailable"
        except sr.UnknownValueError:
            # Speech was unintelligible
            response["success"] = False
            response["error"] = "Unable to recognize speech"
        return response
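    # An illustrative sketch of consuming the response dict returned above:
    #   result = audio.recognize_speech_from_mic()
    #   if result["success"]:
    #       print(result["transcription"])
    #   else:
    #       print(result["error"])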
    def get_streamlit_audio(self):
        """Use the streamlit component to get the audio data.

        https://github.com/stefanrmmr/streamlit-audio-recorder
        """
        try:
            audio_wave_bytes = st_audiorec()
        except Exception as e:
            raise ValueError("Unable to capture audio from browser") from e
        return self.convert_streamlit_audio_to_gradio_format(audio_wave_bytes)
    def convert_streamlit_audio_to_gradio_format(self, audio_wave_bytes):
        """Take audio wave bytes and return them in the format of a gradio audio object:

        sampling_rate, raw_audio_data = audio
        """
        if not audio_wave_bytes:
            raise ValueError("No audio wave bytes received.")
        with wave.open(BytesIO(audio_wave_bytes), "rb") as wf:
            params = wf.getparams()
            sampling_rate = params.framerate
            num_channels = params.nchannels
            num_frames = params.nframes
            # The recorder produces 16-bit PCM, so view the frames as int16
            raw_audio_data = np.frombuffer(wf.readframes(num_frames), dtype=np.int16)
        if num_channels > 1:
            # One column per channel, matching gradio's stereo layout
            raw_audio_data = raw_audio_data.reshape(-1, num_channels)
        return (sampling_rate, raw_audio_data)
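    # For symmetry, a minimal sketch of the inverse conversion, assuming int16
    # samples as produced above; the method name is illustrative, not part of
    # the original API.
    def convert_gradio_format_to_wav_bytes(self, audio):
        """Hedged sketch: pack (sampling_rate, raw_audio_data) into WAV bytes."""
        sampling_rate, raw_audio_data = audio
        num_channels = raw_audio_data.shape[1] if raw_audio_data.ndim > 1 else 1
        buffer = BytesIO()
        with wave.open(buffer, "wb") as wf:
            wf.setnchannels(num_channels)
            wf.setsampwidth(2)  # 2 bytes per sample for int16 PCM
            wf.setframerate(sampling_rate)
            wf.writeframes(raw_audio_data.astype(np.int16).tobytes())
        return buffer.getvalue()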
    def transcribe_from_transformer(
        self, audio, model_name_and_version="openai/whisper-base.en"
    ):
        """Convert audio data to text using transformers"""
        device = "cuda" if torch.cuda.is_available() else "cpu"
        transcriber = pipeline(
            task="automatic-speech-recognition",
            model=model_name_and_version,
            device=device,
        )
        try:
            sampling_rate, raw_audio_data = audio
        except TypeError as e:
            raise TypeError("No audio data received. Please speak louder.") from e
        # Convert to mono if stereo
        if raw_audio_data.ndim > 1:
            raw_audio_data = raw_audio_data.mean(axis=1)
        # Normalize the samples to floats in [-1.0, 1.0], guarding against
        # division by zero on silent input
        raw_audio_data = raw_audio_data.astype(np.float32)
        peak = np.max(np.abs(raw_audio_data))
        if peak > 0:
            raw_audio_data /= peak
        prompt = transcriber({"sampling_rate": sampling_rate, "raw": raw_audio_data})[
            "text"
        ]
        return prompt
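    # Note: building the pipeline on every call re-loads the model weights. If
    # that becomes a bottleneck, one option (an assumption, not part of the
    # original design) is to cache the pipeline, e.g.:
    #   from functools import lru_cache
    #   @lru_cache(maxsize=2)
    #   def _load_transcriber(model_name_and_version, device):
    #       return pipeline("automatic-speech-recognition",
    #                       model=model_name_and_version, device=device)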
    def get_prompt_from_gradio_audio(self, audio):
        """Convert audio captured from gradio to text.

        See https://www.gradio.app/guides/real-time-speech-recognition for more info.
        audio: object containing the sampling frequency and raw audio data
        """
        # Identical pipeline, normalization, and default model as
        # transcribe_from_transformer, so delegate instead of duplicating it
        return self.transcribe_from_transformer(audio)
    def get_prompt_from_file(self, file):
        """Get the prompt from an audio file"""
        try:
            speech = sr.AudioFile(file)
        except Exception as e:
            raise IOError(f"Unable to read the audio file: {e}") from e
        with speech as source:
            audio_data = self.recognizer.record(source)
        text = self.recognizer.recognize_google(audio_data)
        return text
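    # Illustrative usage, assuming a WAV file named "recording.wav" exists:
    #   text = Audio().get_prompt_from_file("recording.wav")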
if __name__ == "__main__": | |
recognized_mics = {} | |
test_audio = Audio() | |
for i, mic in enumerate(sr.Microphone.list_microphone_names()): | |
print(f"{i}: {mic}") | |
recognized_mics.update({mic: i}) | |
built_in_idx = recognized_mics['Built-in Microphone'] | |
print(recognized_mics) | |
test_audio.initialize_microphone(built_in_idx) | |
test_audio.communicate("Hello class.") | |
print(test_audio.recognize_speech_from_mic()) | |