import torch
import gc
import traceback
import gradio as gr
import librosa
from gtts import gTTS
import whisper  # official OpenAI Whisper package (PyPI: openai-whisper)

# Define the device for processing
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {DEVICE}")

# Free up memory before loading models
gc.collect()
if DEVICE == "cuda":
    torch.cuda.empty_cache()
    print(f"CUDA memory allocated: {torch.cuda.memory_allocated() / 1024**2:.2f} MB")
    print(f"CUDA memory reserved: {torch.cuda.memory_reserved() / 1024**2:.2f} MB")

# Try importing transformers, with a fallback if it is unavailable
try:
    from transformers import WhisperProcessor, WhisperForConditionalGeneration, pipeline
    TRANSFORMERS_AVAILABLE = True
    print("Transformers package loaded successfully")
except Exception as e:
    TRANSFORMERS_AVAILABLE = False
    print(f"Warning: Could not import from transformers: {e}")


class WhisperTranscriber:
    def __init__(self, model_size="tiny"):
        print(f"Initializing Whisper transcriber with model size: {model_size}")
        self.model_size = model_size
        self.processor = None
        self.model = None
        self.official_model = None

        # Try to initialize using transformers first
        if TRANSFORMERS_AVAILABLE:
            try:
                print(f"Loading Whisper processor: openai/whisper-{model_size}")
                self.processor = WhisperProcessor.from_pretrained(f"openai/whisper-{model_size}")
                print(f"Loading Whisper model: openai/whisper-{model_size}")
                self.model = WhisperForConditionalGeneration.from_pretrained(f"openai/whisper-{model_size}")
                if DEVICE == "cuda":
                    print("Moving model to CUDA")
                    self.model = self.model.to(DEVICE)
                print("Transformers Whisper initialization complete")
            except Exception as e:
                print(f"Error initializing Whisper with transformers: {e}")
                traceback.print_exc()
                self.processor = None
                self.model = None

        # If transformers failed or is unavailable, try the official OpenAI implementation
        if self.processor is None or self.model is None:
            try:
                print(f"Falling back to official OpenAI Whisper implementation with model size: {model_size}")
                self.official_model = whisper.load_model(model_size)
                print("Official Whisper model loaded successfully")
            except Exception as e:
                print(f"Error initializing official Whisper model: {e}")
                traceback.print_exc()
                self.official_model = None

        # Check whether any model was loaded
        if (self.processor is None or self.model is None) and self.official_model is None:
            print("WARNING: All Whisper initialization attempts failed!")
        else:
            print("Whisper initialized successfully with at least one implementation")

    def transcribe(self, audio_path):
        # Try transcribing with the transformers implementation first
        if self.processor is not None and self.model is not None:
            try:
                print("Transcribing with transformers implementation...")
                # Load audio at 16 kHz, the sample rate Whisper expects
                waveform, sample_rate = librosa.load(audio_path, sr=16000)
                # Convert the waveform to log-mel input features
                input_features = self.processor(waveform, sampling_rate=16000, return_tensors="pt").input_features
                if DEVICE == "cuda":
                    input_features = input_features.to(DEVICE)
                # Generate the transcription token IDs (max_length caps output length)
                with torch.no_grad():
                    predicted_ids = self.model.generate(input_features, max_length=100)
                # Decode the token IDs to text
                transcription = self.processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
                print("Transcription successful with transformers implementation")
                return transcription
            except Exception as e:
                print(f"Error in transformers transcription: {e}")
                traceback.print_exc()
        # Fall back to the official implementation if available
        if self.official_model is not None:
            try:
                print("Falling back to official Whisper implementation...")
                result = self.official_model.transcribe(audio_path)
                transcription = result["text"]
                print("Transcription successful with official implementation")
                return transcription
            except Exception as e:
                print(f"Error in official Whisper transcription: {e}")
                traceback.print_exc()

        print("All transcription attempts failed")
        return "Error: Transcription failed. Please check the logs for details."


class GrammarCorrector:
    def __init__(self):
        print("Initializing grammar corrector...")
        try:
            # Initialize the grammar correction pipeline
            self.corrector = pipeline("text2text-generation", model="pszemraj/grammar-synthesis-small")
            print("Grammar corrector initialized successfully")
        except Exception as e:
            print(f"Error initializing grammar corrector: {e}")
            traceback.print_exc()
            self.corrector = None

    def correct(self, text):
        if not text or not text.strip():
            return text
        if self.corrector is not None:
            try:
                # Use the grammar correction pipeline
                corrected_text = self.corrector(f"grammar correction: {text}")[0]["generated_text"]
                return corrected_text
            except Exception as e:
                print(f"Error in grammar correction: {e}")
                return text
        else:
            print("No valid grammar correction model available. Returning original text.")
            return text


class TextToSpeech:
    def __init__(self):
        print("Initializing text-to-speech engine...")

    def speak(self, text, output_file="output_speech.mp3"):
        try:
            tts = gTTS(text=text, lang="en", slow=False)
            tts.save(output_file)
            print(f"Speech saved to {output_file}")
            return output_file
        except Exception as e:
            print(f"Error with gTTS: {e}")
            traceback.print_exc()
            return None  # None (rather than False) so the Gradio Audio output is simply cleared


class SpeechProcessor:
    def __init__(self, whisper_model_size="tiny"):
        print(f"Initializing Speech Processor with Whisper model size: {whisper_model_size}")
        self.transcriber = WhisperTranscriber(model_size=whisper_model_size)
        self.grammar_corrector = GrammarCorrector()
        self.tts = TextToSpeech()

    def process_text(self, text):
        """Process text input: correct grammar and generate speech."""
        print("Processing text input...")
        # Correct grammar and punctuation
        corrected_text = self.grammar_corrector.correct(text)
        # Generate speech from the corrected text
        speech_file = self.tts.speak(corrected_text, "output_speech.mp3")
        return corrected_text, speech_file

    def process_audio(self, audio_path):
        """Process audio input: transcribe, correct grammar, and generate speech."""
        print(f"Processing audio input from: {audio_path}")
        if not audio_path:
            return "Failed to get audio", None, None
        # Transcribe the audio
        transcription = self.transcriber.transcribe(audio_path)
        if transcription.startswith("Error:"):
            return transcription, None, None
        # Correct grammar and punctuation
        corrected_text = self.grammar_corrector.correct(transcription)
        # Generate speech from the corrected text
        speech_file = self.tts.speak(corrected_text, "output_speech.mp3")
        return transcription, corrected_text, speech_file


# Initialize the processor
processor = SpeechProcessor(whisper_model_size="tiny")


# Define Gradio handler functions for the interface
def process_text_input(text):
    """Handle text input from the Gradio interface."""
    corrected_text, speech_file = processor.process_text(text)
    return corrected_text, speech_file


def process_audio_input(audio_file):
    """Handle audio upload/recording from the Gradio interface."""
    if audio_file is None:
        return "No audio provided", "No audio provided", None
    transcription, corrected_text, speech_file = processor.process_audio(audio_file)
    if transcription.startswith("Error:"):
        return transcription, "", None
    return transcription, corrected_text, speech_file
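
# A minimal usage sketch (not part of the original app): SpeechProcessor can
# also be driven directly, without the Gradio UI, e.g. for batch jobs. The
# "sample.wav" path below is a hypothetical placeholder.
#
#   proc = SpeechProcessor(whisper_model_size="tiny")
#   transcription, corrected, speech_path = proc.process_audio("sample.wav")
#   print(corrected)
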
# Create the Gradio interface
def create_gradio_interface():
    with gr.Blocks(title="Speech Processing System") as demo:
        gr.Markdown("# Speech Processing System")
        gr.Markdown("Transcribe, correct grammar, and generate speech.")

        with gr.Tab("Text Input"):
            with gr.Row():
                text_input = gr.Textbox(placeholder="Enter text to process", label="Input Text", lines=5)
                text_button = gr.Button("Process Text")
            with gr.Row():
                corrected_text_output = gr.Textbox(label="Corrected Text", lines=5)
                speech_output = gr.Audio(label="Speech Output")
            text_button.click(
                fn=process_text_input,
                inputs=[text_input],
                outputs=[corrected_text_output, speech_output],
            )

        with gr.Tab("Audio Input"):
            with gr.Row():
                audio_input = gr.Audio(
                    sources=["microphone", "upload"],
                    type="filepath",
                    label="Upload or Record Audio",
                )
                audio_button = gr.Button("Process Audio")
            with gr.Row():
                transcription_output = gr.Textbox(label="Transcription", lines=3)
                audio_corrected_text = gr.Textbox(label="Corrected Text", lines=3)
            with gr.Row():
                audio_speech_output = gr.Audio(label="Speech Output")
            audio_button.click(
                fn=process_audio_input,
                inputs=[audio_input],
                outputs=[transcription_output, audio_corrected_text, audio_speech_output],
            )

        gr.Markdown("## How to use")
        gr.Markdown("""
        1. **Text Input Tab**: Enter text, then click 'Process Text'. The system will correct grammar and generate speech.
        2. **Audio Input Tab**: Upload an audio file or record using your microphone, then click 'Process Audio'. The system will transcribe your speech, correct grammar, and generate improved speech.
        """)

    return demo


# Launch the interface
demo = create_gradio_interface()
if __name__ == "__main__":
    demo.launch()
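
# Setup note (assumptions, not from the original script): the imports above map
# to these PyPI packages -- torch, transformers, openai-whisper, librosa, gTTS,
# gradio. A typical install:
#
#   pip install torch transformers openai-whisper librosa gTTS gradio
#
# Model weights are downloaded on first run and then run locally; gTTS calls an
# online Google endpoint, so speech synthesis requires an internet connection.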