import streamlit as st
from PIL import Image
import numpy as np
import tempfile
import soundfile as sf
import torch
import easyocr

# ---------------------------
# Caching the OCR reader for performance
# ---------------------------
@st.cache_resource(show_spinner=False)
def load_ocr_reader(languages):
    # EasyOCR expects language codes like "en", "es", "ch_sim", "ar".
    return easyocr.Reader(languages, gpu=False)

# ---------------------------
# Caching TTS model loading (Silero TTS)
# ---------------------------
@st.cache_resource(show_spinner=False)
def load_tts_model(language):
    # Map our language codes to Silero (hub language, model id) pairs.
    # Silero officially ships English and Spanish models; there are no
    # official Chinese or Arabic models, so those fall back to English.
    lang_model_map = {
        'en': ('en', 'v3_en'),
        'es': ('es', 'v3_es'),
        'ch': ('en', 'v3_en'),  # fallback to English for now
        'ar': ('en', 'v3_en'),  # fallback to English for now
    }
    hub_language, model_id = lang_model_map.get(language, ('en', 'v3_en'))
    device = torch.device('cpu')
    # Load the Silero TTS model from torch.hub. This downloads the model the
    # first time it runs and returns (model, example_text); note that the
    # 'speaker' argument here is the *model id*, not a voice name.
    model, example_text = torch.hub.load(
        repo_or_dir='snakers4/silero-models',
        model='silero_tts',
        language=hub_language,
        speaker=model_id
    )
    model.to(device)
    # apply_tts() later needs an actual voice from model.speakers
    # (e.g. 'en_0') and one of the supported sample rates (8000/24000/48000).
    sample_rate = 48000
    voice = model.speakers[0]
    return model, sample_rate, voice

def synthesize_speech(text, language):
    model, sample_rate, voice = load_tts_model(language)
    # apply_tts returns a 1-D torch tensor holding the audio waveform;
    # convert it to a NumPy array for soundfile.
    audio = model.apply_tts(text=text, speaker=voice, sample_rate=sample_rate)
    return audio.numpy(), sample_rate

def save_audio(audio, sample_rate):
    # Save audio to a temporary file and return its path.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as f:
        sf.write(f.name, audio, sample_rate)
        return f.name

def extract_text_from_image(image_array, languages):
    reader = load_ocr_reader(languages)
    results = reader.readtext(image_array)
    # Each result is (bounding_box, text, confidence); keep the text parts.
    extracted_text = " ".join([res[1] for res in results])
    return extracted_text

# ---------------------------
# Mapping for EasyOCR language codes
# (EasyOCR uses 'ch_sim' for Simplified Chinese)
# ---------------------------
ocr_language_map = {
    'en': 'en',
    'es': 'es',
    'ch': 'ch_sim',
    'ar': 'ar'
}

# ---------------------------
# Streamlit App UI
# ---------------------------
st.title("Image-to-Audio Description App")
st.write("Upload an image or enter text to generate audio descriptions.")

# Select language for both OCR and TTS
language = st.selectbox("Select language", options=['en', 'es', 'ch', 'ar'], index=0)

# Choose input method
input_method = st.radio("Input method", options=["Upload Image", "Enter Text"])

text = ""
if input_method == "Upload Image":
    uploaded_file = st.file_uploader("Choose an image file", type=["jpg", "jpeg", "png"])
    if uploaded_file is not None:
        # Drop any alpha channel: EasyOCR expects a 3-channel image.
        image = Image.open(uploaded_file).convert("RGB")
        st.image(image, caption='Uploaded Image', use_column_width=True)
        # Convert the PIL image to a NumPy array for EasyOCR.
        image_array = np.array(image)
        with st.spinner("Extracting text from image..."):
            # EasyOCR uses its own language codes; map our selection.
            ocr_lang = [ocr_language_map.get(language, 'en')]
            text = extract_text_from_image(image_array, ocr_lang)
        st.write("**Extracted Text:**")
        st.write(text)
else:
    text = st.text_area("Enter text to synthesize", "Type your description here...")

if text and st.button("Generate Speech"):
    with st.spinner("Synthesizing speech..."):
        audio, sr = synthesize_speech(text, language)
        audio_file = save_audio(audio, sr)
    st.success("Audio generated!")
    st.audio(audio_file)
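
# ---------------------------
# Optional: chunked synthesis for long extracted text
# ---------------------------
# Silero's apply_tts() is happiest with short inputs and can reject very
# long strings, while OCR on a dense page can return a lot of text. The
# helper below is a minimal sketch, not wired into the UI above:
# synthesize_long_text() and its max_chars limit are illustrative names,
# and the crude period-based sentence split could be swapped for a proper
# tokenizer.
def synthesize_long_text(text, language, max_chars=500):
    model, sample_rate, voice = load_tts_model(language)
    # Greedily pack sentences into chunks of at most max_chars characters.
    sentences = [s.strip() + '.' for s in text.split('.') if s.strip()]
    chunks, current = [], ""
    for sentence in sentences:
        if current and len(current) + len(sentence) + 1 > max_chars:
            chunks.append(current)
            current = sentence
        else:
            current = (current + " " + sentence).strip()
    if current:
        chunks.append(current)
    if not chunks:
        return np.zeros(0, dtype=np.float32), sample_rate
    # Synthesize each chunk separately and join the waveforms.
    parts = [
        model.apply_tts(text=chunk, speaker=voice, sample_rate=sample_rate).numpy()
        for chunk in chunks
    ]
    return np.concatenate(parts), sample_rate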
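
# ---------------------------
# Optional: serve audio from memory instead of temp files
# ---------------------------
# save_audio() above leaves .wav files behind in the temp directory. A
# minimal alternative sketch: soundfile can write WAV data to an in-memory
# buffer (format must be given explicitly for file-like objects), and
# st.audio accepts raw bytes directly. audio_to_wav_bytes is an
# illustrative name, not part of the app above.
import io

def audio_to_wav_bytes(audio, sample_rate):
    buffer = io.BytesIO()
    sf.write(buffer, audio, sample_rate, format='WAV')
    return buffer.getvalue()

# Usage: st.audio(audio_to_wav_bytes(audio, sr), format='audio/wav')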