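# Image-to-Audio Description App: a Streamlit script that extracts text
# from uploaded images with EasyOCR and reads it aloud with Silero TTS.
# Assuming the file is saved as app.py, launch it with: streamlit run app.py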
import streamlit as st
from PIL import Image
import numpy as np
import tempfile
import soundfile as sf
import torch
import easyocr
# ---------------------------
# Caching the OCR reader for performance
# ---------------------------
@st.cache_resource(show_spinner=False)
def load_ocr_reader(languages):
    # EasyOCR expects language codes like "en", "es", "ch_sim", "ar"
    return easyocr.Reader(languages, gpu=False)
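# Note: easyocr.Reader downloads its detection and recognition weights the
# first time a given language set is requested, so the first OCR call is slow.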
# ---------------------------
# Caching TTS model loading (Silero TTS)
# ---------------------------
@st.cache_resource(show_spinner=False)
def load_tts_model(language):
    # Map our language codes to a Silero language, model ID, and default speaker.
    # Silero ships official models for languages such as 'en' and 'es'; for
    # languages it does not cover (Chinese and Arabic here), fall back to English.
    lang_config = {
        'en': ('en', 'v3_en', 'en_0'),
        'es': ('es', 'v3_es', 'es_0'),
        'ch': ('en', 'v3_en', 'en_0'),  # no official Chinese model; English fallback
        'ar': ('en', 'v3_en', 'en_0')   # no official Arabic model; English fallback
    }
    tts_language, model_id, speaker = lang_config.get(language, ('en', 'v3_en', 'en_0'))
    device = torch.device('cpu')
    # Load the Silero TTS model from torch.hub; it is downloaded on first run.
    # For 'silero_tts' the hub entry point returns (model, example_text).
    model, example_text = torch.hub.load(
        repo_or_dir='snakers4/silero-models',
        model='silero_tts',
        language=tts_language,
        speaker=model_id
    )
    model.to(device)
    sample_rate = 48000  # Silero v3 models support 8000, 24000, and 48000 Hz
    return model, sample_rate, speaker
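# The speaker names above assume the v3 checkpoints: 'v3_en' exposes speakers
# 'en_0' through 'en_117' (plus 'random'), and 'v3_es' exposes 'es_0'..'es_2'.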
def synthesize_speech(text, language):
    model, sample_rate, speaker = load_tts_model(language)
    # apply_tts returns a 1-D torch tensor holding the audio waveform;
    # convert it to a NumPy array so soundfile can write it.
    audio = model.apply_tts(text=text, speaker=speaker, sample_rate=sample_rate)
    return audio.numpy(), sample_rate
def save_audio(audio, sample_rate):
    # Save the waveform to a temporary WAV file and return its path.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as f:
        sf.write(f.name, audio, sample_rate)
    return f.name
def extract_text_from_image(image_array, languages):
    reader = load_ocr_reader(languages)
    results = reader.readtext(image_array)
    # Each result is a (bounding_box, text, confidence) tuple; join the text parts.
    extracted_text = " ".join(res[1] for res in results)
    return extracted_text
# ---------------------------
# Mapping to EasyOCR language codes (EasyOCR uses 'ch_sim' for Simplified Chinese)
# ---------------------------
ocr_language_map = {
    'en': 'en',
    'es': 'es',
    'ch': 'ch_sim',
    'ar': 'ar'
}
# ---------------------------
# Streamlit App UI
# ---------------------------
st.title("Image-to-Audio Description App")
st.write("Upload an image or enter text to generate audio descriptions.")
# Select language for both OCR and TTS
language = st.selectbox("Select language", options=['en', 'es', 'ch', 'ar'], index=0)
# Choose input method
input_method = st.radio("Input method", options=["Upload Image", "Enter Text"])
text = ""
if input_method == "Upload Image":
uploaded_file = st.file_uploader("Choose an image file", type=["jpg", "jpeg", "png"])
if uploaded_file is not None:
image = Image.open(uploaded_file)
st.image(image, caption='Uploaded Image', use_column_width=True)
# Convert PIL image to numpy array for EasyOCR
image_array = np.array(image)
with st.spinner("Extracting text from image..."):
# EasyOCR expects language codes; here we wrap our choice.
ocr_lang = [ocr_language_map.get(language, 'en')]
text = extract_text_from_image(image_array, ocr_lang)
st.write("**Extracted Text:**")
st.write(text)
else:
text = st.text_area("Enter text to synthesize", "Type your description here...")
if text and st.button("Generate Speech"):
    with st.spinner("Synthesizing speech..."):
        audio, sr = synthesize_speech(text, language)
        audio_file = save_audio(audio, sr)
    st.success("Audio generated!")
    st.audio(audio_file)
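# Dependencies (all imported above): streamlit, pillow, numpy, soundfile,
# torch, easyocr; Silero's torch.hub entry point also needs omegaconf.
# A typical install:
#   pip install streamlit pillow numpy soundfile torch easyocr omegaconf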