Spaces:

Kartheesh
/

speech-to-speech

Sleeping

App Files Files Community

Kartheesh committed on Aug 8, 2024

Commit

5803861

verified ·

1 Parent(s): 0594aa4

Update app.py

Browse files

Files changed (1) hide show

app.py +68 -0

app.py CHANGED Viewed

	@@ -1,2 +1,70 @@





1































































2

+import fitz  # PyMuPDF
+from transformers import VitsModel, MBartForConditionalGeneration, AutoTokenizer
+import torch
+import soundfile as sf
+import gradio as gr
+# Load the translation model and tokenizer
+translation_tokenizer = AutoTokenizer.from_pretrained("facebook/mbart-large-50-one-to-many-mmt", use_fast=False)
+translation_model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-one-to-many-mmt")
+# Load the TTS model and tokenizer
+tts_tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-hin")
+tts_model = VitsModel.from_pretrained("facebook/mms-tts-hin")
+def extract_text_from_pdf(pdf_file):
+    """Extract text from a PDF file."""
+    doc = fitz.open(pdf_file)
+    text = ""
+    for page in doc:
+        text += page.get_text()
+    return text
+def process_pdf(pdf_file):
+    # Extract text from the PDF
+    input_text = extract_text_from_pdf(pdf_file)
+    # Convert sentences to tensors
+    model_inputs = translation_tokenizer(input_text, return_tensors="pt", padding=True, truncation=True)
+    # Translate from English to Hindi
+    generated_tokens = translation_model.generate(
+        **model_inputs,
+        forced_bos_token_id=translation_tokenizer.lang_code_to_id["hi_IN"]
+    )
+    # Decode the translated tokens to text
+    translation = translation_tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
+    translated_text = " ".join(translation)  # Join all translated sentences
+    # Tokenize the translated text for TTS
+    tts_inputs = tts_tokenizer(translated_text, return_tensors="pt")
+    # Generate the waveform
+    try:
+        with torch.no_grad():
+            tts_output = tts_model(**tts_inputs)
+            waveform = tts_output.waveform.squeeze().cpu().numpy()
+    except RuntimeError as e:
+        return f"Runtime Error: {e}"
+    # Save the waveform to an audio file
+    audio_path = "output.wav"
+    sf.write(audio_path, waveform, 22050)
+    return audio_path
+def gradio_interface(pdf_file):
+    audio_path = process_pdf(pdf_file.name)
+    return audio_path
+# Create the Gradio interface
+iface = gr.Interface(
+    fn=gradio_interface,
+    inputs=gr.File(file_count="single"),
+    outputs="audio"
+)
+# Launch the Gradio app
+iface.launch(debug=True)