Spaces:

Athspi-ai
/

Audio-translation

Running

App Files Files Community

Athspi committed on Mar 8

Commit

11a3089

verified ·

1 Parent(s): 2435954

Update app.py

Browse files

Files changed (1) hide show

app.py +59 -19

app.py CHANGED Viewed

@@ -1,10 +1,10 @@
 import os
 from flask import Flask, request, jsonify, send_file, send_from_directory
-from faster_whisper import WhisperModel
-import google.generativeai as genai
 from gtts import gTTS, lang
-import tempfile
-import soundfile as sf
 from kokoro import KPipeline
 from werkzeug.utils import secure_filename
 from flask_cors import CORS
@@ -16,14 +16,9 @@ CORS(app)
 GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
 if not GEMINI_API_KEY:
     raise ValueError("GEMINI_API_KEY environment variable not set")
-genai.configure(api_key=GEMINI_API_KEY)
-# Initialize Whisper model
-model_size = "Systran/faster-whisper-large-v3"
-try:
-    whisper_model = WhisperModel(model_size, device="auto", compute_type="float16")
-except ValueError:
-    whisper_model = WhisperModel(model_size, device="cpu", compute_type="int8")
 # Language configurations
 KOKORO_LANGUAGES = {
@@ -66,15 +61,60 @@ def translate_audio():
         temp_input_path = os.path.join(tempfile.gettempdir(), filename)
         audio_file.save(temp_input_path)
-        # Transcribe audio
-        segments, info = whisper_model.transcribe(temp_input_path, beam_size=5)
-        transcription = " ".join([segment.text for segment in segments])
-        # Translate text
-        model = genai.GenerativeModel("gemini-2.0-flash")
-        prompt = f"Translate to {target_language} preserving meaning and cultural nuances:\n\n{transcription}"
-        response = model.generate_content(prompt)
-        translated_text = response.text.strip()
         # Generate TTS
         if target_language in KOKORO_LANGUAGES:

 import os
+import tempfile
+import base64
 from flask import Flask, request, jsonify, send_file, send_from_directory
+from google import genai
+from google.genai import types
 from gtts import gTTS, lang
 from kokoro import KPipeline
 from werkzeug.utils import secure_filename
 from flask_cors import CORS
 GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
 if not GEMINI_API_KEY:
     raise ValueError("GEMINI_API_KEY environment variable not set")
+# Initialize Gemini client
+client = genai.Client(api_key=GEMINI_API_KEY)
 # Language configurations
 KOKORO_LANGUAGES = {
         temp_input_path = os.path.join(tempfile.gettempdir(), filename)
         audio_file.save(temp_input_path)
+        # Transcribe audio using Gemini
+        with open(temp_input_path, "rb") as audio_file:
+            audio_data = base64.b64encode(audio_file.read()).decode("utf-8")
+        files = [client.files.upload(file=temp_input_path)]
+        contents = [
+            types.Content(
+                role="user",
+                parts=[
+                    types.Part.from_uri(
+                        file_uri=files[0].uri,
+                        mime_type=files[0].mime_type,
+                    ),
+                    types.Part.from_text(text="Transcript the audio and provide only the text. Do not include any explanations or additional information."),
+                ],
+            ),
+        ]
+        generate_content_config = types.GenerateContentConfig(
+            temperature=1,
+            top_p=0.95,
+            top_k=40,
+            max_output_tokens=8192,
+            response_mime_type="text/plain",
+        )
+        transcription = ""
+        for chunk in client.models.generate_content_stream(
+            model="gemini-2.0-flash-lite",
+            contents=contents,
+            config=generate_content_config,
+        ):
+            transcription += chunk.text
+        # Translate text using Gemini
+        translate_prompt = f"Translate the following text to {target_language} and return only the translated text with no additional explanation or commentary:\n\n{transcription}"
+        translate_contents = [
+            types.Content(
+                role="user",
+                parts=[
+                    types.Part.from_text(text=translate_prompt),
+                ],
+            ),
+        ]
+        translated_text = ""
+        for chunk in client.models.generate_content_stream(
+            model="gemini-2.0-flash-lite",
+            contents=translate_contents,
+            config=generate_content_config,
+        ):
+            translated_text += chunk.text
         # Generate TTS
         if target_language in KOKORO_LANGUAGES: