Spaces:

Athspi-ai
/

Audio-translation

Running

App Files Community

Athspi committed on Mar 9

Commit

c07d698

verified ·

1 Parent(s): 073ce19

Update app.py

Browse files

Files changed (1) hide show

app.py +22 -51

app.py CHANGED Viewed

@@ -1,10 +1,10 @@
 import os
-import tempfile
 import base64
-from flask import Flask, request, jsonify, send_file, send_from_directory
 import google.generativeai as genai
-from google.generativeai.types import Content, Part, GenerateContentConfig
 from gtts import gTTS, lang
 from kokoro import KPipeline
 from werkzeug.utils import secure_filename
 from flask_cors import CORS
@@ -16,8 +16,6 @@ CORS(app)
 GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
 if not GEMINI_API_KEY:
     raise ValueError("GEMINI_API_KEY environment variable not set")
-# Initialize Gemini client
 genai.configure(api_key=GEMINI_API_KEY)
 # Language configurations
@@ -61,59 +59,32 @@ def translate_audio():
         temp_input_path = os.path.join(tempfile.gettempdir(), filename)
         audio_file.save(temp_input_path)
-        # Transcribe audio using Gemini
         with open(temp_input_path, "rb") as audio_file:
             audio_data = base64.b64encode(audio_file.read()).decode("utf-8")
-        # Upload file to Gemini
-        uploaded_file = genai.upload_file(path=temp_input_path)
-        # Generate transcription
-        transcription = ""
-        response = genai.generate_content(
-            model="gemini-2.0-flash-lite",
-            contents=[
-                Content(
-                    role="user",
-                    parts=[
-                        Part.from_uri(file_uri=uploaded_file.uri, mime_type=uploaded_file.mime_type),
-                        Part.from_text(text="Transcript the audio and provide only the text. Do not include any explanations or additional information."),
-                    ],
-                ),
-            ],
-            config=GenerateContentConfig(
-                temperature=1,
-                top_p=0.95,
-                top_k=40,
-                max_output_tokens=8192,
-                response_mime_type="text/plain",
-            ),
         )
-        transcription = response.text
-        # Translate text using Gemini
-        translate_prompt = f"Translate the following text to {target_language} and return only the translated text with no additional explanation or commentary:\n\n{transcription}"
-        translated_text = ""
-        response = genai.generate_content(
-            model="gemini-2.0-flash-lite",
-            contents=[
-                Content(
-                    role="user",
-                    parts=[
-                        Part.from_text(text=translate_prompt),
-                    ],
-                ),
-            ],
-            config=GenerateContentConfig(
-                temperature=1,
-                top_p=0.95,
-                top_k=40,
-                max_output_tokens=8192,
-                response_mime_type="text/plain",
-            ),
-        )
-        translated_text = response.text
         # Generate TTS
         if target_language in KOKORO_LANGUAGES:

 import os
 import base64
+from flask import Flask, request, jsonify, send_file
 import google.generativeai as genai
 from gtts import gTTS, lang
+import tempfile
+import soundfile as sf
 from kokoro import KPipeline
 from werkzeug.utils import secure_filename
 from flask_cors import CORS
 GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
 if not GEMINI_API_KEY:
     raise ValueError("GEMINI_API_KEY environment variable not set")
 genai.configure(api_key=GEMINI_API_KEY)
 # Language configurations
         temp_input_path = os.path.join(tempfile.gettempdir(), filename)
         audio_file.save(temp_input_path)
+        # Read audio file as base64
         with open(temp_input_path, "rb") as audio_file:
             audio_data = base64.b64encode(audio_file.read()).decode("utf-8")
+        # Transcribe with Gemini
+        model = genai.GenerativeModel("gemini-1.5-pro-latest")
+        prompt = """Accurately transcribe this audio file. Return only the raw text without any formatting,
+                   punctuation, or additional commentary. Preserve the original language and meaning."""
+        response = model.generate_content(
+            [
+                prompt,
+                {
+                    "mime_type": "audio/" + filename.split('.')[-1],
+                    "data": audio_data
+                }
+            ]
         )
+        transcription = response.text.strip()
+        # Translate with Gemini
+        translate_prompt = f"""Translate this text to {target_language} preserving exact meaning and cultural nuances.
+                            Return only the translated text without any explanations or formatting: {transcription}"""
+        translated_response = model.generate_content(translate_prompt)
+        translated_text = translated_response.text.strip()
         # Generate TTS
         if target_language in KOKORO_LANGUAGES: