Athspi committed on
Commit
11a3089
·
verified ·
1 Parent(s): 2435954

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +59 -19
app.py CHANGED
@@ -1,10 +1,10 @@
1
  import os
 
 
2
  from flask import Flask, request, jsonify, send_file, send_from_directory
3
- from faster_whisper import WhisperModel
4
- import google.generativeai as genai
5
  from gtts import gTTS, lang
6
- import tempfile
7
- import soundfile as sf
8
  from kokoro import KPipeline
9
  from werkzeug.utils import secure_filename
10
  from flask_cors import CORS
@@ -16,14 +16,9 @@ CORS(app)
16
  GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
17
  if not GEMINI_API_KEY:
18
  raise ValueError("GEMINI_API_KEY environment variable not set")
19
- genai.configure(api_key=GEMINI_API_KEY)
20
 
21
- # Initialize Whisper model
22
- model_size = "Systran/faster-whisper-large-v3"
23
- try:
24
- whisper_model = WhisperModel(model_size, device="auto", compute_type="float16")
25
- except ValueError:
26
- whisper_model = WhisperModel(model_size, device="cpu", compute_type="int8")
27
 
28
  # Language configurations
29
  KOKORO_LANGUAGES = {
@@ -66,15 +61,60 @@ def translate_audio():
66
  temp_input_path = os.path.join(tempfile.gettempdir(), filename)
67
  audio_file.save(temp_input_path)
68
 
69
- # Transcribe audio
70
- segments, info = whisper_model.transcribe(temp_input_path, beam_size=5)
71
- transcription = " ".join([segment.text for segment in segments])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
 
73
- # Translate text
74
- model = genai.GenerativeModel("gemini-2.0-flash")
75
- prompt = f"Translate to {target_language} preserving meaning and cultural nuances:\n\n{transcription}"
76
- response = model.generate_content(prompt)
77
- translated_text = response.text.strip()
 
 
78
 
79
  # Generate TTS
80
  if target_language in KOKORO_LANGUAGES:
 
1
  import os
2
+ import tempfile
3
+ import base64
4
  from flask import Flask, request, jsonify, send_file, send_from_directory
5
+ from google import genai
6
+ from google.genai import types
7
  from gtts import gTTS, lang
 
 
8
  from kokoro import KPipeline
9
  from werkzeug.utils import secure_filename
10
  from flask_cors import CORS
 
16
  GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
17
  if not GEMINI_API_KEY:
18
  raise ValueError("GEMINI_API_KEY environment variable not set")
 
19
 
20
+ # Initialize Gemini client
21
+ client = genai.Client(api_key=GEMINI_API_KEY)
 
 
 
 
22
 
23
  # Language configurations
24
  KOKORO_LANGUAGES = {
 
61
  temp_input_path = os.path.join(tempfile.gettempdir(), filename)
62
  audio_file.save(temp_input_path)
63
 
64
+ # Transcribe audio using Gemini
65
+ with open(temp_input_path, "rb") as audio_file:
66
+ audio_data = base64.b64encode(audio_file.read()).decode("utf-8")
67
+
68
+ files = [client.files.upload(file=temp_input_path)]
69
+
70
+ contents = [
71
+ types.Content(
72
+ role="user",
73
+ parts=[
74
+ types.Part.from_uri(
75
+ file_uri=files[0].uri,
76
+ mime_type=files[0].mime_type,
77
+ ),
78
+ types.Part.from_text(text="Transcript the audio and provide only the text. Do not include any explanations or additional information."),
79
+ ],
80
+ ),
81
+ ]
82
+
83
+ generate_content_config = types.GenerateContentConfig(
84
+ temperature=1,
85
+ top_p=0.95,
86
+ top_k=40,
87
+ max_output_tokens=8192,
88
+ response_mime_type="text/plain",
89
+ )
90
+
91
+ transcription = ""
92
+ for chunk in client.models.generate_content_stream(
93
+ model="gemini-2.0-flash-lite",
94
+ contents=contents,
95
+ config=generate_content_config,
96
+ ):
97
+ transcription += chunk.text
98
+
99
+ # Translate text using Gemini
100
+ translate_prompt = f"Translate the following text to {target_language} and return only the translated text with no additional explanation or commentary:\n\n{transcription}"
101
+
102
+ translate_contents = [
103
+ types.Content(
104
+ role="user",
105
+ parts=[
106
+ types.Part.from_text(text=translate_prompt),
107
+ ],
108
+ ),
109
+ ]
110
 
111
+ translated_text = ""
112
+ for chunk in client.models.generate_content_stream(
113
+ model="gemini-2.0-flash-lite",
114
+ contents=translate_contents,
115
+ config=generate_content_config,
116
+ ):
117
+ translated_text += chunk.text
118
 
119
  # Generate TTS
120
  if target_language in KOKORO_LANGUAGES: