Athspi committed on
Commit
7cc4829
·
verified ·
1 Parent(s): 0f15ec7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +76 -148
app.py CHANGED
@@ -1,27 +1,31 @@
1
  import os
2
- import gradio as gr
3
  from faster_whisper import WhisperModel
4
  import google.generativeai as genai
5
  from gtts import gTTS, lang
6
  import tempfile
7
  import soundfile as sf
8
  from kokoro import KPipeline
 
 
9
 
10
# Configure Gemini API (use environment variable for Hugging Face Spaces)
# Fail fast at import time if the key is missing so the Space shows a clear error.
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
if not GEMINI_API_KEY:
    raise ValueError("GEMINI_API_KEY environment variable not set. Please set it in the Hugging Face Spaces Secrets.")
genai.configure(api_key=GEMINI_API_KEY)

# Initialize the faster-whisper model with fallback compute type
# float16 requires GPU support; WhisperModel raises ValueError when the
# compute type is unsupported, so fall back to int8 on CPU.
model_size = "Systran/faster-whisper-large-v3"
try:
    whisper_model = WhisperModel(model_size, device="auto", compute_type="float16")
except ValueError:
    print("Float16 not supported, falling back to int8 on CPU")
    whisper_model = WhisperModel(model_size, device="cpu", compute_type="int8")
24
- # Language codes for Kokoro TTS
25
  KOKORO_LANGUAGES = {
26
  "American English": "a",
27
  "British English": "b",
@@ -34,156 +38,80 @@ KOKORO_LANGUAGES = {
34
  "Brazilian Portuguese": "p"
35
  }
36
 
37
# Function to transcribe audio using faster-whisper
def transcribe_audio(audio_file):
    """Transcribe an audio file with the module-level faster-whisper model.

    Returns a (transcription, detected_language, error) triple where exactly
    one of transcription/error is None.
    """
    try:
        segments, info = whisper_model.transcribe(audio_file, beam_size=5)
        # segments is lazily evaluated, so join inside the try block.
        joined_text = " ".join(seg.text for seg in segments)
        return joined_text, info.language, None
    except Exception as e:
        return None, None, f"Transcription error: {str(e)}"
 
46
 
47
# Function to translate text using Gemini API with a magic prompt
def translate_text(text, target_language):
    """Translate *text* into *target_language* via the Gemini API.

    Returns a (translated_text, error) pair where exactly one side is None.
    """
    try:
        gemini = genai.GenerativeModel("gemini-2.0-flash")
        # Ask for the bare translation only, so no commentary leaks into TTS.
        prompt = (
            "Translate the following text to "
            f"{target_language} and return only the translated text with no "
            f"additional explanation or commentary:\n\n{text}"
        )
        reply = gemini.generate_content(prompt)
        return reply.text.strip(), None
    except Exception as e:
        return None, f"Translation error: {str(e)}"
57
-
58
# Function to convert text to speech using Kokoro or gTTS based on language
def text_to_speech(text, language):
    """Synthesize *text* and return (audio_path, error).

    Uses Kokoro TTS for languages it supports (wav output); otherwise falls
    back to gTTS (mp3 output). Exactly one of audio_path/error is None.
    """
    try:
        if language not in KOKORO_LANGUAGES:
            # gTTS fallback: map the display name to a gTTS language code.
            tts_langs = lang.tts_langs()
            code = next(
                (abbr for abbr, name in tts_langs.items() if name.lower() == language.lower()),
                "en",
            )
            speech = gTTS(text=text, lang=code, slow=False)
            with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as out:
                speech.save(out.name)
            return out.name, None

        # Kokoro path: only the first generated segment is used.
        pipeline = KPipeline(lang_code=KOKORO_LANGUAGES[language])
        segments = pipeline(text, voice="af_heart", speed=1, split_pattern=r'\n+')
        first_audio = next((audio for _, _, audio in segments), None)
        if first_audio is None:
            raise ValueError("No audio generated by Kokoro")
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as out:
            sf.write(out.name, first_audio, 24000)  # Kokoro outputs 24 kHz audio
        return out.name, None
    except Exception as e:
        return None, f"TTS error: {str(e)}"
86
-
87
# Main function to process audio input and return outputs
def process_audio(audio_file, target_language):
    """Run the transcribe -> translate -> TTS pipeline for the Gradio UI.

    Returns (error, transcription, translated_text, audio_path); error is
    None on success, and later slots are None past the first failure.
    """
    if audio_file is None:
        return "Please upload an audio file or record audio.", None, None, None

    transcription, _detected_language, failure = transcribe_audio(audio_file)
    if failure:
        return failure, None, None, None

    translated, failure = translate_text(transcription, target_language)
    if failure:
        return failure, transcription, None, None

    speech_path, failure = text_to_speech(translated, target_language)
    if failure:
        return failure, transcription, translated, None

    return None, transcription, translated, speech_path
105
 
106
# Gradio interface with custom CSS and JavaScript
# NOTE(review): CSS/JS indentation inside the strings is assumed — the diff
# view strips leading whitespace; CSS/JS are whitespace-insensitive anyway.
css = """
body {
    font-family: 'Arial', sans-serif;
    background-color: #f4f4f4;
    color: #333;
}
.gradio-container {
    max-width: 800px;
    margin: 0 auto;
    padding: 20px;
    background-color: #fff;
    border-radius: 10px;
    box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
}
.gradio-header {
    text-align: center;
    margin-bottom: 20px;
}
.gradio-header h1 {
    font-size: 2.5em;
    color: #444;
}
.gradio-row {
    display: flex;
    flex-direction: column;
    gap: 15px;
}
.gradio-button {
    background-color: #007bff;
    color: white;
    border: none;
    padding: 10px 20px;
    border-radius: 5px;
    cursor: pointer;
    font-size: 1em;
}
.gradio-button:hover {
    background-color: #0056b3;
}
.gradio-output {
    background-color: #f9f9f9;
    padding: 15px;
    border-radius: 5px;
    border: 1px solid #ddd;
}
"""

# Placeholder JS hook; currently a no-op.
js = """
function updateUI() {
    // Add any custom JavaScript here if needed
}
"""

with gr.Blocks(css=css, title="AI Audio Translator") as demo:
    gr.Markdown("# AI Audio Translator", elem_classes="gradio-header")
    gr.Markdown("Upload an audio file or record via microphone, select a target language, and get the transcription, translation, and translated audio! Uses Kokoro TTS for supported languages, otherwise gTTS.")

    # Union of Kokoro display names and gTTS language names; the dict
    # inversion just collects the human-readable gTTS names as keys.
    supported_langs = list(set(list(KOKORO_LANGUAGES.keys()) + list({v: k for k, v in lang.tts_langs().items()}.keys())))

    with gr.Row(elem_classes="gradio-row"):
        audio_input = gr.Audio(sources=["upload", "microphone"], type="filepath", label="Input Audio")
        target_lang = gr.Dropdown(
            choices=sorted(supported_langs),
            value="Spanish",
            label="Target Language"
        )

    submit_btn = gr.Button("Translate", elem_classes="gradio-button")

    with gr.Row(elem_classes="gradio-row"):
        error_output = gr.Textbox(label="Error", visible=True, elem_classes="gradio-output")
        transcription_output = gr.Textbox(label="Transcription", elem_classes="gradio-output")
        translation_output = gr.Textbox(label="Translated Text", elem_classes="gradio-output")
        audio_output = gr.Audio(label="Translated Audio", elem_classes="gradio-output")

    # Wire the button to the full transcribe -> translate -> TTS pipeline.
    submit_btn.click(
        fn=process_audio,
        inputs=[audio_input, target_lang],
        outputs=[error_output, transcription_output, translation_output, audio_output]
    )

# Launch the app
demo.launch()
 
1
  import os
2
+ from flask import Flask, request, jsonify, send_file, send_from_directory
3
  from faster_whisper import WhisperModel
4
  import google.generativeai as genai
5
  from gtts import gTTS, lang
6
  import tempfile
7
  import soundfile as sf
8
  from kokoro import KPipeline
9
+ from werkzeug.utils import secure_filename
10
+ from flask_cors import CORS
11
 
12
# Flask app serving a static frontend; CORS enabled for browser clients.
app = Flask(__name__, static_folder='static')
CORS(app)

# Configure Gemini API
# Fail fast at startup if the API key is missing.
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
if not GEMINI_API_KEY:
    raise ValueError("GEMINI_API_KEY environment variable not set")
genai.configure(api_key=GEMINI_API_KEY)

# Initialize Whisper model
# float16 needs GPU support; WhisperModel raises ValueError for an
# unsupported compute type, so fall back to int8 on CPU.
model_size = "Systran/faster-whisper-large-v3"
try:
    whisper_model = WhisperModel(model_size, device="auto", compute_type="float16")
except ValueError:
    whisper_model = WhisperModel(model_size, device="cpu", compute_type="int8")
27
 
28
+ # Language configurations
29
  KOKORO_LANGUAGES = {
30
  "American English": "a",
31
  "British English": "b",
 
38
  "Brazilian Portuguese": "p"
39
  }
40
 
41
# gTTS language table: {code: display name}, e.g. {'fr': 'French'}.
GTTS_LANGUAGES = lang.tts_langs()
# De-duplicate via set union: several names can appear in both Kokoro and
# gTTS (concatenating the two lists would list them twice in the UI).
SUPPORTED_LANGUAGES = sorted(set(KOKORO_LANGUAGES) | set(GTTS_LANGUAGES.values()))
43
+
44
@app.route('/')
def serve_index():
    """Serve the single-page frontend from the static folder."""
    return send_from_directory(app.static_folder, 'index.html')
47
+
48
@app.route('/languages')
def get_languages():
    """Return the sorted list of supported target languages as JSON."""
    return jsonify(SUPPORTED_LANGUAGES)
51
 
52
@app.route('/translate', methods=['POST'])
def translate_audio():
    """Transcribe an uploaded audio file, translate it, and synthesize speech.

    Expects multipart form data with an 'audio' file and an optional
    'language' field (default 'English').  Returns JSON containing the
    transcription, the translation, and a '/download/...' URL for the
    generated audio; on failure returns {'error': ...} with a 4xx/5xx status.
    """
    temp_input_path = None
    try:
        if 'audio' not in request.files:
            return jsonify({'error': 'No audio file uploaded'}), 400

        audio_file = request.files['audio']
        target_language = request.form.get('language', 'English')

        if not audio_file or audio_file.filename == '':
            return jsonify({'error': 'Invalid audio file'}), 400

        # Save temporary audio file.  secure_filename() returns '' for names
        # made entirely of unsafe characters, so fall back to a default.
        filename = secure_filename(audio_file.filename) or 'upload.audio'
        temp_input_path = os.path.join(tempfile.gettempdir(), filename)
        audio_file.save(temp_input_path)

        # Transcribe audio
        segments, info = whisper_model.transcribe(temp_input_path, beam_size=5)
        transcription = " ".join(segment.text for segment in segments)

        # Translate text; explicitly request the bare translation so the
        # model does not wrap it in commentary that would then be spoken.
        model = genai.GenerativeModel("gemini-2.0-flash")
        prompt = (
            f"Translate to {target_language} preserving meaning and cultural "
            f"nuances. Return only the translated text:\n\n{transcription}"
        )
        response = model.generate_content(prompt)
        translated_text = response.text.strip()

        # Generate TTS
        if target_language in KOKORO_LANGUAGES:
            lang_code = KOKORO_LANGUAGES[target_language]
            pipeline = KPipeline(lang_code=lang_code)
            generator = pipeline(translated_text, voice="af_heart", speed=1)
            audio_data = next((audio for _, _, audio in generator), None)
            # Compare against None explicitly: `if audio_data:` on a
            # multi-element array raises ValueError (ambiguous truth value),
            # and the falsy path would leave temp_output_path unbound.
            if audio_data is None:
                return jsonify({'error': 'TTS produced no audio'}), 500
            fd, temp_output_path = tempfile.mkstemp(suffix=".wav")
            os.close(fd)  # mkstemp returns an open fd; close it to avoid a leak
            sf.write(temp_output_path, audio_data, 24000)
        else:
            lang_code = next((k for k, v in GTTS_LANGUAGES.items() if v == target_language), 'en')
            tts = gTTS(translated_text, lang=lang_code)
            fd, temp_output_path = tempfile.mkstemp(suffix=".mp3")
            os.close(fd)
            tts.save(temp_output_path)

        return jsonify({
            'transcription': transcription,
            'translation': translated_text,
            'audio_url': f'/download/{os.path.basename(temp_output_path)}'
        })

    except Exception as e:
        app.logger.error(f"Error processing request: {str(e)}")
        return jsonify({'error': str(e)}), 500
    finally:
        # Always remove the uploaded input; the output must survive for the
        # subsequent /download request.
        if temp_input_path and os.path.exists(temp_input_path):
            os.remove(temp_input_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
 
104
@app.route('/download/<filename>')
def download_file(filename):
    """Send a previously generated audio file from the temp directory.

    Returns 404 when the name is invalid or the file does not exist.
    """
    # Sanitize: <filename> comes straight from the URL; without this a
    # crafted name like '../../etc/passwd' could escape the temp directory.
    safe_name = secure_filename(filename)
    if not safe_name:
        return jsonify({'error': 'File not found'}), 404
    # Pick the MIME type from the extension: Kokoro output is .wav,
    # gTTS output is .mp3 (the original hardcoded audio/mpeg for both).
    mimetype = "audio/wav" if safe_name.lower().endswith(".wav") else "audio/mpeg"
    try:
        return send_file(
            os.path.join(tempfile.gettempdir(), safe_name),
            mimetype=mimetype,
            as_attachment=True,
            # The original download_name was garbled ("translated_(unknown)");
            # name the download after the served file.
            download_name=f"translated_{safe_name}"
        )
    except FileNotFoundError:
        return jsonify({'error': 'File not found'}), 404
 
 
 
 
 
 
 
 
 
 
 
 
115
 
116
if __name__ == '__main__':
    # Bind to all interfaces for container deployment; the port can be
    # overridden via the PORT env var (default 5000, as before).
    port = int(os.environ.get("PORT", 5000))
    # Never default to debug=True while bound to 0.0.0.0: the Werkzeug
    # debugger allows remote code execution.  Opt in via FLASK_DEBUG=1.
    debug = os.environ.get("FLASK_DEBUG", "").lower() in ("1", "true")
    app.run(host='0.0.0.0', port=port, debug=debug)