"""Flask service: transcribe uploaded audio, translate it, and speak the result.

Pipeline for ``POST /translate``:
    1. Gemini transcribes the uploaded audio.
    2. Gemini translates the transcription into the requested language.
    3. Kokoro (for the languages in ``KOKORO_LANGUAGES``) or gTTS (everything
       else) renders the translation to an audio file in the system temp dir.
    4. The client fetches that file through ``GET /download/<filename>``.
"""

import os
import tempfile

import numpy as np
import soundfile as sf
import google.generativeai as genai
from flask import Flask, request, jsonify, send_file, send_from_directory
from flask_cors import CORS
from gtts import gTTS, lang
from kokoro import KPipeline
from werkzeug.utils import secure_filename

app = Flask(__name__, static_folder='static')
CORS(app)

# Configure Gemini API
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
if not GEMINI_API_KEY:
    raise ValueError("GEMINI_API_KEY environment variable not set")
genai.configure(api_key=GEMINI_API_KEY)

# Language configurations: display name -> Kokoro pipeline language code.
KOKORO_LANGUAGES = {
    "American English": "a",
    "British English": "b",
    "Mandarin Chinese": "z",
    "Spanish": "e",
    "French": "f",
    "Hindi": "h",
    "Italian": "i",
    "Brazilian Portuguese": "p",
}

# gTTS mapping: language code -> display name.
GTTS_LANGUAGES = lang.tts_langs()
GTTS_LANGUAGES['ja'] = 'Japanese'  # Explicit Japanese support

# Every language name a client may request, de-duplicated and sorted.
SUPPORTED_LANGUAGES = sorted(
    set(KOKORO_LANGUAGES) | set(GTTS_LANGUAGES.values())
)

# MIME types accepted for uploaded audio.
ALLOWED_MIME_TYPES = ('audio/wav', 'audio/mpeg', 'audio/mp4', 'audio/webm')

# File extensions the download endpoint will serve, with their MIME types.
# These are exactly the suffixes the TTS helpers below produce.
_DOWNLOAD_MIME_TYPES = {'.wav': 'audio/wav', '.mp3': 'audio/mpeg'}

# Sample rate passed to soundfile when writing Kokoro output.
_KOKORO_SAMPLE_RATE = 24000


def _new_temp_path(suffix: str) -> str:
    """Create an empty temp file with *suffix* and return its path.

    ``tempfile.mkstemp`` returns an already-open OS-level descriptor; it is
    closed here so the process does not leak one descriptor per request.
    """
    fd, path = tempfile.mkstemp(suffix=suffix)
    os.close(fd)
    return path


def _transcribe(model, mime_type: str, audio_bytes: bytes) -> str:
    """Ask Gemini for a verbatim transcription of the given audio bytes."""
    audio_blob = {'mime_type': mime_type, 'data': audio_bytes}
    convo = model.start_chat()
    # First message sets the instructions; the second carries the audio.
    convo.send_message(
        "You are a professional transcriber. Transcribe this audio accurately "
        "and verbatim in the original language. Respond only with the "
        "transcription."
    )
    response = convo.send_message(audio_blob)
    return response.text.strip()


def _translate(model, text: str, target_language: str) -> str:
    """Ask Gemini to translate *text* into *target_language*."""
    prompt = (
        f"Translate the following text to {target_language} preserving "
        f"meaning and cultural nuances. Respond only with the translation:"
        f"\n\n{text}"
    )
    response = model.generate_content(prompt)
    return response.text.strip()


def _synthesize_with_kokoro(text: str, lang_code: str) -> str:
    """Render *text* with Kokoro and return the path of the written WAV file.

    Raises:
        ValueError: if the Kokoro pipeline yields no audio segments.
    """
    pipeline = KPipeline(lang_code=lang_code)
    generator = pipeline(text, voice="af_heart", speed=1)

    # The pipeline yields 3-tuples; only the third element (audio) is used.
    audio_segments = [audio for _, _, audio in generator if audio is not None]
    if not audio_segments:
        raise ValueError("No audio generated by Kokoro")

    output_path = _new_temp_path(".wav")
    sf.write(output_path, np.concatenate(audio_segments), _KOKORO_SAMPLE_RATE)
    return output_path


def _synthesize_with_gtts(text: str, target_language: str) -> str:
    """Render *text* with gTTS and return the path of the written MP3 file.

    The gTTS language code is found by reverse lookup of the display name;
    names gTTS does not know fall back to ``'en'``.
    """
    lang_code = next(
        (code for code, name in GTTS_LANGUAGES.items() if name == target_language),
        'en',
    )
    tts = gTTS(text, lang=lang_code)
    output_path = _new_temp_path(".mp3")
    try:
        tts.save(output_path)
    except Exception:
        # Do not leave an empty placeholder file behind when synthesis fails.
        try:
            os.remove(output_path)
        except OSError:
            pass
        raise
    return output_path


def _synthesize_speech(text: str, target_language: str) -> str:
    """Pick the TTS backend for *target_language* and return the audio path."""
    if target_language in KOKORO_LANGUAGES:
        return _synthesize_with_kokoro(text, KOKORO_LANGUAGES[target_language])
    return _synthesize_with_gtts(text, target_language)


@app.route('/')
def serve_index():
    """Serve the single-page front end from the static folder."""
    return send_from_directory(app.static_folder, 'index.html')


@app.route('/languages')
def get_languages():
    """Return the sorted list of supported target-language names as JSON."""
    return jsonify(SUPPORTED_LANGUAGES)


@app.route('/translate', methods=['POST'])
def translate_audio():
    """Transcribe, translate, and synthesize speech for an uploaded audio file.

    Expects multipart form data with an ``audio`` file and an optional
    ``language`` field (defaults to ``'English'``).

    Returns:
        JSON with ``transcription``, ``translation`` and ``audio_url`` on
        success; JSON with ``error`` and status 400 for invalid uploads, or
        status 500 if any later step raises.
    """
    try:
        if 'audio' not in request.files:
            return jsonify({'error': 'No audio file uploaded'}), 400

        audio_file = request.files['audio']
        target_language = request.form.get('language', 'English')

        if not audio_file or audio_file.filename == '':
            return jsonify({'error': 'Invalid audio file'}), 400

        # Validate MIME type
        if audio_file.mimetype not in ALLOWED_MIME_TYPES:
            return jsonify(
                {'error': f'Unsupported file type: {audio_file.mimetype}'}
            ), 400

        model = genai.GenerativeModel("gemini-2.0-flash")
        transcription = _transcribe(model, audio_file.mimetype, audio_file.read())
        translated_text = _translate(model, transcription, target_language)
        output_path = _synthesize_speech(translated_text, target_language)

        # NOTE: generated files stay in the temp directory; nothing in this
        # module removes them after download.
        return jsonify({
            'transcription': transcription,
            'translation': translated_text,
            'audio_url': f'/download/{os.path.basename(output_path)}',
        })
    except Exception as e:
        # Top-level boundary for the request: log with traceback, report JSON.
        app.logger.exception("Error processing request: %s", e)
        return jsonify({'error': str(e)}), 500


@app.route('/download/<filename>')
def download_file(filename):
    """Send a previously generated audio file from the temp directory.

    The requested name is sanitised and must carry one of the extensions this
    service generates, so the endpoint cannot be used to read arbitrary files.

    Returns:
        The audio file as an attachment, or JSON ``error`` with status 404
        when the name is invalid or the file does not exist.
    """
    safe_name = secure_filename(filename)
    extension = os.path.splitext(safe_name)[1].lower()
    if not safe_name or extension not in _DOWNLOAD_MIME_TYPES:
        return jsonify({'error': 'File not found'}), 404

    file_path = os.path.join(tempfile.gettempdir(), safe_name)
    if not os.path.isfile(file_path):
        return jsonify({'error': 'File not found'}), 404

    try:
        return send_file(
            file_path,
            mimetype=_DOWNLOAD_MIME_TYPES[extension],
            as_attachment=True,
            download_name=f"translated_{safe_name}",
        )
    except FileNotFoundError:
        # The file can vanish between the existence check and the send.
        return jsonify({'error': 'File not found'}), 404


if __name__ == '__main__':
    app.run(host="0.0.0.0", port=7860)