Spaces:

Athspi-ai
/

Audio-translation

Running

File size: 4,712 Bytes

5f33e0e
7582b7f
d0dd39c
ab0df5d
dbe8a71
c07d698
6ebed08
 
7cc4829
 
dbe8a71
6ebed08
5ddb059
7cc4829
 
5ddb059
 
 
 
dbe8a71
7cc4829
6ebed08
 
 
 
 
 
 
 
 
 
 
7cc4829
dbed07a
317b2f2
dbed07a
 
 
6ebed08
dbed07a
 
 
63a0fca
dbed07a
 
 
965bd2d
5ddb059
 
6c131f6
5ddb059
 
dbed07a
5ddb059
 
dbed07a
5ddb059
 
 
dbed07a
 
 
 
5ddb059
dbed07a
63a0fca
 
dbed07a
 
 
 
 
 
 
 
 
 
 
 
 
 
6ebed08
 
dbed07a
 
6ebed08
 
 
 
7582b7f
dbed07a
7582b7f
 
70e979d
7582b7f
 
dbed07a
 
 
 
 
7582b7f
6ebed08
dbed07a
6ebed08
 
dbed07a
 
 
5ddb059
7cc4829
 
dbed07a
5ddb059
dbed07a
5ddb059
dbed07a
5ddb059
dbe8a71
7cc4829
 
 
 
 
 
 
 
ef2c8e0
dbed07a
 
dbe8a71
7cc4829
889b89d

import os
import numpy as np
from flask import Flask, request, jsonify, send_file, send_from_directory
import google.generativeai as genai
from gtts import gTTS, lang
import tempfile
import soundfile as sf
from kokoro import KPipeline
from werkzeug.utils import secure_filename
from flask_cors import CORS

# Flask application object; static assets are looked up in the 'static' folder.
app = Flask(__name__, static_folder='static')
CORS(app)

# Gemini credentials come from the environment; refuse to start without them.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
if not GEMINI_API_KEY:
    raise ValueError("GEMINI_API_KEY environment variable not set")
genai.configure(api_key=GEMINI_API_KEY)

# Display name -> single-letter language code passed to KPipeline.
KOKORO_LANGUAGES = {
    "American English": "a",
    "British English": "b",
    "Mandarin Chinese": "z",
    "Spanish": "e",
    "French": "f",
    "Hindi": "h",
    "Italian": "i",
    "Brazilian Portuguese": "p",
}

# gTTS language code -> display name, with Japanese pinned explicitly.
GTTS_LANGUAGES = {**lang.tts_langs(), 'ja': 'Japanese'}

# Alphabetical, de-duplicated union of every display name either engine offers.
SUPPORTED_LANGUAGES = sorted({*KOKORO_LANGUAGES, *GTTS_LANGUAGES.values()})

@app.route('/')
def serve_index():
    """Serve ``index.html`` from the application's static folder."""
    static_dir = app.static_folder
    return send_from_directory(static_dir, 'index.html')

@app.route('/languages')
def get_languages():
    """Return the supported target-language names as a JSON array."""
    language_names = SUPPORTED_LANGUAGES
    return jsonify(language_names)

def _new_temp_audio_path(suffix):
    """Reserve an empty file in the temp directory and return its path."""
    fd, path = tempfile.mkstemp(suffix=suffix)
    # mkstemp returns an *open* OS-level descriptor; close it right away so
    # each request does not leak one (the audio writers reopen the path).
    os.close(fd)
    return path


def _discard_file(path):
    """Best-effort removal of a temp file that could not be filled."""
    try:
        os.remove(path)
    except OSError:
        pass


def _synthesize_speech(text, target_language):
    """Render ``text`` to an audio file in the temp directory.

    Kokoro is used (WAV at 24000 Hz) when ``target_language`` is a key of
    ``KOKORO_LANGUAGES``; otherwise gTTS is used (MP3).

    Returns:
        Path of the generated audio file.

    Raises:
        ValueError: Kokoro produced no audio segments.
    """
    if target_language in KOKORO_LANGUAGES:
        pipeline = KPipeline(lang_code=KOKORO_LANGUAGES[target_language])
        # NOTE(review): the same voice ("af_heart") is used for every Kokoro
        # language -- confirm this is intended.
        generator = pipeline(text, voice="af_heart", speed=1)
        audio_segments = [audio for _, _, audio in generator if audio is not None]
        if not audio_segments:
            raise ValueError("No audio generated by Kokoro")

        output_path = _new_temp_audio_path(".wav")
        try:
            sf.write(output_path, np.concatenate(audio_segments), 24000)
        except Exception:
            _discard_file(output_path)
            raise
        return output_path

    # gTTS wants a language code, so reverse-map the display name; 'en' is
    # kept as the fallback for names without an entry.
    lang_code = next(
        (code for code, name in GTTS_LANGUAGES.items() if name == target_language),
        'en',
    )
    tts = gTTS(text, lang=lang_code)
    output_path = _new_temp_audio_path(".mp3")
    try:
        tts.save(output_path)
    except Exception:
        _discard_file(output_path)
        raise
    return output_path


@app.route('/translate', methods=['POST'])
def translate_audio():
    """Transcribe an uploaded audio clip, translate it and synthesize speech.

    Request (multipart form):
        audio: the audio file; its MIME type must be one of audio/wav,
            audio/mpeg, audio/mp4 or audio/webm.
        language: target language display name (defaults to 'English'); must
            be one of ``SUPPORTED_LANGUAGES``.

    Returns:
        200 with JSON ``transcription``, ``translation`` and ``audio_url``
        (a ``/download/<name>`` path) on success;
        400 with JSON ``error`` for a missing/invalid file, unsupported MIME
        type or unsupported language;
        422 with JSON ``error`` when the transcription comes back empty;
        500 with JSON ``error`` for any other failure.
    """
    try:
        if 'audio' not in request.files:
            return jsonify({'error': 'No audio file uploaded'}), 400

        audio_file = request.files['audio']
        target_language = request.form.get('language', 'English')

        if not audio_file or audio_file.filename == '':
            return jsonify({'error': 'Invalid audio file'}), 400

        # Validate MIME type
        allowed_mime_types = ['audio/wav', 'audio/mpeg', 'audio/mp4', 'audio/webm']
        if audio_file.mimetype not in allowed_mime_types:
            return jsonify({'error': f'Unsupported file type: {audio_file.mimetype}'}), 400

        # The language name is interpolated into the LLM prompt below, so only
        # accept values from the known list instead of arbitrary client text.
        if target_language not in SUPPORTED_LANGUAGES:
            return jsonify({'error': f'Unsupported language: {target_language}'}), 400

        # Transcribe audio using Gemini
        model = genai.GenerativeModel("gemini-2.0-flash")
        audio_blob = {
            'mime_type': audio_file.mimetype,
            'data': audio_file.read()
        }

        # The instruction goes first so the audio message that follows is
        # answered with the transcription only.
        convo = model.start_chat()
        convo.send_message("You are a professional transcriber. Transcribe this audio accurately and verbatim in the original language. Respond only with the transcription.")
        response = convo.send_message(audio_blob)
        transcription = response.text.strip()

        # Nothing to translate or speak; report it instead of feeding an empty
        # string to the translation prompt and the TTS engines.
        if not transcription:
            return jsonify({'error': 'No speech could be transcribed from the audio'}), 422

        # Translate text using Gemini
        prompt = f"Translate the following text to {target_language} preserving meaning and cultural nuances. Respond only with the translation:\n\n{transcription}"
        response = model.generate_content(prompt)
        translated_text = response.text.strip()

        # Generate TTS
        temp_output_path = _synthesize_speech(translated_text, target_language)

        return jsonify({
            'transcription': transcription,
            'translation': translated_text,
            'audio_url': f'/download/{os.path.basename(temp_output_path)}'
        })

    except Exception as e:
        # logger.exception keeps the traceback alongside the message.
        app.logger.exception(f"Error processing request: {str(e)}")
        return jsonify({'error': str(e)}), 500

@app.route('/download/<filename>')
def download_file(filename):
    """Send a previously generated audio file from the temp directory.

    Only plain file names ending in ``.wav`` or ``.mp3`` (the two suffixes the
    translate endpoint creates) are served, so the route cannot be used to
    fetch unrelated files that happen to live in the shared temp directory.

    Args:
        filename: Base name of the file, as embedded in ``audio_url``.

    Returns:
        The file as an attachment named ``translated_<filename>`` with a MIME
        type matching its extension, or a 404 JSON error when the name is not
        acceptable or the file does not exist.
    """
    # Content type per extension; the original always claimed audio/mpeg,
    # which is wrong for the WAV files produced by the Kokoro branch.
    mimetypes_by_extension = {'.wav': 'audio/wav', '.mp3': 'audio/mpeg'}

    safe_name = secure_filename(filename)
    extension = os.path.splitext(safe_name)[1].lower()
    mimetype = mimetypes_by_extension.get(extension)

    # Reject anything secure_filename had to alter (path tricks, odd
    # characters) as well as extensions this app never generates.
    if safe_name != filename or mimetype is None:
        return jsonify({'error': 'File not found'}), 404

    try:
        return send_file(
            os.path.join(tempfile.gettempdir(), safe_name),
            mimetype=mimetype,
            as_attachment=True,
            download_name=f"translated_{safe_name}"
        )
    except FileNotFoundError:
        return jsonify({'error': 'File not found'}), 404

if __name__ == '__main__':
    # Executed as a script: listen on every interface, port 7860.
    app.run(port=7860, host="0.0.0.0")