mac9087 committed
Commit 596a84e · verified · 1 parent: a77cb2e

Update app.py

Files changed (1): app.py (+135 -34)
app.py CHANGED
@@ -24,8 +24,13 @@ models_loaded = False
 loading_thread = None
 load_queue = queue.Queue()
 
-# Use a smaller Whisper model for faster inference
-WHISPER_MODEL_SIZE = "tiny"  # Changed from "small" to "tiny"
+# Define paths with proper permissions
+TEMP_DIR = "/tmp/ai_models"
+os.makedirs(TEMP_DIR, exist_ok=True)
+
+# Environment variable to control model size
+# Set to "tiny" for fastest response, "base" for better quality but still fast
+WHISPER_MODEL_SIZE = os.environ.get("WHISPER_MODEL_SIZE", "tiny")
 
 def load_models():
     """Load all models in background thread"""
@@ -33,13 +38,14 @@ def load_models():
 
     print("Starting model loading...")
 
-    # Load Whisper model with optimized settings
-    whisper_model = WhisperModel(
-        WHISPER_MODEL_SIZE,
-        device="cpu",
-        compute_type="int8",
-        download_root="./models"  # Cache models to disk
-    )
+    try:
+        # Load Whisper model with optimized settings
+        whisper_model = WhisperModel(
+            WHISPER_MODEL_SIZE,
+            device="cpu",
+            compute_type="int8",
+            download_root=TEMP_DIR  # Use temp directory with write permissions
+        )
     print("Whisper model loaded")
 
     # Use a smaller, faster LLM
@@ -62,13 +68,47 @@ def load_models():
     with model_lock:
         models_loaded = True
 
-    print("All models loaded successfully")
+    except Exception as e:
+        print(f"Error loading Whisper model: {str(e)}")
+        whisper_model = None
+
+        # Mark models as loaded even if some failed - we'll use fallbacks
+        with model_lock:
+            models_loaded = True
+
+    print("Model loading completed")
 
     # Process any pending requests that arrived during loading
     while not load_queue.empty():
         callback = load_queue.get()
         callback()
 
+# Fallback methods for when models fail to load
+def fallback_transcribe(audio_path):
+    """Simple fallback when Whisper fails to load"""
+    # Just return empty text - in production you might want a more sophisticated fallback
+    return "I couldn't transcribe the audio due to technical issues."
+
+def fallback_generate_text(user_input):
+    """Simple rule-based response when LLM fails to load"""
+    # Very basic template responses
+    if not user_input or len(user_input) < 5:
+        return "I'm listening. Please continue."
+
+    if "?" in user_input:
+        return "That's an interesting question. I'm processing it now."
+
+    # Simple acknowledgment responses
+    responses = [
+        "I understand what you're saying.",
+        "I'm following your thoughts.",
+        "I hear you loud and clear.",
+        "I'm processing that information.",
+        "That makes sense to me."
+    ]
+    import random
+    return random.choice(responses)
+
 # Start loading models in background thread
 def start_loading_models():
     global loading_thread
@@ -76,7 +116,16 @@ def start_loading_models():
     loading_thread.daemon = True
     loading_thread.start()
 
-start_loading_models()
+# Create temp directory and start loading
+try:
+    os.makedirs(TEMP_DIR, exist_ok=True)
+    print(f"Created model cache directory at {TEMP_DIR}")
+    start_loading_models()
+except Exception as e:
+    print(f"Error setting up model loading: {str(e)}")
+    # Automatically mark as loaded with no models
+    with model_lock:
+        models_loaded = True
 
 def ensure_models_loaded(callback):
     """Ensure models are loaded before processing a request"""
@@ -140,13 +189,18 @@ def generate_ai_response(user_input):
         return "I'm listening. Please say more."
 
     try:
-        start_time = time.time()
-        # Generate response with fewer tokens
-        raw_response = llm(user_input)[0]["generated_text"]
-
-        # Process to get clean, short response
-        final_response = process_response(user_input, raw_response)
-        print(f"LLM processing time: {time.time() - start_time:.2f}s")
+        # If LLM failed to load, use fallback
+        if llm is None:
+            print("Using fallback text generation")
+            final_response = fallback_generate_text(user_input)
+        else:
+            start_time = time.time()
+            # Generate response with fewer tokens
+            raw_response = llm(user_input)[0]["generated_text"]
+
+            # Process to get clean, short response
+            final_response = process_response(user_input, raw_response)
+            print(f"LLM processing time: {time.time() - start_time:.2f}s")
 
         # Cache the response for future identical requests
         response_cache[cache_key] = final_response
@@ -173,19 +227,31 @@ def talk():
     def process_request():
         nonlocal audio_file
         try:
-            with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
+            # Prepare file paths
+            with tempfile.NamedTemporaryFile(delete=False, suffix=".wav", dir=TEMP_DIR) as tmp:
                 audio_path = tmp.name
             audio_file.save(audio_path)
 
-            # Transcribe with optimized settings
+            # Transcribe audio
             transcribe_start = time.time()
-            segments, _ = whisper_model.transcribe(
-                audio_path,
-                beam_size=1,  # Reduce beam size for speed
-                vad_filter=True,  # Use voice activity detection to process only speech
-                vad_parameters=dict(min_silence_duration_ms=500)  # Tune VAD for speed
-            )
-            transcription = "".join([seg.text for seg in segments])
+            if whisper_model is None:
+                # Fallback if model failed to load
+                print("Using fallback transcription")
+                transcription = fallback_transcribe(audio_path)
+            else:
+                try:
+                    # Transcribe with optimized settings
+                    segments, _ = whisper_model.transcribe(
+                        audio_path,
+                        beam_size=1,  # Reduce beam size for speed
+                        vad_filter=True,  # Use voice activity detection to process only speech
+                        vad_parameters=dict(min_silence_duration_ms=500)  # Tune VAD for speed
+                    )
+                    transcription = "".join([seg.text for seg in segments])
+                except Exception as e:
+                    print(f"Whisper transcription error: {str(e)}")
+                    transcription = ""
+
             print(f"Transcription time: {time.time() - transcribe_start:.2f}s")
 
             if not transcription.strip():
@@ -196,14 +262,29 @@ def talk():
             # Prepare TTS output path
            tts_audio_path = audio_path.replace(".wav", "_reply.wav")
 
-            # Synthesize speech with optimized settings
+            # Synthesize speech
             tts_start = time.time()
-            tts.tts_to_file(
-                text=final_response,
-                file_path=tts_audio_path,
-                speaker_wav=None,
-                speed=1.1  # Slightly faster speech
-            )
+            if tts is None:
+                # If TTS failed to load, create a simple audio file with message
+                print("Using fallback TTS (no speech synthesis)")
+                # Just copy the input file as a placeholder
+                import shutil
+                shutil.copyfile(audio_path, tts_audio_path)
+            else:
+                try:
+                    # Synthesize speech with optimized settings
+                    tts.tts_to_file(
+                        text=final_response,
+                        file_path=tts_audio_path,
+                        speaker_wav=None,
+                        speed=1.1  # Slightly faster speech
+                    )
+                except Exception as e:
+                    print(f"TTS error: {str(e)}")
+                    # Just copy the input file as a placeholder
+                    import shutil
+                    shutil.copyfile(audio_path, tts_audio_path)
+
             print(f"TTS time: {time.time() - tts_start:.2f}s")
 
             # Return both the audio file and the text response
@@ -278,6 +359,26 @@ def status():
 def index():
     return "Metaverse AI Character API running."
 
+# Add direct-response mode for maximum performance
+@app.route("/quick_chat", methods=["POST"])
+def quick_chat():
+    """Ultra-fast endpoint that skips ML models completely for instant responses"""
+    data = request.get_json()
+    if not data or "text" not in data:
+        return jsonify({"error": "Missing 'text' in request body"}), 400
+
+    try:
+        user_input = data["text"]
+        print(f"Quick chat input: {user_input}")
+
+        # Use simple rule-based responses for maximum speed
+        final_response = fallback_generate_text(user_input)
+
+        return jsonify({"response": final_response})
+    except Exception as e:
+        print(f"Error in quick_chat: {str(e)}")
+        return jsonify({"response": "I'm listening."})
+
 if __name__ == "__main__":
     # Use threaded server for better concurrency
     app.run(host="0.0.0.0", port=7860, threaded=True)
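Below is a minimal client sketch for the /quick_chat route added in this commit. It assumes the Flask app is running locally on port 7860, as in the __main__ block; the base URL, prompt text, and timeout are illustrative assumptions, while the route name, the "text" request field, and the "response" reply field follow the code added above.

# Hypothetical client for the new /quick_chat endpoint (illustrative, not part of the commit).
# Assumes the server was started with `python app.py` and listens on port 7860.
# WHISPER_MODEL_SIZE can optionally be exported (e.g. "base") before starting the server,
# since load_models() reads it from the environment with a "tiny" default; that only
# affects the /talk transcription path, not this endpoint.
import requests

BASE_URL = "http://localhost:7860"  # assumed host and port, matching app.run(...) above

payload = {"text": "Hello, can you hear me?"}  # "text" is the field quick_chat() reads
reply = requests.post(f"{BASE_URL}/quick_chat", json=payload, timeout=10)
reply.raise_for_status()  # the endpoint returns 400 only when "text" is missing
print(reply.json()["response"])  # short rule-based reply from fallback_generate_text()

Because quick_chat() calls fallback_generate_text() directly and never touches Whisper, the LLM, or TTS, it can answer even while the background model-loading thread is still running or after a model has failed to load.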