mac9087 committed
Commit 0426b81 · verified · Parent: d97558b

Update app.py

Files changed (1): app.py (+428 −109)
app.py CHANGED
@@ -1,133 +1,364 @@
from flask import Flask, request, jsonify, send_file
from flask_cors import CORS
import tempfile
import os
- import time
- import random
import base64

app = Flask(__name__)
CORS(app)

- # Simple storage for responses
- response_cache = {}
-
- # Configure paths
- TEMP_DIR = "/tmp/ai_responses"
- os.makedirs(TEMP_DIR, exist_ok=True)
-
- # Quick responses library for when no ML is needed
- QUICK_RESPONSES = [
-     "I understand what you're saying.",
-     "I'm following your thoughts.",
-     "I hear you loud and clear.",
-     "That makes sense to me.",
-     "I'm processing that information.",
-     "I hear what you're saying.",
-     "Interesting point.",
-     "I see where you're coming from.",
-     "That's a good perspective.",
-     "I'm with you on that.",
-     "Tell me more about that.",
-     "I'm listening carefully.",
-     "I appreciate your thoughts on this.",
-     "That's an interesting way to look at it.",
-     "I'm taking that into consideration."
- ]
-
- # Responses for questions
- QUESTION_RESPONSES = [
-     "That's a good question. Let me think about it.",
-     "I'm considering different perspectives on that question.",
-     "That's something I've been thinking about as well.",
-     "That's an interesting question to explore.",
-     "I'm processing your question and considering how to respond."
- ]
-
- def get_quick_response(user_input):
-     """Generate a fast response based on simple rules"""
-     # Check cache first for identical requests
-     cache_key = user_input.strip().lower()
-     if cache_key in response_cache:
-         return response_cache[cache_key]
-
-     # Minimal processing
-     if not user_input or len(user_input.strip()) < 3:
-         response = "I'm listening. Please tell me more."
-     elif "?" in user_input:
-         response = random.choice(QUESTION_RESPONSES)
-     else:
-         response = random.choice(QUICK_RESPONSES)

-     # Cache the response
-     response_cache[cache_key] = response

-     # Limit cache size
-     if len(response_cache) > 100:
-         keys_to_remove = list(response_cache.keys())[:-50]
-         for k in keys_to_remove:
-             response_cache.pop(k, None)

-     return response
-
- @app.route("/chat", methods=["POST"])
- def chat():
-     data = request.get_json()
-     if not data or "text" not in data:
-         return jsonify({"error": "Missing 'text' in request body"}), 400

    try:
-         user_input = data["text"]
-         print(f"Text input: {user_input}")

-         # Add a tiny delay to make it seem like it's "thinking" (50-150ms)
-         time.sleep(random.uniform(0.05, 0.15))

-         # Get response
-         final_response = get_quick_response(user_input)
-         print(f"Text response: {final_response}")

-         return jsonify({"response": final_response})
    except Exception as e:
-         print(f"Error in chat endpoint: {str(e)}")
-         return jsonify({"response": "I'm listening."})

@app.route("/talk", methods=["POST"])
def talk():
    if "audio" not in request.files:
        return jsonify({"error": "No audio file"}), 400

    audio_file = request.files["audio"]

    try:
-         # Save the input audio temporarily
-         with tempfile.NamedTemporaryFile(delete=False, suffix=".wav", dir=TEMP_DIR) as tmp:
            audio_path = tmp.name
            audio_file.save(audio_path)

-         # We're not actually processing the audio, just echoing back a response
-         # In a real app, you would transcribe here
-
-         # Get a quick canned response
-         final_response = get_quick_response("Hello")
-
-         # In a real app, you would generate speech here
-         # For now, we'll just copy the input file as a placeholder
-         tts_audio_path = audio_path.replace(".wav", "_reply.wav")
-
-         # Add a small delay to mimic processing time
-         time.sleep(random.uniform(0.1, 0.3))
-
-         # Just copy the file for now since we can't actually generate speech
-         import shutil
-         shutil.copyfile(audio_path, tts_audio_path)

        # Return both the audio file and the text response
        try:
            response = send_file(tts_audio_path, mimetype="audio/wav")
            encoded_response = base64.b64encode(final_response.encode('utf-8')).decode('ascii')
            response.headers["X-Response-Text-Base64"] = encoded_response
            response.headers["Access-Control-Expose-Headers"] = "X-Response-Text-Base64"
            return response
        except Exception as e:
            print(f"Error sending file: {str(e)}")
@@ -135,6 +366,7 @@ def talk():
                "error": "Could not send audio response",
                "text_response": final_response
            }), 500
    except Exception as e:
        print(f"Error in talk endpoint: {str(e)}")
        return jsonify({"error": str(e)}), 500
@@ -143,29 +375,116 @@ def talk():
        try:
            if 'audio_path' in locals() and os.path.exists(audio_path):
                os.unlink(audio_path)
-             if 'tts_audio_path' in locals() and os.path.exists(tts_audio_path) and tts_audio_path != audio_path:
                os.unlink(tts_audio_path)
        except Exception as cleanup_error:
            print(f"Error cleaning up files: {str(cleanup_error)}")

- @app.route("/quick_chat", methods=["POST"])
- def quick_chat():
-     """Alias for chat endpoint for compatibility"""
-     return chat()

- @app.route("/status", methods=["GET"])
- def status():
-     """Simple status endpoint"""
    return jsonify({
-         "status": "ready",
-         "message": "Simple response system running and ready"
    })

- @app.route("/")
- def index():
-     return "Metaverse AI Character API running. Ultra-fast version."

if __name__ == "__main__":
-     print("Starting ultra-fast response API...")
-     # Use threaded server for better concurrency
-     app.run(host="0.0.0.0", port=7860, threaded=True)
from flask import Flask, request, jsonify, send_file
from flask_cors import CORS
+ from faster_whisper import WhisperModel
+ from transformers import pipeline
+ from TTS.api import TTS
import tempfile
import os
+ import re
import base64
+ import threading
+ import functools
+ import time
+ from cachetools import LRUCache, cached, TTLCache
+ import gc
+ import psutil

app = Flask(__name__)
CORS(app)

+ # Global configuration for low CPU environment
+ MODEL_CACHE_SIZE = 200  # Increased cache size to reduce recomputation
+ MODEL_CACHE_TTL = 7200  # Increased cache TTL to 2 hours
+ USE_GPU = False  # No GPU available
+
+ # Load models lazily
+ whisper_model = None
+ llm = None
+ tts = None
+ models_loaded = False
+ models_lock = threading.Lock()
+
+ # Initialize caches
+ response_cache = TTLCache(maxsize=MODEL_CACHE_SIZE, ttl=MODEL_CACHE_TTL)
+
+ def load_models():
+     """Load models optimized for low CPU environments"""
+     global whisper_model, llm, tts, models_loaded

+     if models_loaded:
+         return
+
+     with models_lock:
+         if models_loaded:  # Double-check to avoid race condition
+             return
+
+         print("Loading models for low-resource environment...")
+         start_time = time.time()
+
+         # Force garbage collection before loading models
+         gc.collect()
+
+         # Choose smallest/fastest model options and optimize for CPU
+         device = "cpu"  # Force CPU for limited resources
+         compute_type = "int8"  # Use int8 quantization for faster inference
+
+         # Monitor memory usage
+         def log_memory():
+             process = psutil.Process(os.getpid())
+             memory_info = process.memory_info()
+             memory_mb = memory_info.rss / 1024 / 1024
+             print(f"Memory usage: {memory_mb:.2f} MB")
+
+         # Load whisper model first (most critical for voice input)
+         print("Loading whisper model...")
+         log_memory()
+         whisper_model = WhisperModel("tiny", device=device, compute_type=compute_type)
+
+         # Load LLM next
+         print("Loading language model...")
+         log_memory()
+         llm = pipeline(
+             "text-generation",
+             model="tiiuae/falcon-rw-1b",  # Consider switching to a smaller model if available
+             max_new_tokens=30,  # Reduced token count for faster generation
+             device=-1,  # Force CPU
+         )
+
+         # Finally load TTS
+         print("Loading TTS model...")
+         log_memory()
+         tts = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC",
+                   progress_bar=False,
+                   gpu=False)
+
+         # Force garbage collection again after loading
+         gc.collect()
+
+         models_loaded = True
+         log_memory()
+         print(f"Models loaded in {time.time() - start_time:.2f} seconds")
+
+ @cached(cache=response_cache)
+ def generate_ai_response(user_input):
+     """
+     Generate AI responses with caching to avoid repetitive processing.
+     Optimized for low CPU environments.
+     """
+     load_models()  # Ensure models are loaded

+     # Handle empty or too short input
+     if not user_input or len(user_input.strip()) < 2:
+         return "I'm listening. Please say more."

+     # Normalize and simplify input to improve cache hits
+     normalized_input = user_input.lower().strip()

+     # Check for very similar recent inputs to maximize cache usage
+     for cached_input in response_cache.keys():
+         if cached_input and normalized_input and (
+                 cached_input.lower() in normalized_input or
+                 normalized_input in cached_input.lower() or
+                 levenshtein_distance(normalized_input, cached_input.lower()) < 5):
+             print(f"Using cached similar response for: {cached_input}")
+             return response_cache[cached_input]
+
    try:
+         # Start with a small timeout for real-time experience
+         start_time = time.time()
+         timeout = 3.0  # 3 seconds max for real-time response

+         # Generate response with monitoring
+         raw_response = llm(user_input, max_new_tokens=30)[0]["generated_text"]

+         # Check if we're taking too long
+         elapsed = time.time() - start_time
+         if elapsed > timeout:
+             print(f"Response generation taking too long: {elapsed:.2f}s")
+             return "Let me think about that for a moment."

+         # Process to get clean, short response
+         final_response = process_response(user_input, raw_response)
+
+         # Force garbage collection after processing to keep memory usage low
+         gc.collect()
+
+         return final_response
    except Exception as e:
+         print(f"Error generating AI response: {str(e)}")
+         # Return a default response if anything goes wrong
+         return "I heard you, but I'm having trouble forming a response right now."
+
+ def levenshtein_distance(s1, s2):
+     """
+     Calculate simple string similarity for cache optimization.
+     A simpler implementation than full Levenshtein to save CPU cycles.
+     """
+     if len(s1) < len(s2):
+         return levenshtein_distance(s2, s1)
+
+     if not s2:
+         return len(s1)
+
+     previous_row = range(len(s2) + 1)
+     for i, c1 in enumerate(s1):
+         current_row = [i + 1]
+         for j, c2 in enumerate(s2):
+             insertions = previous_row[j + 1] + 1
+             deletions = current_row[j] + 1
+             substitutions = previous_row[j] + (c1 != c2)
+             current_row.append(min(insertions, deletions, substitutions))
+         previous_row = current_row
+
+     return previous_row[-1]
+
+ def process_response(input_text, generated_text):
+     """Optimized response processing function"""
+     # Handle the case where generated_text might be None
+     if not generated_text:
+         return "I'm not sure what to say about that."
+
+     # Make sure both are strings
+     input_text = str(input_text).strip()
+     generated_text = str(generated_text).strip()
+
+     # Skip empty input
+     if not input_text:
+         clean_response = generated_text
+     # Remove the input text from the beginning of the response
+     elif generated_text.startswith(input_text):
+         clean_response = generated_text[len(input_text):].strip()
+     else:
+         clean_response = generated_text.strip()
+
+     # If we ended up with nothing, provide a default response
+     if not clean_response:
+         return "I'm listening."
+
+     # Split into sentences more efficiently
+     sentences = re.split(r'(?<=[.!?])\s+', clean_response)
+
+     # Filter out empty or very short sentences
+     meaningful_sentences = [s for s in sentences if len(s) > 5]
+
+     # Take just 1-2 sentences for a casual, human-like response
+     if meaningful_sentences:
+         if len(meaningful_sentences) > 2:
+             result = " ".join(meaningful_sentences[:2])
+         else:
+             result = " ".join(meaningful_sentences)
+     else:
+         # If no meaningful sentences, but we have short sentences, use those
+         short_sentences = [s for s in sentences if s.strip()]
+         if short_sentences:
+             result = " ".join(short_sentences[:2])
+         else:
+             # Fallback if no good sentences were found
+             result = "I'm not sure what to say about that."
+
+     # Remove any repetitive phrases
+     result = remove_repetitions(result)
+
+     # Normalize quotes to ASCII equivalents
+     result = normalize_quotes(result)
+
+     return result
+
+ def normalize_quotes(text):
+     """Replace curly quotes with straight quotes - optimized version"""
+     replacements = {
+         '“': '"', '”': '"',
+         '‘': "'", '’': "'"
+     }
+     for old, new in replacements.items():
+         text = text.replace(old, new)
+     return text
+
+ def remove_repetitions(text):
+     """Optimized repetition removal function"""
+     words = text.split()
+     if len(words) <= 5:  # Don't process very short responses
+         return text
+
+     result = []
+     text_so_far = ""
+
+     for i in range(len(words)):
+         # Check if this word starts a repeated phrase
+         if i < len(words) - 3:  # Need at least 3 words to check for repetition
+             # Check if next 3+ words appear earlier in the text
+             is_repetition = False
+
+             for j in range(3, min(10, len(words) - i)):  # Check phrases of length 3 to 10
+                 phrase = " ".join(words[i:i+j])
+                 if phrase in text_so_far:
+                     is_repetition = True
+                     break
+
+             if not is_repetition:
+                 result.append(words[i])
+                 text_so_far += words[i] + " "
+         else:
+             result.append(words[i])
+             text_so_far += words[i] + " "
+
+     return " ".join(result)

@app.route("/talk", methods=["POST"])
def talk():
+     """Optimized voice API endpoint for low-resource environments"""
    if "audio" not in request.files:
        return jsonify({"error": "No audio file"}), 400

+     # Get current memory usage
+     process = psutil.Process(os.getpid())
+     memory_before = process.memory_info().rss / 1024 / 1024
+     print(f"Memory before processing: {memory_before:.2f} MB")
+
+     # Ensure models are loaded
+     load_models()
+
+     # Start timing for end-to-end processing
+     start_time = time.time()
+
+     # Save audio
    audio_file = request.files["audio"]

    try:
+         # Use in-memory processing when possible to avoid disk I/O
+         with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
            audio_path = tmp.name
            audio_file.save(audio_path)

+         # Transcribe with optimized settings
+         try:
+             # Set beam_size=1 for faster transcription with slight accuracy trade-off
+             segments, _ = whisper_model.transcribe(
+                 audio_path,
+                 beam_size=1,
+                 vad_filter=True,  # Filter out non-speech
+                 language="en"  # Specify language if known
+             )
+             transcription = "".join([seg.text for seg in segments])
+
+             print(f"Transcription: {transcription}")
+             print(f"Transcription time: {time.time() - start_time:.2f}s")
+
+             if not transcription.strip():
+                 final_response = "I didn't catch that. Could you please speak again?"
+             else:
+                 # Use the cached response generator
+                 final_response = generate_ai_response(transcription)
+
+             print(f"Voice response: {final_response}")
+             print(f"Response generation time: {time.time() - start_time:.2f}s")
+
+             # Cache frequently used responses as pre-synthesized audio files
+             response_hash = str(hash(final_response))
+             cached_audio_path = os.path.join(tempfile.gettempdir(), f"cached_response_{response_hash}.wav")
+
+             if os.path.exists(cached_audio_path):
+                 print("Using cached audio response")
+                 tts_audio_path = cached_audio_path
+             else:
+                 # Prepare TTS output path
+                 tts_audio_path = audio_path.replace(".wav", "_reply.wav")
+
+                 try:
+                     # Synthesize speech with optimized settings
+                     tts.tts_to_file(
+                         text=final_response,
+                         file_path=tts_audio_path,
+                         speed=1.1  # Slightly faster speech for quicker responses
+                     )
+
+                     if not os.path.exists(tts_audio_path) or os.path.getsize(tts_audio_path) == 0:
+                         raise Exception("TTS failed to generate audio file")
+
+                     # Cache this response for future use
+                     if len(final_response) < 100:  # Only cache short responses
+                         try:
+                             import shutil
+                             shutil.copy(tts_audio_path, cached_audio_path)
+                         except Exception as cache_error:
+                             print(f"Error caching audio: {str(cache_error)}")
+
+                 except Exception as e:
+                     print(f"TTS error: {str(e)}")
+                     tts_audio_path = audio_path
+                     final_response = "Sorry, I couldn't generate audio right now."
+         except Exception as e:
+             print(f"Transcription error: {str(e)}")
+             final_response = "I had trouble understanding that. Could you try again?"
+             tts_audio_path = audio_path

        # Return both the audio file and the text response
        try:
            response = send_file(tts_audio_path, mimetype="audio/wav")
+
+             # Base64 encode the response text
            encoded_response = base64.b64encode(final_response.encode('utf-8')).decode('ascii')
            response.headers["X-Response-Text-Base64"] = encoded_response
            response.headers["Access-Control-Expose-Headers"] = "X-Response-Text-Base64"
+
+             # Log total processing time
+             print(f"Total processing time: {time.time() - start_time:.2f}s")
+             memory_after = process.memory_info().rss / 1024 / 1024
+             print(f"Memory after processing: {memory_after:.2f} MB")
+
+             # Force garbage collection
+             gc.collect()
+
            return response
        except Exception as e:
            print(f"Error sending file: {str(e)}")

                "error": "Could not send audio response",
                "text_response": final_response
            }), 500
+
    except Exception as e:
        print(f"Error in talk endpoint: {str(e)}")
        return jsonify({"error": str(e)}), 500

        try:
            if 'audio_path' in locals() and os.path.exists(audio_path):
                os.unlink(audio_path)
+             if 'tts_audio_path' in locals() and tts_audio_path != cached_audio_path and tts_audio_path != audio_path and os.path.exists(tts_audio_path):
                os.unlink(tts_audio_path)
        except Exception as cleanup_error:
            print(f"Error cleaning up files: {str(cleanup_error)}")
+
+     # Final garbage collection
+     gc.collect()
+
+ @app.route("/chat", methods=["POST"])
+ def chat():
+     data = request.get_json()
+     if not data or "text" not in data:
+         return jsonify({"error": "Missing 'text' in request body"}), 400
+
+     # Ensure models are loaded
+     load_models()
+
+     try:
+         user_input = data["text"]
+         print(f"Text input: {user_input}")  # Debugging
+
+         # Use the cached response generator
+         final_response = generate_ai_response(user_input)
+
+         print(f"Text response: {final_response}")  # Debugging
+
+         return jsonify({"response": final_response})
+     except Exception as e:
+         print(f"Error in chat endpoint: {str(e)}")
+         return jsonify({"response": "I'm having trouble processing that. Could you try again?", "error": str(e)})
+
+ @app.route("/")
+ def index():
+     return "Metaverse AI Character API running."
+
+ # Cache for frequently used TTS responses
+ tts_audio_cache = {}

+ # Pre-cache common responses
+ def precache_common_responses():
+     """Pre-generate audio for common responses to save processing time"""
+     common_responses = [
+         "I didn't catch that. Could you please speak again?",
+         "I'm listening. Please say more.",
+         "I heard you, but I'm having trouble forming a response right now.",
+         "I'm not sure what to say about that.",
+         "Let me think about that for a moment."
+     ]
+
+     global tts
+     if tts is None:
+         load_models()
+
+     print("Pre-caching common audio responses...")
+     for response in common_responses:
+         try:
+             response_hash = str(hash(response))
+             cached_path = os.path.join(tempfile.gettempdir(), f"cached_response_{response_hash}.wav")
+
+             if not os.path.exists(cached_path):
+                 with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
+                     tmp_path = tmp.name
+
+                 tts.tts_to_file(text=response, file_path=tmp_path)
+                 os.rename(tmp_path, cached_path)
+
+             tts_audio_cache[response] = cached_path
+             print(f"Cached: {response}")
+         except Exception as e:
+             print(f"Failed to cache response '{response}': {str(e)}")
+
+     print("Finished pre-caching")

+ # Health check endpoint to verify API is running properly
+ @app.route("/health", methods=["GET"])
+ def health_check():
+     """Health check endpoint to verify API is running"""
+     memory_usage = psutil.Process(os.getpid()).memory_info().rss / 1024 / 1024
+
    return jsonify({
+         "status": "ok",
+         "models_loaded": models_loaded,
+         "memory_usage_mb": round(memory_usage, 2),
+         "cache_size": len(response_cache),
+         "uptime_seconds": time.time() - startup_time
    })

+ # Track startup time
+ startup_time = time.time()

if __name__ == "__main__":
+     print("Starting Metaverse AI Character API (Optimized for real-time on 2vCPU)...")
+
+     # Start loading models in a background thread
+     model_thread = threading.Thread(target=load_models)
+     model_thread.daemon = True  # Allow the thread to be terminated when the main program exits
+     model_thread.start()
+
+     # Start pre-caching in another thread
+     cache_thread = threading.Thread(target=precache_common_responses)
+     cache_thread.daemon = True
+     cache_thread.start()
+
+     # Optimize Flask for low-resource environment
+     # Use threaded=True with lower thread count to prevent CPU overload
+     app.run(
+         host="0.0.0.0",
+         port=7860,
+         threaded=True,
+         # Options below reduce resource usage
+         debug=False,  # Disable debug mode for production
+         use_reloader=False  # Disable reloader to prevent duplicate processes
+     )
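
For reference, a minimal client sketch (not part of the commit) showing how the updated endpoints could be exercised. The base URL, file names, and the local-port assumption are placeholders; only the routes, the "text" JSON field, the "audio" upload key, and the X-Response-Text-Base64 header come from the code above.

import base64
import requests

BASE_URL = "http://localhost:7860"  # placeholder; substitute your deployment's URL

# Text chat: POST JSON with a "text" field to /chat
r = requests.post(f"{BASE_URL}/chat", json={"text": "Hello there"})
print(r.json()["response"])

# Voice: POST a WAV file under the "audio" key to /talk.
# The response body is the synthesized WAV; the text reply is returned
# base64-encoded in the X-Response-Text-Base64 header.
with open("input.wav", "rb") as f:  # placeholder input file
    r = requests.post(f"{BASE_URL}/talk", files={"audio": f})
reply_text = base64.b64decode(r.headers["X-Response-Text-Base64"]).decode("utf-8")
with open("reply.wav", "wb") as out:
    out.write(r.content)
print(reply_text)

# Health check endpoint added in this commit
print(requests.get(f"{BASE_URL}/health").json())

The text reply travels in a response header because the body of the /talk response is the audio itself; the header is exposed to browsers via Access-Control-Expose-Headers.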