mac9087 committed
Commit 542f872 · verified · 1 Parent(s): d5ed7cc

Update app.py

Files changed (1)
  1. app.py +255 -89
app.py CHANGED
@@ -101,74 +101,114 @@ job_results = {}
 generation_thread = None
 is_thread_running = False
 
+# New global variables for optimizations
+last_usage_time = None
+active_jobs = 0
+max_concurrent_jobs = 1 # Limit concurrent jobs for 2vCPU
+
+def get_adaptive_parameters():
+    """Adjust parameters based on current system resources"""
+    mem = psutil.virtual_memory()
+
+    # Base parameters
+    params = {
+        'karras_steps': 8,
+        'batch_size': 1,
+        'guidance_scale': 15.0
+    }
+
+    # If memory is tight, reduce steps further
+    if mem.percent > 70:
+        params['karras_steps'] = 6
+
+    # If we have more memory to spare, can be slightly more generous
+    if mem.percent < 50:
+        params['karras_steps'] = 10
+
+    print(f"Adaptive parameters chosen: karras_steps={params['karras_steps']}, mem={mem.percent}%")
+    return params
+
+def check_memory_pressure():
+    """Check if memory is getting too high and take action if needed"""
+    mem = psutil.virtual_memory()
+    if mem.percent > 85: # Critical threshold
+        print("WARNING: Memory pressure critical. Forcing garbage collection.")
+        gc.collect()
+        torch.cuda.empty_cache() if torch.cuda.is_available() else None
+
+        # If still critical, try more aggressive measures
+        if psutil.virtual_memory().percent > 80:
+            print("EMERGENCY: Memory still critical. Clearing model cache.")
+            # Reset global models to force reload when memory is better
+            global xm, model, diffusion
+            xm, model, diffusion = None, None, None
+            gc.collect()
+        return True
+    return False
+
+def load_transmitter_model():
+    global xm, last_usage_time
+    last_usage_time = time.time()
+
+    if xm is None:
+        print("Loading transmitter model...")
+        xm = load_model('transmitter', device=device)
+        print("Transmitter model loaded!")
+
+def load_primary_model():
+    global model, diffusion, last_usage_time
+    last_usage_time = time.time()
+
+    if model is None or diffusion is None:
+        print("Loading primary models...")
+        torch.set_default_dtype(torch.float32) # Use float32 instead of float64
+        model = load_model('text300M', device=device)
+        diffusion = diffusion_from_config(load_config('diffusion'))
+        print("Primary models loaded!")
+
 def load_models_if_needed():
-    global xm, model, diffusion
-    if xm is None or model is None or diffusion is None:
-        print("Loading models for the first time...")
-        try:
-            # Set lower precision for memory optimization
-            torch.set_default_dtype(torch.float32) # Use float32 instead of float64
-            xm = load_model('transmitter', device=device)
-            model = load_model('text300M', device=device)
-            diffusion = diffusion_from_config(load_config('diffusion'))
-            print("Models loaded successfully!")
-        except Exception as e:
-            print(f"Error loading models: {e}")
-            raise
+    """Legacy function for compatibility"""
+    load_primary_model()
+    load_transmitter_model()
 
-def worker_thread():
-    global is_thread_running
-    is_thread_running = True
-
-    try:
-        while True:
-            try:
-                # Get job from queue with a timeout
-                job_id, prompt = job_queue.get(timeout=1)
-                print(f"Processing job {job_id} with prompt: {prompt}")
-
-                # Process the job
-                result = process_job(job_id, prompt)
-
-                # Store the result
-                job_results[job_id] = result
-
-            except queue.Empty:
-                # No jobs in queue, continue waiting
-                pass
-            except Exception as e:
-                print(f"Error in worker thread: {e}")
-                import traceback
-                traceback.print_exc()
-                # If there was a job being processed, mark it as failed
-                if 'job_id' in locals():
-                    job_results[job_id] = {
-                        "success": False,
-                        "error": str(e)
-                    }
-    finally:
-        is_thread_running = False
+def model_unloader_thread():
+    """Thread that periodically unloads models if they haven't been used"""
+    global xm, model, diffusion, last_usage_time
+
+    while True:
+        time.sleep(300) # Check every 5 minutes
+
+        if last_usage_time is not None:
+            idle_time = time.time() - last_usage_time
+
+            # If models have been idle for more than 10 minutes and no active jobs
+            if idle_time > 600 and active_jobs == 0:
+                # Check memory usage
+                mem = psutil.virtual_memory()
+                if mem.percent > 50: # Only unload if memory usage is significant
+                    print(f"Models idle for {idle_time:.1f} seconds and memory at {mem.percent}%. Unloading...")
+                    xm, model, diffusion = None, None, None
+                    gc.collect()
+                    torch.cuda.empty_cache() if torch.cuda.is_available() else None
 
 def process_job(job_id, prompt):
     try:
-        # Load models if not already loaded
-        load_models_if_needed()
+        # Get adaptive parameters
+        adaptive_params = get_adaptive_parameters()
+        karras_steps = adaptive_params['karras_steps']
+        batch_size = adaptive_params['batch_size']
+        guidance_scale = adaptive_params['guidance_scale']
 
-        # Set parameters for CPU performance (reduced steps and other optimizations)
-        batch_size = 1
-        guidance_scale = 15.0
+        # Load primary models for generation
+        load_primary_model()
 
-        # *** EXTREME OPTIMIZATION: Significantly reduce steps for low-memory environments ***
-        karras_steps = 8 # Reduced from 16 to 8 for even better performance
-
-        # *** OPTIMIZATION: Run garbage collection before starting intensive task ***
+        # Optimization: Run garbage collection before starting intensive task
         gc.collect()
         torch.cuda.empty_cache() if torch.cuda.is_available() else None
 
-        # Generate latents with the text-to-3D model
         print(f"Starting latent generation for job {job_id} with {karras_steps} steps...")
 
-        # Force lower precision
+        # Generate latents
         with torch.inference_mode():
             latents = sample_latents(
                 batch_size=batch_size,
@@ -187,22 +227,51 @@ def process_job(job_id, prompt):
             )
         print(f"Latent generation complete for job {job_id}!")
 
-        # *** OPTIMIZATION: Run garbage collection after intensive step ***
-        gc.collect()
+        # Optimization: Clear unnecessary memory and load next model
+        check_memory_pressure()
 
         # Generate a unique filename
         unique_id = str(uuid.uuid4())
         filename = f"{output_dir}/{unique_id}"
 
-        # Convert latent to mesh with optimization settings
+        # Load transmitter model for decoding
+        load_transmitter_model()
+
+        # Convert latent to mesh
         print(f"Decoding mesh for job {job_id}...")
         t0 = time.time()
 
-        # *** OPTIMIZATION: Use simplified decoding with lower resolution ***
-        mesh = decode_latent_mesh(xm, latents[0], max_points=4000).tri_mesh() # Reduced point count
+        # Monitor memory
+        mem_before = psutil.Process().memory_info().rss / (1024 * 1024)
+        print(f"Memory before mesh decoding: {mem_before:.2f} MB")
+
+        # Decode the mesh (fixed: removed 'max_points' parameter from original code)
+        mesh = decode_latent_mesh(xm, latents[0]).tri_mesh()
+
         print(f"Mesh decoded in {time.time() - t0:.2f} seconds")
+        mem_after = psutil.Process().memory_info().rss / (1024 * 1024)
+        print(f"Memory after decoding: {mem_after:.2f} MB (delta: {mem_after - mem_before:.2f} MB)")
 
-        # *** OPTIMIZATION: Clear latents from memory as they're no longer needed ***
+        # Report mesh complexity if possible
+        try:
+            print(f"Mesh complexity: {len(mesh.vertices)} vertices, {len(mesh.faces)} faces")
+        except:
+            print("Could not determine mesh complexity")
+
+        # Simplify mesh if it's too complex (if supported)
+        try:
+            if hasattr(mesh, 'simplify') and hasattr(mesh, 'faces') and len(mesh.faces) > 5000:
+                target_faces = min(5000, int(len(mesh.faces) * 0.6))
+                print(f"Simplifying mesh to target {target_faces} faces...")
+                t0 = time.time()
+                simplified = mesh.simplify_quadratic_decimation(target_faces)
+                mesh = simplified
+                print(f"Mesh simplified in {time.time() - t0:.2f} seconds")
+                print(f"New complexity: {len(mesh.vertices)} vertices, {len(mesh.faces)} faces")
+        except Exception as e:
+            print(f"Mesh simplification not available or failed: {e}")
+
+        # Clear latents from memory
         del latents
         gc.collect()
 
@@ -217,13 +286,12 @@ def process_job(job_id, prompt):
         with open(obj_path, 'w') as f:
             mesh.write_obj(f)
 
-        # *** OPTIMIZATION: Clear mesh from memory ***
+        # Clear mesh from memory
         del mesh
         gc.collect()
 
         print(f"Files saved successfully for job {job_id}!")
 
-        # Return paths to the generated files
         return {
             "success": True,
             "message": "3D model generated successfully",
@@ -240,6 +308,41 @@ def process_job(job_id, prompt):
             "error": str(e)
         }
 
+def worker_thread():
+    global is_thread_running, active_jobs
+    is_thread_running = True
+
+    try:
+        while True:
+            try:
+                # Get job from queue with a timeout
+                job_id, prompt = job_queue.get(timeout=1)
+                print(f"Processing job {job_id} with prompt: {prompt}")
+
+                # Process the job
+                result = process_job(job_id, prompt)
+
+                # Store the result and update counter
+                job_results[job_id] = result
+                active_jobs -= 1
+
+            except queue.Empty:
+                # No jobs in queue, continue waiting
+                pass
+            except Exception as e:
+                print(f"Error in worker thread: {e}")
+                import traceback
+                traceback.print_exc()
+                # If there was a job being processed, mark it as failed
+                if 'job_id' in locals():
+                    job_results[job_id] = {
+                        "success": False,
+                        "error": str(e)
+                    }
+                active_jobs -= 1
+    finally:
+        is_thread_running = False
+
 def ensure_worker_thread_running():
     global generation_thread, is_thread_running
 
@@ -248,8 +351,21 @@ def ensure_worker_thread_running():
         generation_thread = threading.Thread(target=worker_thread, daemon=True)
         generation_thread.start()
 
+def start_model_unloader():
+    threading.Thread(target=model_unloader_thread, daemon=True).start()
+
 @app.route('/generate', methods=['POST'])
 def generate_3d():
+    global active_jobs
+
+    # Check if we're already at max capacity
+    if active_jobs >= max_concurrent_jobs:
+        return jsonify({
+            "success": False,
+            "error": "Server is at maximum capacity. Please try again later.",
+            "retry_after": 300
+        }), 503
+
     # Get the prompt from the request
     data = request.json
     if not data or 'prompt' not in data:
@@ -264,6 +380,7 @@ def generate_3d():
     # Add job to queue
     ensure_worker_thread_running()
     job_queue.put((job_id, prompt))
+    active_jobs += 1
 
     # Return job ID immediately
     return jsonify({
@@ -278,10 +395,7 @@ def job_status(job_id):
     if job_id in job_results:
         result = job_results[job_id]
         # Clean up memory if the job is complete and successful
-        if result.get("success", False):
-            return jsonify(result)
-        else:
-            return jsonify({"error": result.get("error", "Unknown error")}), 500
+        return jsonify(result)
     else:
         # Job is still in progress
         return jsonify({
@@ -299,35 +413,53 @@ def download_file(filename):
 
 @app.route('/health', methods=['GET'])
 def health_check():
-    """Simple health check endpoint to verify the app is running"""
-    # Check available memory
+    """Enhanced health check endpoint to monitor resource usage"""
     try:
+        # Memory info
         memory_info = psutil.virtual_memory()
         memory_usage = f"{memory_info.percent}% (Available: {memory_info.available / (1024**3):.2f} GB)"
 
-        # Check CPU usage
+        # CPU info
         cpu_usage = f"{psutil.cpu_percent(interval=0.1)}%"
 
-        # Get queue status
+        # Process specific info
+        process = psutil.Process()
+        process_memory = f"{process.memory_info().rss / (1024**3):.2f} GB"
+
+        # Models status
+        models_loaded = []
+        if model is not None:
+            models_loaded.append("text300M")
+        if diffusion is not None:
+            models_loaded.append("diffusion")
+        if xm is not None:
+            models_loaded.append("transmitter")
+
+        # Queue status
         queue_size = job_queue.qsize()
 
-        # Get active jobs
-        active_jobs = len(job_results)
+        # Check for model inactivity
+        model_inactive = "N/A"
+        if last_usage_time is not None:
+            model_inactive = f"{(time.time() - last_usage_time) / 60:.1f} minutes"
+
+        return jsonify({
+            "status": "ok",
+            "message": "Service is running",
+            "memory_usage": memory_usage,
+            "process_memory": process_memory,
+            "cpu_usage": cpu_usage,
+            "queue_size": queue_size,
+            "active_jobs": active_jobs,
+            "worker_running": is_thread_running,
+            "models_loaded": models_loaded,
+            "model_inactive_time": model_inactive
+        })
     except Exception as e:
-        memory_usage = "Error getting system info"
-        cpu_usage = "Error getting CPU info"
-        queue_size = "Unknown"
-        active_jobs = "Unknown"
-
-    return jsonify({
-        "status": "ok",
-        "message": "Service is running",
-        "memory_usage": memory_usage,
-        "cpu_usage": cpu_usage,
-        "queue_size": queue_size,
-        "active_jobs": active_jobs,
-        "worker_running": is_thread_running
-    })
+        return jsonify({
+            "status": "warning",
+            "error": str(e)
+        })
 
 @app.route('/', methods=['GET'])
 def home():
@@ -399,10 +531,44 @@ GET /status/123e4567-e89b-12d3-a456-426614174000
     </html>
     """
 
+@app.route('/purge-results', methods=['POST'])
+def purge_old_results():
+    """Endpoint to manually purge old job results to free memory"""
+    try:
+        # Get the time threshold from request (default to 1 hour)
+        threshold_hours = request.json.get('threshold_hours', 1) if request.json else 1
+        threshold_time = time.time() - (threshold_hours * 3600)
+
+        # Track jobs to be removed
+        jobs_to_remove = []
+        for job_id, result in job_results.items():
+            # If the job has a timestamp and it's older than threshold
+            if result.get('timestamp', time.time()) < threshold_time:
+                jobs_to_remove.append(job_id)
+
+        # Remove the old jobs
+        for job_id in jobs_to_remove:
+            job_results.pop(job_id, None)
+
+        # Force garbage collection
+        gc.collect()
+
+        return jsonify({
+            "success": True,
+            "message": f"Purged {len(jobs_to_remove)} old job results",
+            "remaining_jobs": len(job_results)
+        })
+    except Exception as e:
+        return jsonify({
+            "success": False,
+            "error": str(e)
+        }), 500
+
 if __name__ == '__main__':
-    # Start the worker thread
+    # Start the worker thread and model unloader
     ensure_worker_thread_running()
+    start_model_unloader()
 
     # Recommended to run with gunicorn for production with increased timeout:
     # $ gunicorn app:app --bind 0.0.0.0:7860 --timeout 300 --workers 1
-    app.run(host='0.0.0.0', port=7860, debug=False) # Set debug=False in production
+    app.run(host='0.0.0.0', port=7860, debug=False) # Set debug=False in production
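
For reference, a minimal client sketch against the endpoints defined in this version of app.py, assuming the service is reachable at http://localhost:7860 as configured in app.run above; the "job_id" key in the /generate response is an assumption and may need adjusting to the actual response shape:

import time
import requests

BASE = "http://localhost:7860"

# Health check: reports memory/CPU usage, queue size, loaded models, and worker status.
print(requests.get(f"{BASE}/health").json())

# Queue a generation job; the server responds with 503 once active_jobs >= max_concurrent_jobs.
resp = requests.post(f"{BASE}/generate", json={"prompt": "a small wooden chair"})
print(resp.status_code, resp.json())

# Poll the job status until a result or an error appears.
job_id = resp.json().get("job_id")  # assumed key name; check the actual /generate response
while job_id:
    status = requests.get(f"{BASE}/status/{job_id}").json()
    print(status)
    if status.get("success") or status.get("error"):
        break
    time.sleep(10)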