#!/usr/bin/env python3
"""
GPU Metrics JSON Server

This script provides a simple HTTP server that serves NVIDIA GPU metrics in JSON format.
It runs on the remote machine and is accessed via an SSH tunnel.
"""

import json
import logging
import re
import subprocess
from datetime import datetime

from flask import Flask, jsonify

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger('gpu_server')

app = Flask(__name__)

# Fields requested from ``nvidia-smi --query-gpu``. The order matters:
# _parse_gpu_line() reads the CSV columns positionally in this same order.
_GPU_QUERY_FIELDS = (
    'index',
    'name',
    'temperature.gpu',
    'utilization.gpu',
    'utilization.memory',
    'memory.total',
    'memory.used',
    'memory.free',
    'power.draw',
    'power.limit',
)

# Number of CSV columns requested from ``nvidia-smi --query-compute-apps``.
_PROCESS_FIELD_COUNT = 3

_TIMESTAMP_FORMAT = '%Y-%m-%d %H:%M:%S'


def _current_timestamp():
    """Return the current local time formatted as ``YYYY-MM-DD HH:MM:SS``."""
    # Computed in-process rather than by spawning ``date`` so that building
    # the error payload can never fail because of a second subprocess call.
    return datetime.now().strftime(_TIMESTAMP_FORMAT)


def _run_command(command):
    """Run *command* (an argument list) and return its stdout as text.

    Raises:
        subprocess.CalledProcessError: if the command exits non-zero.
        OSError: if the executable cannot be started.
    """
    return subprocess.check_output(command, universal_newlines=True)


def _to_float(raw):
    """Convert one nvidia-smi CSV field to ``float``, or ``None`` if not numeric.

    nvidia-smi reports placeholders such as ``[N/A]`` or ``[Not Supported]``
    for metrics a GPU or driver does not expose; those become ``None`` (JSON
    ``null``) so one unavailable metric does not invalidate the whole response.
    """
    try:
        return float(raw)
    except ValueError:
        return None


def _parse_gpu_line(line):
    """Parse one CSV row of ``--query-gpu`` output into a dict.

    Returns ``None`` when the row has fewer columns than were requested
    (for example a blank line).
    """
    values = [v.strip() for v in line.split(',')]
    if len(values) < len(_GPU_QUERY_FIELDS):
        return None
    return {
        'index': int(values[0]),
        'name': values[1],
        'temperature': _to_float(values[2]),
        'gpu_utilization': _to_float(values[3]),
        'memory_utilization': _to_float(values[4]),
        'memory_total': _to_float(values[5]),
        'memory_used': _to_float(values[6]),
        'memory_free': _to_float(values[7]),
        'power_draw': _to_float(values[8]),
        'power_limit': _to_float(values[9]),
    }


def _parse_process_line(line):
    """Parse one CSV row of ``--query-compute-apps`` output into a dict.

    Returns ``None`` for empty rows or rows with too few columns.
    """
    if not line:
        return None
    values = [v.strip() for v in line.split(',')]
    if len(values) < _PROCESS_FIELD_COUNT:
        return None
    return {
        'pid': int(values[0]),
        'name': values[1],
        'memory_used': _to_float(values[2]),
    }


def get_gpu_info():
    """
    Get NVIDIA GPU information and parse it into a structured format

    Runs ``nvidia-smi`` twice: once for per-GPU metrics and once for the
    list of compute processes.

    Returns:
        dict: On success, a dictionary with keys ``timestamp``, ``gpus``
        (list of per-GPU dicts), ``processes`` (list of per-process dicts)
        and ``success`` set to ``True``. Numeric metrics that nvidia-smi
        reports as unavailable are ``None``. On any failure, a dictionary
        with ``timestamp``, ``error`` (the exception text) and ``success``
        set to ``False``; the exception is logged, not raised.
    """
    try:
        # Run nvidia-smi to get GPU information
        gpu_output = _run_command([
            'nvidia-smi',
            '--query-gpu=' + ','.join(_GPU_QUERY_FIELDS),
            '--format=csv,noheader,nounits',
        ])
        parsed_gpus = (_parse_gpu_line(row) for row in gpu_output.strip().splitlines())
        gpus = [gpu for gpu in parsed_gpus if gpu is not None]

        # Get GPU processes information
        process_output = _run_command([
            'nvidia-smi',
            '--query-compute-apps=pid,process_name,used_memory',
            '--format=csv,noheader,nounits',
        ])
        parsed_processes = (
            _parse_process_line(row) for row in process_output.strip().splitlines()
        )
        processes = [proc for proc in parsed_processes if proc is not None]

        return {
            'timestamp': _current_timestamp(),
            'gpus': gpus,
            'processes': processes,
            'success': True,
        }
    except Exception as e:
        # Endpoint boundary: report any failure (nvidia-smi missing, non-zero
        # exit, unparsable output) in the payload instead of raising.
        logger.error("Error getting GPU information: %s", e)
        return {
            'timestamp': _current_timestamp(),
            'error': str(e),
            'success': False,
        }


@app.route('/gpu/json')
def gpu_json():
    """
    API endpoint for GPU information in JSON format
    """
    return jsonify(get_gpu_info())


@app.route('/gpu/txt')
def gpu_txt():
    """
    API endpoint for traditional nvidia-smi text output (for backward compatibility)

    The body is the raw ``nvidia-smi`` output, or ``Error: <message>`` if the
    command could not be run. Both are served as plain text.
    """
    # Served as text/plain so the fixed-width table is not interpreted as HTML.
    plain_text = {'Content-Type': 'text/plain; charset=utf-8'}
    try:
        # Run nvidia-smi with standard output format
        return _run_command(['nvidia-smi']), 200, plain_text
    except Exception as e:
        logger.error("Error getting nvidia-smi output: %s", e)
        return f"Error: {str(e)}", 200, plain_text


@app.route('/health')
def health_check():
    """
    Simple health check endpoint
    """
    return jsonify({'status': 'ok'})


if __name__ == '__main__':
    # Note: In production, consider using a proper WSGI server like gunicorn
    # and configure proper authentication/security
    # NOTE(review): host='0.0.0.0' listens on every network interface without
    # authentication. If the server is only ever reached through an SSH
    # tunnel, binding to 127.0.0.1 would be sufficient - confirm before changing.
    app.run(host='0.0.0.0', port=5000)