#!/usr/bin/env python3
"""
GPU Metrics JSON Server

This script provides a simple HTTP server that serves NVIDIA GPU metrics in JSON format.
It runs on the remote machine and is accessed via an SSH tunnel.
"""

import logging
import subprocess
from datetime import datetime

from flask import Flask, jsonify

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger('gpu_server')

app = Flask(__name__)

def get_gpu_info():
    """
    Get NVIDIA GPU information and parse it into a structured format.

    Returns:
        dict: Dictionary containing GPU information
    """
    try:
        # Run nvidia-smi to get GPU information
        nvidia_smi_output = subprocess.check_output(
            [
                'nvidia-smi',
                '--query-gpu=index,name,temperature.gpu,utilization.gpu,utilization.memory,memory.total,memory.used,memory.free,power.draw,power.limit',
                '--format=csv,noheader,nounits'
            ],
            universal_newlines=True
        )

        # Parse the CSV output
        gpus = []
        for line in nvidia_smi_output.strip().split('\n'):
            values = [v.strip() for v in line.split(',')]
            if len(values) >= 10:
                gpu = {
                    'index': int(values[0]),
                    'name': values[1],
                    'temperature': float(values[2]),
                    'gpu_utilization': float(values[3]),
                    'memory_utilization': float(values[4]),
                    'memory_total': float(values[5]),
                    'memory_used': float(values[6]),
                    'memory_free': float(values[7]),
                    'power_draw': float(values[8]),
                    'power_limit': float(values[9])
                }
                gpus.append(gpu)

        # Get GPU processes information
        process_output = subprocess.check_output(
            ['nvidia-smi', '--query-compute-apps=pid,process_name,used_memory', '--format=csv,noheader,nounits'],
            universal_newlines=True
        )

        processes = []
        for line in process_output.strip().split('\n'):
            if line:  # Skip empty lines
                values = [v.strip() for v in line.split(',')]
                if len(values) >= 3:
                    process = {
                        'pid': int(values[0]),
                        'name': values[1],
                        'memory_used': float(values[2])
                    }
                    processes.append(process)

        return {
            'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            'gpus': gpus,
            'processes': processes,
            'success': True
        }
    except Exception as e:
        logger.error(f"Error getting GPU information: {str(e)}")
        return {
            'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            'error': str(e),
            'success': False
        }
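
# Example of the JSON payload built by get_gpu_info() and served by the
# JSON route below (illustrative placeholder values, not from a real run):
#
# {
#   "timestamp": "2024-01-01 12:00:00",
#   "gpus": [{"index": 0, "name": "NVIDIA A100", "temperature": 45.0,
#             "gpu_utilization": 30.0, "memory_utilization": 10.0,
#             "memory_total": 40960.0, "memory_used": 4096.0, "memory_free": 36864.0,
#             "power_draw": 120.0, "power_limit": 400.0}],
#   "processes": [{"pid": 1234, "name": "python", "memory_used": 4096.0}],
#   "success": true
# }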

# Route paths ('/gpu/json', '/gpu/txt', '/health') are assumed defaults;
# adjust them to match your client or tunnel configuration.
@app.route('/gpu/json')
def gpu_json():
    """
    API endpoint for GPU information in JSON format
    """
    return jsonify(get_gpu_info())
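
# Minimal client sketch (assumes the SSH tunnel described at the top of this
# file is active and the `requests` package is installed locally):
#
#   import requests
#   data = requests.get('http://localhost:5000/gpu/json', timeout=5).json()
#   if data.get('success'):
#       for gpu in data['gpus']:
#           print(gpu['index'], gpu['name'], f"{gpu['gpu_utilization']}%")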

@app.route('/gpu/txt')
def gpu_txt():
    """
    API endpoint for traditional nvidia-smi text output (for backward compatibility)
    """
    try:
        # Run nvidia-smi with its standard output format
        nvidia_smi_output = subprocess.check_output(['nvidia-smi'], universal_newlines=True)
        return nvidia_smi_output
    except Exception as e:
        logger.error(f"Error getting nvidia-smi output: {str(e)}")
        return f"Error: {str(e)}"

@app.route('/health')
def health_check():
    """
    Simple health check endpoint
    """
    return jsonify({'status': 'ok'})

if __name__ == '__main__':
    # Note: In production, consider using a proper WSGI server like gunicorn
    # and configure proper authentication/security
    app.run(host='0.0.0.0', port=5000)
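
# For example, with gunicorn (module name 'gpu_server' is assumed; binding to
# localhost keeps the service reachable only through the SSH tunnel):
#
#   gunicorn --bind 127.0.0.1:5000 gpu_server:app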