Spaces:
Running
Running
File size: 4,086 Bytes
7d5c9a2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 |
#!/usr/bin/env python3
"""
GPU Metrics JSON Server
This script provides a simple HTTP server that serves NVIDIA GPU metrics in JSON format.
It runs on the remote machine and is accessed via an SSH tunnel.
"""
import json
import logging
import re
import subprocess
from datetime import datetime

from flask import Flask, jsonify
# Root logging is set up once at import time: INFO and above, with
# timestamp, logger name and level prefixed to every message.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Named logger used by the functions in this module.
logger = logging.getLogger('gpu_server')

# Flask application object; the @app.route decorators register on it.
app = Flask(__name__)
def _current_timestamp():
    """Return the local time formatted as 'YYYY-MM-DD HH:MM:SS'."""
    # Computed in-process so that building a response (including the error
    # response) never depends on spawning the external `date` command.
    return datetime.now().strftime('%Y-%m-%d %H:%M:%S')


def _parse_float(field):
    """Return *field* as a float, or None when it is not a number."""
    # nvidia-smi prints a placeholder (e.g. "[N/A]") instead of a number for
    # metrics a device does not expose; report those as None rather than
    # failing the whole request.
    try:
        return float(field)
    except ValueError:
        return None


def _parse_gpu_line(line):
    """Parse one CSV row of the GPU query; return None if the row is too short."""
    values = [v.strip() for v in line.split(',')]
    if len(values) < 10:
        return None
    gpu = {
        'index': int(values[0]),
        'name': values[1],
    }
    # Output keys for columns 2..9, in the same order as the query string
    # passed to nvidia-smi in get_gpu_info().
    numeric_keys = (
        'temperature',
        'gpu_utilization',
        'memory_utilization',
        'memory_total',
        'memory_used',
        'memory_free',
        'power_draw',
        'power_limit',
    )
    for key, raw in zip(numeric_keys, values[2:10]):
        gpu[key] = _parse_float(raw)
    return gpu


def _parse_process_line(line):
    """Parse one CSV row of the process query; return None if it has < 3 fields."""
    # Split the pid off the front and the memory figure off the back so that
    # a process name containing commas stays in one piece.
    pid_field, first_sep, remainder = line.partition(',')
    name_field, last_sep, memory_field = remainder.rpartition(',')
    if not first_sep or not last_sep:
        return None
    return {
        'pid': int(pid_field.strip()),
        'name': name_field.strip(),
        'memory_used': _parse_float(memory_field.strip()),
    }


def get_gpu_info():
    """
    Get NVIDIA GPU information and parse it into a structured format

    Runs `nvidia-smi` twice (GPU query, then compute-process query) and
    parses the CSV output. Numeric values are taken as printed by nvidia-smi
    with units stripped (`nounits`); a value that nvidia-smi reports as
    non-numeric is returned as None.

    Returns:
        dict: On success, a dictionary with the keys 'timestamp', 'gpus'
        (list of per-GPU dicts), 'processes' (list of per-process dicts) and
        'success' set to True. On any failure the error is logged and the
        dictionary holds 'timestamp', 'error' (the exception text) and
        'success' set to False. This function does not raise.
    """
    try:
        # Run nvidia-smi to get GPU information
        gpu_output = subprocess.check_output(
            [
                'nvidia-smi',
                '--query-gpu=index,name,temperature.gpu,utilization.gpu,utilization.memory,memory.total,memory.used,memory.free,power.draw,power.limit',
                '--format=csv,noheader,nounits'
            ],
            universal_newlines=True
        )
        gpus = [
            gpu
            for gpu in map(_parse_gpu_line, gpu_output.splitlines())
            if gpu is not None
        ]

        # Get GPU processes information
        process_output = subprocess.check_output(
            ['nvidia-smi', '--query-compute-apps=pid,process_name,used_memory', '--format=csv,noheader,nounits'],
            universal_newlines=True
        )
        processes = [
            process
            for process in map(_parse_process_line, process_output.splitlines())
            if process is not None
        ]

        return {
            'timestamp': _current_timestamp(),
            'gpus': gpus,
            'processes': processes,
            'success': True
        }
    except Exception as e:
        # Boundary handler: the HTTP endpoint relies on always getting a dict
        # back, so every failure is logged and turned into an error payload.
        logger.error("Error getting GPU information: %s", e)
        return {
            'timestamp': _current_timestamp(),
            'error': str(e),
            'success': False
        }
@app.route('/gpu/json')
def gpu_json():
    """
    Serve the result of get_gpu_info() as a JSON response
    """
    snapshot = get_gpu_info()
    return jsonify(snapshot)
@app.route('/gpu/txt')
def gpu_txt():
    """
    Serve the plain `nvidia-smi` report as text (kept for backward compatibility)

    If the command cannot be run, the failure is logged and the response body
    is the text "Error: <message>" instead.
    """
    try:
        # Default (human-readable) nvidia-smi report, decoded to str
        raw_report = subprocess.check_output(['nvidia-smi'], universal_newlines=True)
    except Exception as exc:
        failure = str(exc)
        logger.error(f"Error getting nvidia-smi output: {failure}")
        return f"Error: {failure}"
    return raw_report
@app.route('/health')
def health_check():
    """
    Liveness probe: always answers with a fixed JSON status
    """
    payload = {'status': 'ok'}
    return jsonify(payload)
if __name__ == '__main__':
    # Note: In production, consider using a proper WSGI server like gunicorn
    # and configure proper authentication/security
    #
    # host='0.0.0.0' listens on every network interface, port 5000.
    # NOTE(review): the module docstring says the server is reached through an
    # SSH tunnel; if that is the only access path, binding to '127.0.0.1'
    # would avoid exposing the unauthenticated endpoints -- confirm before
    # changing.
    app.run(host='0.0.0.0', port=5000)