Gemma3-Chat / remote /gpu_stats_srv.py
ProximileAdmin's picture
Create remote/gpu_stats_srv.py
7d5c9a2 verified
raw
history blame contribute delete
4.09 kB
#!/usr/bin/env python3
"""
GPU Metrics JSON Server
This script provides a simple Flask HTTP server that serves NVIDIA GPU metrics
as JSON (/gpu/json) and as raw nvidia-smi text (/gpu/txt), plus a /health check.
It runs on the remote machine and is accessed via an SSH tunnel.
"""
import json
import logging
import re
import subprocess
from datetime import datetime

from flask import Flask, jsonify
# Root logging setup: emit INFO and above, each record stamped with
# time, logger name and level.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Logger shared by the functions in this module.
logger = logging.getLogger('gpu_server')

# Flask application that the @app.route handlers register against.
app = Flask(__name__)
# Upper bound (seconds) on one nvidia-smi call, so a wedged driver cannot
# block a request indefinitely.
_NVIDIA_SMI_TIMEOUT_S = 10

# (nvidia-smi query field, key in the returned GPU dict) for every numeric
# column. Keeping both in one table keeps the query and the parser in sync.
_GPU_METRICS = (
    ('temperature.gpu', 'temperature'),
    ('utilization.gpu', 'gpu_utilization'),
    ('utilization.memory', 'memory_utilization'),
    ('memory.total', 'memory_total'),
    ('memory.used', 'memory_used'),
    ('memory.free', 'memory_free'),
    ('power.draw', 'power_draw'),
    ('power.limit', 'power_limit'),
)


def _run_nvidia_smi(*args):
    """Run nvidia-smi with the given arguments and return its stdout as text."""
    return subprocess.check_output(
        ['nvidia-smi', *args],
        universal_newlines=True,
        timeout=_NVIDIA_SMI_TIMEOUT_S,
    )


def _to_float(text):
    """Convert one CSV field to float; return None for non-numeric markers such as '[N/A]'."""
    try:
        return float(text)
    except ValueError:
        return None


def _parse_gpu_line(line):
    """Parse one --query-gpu CSV row into a dict, or None if the row is too short."""
    index, _, rest = line.partition(',')
    # Peel the fixed numeric columns off the right so that a comma inside the
    # GPU name cannot shift them.
    tail = rest.rsplit(',', len(_GPU_METRICS))
    if len(tail) != len(_GPU_METRICS) + 1:
        return None
    name, *metrics = (field.strip() for field in tail)
    gpu = {'index': int(index), 'name': name}
    for (_, key), raw in zip(_GPU_METRICS, metrics):
        gpu[key] = _to_float(raw)
    return gpu


def _parse_process_line(line):
    """Parse one --query-compute-apps CSV row into a dict, or None if the row is too short."""
    pid, _, rest = line.partition(',')
    # The process name sits between pid and used_memory and may itself
    # contain commas, so split the memory column off from the right.
    tail = rest.rsplit(',', 1)
    if len(tail) != 2:
        return None
    name, memory = tail
    return {
        'pid': int(pid),
        'name': name.strip(),
        'memory_used': _to_float(memory.strip()),
    }


def _timestamp():
    """Return the current local time formatted as 'YYYY-MM-DD HH:MM:SS'."""
    return datetime.now().strftime('%Y-%m-%d %H:%M:%S')


def get_gpu_info():
    """
    Query nvidia-smi for per-GPU metrics and compute processes.

    Numeric fields are returned as floats exactly as nvidia-smi prints them
    with the 'nounits' option. A field nvidia-smi cannot report (for example
    '[N/A]') is returned as None instead of failing the whole request.

    Returns:
        dict: On success, {'timestamp', 'gpus', 'processes', 'success': True}
        where 'gpus' and 'processes' are lists of dicts. On any failure
        (nvidia-smi missing, non-zero exit, timeout, malformed index/pid),
        {'timestamp', 'error', 'success': False}. This function does not raise.
    """
    try:
        query_fields = ','.join(field for field, _ in _GPU_METRICS)
        gpu_output = _run_nvidia_smi(
            '--query-gpu=index,name,' + query_fields,
            '--format=csv,noheader,nounits',
        )
        gpus = [
            gpu
            for gpu in map(_parse_gpu_line, gpu_output.splitlines())
            if gpu is not None
        ]

        process_output = _run_nvidia_smi(
            '--query-compute-apps=pid,process_name,used_memory',
            '--format=csv,noheader,nounits',
        )
        processes = [
            process
            for process in map(_parse_process_line, process_output.splitlines())
            if process is not None
        ]

        return {
            'timestamp': _timestamp(),
            'gpus': gpus,
            'processes': processes,
            'success': True,
        }
    except Exception as e:
        # Top-level boundary for the endpoint: report the failure in-band
        # rather than letting it surface as an unhandled server error.
        logger.error("Error getting GPU information: %s", e)
        return {
            'timestamp': _timestamp(),
            'error': str(e),
            'success': False,
        }
@app.route('/gpu/json')
def gpu_json():
    """
    Serve the current get_gpu_info() result as a JSON response
    """
    snapshot = get_gpu_info()
    return jsonify(snapshot)
@app.route('/gpu/txt')
def gpu_txt():
    """
    API endpoint for traditional nvidia-smi text output (for backward compatibility)

    Returns the raw nvidia-smi table, or "Error: <message>" if the command
    fails or times out. Both bodies are sent as text/plain so clients do not
    interpret the output as HTML (Flask's default for a bare string). The
    status code stays 200 in the error case, as before.
    """
    plain_text = {'Content-Type': 'text/plain; charset=utf-8'}
    try:
        # Bound the call so a hung nvidia-smi cannot block the request forever.
        nvidia_smi_output = subprocess.check_output(
            ['nvidia-smi'],
            universal_newlines=True,
            timeout=10,
        )
    except Exception as e:
        logger.error("Error getting nvidia-smi output: %s", e)
        return f"Error: {e}", 200, plain_text
    return nvidia_smi_output, 200, plain_text
@app.route('/health')
def health_check():
    """
    Liveness probe: always answers with {"status": "ok"}
    """
    payload = dict(status='ok')
    return jsonify(payload)
if __name__ == '__main__':
    import os

    # Note: In production, consider using a proper WSGI server like gunicorn
    # and configure proper authentication/security.
    #
    # The defaults (all interfaces, port 5000) are unchanged. The endpoints are
    # unauthenticated, so when the server is only reached through an SSH
    # tunnel, set GPU_STATS_HOST=127.0.0.1 to keep it off the network.
    app.run(
        host=os.environ.get('GPU_STATS_HOST', '0.0.0.0'),
        port=int(os.environ.get('GPU_STATS_PORT', '5000')),
    )