File size: 4,086 Bytes
7d5c9a2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
#!/usr/bin/env python3
"""
GPU Metrics JSON Server

This script provides a simple HTTP server that serves NVIDIA GPU metrics in JSON format.
It runs on the remote machine and is accessed via an SSH tunnel.
"""

import json
import logging
import os
import re
import subprocess
from datetime import datetime

from flask import Flask, jsonify

# Root logging setup: INFO and above, each record prefixed with the
# timestamp, logger name and level.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger('gpu_server')

# Flask application object that the route decorators in this module attach to.
app = Flask(__name__)

# Output keys for the numeric columns of the --query-gpu call, in the same
# order as the fields requested after "index,name".
_GPU_METRIC_KEYS = (
    'temperature',
    'gpu_utilization',
    'memory_utilization',
    'memory_total',
    'memory_used',
    'memory_free',
    'power_draw',
    'power_limit',
)


def _parse_metric(raw):
    """Convert one nvidia-smi CSV field to a float, or None when unavailable."""
    try:
        return float(raw)
    except ValueError:
        # nvidia-smi prints placeholders such as "[N/A]" or "[Not Supported]"
        # for metrics the GPU or driver cannot report; surface those as None
        # (JSON null) instead of failing the whole response.
        return None


def _current_timestamp():
    """Return the local time formatted as 'YYYY-MM-DD HH:MM:SS'."""
    return datetime.now().strftime('%Y-%m-%d %H:%M:%S')


def _parse_gpu_line(line):
    """Parse one --query-gpu CSV row into a dict, or None if it is incomplete."""
    index, _, rest = line.partition(',')
    # Split from the right so a comma inside the GPU name cannot shift the
    # numeric columns.
    parts = [part.strip() for part in rest.rsplit(',', len(_GPU_METRIC_KEYS))]
    if len(parts) != len(_GPU_METRIC_KEYS) + 1:
        return None
    name, *metrics = parts
    gpu = {'index': int(index.strip()), 'name': name}
    gpu.update(zip(_GPU_METRIC_KEYS, map(_parse_metric, metrics)))
    return gpu


def _parse_process_line(line):
    """Parse one --query-compute-apps CSV row into a dict, or None if incomplete."""
    pid, _, rest = line.partition(',')
    # The process name sits between the first and the last comma, so commas
    # inside the name are preserved.
    name, separator, memory = rest.rpartition(',')
    if not separator:
        return None
    return {
        'pid': int(pid.strip()),
        'name': name.strip(),
        'memory_used': _parse_metric(memory.strip()),
    }


def get_gpu_info():
    """
    Get NVIDIA GPU information and parse it into a structured format

    Runs ``nvidia-smi`` twice (per-GPU metrics, then compute processes) with
    ``--format=csv,noheader,nounits`` and converts the rows into dictionaries.
    Numeric values are reported in nvidia-smi's default units because of
    ``nounits``. A metric that nvidia-smi cannot report is returned as
    ``None`` rather than aborting the whole query.

    Returns:
        dict: On success ``{'timestamp', 'gpus', 'processes', 'success': True}``
        where ``gpus`` and ``processes`` are lists of dicts. On any failure
        ``{'timestamp', 'error', 'success': False}``; the error is logged and
        no exception is propagated to the caller.
    """
    try:
        # Run nvidia-smi to get GPU information
        gpu_output = subprocess.check_output(
            [
                'nvidia-smi',
                '--query-gpu=index,name,temperature.gpu,utilization.gpu,utilization.memory,memory.total,memory.used,memory.free,power.draw,power.limit',
                '--format=csv,noheader,nounits'
            ],
            universal_newlines=True
        )
        # Rows that are blank or have too few columns parse to None and are skipped.
        gpus = [
            gpu
            for gpu in map(_parse_gpu_line, gpu_output.splitlines())
            if gpu is not None
        ]

        # Get GPU processes information
        process_output = subprocess.check_output(
            ['nvidia-smi', '--query-compute-apps=pid,process_name,used_memory', '--format=csv,noheader,nounits'],
            universal_newlines=True
        )
        processes = [
            process
            for process in map(_parse_process_line, process_output.splitlines())
            if process is not None
        ]

        return {
            'timestamp': _current_timestamp(),
            'gpus': gpus,
            'processes': processes,
            'success': True
        }

    except Exception as e:
        # Top-level boundary for the endpoint: report the failure in the
        # payload. The timestamp is computed in-process so that building the
        # error response cannot itself fail on a missing external command.
        logger.error("Error getting GPU information: %s", e)
        return {
            'timestamp': _current_timestamp(),
            'error': str(e),
            'success': False
        }

@app.route('/gpu/json')
def gpu_json():
    """
    Serve the current GPU metrics snapshot as a JSON response
    """
    snapshot = get_gpu_info()
    return jsonify(snapshot)

@app.route('/gpu/txt')
def gpu_txt():
    """
    API endpoint for traditional nvidia-smi text output (for backward compatibility)

    Returns:
        tuple: ``(body, status, headers)``. On success the body is the raw
        ``nvidia-smi`` output with status 200; on failure it is
        ``"Error: <message>"`` with status 500 so clients can detect the
        failure from the status code. Both are sent as ``text/plain``.
    """
    # The body is console output, not markup, so declare it as plain text
    # instead of Flask's default text/html for string responses.
    plain_text = {'Content-Type': 'text/plain; charset=utf-8'}
    try:
        # Run nvidia-smi with standard output format
        nvidia_smi_output = subprocess.check_output(['nvidia-smi'], universal_newlines=True)
    except Exception as e:
        logger.error("Error getting nvidia-smi output: %s", e)
        return f"Error: {e}", 500, plain_text
    return nvidia_smi_output, 200, plain_text

@app.route('/health')
def health_check():
    """
    Liveness probe that always answers with a JSON status of "ok"
    """
    payload = dict(status='ok')
    return jsonify(payload)

if __name__ == '__main__':
    # Note: In production, consider using a proper WSGI server like gunicorn
    # and configure proper authentication/security.
    # The bind address and port keep their historical defaults (0.0.0.0:5000)
    # but can be overridden through the environment, e.g.
    # GPU_SERVER_HOST=127.0.0.1 to accept loopback connections only.
    app.run(
        host=os.environ.get('GPU_SERVER_HOST', '0.0.0.0'),
        port=int(os.environ.get('GPU_SERVER_PORT', '5000')),
    )