# File size: 8,263 Bytes
# 2913579
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

import subprocess
import re
import sys
import argparse
from rich.console import Console
from rich.table import Table
from rich import box

# ---------------------- USAGE GUIDE ----------------------
# This script fetches and displays the GPU, CPU, and memory usage 
# per user for each Quality of Service (QOS) associated with a given account. 
# It shows the resources used in active jobs for each QOS and the 
# total allowable resources for each QOS.
# 
# Usage:
# python script.py --account <account_name>
# 
# Example:
# python script.py --account cortex
# ---------------------------------------------------------

# Rich console through which this script prints its messages and tables
console = Console()

# Command-line interface: the SLURM account is the only option and it is
# mandatory (there is deliberately no QOS argument)
parser = argparse.ArgumentParser(
    description='Fetch resource usage from SLURM.',
)
parser.add_argument(
    '--account',
    required=True,
    type=str,
    help='Account name for SLURM query',
)

args = parser.parse_args()
account_name = args.account

# Function to extract available QOS for the account using the pipe-separated format
def get_qos_for_account(account):
    """Return the QOS names associated with a SLURM account.

    Runs ``sacctmgr show assoc`` in parsable (pipe-separated) mode and
    collects every distinct value found in the third (QOS) column.

    Args:
        account: Account name to query.

    Returns:
        A list of unique QOS names sorted alphabetically, except that
        ``"lowest"`` (when present) is placed first. The list is empty when
        no QOS is reported or when ``sacctmgr`` cannot be executed.
    """
    # Pass the arguments as a list (no shell) so that an account name
    # containing shell metacharacters is handed to sacctmgr verbatim instead
    # of being interpreted as additional shell commands.
    sacctmgr_qos_command = [
        "sacctmgr",
        "show",
        "assoc",
        "format=Account,User,QOS",
        "where",
        f"Account={account}",
        "-P",
    ]

    try:
        result_qos = subprocess.run(
            sacctmgr_qos_command, capture_output=True, text=True
        )
    except OSError:
        # When run through a shell, a missing/unrunnable sacctmgr simply
        # produced no output; keep that outcome so the caller sees
        # "no QOS" rather than a traceback.
        return []

    qos_names = set()  # a set, so each QOS is reported once
    for line in result_qos.stdout.strip().splitlines()[1:]:  # skip the header
        fields = line.split("|")
        if len(fields) != 3:
            continue
        # One association can list several QOS values separated by commas;
        # blank entries (empty column, stray commas) are ignored.
        qos_names.update(
            name.strip() for name in fields[2].split(",") if name.strip()
        )

    # False sorts before True, so "lowest" ends up first when present
    return sorted(qos_names, key=lambda name: (name != 'lowest', name))

# Look up every QOS attached to the requested account
qos_list = get_qos_for_account(account_name)

# Without at least one QOS there is nothing to report: print the error and
# stop with a non-zero exit status
if not qos_list:
    console.print(f"[red]No QOS found for account {account_name}[/red]")
    raise SystemExit(1)

# Function to extract CPU, GPU, and memory usage from the ReqTRES or AllocTRES column
def extract_tres_usage(tres_str):
    """Parse CPU, memory and GPU amounts out of a SLURM TRES string.

    Args:
        tres_str: Comma-separated TRES text such as
            ``"cpu=8,gres/gpu=2,mem=64G"``.

    Returns:
        A ``(cpu, memory_gb, gpu)`` tuple. ``cpu`` and ``gpu`` are integers;
        ``memory_gb`` is the memory amount converted to gigabytes (it may be
        a float). A resource that is missing from ``tres_str``, or a memory
        amount whose unit suffix is not recognised, is reported as 0.
    """
    cpu_match = re.search(r'cpu=([0-9]+)', tres_str)
    gpu_match = re.search(r'gres/gpu=([0-9]+)', tres_str)
    cpu_usage = int(cpu_match.group(1)) if cpu_match else 0
    gpu_usage = int(gpu_match.group(1)) if gpu_match else 0

    memory_usage = 0
    # Accept fractional amounts (e.g. "mem=62.50G") as well as whole numbers
    mem_match = re.search(r'mem=([0-9]+(?:\.[0-9]+)?)([A-Za-z]+)', tres_str)
    if mem_match:
        raw_value = mem_match.group(1)
        mem_value = float(raw_value) if '.' in raw_value else int(raw_value)
        # Only the first letter of the suffix selects the scale
        unit = mem_match.group(2)[0].upper()

        # Convert to GB using binary (1024-based) steps. Each unit is handled
        # explicitly so that, e.g., kilobytes are scaled down rather than
        # being mistaken for terabytes.
        if unit == 'K':
            memory_usage = mem_value / 1024 ** 2
        elif unit == 'M':
            memory_usage = mem_value / 1024
        elif unit == 'G':
            memory_usage = mem_value
        elif unit == 'T':
            memory_usage = mem_value * 1024
        elif unit == 'P':
            memory_usage = mem_value * 1024 ** 2
        # Any other suffix: leave the amount at 0 instead of guessing a scale

    return cpu_usage, memory_usage, gpu_usage

# Factors that convert an amount with the given SLURM memory suffix to GB
# (binary, 1024-based steps)
_MEM_UNIT_TO_GB = {
    'K': 1 / 1024 ** 2,
    'M': 1 / 1024,
    'G': 1,
    'T': 1024,
    'P': 1024 ** 2,
}

# Loop through each QOS for the account: query its limits, sum the resources
# requested by RUNNING jobs per user, and print one table per QOS
for qos_name in qos_list:
    console.print(f"\n[bold green]Fetching data for QOS: {qos_name}[/bold green]\n")

    # Fetch total allowable resources (GrpTRES) for the QOS
    sacctmgr_command = [
        "sacctmgr",
        "show",
        "qos",
        qos_name,
        "format=GrpTRES%50",
        "-P",
    ]
    result_qos_resources = subprocess.run(sacctmgr_command, capture_output=True, text=True)
    qos_output = result_qos_resources.stdout.strip()

    # Extract the CPU, GPU, and memory limits from GrpTRES; a limit that is
    # not present in the output stays "N/A"
    cpu_limit = gpu_limit = memory_limit = "N/A"
    cpu_match = re.search(r'cpu=([0-9]+)', qos_output)
    gpu_match = re.search(r'gres/gpu=([0-9]+)', qos_output)
    # Accept fractional amounts (e.g. "mem=1.50T") as well as whole numbers
    mem_match = re.search(r'mem=([0-9]+(?:\.[0-9]+)?)([A-Za-z]+)', qos_output)

    if cpu_match:
        cpu_limit = int(cpu_match.group(1))
    if gpu_match:
        gpu_limit = int(gpu_match.group(1))
    if mem_match:
        # Only the first letter of the suffix selects the scale; an
        # unrecognised suffix leaves the limit as "N/A" rather than guessing
        gb_factor = _MEM_UNIT_TO_GB.get(mem_match.group(2)[0].upper())
        if gb_factor is not None:
            memory_limit = float(mem_match.group(1)) * gb_factor

    # Fetch job information for the given account and QOS
    sacct_command = [
        "sacct",
        "-a",
        "--qos=" + qos_name,
        "--account=" + account_name,
        "--format=JobID,User%20,Partition,JobName,State,ReqTRES%60,AllocTRES%60",
        "-P",
    ]
    result_jobs = subprocess.run(sacct_command, capture_output=True, text=True)
    job_output = result_jobs.stdout.strip()

    # The first line is the header. When sacct wrote nothing to stdout there
    # is no header either; treat that as "no jobs" instead of indexing into
    # an empty list.
    job_lines = job_output.splitlines()
    header = job_lines[0].split("|") if job_lines else []

    # Parse the remaining rows into dictionaries keyed by column name,
    # dropping rows whose field count does not match the header
    job_data = []
    for line in job_lines[1:]:
        fields = line.split("|")
        if len(fields) == len(header):
            job_data.append(dict(zip(header, fields)))

    # Collect CPU, GPU, and memory usage per user for RUNNING jobs only
    cpu_usage_per_user = {}
    mem_usage_per_user = {}
    gpu_usage_per_user = {}
    grand_total_cpu_usage = 0
    grand_total_mem_usage = 0
    grand_total_gpu_usage = 0

    for job in job_data:
        job_id = job['JobID']
        user = job['User'].strip()
        state = job['State'].strip()

        # Only consider master jobs (JobID without dots), non-empty users, and RUNNING jobs
        if '.' in job_id or not user or state != 'RUNNING':
            continue

        # Use ReqTRES for memory usage since AllocTRES doesn't show it
        cpu_usage, mem_usage, gpu_usage = extract_tres_usage(job['ReqTRES'])

        cpu_usage_per_user[user] = cpu_usage_per_user.get(user, 0) + cpu_usage
        # Keep the exact (possibly fractional) GB amount while summing and
        # convert to an integer only for display; truncating each job first
        # would make many sub-GB jobs add up to 0.
        mem_usage_per_user[user] = mem_usage_per_user.get(user, 0) + mem_usage
        gpu_usage_per_user[user] = gpu_usage_per_user.get(user, 0) + gpu_usage

        grand_total_cpu_usage += cpu_usage
        grand_total_mem_usage += mem_usage
        grand_total_gpu_usage += gpu_usage

    # Create a table using rich (without a divider)
    table = Table(title=f"Resource Usage Summary per User (QOS: {qos_name})", box=box.SQUARE)

    # Add columns to the table (GPU comes right after the user)
    table.add_column("User", justify="left", style="cyan", no_wrap=True)
    table.add_column("Total GPU Usage", justify="right", style="magenta")
    table.add_column("Total CPU Usage", justify="right", style="magenta")
    table.add_column("Total Memory Usage (GB)", justify="right", style="magenta")

    # Add rows for each user with thousands separators for readability
    for user in cpu_usage_per_user:
        table.add_row(
            user,
            f"{gpu_usage_per_user[user]:,}",
            f"{cpu_usage_per_user[user]:,}",
            f"{int(mem_usage_per_user[user]):,}",
        )

    # Add the grand total row
    table.add_row(
        "Grand Total",
        f"{grand_total_gpu_usage:,}",
        f"{grand_total_cpu_usage:,}",
        f"{int(grand_total_mem_usage):,}",
        style="bold",
    )

    # Format the total allowable resources ("N/A" is passed through as-is)
    formatted_cpu_limit = f"{cpu_limit:,}" if cpu_limit != "N/A" else cpu_limit
    formatted_gpu_limit = f"{gpu_limit:,}" if gpu_limit != "N/A" else gpu_limit
    formatted_mem_limit = f"{int(memory_limit):,}" if memory_limit != "N/A" else memory_limit

    # Add a row for the total allowable resources for the specific QOS
    table.add_row(
        f"Total Allowable Resources (QOS: {qos_name})",
        formatted_gpu_limit,
        formatted_cpu_limit,
        formatted_mem_limit,
        style="bold cyan",
    )

    # Display the table for the current QOS
    console.print(table)