|
|
|
|
|
|
|
|
|
|
|
|
|
import subprocess |
|
import re |
|
import sys |
|
import argparse |
|
from rich.console import Console |
|
from rich.table import Table |
|
from rich import box |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
console = Console() |
|
|
|
|
|
parser = argparse.ArgumentParser(description='Fetch resource usage from SLURM.') |
|
parser.add_argument('--account', type=str, required=True, help='Account name for SLURM query') |
|
|
|
args = parser.parse_args() |
|
account_name = args.account |
|
|
|
|
|
def get_qos_for_account(account): |
|
sacctmgr_qos_command = ( |
|
f"sacctmgr show assoc format=Account,User,QOS where Account={account} -P" |
|
) |
|
|
|
|
|
result_qos = subprocess.run(sacctmgr_qos_command, shell=True, capture_output=True, text=True) |
|
|
|
output = result_qos.stdout.strip() |
|
|
|
|
|
qos_set = set() |
|
for line in output.splitlines()[1:]: |
|
fields = line.split("|") |
|
if len(fields) == 3 and fields[2].strip(): |
|
|
|
qos_values = fields[2].split(",") |
|
for qos in qos_values: |
|
qos_set.add(qos.strip()) |
|
|
|
|
|
qos_list = sorted(qos_set, key=lambda x: (x != 'lowest', x)) |
|
return qos_list |
|
|
|
|
|
qos_list = get_qos_for_account(account_name) |
|
|
|
|
|
if not qos_list: |
|
console.print(f"[red]No QOS found for account {account_name}[/red]") |
|
sys.exit(1) |
|
|
|
|
|
def extract_tres_usage(tres_str): |
|
cpu_usage = gpu_usage = memory_usage = 0 |
|
if 'cpu=' in tres_str: |
|
cpu_match = re.search(r'cpu=([0-9]+)', tres_str) |
|
cpu_usage = int(cpu_match.group(1)) if cpu_match else 0 |
|
if 'gres/gpu=' in tres_str: |
|
gpu_match = re.search(r'gres/gpu=([0-9]+)', tres_str) |
|
gpu_usage = int(gpu_match.group(1)) if gpu_match else 0 |
|
if 'mem=' in tres_str: |
|
mem_match = re.search(r'mem=([0-9]+)([A-Za-z]+)', tres_str) |
|
if mem_match: |
|
mem_value, mem_unit = int(mem_match.group(1)), mem_match.group(2) |
|
|
|
memory_usage = mem_value if mem_unit == 'G' else mem_value / 1024 if mem_unit == 'M' else mem_value * 1024 |
|
return cpu_usage, memory_usage, gpu_usage |
|
|
|
|
|
for qos_name in qos_list: |
|
console.print(f"\n[bold green]Fetching data for QOS: {qos_name}[/bold green]\n") |
|
|
|
|
|
sacctmgr_command = [ |
|
"sacctmgr", |
|
"show", |
|
"qos", |
|
qos_name, |
|
"format=GrpTRES%50", |
|
"-P" |
|
] |
|
|
|
|
|
result_qos_resources = subprocess.run(sacctmgr_command, capture_output=True, text=True) |
|
qos_output = result_qos_resources.stdout.strip() |
|
|
|
|
|
cpu_limit = gpu_limit = memory_limit = "N/A" |
|
cpu_match = re.search(r'cpu=([0-9]+)', qos_output) |
|
gpu_match = re.search(r'gres/gpu=([0-9]+)', qos_output) |
|
mem_match = re.search(r'mem=([0-9]+)([A-Za-z]+)', qos_output) |
|
|
|
if cpu_match: |
|
cpu_limit = int(cpu_match.group(1)) |
|
if gpu_match: |
|
gpu_limit = int(gpu_match.group(1)) |
|
if mem_match: |
|
mem_value, mem_unit = int(mem_match.group(1)), mem_match.group(2) |
|
|
|
memory_limit = mem_value if mem_unit == 'G' else mem_value / 1024 if mem_unit == 'M' else mem_value * 1024 |
|
|
|
|
|
sacct_command = [ |
|
"sacct", |
|
"-a", |
|
"--qos=" + qos_name, |
|
"--account=" + account_name, |
|
"--format=JobID,User%20,Partition,JobName,State,ReqTRES%60,AllocTRES%60", |
|
"-P" |
|
] |
|
result_jobs = subprocess.run(sacct_command, capture_output=True, text=True) |
|
job_output = result_jobs.stdout.strip() |
|
|
|
|
|
job_lines = job_output.splitlines() |
|
|
|
|
|
header = job_lines[0].split("|") |
|
|
|
|
|
job_data = [] |
|
for line in job_lines[1:]: |
|
fields = line.split("|") |
|
if len(fields) == len(header): |
|
job_data.append(dict(zip(header, fields))) |
|
|
|
|
|
cpu_usage_per_user = {} |
|
mem_usage_per_user = {} |
|
gpu_usage_per_user = {} |
|
grand_total_cpu_usage = 0 |
|
grand_total_mem_usage = 0 |
|
grand_total_gpu_usage = 0 |
|
|
|
for job in job_data: |
|
job_id = job['JobID'] |
|
user = job['User'].strip() |
|
state = job['State'].strip() |
|
|
|
|
|
if '.' not in job_id and user and state == 'RUNNING': |
|
|
|
cpu_usage, mem_usage, gpu_usage = extract_tres_usage(job['ReqTRES']) |
|
|
|
if user not in cpu_usage_per_user: |
|
cpu_usage_per_user[user] = 0 |
|
mem_usage_per_user[user] = 0 |
|
gpu_usage_per_user[user] = 0 |
|
|
|
cpu_usage_per_user[user] += cpu_usage |
|
mem_usage_per_user[user] += int(mem_usage) |
|
gpu_usage_per_user[user] += gpu_usage |
|
|
|
grand_total_cpu_usage += cpu_usage |
|
grand_total_mem_usage += int(mem_usage) |
|
grand_total_gpu_usage += gpu_usage |
|
|
|
|
|
table = Table(title=f"Resource Usage Summary per User (QOS: {qos_name})", box=box.SQUARE) |
|
|
|
|
|
table.add_column("User", justify="left", style="cyan", no_wrap=True) |
|
table.add_column("Total GPU Usage", justify="right", style="magenta") |
|
table.add_column("Total CPU Usage", justify="right", style="magenta") |
|
table.add_column("Total Memory Usage (GB)", justify="right", style="magenta") |
|
|
|
|
|
for user in cpu_usage_per_user: |
|
table.add_row( |
|
user, |
|
f"{gpu_usage_per_user[user]:,}", |
|
f"{cpu_usage_per_user[user]:,}", |
|
f"{mem_usage_per_user[user]:,}" |
|
) |
|
|
|
|
|
table.add_row("Grand Total", f"{grand_total_gpu_usage:,}", f"{grand_total_cpu_usage:,}", f"{grand_total_mem_usage:,}", style="bold") |
|
|
|
|
|
formatted_cpu_limit = f"{cpu_limit:,}" if cpu_limit != "N/A" else cpu_limit |
|
formatted_gpu_limit = f"{gpu_limit:,}" if gpu_limit != "N/A" else gpu_limit |
|
formatted_mem_limit = f"{int(memory_limit):,}" if memory_limit != "N/A" else memory_limit |
|
|
|
|
|
table.add_row(f"Total Allowable Resources (QOS: {qos_name})", formatted_gpu_limit, formatted_cpu_limit, formatted_mem_limit, style="bold cyan") |
|
|
|
|
|
console.print(table) |
|
|