BulkCalcSpace / app.py
meg's picture
meg HF Staff
Update app.py
92305c2 verified
raw
history blame
2.66 kB
import os
import shlex
import time

from fastapi import FastAPI
from huggingface_hub import HfApi
# Hugging Face Hub access token, read from the environment (None when unset).
TOKEN = os.environ.get("BULK_ENERGY_TOKEN")
# Hub client used below to query and change the Space's runtime hardware.
API = HfApi(token=TOKEN)
# Repository id passed to the Hub API calls and to entrypoint.sh.
REPO_ID = "AIEnergyScore/BulkCalcSpace"
# FastAPI application object; routes are registered on it via decorators.
app = FastAPI()
def check_for_traceback(run_dir):
    """Return the traceback recorded in ``{run_dir}/error.log``, if any.

    The log is scanned for the first line containing
    ``Traceback (most recent call last):``; that line and everything after
    it, up to the end of the file, is returned as a single string.

    Args:
        run_dir: Directory of a single run, expected to contain ``error.log``.

    Returns:
        The traceback text, or an empty string when the log file does not
        exist or contains no traceback marker.
    """
    marker = 'Traceback (most recent call last):'
    try:
        with open(f"{run_dir}/error.log", 'r') as log_file:
            for line in log_file:
                # Open question carried over from the original author: a
                # non-empty error.log may already imply a failure, in which
                # case the marker search could be dropped altogether.
                if marker in line:
                    # The file object is its own iterator, so joining it
                    # here consumes exactly the lines after the marker.
                    return line + "".join(log_file)
    except FileNotFoundError:
        # No log was written for this run; report it and fall through to
        # the "no traceback" result.
        print(f"Could not find {run_dir}/error.log")
    return ""
def _read_entries(path):
    """Return the stripped, non-blank lines of the text file at *path*."""
    with open(path, "r") as entries_file:
        stripped = (line.strip() for line in entries_file)
        return [entry for entry in stripped if entry]


@app.get("/")
def start_train():
    """Run ``entrypoint.sh`` for every hardware/model/task combination.

    Hardware flavors, model names and task names are read from
    ``hardware.txt``, ``models.txt`` and ``tasks.txt`` in the working
    directory, one entry per line (blank lines are ignored). For each
    hardware entry the Space's current hardware is queried and, when it
    differs, a switch is requested. ``entrypoint.sh`` is then invoked once
    per (model, task) pair with a fresh, timestamped run directory.

    Returns:
        ``{"Status": "Done"}`` once every combination has been launched.
    """
    models = _read_entries("models.txt")
    tasks = _read_entries("tasks.txt")
    hardware_options = _read_entries("hardware.txt")

    for hardware in hardware_options:
        print(f"Hardware is {hardware}")
        curr_runtime = API.get_space_runtime(repo_id=REPO_ID)
        # get_space_runtime returns a runtime description object, not a
        # string; the hardware flavor lives on its `hardware` attribute.
        current_hardware = curr_runtime.hardware
        print(f"Current hardware is {current_hardware}")
        if current_hardware != hardware:
            print("Trying to switch.")
            API.request_space_hardware(repo_id=REPO_ID, hardware=hardware)
        for model in models:
            for task in tasks:
                # Create the name of the directory for output. Plain
                # f-string braces are used: a shell-style "${...}" would
                # leave literal "$" characters for the shell to expand.
                now = time.time()
                run_dir = f"/runs/{task}/{model}/{now}"
                # Quote each argument so names containing spaces or shell
                # metacharacters reach the script as single arguments.
                arguments = ("./entrypoint.sh", REPO_ID, model, task, hardware, run_dir)
                os.system(" ".join(shlex.quote(argument) for argument in arguments))
    # TODO: an earlier draft tracked a 'STATUS' Space variable
    # ('COMPUTING' / 'NOT_COMPUTING') around the runs and paused the Space
    # afterwards via API.pause_space(REPO_ID); reinstate if still wanted.
    return {"Status": "Done"}