"""Re-validate the `params` (model size) field of leaderboard request files.

Scans the git history of the requests repo for files changed inside a fixed
date window, re-fetches each model's info from the Hugging Face Hub,
recomputes its parameter count, and rewrites the request JSON when the
stored size is stale.
"""
import os
import json
import pytz
import logging
import asyncio
from datetime import datetime
from pathlib import Path

import huggingface_hub
from huggingface_hub.errors import RepositoryNotFoundError, RevisionNotFoundError
from dotenv import load_dotenv
from git import Repo
from tqdm.auto import tqdm
from tqdm.contrib.logging import logging_redirect_tqdm

from app.config.hf_config import HF_TOKEN, API
from app.utils.model_validation import ModelValidator

# Silence huggingface_hub's own logging/progress bars so the tqdm bars
# below stay readable.
huggingface_hub.logging.set_verbosity_error()
huggingface_hub.utils.disable_progress_bars()

logging.basicConfig(
    level=logging.ERROR,
    format='%(message)s'
)
logger = logging.getLogger(__name__)

load_dotenv()

validator = ModelValidator()


def get_changed_files(repo_path, start_date, end_date):
    """Return the set of file paths changed in *repo_path* within a date window.

    Walks the commit history newest-to-oldest and collects every path touched
    by a commit whose (naive, local-time) timestamp falls inside the inclusive
    [start_date, end_date] window.

    Args:
        repo_path: Path to a local git repository.
        start_date: Window start, formatted ``YYYY-MM-DD``.
        end_date: Window end, formatted ``YYYY-MM-DD``.

    Returns:
        set[str]: Repo-relative paths of changed files.
    """
    repo = Repo(repo_path)
    start = datetime.strptime(start_date, '%Y-%m-%d')
    end = datetime.strptime(end_date, '%Y-%m-%d')

    changed_files = set()
    pbar = tqdm(repo.iter_commits(), desc=f"Reading commits from {end_date} to {start_date}")

    for commit in pbar:
        commit_date = datetime.fromtimestamp(commit.committed_date)
        pbar.set_postfix_str(f"Commit date: {commit_date}")
        if start <= commit_date <= end:
            # Guard against the root commit: it has no parent to diff
            # against, and commit.parents[0] would raise IndexError.
            if commit.parents:
                changed_files.update(item.a_path for item in commit.diff(commit.parents[0]))

        # iter_commits() yields newest-first, so once we are past the window
        # start there is nothing left to collect.
        if commit_date < start:
            break

    return changed_files


def read_json(repo_path, file):
    """Load and return the JSON content of ``{repo_path}/{file}``.

    Raises:
        FileNotFoundError: If the file does not exist.
    """
    # Don't shadow the `file` parameter with the handle; read as UTF-8.
    with open(f"{repo_path}/{file}", encoding="utf-8") as fp:
        return json.load(fp)


def write_json(repo_path, file, content):
    """Serialize *content* as pretty-printed JSON to ``{repo_path}/{file}``."""
    with open(f"{repo_path}/{file}", "w", encoding="utf-8") as fp:
        json.dump(content, fp, indent=2)


def main():
    """Re-check and update the stored model size of recently changed requests."""
    requests_path = "/requests"
    start_date = "2024-12-09"
    end_date = "2025-01-07"

    changed_files = get_changed_files(requests_path, start_date, end_date)

    for file in tqdm(changed_files):
        try:
            request_data = read_json(requests_path, file)
        except FileNotFoundError:
            # The commit window may include files deleted in a later commit.
            tqdm.write(f"File {file} not found")
            continue

        # Hoist the model id; also avoids nesting double quotes inside
        # double-quoted f-strings, which is a SyntaxError before Python 3.12.
        model_id = request_data["model"]

        try:
            model_info = API.model_info(
                repo_id=model_id,
                revision=request_data["revision"],
                token=HF_TOKEN
            )
        except (RepositoryNotFoundError, RevisionNotFoundError):
            tqdm.write(f"Model info for {model_id} not found")
            continue

        # Redirect any logging emitted by the validator through tqdm so it
        # does not corrupt the progress bar.
        with logging_redirect_tqdm():
            new_model_size, error = asyncio.run(validator.get_model_size(
                model_info=model_info,
                precision=request_data["precision"],
                base_model=request_data["base_model"],
                revision=request_data["revision"]
            ))

        if error:
            tqdm.write(f"Error getting model size info for {model_id}, {error}")
            continue

        old_model_size = request_data["params"]

        if old_model_size != new_model_size:
            if new_model_size > 100:
                tqdm.write(f"Model: {model_id}, size is more 100B: {new_model_size}")

            tqdm.write(f"Model: {model_id}, old size: {request_data['params']} new size: {new_model_size}")
            tqdm.write(f"Updating request file {file}")

            request_data["params"] = new_model_size
            write_json(requests_path, file, content=request_data)


if __name__ == "__main__":
    main()