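"""Recheck model parameter counts stored in request files.

Scans the git history of a local requests repository for request JSON files
changed within a date window, re-queries the Hugging Face Hub for each
model's size via ModelValidator, and rewrites any file whose stored
"params" value is out of date.
"""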
import json
import logging
import asyncio
from datetime import datetime, timedelta

import huggingface_hub
from huggingface_hub.errors import RepositoryNotFoundError, RevisionNotFoundError
from dotenv import load_dotenv
from git import Repo
from tqdm.auto import tqdm
from tqdm.contrib.logging import logging_redirect_tqdm

from app.config.hf_config import HF_TOKEN, API

from app.utils.model_validation import ModelValidator

huggingface_hub.logging.set_verbosity_error()
huggingface_hub.utils.disable_progress_bars()

logging.basicConfig(
    level=logging.ERROR,
    format='%(message)s'
)
logger = logging.getLogger(__name__)
load_dotenv()

validator = ModelValidator()

def get_changed_files(repo_path, start_date, end_date):
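    """Return the set of file paths touched by commits made between start_date and end_date.

    Dates are 'YYYY-MM-DD' strings; commits are walked newest-first, so the
    scan stops as soon as a commit older than start_date is reached.
    """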
    repo = Repo(repo_path)
    start = datetime.strptime(start_date, '%Y-%m-%d')
    # strptime gives midnight, so add a day to make end_date inclusive.
    end = datetime.strptime(end_date, '%Y-%m-%d') + timedelta(days=1)
    
    changed_files = set()
    pbar = tqdm(repo.iter_commits(), desc=f"Reading commits from {end_date} back to {start_date}")
    for commit in pbar:
        commit_date = datetime.fromtimestamp(commit.committed_date)
        pbar.set_postfix_str(f"Commit date: {commit_date}")
        # Skip the root commit, which has no parent to diff against.
        if start <= commit_date <= end and commit.parents:
            changed_files.update(item.a_path for item in commit.diff(commit.parents[0]))

        # Commits arrive newest-first, so everything past this point is older.
        if commit_date < start:
            break

    return changed_files


def read_json(repo_path, file):
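    """Load and return the JSON content of repo_path/file."""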
    with open(f"{repo_path}/{file}") as fp:
        return json.load(fp)


def write_json(repo_path, file, content):
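    """Write content to repo_path/file as indented JSON."""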
    with open(f"{repo_path}/{file}", "w") as fp:
        json.dump(content, fp, indent=2)


def main():
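    # Local checkout of the requests repo and the (inclusive) date window to rescan.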
    requests_path = "/requests"
    start_date = "2024-12-09"
    end_date = "2025-01-07"
  
    changed_files = get_changed_files(requests_path, start_date, end_date)

    for file in tqdm(changed_files):
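        # A path can show up in a diff yet be missing from the working tree
        # (e.g. deleted or renamed by a later commit).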
        try:
            request_data = read_json(requests_path, file)
        except FileNotFoundError:
            tqdm.write(f"File {file} not found")
            continue
    
        try:
            model_info = API.model_info(
                repo_id=request_data["model"],
                revision=request_data["revision"],
                token=HF_TOKEN
            )
        except (RepositoryNotFoundError, RevisionNotFoundError):
            tqdm.write(f"Model info for {request_data['model']} not found")
            continue
        
        # Recompute the parameter count; redirect logging so log lines do not
        # mangle the tqdm progress bar.
        with logging_redirect_tqdm():
            new_model_size, error = asyncio.run(validator.get_model_size(
                model_info=model_info,
                precision=request_data["precision"],
                base_model=request_data["base_model"],
                revision=request_data["revision"]
            ))

        if error:
            tqdm.write(f"Error getting model size info for {request_data['model']}, {error}")
            continue
        
        old_model_size = request_data["params"]
        if old_model_size != new_model_size:
            if new_model_size > 100:
                tqdm.write(f"Model: {request_data['model']}, size is more than 100B: {new_model_size}")

            tqdm.write(f"Model: {request_data['model']}, old size: {old_model_size}, new size: {new_model_size}")
            tqdm.write(f"Updating request file {file}")

            request_data["params"] = new_model_size
            write_json(requests_path, file, content=request_data)


if __name__ == "__main__":
    main()