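"""Evaluate a model across its fine-tuning epoch checkpoints.

Epoch 0 evaluates the bare base model; each later epoch activates the
corresponding checkpoint under ADAPTER_PATH_BASE via the
ADAPTER_NAME_OR_PATH environment variable. Each evaluation runs
llm_toolkit/eval_shots.py in its own subprocess, with stdout captured
under ./logs/.

Configuration is read from a .env file (falling back to .env.example);
see the __main__ block below for the variables used. An optional
command-line argument limits the number of evaluated entries.
"""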
import os
import subprocess
import sys

import torch
from dotenv import find_dotenv, load_dotenv

# Expected to provide check_gpu, load_model, load_translation_dataset,
# and print_row_details, among others.
from llm_toolkit.llm_utils import *
from llm_toolkit.translation_utils import *
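

# Report the current GPU's name, total memory, and memory reserved so far.
# Factored into one helper; used at several points in __main__ below, with
# the same printed output as before.
def report_gpu_memory(label):
    gpu_stats = torch.cuda.get_device_properties(0)
    reserved_memory = round(torch.cuda.max_memory_reserved() / 1024**3, 3)
    max_memory = round(gpu_stats.total_memory / 1024**3, 3)
    print(f"({label}) GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
    print(f"{reserved_memory} GB of memory reserved.")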


def evaluate_model_all_epochs(
    model,
    tokenizer,
    model_name,
    adapter_path_base,
    load_in_4bit=True,
    num_of_entries=-1,
    result_file=None,
    start_epoch=0,
    end_epoch=-1,
):
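    """Run llm_toolkit/eval_shots.py once per epoch in a subprocess.

    Epoch 0 evaluates the base model without an adapter; epoch i >= 1
    points ADAPTER_NAME_OR_PATH at the i-th checkpoint directory under
    adapter_path_base. The model and tokenizer arguments are accepted but
    not used directly here, since each run loads its own model.
    """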
    new_env = os.environ.copy()
    new_env["MODEL_NAME"] = model_name
    model_short_name = model_name.split("/")[-1]

    new_env["LOAD_IN_4BIT"] = "true" if load_in_4bit else "false"
    if result_file is not None:
        new_env["RESULTS_PATH"] = result_file

    subdirs = []
    if adapter_path_base is None:
        num_train_epochs = 0
        print(f"No adapter path provided. Running with base model: {model_name}")
    else:
        # Find checkpoint subdirectories in adapter_path_base and sort them
        # by the trailing number in their name (e.g. "checkpoint-100").
        subdirs = [
            d
            for d in os.listdir(adapter_path_base)
            if os.path.isdir(os.path.join(adapter_path_base, d))
        ]
        subdirs = sorted(subdirs, key=lambda x: int(x.split("-")[-1]))
        num_train_epochs = len(subdirs)
        print(f"Found {num_train_epochs} checkpoints: {subdirs}")

        if end_epoch >= 0:
            # An explicit end epoch only caps the loop; subdirs is still
            # needed above to resolve each checkpoint's path.
            num_train_epochs = min(end_epoch, len(subdirs))
            print(f"Running from epoch {start_epoch} to {num_train_epochs}")

    os.makedirs("./logs", exist_ok=True)
    for i in range(start_epoch, num_train_epochs + 1):
        print(f"Epoch {i}")
        if i == 0:
            # Evaluate the base model: drop the adapter from the child
            # environment (os.unsetenv would not touch the new_env copy
            # that subprocess.run receives).
            new_env.pop("ADAPTER_NAME_OR_PATH", None)
        else:
            adapter_path = os.path.join(adapter_path_base, subdirs[i - 1])
            new_env["ADAPTER_NAME_OR_PATH"] = adapter_path

        print(f"adapter path: {new_env.get('ADAPTER_NAME_OR_PATH')}")

        log_file = f"./logs/{model_short_name}_epoch_{i}.txt"
        with open(log_file, "w") as f_obj:
            subprocess.run(
                f"python llm_toolkit/eval_shots.py {num_of_entries}",
                shell=True,
                env=new_env,
                stdout=f_obj,
                text=True,
            )


if __name__ == "__main__":
    found_dotenv = find_dotenv(".env")

    if len(found_dotenv) == 0:
        found_dotenv = find_dotenv(".env.example")
    print(f"loading env vars from: {found_dotenv}")
    load_dotenv(found_dotenv, override=False)

    working_dir = os.path.dirname(found_dotenv)
    os.chdir(working_dir)
    print("working dir:", working_dir)
    print(f"adding {working_dir} to sys.path")
    sys.path.append(working_dir)

    model_name = os.getenv("MODEL_NAME")
    adapter_path_base = os.getenv("ADAPTER_PATH_BASE")
    start_epoch = int(os.getenv("START_EPOCH", "0"))
    end_epoch = int(os.getenv("END_EPOCH", "-1"))
    load_in_4bit = os.getenv("LOAD_IN_4BIT", "true").lower() == "true"
    result_file = os.getenv("RESULTS_PATH", None)
    # Assumed: the dataset path for load_translation_dataset comes from a
    # DATA_PATH variable, matching how the other settings are read here.
    data_path = os.getenv("DATA_PATH")

    num_of_entries = int(sys.argv[1]) if len(sys.argv) > 1 else -1

    print(
        f"model: {model_name}, adapter base: {adapter_path_base}, "
        f"4-bit: {load_in_4bit}, epochs: {start_epoch}..{end_epoch}, "
        f"results: {result_file}"
    )

    device = check_gpu()
    is_cuda = torch.cuda.is_available()

    print(f"Evaluating model: {model_name} on {device}")

    if is_cuda:
        torch.cuda.empty_cache()
        report_gpu_memory(0)

    model, tokenizer = load_model(model_name, load_in_4bit=load_in_4bit)

    datasets = load_translation_dataset(data_path, tokenizer, num_shots=0)
    print_row_details(datasets["test"].to_pandas())

    if is_cuda:
        report_gpu_memory(1)

    evaluate_model_all_epochs(
        model,
        tokenizer,
        model_name,
        adapter_path_base,
        start_epoch=start_epoch,
        end_epoch=end_epoch,
        load_in_4bit=load_in_4bit,
        num_of_entries=num_of_entries,
        result_file=result_file,
    )

    if is_cuda:
        report_gpu_memory(2)