import os
import re
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from datasets import load_dataset
# get_model_names is used by save_model() below; it is assumed to live in
# llm_toolkit.llm_utils alongside extract_answer.
from llm_toolkit.llm_utils import extract_answer, get_model_names
from tqdm import tqdm

print(f"loading {__file__}")

def calc_metrics(references, predictions, debug=False):
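    """Compute exact-match accuracy of predictions against references.

    Predictions are first normalised with extract_answer(); when debug is
    True, the indices of the incorrect predictions are also returned.
    """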
    assert len(references) == len(
        predictions
    ), f"lengths are difference: {len(references)} != {len(predictions)}"

    predictions = [extract_answer(text) for text in predictions]

    correct = [1 if ref == pred else 0 for ref, pred in zip(references, predictions)]
    accuracy = sum(correct) / len(references)

    results = {"accuracy": accuracy}
    if debug:
        incorrect_ids = [i for i, c in enumerate(correct) if c == 0]
        results["incorrect_ids"] = incorrect_ids

    return results


def save_results(model_name, results_path, dataset, predictions, debug=False):
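    """Append one model's predictions as a new column to the results CSV.

    If the CSV does not exist yet, it is created from the dataset (minus the
    answer/prompt/train_text columns); otherwise the existing file is extended.
    """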
    if not os.path.exists(results_path):
        # Get the directory part of the file path
        dir_path = os.path.dirname(results_path)

        # Create all directories in the path (if they don't exist)
        os.makedirs(dir_path, exist_ok=True)
        df = dataset.to_pandas()
        df.drop(columns=["answer", "prompt", "train_text"], inplace=True)
    else:
        df = pd.read_csv(results_path, on_bad_lines="warn")

    df[model_name] = predictions

    if debug:
        print(df.head(1))

    df.to_csv(results_path, index=False)


def load_logical_reasoning_dataset(data_path, tokenizer=None):
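    """Load the train/dev CSV files as a Hugging Face DatasetDict.

    When a tokenizer is given, each example is wrapped in the Chinese
    situation-puzzle prompt below and the tokenizer's chat template,
    producing "prompt" (for inference) and "train_text" (prompt + label +
    EOS, for fine-tuning) columns.
    """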
    train_data_file = data_path + "/train.csv"
    test_data_file = data_path + "/dev.csv"

    print("loading train/test data files")
    datasets = load_dataset(
        "csv",
        data_files={"train": train_data_file, "test": test_data_file},
    )

    if tokenizer:
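        # Chinese prompt for the situation-puzzle ("lateral thinking") game:
        # the host knows the puzzle and its solution, and must answer each
        # closed-ended question from participants with exactly one of five
        # labels: 是 (yes), 不是 (no), 不重要 (irrelevant),
        # 回答正确 (correct answer), or 问法错误 (invalid question).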
        reasoning_prompt = """你是一个情景猜谜游戏的主持人。游戏规则如下:

1. 参与者会得到一个谜面,谜面会描述一个简单又难以理解的事件。
2. 主持人知道谜底,谜底是谜面的答案。
3. 参与者可以询问任何封闭式问题来找寻事件的真相。
4. 对于每个问题,主持人将根据实际情况回答以下五个选项之一:是、不是、不重要、回答正确、问法错误。各回答的判断标准如下:
   - 若谜面和谜底能找到问题的答案,回答:是或者不是
   - 若谜面和谜底不能直接或者间接推断出问题的答案,回答:不重要
   - 若参与者提问不是一个封闭式问题或者问题难以理解,回答:问法错误
   - 若参与者提问基本还原了谜底真相,回答:回答正确
5. 回答中不能添加任何其它信息,也不能省略选项中的任何一个字。例如,不可以把“不是”省略成“不”。

请严格按照这些规则回答参与者提出的问题。

**谜面:** {}

**谜底:** {}

**参与者提出的问题:** {}
"""
        def formatting_prompts_func(examples):
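            # Batched map function: wraps each question in the game prompt and
            # the tokenizer's chat template, returning both the bare prompt and
            # the full training text (prompt + expected label + EOS token).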
            inputs = examples["text"]
            outputs = examples["label"]
            puzzles = examples["puzzle"]
            truths = examples["truth"]

            messages = [
                {
                    "role": "system",
                    "content": "You are an expert in logical reasoning.",
                },
                None,  # placeholder for the per-example user message (set in the loop below)
            ]

            model_name = os.getenv("MODEL_NAME") or ""

            # Mistral's chat template does not support a system role, so drop it.
            if "mistral" in model_name.lower():
                messages = messages[1:]

            texts = []
            prompts = []
            for question, output, puzzle, truth in zip(inputs, outputs, puzzles, truths):
                prompt = reasoning_prompt.format(puzzle, truth, question)
                messages[-1] = {"role": "user", "content": prompt}

                prompt = tokenizer.apply_chat_template(
                    messages, tokenize=False, add_generation_prompt=True
                )
                prompts.append(prompt)
                texts.append(prompt + output + tokenizer.eos_token)
            return {"train_text": texts, "prompt": prompts}

        datasets = datasets.map(
            formatting_prompts_func,
            batched=True,
        )

    print(datasets)
    return datasets


def eval_model(model, tokenizer, eval_dataset):
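    """Generate an answer for every prompt in eval_dataset, one example at a
    time on CUDA, and return the extracted answers as a list of predictions."""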
    total = len(eval_dataset)
    predictions = []
    for i in tqdm(range(total)):
        inputs = tokenizer(
            eval_dataset["prompt"][i : i + 1],
            return_tensors="pt",
        ).to("cuda")

        outputs = model.generate(**inputs, max_new_tokens=4096, use_cache=False)
        decoded_output = tokenizer.batch_decode(outputs)
        debug = i == 0
        decoded_output = [
            extract_answer(output, debug=debug) for output in decoded_output
        ]
        predictions.extend(decoded_output)

    return predictions


def save_model(
    model,
    tokenizer,
    include_gguf=True,
    include_merged=True,
    publish=True,
):
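    """Save the fine-tuned adapter and tokenizer locally and, optionally,
    push them (plus merged and GGUF variants) to the Hugging Face Hub.

    Expects an unsloth-style model exposing save_pretrained_merged /
    save_pretrained_gguf and the matching push_to_hub_* helpers.
    """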
    try:
        token = os.getenv("HF_TOKEN") or None
        model_name = os.getenv("MODEL_NAME")

        save_method = "lora"
        quantization_method = "q5_k_m"

        model_names = get_model_names(
            model_name, save_method=save_method, quantization_method=quantization_method
        )

        model.save_pretrained(model_names["local"])
        tokenizer.save_pretrained(model_names["local"])

        if publish:
            model.push_to_hub(
                model_names["hub"],
                token=token,
            )
            tokenizer.push_to_hub(
                model_names["hub"],
                token=token,
            )

        if include_merged:
            model.save_pretrained_merged(
                model_names["local"] + "-merged", tokenizer, save_method=save_method
            )
            if publish:
                model.push_to_hub_merged(
                    model_names["hub"] + "-merged",
                    tokenizer,
                    save_method=save_method,
                    token=token,
                )

        if include_gguf:
            model.save_pretrained_gguf(
                model_names["local-gguf"],
                tokenizer,
                quantization_method=quantization_method,
            )

            if publish:
                model.push_to_hub_gguf(
                    model_names["hub-gguf"],
                    tokenizer,
                    quantization_method=quantization_method,
                    token=token,
                )
    except Exception as e:
        print(e)


def get_metrics(df):
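    """Compute accuracy for every model column of the results DataFrame (all
    columns after the first two, compared against the "english" reference
    column) and return a summary DataFrame with one row per model."""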
    metrics_df = pd.DataFrame(df.columns.T)[2:]
    metrics_df.rename(columns={0: "model"}, inplace=True)
    metrics_df["model"] = metrics_df["model"].apply(lambda x: x.split("/")[-1])
    metrics_df.reset_index(drop=True, inplace=True)

    accuracy = []
    all_metrics = []
    for col in df.columns[2:]:
        metrics = calc_metrics(df["english"], df[col], debug=True)
        print(f"{col}: {metrics}")

        accuracy.append(metrics["accuracy"])
        all_metrics.append(metrics)

    metrics_df["accuracy"] = accuracy
    metrics_df["all_metrics"] = all_metrics

    return metrics_df
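

if __name__ == "__main__":
    # Minimal usage sketch (illustration only, not part of the original module).
    # Assumptions: unsloth is installed, a CUDA GPU is available, and the
    # MODEL_NAME / data-path values below are placeholders to be replaced.
    from unsloth import FastLanguageModel

    model_name = os.getenv("MODEL_NAME", "unsloth/Qwen2-7B-Instruct")  # placeholder default
    os.environ["MODEL_NAME"] = model_name  # formatting_prompts_func reads this

    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name, max_seq_length=4096, load_in_4bit=True
    )
    FastLanguageModel.for_inference(model)  # enable inference mode

    # "data/logical_reasoning" is a placeholder directory containing train.csv/dev.csv.
    datasets = load_logical_reasoning_dataset("data/logical_reasoning", tokenizer=tokenizer)
    predictions = eval_model(model, tokenizer, datasets["test"])
    print(calc_metrics(datasets["test"]["label"], predictions, debug=False))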