import os
import re

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from datasets import load_dataset
from tqdm import tqdm

# get_model_names is called by save_model below but was missing from the
# original imports; it is assumed to live alongside extract_answer in
# llm_toolkit.llm_utils.
from llm_toolkit.llm_utils import extract_answer, get_model_names

print(f"loading {__file__}")


def calc_metrics(references, predictions, debug=False):
    assert len(references) == len(
        predictions
    ), f"lengths differ: {len(references)} != {len(predictions)}"

    # Normalize raw model outputs to bare answer labels before comparing.
    predictions = [extract_answer(text) for text in predictions]

    correct = [1 if ref == pred else 0 for ref, pred in zip(references, predictions)]
    accuracy = sum(correct) / len(references)

    results = {"accuracy": accuracy}
    if debug:
        incorrect_ids = [i for i, c in enumerate(correct) if c == 0]
        results["incorrect_ids"] = incorrect_ids

    return results


def save_results(model_name, results_path, dataset, predictions, debug=False):
    if not os.path.exists(results_path):
        # Get the directory part of the file path
        dir_path = os.path.dirname(results_path)

        # Create all directories in the path (if they don't exist)
        os.makedirs(dir_path, exist_ok=True)

        df = dataset.to_pandas()
        df.drop(columns=["answer", "prompt", "train_text"], inplace=True)
    else:
        df = pd.read_csv(results_path, on_bad_lines="warn")

    # Each model's predictions are stored in a column named after the model.
    df[model_name] = predictions

    if debug:
        print(df.head(1))

    df.to_csv(results_path, index=False)


def load_logical_reasoning_dataset(data_path, tokenizer=None):
    train_data_file = data_path + "/train.csv"
    test_data_file = data_path + "/dev.csv"

    print("loading train/test data files")
    datasets = load_dataset(
        "csv",
        data_files={"train": train_data_file, "test": test_data_file},
    )

    if tokenizer:
        # Host prompt for the Chinese situation-puzzle game. Translation:
        # "You are the host of a situation-puzzle game. Rules: 1. The
        # participant gets a riddle describing a simple yet baffling event.
        # 2. The host knows the solution, which answers the riddle. 3. The
        # participant may ask any closed-ended question to uncover the truth.
        # 4. The host answers each question with exactly one of five options:
        # 是 (yes), 不是 (no), 不重要 (irrelevant), 回答正确 (correct),
        # 问法错误 (invalid question) — yes/no if the riddle and solution
        # determine the answer; irrelevant if they cannot, even indirectly;
        # invalid question if the question is not closed-ended or is unclear;
        # correct if the question essentially recovers the solution. 5. Add
        # nothing else to the answer and never abbreviate an option, e.g.
        # 不是 must not be shortened to 不. Follow these rules strictly.
        # **Riddle:** {} **Solution:** {} **Participant's question:** {}"
        reasoning_prompt = """你是一个情景猜谜游戏的主持人。游戏规则如下:

1. 参与者会得到一个谜面,谜面会描述一个简单又难以理解的事件。
2. 主持人知道谜底,谜底是谜面的答案。
3. 参与者可以询问任何封闭式问题来找寻事件的真相。
4. 对于每个问题,主持人将根据实际情况回答以下五个选项之一:是、不是、不重要、回答正确、问法错误。各回答的判断标准如下:
   - 若谜面和谜底能找到问题的答案,回答:是或者不是
   - 若谜面和谜底不能直接或者间接推断出问题的答案,回答:不重要
   - 若参与者提问不是一个封闭式问题或者问题难以理解,回答:问法错误
   - 若参与者提问基本还原了谜底真相,回答:回答正确
5. 回答中不能添加任何其它信息,也不能省略选项中的任何一个字。例如,不可以把“不是”省略成“不”。

请严格按照这些规则回答参与者提出的问题。

**谜面:** {}

**谜底:** {}

**参与者提出的问题:** {}
"""

        def formatting_prompts_func(examples):
            inputs = examples["text"]
            outputs = examples["label"]
            puzzles = examples["puzzle"]
            truths = examples["truth"]

            messages = [
                {
                    "role": "system",
                    "content": "You are an expert in logical reasoning.",
                },
                None,  # placeholder, replaced per example with the user message
            ]

            model_name = os.getenv("MODEL_NAME")
            if "mistral" in model_name.lower():
                # Mistral chat templates reject system messages, so drop it.
                messages = messages[1:]

            texts = []
            prompts = []
            for input, output, puzzle, truth in zip(inputs, outputs, puzzles, truths):
                prompt = reasoning_prompt.format(puzzle, truth, input)
                messages[-1] = {"role": "user", "content": prompt}

                prompt = tokenizer.apply_chat_template(
                    messages, tokenize=False, add_generation_prompt=True
                )
                prompts.append(prompt)
                # Training text = prompt + gold answer + EOS; bare prompt is
                # kept separately for inference.
                texts.append(prompt + output + tokenizer.eos_token)
            return {"train_text": texts, "prompt": prompts}

        datasets = datasets.map(
            formatting_prompts_func,
            batched=True,
        )

    print(datasets)
    return datasets


def eval_model(model, tokenizer, eval_dataset):
    total = len(eval_dataset)
    predictions = []
    for i in tqdm(range(total)):
        inputs = tokenizer(
            eval_dataset["prompt"][i : i + 1],
            return_tensors="pt",
        ).to("cuda")

        outputs = model.generate(**inputs, max_new_tokens=4096, use_cache=False)
        decoded_output = tokenizer.batch_decode(outputs)
        debug = i == 0  # print extraction details for the first example only
        decoded_output = [
            extract_answer(output, debug=debug) for output in decoded_output
        ]
        predictions.extend(decoded_output)

    return predictions


def save_model(
    model,
    tokenizer,
    include_gguf=True,
    include_merged=True,
    publish=True,
):
    try:
        token = os.getenv("HF_TOKEN") or None
        model_name = os.getenv("MODEL_NAME")

        save_method = "lora"
        quantization_method = "q5_k_m"

        model_names = get_model_names(
            model_name, save_method=save_method, quantization_method=quantization_method
        )

        model.save_pretrained(model_names["local"])
        tokenizer.save_pretrained(model_names["local"])

        if publish:
            model.push_to_hub(
                model_names["hub"],
                token=token,
            )
            tokenizer.push_to_hub(
                model_names["hub"],
                token=token,
            )

        if include_merged:
            model.save_pretrained_merged(
                model_names["local"] + "-merged", tokenizer, save_method=save_method
            )
            if publish:
                model.push_to_hub_merged(
                    model_names["hub"] + "-merged",
                    tokenizer,
                    save_method=save_method,
                    token=token,  # was token="", which breaks authenticated pushes
                )

        if include_gguf:
            model.save_pretrained_gguf(
                model_names["local-gguf"],
                tokenizer,
                quantization_method=quantization_method,
            )

            if publish:
                model.push_to_hub_gguf(
                    model_names["hub-gguf"],
                    tokenizer,
                    quantization_method=quantization_method,
                    token=token,
                )
    except Exception as e:
        print(e)


def get_metrics(df):
    # The first two columns hold the inputs and references; every remaining
    # column holds one model's predictions (see save_results above).
    metrics_df = pd.DataFrame(df.columns.T)[2:]
    metrics_df.rename(columns={0: "model"}, inplace=True)
    metrics_df["model"] = metrics_df["model"].apply(lambda x: x.split("/")[-1])
    metrics_df.reset_index(inplace=True)
    metrics_df = metrics_df.drop(columns=["index"])

    accuracy = []
    all_metrics = []
    for col in df.columns[2:]:
        metrics = calc_metrics(df["english"], df[col], debug=True)
        print(f"{col}: {metrics}")
        accuracy.append(metrics["accuracy"])
        all_metrics.append(metrics)

    metrics_df["accuracy"] = accuracy
    metrics_df["all_metrics"] = all_metrics
    return metrics_df
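

# ---------------------------------------------------------------------------
# Usage sketch (not part of the original module): a minimal smoke test of
# calc_metrics plus the intended end-to-end flow. It assumes extract_answer
# returns a bare label such as "是" unchanged; the commented-out lines assume
# a CUDA model/tokenizer pair loaded elsewhere, a hypothetical
# LOGICAL_REASONING_DATA_PATH environment variable, and an illustrative
# results path.
if __name__ == "__main__":
    refs = ["是", "不是", "不重要", "回答正确"]
    preds = ["是", "不是", "回答正确", "回答正确"]
    # Expected: {"accuracy": 0.75, "incorrect_ids": [2]}
    print(calc_metrics(refs, preds, debug=True))

    # End-to-end flow (requires a loaded model and tokenizer):
    # datasets = load_logical_reasoning_dataset(
    #     os.getenv("LOGICAL_REASONING_DATA_PATH"), tokenizer=tokenizer
    # )
    # predictions = eval_model(model, tokenizer, datasets["test"])
    # save_results(
    #     os.getenv("MODEL_NAME"), "results/dev_results.csv",
    #     datasets["test"], predictions,
    # )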