import re
import textwrap
import argparse
from pathlib import Path
import tqdm
import jsonlines
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation import GenerationConfig
"""
Get the HumanEval.jsonl file from [here](https://github.com/openai/human-eval/tree/master/data)
python eval/evaluate_chat_humaneval.py -f HumanEval.jsonl -o HumanEval_res.jsonl
git clone https://github.com/openai/human-eval
pip install -e human-eval
evaluate_functional_correctness HumanEval_res.jsonl
"""
DEVICE = "cuda:0"  # unused below; the model is placed via device_map="auto"

def extract_code(text, entry_point):
    # Regular expression matching a fenced code block that contains the
    # entry-point function; the captured group is the function body.
    code_block_pattern = re.compile(
        rf"```(?:[Pp]ython\n)?.*?def\s+{entry_point}.*?:\n(.*?)\n```", re.DOTALL
    )
    code_block = code_block_pattern.search(text)
    if code_block is None:
        # Fallback: an unfenced definition of the entry point; the body ends
        # at the first non-indented line (or at the end of the text).
        code_block_pattern = re.compile(
            rf"def\s+{entry_point}.*?:\n(.*?)(?:\n(?!\n*(?: |\t))|$)", re.DOTALL
        )
        code_block = code_block_pattern.search(text)
    if code_block is None:
        # Last resort: any function definition at all.
        code_block_pattern = re.compile(
            r"def.*?:\n(.*?)(?:\n(?!\n*(?: |\t))|$)", re.DOTALL
        )
        code_block = code_block_pattern.search(text)
    if code_block is not None:
        return code_block.group(1)
    # if no code block is found, assume the LM is simply filling the code
    return textwrap.indent(text, " " * 4)
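
# Illustrative example (not part of the dataset): for a response such as
#   ```python
#   def add(a, b):
#       return a + b
#   ```
# extract_code(response, "add") returns the indented body
# "    return a + b"; human-eval concatenates this completion after the
# task's prompt before executing the tests.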

def generate_sample(model, tokenizer, question, entry_point):
    # model.chat is the chat interface exposed by Qwen's trust_remote_code
    # modeling file; it returns the response text and the updated history
    # (discarded here).
    response, _ = model.chat(
        tokenizer,
        question,
        history=None,
    )
    print(question)
    print(response)
    answer = extract_code(response, entry_point)
    return answer, response

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Test HF checkpoint.")
    parser.add_argument(
        "-c",
        "--checkpoint-path",
        type=Path,
        help="Checkpoint path",
        default="Qwen/Qwen-7B-Chat",
    )
    parser.add_argument(
        "-f",
        "--sample-input-file",
        type=str,
        default=None,
        help="path to HumanEval.jsonl",
    )
    parser.add_argument(
        "-o", "--sample-output-file", type=str, default="HumanEval_res.jsonl"
    )
    args = parser.parse_args()
print("Loading tokenizer ...")
tokenizer = AutoTokenizer.from_pretrained(
args.checkpoint_path, trust_remote_code=True
)
print("Loading model ...")
model = AutoModelForCausalLM.from_pretrained(
args.checkpoint_path,
device_map="auto",
trust_remote_code=True,
bf16=True,
use_flash_attn=True,
).eval()
model.generation_config = GenerationConfig.from_pretrained(
args.checkpoint_path, trust_remote_code=True
)
model.generation_config.do_sample = False # use greedy decoding
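    # With do_sample=False, generate() falls back to greedy search, so the
    # completions are essentially deterministic for a fixed checkpoint.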
    f_output = open(args.sample_output_file, "w", encoding="utf-8")
    with jsonlines.Writer(f_output) as output, jsonlines.open(
        args.sample_input_file
    ) as f:
        for jobj in tqdm.tqdm(f, desc="task_idx"):
            prompt = "Help me fill the following code.\n" + jobj["prompt"]
            task_id = jobj["task_id"]
            answer, response = generate_sample(
                model, tokenizer, prompt, jobj["entry_point"]
            )
            gen_jobjs = {"task_id": task_id, "completion": answer, "response": response}
            output.write(gen_jobjs)
    # jsonlines.Writer does not close a file object it did not open itself,
    # so close the underlying handle explicitly.
    f_output.close()