import os
import re

import torch
from llamafactory.chat import ChatModel
from tqdm import tqdm
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TextStreamer,
)

def load_model(
    model_name,
    max_seq_length=2048,
    dtype=torch.bfloat16,
    load_in_4bit=False,
    adapter_name_or_path=None,
):
    print(f"loading model: {model_name}")

    if adapter_name_or_path:
        template = "llama3" if "llama-3" in model_name.lower() else "chatml"

        args = dict(
            model_name_or_path=model_name,
            adapter_name_or_path=adapter_name_or_path,  # load the saved LoRA adapters
            template=template,  # must match the template used in training
            finetuning_type="lora",  # must match the finetuning type used in training
            quantization_bit=4 if load_in_4bit else None,  # load the 4-bit quantized model
        )
        chat_model = ChatModel(args)
        return chat_model.engine.model, chat_model.engine.tokenizer

    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=load_in_4bit,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=False,
        bnb_4bit_compute_dtype=dtype,
    )

    model = (
        AutoModelForCausalLM.from_pretrained(
            model_name,
            quantization_config=bnb_config,
            torch_dtype=dtype,
            trust_remote_code=True,
            device_map="auto",
        )
        if load_in_4bit
        else AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=dtype,
            trust_remote_code=True,
            device_map="auto",
        )
    )

    return model, tokenizer
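
# A minimal usage sketch (the model id below is a placeholder, not a value
# taken from this repo; pass adapter_name_or_path to load saved LoRA adapters
# through llamafactory instead):
#
#   model, tokenizer = load_model(
#       "meta-llama/Meta-Llama-3-8B-Instruct",
#       load_in_4bit=True,
#   )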

def test_model(model, tokenizer, prompt):
    inputs = tokenizer(
        [prompt],
        return_tensors="pt",
    ).to("cuda")

    text_streamer = TextStreamer(tokenizer)

    _ = model.generate(
        **inputs, max_new_tokens=2048, streamer=text_streamer, use_cache=True
    )
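
# Example call (assumes model/tokenizer come from load_model above and a CUDA
# device is available, since inputs are moved to "cuda"; the prompt is
# illustrative only):
#
#   test_model(model, tokenizer, "What is the capital of France?")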

def extract_answer(text, debug=False):
    if text:
        # Step 1: strip everything up to and including the "assistant" or
        # "[/INST]" marker that precedes the model's reply
        text = re.sub(
            r".*?(assistant|\[/INST\]).+?\b", "", text, flags=re.DOTALL | re.MULTILINE
        )
        if debug:
            print("--------\nstep 1:", text)

        # Step 2: drop any remaining special token and everything after it
        text = re.sub(r"<.+?>.*", "", text, flags=re.DOTALL | re.MULTILINE)
        if debug:
            print("--------\nstep 2:", text)

        # Step 3: strip any leftover header fragment up to "end_header_id|>"
        text = re.sub(
            r".*?end_header_id\|>\n\n", "", text, flags=re.DOTALL | re.MULTILINE
        )
        if debug:
            print("--------\nstep 3:", text)

    return text
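
# Illustrative trace (the chat-template tokens below follow the Llama 3
# format and are an assumption, not taken from this file):
#
#   raw = "<|start_header_id|>assistant<|end_header_id|>\n\nParis<|eot_id|>"
#   extract_answer(raw)  # -> "Paris"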

def eval_model(model, tokenizer, eval_dataset):
    total = len(eval_dataset)
    predictions = []

    for i in tqdm(range(total)):
        inputs = tokenizer(
            eval_dataset["prompt"][i : i + 1],
            return_tensors="pt",
        ).to("cuda")

        outputs = model.generate(**inputs, max_new_tokens=4096, use_cache=False)
        decoded_output = tokenizer.batch_decode(outputs)

        debug = i == 0
        decoded_output = [
            extract_answer(output, debug=debug) for output in decoded_output
        ]
        predictions.extend(decoded_output)

    return predictions
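
# save_model() below relies on a get_model_names() helper that is not defined
# in this file. The sketch here only illustrates the shape it must return
# (keys "local", "hub", "local-gguf", "hub-gguf"); the naming scheme and the
# HF_HUB_USER variable are assumptions, not the original implementation.
def get_model_names(model_name, save_method="lora", quantization_method="q5_k_m"):
    base = model_name.split("/")[-1]
    hub_user = os.getenv("HF_HUB_USER", "user")
    local = f"models/{base}-{save_method}"
    hub = f"{hub_user}/{base}-{save_method}"
    return {
        "local": local,
        "hub": hub,
        "local-gguf": f"{local}-gguf-{quantization_method}",
        "hub-gguf": f"{hub}-gguf-{quantization_method}",
    }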

def save_model(
    model,
    tokenizer,
    include_gguf=True,
    include_merged=True,
    publish=True,
):
    try:
        token = os.getenv("HF_TOKEN") or None
        model_name = os.getenv("MODEL_NAME")

        save_method = "lora"
        quantization_method = "q5_k_m"

        model_names = get_model_names(
            model_name, save_method=save_method, quantization_method=quantization_method
        )

        model.save_pretrained(model_names["local"])
        tokenizer.save_pretrained(model_names["local"])

        if publish:
            model.push_to_hub(
                model_names["hub"],
                token=token,
            )
            tokenizer.push_to_hub(
                model_names["hub"],
                token=token,
            )

        if include_merged:
            model.save_pretrained_merged(
                model_names["local"] + "-merged", tokenizer, save_method=save_method
            )
            if publish:
                model.push_to_hub_merged(
                    model_names["hub"] + "-merged",
                    tokenizer,
                    save_method=save_method,
                    token=token,  # was token="", which would fail to authenticate
                )

        if include_gguf:
            model.save_pretrained_gguf(
                model_names["local-gguf"],
                tokenizer,
                quantization_method=quantization_method,
            )
            if publish:
                model.push_to_hub_gguf(
                    model_names["hub-gguf"],
                    tokenizer,
                    quantization_method=quantization_method,
                    token=token,
                )
    except Exception as e:
        print(e)
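
# Minimal end-to-end sketch (assumes the MODEL_NAME and HF_TOKEN environment
# variables are set; the prompt is illustrative only):
if __name__ == "__main__":
    model, tokenizer = load_model(os.getenv("MODEL_NAME"), load_in_4bit=True)
    test_model(model, tokenizer, "What is the capital of France?")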