import logging

import torch
from dotenv import load_dotenv
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline,
)

from langchain_huggingface import HuggingFacePipeline
from langchain.globals import set_debug, set_verbose

from config import HF_MODEL_ID, LLM_VERBOSE
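
# For reference, a minimal config.py satisfying these imports could look like
# the sketch below (an assumption; the real config module is not shown here):
#
#     HF_MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.2"  # hypothetical id
#     LLM_VERBOSE = False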

# Mirror the project's verbosity flag onto LangChain's global debug settings.
set_verbose(LLM_VERBOSE)
set_debug(LLM_VERBOSE)

logger = logging.getLogger(__name__)
load_dotenv()

# Report whether a CUDA device is available; 4-bit quantization depends on it.
cuda_check = torch.cuda.is_available()
logger.info(f"torch.cuda.is_available : {cuda_check}")
print(f"> torch.cuda.is_available : {cuda_check}")
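
# Optional hardening (a sketch, not in the original): bitsandbytes 4-bit
# loading needs a CUDA device, so the script could fail fast here instead of
# erroring later inside from_pretrained:
#
#     if not cuda_check:
#         raise RuntimeError("4-bit quantization requires a CUDA-capable GPU")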

model_id = HF_MODEL_ID

# Load the tokenizer that matches the configured model.
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Let accelerate place model layers across available devices.
device_map = "auto"
compute_dtype = torch.float16

# Quantize the model weights to 4-bit NF4, computing in float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map=device_map,
    quantization_config=bnb_config,
)

# Use the EOS token for padding to silence the "pad_token_id not set" warning.
model.generation_config.pad_token_id = tokenizer.eos_token_id

# A near-zero temperature with sampling enabled makes generation effectively
# greedy; return_full_text=False strips the prompt from the generated output.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=50,
    return_full_text=False,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    temperature=0.0001,
    do_sample=True,
)

# Expose the transformers pipeline as a LangChain-compatible LLM.
llm = HuggingFacePipeline(pipeline=pipe)
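
# A minimal usage sketch (an assumption, not part of the original module):
# HuggingFacePipeline implements LangChain's Runnable interface, so the
# wrapped model can be queried directly with .invoke().
if __name__ == "__main__":
    # Hypothetical prompt, for illustration only.
    answer = llm.invoke("Briefly explain 4-bit quantization.")
    print(answer)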