import logging

import torch
from dotenv import load_dotenv
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    pipeline,
)

from langchain_huggingface import HuggingFacePipeline
from langchain.globals import set_debug, set_verbose

from config import HF_MODEL_ID, LLM_VERBOSE

set_verbose(LLM_VERBOSE)
set_debug(LLM_VERBOSE)

logger = logging.getLogger(__name__)
load_dotenv()

cuda_check = torch.cuda.is_available()
logger.info(f"torch.cuda.is_available : {cuda_check}")
print(f"> torch.cuda.is_available : {cuda_check}")

model_id = HF_MODEL_ID

tokenizer = AutoTokenizer.from_pretrained(model_id)

device_map = "auto"
compute_dtype = torch.float16

# Load the model weights in float16 and let device_map="auto" place them.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map=device_map,
    torch_dtype=compute_dtype,
)

# Pad with the EOS token to avoid the "pad_token_id is not set" generation warning.
model.generation_config.pad_token_id = tokenizer.eos_token_id

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=50,
    return_full_text=False,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    temperature=0.0001,  # near-zero temperature keeps sampling effectively greedy
    do_sample=True,
)

llm = HuggingFacePipeline(pipeline=pipe)
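
# Example usage (illustrative sketch, not part of the original script):
# HuggingFacePipeline is a LangChain Runnable, so the wrapped model can be
# called directly with a prompt string, e.g.:
#
#     response = llm.invoke("Explain what a tokenizer does in one sentence.")
#     print(response)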