"""The below code is borrowed from: https://github.com/PromtEngineer/localGPT
The reason to use gguf/ggml models: https://huggingface.co/TheBloke/wizardLM-7B-GGML/discussions/3"""
import logging

import torch
from huggingface_hub import hf_hub_download, login
from langchain.llms import HuggingFacePipeline, LlamaCpp
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    GenerationConfig,
    LlamaForCausalLM,
    LlamaTokenizer,
    pipeline,
)

from toolkit.utils import Config

configs = Config("configparser.ini")
logger = logging.getLogger(__name__)
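
# The attribute accesses below imply that "configparser.ini" supplies at least
# the following settings. This sketch is inferred from usage in this module,
# not taken from the actual file; section name and values are assumptions:
#
#   [DEFAULT]
#   local_model_dir = ./models
#   max_llm_context = 4096
#   n_batch = 512
#   n_gpu_layers = 32
#   huggingface_token = hf_...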
def load_gguf_hf_model(
    model_id: str,
    model_basename: str,
    max_tokens: int,
    temperature: float,
    device_type: str,
):
    """
    Load a GGUF/GGML quantized model using LlamaCpp.

    This function attempts to load a GGUF/GGML quantized model with the LlamaCpp
    library. If the model is a GGML model and the installed version of LLAMA-CPP
    no longer supports GGML, a message is logged indicating that LLAMA-CPP has
    dropped GGML support.

    Parameters:
    - model_id (str): The identifier for the model on HuggingFace Hub.
    - model_basename (str): The base name of the model file.
    - max_tokens (int): The maximum number of tokens to generate in the completion.
    - temperature (float): The sampling temperature of the LLM.
    - device_type (str): The type of device where the model will run, e.g., 'mps', 'cuda', etc.

    Returns:
    - LlamaCpp: An instance of the LlamaCpp model if successful, otherwise None.

    Notes:
    - The function uses `hf_hub_download` to download the model from the HuggingFace Hub.
    - The number of GPU layers is set based on the device type.
    """
    try:
        logger.info("Using LlamaCpp for GGUF/GGML quantized models")
        model_path = hf_hub_download(
            repo_id=model_id,
            filename=model_basename,
            resume_download=True,
            cache_dir=configs.local_model_dir,
        )
        kwargs = {
            "model_path": model_path,
            "n_ctx": configs.max_llm_context,
            "max_tokens": max_tokens,
            "temperature": temperature,
            "n_batch": configs.n_batch,  # set this based on your GPU & CPU RAM
            "verbose": False,
        }
        if device_type.lower() == "mps":
            kwargs["n_gpu_layers"] = 1
        if device_type.lower() == "cuda":
            kwargs["n_gpu_layers"] = configs.n_gpu_layers  # set this based on your GPU
        return LlamaCpp(**kwargs)
    except Exception:
        # A bare `except:` would also swallow KeyboardInterrupt and SystemExit;
        # catching Exception is the safer idiom here.
        if "ggml" in model_basename:
            logger.info(
                "If you were using a GGML model: LLAMA-CPP dropped GGML support, use GGUF instead"
            )
        return None
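
# A minimal usage sketch for load_gguf_hf_model. The repo_id and filename below
# are illustrative assumptions (any GGUF repo on the HuggingFace Hub would do),
# not values taken from this project's config:
#
#   llm = load_gguf_hf_model(
#       model_id="TheBloke/Llama-2-7B-Chat-GGUF",      # assumed example repo
#       model_basename="llama-2-7b-chat.Q4_K_M.gguf",  # assumed example file
#       max_tokens=512,
#       temperature=0.2,
#       device_type="cuda",
#   )
#   if llm is not None:
#       print(llm("Q: Why quantize a model? A:"))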
def load_full_hf_model(model_id: str, model_basename: str, device_type: str):
    """
    Load a full (unquantized) model using either LlamaForCausalLM or AutoModelForCausalLM.

    This function loads a full model based on the specified device type.
    If the device type is 'mps' or 'cpu', it uses LlamaTokenizer and LlamaForCausalLM.
    Otherwise, it uses AutoTokenizer and AutoModelForCausalLM.

    Parameters:
    - model_id (str): The identifier for the model on HuggingFace Hub.
    - model_basename (str): The base name of the model file (unused for full models).
    - device_type (str): The type of device where the model will run.

    Returns:
    - model (Union[LlamaForCausalLM, AutoModelForCausalLM]): The loaded model.
    - tokenizer (Union[LlamaTokenizer, AutoTokenizer]): The tokenizer associated with the model.

    Notes:
    - The function uses the `from_pretrained` method to load both the model and the tokenizer.
    - Additional settings are available for NVIDIA GPUs, such as loading in 4-bit and setting the compute dtype.
    """
    if "meta-llama" in model_id.lower():
        login(token=configs.huggingface_token)

    if device_type.lower() in ["mps", "cpu"]:
        logger.info("Using LlamaTokenizer")
        tokenizer = LlamaTokenizer.from_pretrained(
            model_id,
            cache_dir=configs.local_model_dir,
        )
        model = LlamaForCausalLM.from_pretrained(
            model_id,
            cache_dir=configs.local_model_dir,
        )
    else:
        logger.info("Using AutoModelForCausalLM for full models")
        tokenizer = AutoTokenizer.from_pretrained(
            model_id, cache_dir=configs.local_model_dir
        )
        logger.info("Tokenizer loaded")
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            device_map="auto",
            torch_dtype=torch.float16,
            low_cpu_mem_usage=True,
            cache_dir=configs.local_model_dir,
            # trust_remote_code=True,  # set these if you are using an NVIDIA GPU
            # load_in_4bit=True,
            # bnb_4bit_quant_type="nf4",
            # bnb_4bit_compute_dtype=torch.float16,
            # max_memory={0: "15GB"},  # uncomment this line if you encounter CUDA out-of-memory errors
        )
        model.tie_weights()
    return model, tokenizer
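
# A minimal usage sketch for load_full_hf_model. The model_id is an
# illustrative assumption; gated repos such as meta-llama/* additionally
# require a valid huggingface_token in the config:
#
#   model, tokenizer = load_full_hf_model(
#       model_id="meta-llama/Llama-2-7b-chat-hf",  # assumed example repo
#       model_basename=None,                       # ignored for full models
#       device_type="cuda",
#   )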
def load_local_llm(
    model_id: str,
    model_basename: str,
    temperature: float,
    max_tokens: int,
    device_type: str,
):
    """
    Select a model for text generation using the HuggingFace library.

    If you are running this for the first time, it will download the model for you;
    subsequent runs will load the model from disk.

    Args:
        model_id (str): Identifier of the model to load from HuggingFace's model hub.
        model_basename (str): Basename of the model file if using a quantized model,
            or the string "none" for full models.
        temperature (float): The sampling temperature of the LLM.
        max_tokens (int): The maximum number of tokens to generate.
        device_type (str): Type of device to use, e.g., "cuda" for GPU or "cpu" for CPU.

    Returns:
        HuggingFacePipeline: A pipeline object for text generation using the loaded model
        (or a LlamaCpp instance for GGUF/GGML quantized models).
    """
    logger.info(f"Loading Model: {model_id}, on: {device_type}")
    logger.info("This action can take a few minutes!")

    if model_basename.lower() != "none":
        if ".gguf" in model_basename.lower():
            llm = load_gguf_hf_model(
                model_id, model_basename, max_tokens, temperature, device_type
            )
            return llm

    model, tokenizer = load_full_hf_model(model_id, None, device_type)

    # Load the generation configuration from the model to avoid warnings; see:
    # https://huggingface.co/docs/transformers/
    # main_classes/text_generation#transformers.GenerationConfig.from_pretrained.returns
    generation_config = GenerationConfig.from_pretrained(model_id)

    # Create a pipeline for text generation. Note that `max_length` counts the
    # prompt plus the generated tokens.
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_length=max_tokens,
        temperature=temperature,
        # top_p=0.95,
        repetition_penalty=1.15,
        generation_config=generation_config,
    )
    local_llm = HuggingFacePipeline(pipeline=pipe)
    logger.info("Local LLM Loaded")
    return local_llm
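
# A minimal end-to-end usage sketch. Model names are illustrative assumptions;
# in practice these values would come from configparser.ini or the caller:
#
#   if __name__ == "__main__":
#       llm = load_local_llm(
#           model_id="TheBloke/Llama-2-7B-Chat-GGUF",      # assumed example
#           model_basename="llama-2-7b-chat.Q4_K_M.gguf",  # "none" for full models
#           temperature=0.2,
#           max_tokens=512,
#           device_type="cuda",
#       )
#       print(llm("Explain GGUF quantization in one sentence."))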