"""The below code is borrowed from: https://github.com/PromtEngineer/localGPT
The reason to use gguf/ggml models: https://huggingface.co/TheBloke/wizardLM-7B-GGML/discussions/3"""
import logging
import torch
from huggingface_hub import hf_hub_download
from huggingface_hub import login
from langchain.llms import LlamaCpp, HuggingFacePipeline
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    LlamaForCausalLM,
    LlamaTokenizer,
    GenerationConfig,
    pipeline,
)
from toolkit.utils import Config


configs = Config("configparser.ini")
logger = logging.getLogger(__name__)


def load_gguf_hf_model(
    model_id: str,
    model_basename: str,
    max_tokens: int,
    temperature: float,
    device_type: str,
):
    """
    Load a GGUF/GGML quantized model using LlamaCpp.

    This function attempts to load a GGUF/GGML quantized model using the LlamaCpp library.
    If loading fails and the model is a GGML file, it logs a message indicating that newer
    versions of llama-cpp have dropped GGML support and that a GGUF model should be used instead.

    Parameters:
    - model_id (str): The identifier for the model on HuggingFace Hub.
    - model_basename (str): The base name of the model file.
    - max_tokens (int): The maximum number of tokens to generate in the completion.
    - temperature (float): The sampling temperature for the LLM.
    - device_type (str): The type of device where the model will run, e.g., 'mps', 'cuda', etc.

    Returns:
    - LlamaCpp: An instance of the LlamaCpp model if successful, otherwise None.

    Notes:
    - The function uses the `hf_hub_download` function to download the model from the HuggingFace Hub.
    - The number of GPU layers is set based on the device type.
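
    Example (illustrative sketch only; the repo id and filename below are placeholder
    values, not defaults shipped with this project):

        llm = load_gguf_hf_model(
            model_id="TheBloke/Llama-2-7B-Chat-GGUF",
            model_basename="llama-2-7b-chat.Q4_K_M.gguf",
            max_tokens=512,
            temperature=0.1,
            device_type="cpu",
        )
        if llm is not None:
            print(llm("What does GGUF stand for?"))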
    """

    try:
        logger.info("Using Llamacpp for GGUF/GGML quantized models")
        model_path = hf_hub_download(
            repo_id=model_id,
            filename=model_basename,
            resume_download=True,
            cache_dir=configs.local_model_dir,
        )
        kwargs = {
            "model_path": model_path,
            "n_ctx": configs.max_llm_context,
            "max_tokens": max_tokens,
            "temperature": temperature,
            "n_batch": configs.n_batch,  # set this based on your GPU & CPU RAM
            "verbose": False,
        }
        if device_type.lower() == "mps":
            kwargs["n_gpu_layers"] = 1
        if device_type.lower() == "cuda":
            kwargs["n_gpu_layers"] = configs.n_gpu_layers  # set this based on your GPU

        return LlamaCpp(**kwargs)
    except Exception as err:
        logger.error(f"Failed to load GGUF/GGML model: {err}")
        if "ggml" in model_basename.lower():
            logger.info(
                "llama-cpp has dropped support for GGML models; use a GGUF model instead"
            )
        return None


def load_full_hf_model(model_id: str, model_basename: str, device_type: str):
    """
    Load a full (unquantized) HuggingFace model and its tokenizer.

    This function loads a full model based on the specified device type.
    If the device type is 'mps' or 'cpu', it uses LlamaTokenizer and LlamaForCausalLM;
    otherwise it uses AutoTokenizer and AutoModelForCausalLM.

    Parameters:
    - model_id (str): The identifier for the model on HuggingFace Hub.
    - model_basename (str): The base name of the model file (unused when loading a full model).
    - device_type (str): The type of device where the model will run.

    Returns:
    - model (Union[LlamaForCausalLM, AutoModelForCausalLM]): The loaded model.
    - tokenizer (Union[LlamaTokenizer, AutoTokenizer]): The tokenizer associated with the model.

    Notes:
    - The function uses the `from_pretrained` method to load both the model and the tokenizer.
    - Additional settings are provided for NVIDIA GPUs, such as loading in 4-bit and setting the compute dtype.
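
    Example (illustrative sketch only; the model id below is a placeholder, and
    `model_basename` is unused for full models, so None is passed):

        model, tokenizer = load_full_hf_model(
            model_id="meta-llama/Llama-2-7b-chat-hf",
            model_basename=None,
            device_type="cuda",
        )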
    """
    if "meta-llama" in model_id.lower():
        login(token=configs.huggingface_token)

    if device_type.lower() in ["mps", "cpu"]:
        logger.info("Using LlamaTokenizer")
        tokenizer = LlamaTokenizer.from_pretrained(
            model_id,
            cache_dir=configs.local_model_dir,
        )
        model = LlamaForCausalLM.from_pretrained(
            model_id,
            cache_dir=configs.local_model_dir,
        )
    else:
        logger.info("Using AutoModelForCausalLM for full models")
        tokenizer = AutoTokenizer.from_pretrained(
            model_id, cache_dir=configs.local_model_dir
        )
        logger.info("Tokenizer loaded")
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            device_map="auto",
            torch_dtype=torch.float16,
            low_cpu_mem_usage=True,
            cache_dir=configs.local_model_dir,
            # Optional settings for NVIDIA GPUs (4-bit loading requires bitsandbytes):
            # trust_remote_code=True,
            # load_in_4bit=True,
            # bnb_4bit_quant_type="nf4",
            # bnb_4bit_compute_dtype=torch.float16,
            # max_memory={0: "15GB"},  # uncomment if you encounter CUDA out-of-memory errors
        )
        model.tie_weights()
    return model, tokenizer


def load_local_llm(
    model_id: str,
    model_basename: str,
    temperature: float,
    max_tokens: int,
    device_type: str,
):
    """
    Select a model for text generation using the HuggingFace library.
    If you are running this for the first time, it will download a model for you.
    subsequent runs will use the model from the disk.

    Args:
        device_type (str): Type of device to use, e.g., "cuda" for GPU or "cpu" for CPU.
        model_id (str): Identifier of the model to load from HuggingFace's model hub.
        model_basename (str, optional): Basename of the model if using quantized models.
            Defaults to None.

    Returns:
        HuggingFacePipeline: A pipeline object for text generation using the loaded model.

    Raises:
        ValueError: If an unsupported model or device type is provided.
    """
    logger.info(f"Loading Model: {model_id}, on: {device_type}")
    logger.info("This action can take a few minutes!")

    if model_basename.lower() != "none":
        basename = model_basename.lower()
        if ".gguf" in basename or ".ggml" in basename:
            llm = load_gguf_hf_model(
                model_id, model_basename, max_tokens, temperature, device_type
            )
            return llm

    model, tokenizer = load_full_hf_model(model_id, None, device_type)
    # Load configuration from the model to avoid warnings
    generation_config = GenerationConfig.from_pretrained(model_id)
    # see here for details:
    # https://huggingface.co/docs/transformers/
    # main_classes/text_generation#transformers.GenerationConfig.from_pretrained.returns

    # Create a pipeline for text generation
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_length=max_tokens,
        temperature=temperature,
        # top_p=0.95,
        repetition_penalty=1.15,
        generation_config=generation_config,
    )
    local_llm = HuggingFacePipeline(pipeline=pipe)
    logger.info("Local LLM Loaded")

    return local_llm
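

if __name__ == "__main__":
    # Minimal usage sketch (illustrative only): the model id and basename below are
    # placeholder values, not project defaults; adjust them and device_type for your setup.
    logging.basicConfig(level=logging.INFO)
    llm = load_local_llm(
        model_id="TheBloke/Llama-2-7B-Chat-GGUF",
        model_basename="llama-2-7b-chat.Q4_K_M.gguf",
        temperature=0.1,
        max_tokens=512,
        device_type="cpu",
    )
    if llm is not None:
        print(llm("Explain what a quantized LLM is in one sentence."))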