from typing import Any, Dict, Iterator, List, Optional

from langchain_core.callbacks import CallbackManagerForLLMRun
from langchain_core.language_models import LLM
from langchain_core.outputs import GenerationChunk
from langchain_core.pydantic_v1 import Field, root_validator


class ExLlamaV2(LLM):
    """ExllamaV2 API.

    - working only with GPTQ models for now.
    - Lora models are not supported yet.
    To use, you should have the exllamav2 library installed, and provide the
    path to the Llama model as a named parameter to the constructor.
    Check out: https://github.com/turboderp/exllamav2

    Example:
        .. code-block:: python

            from langchain_community.llms import Exllamav2

            llm = Exllamav2(model_path="/path/to/llama/model")
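
    Since custom sampling settings are not supported yet, the constructor
    expects a pre-built ``exllamav2`` sampler settings object. A minimal
    sketch, assuming the ``ExLlamaV2Sampler.Settings`` API from the
    ``exllamav2`` package:

        .. code-block:: python

            from exllamav2.generator import ExLlamaV2Sampler

            settings = ExLlamaV2Sampler.Settings()
            settings.temperature = 0.85
            settings.top_p = 0.8

            llm = Exllamav2(
                model_path="/path/to/llama/model",
                settings=settings,
            )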

    #TODO:
        - Add loras support
        - Add support for custom settings
        - Add support for custom stop sequences
    """

    client: Any
    model_path: str
    exllama_cache: Any = None
    config: Any = None
    generator: Any = None
    tokenizer: Any = None
    # Sampler settings for the model. A pre-built exllamav2 sampler settings
    # object is required; building settings from the individual parameters is
    # not supported yet (see validate_environment).
    settings: Any = None

    # Langchain parameters
    logfunc = print

    stop_sequences: List[str] = Field([])
    """Sequences that will immediately stop the generator."""

    max_new_tokens: int = Field(150)
    """Maximum number of tokens to generate."""

    streaming: bool = Field(True)
    """Whether to stream the results, token by token."""

    verbose: bool = Field(True)
    """Whether to print debug information."""

    # Generator parameters
    disallowed_tokens: List[int] = Field(None)
    """List of tokens to disallow during generation."""

    @root_validator()
    def validate_environment(cls, values: Dict[str, Any]) -> Dict[str, Any]:
        """Validate the environment and load the exllamav2 model."""
        try:
            import torch
        except ImportError as e:
            raise ImportError(
                "Unable to import torch, please install with `pip install torch`."
            ) from e

        # check if cuda is available
        if not torch.cuda.is_available():
            raise EnvironmentError("CUDA is not available. ExllamaV2 requires CUDA.")
        try:
            from exllamav2 import (
                ExLlamaV2,
                ExLlamaV2Cache,
                ExLlamaV2Config,
                ExLlamaV2Tokenizer,
            )
            from exllamav2.generator import (
                ExLlamaV2BaseGenerator,
                ExLlamaV2StreamingGenerator,
            )
        except ImportError as e:
            raise ImportError(
                "Could not import exllamav2 library. "
                "Please install the exllamav2 library (CUDA 12.1 is required), "
                "for example: "
                "`python -m pip install https://github.com/turboderp/exllamav2/releases/download/v0.0.12/exllamav2-0.0.12+cu121-cp311-cp311-linux_x86_64.whl`"
            ) from e

        # Use the logging function only when verbose; otherwise replace it
        # with a no-op lambda.
        verbose = values["verbose"]
        if not verbose:
            values["logfunc"] = lambda *args, **kwargs: None
        logfunc = values["logfunc"]

        if values["settings"]:
            settings = values["settings"]
            logfunc(settings.__dict__)
        else:
            raise NotImplementedError(
                "settings is required. Custom settings are not supported yet."
            )

        config = ExLlamaV2Config()
        config.model_dir = values["model_path"]
        config.prepare()

        model = ExLlamaV2(config)
        exllama_cache = ExLlamaV2Cache(model, lazy=True)
        model.load_autosplit(exllama_cache)

        tokenizer = ExLlamaV2Tokenizer(config)
        if values["streaming"]:
            generator = ExLlamaV2StreamingGenerator(model, exllama_cache, tokenizer)
        else:
            generator = ExLlamaV2BaseGenerator(model, exllama_cache, tokenizer)

        # Configure the model and generator
        values["stop_sequences"] = [
            x.strip().lower() for x in values["stop_sequences"]
        ]
        setattr(settings, "stop_sequences", values["stop_sequences"])
        logfunc(f"stop_sequences {values['stop_sequences']}")

        disallowed = values.get("disallowed_tokens")
        if disallowed:
            settings.disallow_tokens(tokenizer, disallowed)

        values["client"] = model
        values["generator"] = generator
        values["config"] = config
        values["tokenizer"] = tokenizer
        values["exllama_cache"] = exllama_cache

        return values

    @property
    def _llm_type(self) -> str:
        """Return type of llm."""
        return "ExLlamaV2"

    def get_num_tokens(self, text: str) -> int:
        """Get the number of tokens present in the text."""
        return self.generator.tokenizer.num_tokens(text)

    def _call(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> str:
        generator = self.generator

        if self.streaming:
            combined_text_output = ""
            for chunk in self._stream(
                prompt=prompt, stop=stop, run_manager=run_manager, **kwargs
            ):
                combined_text_output += chunk.text
            return combined_text_output
        else:
            output = generator.generate_simple(
                prompt=prompt,
                gen_settings=self.settings,
                num_tokens=self.max_new_tokens,
            )
            # strip the echoed prompt from the returned text
            output = output[len(prompt) :]
            return output

    def _stream(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> Iterator[GenerationChunk]:
        input_ids = self.tokenizer.encode(prompt)
        self.generator.warmup()
        self.generator.set_stop_conditions([])
        self.generator.begin_stream(input_ids, self.settings)

        generated_tokens = 0
        while True:
            chunk, eos, _ = self.generator.stream()
            generated_tokens += 1
            # Wrap the raw text chunk so the yielded value matches the
            # Iterator[GenerationChunk] return annotation.
            generation_chunk = GenerationChunk(text=chunk)
            if run_manager:
                run_manager.on_llm_new_token(
                    token=chunk,
                    verbose=self.verbose,
                )
            yield generation_chunk
            if eos or generated_tokens == self.max_new_tokens:
                break
        return
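

# A minimal usage sketch, not part of the wrapper itself: the model path is a
# placeholder, the sampler values are illustrative, and it assumes exllamav2's
# ``ExLlamaV2Sampler.Settings`` API plus a CUDA-capable GPU with a GPTQ model
# on disk.
if __name__ == "__main__":
    from exllamav2.generator import ExLlamaV2Sampler

    from langchain_core.callbacks import StreamingStdOutCallbackHandler

    # The wrapper requires a pre-built sampler settings object (custom settings
    # built from individual parameters are not supported yet).
    sampler_settings = ExLlamaV2Sampler.Settings()
    sampler_settings.temperature = 0.85
    sampler_settings.top_k = 50
    sampler_settings.top_p = 0.8

    llm = ExLlamaV2(
        model_path="/path/to/llama/model",  # placeholder path
        settings=sampler_settings,
        streaming=True,
        max_new_tokens=150,
        callbacks=[StreamingStdOutCallbackHandler()],  # prints tokens as they stream
        verbose=True,
    )

    print(llm.invoke("What is the capital of France?"))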