from typing import Any, Dict, Iterator, List, Optional

from langchain_core.callbacks import CallbackManagerForLLMRun
from langchain_core.language_models import LLM
from langchain_core.outputs import GenerationChunk
from langchain_core.pydantic_v1 import Field, root_validator


class ExLlamaV2(LLM):
    """ExllamaV2 API.

    - working only with GPTQ models for now.
    - Lora models are not supported yet.
    To use, you should have the exllamav2 library installed, and provide the
    path to the Llama model as a named parameter to the constructor.
    Check out: https://github.com/turboderp/exllamav2

    Example:
        .. code-block:: python

            from langchain_community.llms import Exllamav2

            llm = Exllamav2(model_path="/path/to/llama/model")
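
    Since custom sampling settings are not supported yet, the constructor
    expects a pre-built ``exllamav2`` sampler settings object. A minimal
    sketch, assuming the ``ExLlamaV2Sampler.Settings`` API from the
    ``exllamav2`` package:

        .. code-block:: python

            from exllamav2.generator import ExLlamaV2Sampler

            settings = ExLlamaV2Sampler.Settings()
            settings.temperature = 0.85
            settings.top_p = 0.8

            llm = Exllamav2(
                model_path="/path/to/llama/model",
                settings=settings,
            )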

    #TODO:
        - Add loras support
        - Add support for custom settings
        - Add support for custom stop sequences
    """

    client: Any
    model_path: str
    exllama_cache: Any = None
    config: Any = None
    generator: Any = None
    tokenizer: Any = None
    # Sampler settings for the model. A pre-built exllamav2 sampler settings
    # object is required; building settings from the individual parameters is
    # not supported yet (see validate_environment).
    settings: Any = None

    # Langchain parameters
    logfunc = print

    stop_sequences: List[str] = Field([])
    """Sequences that will immediately stop the generator."""

    max_new_tokens: int = Field(150)
    """Maximum number of tokens to generate."""

    streaming: bool = Field(True)
    """Whether to stream the results, token by token."""

    verbose: bool = Field(True)
    """Whether to print debug information."""

    # Generator parameters
    disallowed_tokens: List[int] = Field(None)
    """List of tokens to disallow during generation."""

    @root_validator()
    def validate_environment(cls, values: Dict[str, Any]) -> Dict[str, Any]:
        """Validate the environment and load the exllamav2 model."""
        try:
            import torch
        except ImportError as e:
            raise ImportError(
                "Unable to import torch, please install with `pip install torch`."
            ) from e

        # check if cuda is available
        if not torch.cuda.is_available():
            raise EnvironmentError("CUDA is not available. ExllamaV2 requires CUDA.")
        try:
            from exllamav2 import (
                ExLlamaV2,
                ExLlamaV2Cache,
                ExLlamaV2Config,
                ExLlamaV2Tokenizer,
            )
            from exllamav2.generator import (
                ExLlamaV2BaseGenerator,
                ExLlamaV2StreamingGenerator,
            )
        except ImportError as e:
            raise ImportError(
                "Could not import exllamav2 library. "
                "Please install the exllamav2 library (CUDA 12.1 is required), "
                "for example: "
                "`python -m pip install https://github.com/turboderp/exllamav2/releases/download/v0.0.12/exllamav2-0.0.12+cu121-cp311-cp311-linux_x86_64.whl`"
            ) from e

        # Use the logging function only when verbose; otherwise replace it
        # with a no-op lambda.
        verbose = values["verbose"]
        if not verbose:
            values["logfunc"] = lambda *args, **kwargs: None
        logfunc = values["logfunc"]

        if values["settings"]:
            settings = values["settings"]
            logfunc(settings.__dict__)
        else:
            raise NotImplementedError(
                "settings is required. Custom settings are not supported yet."
            )

        config = ExLlamaV2Config()
        config.model_dir = values["model_path"]
        config.prepare()

        model = ExLlamaV2(config)
        exllama_cache = ExLlamaV2Cache(model, lazy=True)
        model.load_autosplit(exllama_cache)

        tokenizer = ExLlamaV2Tokenizer(config)
        if values["streaming"]:
            generator = ExLlamaV2StreamingGenerator(model, exllama_cache, tokenizer)
        else:
            generator = ExLlamaV2BaseGenerator(model, exllama_cache, tokenizer)

        # Configure the model and generator
        values["stop_sequences"] = [
            x.strip().lower() for x in values["stop_sequences"]
        ]
        setattr(settings, "stop_sequences", values["stop_sequences"])
        logfunc(f"stop_sequences {values['stop_sequences']}")

        disallowed = values.get("disallowed_tokens")
        if disallowed:
            settings.disallow_tokens(tokenizer, disallowed)

        values["client"] = model
        values["generator"] = generator
        values["config"] = config
        values["tokenizer"] = tokenizer
        values["exllama_cache"] = exllama_cache

        return values

    @property
    def _llm_type(self) -> str:
        """Return type of llm."""
        return "ExLlamaV2"

    def get_num_tokens(self, text: str) -> int:
        """Get the number of tokens present in the text."""
        return self.generator.tokenizer.num_tokens(text)

    def _call(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> str:
        generator = self.generator

        if self.streaming:
            combined_text_output = ""
            for chunk in self._stream(
                prompt=prompt, stop=stop, run_manager=run_manager, **kwargs
            ):
                combined_text_output += chunk.text
            return combined_text_output
        else:
            output = generator.generate_simple(
                prompt=prompt,
                gen_settings=self.settings,
                num_tokens=self.max_new_tokens,
            )
            # strip the echoed prompt from the returned text
            output = output[len(prompt) :]
            return output

    def _stream(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> Iterator[GenerationChunk]:
        input_ids = self.tokenizer.encode(prompt)
        self.generator.warmup()
        self.generator.set_stop_conditions([])
        self.generator.begin_stream(input_ids, self.settings)

        generated_tokens = 0
        while True:
            chunk, eos, _ = self.generator.stream()
            generated_tokens += 1
            # Wrap the raw text chunk so the yielded value matches the
            # Iterator[GenerationChunk] return annotation.
            generation_chunk = GenerationChunk(text=chunk)
            if run_manager:
                run_manager.on_llm_new_token(
                    token=chunk,
                    verbose=self.verbose,
                )
            yield generation_chunk
            if eos or generated_tokens == self.max_new_tokens:
                break
        return
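

# A minimal usage sketch, not part of the wrapper itself: the model path is a
# placeholder, the sampler values are illustrative, and it assumes exllamav2's
# ``ExLlamaV2Sampler.Settings`` API plus a CUDA-capable GPU with a GPTQ model
# on disk.
if __name__ == "__main__":
    from exllamav2.generator import ExLlamaV2Sampler

    from langchain_core.callbacks import StreamingStdOutCallbackHandler

    # The wrapper requires a pre-built sampler settings object (custom settings
    # built from individual parameters are not supported yet).
    sampler_settings = ExLlamaV2Sampler.Settings()
    sampler_settings.temperature = 0.85
    sampler_settings.top_k = 50
    sampler_settings.top_p = 0.8

    llm = ExLlamaV2(
        model_path="/path/to/llama/model",  # placeholder path
        settings=sampler_settings,
        streaming=True,
        max_new_tokens=150,
        callbacks=[StreamingStdOutCallbackHandler()],  # prints tokens as they stream
        verbose=True,
    )

    print(llm.invoke("What is the capital of France?"))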