Spaces:
Running
Running
disable use_fast tokenizing
Browse files
app.py
CHANGED
@@ -41,16 +41,19 @@ st.write('loading chunks into vector db')
|
|
41 |
model_name = "hkunlp/instructor-large"
|
42 |
hf_embeddings = HuggingFaceInstructEmbeddings(
|
43 |
model_name = model_name)
|
44 |
-
db = Chroma.from_documents(texts, hf_embeddings)
|
45 |
|
46 |
-
st.write('loading LLM')
|
47 |
#model_name_or_path = "TheBloke/Llama-2-13B-chat-GPTQ"
|
48 |
model_name_or_path = "TheBloke/Llama-2-13B-chat-GGUF"
|
|
|
49 |
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)
|
50 |
|
51 |
model_basename = "model"
|
|
|
52 |
use_triton = False
|
53 |
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
|
|
|
54 |
model = AutoGPTQForCausalLM.from_quantized(model_name_or_path,
|
55 |
model_basename=model_basename,
|
56 |
use_safetensors=True,
|
|
|
41 |
model_name = "hkunlp/instructor-large"
|
42 |
hf_embeddings = HuggingFaceInstructEmbeddings(
|
43 |
model_name = model_name)
|
44 |
+
# db = Chroma.from_documents(texts, hf_embeddings)
|
45 |
|
46 |
+
st.write('loading tokenizer')
|
47 |
#model_name_or_path = "TheBloke/Llama-2-13B-chat-GPTQ"
|
48 |
model_name_or_path = "TheBloke/Llama-2-13B-chat-GGUF"
|
49 |
+
#tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)
|
50 |
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)
|
51 |
|
52 |
model_basename = "model"
|
53 |
+
|
54 |
use_triton = False
|
55 |
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
|
56 |
+
st.write('loading LLM')
|
57 |
model = AutoGPTQForCausalLM.from_quantized(model_name_or_path,
|
58 |
model_basename=model_basename,
|
59 |
use_safetensors=True,
|