Mattral committed a7f3e27 (verified) · Parent: 694b129

Update app.py

Files changed (1): app.py (+12, -1)
app.py CHANGED
@@ -15,7 +15,7 @@ print("Embedding model loaded...")
 
 # Load the LLM
 callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])
-
+'''
 llm = AutoModelForCausalLM.from_pretrained(
     "TheBloke/Llama-2-7B-Chat-GGUF",
     model_file="llama-2-7b-chat.Q3_K_S.gguf",
@@ -24,6 +24,17 @@ llm = AutoModelForCausalLM.from_pretrained(
     repetition_penalty=1.5,
     max_new_tokens=300,
 )
+'''
+llm = LlamaCpp(
+    model_path="./llama-2-7b-chat.Q3_K_S.gguf",
+    temperature=0.2,
+    n_ctx=2048,
+    f16_kv=True,  # MUST be True, otherwise you will run into problems after a couple of calls
+    max_tokens=500,
+    callback_manager=callback_manager,
+    verbose=True,
+)
+
 print("LLM loaded...")
 
 client = QdrantClient(path="./db")
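
Note (not part of the commit): the replaced ctransformers call (AutoModelForCausalLM.from_pretrained) downloaded the GGUF weights from the Hub on demand, whereas LlamaCpp's model_path expects ./llama-2-7b-chat.Q3_K_S.gguf to already exist on disk. A minimal sketch of how the file could be fetched first, assuming huggingface_hub is installed and that LlamaCpp here is LangChain's llama-cpp-python wrapper (imported from langchain_community.llms in recent releases, langchain.llms in older ones):

# Sketch under the assumptions above; parameter values mirror the commit,
# the download step and the smoke test are additions for illustration.
from huggingface_hub import hf_hub_download
from langchain_community.llms import LlamaCpp

# Fetch the quantized GGUF file that LlamaCpp expects on local disk.
model_path = hf_hub_download(
    repo_id="TheBloke/Llama-2-7B-Chat-GGUF",
    filename="llama-2-7b-chat.Q3_K_S.gguf",
)

llm = LlamaCpp(
    model_path=model_path,
    temperature=0.2,
    n_ctx=2048,    # context window, as in the commit
    f16_kv=True,   # fp16 KV cache, per the commit's own comment
    max_tokens=500,
)

# Smoke test: recent LangChain LLMs expose invoke() for a single completion.
print(llm.invoke("Say hello in one short sentence."))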