Update app.py
app.py
CHANGED
@@ -7,8 +7,8 @@ from llama_cpp import Llama
 st.set_page_config(page_title="Phi-3 Mini Chatbot", layout="centered")
 
 # ✅ Define model path
-MODEL_PATH = "./Phi-3-mini-4k-instruct-
-MODEL_URL = "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf/resolve/main/Phi-3-mini-4k-instruct-
+MODEL_PATH = "./Phi-3-mini-4k-instruct-q3.gguf"
+MODEL_URL = "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf/resolve/main/Phi-3-mini-4k-instruct-q3.gguf"
 
 # ✅ Check if model exists, otherwise download
 if not os.path.exists(MODEL_PATH):
@@ -29,10 +29,10 @@ try:
     if "model" not in st.session_state:
         st.session_state["model"] = Llama(
             model_path=MODEL_PATH,
-            n_ctx=
+            n_ctx=256, # ✅ Lower memory usage, speeds up responses
             n_threads=2, # Matches available vCPUs
             numa=True,
-            n_batch=
+            n_batch=64 # ✅ Faster token processing
         )
         st.write("✅ Model loaded successfully!")
 except Exception as e:
@@ -65,11 +65,11 @@ if st.button("Send") and user_input:
     # ✅ Use a minimal prompt format (no system message)
     formatted_messages = [{"role": "user", "content": user_input}]
 
-    # ✅
+    # ✅ Speed improvements: Reduce response length & force short answers
     response_data = st.session_state["model"].create_chat_completion(
         messages=formatted_messages,
-        max_tokens=
-        stream=False #
+        max_tokens=64, temperature=0.5, top_p=0.8,
+        stream=False # ✅ No streaming for debugging
     )
 
     # ✅ Debugging output
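For context, here is a minimal, self-contained sketch of roughly what the updated app.py does after this commit. Only the model path/URL, the Llama() arguments, and the create_chat_completion() parameters come from the diff above; the download step, the input widget, and the response handling are assumptions, since those parts of the file fall outside these hunks.

# Hedged sketch of the updated app.py, not the exact Space code.
import os
import urllib.request

import streamlit as st
from llama_cpp import Llama

st.set_page_config(page_title="Phi-3 Mini Chatbot", layout="centered")

MODEL_PATH = "./Phi-3-mini-4k-instruct-q3.gguf"
MODEL_URL = "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf/resolve/main/Phi-3-mini-4k-instruct-q3.gguf"

# Download the GGUF file on first run (the Space's actual download code is not
# shown in these hunks; urlretrieve is just one straightforward way to do it).
if not os.path.exists(MODEL_PATH):
    with st.spinner("Downloading model..."):
        urllib.request.urlretrieve(MODEL_URL, MODEL_PATH)

try:
    # Load the model once and cache it in the Streamlit session.
    if "model" not in st.session_state:
        st.session_state["model"] = Llama(
            model_path=MODEL_PATH,
            n_ctx=256,    # small context window -> lower memory use
            n_threads=2,  # matches the Space's 2 vCPUs
            numa=True,
            n_batch=64,   # batch size for prompt processing
        )
        st.write("Model loaded successfully!")
except Exception as e:
    st.error(f"Failed to load model: {e}")
    st.stop()

user_input = st.text_input("Your message")  # assumed widget

if st.button("Send") and user_input:
    # Minimal prompt format: a single user message, no system prompt.
    formatted_messages = [{"role": "user", "content": user_input}]

    response_data = st.session_state["model"].create_chat_completion(
        messages=formatted_messages,
        max_tokens=64, temperature=0.5, top_p=0.8,
        stream=False,  # one complete response object, easier to debug
    )

    # With stream=False the result is a single OpenAI-style dict.
    st.write(response_data["choices"][0]["message"]["content"])

With stream=False, llama-cpp-python returns the whole completion as one OpenAI-style dict, so the reply text is read from response_data["choices"][0]["message"]["content"]; switching to stream=True would instead yield an iterator of partial chunks.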