Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Update app.py
Browse files
app.py
CHANGED
@@ -25,12 +25,18 @@ WRITE_ACCESS_TOKEN = st.secrets["Llama_3_1"]
|
|
25 |
def get_rag_answer(query, top_results):
|
26 |
"""
|
27 |
Constructs a prompt from the query and the page contexts of the top results,
|
28 |
-
|
|
|
29 |
"""
|
30 |
-
# Combine the context from the top results (
|
31 |
context = "\n\n".join([res.payload["page_content"] for res in top_results])
|
32 |
|
33 |
-
#
|
|
|
|
|
|
|
|
|
|
|
34 |
prompt = (
|
35 |
"Using the following context, answer the question concisely. "
|
36 |
"Only output the final answer below, without repeating the context or question.\n\n"
|
@@ -43,7 +49,7 @@ def get_rag_answer(query, top_results):
|
|
43 |
payload = {
|
44 |
"inputs": prompt,
|
45 |
"parameters": {
|
46 |
-
"max_new_tokens":
|
47 |
}
|
48 |
}
|
49 |
|
|
|
25 |
def get_rag_answer(query, top_results):
|
26 |
"""
|
27 |
Constructs a prompt from the query and the page contexts of the top results,
|
28 |
+
truncates the context to avoid exceeding the token limit, then sends it to the
|
29 |
+
dedicated endpoint and returns only the generated answer.
|
30 |
"""
|
31 |
+
# Combine the context from the top results (adjust the separator as needed)
|
32 |
context = "\n\n".join([res.payload["page_content"] for res in top_results])
|
33 |
|
34 |
+
# Truncate the context to a maximum number of characters (e.g., 15000 characters)
|
35 |
+
max_context_chars = 15000
|
36 |
+
if len(context) > max_context_chars:
|
37 |
+
context = context[:max_context_chars]
|
38 |
+
|
39 |
+
# Build the prompt, instructing the model to only output the final answer.
|
40 |
prompt = (
|
41 |
"Using the following context, answer the question concisely. "
|
42 |
"Only output the final answer below, without repeating the context or question.\n\n"
|
|
|
49 |
payload = {
|
50 |
"inputs": prompt,
|
51 |
"parameters": {
|
52 |
+
"max_new_tokens": 150 # Adjust max tokens as needed
|
53 |
}
|
54 |
}
|
55 |
|