Spaces: Running on L4

Commit 737a09d
Parent(s): 9f756e6

add llama synthesis model

Files changed:
- app.py (+134, -72)
- utils/llama_utils.py (+3, -0)
app.py
CHANGED
@@ -35,18 +35,12 @@ st.markdown("""
 # ---------------------------------------
 base_path = "data/"
 base_model_path = "meta-llama/Llama-3.2-11B-Vision-Instruct"
-adapter_path = "./LLaMA-TOMMI-1.0/"
+adapter_path = "./LLaMA-TOMMI-1.0-11B/"
 
 st.title(":red[AI University] :gray[/] FEM")
 # st.markdown("### Finite Element Method")
 st.markdown("Welcome to :red[AI University]—an AI-powered platform designed to address scientific course queries, dynamically adapting to instructors' teaching styles and students' learning needs. This prototype demonstrates the capabilities of the AI University platform by providing expert answers to queries related to a graduate-level :red[Finite Element Method (FEM)] course")
 
-# st.markdown(":gray[Welcome to] :red[AI University]:gray[, developed at the] :red[University of Southern California]:gray[. This app leverages AI to provide expert answers to queries related to] :red[Finite Element Method (FEM)]:gray[.]")
-
-# st.markdown(":gray[Welcome to] :red[AI University]:gray[, developed at the] :red[University of Southern California]:gray[. This app leverages AI to provide expert answers to queries related to] :red[Finite Element Methods (FEM)]:gray[.]")
-
-# As the content is AI-generated, we strongly recommend independently verifying the information provided.
-
 st.markdown(" ")
 st.markdown(" ")
 # st.divider()
@@ -89,47 +83,60 @@ with st.sidebar:
     # latex_overlap_tokens = latex_chunk_tokens // 4
     latex_overlap_tokens = 0
 
-    st.write(' ')
+    st.write(' ')
     with st.expander('Expert model', expanded=False):
-
         use_expert_answer = st.toggle("Use expert answer", value=True)
        show_expert_responce = st.toggle("Show initial expert answer", value=False)
 
         st.session_state.expert_model = st.selectbox(
             "Choose the LLM model",
-            ["gpt-4o-mini",
-
-            "LLaMA-TOMMI-1.0"],
+            ["LLaMA-TOMMI-1.0-11B", "LLaMA-3.2-11B", "gpt-4o-mini"],
+            index=0,
             key='a1model'
         )
 
-        if st.session_state.expert_model
-
+        if st.session_state.expert_model in ["LLaMA-TOMMI-1.0-11B", "LLaMA-3.2-11B"]:
+            expert_do_sample = st.toggle("Enable Sampling", value=False, key='expert_sample')
 
-        if
-
-
-
+            if expert_do_sample:
+                expert_temperature = st.slider("Temperature", 0.0, 1.5, 0.7, key='expert_temp')
+                expert_top_k = st.slider("Top K", 0, 100, 50, key='expert_top_k')
+                expert_top_p = st.slider("Top P", 0.0, 1.0, 0.95, key='expert_top_p')
             else:
-
+                expert_num_beams = st.slider("Num Beams", 1, 4, 1, key='expert_num_beams')
 
-
+            expert_max_new_tokens = st.slider("Max New Tokens", 100, 2000, 500, step=50, key='expert_max_new_tokens')
         else:
-
-
+            expert_api_temperature = st.slider("Temperature", 0.0, 1.5, 0.7, key='a1t')
+            expert_api_top_p = st.slider("Top P", 0.0, 1.0, 0.9, key='a1p')
 
     with st.expander('Synthesis model',expanded=False):
-
         # with st.container(border=True):
         # Choose the LLM model
-
+        st.session_state.synthesis_model = st.selectbox(
+            "Choose the LLM model",
+            ["LLaMA-3.2-11B", "gpt-4o-mini"],
+            index=0,
+            key='a2model'
+        )
+
+        if st.session_state.synthesis_model == "LLaMA-3.2-11B":
+            synthesis_do_sample = st.toggle("Enable Sampling", value=False, key='synthesis_sample')
 
-
-
+            if synthesis_do_sample:
+                synthesis_temperature = st.slider("Temperature", 0.0, 1.5, 0.7, key='synthesis_temp')
+                synthesis_top_k = st.slider("Top K", 0, 100, 50, key='synthesis_top_k')
+                synthesis_top_p = st.slider("Top P", 0.0, 1.0, 0.95, key='synthesis_top_p')
+            else:
+                synthesis_num_beams = st.slider("Num Beams", 1, 4, 1, key='synthesis_num_beams')
+
+            synthesis_max_new_tokens = st.slider("Max New Tokens", 100, 2000, 500, step=50, key='synthesis_max_new_tokens')
+        else:
+            # Temperature
+            synthesis_api_temperature = st.slider("Temperature", 0.0, .3, .5, help="Defines the randomness in the next token prediction. Lower: More predictable and focused. Higher: More adventurous and diverse.", key='a2t')
 
+            synthesis_api_top_p = st.slider("Top P", 0.1, 0.5, .3, help="Defines the range of token choices the model can consider in the next prediction. Lower: More focused and restricted to high-probability options. Higher: More creative, allowing consideration of less likely options.", key='a2p')
 
-
 # Main content area
 if "question" not in st.session_state:
     st.session_state.question = ""
@@ -138,7 +145,7 @@ if "question" not in st.session_state:
 text_area_placeholder = st.empty()
 question_help = "Including details or instructions improves the answer."
 st.session_state.question = text_area_placeholder.text_area(
-    "**Enter your
+    "**Enter your query about Finite Element Method**",
     height=120,
     value=st.session_state.question,
     help=question_help
@@ -156,12 +163,26 @@ with col2:
         break
     st.session_state.question = random_question
     text_area_placeholder.text_area(
-        "**Enter your
+        "**Enter your query about Finite Element Method:**",
         height=120,
         value=st.session_state.question,
         help=question_help
     )
 
+with st.spinner("Loading LLaMA-3.2-11B..."):
+    if "LLaMA-3.2-11B" in [st.session_state.expert_model, st.session_state.synthesis_model]:
+        if 'llama_model' not in st.session_state:
+            llama_model, llama_tokenizer = load_base_model(base_model_path)
+            st.session_state.llama_model = llama_model
+            st.session_state.llama_tokenizer = llama_tokenizer
+
+with st.spinner("Loading LLaMA-TOMMI-1.0-11B..."):
+    if st.session_state.expert_model == "LLaMA-TOMMI-1.0-11B":
+        if 'tommi_model' not in st.session_state:
+            tommi_model, tommi_tokenizer = load_fine_tuned_model(adapter_path, base_model_path)
+            st.session_state.tommi_model = tommi_model
+            st.session_state.tommi_tokenizer = tommi_tokenizer
+
 # Load YouTube and LaTeX data
 text_data_YT, context_embeddings_YT = load_youtube_data(base_path, model_name, yt_chunk_tokens, yt_overlap_tokens)
 text_data_Latex, context_embeddings_Latex = load_book_data(base_path, model_name, latex_chunk_tokens, latex_overlap_tokens)
@@ -178,9 +199,13 @@ if 'answer' not in st.session_state:
 if 'playing_video_id' not in st.session_state:
     st.session_state.playing_video_id = None
 
-
 if submit_button_placeholder.button("AI Answer", type="primary"):
-    if st.session_state.question
+    if st.session_state.question == "":
+        st.markdown("")
+        st.write("Please enter a query. :smirk:")
+        st.session_state.question_answered = False
+
+    else:
         with st.spinner("Finding relevant contexts..."):
             question_embedding = embed_question_openai(st.session_state.question, model_name)
             initial_max_k = int(0.1 * context_embeddings_YT.shape[0])
@@ -216,54 +241,97 @@ if submit_button_placeholder.button("AI Answer", type="primary"):
         for context_item in contexts:
             context += context_item['text'] + '\n\n'
 
+        #-------------------------
+        # getting expert answer
+        #-------------------------
         if use_expert_answer:
-            if st.session_state.expert_model
-
-
-            st.session_state.tommi_model
-            st.session_state.tommi_tokenizer
-
+            if st.session_state.expert_model in ["LLaMA-TOMMI-1.0-11B", "LLaMA-3.2-11B"]:
+
+                if st.session_state.expert_model == "LLaMA-TOMMI-1.0-11B":
+                    model_ = st.session_state.tommi_model
+                    tokenizer_ = st.session_state.tommi_tokenizer
+
+                elif st.session_state.expert_model == "LLaMA-3.2-11B":
+                    model_ = st.session_state.llama_model
+                    tokenizer_ = st.session_state.llama_tokenizer
+
                 messages = [
-                    {"role": "system", "content":
+                    {"role": "system", "content": get_expert_system_prompt()},
                     {"role": "user", "content": st.session_state.question}
                 ]
 
                 expert_answer = generate_response(
-                    model=
-                    tokenizer=
+                    model=model_,
+                    tokenizer=tokenizer_,
                     messages=messages,
-                    do_sample=
-                    temperature=
-                    top_k=
-                    top_p=
-                    num_beams=
-                    max_new_tokens=
+                    do_sample=expert_do_sample,
+                    temperature=expert_temperature if expert_do_sample else None,
+                    top_k=expert_top_k if expert_do_sample else None,
+                    top_p=expert_top_p if expert_do_sample else None,
+                    num_beams=expert_num_beams if not expert_do_sample else 1,
+                    max_new_tokens=expert_max_new_tokens
                 )
-
+
+            else: # openai
                 expert_answer = openai_domain_specific_answer_generation(
                     get_expert_system_prompt(),
                     st.session_state.question,
-                    model=
-                    temperature=
-                    top_p=
+                    model=st.session_state.expert_model,
+                    temperature=expert_api_temperature,
+                    top_p=expert_api_top_p
                 )
+
             st.session_state.expert_answer = fix_latex(expert_answer)
+
         else:
             st.session_state.expert_answer = 'No Expert Answer. Only use the context.'
 
+        #-------------------------
+        # synthesis responses
+        #-------------------------
+        if st.session_state.synthesis_model == "LLaMA-3.2-11B":
+            synthesis_prompt = f"""
+            Question:
+            {st.session_state.question}
+
+            Direct Answer:
+            {st.session_state.expert_answer}
+
+            Retrieved Context:
+            {context}
+
+            Final Answer:
+            """
+            messages = [
+                {"role": "system", "content": get_synthesis_system_prompt("Finite Element Method")},
+                {"role": "user", "content": synthesis_prompt}
+            ]
+
+            synthesis_answer = generate_response(
+                model=st.session_state.llama_model,
+                tokenizer=st.session_state.llama_tokenizer,
+                messages=messages,
+                do_sample=synthesis_do_sample,
+                temperature=synthesis_temperature if synthesis_do_sample else None,
+                top_k=synthesis_top_k if synthesis_do_sample else None,
+                top_p=synthesis_top_p if synthesis_do_sample else None,
+                num_beams=synthesis_num_beams if not synthesis_do_sample else 1,
+                max_new_tokens=synthesis_max_new_tokens
+            )
 
-
-
-
-
-
-
-
-
-
-
-
-
+        else:
+            synthesis_answer = openai_context_integration(
+                get_synthesis_system_prompt("Finite Element Method"),
+                st.session_state.question,
+                st.session_state.expert_answer,
+                context,
+                model=st.session_state.synthesis_model,
+                temperature=synthesis_api_temperature,
+                top_p=synthesis_api_top_p
+            )
+
+        # quick check after getting the answer
+        if synthesis_answer.split()[0] == "NOT_ENOUGH_INFO":
             st.markdown("")
             st.markdown("#### Query:")
             st.markdown(fix_latex(st.session_state.question))
@@ -272,21 +340,15 @@ if submit_button_placeholder.button("AI Answer", type="primary"):
             st.markdown(st.session_state.expert_answer)
             st.markdown("#### Answer:")
             st.write(":smiling_face_with_tear:")
-            st.markdown(
+            st.markdown(synthesis_answer.split('NOT_ENOUGH_INFO')[1])
             st.divider()
             st.caption(get_disclaimer())
             # st.caption("The AI Teaching Assistant project")
             st.session_state.question_answered = False
             st.stop()
         else:
-            st.session_state.answer =
-
-            st.session_state.question_answered = True
-
-    else:
-        st.markdown("")
-        st.write("Please enter a question. :smirk:")
-        st.session_state.question_answered = False
+            st.session_state.answer = fix_latex(synthesis_answer)
+            st.session_state.question_answered = True
 
 if st.session_state.question_answered:
     st.markdown("")
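Note on generate_response: app.py now routes both the expert answer and the synthesis answer through generate_response from utils/llama_utils.py, passing the model, tokenizer, chat messages, and decoding parameters (do_sample, temperature, top_k, top_p, num_beams, max_new_tokens). The body of that helper is not part of this commit, so the following is only a minimal sketch of a function with the same call signature, assuming a standard transformers chat-template generation flow; the defaults and internals are illustrative, not the repository's actual implementation.

import torch

def generate_response(model, tokenizer, messages,
                      do_sample=False, temperature=None, top_k=None,
                      top_p=None, num_beams=1, max_new_tokens=500):
    # Hypothetical sketch, not the repo's code: build the prompt from the chat
    # messages using the tokenizer's chat template.
    input_ids = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, return_tensors="pt"
    ).to(model.device)

    gen_kwargs = {"max_new_tokens": max_new_tokens, "do_sample": do_sample}
    if do_sample:
        # Sampling path: temperature / top-k / top-p control randomness.
        gen_kwargs.update(temperature=temperature, top_k=top_k, top_p=top_p)
    else:
        # Deterministic path: beam search (num_beams=1 reduces to greedy decoding).
        gen_kwargs.update(num_beams=num_beams)

    with torch.no_grad():
        output_ids = model.generate(input_ids, **gen_kwargs)

    # Decode only the newly generated tokens, skipping the prompt.
    return tokenizer.decode(output_ids[0][input_ids.shape[-1]:], skip_special_tokens=True)

This mirrors how app.py calls it: the sampling sliders are forwarded only when "Enable Sampling" is on, otherwise num_beams drives deterministic decoding.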
utils/llama_utils.py
CHANGED
@@ -2,6 +2,7 @@ import os
 import torch
 from transformers import BitsAndBytesConfig, AutoModelForCausalLM, PreTrainedTokenizerFast
 from peft import PeftModel
+import streamlit as st
 
 # Set the cache directory to persistent storage
 os.environ["HF_HOME"] = "/data/.cache/huggingface"
@@ -21,6 +22,7 @@ def get_bnb_config():
 #-----------------------------------------
 # Base Model Loader
 #-----------------------------------------
+@st.cache_resource
 def load_base_model(base_model_path: str):
     """
     Loads a base LLM model with 4-bit quantization and tokenizer.
@@ -49,6 +51,7 @@ def load_base_model(base_model_path: str):
 #-----------------------------------------
 # Fine-Tuned Model Loader
 #-----------------------------------------
+@st.cache_resource
 def load_fine_tuned_model(adapter_path: str, base_model_path: str):
     """
     Loads the fine-tuned model by applying LoRA adapter to a base model.
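The two @st.cache_resource decorators added here are the substance of this file's change: Streamlit re-executes the script on every interaction, and without caching each rerun would reload an 11B-parameter model. With the decorator, load_base_model and load_fine_tuned_model run once per server process and later calls reuse the returned model/tokenizer objects. The function bodies are outside the changed lines, so the sketch below only illustrates how such cached loaders are typically structured with the imports shown in this file (BitsAndBytesConfig, AutoModelForCausalLM, PeftModel); the tokenizer class, quantization settings, and device_map are assumptions, not the repository's exact code.

import torch
import streamlit as st
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel

def get_bnb_config():
    # Assumed 4-bit NF4 quantization so the 11B base model fits on a single L4 GPU.
    return BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )

@st.cache_resource  # loaded once per Streamlit server process, reused across reruns
def load_base_model(base_model_path: str):
    tokenizer = AutoTokenizer.from_pretrained(base_model_path)
    model = AutoModelForCausalLM.from_pretrained(
        base_model_path,
        quantization_config=get_bnb_config(),
        device_map="auto",
    )
    return model, tokenizer

@st.cache_resource
def load_fine_tuned_model(adapter_path: str, base_model_path: str):
    # Load a quantized base model, then apply the LoRA adapter on top of it.
    tokenizer = AutoTokenizer.from_pretrained(base_model_path)
    base_model = AutoModelForCausalLM.from_pretrained(
        base_model_path,
        quantization_config=get_bnb_config(),
        device_map="auto",
    )
    model = PeftModel.from_pretrained(base_model, adapter_path)
    return model, tokenizer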