Commit · 0f9d3df
1 Parent(s): 49fe168

add all-MiniLM-L6-v2 embeddings

Files changed:
- .gitignore +2 -1
- app.py +44 -27
- data/yt_embedding_space_all-MiniLM-L6-v2_tpc128_o32.json +3 -0
- data/yt_embedding_space_all-MiniLM-L6-v2_tpc256_o64.json +3 -0
- utils/openai_utils.py +3 -0
- utils/rag_utils.py +7 -0
.gitignore
CHANGED

@@ -1,3 +1,4 @@
 __pycache__/
 .devcontainer/
-.streamlit/
+.streamlit/
+.env
app.py
CHANGED
@@ -6,7 +6,7 @@ os.environ["HF_HOME"] = "/data/.cache/huggingface"
 import streamlit as st
 from utils.help import get_disclaimer
 from utils.format import sec_to_time, fix_latex, get_youtube_embed
-from utils.rag_utils import load_youtube_data, load_book_data, load_summary, fixed_knn_retrieval, get_random_question
+from utils.rag_utils import load_youtube_data, load_book_data, load_summary, embed_question_sentence_transformer, fixed_knn_retrieval, get_random_question
 from utils.system_prompts import get_expert_system_prompt, get_synthesis_system_prompt
 from utils.openai_utils import embed_question_openai, openai_domain_specific_answer_generation, openai_context_integration
 from utils.llama_utils import get_bnb_config, load_base_model, load_fine_tuned_model, generate_response
@@ -53,10 +53,10 @@ with st.sidebar:
     # with st.container(border=True):
     # Embedding model
 
-
+    embedding_model = st.selectbox("Choose content embedding model", [
         "text-embedding-3-small",
         # "text-embedding-3-large",
-
+        "all-MiniLM-L6-v2",
         # "all-mpnet-base-v2"
         ],
         # help="""
@@ -65,26 +65,33 @@ with st.sidebar:
         # used SentenceTransformers models.
         # """
     )
-
-    with st.container(border=
-
+    st.divider()
+    # with st.container(border=False):
+    st.write('**Video lectures**')
+    if embedding_model == "all-MiniLM-L6-v2":
+        yt_token_choice = st.select_slider("Token per content", [128, 256], value=256, help="Larger values lead to an increase in the length of each retrieved piece of content", key="yt_token_len")
+    elif embedding_model == "text-embedding-3-small":
         yt_token_choice = st.select_slider("Token per content", [256, 512, 1024], value=256, help="Larger values lead to an increase in the length of each retrieved piece of content", key="yt_token_len")
-
-
-
-
-
-
-    with st.container(border=
-
-
-
+    yt_chunk_tokens = yt_token_choice
+    yt_max_content = {128: 32, 256: 16, 512: 8, 1024: 4}[yt_chunk_tokens]
+    top_k_YT = st.slider("Number of content pieces to retrieve", 0, yt_max_content, 4, key="yt_token_num")
+    yt_overlap_tokens = yt_chunk_tokens // 4
+
+    st.divider()
+    # with st.container(border=False):
+    st.write('**Textbook**')
+    show_textbook = False
+    # show_textbook = st.toggle("Show Textbook Content", value=False)
+
+    if embedding_model == "all-MiniLM-L6-v2":
+        latex_token_choice = st.select_slider("Token per content", [128, 256], value=256, help="Larger values lead to an increase in the length of each retrieved piece of content", key="latex_token_len")
+    elif embedding_model == "text-embedding-3-small":
         latex_token_choice = st.select_slider("Token per content", [128, 256, 512, 1024], value=256, help="Larger values lead to an increase in the length of each retrieved piece of content", key="latex_token_len")
-
-
-
-
-
+    latex_chunk_tokens = latex_token_choice
+    latex_max_content = {128: 32, 256: 16, 512: 8, 1024: 4}[latex_chunk_tokens]
+    top_k_Latex = st.slider("Number of content pieces to retrieve", 0, latex_max_content, 4, key="latex_token_num")
+    # latex_overlap_tokens = latex_chunk_tokens // 4
+    latex_overlap_tokens = 0
 
     st.write(' ')
     with st.expander('Expert model', expanded=False):
@@ -94,7 +101,7 @@ with st.sidebar:
         st.session_state.expert_model = st.selectbox(
             "Choose the LLM model",
             ["LLaMA-TOMMI-1.0-11B", "LLaMA-3.2-11B", "gpt-4o-mini"],
-            index=
+            index=2,
            key='a1model'
         )
 
@@ -116,6 +123,7 @@ with st.sidebar:
     with st.expander('Synthesis model',expanded=False):
         # with st.container(border=True):
         # Choose the LLM model
+        show_yt_context = st.toggle("Show retrieved video content", value=False)
         st.session_state.synthesis_model = st.selectbox(
             "Choose the LLM model",
             ["LLaMA-3.2-3B","gpt-4o-mini"], # "LLaMA-3.2-11B",
@@ -195,8 +203,8 @@ with st.spinner("Loading LLaMA-3.2-3B..."):
         st.session_state.llama_tokenizer_3B = llama_tokenizer_3B
 
 # Load YouTube and LaTeX data
-text_data_YT, context_embeddings_YT = load_youtube_data(base_path,
-text_data_Latex, context_embeddings_Latex = load_book_data(base_path,
+text_data_YT, context_embeddings_YT = load_youtube_data(base_path, embedding_model, yt_chunk_tokens, yt_overlap_tokens)
+text_data_Latex, context_embeddings_Latex = load_book_data(base_path, embedding_model, latex_chunk_tokens, latex_overlap_tokens)
 summary = load_summary('data/KG_FEM_summary.json')
 
 if 'question_answered' not in st.session_state:
@@ -218,7 +226,12 @@ if submit_button_placeholder.button("AI Answer", type="primary"):
 
     else:
         with st.spinner("Finding relevant contexts..."):
-
+
+            if embedding_model == "all-MiniLM-L6-v2":
+                question_embedding = embed_question_sentence_transformer(st.session_state.question, model_name="all-MiniLM-L6-v2")
+            elif embedding_model == "text-embedding-3-small":
+                question_embedding = embed_question_openai(st.session_state.question, embedding_model)
+
             initial_max_k = int(0.1 * context_embeddings_YT.shape[0])
             idx_YT = fixed_knn_retrieval(question_embedding, context_embeddings_YT, top_k=top_k_YT, min_k=0)
             idx_Latex = fixed_knn_retrieval(question_embedding, context_embeddings_Latex, top_k=top_k_Latex, min_k=0)
@@ -245,7 +258,8 @@ if submit_button_placeholder.button("AI Answer", type="primary"):
                 for context_item in contexts:
                     start_time = int(context_item['start'])
                     context += f'Video {i}, time: {sec_to_time(start_time)}:' + context_item['text'] + '\n\n'
-
+            st.session_state.yt_context = fix_latex(context)
+
             for i, (section_id, contexts) in enumerate(st.session_state.context_by_section.items(), start=1):
                 context += f'Section {i} ({section_id}):\n'
                 for context_item in contexts:
@@ -381,7 +395,10 @@ if st.session_state.question_answered:
         st.markdown(st.session_state.expert_answer)
     st.markdown("#### Answer:")
     st.markdown(st.session_state.answer)
-
+    if show_yt_context:
+        st.markdown("#### Retrieved lecture video transcripts:")
+        st.markdown(st.session_state.yt_context)
+
     if top_k_YT > 0:
         st.markdown("#### Retrieved content in lecture videos")
         for i, (video_id, contexts) in enumerate(st.session_state.context_by_video.items(), start=1):
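Taken together, the app.py changes make retrieval embedding-model aware: the sidebar only offers the chunk sizes that exist for the selected model, and the question is embedded with the same backend that produced the stored content embeddings before fixed_knn_retrieval runs. A minimal sketch of that branching, collected into one function — the wrapper below is hypothetical and not part of the commit; embed_question_openai and embed_question_sentence_transformer are the project's own helpers:

```python
# Hypothetical wrapper (not in the commit) illustrating the branch that app.py performs
# inline: embed the question with the backend that matches the stored content embeddings.
from utils.openai_utils import embed_question_openai
from utils.rag_utils import embed_question_sentence_transformer, fixed_knn_retrieval


def embed_question(question: str, embedding_model: str):
    if embedding_model == "all-MiniLM-L6-v2":
        # Local SentenceTransformers model; no API key required.
        return embed_question_sentence_transformer(question, model_name="all-MiniLM-L6-v2")
    elif embedding_model == "text-embedding-3-small":
        # OpenAI embeddings endpoint; same model that embedded the stored content.
        return embed_question_openai(question, embedding_model)
    raise ValueError(f"No question-embedding path for {embedding_model!r}")


# q_vec = embed_question("How is the global stiffness matrix assembled?", "all-MiniLM-L6-v2")
# idx_YT = fixed_knn_retrieval(q_vec, context_embeddings_YT, top_k=top_k_YT, min_k=0)
```

Keeping the question and the content in the same vector space is the point of the branching: similarity scores inside fixed_knn_retrieval are only meaningful when both embeddings come from the same model.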
data/yt_embedding_space_all-MiniLM-L6-v2_tpc128_o32.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:25393a46b720884076694fc10f9edb80a67003e452a9187077c69c632f2d45dd
+size 36670448
data/yt_embedding_space_all-MiniLM-L6-v2_tpc256_o64.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fb839d14084e8c305100359eabaefb517b83c9673368cb09b0da23673ce05df3
+size 17898177
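The two new data files are Git LFS pointer files (version, oid, size), standing in for roughly 37 MB and 18 MB of JSON. Their names encode the chunking configuration exposed in the sidebar: tpc is the tokens-per-chunk value and o the overlap, which app.py sets to a quarter of the chunk size (128 → 32, 256 → 64). A sketch of how load_youtube_data presumably resolves a file from those parameters — the helper below and the exact directory layout are assumptions, since load_youtube_data's body is not shown in this diff:

```python
import os


def yt_embedding_path(base_path: str, embedding_model_name: str, chunk_tokens: int, overlap_tokens: int) -> str:
    # Assumed naming convention, inferred from the files added in this commit:
    #   data/yt_embedding_space_all-MiniLM-L6-v2_tpc128_o32.json
    #   data/yt_embedding_space_all-MiniLM-L6-v2_tpc256_o64.json
    filename = f"yt_embedding_space_{embedding_model_name}_tpc{chunk_tokens}_o{overlap_tokens}.json"
    # Whether "data/" hangs off base_path is a guess; app.py also loads 'data/KG_FEM_summary.json' relatively.
    return os.path.join(base_path, "data", filename)


# yt_embedding_path(".", "all-MiniLM-L6-v2", 128, 128 // 4)
# -> './data/yt_embedding_space_all-MiniLM-L6-v2_tpc128_o32.json'
```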
utils/openai_utils.py
CHANGED
@@ -1,6 +1,9 @@
 import os
 from openai import OpenAI
 
+# from dotenv import load_dotenv
+# load_dotenv() #
+
 #--------------------------------------------------------
 # Initialize OpenAI client
 #--------------------------------------------------------
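The only change here is two commented-out lines, but they pair with the .env entry added to .gitignore: during local development the OpenAI key can live in an untracked .env file instead of the Space's secrets. If the comments were enabled, the usual python-dotenv pattern would look like this (the OPENAI_API_KEY variable name is an assumption; the diff does not show how the client is constructed):

```python
import os

from dotenv import load_dotenv  # python-dotenv package
from openai import OpenAI

load_dotenv()  # read KEY=value pairs from a local, git-ignored .env file into the environment

# Explicit form; OpenAI() with no arguments would also pick up OPENAI_API_KEY from the environment.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))  # variable name assumed
```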
utils/rag_utils.py
CHANGED
@@ -2,6 +2,7 @@ import json
 import numpy as np
 import random
 import streamlit as st
+from sentence_transformers import SentenceTransformer
 
 @st.cache_resource
 def load_youtube_data(base_path, embedding_model_name, chunk_tokens, overlap_tokens):
@@ -27,6 +28,12 @@ def load_summary(file_path):
         transcripts = json.load(file)
     return transcripts
 
+def embed_question_sentence_transformer(texts, model_name="sentence-transformers/all-MiniLM-L6-v2"):
+    model = SentenceTransformer(model_name)
+    embeddings = model.encode(texts)
+
+    return embeddings.tolist()
+
 def fixed_knn_retrieval(question_embedding, context_embeddings, top_k=5, min_k=1):
 
     question_embedding = np.array(question_embedding)
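The new embed_question_sentence_transformer helper mirrors embed_question_openai: it takes the question text and returns a plain list of floats that fixed_knn_retrieval then turns into a NumPy array. A short usage sketch (the question text is made up; all-MiniLM-L6-v2 produces 384-dimensional vectors):

```python
from utils.rag_utils import embed_question_sentence_transformer, fixed_knn_retrieval

# Embed a single (hypothetical) question with the local model.
q_vec = embed_question_sentence_transformer(
    "How is the element stiffness matrix assembled?",
    model_name="all-MiniLM-L6-v2",
)
# len(q_vec) == 384 for all-MiniLM-L6-v2

# Retrieve against content embeddings loaded elsewhere (e.g. by load_youtube_data):
# idx = fixed_knn_retrieval(q_vec, context_embeddings_YT, top_k=4, min_k=0)
```

Note that the helper constructs a SentenceTransformer on every call; since the app embeds one short question per request this is tolerable, but wrapping the model in the same @st.cache_resource pattern the file already uses for data loading would avoid reloading it on each Streamlit rerun.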
|