mostafa-sh committed
Commit 0f9d3df · 1 Parent(s): 49fe168

add all-MiniLM-L6-v2 embeddings
.gitignore CHANGED
@@ -1,3 +1,4 @@
 __pycache__/
 .devcontainer/
-.streamlit/
+.streamlit/
+.env
app.py CHANGED
@@ -6,7 +6,7 @@ os.environ["HF_HOME"] = "/data/.cache/huggingface"
 import streamlit as st
 from utils.help import get_disclaimer
 from utils.format import sec_to_time, fix_latex, get_youtube_embed
-from utils.rag_utils import load_youtube_data, load_book_data, load_summary, fixed_knn_retrieval, get_random_question
+from utils.rag_utils import load_youtube_data, load_book_data, load_summary, embed_question_sentence_transformer, fixed_knn_retrieval, get_random_question
 from utils.system_prompts import get_expert_system_prompt, get_synthesis_system_prompt
 from utils.openai_utils import embed_question_openai, openai_domain_specific_answer_generation, openai_context_integration
 from utils.llama_utils import get_bnb_config, load_base_model, load_fine_tuned_model, generate_response
@@ -53,10 +53,10 @@ with st.sidebar:
     # with st.container(border=True):
     # Embedding model

-    model_name = st.selectbox("Choose content embedding model", [
+    embedding_model = st.selectbox("Choose content embedding model", [
         "text-embedding-3-small",
         # "text-embedding-3-large",
-        # "all-MiniLM-L6-v2",
+        "all-MiniLM-L6-v2",
         # "all-mpnet-base-v2"
         ],
         # help="""
@@ -65,26 +65,33 @@ with st.sidebar:
         # used SentenceTransformers models.
         # """
     )
-
-    with st.container(border=True):
-        st.write('**Video lectures**')
+    st.divider()
+    # with st.container(border=False):
+    st.write('**Video lectures**')
+    if embedding_model == "all-MiniLM-L6-v2":
+        yt_token_choice = st.select_slider("Token per content", [128, 256], value=256, help="Larger values lead to an increase in the length of each retrieved piece of content", key="yt_token_len")
+    elif embedding_model == "text-embedding-3-small":
         yt_token_choice = st.select_slider("Token per content", [256, 512, 1024], value=256, help="Larger values lead to an increase in the length of each retrieved piece of content", key="yt_token_len")
-        yt_chunk_tokens = yt_token_choice
-        yt_max_content = {128: 32, 256: 16, 512: 8, 1024: 4}[yt_chunk_tokens]
-        top_k_YT = st.slider("Number of relevant content pieces to retrieve", 0, yt_max_content, 4, key="yt_token_num")
-        yt_overlap_tokens = yt_chunk_tokens // 4
-
-    # st.divider()
-    with st.container(border=True):
-        st.write('**Textbook**')
-        show_textbook = False
-        # show_textbook = st.toggle("Show Textbook Content", value=False)
+    yt_chunk_tokens = yt_token_choice
+    yt_max_content = {128: 32, 256: 16, 512: 8, 1024: 4}[yt_chunk_tokens]
+    top_k_YT = st.slider("Number of content pieces to retrieve", 0, yt_max_content, 4, key="yt_token_num")
+    yt_overlap_tokens = yt_chunk_tokens // 4
+
+    st.divider()
+    # with st.container(border=False):
+    st.write('**Textbook**')
+    show_textbook = False
+    # show_textbook = st.toggle("Show Textbook Content", value=False)
+
+    if embedding_model == "all-MiniLM-L6-v2":
+        latex_token_choice = st.select_slider("Token per content", [128, 256], value=256, help="Larger values lead to an increase in the length of each retrieved piece of content", key="latex_token_len")
+    elif embedding_model == "text-embedding-3-small":
         latex_token_choice = st.select_slider("Token per content", [128, 256, 512, 1024], value=256, help="Larger values lead to an increase in the length of each retrieved piece of content", key="latex_token_len")
-        latex_chunk_tokens = latex_token_choice
-        latex_max_content = {128: 32, 256: 16, 512: 8, 1024: 4}[latex_chunk_tokens]
-        top_k_Latex = st.slider("Number of relevant content pieces to retrieve", 0, latex_max_content, 4, key="latex_token_num")
-        # latex_overlap_tokens = latex_chunk_tokens // 4
-        latex_overlap_tokens = 0
+    latex_chunk_tokens = latex_token_choice
+    latex_max_content = {128: 32, 256: 16, 512: 8, 1024: 4}[latex_chunk_tokens]
+    top_k_Latex = st.slider("Number of content pieces to retrieve", 0, latex_max_content, 4, key="latex_token_num")
+    # latex_overlap_tokens = latex_chunk_tokens // 4
+    latex_overlap_tokens = 0

     st.write(' ')
     with st.expander('Expert model', expanded=False):
@@ -94,7 +101,7 @@ with st.sidebar:
         st.session_state.expert_model = st.selectbox(
             "Choose the LLM model",
             ["LLaMA-TOMMI-1.0-11B", "LLaMA-3.2-11B", "gpt-4o-mini"],
-            index=0,
+            index=2,
             key='a1model'
         )

@@ -116,6 +123,7 @@ with st.sidebar:
     with st.expander('Synthesis model',expanded=False):
         # with st.container(border=True):
         # Choose the LLM model
+        show_yt_context = st.toggle("Show retrieved video content", value=False)
         st.session_state.synthesis_model = st.selectbox(
             "Choose the LLM model",
             ["LLaMA-3.2-3B","gpt-4o-mini"], # "LLaMA-3.2-11B",
@@ -195,8 +203,8 @@ with st.spinner("Loading LLaMA-3.2-3B..."):
     st.session_state.llama_tokenizer_3B = llama_tokenizer_3B

 # Load YouTube and LaTeX data
-text_data_YT, context_embeddings_YT = load_youtube_data(base_path, model_name, yt_chunk_tokens, yt_overlap_tokens)
-text_data_Latex, context_embeddings_Latex = load_book_data(base_path, model_name, latex_chunk_tokens, latex_overlap_tokens)
+text_data_YT, context_embeddings_YT = load_youtube_data(base_path, embedding_model, yt_chunk_tokens, yt_overlap_tokens)
+text_data_Latex, context_embeddings_Latex = load_book_data(base_path, embedding_model, latex_chunk_tokens, latex_overlap_tokens)
 summary = load_summary('data/KG_FEM_summary.json')

 if 'question_answered' not in st.session_state:
@@ -218,7 +226,12 @@ if submit_button_placeholder.button("AI Answer", type="primary"):

     else:
         with st.spinner("Finding relevant contexts..."):
-            question_embedding = embed_question_openai(st.session_state.question, model_name)
+
+            if embedding_model == "all-MiniLM-L6-v2":
+                question_embedding = embed_question_sentence_transformer(st.session_state.question, model_name="all-MiniLM-L6-v2")
+            elif embedding_model == "text-embedding-3-small":
+                question_embedding = embed_question_openai(st.session_state.question, embedding_model)
+
             initial_max_k = int(0.1 * context_embeddings_YT.shape[0])
             idx_YT = fixed_knn_retrieval(question_embedding, context_embeddings_YT, top_k=top_k_YT, min_k=0)
             idx_Latex = fixed_knn_retrieval(question_embedding, context_embeddings_Latex, top_k=top_k_Latex, min_k=0)
@@ -245,7 +258,8 @@ if submit_button_placeholder.button("AI Answer", type="primary"):
             for context_item in contexts:
                 start_time = int(context_item['start'])
                 context += f'Video {i}, time: {sec_to_time(start_time)}:' + context_item['text'] + '\n\n'
-
+        st.session_state.yt_context = fix_latex(context)
+
         for i, (section_id, contexts) in enumerate(st.session_state.context_by_section.items(), start=1):
             context += f'Section {i} ({section_id}):\n'
             for context_item in contexts:
@@ -381,7 +395,10 @@ if st.session_state.question_answered:
     st.markdown(st.session_state.expert_answer)
     st.markdown("#### Answer:")
     st.markdown(st.session_state.answer)
-
+    if show_yt_context:
+        st.markdown("#### Retrieved lecture video transcripts:")
+        st.markdown(st.session_state.yt_context)
+
     if top_k_YT > 0:
         st.markdown("#### Retrieved content in lecture videos")
         for i, (video_id, contexts) in enumerate(st.session_state.context_by_video.items(), start=1):
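Reviewer note on the dispatch added above: the two selectable embedding models live in different vector spaces (all-MiniLM-L6-v2 encodes to 384 dimensions, text-embedding-3-small to 1536), so a question embedding is only meaningful against context embeddings produced by the same model, which is why app.py now passes the same embedding_model to both the loaders and the question embedder. A minimal sketch of that branch factored into one helper; the embed_question name and the guard comment are illustrative, not part of this commit:

from utils.rag_utils import embed_question_sentence_transformer
from utils.openai_utils import embed_question_openai

def embed_question(question, embedding_model):
    # Mirrors the if/elif branch added to app.py: the local
    # SentenceTransformer handles all-MiniLM-L6-v2 (384-d vectors),
    # the OpenAI API handles text-embedding-3-small (1536-d vectors).
    if embedding_model == "all-MiniLM-L6-v2":
        return embed_question_sentence_transformer(question, model_name="all-MiniLM-L6-v2")
    if embedding_model == "text-embedding-3-small":
        return embed_question_openai(question, embedding_model)
    # Any mismatch between the question's model and the cached context
    # embeddings would silently break the kNN distances, so fail loudly.
    raise ValueError(f"Unsupported embedding model: {embedding_model}")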
data/yt_embedding_space_all-MiniLM-L6-v2_tpc128_o32.json ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:25393a46b720884076694fc10f9edb80a67003e452a9187077c69c632f2d45dd
+size 36670448
data/yt_embedding_space_all-MiniLM-L6-v2_tpc256_o64.json ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fb839d14084e8c305100359eabaefb517b83c9673368cb09b0da23673ce05df3
+size 17898177
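The two added LFS files (Git LFS pointers; the JSON payloads are stored out of band) follow the naming scheme the cached loaders appear to consume: tpc is tokens per chunk and o is the overlap, which app.py computes as chunk // 4, so tpc128 pairs with o32 and tpc256 with o64, matching the [128, 256] choices exposed for this model. A hypothetical sketch of the path construction, assuming load_youtube_data builds names this way (the actual format string is not shown in this diff):

def yt_embedding_path(base_path, embedding_model, chunk_tokens, overlap_tokens):
    # Hypothetical helper: reproduces the observed file names, e.g.
    # data/yt_embedding_space_all-MiniLM-L6-v2_tpc128_o32.json
    return f"{base_path}/yt_embedding_space_{embedding_model}_tpc{chunk_tokens}_o{overlap_tokens}.json"

print(yt_embedding_path("data", "all-MiniLM-L6-v2", 128, 128 // 4))
# -> data/yt_embedding_space_all-MiniLM-L6-v2_tpc128_o32.json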
utils/openai_utils.py CHANGED
@@ -1,6 +1,9 @@
 import os
 from openai import OpenAI

+# from dotenv import load_dotenv
+# load_dotenv() #
+
 #--------------------------------------------------------
 # Initialize OpenAI client
 #--------------------------------------------------------
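The commented-out dotenv lines pair with the .env entry added to .gitignore: for local runs, the OpenAI key can live in an untracked .env file instead of the shell environment. A minimal sketch of that pattern, assuming the standard OPENAI_API_KEY variable name (the commit keeps the lines commented out, presumably because the deployed Space injects the key another way):

import os
from dotenv import load_dotenv  # pip install python-dotenv
from openai import OpenAI

load_dotenv()  # reads key=value pairs from the gitignored .env into os.environ
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])  # assumed variable name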
utils/rag_utils.py CHANGED
@@ -2,6 +2,7 @@ import json
 import numpy as np
 import random
 import streamlit as st
+from sentence_transformers import SentenceTransformer

 @st.cache_resource
 def load_youtube_data(base_path, embedding_model_name, chunk_tokens, overlap_tokens):
@@ -27,6 +28,12 @@ def load_summary(file_path):
         transcripts = json.load(file)
     return transcripts

+def embed_question_sentence_transformer(texts, model_name="sentence-transformers/all-MiniLM-L6-v2"):
+    model = SentenceTransformer(model_name)
+    embeddings = model.encode(texts)
+
+    return embeddings.tolist()
+
 def fixed_knn_retrieval(question_embedding, context_embeddings, top_k=5, min_k=1):

     question_embedding = np.array(question_embedding)
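One thing worth flagging in the new helper: embed_question_sentence_transformer constructs a SentenceTransformer on every call, and Streamlit reruns the script on each interaction, so the model would be reloaded for every question. A possible follow-up, reusing the @st.cache_resource pattern this file already applies to its data loaders (get_sentence_transformer is a suggested helper, not part of the commit):

import streamlit as st
from sentence_transformers import SentenceTransformer

@st.cache_resource
def get_sentence_transformer(model_name="sentence-transformers/all-MiniLM-L6-v2"):
    # Loaded once per process and reused across Streamlit reruns.
    return SentenceTransformer(model_name)

def embed_question_sentence_transformer(texts, model_name="sentence-transformers/all-MiniLM-L6-v2"):
    embeddings = get_sentence_transformer(model_name).encode(texts)
    return embeddings.tolist()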