Commit · 0f9d3df
1 Parent(s): 49fe168

add all-MiniLM-L6-v2 embeddings

Files changed:
- .gitignore +2 -1
- app.py +44 -27
- data/yt_embedding_space_all-MiniLM-L6-v2_tpc128_o32.json +3 -0
- data/yt_embedding_space_all-MiniLM-L6-v2_tpc256_o64.json +3 -0
- utils/openai_utils.py +3 -0
- utils/rag_utils.py +7 -0
.gitignore
CHANGED

@@ -1,3 +1,4 @@
 __pycache__/
 .devcontainer/
-.streamlit/
+.streamlit/
+.env
app.py
CHANGED
@@ -6,7 +6,7 @@ os.environ["HF_HOME"] = "/data/.cache/huggingface"
 import streamlit as st
 from utils.help import get_disclaimer
 from utils.format import sec_to_time, fix_latex, get_youtube_embed
-from utils.rag_utils import load_youtube_data, load_book_data, load_summary, fixed_knn_retrieval, get_random_question
+from utils.rag_utils import load_youtube_data, load_book_data, load_summary, embed_question_sentence_transformer, fixed_knn_retrieval, get_random_question
 from utils.system_prompts import get_expert_system_prompt, get_synthesis_system_prompt
 from utils.openai_utils import embed_question_openai, openai_domain_specific_answer_generation, openai_context_integration
 from utils.llama_utils import get_bnb_config, load_base_model, load_fine_tuned_model, generate_response
@@ -53,10 +53,10 @@ with st.sidebar:
     # with st.container(border=True):
     # Embedding model
 
-
+    embedding_model = st.selectbox("Choose content embedding model", [
         "text-embedding-3-small",
         # "text-embedding-3-large",
-
+        "all-MiniLM-L6-v2",
         # "all-mpnet-base-v2"
         ],
         # help="""
@@ -65,26 +65,33 @@ with st.sidebar:
         # used SentenceTransformers models.
         # """
     )
-
-    with st.container(border=
-
+    st.divider()
+    # with st.container(border=False):
+    st.write('**Video lectures**')
+    if embedding_model == "all-MiniLM-L6-v2":
+        yt_token_choice = st.select_slider("Token per content", [128, 256], value=256, help="Larger values lead to an increase in the length of each retrieved piece of content", key="yt_token_len")
+    elif embedding_model == "text-embedding-3-small":
         yt_token_choice = st.select_slider("Token per content", [256, 512, 1024], value=256, help="Larger values lead to an increase in the length of each retrieved piece of content", key="yt_token_len")
-
-
-
-
-
-
-    with st.container(border=
-
-
-
+    yt_chunk_tokens = yt_token_choice
+    yt_max_content = {128: 32, 256: 16, 512: 8, 1024: 4}[yt_chunk_tokens]
+    top_k_YT = st.slider("Number of content pieces to retrieve", 0, yt_max_content, 4, key="yt_token_num")
+    yt_overlap_tokens = yt_chunk_tokens // 4
+
+    st.divider()
+    # with st.container(border=False):
+    st.write('**Textbook**')
+    show_textbook = False
+    # show_textbook = st.toggle("Show Textbook Content", value=False)
+
+    if embedding_model == "all-MiniLM-L6-v2":
+        latex_token_choice = st.select_slider("Token per content", [128, 256], value=256, help="Larger values lead to an increase in the length of each retrieved piece of content", key="latex_token_len")
+    elif embedding_model == "text-embedding-3-small":
         latex_token_choice = st.select_slider("Token per content", [128, 256, 512, 1024], value=256, help="Larger values lead to an increase in the length of each retrieved piece of content", key="latex_token_len")
-
-
-
-
-
+    latex_chunk_tokens = latex_token_choice
+    latex_max_content = {128: 32, 256: 16, 512: 8, 1024: 4}[latex_chunk_tokens]
+    top_k_Latex = st.slider("Number of content pieces to retrieve", 0, latex_max_content, 4, key="latex_token_num")
+    # latex_overlap_tokens = latex_chunk_tokens // 4
+    latex_overlap_tokens = 0
 
     st.write(' ')
     with st.expander('Expert model', expanded=False):
@@ -94,7 +101,7 @@ with st.sidebar:
         st.session_state.expert_model = st.selectbox(
             "Choose the LLM model",
             ["LLaMA-TOMMI-1.0-11B", "LLaMA-3.2-11B", "gpt-4o-mini"],
-            index=
+            index=2,
            key='a1model'
         )
 
@@ -116,6 +123,7 @@ with st.sidebar:
     with st.expander('Synthesis model',expanded=False):
         # with st.container(border=True):
         # Choose the LLM model
+        show_yt_context = st.toggle("Show retrieved video content", value=False)
         st.session_state.synthesis_model = st.selectbox(
             "Choose the LLM model",
             ["LLaMA-3.2-3B","gpt-4o-mini"], # "LLaMA-3.2-11B",
@@ -195,8 +203,8 @@ with st.spinner("Loading LLaMA-3.2-3B..."):
         st.session_state.llama_tokenizer_3B = llama_tokenizer_3B
 
 # Load YouTube and LaTeX data
-text_data_YT, context_embeddings_YT = load_youtube_data(base_path,
-text_data_Latex, context_embeddings_Latex = load_book_data(base_path,
+text_data_YT, context_embeddings_YT = load_youtube_data(base_path, embedding_model, yt_chunk_tokens, yt_overlap_tokens)
+text_data_Latex, context_embeddings_Latex = load_book_data(base_path, embedding_model, latex_chunk_tokens, latex_overlap_tokens)
 summary = load_summary('data/KG_FEM_summary.json')
 
 if 'question_answered' not in st.session_state:
@@ -218,7 +226,12 @@ if submit_button_placeholder.button("AI Answer", type="primary"):
 
     else:
         with st.spinner("Finding relevant contexts..."):
-
+
+            if embedding_model == "all-MiniLM-L6-v2":
+                question_embedding = embed_question_sentence_transformer(st.session_state.question, model_name="all-MiniLM-L6-v2")
+            elif embedding_model == "text-embedding-3-small":
+                question_embedding = embed_question_openai(st.session_state.question, embedding_model)
+
             initial_max_k = int(0.1 * context_embeddings_YT.shape[0])
             idx_YT = fixed_knn_retrieval(question_embedding, context_embeddings_YT, top_k=top_k_YT, min_k=0)
             idx_Latex = fixed_knn_retrieval(question_embedding, context_embeddings_Latex, top_k=top_k_Latex, min_k=0)
@@ -245,7 +258,8 @@ if submit_button_placeholder.button("AI Answer", type="primary"):
                 for context_item in contexts:
                     start_time = int(context_item['start'])
                     context += f'Video {i}, time: {sec_to_time(start_time)}:' + context_item['text'] + '\n\n'
-
+            st.session_state.yt_context = fix_latex(context)
+
             for i, (section_id, contexts) in enumerate(st.session_state.context_by_section.items(), start=1):
                 context += f'Section {i} ({section_id}):\n'
                 for context_item in contexts:
@@ -381,7 +395,10 @@ if st.session_state.question_answered:
         st.markdown(st.session_state.expert_answer)
     st.markdown("#### Answer:")
     st.markdown(st.session_state.answer)
-
+    if show_yt_context:
+        st.markdown("#### Retrieved lecture video transcripts:")
+        st.markdown(st.session_state.yt_context)
+
     if top_k_YT > 0:
         st.markdown("#### Retrieved content in lecture videos")
         for i, (video_id, contexts) in enumerate(st.session_state.context_by_video.items(), start=1):
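Taken together, the app.py changes make retrieval embedding-model aware: the sidebar only offers the chunk sizes that exist for the selected model, and the question is embedded with the same backend that produced the stored content embeddings before fixed_knn_retrieval runs. A minimal sketch of that branching, collected into one function — the wrapper below is hypothetical and not part of the commit; embed_question_openai and embed_question_sentence_transformer are the project's own helpers:

```python
# Hypothetical wrapper (not in the commit) illustrating the branch that app.py performs
# inline: embed the question with the backend that matches the stored content embeddings.
from utils.openai_utils import embed_question_openai
from utils.rag_utils import embed_question_sentence_transformer, fixed_knn_retrieval


def embed_question(question: str, embedding_model: str):
    if embedding_model == "all-MiniLM-L6-v2":
        # Local SentenceTransformers model; no API key required.
        return embed_question_sentence_transformer(question, model_name="all-MiniLM-L6-v2")
    elif embedding_model == "text-embedding-3-small":
        # OpenAI embeddings endpoint; same model that embedded the stored content.
        return embed_question_openai(question, embedding_model)
    raise ValueError(f"No question-embedding path for {embedding_model!r}")


# q_vec = embed_question("How is the global stiffness matrix assembled?", "all-MiniLM-L6-v2")
# idx_YT = fixed_knn_retrieval(q_vec, context_embeddings_YT, top_k=top_k_YT, min_k=0)
```

Keeping the question and the content in the same vector space is the point of the branching: similarity scores inside fixed_knn_retrieval are only meaningful when both embeddings come from the same model.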
data/yt_embedding_space_all-MiniLM-L6-v2_tpc128_o32.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:25393a46b720884076694fc10f9edb80a67003e452a9187077c69c632f2d45dd
+size 36670448
data/yt_embedding_space_all-MiniLM-L6-v2_tpc256_o64.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fb839d14084e8c305100359eabaefb517b83c9673368cb09b0da23673ce05df3
+size 17898177
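The two new data files are Git LFS pointer files (version, oid, size), standing in for roughly 37 MB and 18 MB of JSON. Their names encode the chunking configuration exposed in the sidebar: tpc is the tokens-per-chunk value and o the overlap, which app.py sets to a quarter of the chunk size (128 → 32, 256 → 64). A sketch of how load_youtube_data presumably resolves a file from those parameters — the helper below and the exact directory layout are assumptions, since load_youtube_data's body is not shown in this diff:

```python
import os


def yt_embedding_path(base_path: str, embedding_model_name: str, chunk_tokens: int, overlap_tokens: int) -> str:
    # Assumed naming convention, inferred from the files added in this commit:
    #   data/yt_embedding_space_all-MiniLM-L6-v2_tpc128_o32.json
    #   data/yt_embedding_space_all-MiniLM-L6-v2_tpc256_o64.json
    filename = f"yt_embedding_space_{embedding_model_name}_tpc{chunk_tokens}_o{overlap_tokens}.json"
    # Whether "data/" hangs off base_path is a guess; app.py also loads 'data/KG_FEM_summary.json' relatively.
    return os.path.join(base_path, "data", filename)


# yt_embedding_path(".", "all-MiniLM-L6-v2", 128, 128 // 4)
# -> './data/yt_embedding_space_all-MiniLM-L6-v2_tpc128_o32.json'
```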
utils/openai_utils.py
CHANGED
@@ -1,6 +1,9 @@
 import os
 from openai import OpenAI
 
+# from dotenv import load_dotenv
+# load_dotenv() #
+
 #--------------------------------------------------------
 # Initialize OpenAI client
 #--------------------------------------------------------
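The only change here is two commented-out lines, but they pair with the .env entry added to .gitignore: during local development the OpenAI key can live in an untracked .env file instead of the Space's secrets. If the comments were enabled, the usual python-dotenv pattern would look like this (the OPENAI_API_KEY variable name is an assumption; the diff does not show how the client is constructed):

```python
import os

from dotenv import load_dotenv  # python-dotenv package
from openai import OpenAI

load_dotenv()  # read KEY=value pairs from a local, git-ignored .env file into the environment

# Explicit form; OpenAI() with no arguments would also pick up OPENAI_API_KEY from the environment.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))  # variable name assumed
```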
utils/rag_utils.py
CHANGED
@@ -2,6 +2,7 @@ import json
 import numpy as np
 import random
 import streamlit as st
+from sentence_transformers import SentenceTransformer
 
 @st.cache_resource
 def load_youtube_data(base_path, embedding_model_name, chunk_tokens, overlap_tokens):
@@ -27,6 +28,12 @@ def load_summary(file_path):
         transcripts = json.load(file)
     return transcripts
 
+def embed_question_sentence_transformer(texts, model_name="sentence-transformers/all-MiniLM-L6-v2"):
+    model = SentenceTransformer(model_name)
+    embeddings = model.encode(texts)
+
+    return embeddings.tolist()
+
 def fixed_knn_retrieval(question_embedding, context_embeddings, top_k=5, min_k=1):
 
     question_embedding = np.array(question_embedding)
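The new embed_question_sentence_transformer helper mirrors embed_question_openai: it takes the question text and returns a plain list of floats that fixed_knn_retrieval then turns into a NumPy array. A short usage sketch (the question text is made up; all-MiniLM-L6-v2 produces 384-dimensional vectors):

```python
from utils.rag_utils import embed_question_sentence_transformer, fixed_knn_retrieval

# Embed a single (hypothetical) question with the local model.
q_vec = embed_question_sentence_transformer(
    "How is the element stiffness matrix assembled?",
    model_name="all-MiniLM-L6-v2",
)
# len(q_vec) == 384 for all-MiniLM-L6-v2

# Retrieve against content embeddings loaded elsewhere (e.g. by load_youtube_data):
# idx = fixed_knn_retrieval(q_vec, context_embeddings_YT, top_k=4, min_k=0)
```

Note that the helper constructs a SentenceTransformer on every call; since the app embeds one short question per request this is tolerable, but wrapping the model in the same @st.cache_resource pattern the file already uses for data loading would avoid reloading it on each Streamlit rerun.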
|