import json
import numpy as np
import streamlit as st

@st.cache_resource
def load_youtube_data(base_path, embedding_model_name, chunk_tokens, overlap_tokens):
    """Load the YouTube-transcript chunks and their embedding space from the matching JSON file."""
    embedding_space_file_name = f'{base_path}/yt_embedding_space_{embedding_model_name}_tpc{chunk_tokens}_o{overlap_tokens}.json'
    with open(embedding_space_file_name, 'r') as json_file:
        loaded_data = json.load(json_file)
    
    embedding_space = np.array(loaded_data['embedding_space'])
    return loaded_data['chunks'], embedding_space

@st.cache_resource
def load_book_data(base_path, embedding_model_name, chunk_tokens, overlap_tokens):
    """Load the section-level book chunks and their embedding space from the matching JSON file."""
    embedding_space_file_name = f'{base_path}/latex_embedding_space_by_sections_{embedding_model_name}_tpc{chunk_tokens}_o{overlap_tokens}.json'
    with open(embedding_space_file_name, 'r') as json_file:
        loaded_data = json.load(json_file)
    
    embedding_space = np.array(loaded_data['embedding_space'])
    return loaded_data['chunks'], embedding_space

@st.cache_resource
def load_summary(file_path):
    """Load pre-generated summaries from a JSON file."""
    with open(file_path, 'r') as file:
        transcripts = json.load(file)
    return transcripts

def fixed_knn_retrieval(question_embedding, context_embeddings, top_k=5, min_k=1):
    """Return the indices of the top_k (at least min_k) contexts most similar to the question embedding."""
    question_embedding = np.array(question_embedding)

    # Normalize the question and context embeddings to unit length so that the
    # dot product below equals the cosine similarity.
    question_embedding = question_embedding / np.linalg.norm(question_embedding)
    context_embeddings = context_embeddings / np.linalg.norm(context_embeddings, axis=1, keepdims=True)
    
    # Calculate cosine similarities between the question embedding and all context embeddings.
    similarities = np.dot(context_embeddings, question_embedding)
    # Sort the similarities in descending order and get the corresponding indices.
    sorted_indices = np.argsort(similarities)[::-1]
    # Select the top_k most similar contexts, ensuring at least min_k contexts are selected.
    selected_indices = sorted_indices[:max(top_k, min_k)].tolist()
    return selected_indices
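

# Illustrative usage (an assumption, not part of the original app flow): a minimal,
# self-contained sanity check of fixed_knn_retrieval on synthetic embeddings.
# The shapes and values below are hypothetical.
if __name__ == "__main__":
    rng = np.random.default_rng(0)
    demo_contexts = rng.normal(size=(10, 8))   # 10 context chunks with 8-dim embeddings
    demo_question = rng.normal(size=8)         # a single question embedding
    top_indices = fixed_knn_retrieval(demo_question, demo_contexts, top_k=3)
    print(top_indices)  # indices of the 3 most similar contexts, most similar first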