|
import json |
|
import numpy as np |
|
import random |
|
import streamlit as st |
|
|
|
@st.cache_resource
def load_youtube_data(base_path, embedding_model_name, chunk_tokens, overlap_tokens):
    """Load the pre-computed YouTube embedding space from disk.

    The file name encodes the embedding model and the chunking parameters,
    e.g. ``yt_embedding_space_<model>_tpc<chunk>_o<overlap>.json``.

    Parameters:
        base_path: directory containing the embedding JSON files.
        embedding_model_name: model identifier baked into the file name.
        chunk_tokens: tokens-per-chunk value baked into the file name.
        overlap_tokens: chunk-overlap value baked into the file name.

    Returns:
        Tuple ``(chunks, embedding_space)`` — the ``'chunks'`` entry of the
        JSON file, and its ``'embedding_space'`` entry as a ``np.ndarray``.

    Cached with ``st.cache_resource`` so the file is read at most once per
    Streamlit session.
    """
    embedding_space_file_name = (
        f'{base_path}/yt_embedding_space_{embedding_model_name}'
        f'_tpc{chunk_tokens}_o{overlap_tokens}.json'
    )
    # Explicit encoding: JSON is UTF-8 by spec; the platform default may not be.
    with open(embedding_space_file_name, 'r', encoding='utf-8') as json_file:
        loaded_data = json.load(json_file)

    embedding_space = np.array(loaded_data['embedding_space'])
    return loaded_data['chunks'], embedding_space
|
|
|
@st.cache_resource
def load_book_data(base_path, embedding_model_name, chunk_tokens, overlap_tokens):
    """Load the pre-computed book (LaTeX, by-section) embedding space from disk.

    The file name encodes the embedding model and the chunking parameters,
    e.g. ``latex_embedding_space_by_sections_<model>_tpc<chunk>_o<overlap>.json``.

    Parameters:
        base_path: directory containing the embedding JSON files.
        embedding_model_name: model identifier baked into the file name.
        chunk_tokens: tokens-per-chunk value baked into the file name.
        overlap_tokens: chunk-overlap value baked into the file name.

    Returns:
        Tuple ``(chunks, embedding_space)`` — the ``'chunks'`` entry of the
        JSON file, and its ``'embedding_space'`` entry as a ``np.ndarray``.

    Cached with ``st.cache_resource`` so the file is read at most once per
    Streamlit session.
    """
    embedding_space_file_name = (
        f'{base_path}/latex_embedding_space_by_sections_{embedding_model_name}'
        f'_tpc{chunk_tokens}_o{overlap_tokens}.json'
    )
    # Explicit encoding: JSON is UTF-8 by spec; the platform default may not be.
    with open(embedding_space_file_name, 'r', encoding='utf-8') as json_file:
        loaded_data = json.load(json_file)

    embedding_space = np.array(loaded_data['embedding_space'])
    return loaded_data['chunks'], embedding_space
|
|
|
@st.cache_resource
def load_summary(file_path):
    """Load and return the parsed JSON content of *file_path*.

    Parameters:
        file_path: path to a JSON file (used for transcript summaries).

    Returns:
        Whatever ``json.load`` produces for the file's content.

    Cached with ``st.cache_resource`` so the file is read at most once per
    Streamlit session.
    """
    # Explicit encoding: JSON is UTF-8 by spec; the platform default may not be.
    with open(file_path, 'r', encoding='utf-8') as file:
        transcripts = json.load(file)
    return transcripts
|
|
|
def fixed_knn_retrieval(question_embedding, context_embeddings, top_k=5, min_k=1):
    """Return indices of the context rows most similar to the question.

    Both the question vector and every context row are L2-normalised first,
    so the dot product below is cosine similarity.  Indices come back ordered
    from most to least similar; ``min_k`` acts as a floor on how many are
    returned (``max(top_k, min_k)`` in total).
    """
    query = np.array(question_embedding)
    query = query / np.linalg.norm(query)

    # Normalise each context row to unit length (row-wise norms kept as a column).
    row_norms = np.linalg.norm(context_embeddings, axis=1, keepdims=True)
    unit_contexts = context_embeddings / row_norms

    # Cosine similarity of every context row against the query.
    scores = np.dot(unit_contexts, query)

    # Rank by descending similarity and keep the requested number of indices.
    ranked = np.argsort(scores)[::-1]
    keep = max(top_k, min_k)
    return ranked[:keep].tolist()
|
|
|
|
|
def get_random_question(text_file):
    """Return one randomly chosen question from *text_file*.

    The file is expected to hold one question per line.  Blank lines are
    skipped so the function never returns an empty string.

    Parameters:
        text_file: path to the plain-text question file.

    Returns:
        A single stripped question line, chosen uniformly at random.

    Raises:
        IndexError: if the file contains no non-blank lines.
    """
    with open(text_file, "r", encoding="utf-8") as file:
        # Strip whitespace and drop blank lines (the original kept them,
        # which could surface an empty "question").
        questions = [line.strip() for line in file if line.strip()]
    return random.choice(questions)