Spaces:
Running
on
L4
Running
on
L4
File size: 2,142 Bytes
f7429e0 c1f91db f7429e0 c1f91db |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 |
import json
import numpy as np
import random
import streamlit as st
@st.cache_resource
def load_youtube_data(base_path, embedding_model_name, chunk_tokens, overlap_tokens):
    """Load the YouTube-transcript chunks and their embedding matrix.

    Reads the JSON artifact produced by the embedding pipeline and is cached
    by Streamlit so the (potentially large) file is parsed only once per
    process.

    Args:
        base_path: Directory containing the embedding-space JSON files.
        embedding_model_name: Name of the embedding model used to build the file.
        chunk_tokens: Tokens per chunk used when the file was built.
        overlap_tokens: Token overlap between chunks used when the file was built.

    Returns:
        Tuple of (chunks, embedding_space) where ``chunks`` is the list of
        transcript chunks stored in the file and ``embedding_space`` is a
        2-D ``np.ndarray`` of their embeddings.
    """
    embedding_space_file_name = f'{base_path}/yt_embedding_space_{embedding_model_name}_tpc{chunk_tokens}_o{overlap_tokens}.json'
    # Explicit encoding: JSON artifacts are UTF-8; without this, decoding
    # depends on the platform's locale default.
    with open(embedding_space_file_name, 'r', encoding='utf-8') as json_file:
        loaded_data = json.load(json_file)
    embedding_space = np.array(loaded_data['embedding_space'])
    return loaded_data['chunks'], embedding_space
@st.cache_resource
def load_book_data(base_path, embedding_model_name, chunk_tokens, overlap_tokens):
    """Load the book (LaTeX, by-section) chunks and their embedding matrix.

    Mirrors ``load_youtube_data`` but reads the LaTeX sections artifact.
    Cached by Streamlit so the file is parsed only once per process.

    Args:
        base_path: Directory containing the embedding-space JSON files.
        embedding_model_name: Name of the embedding model used to build the file.
        chunk_tokens: Tokens per chunk used when the file was built.
        overlap_tokens: Token overlap between chunks used when the file was built.

    Returns:
        Tuple of (chunks, embedding_space) where ``chunks`` is the list of
        section chunks stored in the file and ``embedding_space`` is a
        2-D ``np.ndarray`` of their embeddings.
    """
    embedding_space_file_name = f'{base_path}/latex_embedding_space_by_sections_{embedding_model_name}_tpc{chunk_tokens}_o{overlap_tokens}.json'
    # Explicit encoding: JSON artifacts are UTF-8; without this, decoding
    # depends on the platform's locale default.
    with open(embedding_space_file_name, 'r', encoding='utf-8') as json_file:
        loaded_data = json.load(json_file)
    embedding_space = np.array(loaded_data['embedding_space'])
    return loaded_data['chunks'], embedding_space
@st.cache_resource
def load_summary(file_path):
    """Load a JSON summary/transcript file, cached by Streamlit.

    Args:
        file_path: Path to the JSON file to load.

    Returns:
        The deserialized JSON content (whatever structure the file holds).
    """
    # Explicit encoding: JSON is UTF-8; avoid relying on the locale default.
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)
def fixed_knn_retrieval(question_embedding, context_embeddings, top_k=5, min_k=1):
    """Rank context embeddings by cosine similarity to a question embedding.

    Args:
        question_embedding: 1-D array-like query embedding.
        context_embeddings: 2-D array-like, one context embedding per row.
        top_k: Number of most-similar contexts to return.
        min_k: Lower bound on the number of contexts returned, even if
            ``top_k`` is smaller.

    Returns:
        List of row indices into ``context_embeddings``, ordered from most
        to least similar, of length ``max(top_k, min_k)`` (capped by the
        number of contexts).
    """
    question_embedding = np.asarray(question_embedding, dtype=float)
    context_embeddings = np.asarray(context_embeddings, dtype=float)
    # Normalize both sides so the dot product below is cosine similarity.
    # Guard against zero-norm vectors: dividing by zero would yield NaN
    # similarities and silently corrupt the ranking.
    q_norm = np.linalg.norm(question_embedding)
    question_embedding = question_embedding / (q_norm if q_norm > 0 else 1.0)
    ctx_norms = np.linalg.norm(context_embeddings, axis=1, keepdims=True)
    context_embeddings = context_embeddings / np.where(ctx_norms > 0, ctx_norms, 1.0)
    # Cosine similarity between the question and every context row.
    similarities = np.dot(context_embeddings, question_embedding)
    # Indices sorted from most to least similar.
    sorted_indices = np.argsort(similarities)[::-1]
    # Take top_k, but never fewer than min_k.
    return sorted_indices[:max(top_k, min_k)].tolist()
def get_random_question(text_file):
    """Return a random non-empty question from a plain-text file.

    Args:
        text_file: Path to a file with one question per line.

    Returns:
        One randomly chosen question string, with surrounding whitespace
        stripped.

    Raises:
        ValueError: If the file contains no non-empty lines.
    """
    with open(text_file, "r", encoding="utf-8") as file:
        # Skip blank lines so a trailing newline or spacer line can never
        # be returned as an empty "question".
        questions = [line.strip() for line in file if line.strip()]
    if not questions:
        raise ValueError(f"No questions found in {text_file!r}")
    return random.choice(questions)