import json
import os
from typing import Literal

import openai
from llama_index.core import (
    Document,
    Settings,
    SimpleDirectoryReader,
    StorageContext,
    VectorStoreIndex,
    load_index_from_storage,
)
from llama_index.core.node_parser import (
    HierarchicalNodeParser,
    SentenceWindowNodeParser,
    get_leaf_nodes,
)
from llama_index.core.postprocessor import (
    MetadataReplacementPostProcessor,
    SentenceTransformerRerank,
)
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.retrievers import AutoMergingRetriever
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.openai import OpenAI

from src.mythesis_chatbot.utils import get_config_hash, get_openai_api_key

SupportedRags = Literal[
    "classic retrieval", "sentence window retrieval", "auto-merging retrieval"
]
SupportedOpenAIllms = Literal["gpt-4o-mini", "gpt-3.5-turbo"]
SupportedEmbedModels = Literal["BAAI/bge-small-en-v1.5"]
SupportedRerankModels = Literal["cross-encoder/ms-marco-MiniLM-L-2-v2"]


def load_data(input_file: str) -> Document:
    reader = SimpleDirectoryReader(input_files=[input_file])
    documents = reader.load_data()  # List of Document objects (one per page)

    # Merge all pages into a single document
    document = Document(text="\n\n".join([doc.text for doc in documents]))
    return document


def build_sentence_window_index(
    input_file: str,
    save_dir: str,
    index_config: dict[str, str | int],
):
    config_hash = get_config_hash(index_config)
    save_dir = os.path.join(save_dir, "sentence_window", config_hash)
    Settings.embed_model = HuggingFaceEmbedding(model_name=index_config["embed_model"])

    if not os.path.exists(save_dir):
        document = load_data(input_file)

        # Create the sentence window node parser with default settings.
        # A node is a chunk of text. Each node returned by the sentence window
        # node parser also carries its context (the closest chunks of text)
        # as metadata.
        node_parser = SentenceWindowNodeParser.from_defaults(
            window_size=index_config["sentence_window_size"],
            window_metadata_key="window",
            original_text_metadata_key="original_text",
        )
        Settings.node_parser = node_parser

        sentence_index = VectorStoreIndex.from_documents([document])
        sentence_index.storage_context.persist(persist_dir=save_dir)
        with open(os.path.join(save_dir, "meta.json"), "w") as f:
            json.dump(index_config, f, indent=2)
    else:
        sentence_index = load_index_from_storage(
            StorageContext.from_defaults(persist_dir=save_dir)
        )

    return sentence_index
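

# A minimal usage sketch (hypothetical paths and values, not part of the
# pipeline above). The first call builds and persists the index under
# save_dir; a second call with the same config loads it from disk instead:
#
#   index = build_sentence_window_index(
#       input_file="data/thesis.pdf",  # hypothetical document
#       save_dir="indices",
#       index_config={
#           "doc_source": "thesis.pdf",
#           "embed_model": "BAAI/bge-small-en-v1.5",
#           "sentence_window_size": 3,
#       },
#   )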


def build_automerging_index(
    input_file: str,
    save_dir: str,
    index_config: dict[str, str | list[int]],
):
    config_hash = get_config_hash(index_config)
    save_dir = os.path.join(save_dir, "auto_merging", config_hash)
    Settings.embed_model = HuggingFaceEmbedding(model_name=index_config["embed_model"])

    if not os.path.exists(save_dir):
        document = load_data(input_file)

        # Parse the document into a hierarchy of nodes, one level per chunk
        # size. Only the smallest chunks (leaf nodes) are embedded; all nodes
        # are kept in the docstore so the retriever can merge leaves back
        # into their parents at query time.
        node_parser = HierarchicalNodeParser.from_defaults(
            chunk_sizes=index_config["chunk_sizes"]
        )
        nodes = node_parser.get_nodes_from_documents([document])
        leaf_nodes = get_leaf_nodes(nodes)
        Settings.node_parser = node_parser

        storage_context = StorageContext.from_defaults()
        storage_context.docstore.add_documents(nodes)

        automerging_index = VectorStoreIndex(
            leaf_nodes,
            storage_context=storage_context,
        )
        automerging_index.storage_context.persist(persist_dir=save_dir)
        with open(os.path.join(save_dir, "meta.json"), "w") as f:
            json.dump(index_config, f, indent=2)
    else:
        automerging_index = load_index_from_storage(
            StorageContext.from_defaults(persist_dir=save_dir),
        )

    return automerging_index
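

# Illustration of the hierarchy for chunk_sizes=[2048, 512, 128] (the default
# used by automerging_retrieval_setup below): each 2048-token chunk is split
# into 512-token children, each of which is split into 128-token leaves.
# Only the 128-token leaves are embedded and retrieved directly.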


def get_sentence_window_query_engine(
    sentence_index,
    similarity_top_k: int = 6,
    rerank_top_n: int = 2,
    rerank_model: str = "cross-encoder/ms-marco-MiniLM-L-2-v2",
):
    # Replace each node's content with the "window" field from its metadata,
    # so the LLM sees the retrieved sentence together with its surrounding
    # context.
    postproc = MetadataReplacementPostProcessor(target_metadata_key="window")

    # Reranking prunes the retrieved nodes down to the most relevant ones,
    # which shrinks the LLM context (faster and cheaper) without sacrificing
    # answer quality.
    rerank = SentenceTransformerRerank(top_n=rerank_top_n, model=rerank_model)

    sentence_window_engine = sentence_index.as_query_engine(
        similarity_top_k=similarity_top_k, node_postprocessors=[postproc, rerank]
    )
    return sentence_window_engine
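

# Sketch of how the engine is queried (the question string is made up):
#
#   engine = get_sentence_window_query_engine(sentence_index)
#   response = engine.query("What is the main contribution of the thesis?")
#   print(response.response)         # synthesized answer
#   print(response.source_nodes[0])  # top reranked context window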


def get_automerging_query_engine(
    automerging_index,
    similarity_top_k: int = 12,
    rerank_top_n: int = 6,
    rerank_model: str = "cross-encoder/ms-marco-MiniLM-L-2-v2",
):
    base_retriever = automerging_index.as_retriever(similarity_top_k=similarity_top_k)

    # When enough sibling leaf chunks are retrieved, AutoMergingRetriever
    # replaces them with their larger parent chunk from the docstore.
    retriever = AutoMergingRetriever(
        base_retriever, automerging_index.storage_context, verbose=True
    )
    rerank = SentenceTransformerRerank(top_n=rerank_top_n, model=rerank_model)

    auto_merging_engine = RetrieverQueryEngine.from_args(
        retriever, node_postprocessors=[rerank]
    )
    return auto_merging_engine


def sentence_window_retrieval_setup(
    input_file: str,
    save_dir: str,
    llm_openai_model: SupportedOpenAIllms = "gpt-4o-mini",
    temperature: float = 0.1,
    embed_model: SupportedEmbedModels = "BAAI/bge-small-en-v1.5",
    sentence_window_size: int = 3,
    similarity_top_k: int = 6,
    rerank_model: SupportedRerankModels = "cross-encoder/ms-marco-MiniLM-L-2-v2",
    rerank_top_n: int = 2,
    **kwargs,
):
    openai.api_key = get_openai_api_key()

    # This config uniquely identifies the index (its hash names the save dir)
    config = {
        "doc_source": os.path.basename(input_file),
        "embed_model": embed_model,
        "sentence_window_size": sentence_window_size,
    }

    # 1. Build (or load) the index
    index = build_sentence_window_index(input_file, save_dir, config)

    Settings.llm = OpenAI(model=llm_openai_model, temperature=temperature)

    # 2. Get the query engine
    sentence_window_engine = get_sentence_window_query_engine(
        index,
        similarity_top_k=similarity_top_k,
        rerank_model=rerank_model,
        rerank_top_n=rerank_top_n,
    )
    return sentence_window_engine


def automerging_retrieval_setup(
    input_file: str,
    save_dir: str,
    llm_openai_model: SupportedOpenAIllms = "gpt-4o-mini",
    temperature: float = 0.1,
    embed_model: SupportedEmbedModels = "BAAI/bge-small-en-v1.5",
    chunk_sizes: list[int] | None = None,
    similarity_top_k: int = 6,
    rerank_model: SupportedRerankModels = "cross-encoder/ms-marco-MiniLM-L-2-v2",
    rerank_top_n: int = 2,
    **kwargs,
):
    # None sentinel avoids a mutable default argument
    if chunk_sizes is None:
        chunk_sizes = [2048, 512, 128]

    openai.api_key = get_openai_api_key()

    # This config uniquely identifies the index (its hash names the save dir)
    config = {
        "doc_source": os.path.basename(input_file),
        "embed_model": embed_model,
        "chunk_sizes": chunk_sizes,
    }

    # 1. Build (or load) the index
    index = build_automerging_index(input_file, save_dir, config)

    Settings.llm = OpenAI(model=llm_openai_model, temperature=temperature)

    # 2. Get the query engine
    automerging_engine = get_automerging_query_engine(
        index,
        similarity_top_k=similarity_top_k,
        rerank_model=rerank_model,
        rerank_top_n=rerank_top_n,
    )
    return automerging_engine


def basic_rag_setup(
    input_file: str,
    save_dir: str,
    llm_openai_model: SupportedOpenAIllms = "gpt-4o-mini",
    temperature: float = 0.1,
    embed_model: SupportedEmbedModels = "BAAI/bge-small-en-v1.5",
    similarity_top_k: int = 6,
    rerank_model: SupportedRerankModels = "cross-encoder/ms-marco-MiniLM-L-2-v2",
    rerank_top_n: int = 2,
    **kwargs,
):
    openai.api_key = get_openai_api_key()
    Settings.embed_model = HuggingFaceEmbedding(model_name=embed_model)

    save_dir = os.path.join(save_dir, "basic")
    if not os.path.exists(save_dir):
        document = load_data(input_file)
        index = VectorStoreIndex.from_documents([document])
        index.storage_context.persist(persist_dir=save_dir)
    else:
        index = load_index_from_storage(
            StorageContext.from_defaults(persist_dir=save_dir)
        )

    rerank = SentenceTransformerRerank(top_n=rerank_top_n, model=rerank_model)
    engine = index.as_query_engine(
        llm=OpenAI(model=llm_openai_model, temperature=temperature),
        similarity_top_k=similarity_top_k,
        node_postprocessors=[rerank],
    )
    return engine
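

if __name__ == "__main__":
    # Minimal smoke test, a sketch only: the file path is hypothetical, and a
    # valid OpenAI API key must be resolvable by get_openai_api_key().
    engine = sentence_window_retrieval_setup(
        input_file="data/thesis.pdf",  # hypothetical document
        save_dir="indices",
    )
    response = engine.query("What is this thesis about?")
    print(response)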