Update rag_engine.py
rag_engine.py  CHANGED  (+174 -19)

@@ -13,10 +13,12 @@ import streamlit as st
 from utils import setup_gcp_auth, setup_openai_auth
 import gc

-# Force model to CPU for stability
+# Force model to CPU for stability and to avoid GPU memory issues on resource-constrained environments
+# This is especially important for deployment on platforms like Hugging Face Spaces
 os.environ["CUDA_VISIBLE_DEVICES"] = ""

-#
+# Define local paths for files downloaded from Google Cloud Storage
+# These files are cached locally to avoid repeated downloads and improve performance
 local_embeddings_file = "all_embeddings.npy"
 local_faiss_index_file = "faiss_index.faiss"
 local_text_chunks_file = "text_chunks.txt"
@@ -28,9 +30,18 @@ local_metadata_file = "metadata.jsonl"

 @st.cache_resource(show_spinner=False)
 def cached_load_model():
-    """
+    """
+    Load and cache the E5-large-v2 embedding model and tokenizer.
+
+    Uses Streamlit's cache_resource decorator to ensure the model
+    is loaded only once during the application session, improving
+    performance and reducing memory usage.
+
+    Returns:
+        tuple: (tokenizer, model) pair or (None, None) if loading fails
+    """
     try:
-        # Force model to CPU
+        # Force model to CPU for stability
         device = torch.device("cpu")

         # Get embedding model path from secrets
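
A minimal sketch of the cached loading pattern this hunk documents, assuming the Hugging Face transformers API; in the real file the checkpoint path comes from Streamlit secrets, so the model name below is only a placeholder:

import streamlit as st
import torch
from transformers import AutoTokenizer, AutoModel

@st.cache_resource(show_spinner=False)
def load_embedding_model(model_name: str = "intfloat/e5-large-v2"):
    # Loaded once per session; later calls reuse the cached (tokenizer, model) pair.
    try:
        device = torch.device("cpu")
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModel.from_pretrained(model_name, torch_dtype=torch.float16)
        model = model.to(device)
        model.eval()
        torch.set_grad_enabled(False)  # inference only, no gradients needed
        return tokenizer, model
    except Exception as e:
        print(f"❌ Model loading failed: {e}")
        return None, None
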
@@ -47,11 +58,11 @@ def cached_load_model():
             torch_dtype=torch.float16
         )

-        # Move model to CPU and set to eval mode
+        # Move model to CPU and set to eval mode for inference
         model = model.to(device)
         model.eval()

-        # Disable gradient computation
+        # Disable gradient computation to save memory during inference
         torch.set_grad_enabled(False)

         print("✅ Model loaded successfully (cached)")
@@ -63,7 +74,19 @@ def cached_load_model():

 @st.cache_resource(show_spinner=False)
 def cached_load_data_files():
-    """
+    """
+    Load and cache data files needed for the RAG system.
+
+    This function loads:
+    - FAISS index for vector similarity search
+    - Text chunks containing the original spiritual text passages
+    - Metadata dictionary with publication and author information
+
+    All files are downloaded from Google Cloud Storage if not already present locally.
+
+    Returns:
+        tuple: (faiss_index, text_chunks, metadata_dict) or (None, None, None) if loading fails
+    """
     # Initialize GCP and OpenAI clients
     bucket = setup_gcp_client()
     openai_initialized = setup_openai_client()
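
For reference, loading the three cached artifacts named by the module-level constants can look roughly like the sketch below once they are on local disk; the exact on-disk formats of the chunk and metadata files are assumptions, not read from this diff:

import json
import faiss

def load_local_artifacts():
    # FAISS index built over the passage embeddings.
    index = faiss.read_index("faiss_index.faiss")

    # Assumed format: one text chunk per line.
    with open("text_chunks.txt", encoding="utf-8") as f:
        text_chunks = {i: line.rstrip("\n") for i, line in enumerate(f)}

    # Assumed format: JSON Lines, one metadata object per line.
    metadata_dict = {}
    with open("metadata.jsonl", encoding="utf-8") as f:
        for i, line in enumerate(f):
            metadata_dict[i] = json.loads(line)

    return index, text_chunks, metadata_dict
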
@@ -130,6 +153,14 @@ def cached_load_data_files():
 # =============================================================================

 def setup_gcp_client():
+    """
+    Initialize and return the Google Cloud Storage client.
+
+    Sets up GCP authentication and creates a client for the configured bucket.
+
+    Returns:
+        google.cloud.storage.bucket.Bucket: The GCS bucket object or None if initialization fails
+    """
     try:
         credentials = setup_gcp_auth()
         try:
@@ -147,6 +178,14 @@ def setup_gcp_client():
         return None

 def setup_openai_client():
+    """
+    Initialize the OpenAI client.
+
+    Sets up OpenAI API authentication for generating answers using the LLM.
+
+    Returns:
+        bool: True if initialization was successful, False otherwise
+    """
     try:
         setup_openai_auth()
         print("✅ OpenAI client initialized successfully")
@@ -156,7 +195,19 @@ def setup_openai_client():
         return False

 def download_file_from_gcs(bucket, gcs_path, local_path):
-    """
+    """
+    Download a file from Google Cloud Storage to local storage.
+
+    Only downloads if the file isn't already present locally, avoiding redundant downloads.
+
+    Args:
+        bucket: GCS bucket object
+        gcs_path (str): Path to the file in GCS
+        local_path (str): Local path where the file should be saved
+
+    Returns:
+        bool: True if download was successful or file already exists, False otherwise
+    """
     try:
         if os.path.exists(local_path):
             print(f"File already exists locally: {local_path}")
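
The download-if-missing behaviour described in this docstring maps onto the standard google-cloud-storage client roughly as follows (a sketch, not the file's exact implementation):

import os
from google.cloud import storage

def fetch_if_missing(bucket: storage.Bucket, gcs_path: str, local_path: str) -> bool:
    # Skip the network call when an earlier run already fetched the file.
    if os.path.exists(local_path):
        return True
    try:
        blob = bucket.blob(gcs_path)
        blob.download_to_filename(local_path)
        return True
    except Exception as e:
        print(f"❌ Download failed for {gcs_path}: {e}")
        return False
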
@@ -171,15 +222,38 @@ def download_file_from_gcs(bucket, gcs_path, local_path):
         return False

 def average_pool(last_hidden_states, attention_mask):
-    """
+    """
+    Perform average pooling on model outputs for sentence embeddings.
+
+    This function creates a fixed-size vector representation of a text sequence by averaging
+    the token embeddings, accounting for padding tokens using the attention mask.
+
+    Args:
+        last_hidden_states: Hidden states output from the model
+        attention_mask: Attention mask indicating which tokens to include
+
+    Returns:
+        torch.Tensor: Pooled representation of the input sequence
+    """
     last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
     return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]

-# In-memory cache for query embeddings
+# In-memory cache for query embeddings to avoid redundant computations
 query_embedding_cache = {}

 def get_embedding(text):
-    """
+    """
+    Generate embeddings for a text query using the cached model.
+
+    Uses an in-memory cache to avoid redundant embedding generation for repeated queries.
+    Properly prefixes inputs with "query:" or "passage:" as required by the E5 model.
+
+    Args:
+        text (str): The query text to embed
+
+    Returns:
+        numpy.ndarray: The embedding vector or a zero vector if embedding fails
+    """
     if text in query_embedding_cache:
         return query_embedding_cache[text]

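
The pooling added here is standard masked mean pooling; the toy example below runs the same computation on dummy tensors so the role of the attention mask is explicit:

import torch

def average_pool(last_hidden_states: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
    # Zero out padding positions, then divide by the count of real tokens.
    last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
    return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]

# Dummy batch: 2 sequences, 4 positions, hidden size 3; the second sequence has 2 padding tokens.
hidden = torch.randn(2, 4, 3)
mask = torch.tensor([[1, 1, 1, 1], [1, 1, 0, 0]])
print(average_pool(hidden, mask).shape)  # torch.Size([2, 3])
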
@@ -189,6 +263,8 @@ def get_embedding(text):
         print("Model is None, returning zero embedding")
         return np.zeros((1, 384), dtype=np.float32)

+    # Format input based on text length
+    # For E5 models, "query:" prefix is for questions, "passage:" for documents
     input_text = f"query: {text}" if len(text) < 512 else f"passage: {text}"
     inputs = tokenizer(
         input_text,
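
Putting the prefix convention, the tokenizer call, and the in-memory cache together, the embedding path sketched under the same assumptions (E5-style model, 512-token truncation, L2-normalised vectors so inner-product search behaves like cosine similarity) looks roughly like this; it is an illustration, not the file's exact code:

import numpy as np
import torch
import torch.nn.functional as F

query_embedding_cache = {}

def embed_query(text, tokenizer, model):
    # Reuse previously computed embeddings for identical queries.
    if text in query_embedding_cache:
        return query_embedding_cache[text]

    # E5 convention: questions get the "query:" prefix, documents get "passage:".
    inputs = tokenizer(f"query: {text}", max_length=512, truncation=True,
                       padding=True, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)

    # Masked mean pooling, equivalent to the file's average_pool helper.
    mask = inputs["attention_mask"]
    hidden = outputs.last_hidden_state.masked_fill(~mask[..., None].bool(), 0.0)
    pooled = hidden.sum(dim=1) / mask.sum(dim=1)[..., None]

    # Assumed: normalise so FAISS inner-product scores approximate cosine similarity.
    embedding = F.normalize(pooled, p=2, dim=1).cpu().numpy().astype(np.float32)
    query_embedding_cache[text] = embedding
    return embedding
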
@@ -212,7 +288,27 @@ def get_embedding(text):
         return np.zeros((1, 384), dtype=np.float32)

 def retrieve_passages(query, faiss_index, text_chunks, metadata_dict, top_k=5, similarity_threshold=0.5):
-    """
+    """
+    Retrieve the most relevant passages for a given spiritual query.
+
+    This function:
+    1. Embeds the user query using the same model used for text chunks
+    2. Finds similar passages using the FAISS index with cosine similarity
+    3. Filters results based on similarity threshold to ensure relevance
+    4. Enriches results with metadata (title, author, publisher)
+    5. Ensures passage diversity by including only one passage per source title
+
+    Args:
+        query (str): The user's spiritual question
+        faiss_index: FAISS index containing passage embeddings
+        text_chunks (dict): Dictionary mapping IDs to text chunks and metadata
+        metadata_dict (dict): Dictionary containing publication information
+        top_k (int): Maximum number of passages to retrieve
+        similarity_threshold (float): Minimum similarity score (0.0-1.0) for retrieved passages
+
+    Returns:
+        tuple: (retrieved_passages, retrieved_sources) containing the text and source information
+    """
     try:
         print(f"\n🔍 Retrieving passages for query: {query}")
         query_embedding = get_embedding(query)
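
A condensed sketch of the retrieval loop that steps 1 to 5 of this docstring describe, assuming an inner-product FAISS index over normalised embeddings (so higher scores mean more similar) and simple dict lookups for text_chunks and metadata_dict:

import numpy as np

def retrieve(query_embedding, faiss_index, text_chunks, metadata_dict,
             top_k=5, similarity_threshold=0.5):
    # Step 2: nearest-neighbour search over the passage embeddings.
    scores, indices = faiss_index.search(np.asarray(query_embedding, dtype=np.float32), top_k)

    passages, sources, seen_titles = [], [], set()
    for score, idx in zip(scores[0], indices[0]):
        # Step 3: drop missing results and weak matches.
        if idx < 0 or score < similarity_threshold:
            continue
        # Step 4: attach (title, author, publisher) metadata.
        meta = metadata_dict.get(int(idx), {})
        source = (meta.get("title", "Unknown"), meta.get("author", "Unknown"),
                  meta.get("publisher", "Unknown"))
        # Step 5: keep at most one passage per source title for diversity.
        if source[0] in seen_titles:
            continue
        seen_titles.add(source[0])
        passages.append(text_chunks[int(idx)])
        sources.append(source)
    return passages, sources
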
@@ -244,12 +340,32 @@ def retrieve_passages(query, faiss_index, text_chunks, metadata_dict, top_k=5, similarity_threshold=0.5):
         return [], []

 def answer_with_llm(query, context=None, word_limit=100):
-    """
+    """
+    Generate an answer using the OpenAI GPT model with formatted citations.
+
+    This function:
+    1. Formats retrieved passages with source information
+    2. Creates a prompt with system and user messages
+    3. Calls the OpenAI API to generate an answer
+    4. Trims the response to the specified word limit
+
+    The system prompt ensures answers maintain appropriate respect for spiritual traditions,
+    synthesize rather than quote directly, and acknowledge gaps when relevant information
+    isn't available.
+
+    Args:
+        query (str): The user's spiritual question
+        context (list, optional): List of (source_info, text) tuples for context
+        word_limit (int): Maximum word count for the generated answer
+
+    Returns:
+        str: The generated answer or an error message
+    """
     try:
         if context:
             formatted_contexts = []
             total_chars = 0
-            max_context_chars = 4000
+            max_context_chars = 4000  # Limit context size to avoid exceeding token limits
             for (title, author, publisher), text in context:
                 remaining_space = max(0, max_context_chars - total_chars)
                 excerpt_len = min(150, remaining_space)
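
The budgeting lines visible in this hunk amount to the pattern below; the 4000-character budget and 150-character excerpts come from the diff, while the exact citation template inside the f-string is an assumption:

def build_context_block(context, max_context_chars=4000, excerpt_chars=150):
    # Greedily append short excerpts until the character budget is spent,
    # keeping the prompt safely under the model's token limit.
    formatted_contexts, total_chars = [], 0
    for (title, author, publisher), text in context:
        remaining_space = max(0, max_context_chars - total_chars)
        if remaining_space == 0:
            break
        excerpt = text[:min(excerpt_chars, remaining_space)]
        entry = f"[{title} by {author}, {publisher}] {excerpt}"
        formatted_contexts.append(entry)
        total_chars += len(entry)
    return "\n\n".join(formatted_contexts)
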
@@ -274,6 +390,7 @@ def answer_with_llm(query, context=None, word_limit=100):
             "Maintain appropriate, respectful language at all times."
             "Do not use profanity, expletives, obscenities, slurs, hate speech, sexually explicit content, or language promoting violence."
             "As a spiritual guidance system, ensure all responses reflect dignity, peace, love, and compassion consistent with spiritual traditions."
+            "Provide concise, focused answers without lists or lengthy explanations."
         )

         user_message = f"""
@@ -298,6 +415,8 @@
             max_tokens=200,
             temperature=0.7
         )
+
+        # Extract the answer and apply word limit
         answer = response.choices[0].message.content.strip()
         words = answer.split()
         if len(words) > word_limit:
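
The word-limit step added after the API call is plain whitespace truncation; roughly:

def trim_to_word_limit(answer: str, word_limit: int = 100) -> str:
    # The real code applies this to the stripped message content returned by the API.
    words = answer.split()
    return " ".join(words[:word_limit]) if len(words) > word_limit else answer
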
@@ -310,7 +429,18 @@
         return "I apologize, but I'm unable to answer at the moment."

 def format_citations(sources):
-    """
+    """
+    Format citations for display to the user.
+
+    Creates properly formatted citations for each source used in generating the answer.
+    Each citation appears on a new line with consistent formatting.
+
+    Args:
+        sources (list): List of (title, author, publisher) tuples
+
+    Returns:
+        str: Formatted citations as a string with each citation on a new line
+    """
     formatted_citations = []
     for title, author, publisher in sources:
         if publisher.endswith(('.', '!', '?')):
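
A sketch of the per-line citation formatting this docstring describes; the punctuation check mirrors the code visible in the hunk, while the 📚 prefix and "by" template are assumptions:

def format_citations_sketch(sources):
    # One citation per line; avoid doubling the final punctuation when the
    # publisher string already ends with '.', '!' or '?'.
    lines = []
    for title, author, publisher in sources:
        end = "" if publisher.endswith(('.', '!', '?')) else "."
        lines.append(f"📚 {title} by {author}, {publisher}{end}")
    return "\n".join(lines)
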
@@ -325,7 +455,21 @@ def format_citations(sources):

 @st.cache_data(ttl=3600, show_spinner=False)
 def cached_process_query(query, top_k=5, word_limit=100):
-    """
+    """
+    Process a user query with caching to avoid redundant computation.
+
+    This function is cached with a time-to-live (TTL) of 1 hour, meaning identical
+    queries within this time period will return cached results rather than
+    reprocessing, improving responsiveness.
+
+    Args:
+        query (str): The user's spiritual question
+        top_k (int): Number of sources to retrieve and use for answer generation
+        word_limit (int): Maximum word count for the generated answer
+
+    Returns:
+        dict: Dictionary containing the query, answer, and citations
+    """
     print(f"\n🔍 Processing query (cached): {query}")
     faiss_index, text_chunks, metadata_dict = cached_load_data_files()
     if faiss_index is None or text_chunks is None or metadata_dict is None:
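
For contrast with the cache_resource decorator used for the model and data files, the query-level cache uses Streamlit's data cache with a one-hour TTL; a minimal illustration with a stubbed-out body:

import streamlit as st

@st.cache_data(ttl=3600, show_spinner=False)
def cached_answer(query: str, top_k: int = 5, word_limit: int = 100) -> dict:
    # Identical (query, top_k, word_limit) calls within an hour return this
    # cached dict instead of re-running retrieval and generation.
    return {"query": query, "answer_with_rag": "...", "citations": ""}
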
@@ -350,10 +494,21 @@
     return {"query": query, "answer_with_rag": llm_answer_with_rag, "citations": sources}

 def process_query(query, top_k=5, word_limit=100):
-    """
-
+    """
+    Process a query through the RAG pipeline with proper formatting.
+
+    This is the main entry point for query processing, wrapping the cached
+    query processing function.
+
+    Args:
+        query (str): The user's spiritual question
+        top_k (int): Number of sources to retrieve and use for answer generation
+        word_limit (int): Maximum word count for the generated answer
+
+    Returns:
+        dict: Dictionary containing the query, answer, and citations
     """
     return cached_process_query(query, top_k, word_limit)

-# Alias for backward compatibility
+# Alias for backward compatibility
 load_model = cached_load_model