Update app.py
app.py (changed)
@@ -1,5 +1,12 @@
 import os
 import re
 import numpy as np
 import gc
 import torch
@@ -10,31 +17,16 @@ import pickle
 import traceback
 from typing import List, Dict, Any, Tuple, Optional, Union, Generator
 from dataclasses import dataclass
-import gradio as gr
-
-# Import dependencies (no need for pip install commands)
-import fitz # PyMuPDF
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_community.vectorstores import FAISS
 from langchain_community.embeddings import HuggingFaceEmbeddings
 from llama_cpp import Llama
 from rank_bm25 import BM25Okapi
 import nltk
 from nltk.tokenize import word_tokenize
 from nltk.corpus import stopwords
-from huggingface_hub import hf_hub_download
-
-# Download nltk resources
-try:
-    nltk.download('punkt', quiet=True)
-    nltk.download('stopwords', quiet=True)
-except:
-    print("Failed to download NLTK resources, continuing without them")
-
-# Setup directories for Spaces
-os.makedirs("pdfs", exist_ok=True)
-os.makedirs("models", exist_ok=True)
-os.makedirs("pdf_cache", exist_ok=True)
 
 # Download nltk resources
 try:
@@ -43,14 +35,6 @@ try:
 except:
     print("Failed to download NLTK resources, continuing without them")
 
-# Download model from Hugging Face Hub
-model_path = hf_hub_download(
-    repo_id="TheBloke/phi-2-GGUF",
-    filename="phi-2.Q8_0.gguf",
-    repo_type="model",
-    local_dir="models"
-)
-
 # === MEMORY MANAGEMENT UTILITIES ===
 def clear_memory():
     """Clear memory to prevent OOM errors"""
@@ -81,7 +65,7 @@ class PDFProcessor:
|
|
81 |
length_function=len,
|
82 |
is_separator_regex=False,
|
83 |
)
|
84 |
-
|
85 |
# Create cache directory
|
86 |
self.cache_dir = os.path.join(os.getcwd(), "pdf_cache")
|
87 |
os.makedirs(self.cache_dir, exist_ok=True)
|
@@ -96,12 +80,12 @@ class PDFProcessor:
|
|
96 |
"""Get the cache file path for a PDF"""
|
97 |
pdf_hash = hashlib.md5(open(pdf_path, 'rb').read(8192)).hexdigest()
|
98 |
return os.path.join(self.cache_dir, f"{os.path.basename(pdf_path)}_{pdf_hash}.pkl")
|
99 |
-
|
100 |
def _is_cached(self, pdf_path: str) -> bool:
|
101 |
"""Check if a PDF is cached"""
|
102 |
cache_path = self._get_cache_path(pdf_path)
|
103 |
return os.path.exists(cache_path)
|
104 |
-
|
105 |
def _load_from_cache(self, pdf_path: str) -> List[PDFChunk]:
|
106 |
"""Load chunks from cache"""
|
107 |
cache_path = self._get_cache_path(pdf_path)
|
@@ -110,7 +94,7 @@ class PDFProcessor:
|
|
110 |
return pickle.load(f)
|
111 |
except:
|
112 |
return None
|
113 |
-
|
114 |
def _save_to_cache(self, pdf_path: str, chunks: List[PDFChunk]) -> None:
|
115 |
"""Save chunks to cache"""
|
116 |
cache_path = self._get_cache_path(pdf_path)
|
@@ -143,15 +127,15 @@ class PDFProcessor:
|
|
143 |
if cached_chunks:
|
144 |
print(f"Loaded {len(cached_chunks)} chunks from cache for {os.path.basename(pdf_path)}")
|
145 |
return cached_chunks
|
146 |
-
|
147 |
try:
|
148 |
doc = fitz.open(pdf_path)
|
149 |
pdf_chunks = []
|
150 |
pdf_name = os.path.basename(pdf_path)
|
151 |
-
|
152 |
for page_num in range(len(doc)):
|
153 |
page = doc.load_page(page_num)
|
154 |
-
|
155 |
# Extract text with more options for better quality
|
156 |
page_text = page.get_text("text", sort=True)
|
157 |
# Try to extract text with alternative layout analysis if the text is too short
|
@@ -172,10 +156,10 @@ class PDFProcessor:
|
|
172 |
except:
|
173 |
# Fallback to default extraction
|
174 |
page_text = page.get_text("text")
|
175 |
-
|
176 |
# Clean the text
|
177 |
page_text = self.clean_text(page_text)
|
178 |
-
|
179 |
# Extract tables
|
180 |
try:
|
181 |
tables = page.find_tables()
|
@@ -191,17 +175,17 @@ class PDFProcessor:
|
|
191 |
row_cells.append(cell_text)
|
192 |
if row_cells:
|
193 |
table_text += " | ".join(row_cells) + "\n"
|
194 |
-
|
195 |
# Add table text to page text
|
196 |
if table_text.strip():
|
197 |
page_text += "\n\nTABLE:\n" + table_text
|
198 |
except Exception as table_err:
|
199 |
print(f"Warning: Skipping table extraction for page {page_num}: {str(table_err)}")
|
200 |
-
|
201 |
# Split the page text into chunks
|
202 |
if page_text.strip():
|
203 |
page_chunks = self.text_splitter.split_text(page_text)
|
204 |
-
|
205 |
# Create PDFChunk objects
|
206 |
for i, chunk_text in enumerate(page_chunks):
|
207 |
pdf_chunks.append(PDFChunk(
|
@@ -210,16 +194,16 @@ class PDFProcessor:
|
|
210 |
page_num=page_num + 1, # 1-based page numbering for humans
|
211 |
chunk_id=i
|
212 |
))
|
213 |
-
|
214 |
# Clear memory periodically
|
215 |
if page_num % 10 == 0:
|
216 |
clear_memory()
|
217 |
-
|
218 |
doc.close()
|
219 |
-
|
220 |
# Cache the results
|
221 |
self._save_to_cache(pdf_path, pdf_chunks)
|
222 |
-
|
223 |
return pdf_chunks
|
224 |
except Exception as e:
|
225 |
print(f"Error extracting text from {pdf_path}: {str(e)}")
|
@@ -237,7 +221,7 @@ class PDFProcessor:
|
|
237 |
pdf_path = os.path.join(self.pdf_dir, pdf_name)
|
238 |
return self.extract_text_from_pdf(pdf_path)
|
239 |
|
240 |
-
def process_all_pdfs(self, batch_size: int =
|
241 |
"""Process all PDFs in batches to manage memory
|
242 |
|
243 |
Args:
|
@@ -292,7 +276,14 @@ class VectorDBManager:
|
|
292 |
model_kwargs={"device": "cpu"},
|
293 |
encode_kwargs={"normalize_embeddings": True}
|
294 |
)
|
295 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
296 |
self.vectordb = None
|
297 |
# BM25 index for hybrid search
|
298 |
self.bm25_index = None
|
@@ -310,14 +301,15 @@ class VectorDBManager:
|
|
310 |
stop_words = set(stopwords.words('english'))
|
311 |
filtered_tokens = [w for w in tokens if w.isalnum() and w not in stop_words]
|
312 |
tokenized_chunks.append(filtered_tokens)
|
313 |
-
|
314 |
# Create BM25 index
|
315 |
self.bm25_index = BM25Okapi(tokenized_chunks)
|
|
|
316 |
except Exception as e:
|
317 |
print(f"Error creating BM25 index: {str(e)}")
|
318 |
print(traceback.format_exc())
|
319 |
self.bm25_index = None
|
320 |
-
|
321 |
def create_vector_db(self, chunks: List[PDFChunk]) -> None:
|
322 |
"""Create vector database from text chunks
|
323 |
|
@@ -330,19 +322,19 @@ class VectorDBManager:
|
|
330 |
return
|
331 |
|
332 |
print(f"Creating vector DB with {len(chunks)} chunks")
|
333 |
-
|
334 |
# Store chunks for hybrid search
|
335 |
self.chunks = chunks
|
336 |
-
|
337 |
# Prepare data for vector DB
|
338 |
chunk_texts = [chunk.text for chunk in chunks]
|
339 |
-
|
340 |
# Create BM25 index for hybrid search
|
341 |
print("Creating BM25 index for hybrid search")
|
342 |
self._prepare_bm25(chunks)
|
343 |
-
|
344 |
# Process in smaller batches to manage memory
|
345 |
-
batch_size =
|
346 |
all_embeddings = []
|
347 |
|
348 |
for i in range(0, len(chunk_texts), batch_size):
|
@@ -362,7 +354,7 @@ class VectorDBManager:
|
|
362 |
text_embeddings=list(zip(chunk_texts, all_embeddings)),
|
363 |
embedding=self.embedding_model
|
364 |
)
|
365 |
-
|
366 |
print(f"Vector database created with {len(chunks)} documents")
|
367 |
|
368 |
except Exception as e:
|
@@ -374,8 +366,51 @@ class VectorDBManager:
|
|
374 |
"""Format a chunk with its metadata for better context"""
|
375 |
return f"Source: {chunk.source} | Page: {chunk.page_num}\n\n{chunk.text}"
|
376 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
377 |
def hybrid_search(self, query: str, k: int = 5, alpha: float = 0.7) -> List[str]:
|
378 |
-
"""Hybrid search combining vector search and BM25
|
379 |
|
380 |
Args:
|
381 |
query: Query text
|
@@ -391,11 +426,11 @@ class VectorDBManager:
|
|
391 |
|
392 |
try:
|
393 |
# Get vector search results
|
394 |
-
vector_results = self.vectordb.similarity_search(query, k=k*
|
395 |
vector_texts = [doc.page_content for doc in vector_results]
|
396 |
-
|
397 |
-
|
398 |
-
|
399 |
# Combine with BM25 if available
|
400 |
if self.bm25_index is not None:
|
401 |
try:
|
@@ -403,14 +438,14 @@ class VectorDBManager:
|
|
403 |
query_tokens = word_tokenize(query.lower())
|
404 |
stop_words = set(stopwords.words('english'))
|
405 |
filtered_query = [w for w in query_tokens if w.isalnum() and w not in stop_words]
|
406 |
-
|
407 |
# Get BM25 scores
|
408 |
bm25_scores = self.bm25_index.get_scores(filtered_query)
|
409 |
-
|
410 |
# Combine scores (normalized)
|
411 |
combined_results = []
|
412 |
seen_texts = set()
|
413 |
-
|
414 |
# First add vector results with their positions as scores
|
415 |
for i, text in enumerate(vector_texts):
|
416 |
if text not in seen_texts:
|
@@ -424,26 +459,34 @@ class VectorDBManager:
|
|
424 |
# Normalize BM25 score
|
425 |
bm25_score = bm25_scores[j] / max(bm25_scores) if max(bm25_scores) > 0 else 0
|
426 |
combined_score = alpha * vector_score + (1-alpha) * bm25_score
|
427 |
-
|
428 |
combined_results.append((chunk, combined_score))
|
429 |
break
|
430 |
-
|
431 |
# Sort by combined score
|
432 |
combined_results.sort(key=lambda x: x[1], reverse=True)
|
433 |
-
|
434 |
-
# Get top k results
|
435 |
-
|
436 |
-
|
437 |
-
# Format results with metadata
|
438 |
-
final_results = [self._format_chunk_with_metadata(chunk) for chunk in top_chunks]
|
439 |
except Exception as e:
|
440 |
print(f"Error in BM25 scoring: {str(e)}")
|
441 |
# Fallback to vector search results
|
442 |
-
|
|
|
443 |
else:
|
444 |
# Just use vector search results if BM25 is not available
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
445 |
final_results = vector_texts[:k]
|
446 |
-
|
447 |
return final_results
|
448 |
except Exception as e:
|
449 |
print(f"Error during hybrid search: {str(e)}")
|
@@ -458,7 +501,7 @@ class QueryExpander:
|
|
458 |
llm_model: LLM model for query expansion
|
459 |
"""
|
460 |
self.llm = llm_model
|
461 |
-
|
462 |
def expand_query(self, query: str) -> str:
|
463 |
"""Expand the query using the LLM to improve retrieval
|
464 |
|
@@ -469,46 +512,44 @@ class QueryExpander:
|
|
469 |
Expanded query
|
470 |
"""
|
471 |
try:
|
472 |
-
prompt = f"""I need to search for documents related to this question: "{query}"
|
473 |
-
|
474 |
Please help me expand this query by identifying key concepts, synonyms, and related terms that might be used in the documents.
|
475 |
-
Return only the expanded search query, without any explanations or additional text.
|
476 |
|
477 |
-
Expanded query:"""
|
478 |
-
|
479 |
expanded = self.llm.generate(prompt, max_tokens=100, temperature=0.3)
|
480 |
-
|
481 |
# Combine original and expanded
|
482 |
combined = f"{query} {expanded}"
|
483 |
-
|
484 |
# Limit length
|
485 |
if len(combined) > 300:
|
486 |
combined = combined[:300]
|
487 |
-
|
488 |
return combined
|
489 |
except:
|
490 |
# Return original query if expansion fails
|
491 |
return query
|
492 |
|
493 |
# === LLM SETUP ===
|
494 |
-
class
|
495 |
-
def __init__(self, model_path: str =
|
496 |
-
"""Initialize
|
497 |
|
498 |
Args:
|
499 |
model_path: Path to the model file
|
500 |
"""
|
501 |
try:
|
502 |
-
# Initialize
|
503 |
self.llm = Llama(
|
504 |
model_path=model_path,
|
505 |
-
n_ctx=
|
506 |
-
n_batch=
|
507 |
-
n_gpu_layers=0, # Run on CPU for
|
508 |
verbose=False
|
509 |
)
|
510 |
except Exception as e:
|
511 |
-
print(f"Error initializing
|
512 |
raise
|
513 |
|
514 |
def generate(self, prompt: str,
|
@@ -516,7 +557,7 @@ class Phi2Model:
|
|
516 |
temperature: float = 0.7,
|
517 |
top_p: float = 0.9,
|
518 |
stream: bool = False) -> Union[str, Generator[str, None, None]]:
|
519 |
-
"""Generate text using
|
520 |
|
521 |
Args:
|
522 |
prompt: Input prompt
|
@@ -539,7 +580,7 @@ class Phi2Model:
|
|
539 |
top_p=top_p,
|
540 |
echo=False
|
541 |
)
|
542 |
-
return output["choices"][0]["text"]
|
543 |
except Exception as e:
|
544 |
print(f"Error generating text: {str(e)}")
|
545 |
return "Error: Could not generate response."
|
@@ -548,7 +589,7 @@ class Phi2Model:
|
|
548 |
max_tokens: int = 512,
|
549 |
temperature: float = 0.7,
|
550 |
top_p: float = 0.9) -> Generator[str, None, None]:
|
551 |
-
"""Stream text generation using
|
552 |
|
553 |
Args:
|
554 |
prompt: Input prompt
|
@@ -572,11 +613,96 @@ class Phi2Model:
|
|
572 |
response += token
|
573 |
yield response
|
574 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
575 |
# === RAG SYSTEM ===
|
576 |
class RAGSystem:
|
577 |
def __init__(self, pdf_processor: PDFProcessor,
|
578 |
vector_db: VectorDBManager,
|
579 |
-
model:
|
580 |
"""Initialize RAG system
|
581 |
|
582 |
Args:
|
@@ -588,11 +714,12 @@ class RAGSystem:
|
|
588 |
self.vector_db = vector_db
|
589 |
self.model = model
|
590 |
self.query_expander = QueryExpander(model)
|
|
|
591 |
self.is_initialized = False
|
592 |
|
593 |
def process_documents(self) -> bool:
|
594 |
"""Process all documents and create vector database
|
595 |
-
|
596 |
Returns:
|
597 |
True if successful, False otherwise
|
598 |
"""
|
@@ -602,22 +729,22 @@ class RAGSystem:
|
|
602 |
if not chunks:
|
603 |
print("No chunks were extracted from PDFs")
|
604 |
return False
|
605 |
-
|
606 |
print(f"Total chunks extracted: {len(chunks)}")
|
607 |
|
608 |
# Create vector database
|
609 |
print("Creating vector database...")
|
610 |
self.vector_db.create_vector_db(chunks)
|
611 |
-
|
612 |
# Verify success
|
613 |
if self.vector_db.vectordb is None:
|
614 |
print("Failed to create vector database")
|
615 |
return False
|
616 |
-
|
617 |
# Set initialization flag
|
618 |
self.is_initialized = True
|
619 |
return True
|
620 |
-
|
621 |
except Exception as e:
|
622 |
print(f"Error processing documents: {str(e)}")
|
623 |
print(traceback.format_exc())
|
@@ -638,8 +765,8 @@ class RAGSystem:
|
|
638 |
for i, context in enumerate(contexts):
|
639 |
formatted_contexts += f"[CONTEXT {i+1}]\n{context}\n\n"
|
640 |
|
641 |
-
# Create prompt with
|
642 |
-
prompt = f"""You are an AI assistant that answers questions based on the provided context information.
|
643 |
|
644 |
User Query: {query}
|
645 |
|
@@ -647,27 +774,26 @@ Below are relevant passages from documents that might help answer the query:
|
|
647 |
|
648 |
{formatted_contexts}
|
649 |
|
650 |
-
Using ONLY the information provided in the context above, provide a comprehensive answer to the user's query.
|
651 |
If the provided context doesn't contain relevant information to answer the query, clearly state: "I don't have enough information in the provided context to answer this question."
|
652 |
|
653 |
Do not use any prior knowledge that is not contained in the provided context.
|
654 |
If quoting from the context, mention the source document and page number.
|
655 |
-
Organize your answer in a clear, coherent manner.
|
656 |
-
|
657 |
-
Answer:"""
|
658 |
return prompt
|
659 |
|
660 |
def answer_query(self, query: str, k: int = 5, max_tokens: int = 512,
|
661 |
-
temperature: float = 0.7, stream: bool = False) -> Union[str, Generator[str, None, None]]:
|
662 |
-
"""Answer a query using RAG with query expansion
|
663 |
-
|
664 |
Args:
|
665 |
query: User query
|
666 |
k: Number of contexts to retrieve
|
667 |
max_tokens: Maximum number of tokens to generate
|
668 |
temperature: Temperature for generation
|
669 |
stream: Whether to stream the output
|
670 |
-
|
|
|
671 |
Returns:
|
672 |
Answer text or generator if streaming
|
673 |
"""
|
@@ -679,7 +805,7 @@ Answer:"""
|
|
679 |
# Expand query for better retrieval
|
680 |
expanded_query = self.query_expander.expand_query(query)
|
681 |
print(f"Expanded query: {expanded_query}")
|
682 |
-
|
683 |
# Retrieve relevant contexts using hybrid search
|
684 |
contexts = self.vector_db.hybrid_search(expanded_query, k=k)
|
685 |
|
@@ -689,240 +815,293 @@ Answer:"""
|
|
689 |
# Generate prompt with improved instructions
|
690 |
prompt = self.generate_prompt(query, contexts)
|
691 |
|
692 |
-
#
|
693 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
694 |
prompt,
|
695 |
max_tokens=max_tokens,
|
696 |
temperature=temperature,
|
697 |
-
stream=
|
698 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
699 |
except Exception as e:
|
700 |
print(f"Error answering query: {str(e)}")
|
701 |
print(traceback.format_exc())
|
702 |
return f"Error processing your query: {str(e)}"
|
703 |
|
704 |
-
# === GRADIO
|
705 |
-
class
|
706 |
def __init__(self, rag_system: RAGSystem):
|
707 |
-
"""Initialize
|
708 |
|
709 |
Args:
|
710 |
rag_system: RAG system instance
|
711 |
"""
|
712 |
self.rag_system = rag_system
|
|
|
713 |
self.interface = None
|
714 |
-
self.is_processing = False
|
715 |
|
716 |
-
def
|
717 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
718 |
try:
|
719 |
-
|
720 |
-
|
721 |
|
|
|
722 |
for file in files:
|
723 |
-
|
724 |
-
shutil.copy(file.name, destination)
|
725 |
-
uploaded_files.append(os.path.basename(file.name))
|
726 |
|
727 |
-
|
728 |
-
pdf_files = [f for f in os.listdir("pdfs") if f.lower().endswith('.pdf')]
|
729 |
-
|
730 |
-
if not pdf_files:
|
731 |
-
return "No PDF files were uploaded successfully."
|
732 |
-
|
733 |
-
return f"Successfully uploaded {len(uploaded_files)} files: {', '.join(uploaded_files)}"
|
734 |
except Exception as e:
|
735 |
return f"Error uploading files: {str(e)}"
|
736 |
|
737 |
-
def process_documents(self):
|
738 |
-
"""Process
|
739 |
|
740 |
Returns:
|
741 |
Status message
|
742 |
"""
|
743 |
-
if self.is_processing:
|
744 |
-
return "Document processing is already in progress. Please wait."
|
745 |
-
|
746 |
try:
|
747 |
-
|
|
|
|
|
|
|
|
|
|
|
748 |
start_time = time.time()
|
749 |
-
|
750 |
success = self.rag_system.process_documents()
|
751 |
-
|
752 |
-
|
753 |
-
self.is_processing = False
|
754 |
-
|
755 |
if success:
|
756 |
-
return f"
|
757 |
else:
|
758 |
-
return "Failed to process documents. Check the logs for
|
759 |
except Exception as e:
|
760 |
-
self.is_processing = False
|
761 |
return f"Error processing documents: {str(e)}"
|
762 |
|
763 |
-
def answer_query(self, query
|
764 |
-
|
|
|
|
|
765 |
|
766 |
Args:
|
767 |
query: User query
|
|
|
768 |
k: Number of contexts to retrieve
|
769 |
-
|
770 |
-
|
771 |
|
772 |
Returns:
|
773 |
-
Answer
|
774 |
"""
|
775 |
-
if not query.strip():
|
776 |
-
return "Please enter a
|
777 |
-
|
778 |
-
try:
|
779 |
-
return self.rag_system.answer_query(
|
780 |
-
query,
|
781 |
-
k=k,
|
782 |
-
max_tokens=max_tokens,
|
783 |
-
temperature=temperature,
|
784 |
-
stream=False
|
785 |
-
)
|
786 |
-
except Exception as e:
|
787 |
-
return f"Error answering query: {str(e)}"
|
788 |
|
789 |
-
|
790 |
-
|
791 |
-
|
792 |
-
Args:
|
793 |
-
query: User query
|
794 |
-
k: Number of contexts to retrieve
|
795 |
-
max_tokens: Maximum number of tokens to generate
|
796 |
-
temperature: Sampling temperature
|
797 |
|
798 |
-
Yields:
|
799 |
-
Generated text
|
800 |
-
"""
|
801 |
-
if not query.strip():
|
802 |
-
yield "Please enter a question."
|
803 |
-
return
|
804 |
-
|
805 |
try:
|
806 |
-
|
807 |
-
|
808 |
-
|
809 |
-
|
810 |
-
|
811 |
-
|
812 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
813 |
except Exception as e:
|
814 |
-
|
815 |
-
|
816 |
-
|
817 |
-
|
818 |
-
|
819 |
-
|
820 |
-
|
821 |
-
|
822 |
-
|
823 |
-
|
824 |
-
|
825 |
-
|
826 |
-
|
827 |
-
|
828 |
-
|
829 |
-
|
830 |
-
|
831 |
-
|
832 |
-
upload_button.click(self.upload_file, inputs=[pdf_files], outputs=upload_output)
|
833 |
-
|
834 |
-
process_button = gr.Button("Process Documents", variant="primary")
|
835 |
-
process_output = gr.Textbox(label="Processing Status", lines=2)
|
836 |
-
process_button.click(self.process_documents, inputs=[], outputs=process_output)
|
837 |
-
|
838 |
-
with gr.Tab("Query"):
|
839 |
-
with gr.Row():
|
840 |
-
with gr.Column():
|
841 |
-
query_input = gr.Textbox(
|
842 |
-
label="Question",
|
843 |
-
lines=3,
|
844 |
-
placeholder="Ask a question about your documents..."
|
845 |
-
)
|
846 |
-
with gr.Row():
|
847 |
-
k_slider = gr.Slider(
|
848 |
-
minimum=1,
|
849 |
-
maximum=10,
|
850 |
-
value=3,
|
851 |
-
step=1,
|
852 |
-
label="Number of Contexts"
|
853 |
)
|
854 |
-
|
855 |
-
|
856 |
-
|
857 |
-
value=400,
|
858 |
-
step=50,
|
859 |
-
label="Max Tokens"
|
860 |
)
|
861 |
-
|
862 |
-
|
863 |
-
|
864 |
-
|
865 |
-
label="
|
|
|
|
|
866 |
)
|
867 |
-
|
868 |
-
|
869 |
-
|
870 |
-
|
871 |
-
|
872 |
-
|
873 |
-
|
874 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
875 |
)
|
876 |
-
|
877 |
-
|
878 |
-
|
879 |
-
|
880 |
-
|
881 |
-
|
882 |
-
|
|
|
|
|
|
|
|
|
883 |
)
|
884 |
-
|
885 |
-
|
886 |
-
|
887 |
-
|
888 |
-
|
889 |
-
|
890 |
-
|
891 |
-
|
892 |
-
|
893 |
-
|
894 |
-
|
895 |
-
|
896 |
-
|
897 |
-
|
898 |
-
|
899 |
-
|
900 |
-
|
901 |
-
|
902 |
-
self.
|
903 |
-
|
|
|
|
|
|
|
904 |
|
905 |
# === MAIN APPLICATION ===
|
906 |
def main():
|
907 |
-
|
908 |
-
|
909 |
-
|
910 |
-
|
911 |
-
|
912 |
-
|
913 |
-
|
914 |
-
|
915 |
-
|
916 |
-
|
917 |
-
|
918 |
-
|
919 |
-
|
920 |
-
|
921 |
-
|
922 |
-
|
923 |
-
|
924 |
-
|
925 |
-
print(traceback.format_exc())
|
926 |
|
927 |
if __name__ == "__main__":
|
928 |
main()
|
|
|
+!pip install -q pymupdf langchain langchain_community sentence-transformers faiss-cpu llama-cpp-python gradio transformers rank_bm25
+!pip install -q git+https://github.com/chroma-core/chroma.git
+!mkdir -p pdfs
+!mkdir -p models
+!wget -q -O models/mistral-7b-instruct-v0.3.Q8_0.gguf https://huggingface.co/MaziyarPanahi/Mistral-7B-Instruct-v0.3-GGUF/resolve/main/Mistral-7B-Instruct-v0.3.Q8_0.gguf
+
 import os
 import re
+import fitz # PyMuPDF
 import numpy as np
 import gc
 import torch
 import traceback
 from typing import List, Dict, Any, Tuple, Optional, Union, Generator
 from dataclasses import dataclass
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_community.vectorstores import FAISS
 from langchain_community.embeddings import HuggingFaceEmbeddings
 from llama_cpp import Llama
+import gradio as gr
 from rank_bm25 import BM25Okapi
 import nltk
 from nltk.tokenize import word_tokenize
 from nltk.corpus import stopwords
+from sentence_transformers import CrossEncoder
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
30 |
|
31 |
# Download nltk resources
|
32 |
try:
|
|
|
35 |
except:
|
36 |
print("Failed to download NLTK resources, continuing without them")
|
37 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
38 |
# === MEMORY MANAGEMENT UTILITIES ===
|
39 |
def clear_memory():
|
40 |
"""Clear memory to prevent OOM errors"""
|
|
|
65 |
length_function=len,
|
66 |
is_separator_regex=False,
|
67 |
)
|
68 |
+
|
69 |
# Create cache directory
|
70 |
self.cache_dir = os.path.join(os.getcwd(), "pdf_cache")
|
71 |
os.makedirs(self.cache_dir, exist_ok=True)
|
|
|
80 |
"""Get the cache file path for a PDF"""
|
81 |
pdf_hash = hashlib.md5(open(pdf_path, 'rb').read(8192)).hexdigest()
|
82 |
return os.path.join(self.cache_dir, f"{os.path.basename(pdf_path)}_{pdf_hash}.pkl")
|
83 |
+
|
84 |
def _is_cached(self, pdf_path: str) -> bool:
|
85 |
"""Check if a PDF is cached"""
|
86 |
cache_path = self._get_cache_path(pdf_path)
|
87 |
return os.path.exists(cache_path)
|
88 |
+
|
89 |
def _load_from_cache(self, pdf_path: str) -> List[PDFChunk]:
|
90 |
"""Load chunks from cache"""
|
91 |
cache_path = self._get_cache_path(pdf_path)
|
|
|
94 |
return pickle.load(f)
|
95 |
except:
|
96 |
return None
|
97 |
+
|
98 |
def _save_to_cache(self, pdf_path: str, chunks: List[PDFChunk]) -> None:
|
99 |
"""Save chunks to cache"""
|
100 |
cache_path = self._get_cache_path(pdf_path)
|
|
|
127 |
if cached_chunks:
|
128 |
print(f"Loaded {len(cached_chunks)} chunks from cache for {os.path.basename(pdf_path)}")
|
129 |
return cached_chunks
|
130 |
+
|
131 |
try:
|
132 |
doc = fitz.open(pdf_path)
|
133 |
pdf_chunks = []
|
134 |
pdf_name = os.path.basename(pdf_path)
|
135 |
+
|
136 |
for page_num in range(len(doc)):
|
137 |
page = doc.load_page(page_num)
|
138 |
+
|
139 |
# Extract text with more options for better quality
|
140 |
page_text = page.get_text("text", sort=True)
|
141 |
# Try to extract text with alternative layout analysis if the text is too short
|
|
|
156 |
except:
|
157 |
# Fallback to default extraction
|
158 |
page_text = page.get_text("text")
|
159 |
+
|
160 |
# Clean the text
|
161 |
page_text = self.clean_text(page_text)
|
162 |
+
|
163 |
# Extract tables
|
164 |
try:
|
165 |
tables = page.find_tables()
|
|
|
175 |
row_cells.append(cell_text)
|
176 |
if row_cells:
|
177 |
table_text += " | ".join(row_cells) + "\n"
|
178 |
+
|
179 |
# Add table text to page text
|
180 |
if table_text.strip():
|
181 |
page_text += "\n\nTABLE:\n" + table_text
|
182 |
except Exception as table_err:
|
183 |
print(f"Warning: Skipping table extraction for page {page_num}: {str(table_err)}")
|
184 |
+
|
185 |
# Split the page text into chunks
|
186 |
if page_text.strip():
|
187 |
page_chunks = self.text_splitter.split_text(page_text)
|
188 |
+
|
189 |
# Create PDFChunk objects
|
190 |
for i, chunk_text in enumerate(page_chunks):
|
191 |
pdf_chunks.append(PDFChunk(
|
|
|
194 |
page_num=page_num + 1, # 1-based page numbering for humans
|
195 |
chunk_id=i
|
196 |
))
|
197 |
+
|
198 |
# Clear memory periodically
|
199 |
if page_num % 10 == 0:
|
200 |
clear_memory()
|
201 |
+
|
202 |
doc.close()
|
203 |
+
|
204 |
# Cache the results
|
205 |
self._save_to_cache(pdf_path, pdf_chunks)
|
206 |
+
|
207 |
return pdf_chunks
|
208 |
except Exception as e:
|
209 |
print(f"Error extracting text from {pdf_path}: {str(e)}")
|
|
|
221 |
pdf_path = os.path.join(self.pdf_dir, pdf_name)
|
222 |
return self.extract_text_from_pdf(pdf_path)
|
223 |
|
224 |
+
def process_all_pdfs(self, batch_size: int = 3) -> List[PDFChunk]:
|
225 |
"""Process all PDFs in batches to manage memory
|
226 |
|
227 |
Args:
|
|
|
             model_kwargs={"device": "cpu"},
             encode_kwargs={"normalize_embeddings": True}
         )
+
+        # Initialize cross-encoder for re-ranking
+        try:
+            self.cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
+        except Exception as e:
+            print(f"Error initializing cross-encoder: {str(e)}")
+            self.cross_encoder = None
+
         self.vectordb = None
         # BM25 index for hybrid search
         self.bm25_index = None
                 stop_words = set(stopwords.words('english'))
                 filtered_tokens = [w for w in tokens if w.isalnum() and w not in stop_words]
                 tokenized_chunks.append(filtered_tokens)
+
             # Create BM25 index
             self.bm25_index = BM25Okapi(tokenized_chunks)
+            self.tokenized_chunks = tokenized_chunks
         except Exception as e:
             print(f"Error creating BM25 index: {str(e)}")
             print(traceback.format_exc())
             self.bm25_index = None
+
     def create_vector_db(self, chunks: List[PDFChunk]) -> None:
         """Create vector database from text chunks
315 |
|
|
|
322 |
return
|
323 |
|
324 |
print(f"Creating vector DB with {len(chunks)} chunks")
|
325 |
+
|
326 |
# Store chunks for hybrid search
|
327 |
self.chunks = chunks
|
328 |
+
|
329 |
# Prepare data for vector DB
|
330 |
chunk_texts = [chunk.text for chunk in chunks]
|
331 |
+
|
332 |
# Create BM25 index for hybrid search
|
333 |
print("Creating BM25 index for hybrid search")
|
334 |
self._prepare_bm25(chunks)
|
335 |
+
|
336 |
# Process in smaller batches to manage memory
|
337 |
+
batch_size = 32
|
338 |
all_embeddings = []
|
339 |
|
340 |
for i in range(0, len(chunk_texts), batch_size):
|
|
|
354 |
text_embeddings=list(zip(chunk_texts, all_embeddings)),
|
355 |
embedding=self.embedding_model
|
356 |
)
|
357 |
+
|
358 |
print(f"Vector database created with {len(chunks)} documents")
|
359 |
|
360 |
except Exception as e:
|
|
|
         """Format a chunk with its metadata for better context"""
         return f"Source: {chunk.source} | Page: {chunk.page_num}\n\n{chunk.text}"
 
+    def _rerank_with_cross_encoder(self, query: str, chunks: List[PDFChunk], k: int = 5) -> List[PDFChunk]:
+        """Re-rank chunks using cross-encoder
+
+        Args:
+            query: User query
+            chunks: List of retrieved chunks
+            k: Number of top chunks to return
+
+        Returns:
+            Re-ranked chunks
+        """
+        if not self.cross_encoder or not chunks:
+            return chunks[:k] if len(chunks) > k else chunks
+
+        try:
+            # Prepare passage pairs for re-ranking
+            pairs = [[query, chunk.text] for chunk in chunks]
+
+            # Score passages in smaller batches to prevent OOM
+            batch_size = 16
+            all_scores = []
+
+            for i in range(0, len(pairs), batch_size):
+                batch_pairs = pairs[i:i+batch_size]
+                batch_scores = self.cross_encoder.predict(batch_pairs)
+                all_scores.extend(batch_scores)
+
+                # Clear memory
+                clear_memory()
+
+            # Create chunk-score pairs
+            scored_chunks = list(zip(chunks, all_scores))
+
+            # Sort by score
+            scored_chunks.sort(key=lambda x: x[1], reverse=True)
+
+            # Return top k chunks
+            return [chunk for chunk, score in scored_chunks[:k]]
+        except Exception as e:
+            print(f"Error during cross-encoder re-ranking: {str(e)}")
+            # Fallback to original chunks
+            return chunks[:k] if len(chunks) > k else chunks
+
     def hybrid_search(self, query: str, k: int = 5, alpha: float = 0.7) -> List[str]:
+        """Hybrid search combining vector search and BM25 with cross-encoder re-ranking
 
         Args:
             query: Query text
 
         try:
             # Get vector search results
+            vector_results = self.vectordb.similarity_search(query, k=k*3) # Get more for re-ranking
             vector_texts = [doc.page_content for doc in vector_results]
+
+            retrieved_chunks = []
+
             # Combine with BM25 if available
             if self.bm25_index is not None:
                 try:
                     query_tokens = word_tokenize(query.lower())
                     stop_words = set(stopwords.words('english'))
                     filtered_query = [w for w in query_tokens if w.isalnum() and w not in stop_words]
+
                     # Get BM25 scores
                     bm25_scores = self.bm25_index.get_scores(filtered_query)
+
                     # Combine scores (normalized)
                     combined_results = []
                     seen_texts = set()
+
                     # First add vector results with their positions as scores
                     for i, text in enumerate(vector_texts):
                         if text not in seen_texts:
                                 # Normalize BM25 score
                                 bm25_score = bm25_scores[j] / max(bm25_scores) if max(bm25_scores) > 0 else 0
                                 combined_score = alpha * vector_score + (1-alpha) * bm25_score
+
                                 combined_results.append((chunk, combined_score))
                                 break
+
                     # Sort by combined score
                     combined_results.sort(key=lambda x: x[1], reverse=True)
+
+                    # Get top k*2 results for re-ranking
+                    retrieved_chunks = [item[0] for item in combined_results[:k*2]]
                 except Exception as e:
                     print(f"Error in BM25 scoring: {str(e)}")
                     # Fallback to vector search results
+                    retrieved_chunks = [self.chunks[i] for i, text in enumerate(self.chunks)
+                                        if text.text in vector_texts[:k*2]]
             else:
                 # Just use vector search results if BM25 is not available
+                retrieved_chunks = [self.chunks[i] for i, chunk in enumerate(self.chunks)
+                                    if chunk.text in vector_texts[:k*2]]
+
+            # Re-rank with cross-encoder
+            if retrieved_chunks:
+                reranked_chunks = self._rerank_with_cross_encoder(query, retrieved_chunks, k)
+                # Format results with metadata
+                final_results = [self._format_chunk_with_metadata(chunk) for chunk in reranked_chunks]
+            else:
+                # Fallback to basic results
+                final_results = vector_texts[:k]
+
             return final_results
         except Exception as e:
             print(f"Error during hybrid search: {str(e)}")
|
|
             llm_model: LLM model for query expansion
         """
         self.llm = llm_model
+
     def expand_query(self, query: str) -> str:
         """Expand the query using the LLM to improve retrieval
 
             Expanded query
         """
         try:
+            prompt = f"""<s>[INST] I need to search for documents related to this question: "{query}"
+
 Please help me expand this query by identifying key concepts, synonyms, and related terms that might be used in the documents.
+Return only the expanded search query, without any explanations or additional text. [/INST]"""
 
             expanded = self.llm.generate(prompt, max_tokens=100, temperature=0.3)
+
             # Combine original and expanded
             combined = f"{query} {expanded}"
+
             # Limit length
             if len(combined) > 300:
                 combined = combined[:300]
+
             return combined
         except:
             # Return original query if expansion fails
             return query
 
 # === LLM SETUP ===
+class MistralModel:
+    def __init__(self, model_path: str = "models/mistral-7b-instruct-v0.3.Q8_0.gguf"):
+        """Initialize Mistral model
 
         Args:
             model_path: Path to the model file
         """
         try:
+            # Initialize Mistral with llama.cpp
             self.llm = Llama(
                 model_path=model_path,
+                n_ctx=4096, # Increased context window for better reasoning
+                n_batch=256, # Batch size to save memory
+                n_gpu_layers=0, # Run on CPU only for Colab free tier
                 verbose=False
             )
         except Exception as e:
+            print(f"Error initializing Mistral model: {str(e)}")
             raise
 
     def generate(self, prompt: str,
                  temperature: float = 0.7,
                  top_p: float = 0.9,
                  stream: bool = False) -> Union[str, Generator[str, None, None]]:
+        """Generate text using Mistral
 
         Args:
             prompt: Input prompt
                 top_p=top_p,
                 echo=False
             )
+            return output["choices"][0]["text"].strip()
         except Exception as e:
             print(f"Error generating text: {str(e)}")
             return "Error: Could not generate response."
                        max_tokens: int = 512,
                        temperature: float = 0.7,
                        top_p: float = 0.9) -> Generator[str, None, None]:
+        """Stream text generation using Mistral
 
         Args:
             prompt: Input prompt
                 response += token
                 yield response
 
+# === SELF-CHECKING ===
+class SelfChecker:
+    def __init__(self, llm_model):
+        """Initialize self-checker for improved response quality
+
+        Args:
+            llm_model: LLM model to use for checking
+        """
+        self.llm = llm_model
+
+    def check_answer(self, query: str, initial_answer: str, contexts: List[str]) -> str:
+        """Check if answer is correct and complete based on the contexts
+
+        Args:
+            query: User query
+            initial_answer: Initial generated answer
+            contexts: Retrieved contexts used to generate the answer
+
+        Returns:
+            Improved answer after reflection
+        """
+        # Guard against very long inputs that could cause runtime disconnection
+        # Limit contexts to prevent excessive token usage
+        max_contexts_len = 4000
+        contexts_text = "\n\n".join(contexts)
+        if len(contexts_text) > max_contexts_len:
+            # Truncate while keeping as many complete contexts as possible
+            truncated_contexts = []
+            current_len = 0
+            for ctx in contexts:
+                if current_len + len(ctx) + 2 <= max_contexts_len:
+                    truncated_contexts.append(ctx)
+                    current_len += len(ctx) + 2
+                else:
+                    break
+            contexts_text = "\n\n".join(truncated_contexts)
+
+        # Check if we should skip reflection to prevent disconnection
+        if len(initial_answer) + len(contexts_text) + len(query) > 6000:
+            print("Skipping reflection due to excessive input length")
+            return initial_answer
+
+        try:
+            prompt = f"""<s>[INST] You're an AI assistant tasked with evaluating and improving an answer to a user query.
+
+QUERY: {query}
+
+INITIAL ANSWER: {initial_answer}
+
+AVAILABLE CONTEXTS:
+{contexts_text}
+
+First, carefully check if the initial answer:
+1. Is factually accurate based on the provided contexts
+2. Addresses all aspects of the user's query
+3. Contains any information not supported by the contexts
+4. Misses important information from the contexts
+
+Then improve the answer to fix any issues identified. The final answer should:
+- Be comprehensive and accurate based ONLY on the contexts
+- Not include any unsupported information
+- Be well-structured and clear
+- Cite specific sources when appropriate (e.g., "According to [Source, Page X]...")
+
+Provide ONLY the improved answer without explanations about your reasoning process. [/INST]"""
+
+            # We use slightly lower temperature for more focused reflection
+            improved_answer = self.llm.generate(
+                prompt,
+                max_tokens=1024,
+                temperature=0.3,
+                stream=False
+            )
+
+            # If reflection produced nothing useful, return original answer
+            if not improved_answer or len(improved_answer) < 10:
+                return initial_answer
+
+            return improved_answer
+
+        except Exception as e:
+            # On any error, return the original answer to ensure robustness
+            print(f"Self-check error: {str(e)}")
+            return initial_answer
+
 # === RAG SYSTEM ===
 class RAGSystem:
     def __init__(self, pdf_processor: PDFProcessor,
                  vector_db: VectorDBManager,
+                 model: MistralModel):
         """Initialize RAG system
 
         Args:
         self.vector_db = vector_db
         self.model = model
         self.query_expander = QueryExpander(model)
+        self.self_checker = SelfChecker(model)
         self.is_initialized = False
 
     def process_documents(self) -> bool:
         """Process all documents and create vector database
+
         Returns:
             True if successful, False otherwise
         """
|
729 |
if not chunks:
|
730 |
print("No chunks were extracted from PDFs")
|
731 |
return False
|
732 |
+
|
733 |
print(f"Total chunks extracted: {len(chunks)}")
|
734 |
|
735 |
# Create vector database
|
736 |
print("Creating vector database...")
|
737 |
self.vector_db.create_vector_db(chunks)
|
738 |
+
|
739 |
# Verify success
|
740 |
if self.vector_db.vectordb is None:
|
741 |
print("Failed to create vector database")
|
742 |
return False
|
743 |
+
|
744 |
# Set initialization flag
|
745 |
self.is_initialized = True
|
746 |
return True
|
747 |
+
|
748 |
except Exception as e:
|
749 |
print(f"Error processing documents: {str(e)}")
|
750 |
print(traceback.format_exc())
|
|
|
         for i, context in enumerate(contexts):
             formatted_contexts += f"[CONTEXT {i+1}]\n{context}\n\n"
 
+        # Create prompt with Mistral's chat format
+        prompt = f"""<s>[INST] You are an AI assistant that answers questions based on the provided context information.
 
 User Query: {query}
 
 
 {formatted_contexts}
 
+Using ONLY the information provided in the context above, provide a comprehensive answer to the user's query.
 If the provided context doesn't contain relevant information to answer the query, clearly state: "I don't have enough information in the provided context to answer this question."
 
 Do not use any prior knowledge that is not contained in the provided context.
 If quoting from the context, mention the source document and page number.
+Organize your answer in a clear, coherent manner. [/INST]"""
         return prompt
 
     def answer_query(self, query: str, k: int = 5, max_tokens: int = 512,
+                     temperature: float = 0.7, stream: bool = False, enable_reflection: bool = True) -> Union[str, Generator[str, None, None]]:
+        """Answer a query using RAG with query expansion and self-checking
+
         Args:
             query: User query
             k: Number of contexts to retrieve
             max_tokens: Maximum number of tokens to generate
             temperature: Temperature for generation
             stream: Whether to stream the output
+            enable_reflection: Whether to enable self-reflection for better answers
+
         Returns:
             Answer text or generator if streaming
         """
             # Expand query for better retrieval
             expanded_query = self.query_expander.expand_query(query)
             print(f"Expanded query: {expanded_query}")
+
             # Retrieve relevant contexts using hybrid search
             contexts = self.vector_db.hybrid_search(expanded_query, k=k)
 
             # Generate prompt with improved instructions
             prompt = self.generate_prompt(query, contexts)
 
+            # For streaming, we can't do self-checking
+            if stream:
+                return self.model.generate(
+                    prompt,
+                    max_tokens=max_tokens,
+                    temperature=temperature,
+                    stream=True
+                )
+
+            # Generate initial answer
+            initial_answer = self.model.generate(
                 prompt,
                 max_tokens=max_tokens,
                 temperature=temperature,
+                stream=False
             )
+
+            # Perform self-checking if enabled and initial answer exists
+            if enable_reflection and initial_answer and len(initial_answer) > 10:
+                try:
+                    print("Performing self-checking to improve answer quality...")
+                    improved_answer = self.self_checker.check_answer(query, initial_answer, contexts)
+                    return improved_answer
+                except Exception as e:
+                    print(f"Error during self-checking: {str(e)}")
+                    # Fallback to initial answer if self-checking fails
+                    return initial_answer
+            else:
+                return initial_answer
+
         except Exception as e:
             print(f"Error answering query: {str(e)}")
             print(traceback.format_exc())
             return f"Error processing your query: {str(e)}"
852 |
|
853 |
+
# === GRADIO UI ===
|
854 |
+
class RAGUI:
|
855 |
def __init__(self, rag_system: RAGSystem):
|
856 |
+
"""Initialize RAG UI
|
857 |
|
858 |
Args:
|
859 |
rag_system: RAG system instance
|
860 |
"""
|
861 |
self.rag_system = rag_system
|
862 |
+
self.pdf_dir = rag_system.pdf_processor.pdf_dir
|
863 |
self.interface = None
|
|
|
864 |
|
865 |
+
def _list_uploaded_pdfs(self) -> str:
|
866 |
+
"""List all uploaded PDFs"""
|
867 |
+
pdfs = self.rag_system.pdf_processor.list_pdfs()
|
868 |
+
if not pdfs:
|
869 |
+
return "No PDFs uploaded yet."
|
870 |
+
return "\n".join([f"- {pdf}" for pdf in pdfs])
|
871 |
+
|
872 |
+
def upload_pdf(self, files) -> str:
|
873 |
+
"""Upload PDF files
|
874 |
+
|
875 |
+
Args:
|
876 |
+
files: File objects
|
877 |
+
|
878 |
+
Returns:
|
879 |
+
Status message
|
880 |
+
"""
|
881 |
try:
|
882 |
+
# Create directory if it doesn't exist
|
883 |
+
os.makedirs(self.pdf_dir, exist_ok=True)
|
884 |
|
885 |
+
# Copy files to pdf directory
|
886 |
for file in files:
|
887 |
+
shutil.copy(file.name, os.path.join(self.pdf_dir, os.path.basename(file.name)))
|
|
|
|
|
888 |
|
889 |
+
return f"Successfully uploaded {len(files)} file(s). Please process documents to make them searchable."
|
|
|
|
|
|
|
|
|
|
|
|
|
890 |
except Exception as e:
|
891 |
return f"Error uploading files: {str(e)}"
|
892 |
|
893 |
+
def process_documents(self) -> str:
|
894 |
+
"""Process documents and create vector database
|
895 |
|
896 |
Returns:
|
897 |
Status message
|
898 |
"""
|
|
|
|
|
|
|
899 |
try:
|
900 |
+
# Check if there are PDFs
|
901 |
+
pdf_files = self.rag_system.pdf_processor.list_pdfs()
|
902 |
+
if not pdf_files:
|
903 |
+
return "No PDF files uploaded. Please upload PDFs first."
|
904 |
+
|
905 |
+
# Process PDFs
|
906 |
start_time = time.time()
|
|
|
907 |
success = self.rag_system.process_documents()
|
908 |
+
process_time = time.time() - start_time
|
909 |
+
|
|
|
|
|
910 |
if success:
|
911 |
+
return f"Successfully processed {len(pdf_files)} PDF file(s) in {process_time:.2f} seconds. You can now ask questions."
|
912 |
else:
|
913 |
+
return "Failed to process documents. Check the logs for details."
|
914 |
except Exception as e:
|
|
|
915 |
return f"Error processing documents: {str(e)}"
|
916 |
|
917 |
+
def answer_query(self, query: str, stream_output: bool = True,
|
918 |
+
k: int = 4, temperature: float = 0.7,
|
919 |
+
enable_reflection: bool = True) -> str:
|
920 |
+
"""Answer a query using RAG
|
921 |
|
922 |
Args:
|
923 |
query: User query
|
924 |
+
stream_output: Whether to stream the output
|
925 |
k: Number of contexts to retrieve
|
926 |
+
temperature: Temperature for text generation
|
927 |
+
enable_reflection: Whether to use reflection to improve answers
|
928 |
|
929 |
Returns:
|
930 |
+
Answer text
|
931 |
"""
|
932 |
+
if not query or query.strip() == "":
|
933 |
+
return "Please enter a query."
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
934 |
|
935 |
+
# Check if system is initialized
|
936 |
+
if not self.rag_system.is_initialized:
|
937 |
+
return "Documents have not been processed yet. Please process documents first."
|
|
|
|
|
|
|
|
|
|
|
938 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
939 |
try:
|
940 |
+
# For streaming, we need to handle gradio uniqueness
|
941 |
+
if stream_output:
|
942 |
+
# We can't stream with reflection
|
943 |
+
return self.rag_system.answer_query(
|
944 |
+
query,
|
945 |
+
k=k,
|
946 |
+
max_tokens=1024,
|
947 |
+
temperature=temperature,
|
948 |
+
stream=True,
|
949 |
+
enable_reflection=False
|
950 |
+
)
|
951 |
+
else:
|
952 |
+
return self.rag_system.answer_query(
|
953 |
+
query,
|
954 |
+
k=k,
|
955 |
+
max_tokens=1024,
|
956 |
+
temperature=temperature,
|
957 |
+
stream=False,
|
958 |
+
enable_reflection=enable_reflection
|
959 |
+
)
|
960 |
except Exception as e:
|
961 |
+
print(f"Error in answer_query: {str(e)}")
|
962 |
+
print(traceback.format_exc())
|
963 |
+
return f"Error processing your query: {str(e)}"
|
964 |
+
|
965 |
+
def launch(self):
|
966 |
+
"""Launch Gradio UI"""
|
967 |
+
try:
|
968 |
+
with gr.Blocks(title="Document Q&A System") as self.interface:
|
969 |
+
gr.Markdown("# PDF Question Answering System")
|
970 |
+
gr.Markdown("Upload PDF documents and ask questions about their content.")
|
971 |
+
|
972 |
+
with gr.Tab("Upload & Process"):
|
973 |
+
with gr.Row():
|
974 |
+
with gr.Column():
|
975 |
+
upload_button = gr.File(
|
976 |
+
label="Upload PDF Files",
|
977 |
+
file_count="multiple",
|
978 |
+
file_types=[".pdf"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
979 |
)
|
980 |
+
upload_output = gr.Textbox(
|
981 |
+
label="Upload Status",
|
982 |
+
interactive=False
|
|
|
|
|
|
|
983 |
)
|
984 |
+
upload_btn = gr.Button("Upload Files")
|
985 |
+
|
986 |
+
with gr.Column():
|
987 |
+
pdf_list = gr.Textbox(
|
988 |
+
label="Uploaded PDFs",
|
989 |
+
value=self._list_uploaded_pdfs(),
|
990 |
+
interactive=False
|
991 |
)
|
992 |
+
refresh_btn = gr.Button("Refresh List")
|
993 |
+
|
994 |
+
process_btn = gr.Button("Process Documents")
|
995 |
+
process_output = gr.Textbox(
|
996 |
+
label="Processing Status",
|
997 |
+
interactive=False
|
998 |
+
)
|
999 |
+
|
1000 |
+
with gr.Tab("Ask Questions"):
|
1001 |
+
with gr.Row():
|
1002 |
+
with gr.Column():
|
1003 |
+
query_input = gr.Textbox(
|
1004 |
+
label="Enter your question",
|
1005 |
+
placeholder="What are the main findings of the report?",
|
1006 |
+
lines=2
|
1007 |
+
)
|
1008 |
+
|
1009 |
+
with gr.Row():
|
1010 |
+
k_slider = gr.Slider(
|
1011 |
+
minimum=1,
|
1012 |
+
maximum=10,
|
1013 |
+
value=4,
|
1014 |
+
step=1,
|
1015 |
+
label="Number of contexts to retrieve"
|
1016 |
+
)
|
1017 |
+
temp_slider = gr.Slider(
|
1018 |
+
minimum=0.1,
|
1019 |
+
maximum=1.0,
|
1020 |
+
value=0.7,
|
1021 |
+
step=0.1,
|
1022 |
+
label="Temperature"
|
1023 |
+
)
|
1024 |
+
|
1025 |
+
with gr.Row():
|
1026 |
+
stream_checkbox = gr.Checkbox(
|
1027 |
+
label="Stream output",
|
1028 |
+
value=True
|
1029 |
+
)
|
1030 |
+
reflection_checkbox = gr.Checkbox(
|
1031 |
+
label="Use self-reflection (disables streaming)",
|
1032 |
+
value=True
|
1033 |
+
)
|
1034 |
+
|
1035 |
+
query_btn = gr.Button("Submit Question")
|
1036 |
+
|
1037 |
+
answer_output = gr.Textbox(
|
1038 |
+
label="Answer",
|
1039 |
+
interactive=False,
|
1040 |
+
lines=15
|
1041 |
+
)
|
1042 |
+
|
1043 |
+
# Event handlers
|
1044 |
+
upload_btn.click(
|
1045 |
+
fn=self.upload_pdf,
|
1046 |
+
inputs=[upload_button],
|
1047 |
+
outputs=[upload_output]
|
1048 |
)
|
1049 |
+
|
1050 |
+
refresh_btn.click(
|
1051 |
+
fn=lambda: self._list_uploaded_pdfs(),
|
1052 |
+
inputs=[],
|
1053 |
+
outputs=[pdf_list]
|
1054 |
+
)
|
1055 |
+
|
1056 |
+
process_btn.click(
|
1057 |
+
fn=self.process_documents,
|
1058 |
+
inputs=[],
|
1059 |
+
outputs=[process_output]
|
1060 |
)
|
1061 |
+
|
1062 |
+
query_btn.click(
|
1063 |
+
fn=self.answer_query,
|
1064 |
+
inputs=[query_input, stream_checkbox, k_slider, temp_slider, reflection_checkbox],
|
1065 |
+
outputs=[answer_output]
|
1066 |
+
)
|
1067 |
+
|
1068 |
+
# Checkbox dependency
|
1069 |
+
def update_stream_state(reflection_enabled):
|
1070 |
+
return not reflection_enabled if reflection_enabled else gr.update()
|
1071 |
+
|
1072 |
+
reflection_checkbox.change(
|
1073 |
+
fn=update_stream_state,
|
1074 |
+
inputs=[reflection_checkbox],
|
1075 |
+
outputs=[stream_checkbox]
|
1076 |
+
)
|
1077 |
+
|
1078 |
+
# Launch UI
|
1079 |
+
self.interface.launch(share=True)
|
1080 |
+
|
1081 |
+
except Exception as e:
|
1082 |
+
print(f"Error launching UI: {str(e)}")
|
1083 |
+
print(traceback.format_exc())
|
1084 |
|
1085 |
# === MAIN APPLICATION ===
|
1086 |
def main():
|
1087 |
+
# Initialize components
|
1088 |
+
print("Initializing PDF processor...")
|
1089 |
+
pdf_processor = PDFProcessor()
|
1090 |
+
|
1091 |
+
print("Initializing vector database manager...")
|
1092 |
+
vector_db = VectorDBManager()
|
1093 |
+
|
1094 |
+
print("Initializing Mistral model...")
|
1095 |
+
model = MistralModel()
|
1096 |
+
|
1097 |
+
print("Initializing RAG system...")
|
1098 |
+
rag_system = RAGSystem(pdf_processor, vector_db, model)
|
1099 |
+
|
1100 |
+
print("Initializing UI...")
|
1101 |
+
ui = RAGUI(rag_system)
|
1102 |
+
|
1103 |
+
print("Launching UI...")
|
1104 |
+
ui.launch()
|
|
|
1105 |
|
1106 |
if __name__ == "__main__":
|
1107 |
main()
|
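
For readers following the retrieval changes in this commit, below is a minimal, runnable sketch of the idea introduced in hybrid_search and _rerank_with_cross_encoder: BM25 scores fused with a rank-based vector score, then cross-encoder re-ranking. The toy corpus, the assumed rank-derived vector score, and the candidate count are illustrative only; the rank_bm25 and sentence-transformers calls, the BM25 normalisation, and the alpha-weighted fusion mirror app.py above.

# Hedged sketch: hybrid BM25 + vector-rank fusion followed by cross-encoder
# re-ranking. corpus, vector_order, and the rank-based vector score are
# illustrative assumptions, not taken from app.py.
from rank_bm25 import BM25Okapi
from sentence_transformers import CrossEncoder

corpus = [
    "FAISS builds an index over dense embeddings for similarity search.",
    "BM25 ranks passages by term frequency and inverse document frequency.",
    "A cross-encoder scores a query and a passage jointly, which suits re-ranking.",
]
bm25 = BM25Okapi([doc.lower().split() for doc in corpus])

query = "how does cross-encoder re-ranking work"
bm25_scores = bm25.get_scores(query.lower().split())
max_bm25 = max(bm25_scores) if max(bm25_scores) > 0 else 1.0

# Assume a vector store returned these indices, best first (app.py gets this
# ordering from FAISS similarity_search instead).
vector_order = [2, 0, 1]
alpha = 0.7  # same weighting idea as hybrid_search(alpha=0.7)
combined = []
for rank, idx in enumerate(vector_order):
    vector_score = 1.0 - rank / len(vector_order)   # assumed rank-based score
    bm25_norm = bm25_scores[idx] / max_bm25         # normalisation as in app.py
    combined.append((idx, alpha * vector_score + (1 - alpha) * bm25_norm))
combined.sort(key=lambda x: x[1], reverse=True)
candidates = [corpus[idx] for idx, _ in combined[:2]]  # keep top fused candidates

# Re-rank the fused candidates with the same cross-encoder checkpoint app.py loads.
reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")
scores = reranker.predict([[query, passage] for passage in candidates])
best = max(zip(candidates, scores), key=lambda pair: pair[1])[0]
print(best)

The full implementation additionally batches the cross-encoder calls and falls back to the fused ordering when the model is unavailable, as shown in _rerank_with_cross_encoder above.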