Spaces:

chaaim123
/

finalproj01

Running

App Files Files Community

finalproj01 / retriever /document_manager.py

chaaim123

Create retriever/document_manager.py

f1a2317 verified 8 days ago

raw

history blame contribute delete

4.7 kB

	import logging
	import os
	from typing import Any, Dict, List
	import uuid
	from data.document_loader import DocumentLoader
	from data.pdf_reader import PDFReader
	from retriever.chunk_documents import chunk_documents
	from retriever.vector_store_manager import VectorStoreManager

	class DocumentManager:
	def __init__(self):
	self.doc_loader = DocumentLoader()
	self.pdf_reader = PDFReader()
	self.vector_manager = VectorStoreManager()
	self.uploaded_documents = {}
	self.chunked_documents = {}
	self.document_ids = {}
	logging.info("DocumentManager initialized")

	def process_document(self, file):
	"""
	Process an uploaded file: load, read PDF, chunk, and store in vector store.
	Returns: (status_message, page_list, filename, doc_id)
	"""
	try:
	if file is None:
	return "No file uploaded", None, None

	logging.info(f"Processing file: {file}")

	# Load and validate file
	file_path = self.doc_loader.load_file(file)
	filename = os.path.basename(file_path)

	# Read PDF content
	page_list = self.pdf_reader.read_pdf(file_path)

	# Store the uploaded document
	self.uploaded_documents[filename] = file_path

	# Generate a unique document ID
	doc_id = str(uuid.uuid4())
	self.document_ids[filename] = doc_id

	# Chunk the pages
	chunks = chunk_documents(page_list, doc_id, chunk_size=2000, chunk_overlap=300)
	self.chunked_documents[filename] = chunks

	# Add chunks to vector store
	self.vector_manager.add_documents(chunks)

	return (
	f"Successfully loaded {filename} with {len(page_list)} pages",
	filename,
	doc_id
	)

	except Exception as e:
	logging.error(f"Error processing document: {str(e)}")
	return f"Error: {str(e)}", [], None, None

	def get_uploaded_documents(self):
	"""Return the list of uploaded document filenames."""
	return list(self.uploaded_documents.keys())

	def get_chunks(self, filename):
	"""Return chunks for a given filename."""
	return self.chunked_documents.get(filename, [])

	def get_document_id(self, filename):
	"""Return the document ID for a given filename."""
	return self.document_ids.get(filename, None)

	def retrieve_top_k(self, query: str, selected_docs: List[str], k: int = 5) -> List[Dict[str, Any]]:
	"""
	Retrieve the top K chunks across the selected documents based on the user's query.

	Args:
	query (str): The user's query.
	selected_docs (List[str]): List of selected document filenames from the dropdown.
	k (int): Number of top results to return (default is 5).

	Returns:
	List[Dict[str, Any]]: List of top K chunks with their text, metadata, and scores.
	"""
	if not selected_docs:
	logging.warning("No documents selected for retrieval")
	return []

	all_results = []
	for filename in selected_docs:
	doc_id = self.get_document_id(filename)
	if not doc_id:
	logging.warning(f"No document ID found for filename: {filename}")
	continue

	# Search for relevant chunks within this document
	results = self.vector_manager.search(query, doc_id, k=k)
	all_results.extend(results)

	# Sort all results by score in descending order and take the top K
	all_results.sort(key=lambda x: x['score'], reverse=True)
	top_k_results = all_results[:k]

	# Log the list of retrieved documents
	#logging.info(f"Result from search :{all_results} ")
	logging.info(f"Retrieved top {k} documents:")
	for i, result in enumerate(top_k_results, 1):
	doc_id = result['metadata'].get('doc_id', 'Unknown')
	filename = next((name for name, d_id in self.document_ids.items() if d_id == doc_id), 'Unknown')
	logging.info(f"{i}. Filename: {filename}, Doc ID: {doc_id}, Score: {result['score']:.4f}, Text: {result['text'][:200]}...")

	return top_k_results

	def retrieve_summary_chunks(self, query: str, doc_id : str, k: int = 10):
	logging.info(f"Retrieving {k} chunks for summary: {query}, Document Id: {doc_id}")
	results = self.vector_manager.search(query, doc_id, k=k)
	top_k_results = results[:k]
	logging.info(f"Retrieved {len(top_k_results)} chunks for summary")

	return top_k_results