Update utils/ingestion.py
utils/ingestion.py  +45 -111  CHANGED
Before this commit (lines removed by the change are prefixed with -; a few removed lines are truncated in the diff view and are shown with ...):

@@ -7,150 +7,85 @@ import chromadb
 
 from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
 from docling.datamodel.base_models import InputFormat
-from docling.datamodel.pipeline_options import (
-    AcceleratorDevice,
-    AcceleratorOptions,
-    PdfPipelineOptions,
-    TableFormerMode,
 )
-from docling.document_converter import DocumentConverter, PdfFormatOption
-from ...
 from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
 
-from docx import Document  # DOCX support
-from pptx import Presentation  # PPTX support
-from bs4 import BeautifulSoup  # HTML support
-
-
 class DocumentProcessor:
     def __init__(self):
-        """Initialize document processor with ...
         self.setup_document_converter()
         self.embed_model = FastEmbedEmbeddings()
-        self.client = chromadb.PersistentClient(path="chroma_db")
 
     def setup_document_converter(self):
-        """Configure document converter ...
         pipeline_options = PdfPipelineOptions()
         pipeline_options.do_ocr = True
         pipeline_options.do_table_structure = True
-        pipeline_options.table_structure_options.do_cell_matching = True
-        pipeline_options.ocr_options.lang = ["en"]
-        pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE
-
-        try:
-            pipeline_options.accelerator_options = AcceleratorOptions(
-                num_threads=8, device=AcceleratorDevice.MPS
-            )
-        except Exception:
-            print("MPS is not available. Falling back to CPU.")
-            pipeline_options.accelerator_options = AcceleratorOptions(
-                num_threads=8, device=AcceleratorDevice.CPU
-            )
 
         self.converter = DocumentConverter(
             format_options={
                 InputFormat.PDF: PdfFormatOption(
                     pipeline_options=pipeline_options,
-                    backend=PyPdfiumDocumentBackend
-                )
-            }
         )
 
-    def extract_chunk_metadata(self, chunk) -> Dict[str, Any]:
-        """Extract essential metadata from a chunk"""
-        metadata = {
-            "text": chunk.text.strip(),
-            "headings": [],
-            "page_info": None,
-            "content_type": None
-        }
-
-        if hasattr(chunk, 'meta'):
-            if hasattr(chunk.meta, 'headings') and chunk.meta.headings:
-                metadata["headings"] = chunk.meta.headings
-
-            if hasattr(chunk.meta, 'doc_items'):
-                for item in chunk.meta.doc_items:
-                    if hasattr(item, 'label'):
-                        metadata["content_type"] = str(item.label)
-
-                    if hasattr(item, 'prov') and item.prov:
-                        for prov in item.prov:
-                            if hasattr(prov, 'page_no'):
-                                metadata["page_info"] = prov.page_no
-
-        return metadata
-
-    def extract_text_from_docx(self, docx_path: str) -> List[str]:
-        """Extract text from a DOCX file"""
-        doc = Document(docx_path)
-        return [para.text.strip() for para in doc.paragraphs if para.text.strip()]
-
-    def extract_text_from_pptx(self, pptx_path: str) -> List[str]:
-        """Extract text from a PPTX file"""
-        ppt = Presentation(pptx_path)
-        slides_text = []
-        for slide in ppt.slides:
-            text = " ".join([shape.text for shape in slide.shapes if hasattr(shape, "text")])
-            if text.strip():
-                slides_text.append(text.strip())
-        return slides_text
-
-    def extract_text_from_html(self, html_path: str) -> List[str]:
-        """Extract text from an HTML file"""
-        with open(html_path, "r", encoding="utf-8") as file:
-            soup = BeautifulSoup(file, "html.parser")
-        return [text.strip() for text in soup.stripped_strings if text.strip()]
-
     def process_document(self, file_path: str):
         """Process document and create searchable index with metadata"""
         print(f"Processing document: {file_path}")
         start_time = time.time()
         file_ext = Path(file_path).suffix.lower()
 
-        ...
-            doc = ...
-        ...
-            processed_chunks = []
-            for chunk in chunks:
-                metadata = self.extract_chunk_metadata(chunk)
-                processed_chunks.append(metadata)
-
-        elif file_ext == ".docx":
-            texts = self.extract_text_from_docx(file_path)
-            processed_chunks = [{"text": text, "headings": [], "content_type": "DOCX"} for text in texts]
-
-        elif file_ext == ".pptx":
-            texts = self.extract_text_from_pptx(file_path)
-            processed_chunks = [{"text": text, "headings": [], "content_type": "PPTX"} for text in texts]
 
-        elif file_ext == ".html":
-            texts = self.extract_text_from_html(file_path)
-            processed_chunks = [{"text": text, "headings": [], "content_type": "HTML"} for text in texts]
 
-        ...
 
         print("Chunking completed. Creating vector database...")
         collection = self.client.get_or_create_collection(name="document_chunks")
 
-        documents = []
-        embeddings = []
-        metadata_list = []
-        ids = []
-
         for idx, chunk in enumerate(processed_chunks):
             text = chunk.get('text', '').strip()
             if not text:
-                continue  # Skip empty chunks
 
-            embedding = self.embed_model.embed_documents([text])[0]
             documents.append(text)
             embeddings.append(embedding)
             metadata_list.append({

@@ -168,6 +103,5 @@ class DocumentProcessor:
         )
         print(f"Successfully added {len(documents)} chunks to the database.")
 
-        processing_time = time.time() - start_time
-        print(f"Document processing completed in {processing_time:.2f} seconds")
         return collection
After this commit (lines added by the change are prefixed with +):

@@ -7,150 +7,85 @@ import chromadb
 
 from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
 from docling.datamodel.base_models import InputFormat
+from docling.datamodel.pipeline_options import PdfPipelineOptions
+from docling.document_converter import (
+    DocumentConverter,
+    PdfFormatOption,
+    WordFormatOption,
 )
+from docling.pipeline.simple_pipeline import SimplePipeline
+from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
+from docling.document import DoclingDocument
+from docling.chunking.hierarchical_chunker import HierarchicalChunker
 from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
 
 class DocumentProcessor:
     def __init__(self):
+        """Initialize document processor with Docling v2 changes"""
         self.setup_document_converter()
         self.embed_model = FastEmbedEmbeddings()
+        self.client = chromadb.PersistentClient(path="chroma_db")
 
     def setup_document_converter(self):
+        """Configure document converter to support multiple formats"""
         pipeline_options = PdfPipelineOptions()
         pipeline_options.do_ocr = True
         pipeline_options.do_table_structure = True
 
         self.converter = DocumentConverter(
+            allowed_formats=[
+                InputFormat.PDF,
+                InputFormat.IMAGE,
+                InputFormat.DOCX,
+                InputFormat.HTML,
+                InputFormat.PPTX,
+            ],
             format_options={
                 InputFormat.PDF: PdfFormatOption(
                     pipeline_options=pipeline_options,
+                    backend=PyPdfiumDocumentBackend()
+                ),
+                InputFormat.DOCX: WordFormatOption(
+                    pipeline_cls=SimplePipeline
+                ),
+            },
         )
 
     def process_document(self, file_path: str):
         """Process document and create searchable index with metadata"""
         print(f"Processing document: {file_path}")
         start_time = time.time()
         file_ext = Path(file_path).suffix.lower()
 
+        try:
+            conv_result = self.converter.convert(file_path)
+            doc: DoclingDocument = conv_result.document
+        except Exception as e:
+            print(f"Conversion failed: {e}")
+            return None
 
+        chunker = HierarchicalChunker()
+        chunks = list(chunker.chunk(doc))
 
+        processed_chunks = []
+        for chunk in chunks:
+            metadata = {
+                "text": chunk.text.strip(),
+                "headings": [item.text for item in chunk.doc_items if hasattr(item, "text")],
+                "content_type": chunk.doc_items[0].label if chunk.doc_items else "Unknown",
+            }
+            processed_chunks.append(metadata)
 
         print("Chunking completed. Creating vector database...")
         collection = self.client.get_or_create_collection(name="document_chunks")
 
+        documents, embeddings, metadata_list, ids = [], [], [], []
         for idx, chunk in enumerate(processed_chunks):
             text = chunk.get('text', '').strip()
             if not text:
+                continue
 
+            embedding = self.embed_model.embed_documents([text])[0]
             documents.append(text)
             embeddings.append(embedding)
             metadata_list.append({

@@ -168,6 +103,5 @@ class DocumentProcessor:
         )
         print(f"Successfully added {len(documents)} chunks to the database.")
 
+        print(f"Document processing completed in {time.time() - start_time:.2f} seconds")
         return collection
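
For reference, a minimal usage sketch of the updated class. This is not part of the commit: the import path utils.ingestion, the sample.pdf filename, and the follow-up query are assumptions for illustration only.

# Hypothetical usage sketch; "sample.pdf" and the import path are assumptions.
from utils.ingestion import DocumentProcessor

processor = DocumentProcessor()                         # sets up the Docling converter, FastEmbed, and the Chroma client
collection = processor.process_document("sample.pdf")   # convert, chunk, embed, and index the document

if collection is not None:
    # Query the populated Chroma collection with an embedded question.
    query_embedding = processor.embed_model.embed_query("What is this document about?")
    results = collection.query(query_embeddings=[query_embedding], n_results=3)
    for text in results["documents"][0]:
        print(text[:120])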