import json
import time
from pathlib import Path

import chromadb
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.chunking import HierarchicalChunker
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import (
    DocumentConverter,
    PdfFormatOption,
    WordFormatOption,
)
from docling.pipeline.simple_pipeline import SimplePipeline
from docling_core.types.doc import DoclingDocument
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings


class DocumentProcessor:
    def __init__(self):
        """Initialize the converter, embedding model, and vector store client."""
        self.setup_document_converter()
        self.embed_model = FastEmbedEmbeddings()
        self.client = chromadb.PersistentClient(path="chroma_db")

    def setup_document_converter(self):
        """Configure a document converter that supports multiple input formats."""
        pipeline_options = PdfPipelineOptions()
        pipeline_options.do_ocr = False
        pipeline_options.do_table_structure = True

        self.converter = DocumentConverter(
            allowed_formats=[
                InputFormat.PDF,
                InputFormat.IMAGE,
                InputFormat.DOCX,
                InputFormat.HTML,
                InputFormat.PPTX,
                InputFormat.TXT,  # Added text format
                InputFormat.CSV,  # Added CSV format
            ],
            format_options={
                InputFormat.PDF: PdfFormatOption(
                    pipeline_options=pipeline_options,
                    # Docling expects the backend class here, not an instance
                    backend=PyPdfiumDocumentBackend,
                ),
                InputFormat.DOCX: WordFormatOption(pipeline_cls=SimplePipeline),
            },
        )

    def process_document(self, file_path: str):
        """Process a document and create a searchable index with metadata."""
        print(f"Processing document: {file_path}")
        start_time = time.time()

        try:
            conv_result = self.converter.convert(file_path)
            doc: DoclingDocument = conv_result.document
        except Exception as e:
            print(f"Conversion failed: {e}")
            return None

        # Save the parsed document as Markdown
        output_dir = Path("parsed-doc")
        output_dir.mkdir(parents=True, exist_ok=True)
        doc_filename = Path(file_path).stem
        md_filename = output_dir / f"{doc_filename}.md"
        doc.save_as_markdown(md_filename)

        chunker = HierarchicalChunker()
        chunks = list(chunker.chunk(doc))

        processed_chunks = []
        for chunk in chunks:
            # HierarchicalChunker attaches headings and doc items on chunk.meta
            metadata = {
                "text": chunk.text.strip(),
                "headings": chunk.meta.headings or [],
                "content_type": str(chunk.meta.doc_items[0].label)
                if chunk.meta.doc_items
                else "Unknown",
            }
            processed_chunks.append(metadata)

        print("Chunking completed. Creating vector database...")
collection = self.client.get_or_create_collection(name="document_chunks")
documents, embeddings, metadata_list, ids = [], [], [], []
for idx, chunk in enumerate(processed_chunks):
text = chunk.get('text', '').strip()
if not text:
continue
embedding = self.embed_model.embed_documents([text])[0]
documents.append(text)
embeddings.append(embedding)
metadata_list.append({
"headings": json.dumps(chunk.get('headings', [])),
"content_type": chunk.get('content_type', None)
})
ids.append(str(idx))

        if documents:
            collection.add(
                ids=ids,
                embeddings=embeddings,
                documents=documents,
                metadatas=metadata_list,
            )
            print(f"Successfully added {len(documents)} chunks to the database.")

        print(f"Document processing completed in {time.time() - start_time:.2f} seconds")
        return collection
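

# Minimal usage sketch (illustrative, not part of the original module): builds the
# index for one file, then runs a semantic query against the returned collection.
# The input path "sample.pdf" and the question text are placeholder values.
if __name__ == "__main__":
    processor = DocumentProcessor()
    collection = processor.process_document("sample.pdf")  # hypothetical input file
    if collection is not None:
        question = "What is this document about?"
        # Embed the query with the same model used for indexing
        query_embedding = processor.embed_model.embed_query(question)
        results = collection.query(query_embeddings=[query_embedding], n_results=3)
        for text, meta in zip(results["documents"][0], results["metadatas"][0]):
            print(f"[{meta.get('content_type')}] {text[:120]}")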