Spaces:

Ronochieng
/

DocMindAI

Build error

File size: 7,966 Bytes

18a68e7

import os
import pymupdf4llm
import pandas as pd
import tempfile
from typing import Dict, Any, Optional, List

# Import unstructured components for different file types
from unstructured.partition.auto import partition
from unstructured.partition.pdf import partition_pdf
from unstructured.partition.docx import partition_docx
from unstructured.partition.pptx import partition_pptx
from unstructured.partition.xlsx import partition_xlsx
from unstructured.partition.md import partition_md
from unstructured.partition.html import partition_html
from unstructured.partition.xml import partition_xml
from unstructured.partition.email import partition_email
from unstructured.partition.text import partition_text
from unstructured.partition.epub import partition_epub

def get_processor_for_file(file_path: str) -> Optional[callable]:
    """
    Determine the appropriate processor function for the given file type
    """
    file_extension = os.path.splitext(file_path)[1].lower()
    
    # Map file extensions to specific partition functions
    processors = {
        ".pdf": process_pdf,
        ".docx": process_docx,
        ".doc": process_docx,
        ".pptx": process_pptx,
        ".ppt": process_pptx,
        ".xlsx": process_xlsx,
        ".xls": process_xlsx,
        ".md": process_markdown,
        ".html": process_html,
        ".htm": process_html,
        ".xml": process_xml,
        ".msg": process_email,
        ".eml": process_email,
        ".epub": process_epub,
        ".txt": process_text,
        ".csv": process_text,
        ".rtf": process_text,
        
        # Code files
        ".py": process_text,
        ".js": process_text,
        ".java": process_text,
        ".ts": process_text,
        ".tsx": process_text,
        ".jsx": process_text,
        ".c": process_text,
        ".cpp": process_text,
        ".h": process_text,
        ".cs": process_text,
        ".rb": process_text,
        ".go": process_text,
        ".rs": process_text,
        ".php": process_text,
        ".sql": process_text,
        ".css": process_text,
    }
    
    return processors.get(file_extension, process_generic)

def process_document(file_path: str) -> Optional[str]:
    """
    Process a document using the appropriate processor based on file type
    """
    processor = get_processor_for_file(file_path)
    if processor:
        return processor(file_path)
    return None

def process_pdf(file_path: str) -> str:
    """
    Process PDF documents using unstructured
    """
    temp_dir = tempfile.mkdtemp()
    
    try:
        # Try hi_res mode first with OCR capabilities
        elements = partition_pdf(
            filename=file_path,
            strategy="hi_res",
            extract_images_in_pdf=True,
            extract_image_block_types=["Image", "Table"],
            extract_image_block_to_payload=False,
            extract_image_block_output_dir=temp_dir,
            hi_res_model_name="yolox",
            infer_table_structure=True,
            chunking_strategy="by_title",
            max_characters=4000,
            new_after_n_chars=3800,
            combine_text_under_n_chars=2000,
        )
    except Exception as e:
        # Fall back to fast mode if hi_res fails
        elements = partition_pdf(
            filename=file_path,
            strategy="fast",
            chunking_strategy="by_title",
            max_characters=4000,
            new_after_n_chars=3800,
            combine_text_under_n_chars=2000,
        )
    
    # Extract text from elements
    texts = [element.text for element in elements if hasattr(element, 'text') and element.text]
    combined_text = "\n\n".join(texts)
    
    return combined_text

def process_docx(file_path: str) -> str:
    """
    Process DOCX documents using unstructured
    """
    elements = partition_docx(
        filename=file_path,
        chunking_strategy="by_title",
        max_characters=4000,
        new_after_n_chars=3800,
        combine_text_under_n_chars=2000,
    )
    
    texts = [element.text for element in elements if hasattr(element, 'text') and element.text]
    combined_text = "\n\n".join(texts)
    
    return combined_text

def process_pptx(file_path: str) -> str:
    """
    Process PPTX documents using unstructured
    """
    elements = partition_pptx(
        filename=file_path,
    )
    
    texts = [element.text for element in elements if hasattr(element, 'text') and element.text]
    combined_text = "\n\n".join(texts)
    
    return combined_text

def process_xlsx(file_path: str) -> str:
    """
    Process XLSX documents using unstructured
    """
    elements = partition_xlsx(
        filename=file_path,
    )
    
    texts = [element.text for element in elements if hasattr(element, 'text') and element.text]
    combined_text = "\n\n".join(texts)
    
    return combined_text

def process_markdown(file_path: str) -> str:
    """
    Process Markdown documents using unstructured
    """
    elements = partition_md(
        filename=file_path,
    )
    
    texts = [element.text for element in elements if hasattr(element, 'text') and element.text]
    combined_text = "\n\n".join(texts)
    
    return combined_text

def process_html(file_path: str) -> str:
    """
    Process HTML documents using unstructured
    """
    elements = partition_html(
        filename=file_path,
    )
    
    texts = [element.text for element in elements if hasattr(element, 'text') and element.text]
    combined_text = "\n\n".join(texts)
    
    return combined_text

def process_xml(file_path: str) -> str:
    """
    Process XML documents using unstructured
    """
    elements = partition_xml(
        filename=file_path,
    )
    
    texts = [element.text for element in elements if hasattr(element, 'text') and element.text]
    combined_text = "\n\n".join(texts)
    
    return combined_text

def process_email(file_path: str) -> str:
    """
    Process email documents using unstructured
    """
    elements = partition_email(
        filename=file_path,
    )
    
    texts = [element.text for element in elements if hasattr(element, 'text') and element.text]
    combined_text = "\n\n".join(texts)
    
    return combined_text

def process_text(file_path: str) -> str:
    """
    Process text documents using unstructured
    """
    elements = partition_text(
        filename=file_path,
        chunking_strategy="by_title",
        max_characters=4000,
        new_after_n_chars=3800,
        combine_text_under_n_chars=2000,
    )
    
    texts = [element.text for element in elements if hasattr(element, 'text') and element.text]
    combined_text = "\n\n".join(texts)
    
    return combined_text

def process_epub(file_path: str) -> str:
    """
    Process EPUB documents using unstructured
    """
    elements = partition_epub(
        filename=file_path,
    )
    
    texts = [element.text for element in elements if hasattr(element, 'text') and element.text]
    combined_text = "\n\n".join(texts)
    
    return combined_text

def process_generic(file_path: str) -> str:
    """
    Generic document processor using unstructured's auto partitioning
    """
    try:
        elements = partition(
            filename=file_path,
        )
        
        texts = [element.text for element in elements if hasattr(element, 'text') and element.text]
        combined_text = "\n\n".join(texts)
        
        return combined_text
    except Exception as e:
        # Fall back to basic text processing if auto-partition fails
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                return f.read()
        except Exception:
            # Try with a different encoding if utf-8 fails
            try:
                with open(file_path, 'r', encoding='latin-1') as f:
                    return f.read()
            except Exception as e2:
                raise Exception(f"Could not process file: {str(e)} / {str(e2)}")