Spaces:

Ronochieng
/

DocMindAI

Build error

App Files Files Community

DocMindAI / Ingestion /ingest.py

Adeptschneider

Feat: DocMindAI

18a68e7 17 days ago

raw

history blame

7.97 kB

	import os
	import pymupdf4llm
	import pandas as pd
	import tempfile
	from typing import Dict, Any, Optional, List

	# Import unstructured components for different file types
	from unstructured.partition.auto import partition
	from unstructured.partition.pdf import partition_pdf
	from unstructured.partition.docx import partition_docx
	from unstructured.partition.pptx import partition_pptx
	from unstructured.partition.xlsx import partition_xlsx
	from unstructured.partition.md import partition_md
	from unstructured.partition.html import partition_html
	from unstructured.partition.xml import partition_xml
	from unstructured.partition.email import partition_email
	from unstructured.partition.text import partition_text
	from unstructured.partition.epub import partition_epub

	def get_processor_for_file(file_path: str) -> Optional[callable]:
	    """
	    Determine the appropriate processor function for the given file type.

	    The lookup is keyed on the lower-cased file extension, so it is
	    case-insensitive. Any extension without a dedicated entry falls back to
	    process_generic, which means this function never actually returns None
	    (the Optional in the signature is kept for backward compatibility).

	    Args:
	        file_path: Path (or bare file name) of the document to process.

	    Returns:
	        A callable that takes the file path and returns the extracted text.
	    """
	    file_extension = os.path.splitext(file_path)[1].lower()

	    # Map file extensions to specific partition functions
	    processors = {
	        ".pdf": process_pdf,
	        ".docx": process_docx,
	        ".pptx": process_pptx,
	        ".xlsx": process_xlsx,
	        ".xls": process_xlsx,
	        ".md": process_markdown,
	        ".html": process_html,
	        ".htm": process_html,
	        ".xml": process_xml,
	        ".msg": process_email,
	        ".eml": process_email,
	        ".epub": process_epub,
	        ".txt": process_text,
	        ".csv": process_text,

	        # Legacy / markup formats: partition_docx and partition_pptx only read
	        # the zip-based OOXML containers, and the plain-text partitioner would
	        # return raw RTF control words. Let the auto-partitioner in
	        # process_generic pick the right handler from the detected file type.
	        ".doc": process_generic,
	        ".ppt": process_generic,
	        ".rtf": process_generic,

	        # Code files
	        ".py": process_text,
	        ".js": process_text,
	        ".java": process_text,
	        ".ts": process_text,
	        ".tsx": process_text,
	        ".jsx": process_text,
	        ".c": process_text,
	        ".cpp": process_text,
	        ".h": process_text,
	        ".cs": process_text,
	        ".rb": process_text,
	        ".go": process_text,
	        ".rs": process_text,
	        ".php": process_text,
	        ".sql": process_text,
	        ".css": process_text,
	    }

	    return processors.get(file_extension, process_generic)

	def process_document(file_path: str) -> Optional[str]:
	    """
	    Extract text from a document by dispatching on its file type.

	    The handler is looked up with get_processor_for_file and called with the
	    same path; its result is returned unchanged. None is returned only when
	    no handler is found.
	    """
	    handler = get_processor_for_file(file_path)
	    if not handler:
	        return None
	    return handler(file_path)

	def process_pdf(file_path: str) -> str:
	    """
	    Process PDF documents using unstructured.

	    The "hi_res" strategy (with image/table block extraction and table
	    structure inference) is tried first. If it raises for any reason, the
	    PDF is partitioned again with the "fast" strategy. Both attempts chunk
	    the output by title with the same size limits.

	    Args:
	        file_path: Path to the PDF file.

	    Returns:
	        The non-empty element texts joined by blank lines.

	    Raises:
	        Exception: Whatever the "fast" fallback raises if it fails as well.
	    """
	    # Chunking settings shared by the hi_res attempt and the fast fallback.
	    chunking_options = {
	        "chunking_strategy": "by_title",
	        "max_characters": 4000,
	        "new_after_n_chars": 3800,
	        "combine_text_under_n_chars": 2000,
	    }

	    # Extracted image/table blocks are written to a scratch directory. Only
	    # the text is returned, so the directory is removed as soon as
	    # partitioning finishes instead of being left behind on disk.
	    with tempfile.TemporaryDirectory() as temp_dir:
	        try:
	            # Try hi_res mode first with OCR capabilities
	            elements = partition_pdf(
	                filename=file_path,
	                strategy="hi_res",
	                extract_images_in_pdf=True,
	                extract_image_block_types=["Image", "Table"],
	                extract_image_block_to_payload=False,
	                extract_image_block_output_dir=temp_dir,
	                hi_res_model_name="yolox",
	                infer_table_structure=True,
	                **chunking_options,
	            )
	        except Exception:
	            # Fall back to fast mode if hi_res fails
	            elements = partition_pdf(
	                filename=file_path,
	                strategy="fast",
	                **chunking_options,
	            )

	    # Extract text from elements, skipping those without usable text
	    texts = [element.text for element in elements if hasattr(element, 'text') and element.text]
	    return "\n\n".join(texts)

	def process_docx(file_path: str) -> str:
	    """
	    Extract the text of a DOCX document with unstructured's partition_docx.

	    The document is chunked by title; chunks that have no ``text`` attribute
	    or an empty one are dropped, and the rest are joined by blank lines.
	    """
	    chunks = partition_docx(
	        filename=file_path,
	        chunking_strategy="by_title",
	        max_characters=4000,
	        new_after_n_chars=3800,
	        combine_text_under_n_chars=2000,
	    )

	    kept = []
	    for chunk in chunks:
	        if hasattr(chunk, 'text') and chunk.text:
	            kept.append(chunk.text)

	    return "\n\n".join(kept)

	def process_pptx(file_path: str) -> str:
	    """
	    Extract the text of a PPTX presentation with unstructured's partition_pptx.

	    Returns the non-empty element texts separated by blank lines.
	    """
	    slide_elements = partition_pptx(filename=file_path)

	    return "\n\n".join(
	        item.text
	        for item in slide_elements
	        if hasattr(item, 'text') and item.text
	    )

	def process_xlsx(file_path: str) -> str:
	    """
	    Extract the text of an Excel workbook with unstructured's partition_xlsx.

	    Returns the non-empty element texts separated by blank lines.
	    """
	    sheet_elements = partition_xlsx(filename=file_path)

	    pieces = []
	    for entry in sheet_elements:
	        # Skip anything that carries no text attribute or an empty one.
	        if not hasattr(entry, 'text') or not entry.text:
	            continue
	        pieces.append(entry.text)

	    return "\n\n".join(pieces)

	def process_markdown(file_path: str) -> str:
	    """
	    Extract the text of a Markdown file with unstructured's partition_md.

	    Returns the non-empty element texts separated by blank lines.
	    """
	    md_elements = partition_md(filename=file_path)
	    separator = "\n\n"

	    bodies = [node.text for node in md_elements if hasattr(node, 'text') and node.text]
	    return separator.join(bodies)

	def process_html(file_path: str) -> str:
	    """
	    Extract the text of an HTML file with unstructured's partition_html.

	    Returns the non-empty element texts separated by blank lines.
	    """
	    page_elements = partition_html(filename=file_path)

	    fragments = []
	    for fragment in page_elements:
	        if hasattr(fragment, 'text') and fragment.text:
	            fragments.append(fragment.text)

	    return "\n\n".join(fragments)

	def process_xml(file_path: str) -> str:
	    """
	    Extract the text of an XML file with unstructured's partition_xml.

	    Returns the non-empty element texts separated by blank lines.
	    """
	    xml_elements = partition_xml(filename=file_path)

	    return "\n\n".join(
	        [part.text for part in xml_elements if hasattr(part, 'text') and part.text]
	    )

	def process_email(file_path: str) -> str:
	    """
	    Extract the text of an email file with unstructured's partition_email.

	    Returns the non-empty element texts separated by blank lines.
	    """
	    message_elements = partition_email(filename=file_path)

	    lines = []
	    for section in message_elements:
	        # Only sections exposing a non-empty ``text`` contribute to the output.
	        if not hasattr(section, 'text'):
	            continue
	        if section.text:
	            lines.append(section.text)

	    return "\n\n".join(lines)

	def process_text(file_path: str) -> str:
	    """
	    Extract the text of a plain-text file with unstructured's partition_text.

	    The content is chunked by title; chunks that have no ``text`` attribute
	    or an empty one are dropped, and the rest are joined by blank lines.
	    """
	    chunked = partition_text(
	        filename=file_path,
	        chunking_strategy="by_title",
	        max_characters=4000,
	        new_after_n_chars=3800,
	        combine_text_under_n_chars=2000,
	    )

	    return "\n\n".join(
	        block.text for block in chunked if hasattr(block, 'text') and block.text
	    )

	def process_epub(file_path: str) -> str:
	    """
	    Extract the text of an EPUB book with unstructured's partition_epub.

	    Returns the non-empty element texts separated by blank lines.
	    """
	    book_elements = partition_epub(filename=file_path)

	    passages = []
	    for passage in book_elements:
	        if hasattr(passage, 'text') and passage.text:
	            passages.append(passage.text)

	    joined = "\n\n".join(passages)
	    return joined

	def process_generic(file_path: str) -> str:
	    """
	    Generic document processor using unstructured's auto partitioning.

	    If auto-partitioning (or the text extraction that follows it) raises,
	    the file is read directly as text instead: first as UTF-8, then as
	    Latin-1.

	    Args:
	        file_path: Path to the file to process.

	    Returns:
	        The non-empty element texts joined by blank lines, or the raw file
	        content when the plain-text fallback was used.

	    Raises:
	        Exception: If partitioning and both plain-text reads fail. The
	            message contains the partition error and the last read error,
	            and the last read error is attached as the cause.
	    """
	    try:
	        elements = partition(
	            filename=file_path,
	        )

	        texts = [element.text for element in elements if hasattr(element, 'text') and element.text]
	        return "\n\n".join(texts)
	    except Exception as e:
	        # Keep a reference: the name bound by ``except ... as`` is cleared
	        # when the handler exits.
	        partition_error = e

	    # Fall back to basic text processing if auto-partition fails. Latin-1
	    # can decode any byte sequence, so the second attempt only fails on
	    # I/O-level errors (for example a missing or unreadable file).
	    read_error = None
	    for encoding in ("utf-8", "latin-1"):
	        try:
	            with open(file_path, 'r', encoding=encoding) as f:
	                return f.read()
	        except Exception as e2:
	            read_error = e2

	    raise Exception(
	        f"Could not process file: {str(partition_error)} / {str(read_error)}"
	    ) from read_error