Spaces:

MrSimple01
/

1_full_demo_webinarium

Sleeping

1_full_demo_webinarium / src /documentProcessing.py

Upload 7 files

b0ba8c2 verified 24 days ago

1.32 kB

	import os
	import fitz
	import docx
	import tempfile
	from typing import Tuple, Optional

	def process_pdf_file(pdf_file) -> Tuple[Optional[str], str]:
	    """Extract the text of a PDF and save it to a temporary ``.txt`` file.

	    Args:
	        pdf_file: Value handed unchanged to ``fitz.open`` (presumably a
	            filesystem path to the PDF; confirm against the caller).

	    Returns:
	        ``(text_path, text_content)`` on success, where ``text_path`` is the
	        path of a newly created UTF-8 text file holding the extracted text.
	        ``(None, error_message)`` if anything goes wrong; no exception is
	        propagated to the caller.
	    """
	    try:
	        doc = fitz.open(pdf_file)
	        try:
	            # Join once instead of growing a string page by page.
	            text_content = "".join(page.get_text() for page in doc)
	        finally:
	            # Always release the document handle, even if extraction fails.
	            doc.close()

	        # NamedTemporaryFile creates the file atomically, unlike the
	        # deprecated, race-prone tempfile.mktemp. delete=False keeps the file
	        # on disk so the caller can use the returned path. It is created only
	        # after extraction succeeded, so a bad PDF leaves no stray file.
	        with tempfile.NamedTemporaryFile(
	            mode='w', suffix='.txt', encoding='utf-8', delete=False
	        ) as text_file:
	            text_file.write(text_content)
	            text_path = text_file.name

	        return text_path, text_content

	    except Exception as e:
	        # Boundary handler: callers expect a (None, message) tuple on failure.
	        return None, f"Error processing PDF file: {e}"

	def process_docx_file(docx_file) -> Tuple[Optional[str], str]:
	    """Extract paragraph text from a Word document into a temporary ``.txt`` file.

	    Args:
	        docx_file: Value handed unchanged to ``docx.Document`` (presumably a
	            path or file-like object for a .docx file; confirm against the
	            caller).

	    Returns:
	        ``(text_path, text_content)`` on success, where ``text_content`` is
	        every paragraph's text followed by a newline and ``text_path`` is the
	        path of a newly created UTF-8 text file holding that content.
	        ``(None, error_message)`` if anything goes wrong; no exception is
	        propagated to the caller.
	    """
	    try:
	        doc = docx.Document(docx_file)
	        # Each paragraph is terminated by "\n"; join once instead of using +=.
	        text_content = "".join(f"{para.text}\n" for para in doc.paragraphs)

	        # NamedTemporaryFile creates the file atomically, unlike the
	        # deprecated, race-prone tempfile.mktemp. delete=False keeps the file
	        # on disk so the caller can use the returned path. It is created only
	        # after parsing succeeded, so a bad document leaves no stray file.
	        with tempfile.NamedTemporaryFile(
	            mode='w', suffix='.txt', encoding='utf-8', delete=False
	        ) as text_file:
	            text_file.write(text_content)
	            text_path = text_file.name

	        return text_path, text_content

	    except Exception as e:
	        # Boundary handler: callers expect a (None, message) tuple on failure.
	        return None, f"Error processing Word document: {e}"