1_full_demo_webinarium / src /documentProcessing.py
MrSimple01's picture
Upload 7 files
b0ba8c2 verified
raw
history blame contribute delete
1.32 kB
import os
import fitz
import docx
import tempfile
from typing import Tuple, Optional
def process_pdf_file(pdf_file) -> Tuple[Optional[str], str]:
    """Extract the text of a PDF and save it to a temporary ``.txt`` file.

    Args:
        pdf_file: The PDF to read; passed straight to ``fitz.open``.

    Returns:
        ``(text_path, text_content)`` on success, where ``text_path`` is the
        UTF-8 text file that was written and ``text_content`` is the
        concatenated text of every page.
        ``(None, error_message)`` if anything goes wrong; no exception is
        propagated to the caller.
    """
    text_path = None
    try:
        doc = fitz.open(pdf_file)
        try:
            text_content = "".join(page.get_text() for page in doc)
        finally:
            # Release the underlying file handle even if extraction fails.
            doc.close()

        # NamedTemporaryFile creates the file atomically, unlike the
        # deprecated, race-prone tempfile.mktemp(). delete=False keeps the
        # file on disk so the returned path stays usable by the caller.
        with tempfile.NamedTemporaryFile(
            mode='w', suffix='.txt', encoding='utf-8', delete=False
        ) as text_file:
            text_path = text_file.name
            text_file.write(text_content)
        return text_path, text_content
    except Exception as e:
        # Do not leave a partially written temp file behind on failure.
        if text_path is not None:
            try:
                os.remove(text_path)
            except OSError:
                pass
        error_message = f"Error processing PDF file: {str(e)}"
        return None, error_message
def process_docx_file(docx_file) -> Tuple[Optional[str], str]:
    """Extract the text of a Word document and save it to a temporary ``.txt`` file.

    Args:
        docx_file: The document to read; passed straight to ``docx.Document``.

    Returns:
        ``(text_path, text_content)`` on success, where ``text_path`` is the
        UTF-8 text file that was written and ``text_content`` is the text of
        every paragraph, each followed by a newline.
        ``(None, error_message)`` if anything goes wrong; no exception is
        propagated to the caller.
    """
    text_path = None
    try:
        document = docx.Document(docx_file)
        text_content = "".join(para.text + "\n" for para in document.paragraphs)

        # NamedTemporaryFile creates the file atomically, unlike the
        # deprecated, race-prone tempfile.mktemp(). delete=False keeps the
        # file on disk so the returned path stays usable by the caller.
        with tempfile.NamedTemporaryFile(
            mode='w', suffix='.txt', encoding='utf-8', delete=False
        ) as text_file:
            text_path = text_file.name
            text_file.write(text_content)
        return text_path, text_content
    except Exception as e:
        # Do not leave a partially written temp file behind on failure.
        if text_path is not None:
            try:
                os.remove(text_path)
            except OSError:
                pass
        error_message = f"Error processing Word document: {str(e)}"
        return None, error_message