Spaces:
Sleeping
Sleeping
File size: 1,319 Bytes
b0ba8c2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 |
import os
import fitz
import docx
import tempfile
from typing import Tuple, Optional
def process_pdf_file(pdf_file) -> Tuple[Optional[str], str]:
    """Extract the text of a PDF and save it to a temporary ``.txt`` file.

    Args:
        pdf_file: Whatever ``fitz.open`` accepts as its first argument
            (presumably a filesystem path -- confirm against callers).

    Returns:
        ``(text_path, text_content)`` on success, where ``text_path`` is the
        path of a newly created UTF-8 text file holding ``text_content``.
        The file is not deleted by this function.
        ``(None, error_message)`` if any step raises an ``Exception``.
    """
    try:
        # The context manager closes the document even if a page fails to
        # extract, so the underlying file handle is not leaked.
        with fitz.open(pdf_file) as doc:
            text_content = "".join(page.get_text() for page in doc)

        # NamedTemporaryFile creates the file atomically; tempfile.mktemp
        # only returns a name, which another process could claim before we
        # open it. delete=False keeps the file for the caller to read.
        with tempfile.NamedTemporaryFile(
            mode='w', suffix='.txt', encoding='utf-8', delete=False
        ) as text_file:
            text_file.write(text_content)
            text_path = text_file.name

        return text_path, text_content
    except Exception as e:
        # Callers receive failures as a (None, message) pair rather than an
        # exception, so every error is converted here.
        error_message = f"Error processing PDF file: {e}"
        return None, error_message
def process_docx_file(docx_file) -> Tuple[Optional[str], str]:
    """Extract the paragraph text of a Word document into a temporary ``.txt`` file.

    Only the entries of ``Document.paragraphs`` are read; each paragraph's
    text is followed by a newline.

    Args:
        docx_file: Whatever ``docx.Document`` accepts as its argument
            (presumably a path or file-like object -- confirm against callers).

    Returns:
        ``(text_path, text_content)`` on success, where ``text_path`` is the
        path of a newly created UTF-8 text file holding ``text_content``.
        The file is not deleted by this function.
        ``(None, error_message)`` if any step raises an ``Exception``.
    """
    try:
        document = docx.Document(docx_file)
        text_content = "".join(f"{para.text}\n" for para in document.paragraphs)

        # NamedTemporaryFile creates the file atomically; tempfile.mktemp
        # only returns a name, which another process could claim before we
        # open it. delete=False keeps the file for the caller to read.
        with tempfile.NamedTemporaryFile(
            mode='w', suffix='.txt', encoding='utf-8', delete=False
        ) as text_file:
            text_file.write(text_content)
            text_path = text_file.name

        return text_path, text_content
    except Exception as e:
        # Callers receive failures as a (None, message) pair rather than an
        # exception, so every error is converted here.
        error_message = f"Error processing Word document: {e}"
        return None, error_message