Spaces:
Sleeping
Sleeping
import os | |
import fitz | |
import docx | |
import tempfile | |
from typing import Tuple, Optional | |
def process_pdf_file(pdf_file) -> Tuple[Optional[str], str]:
    """Extract the text of a PDF and save it to a temporary ``.txt`` file.

    Args:
        pdf_file: Whatever ``fitz.open`` accepts as its first argument
            (presumably a filesystem path -- confirm against the caller).

    Returns:
        On success, ``(text_path, text_content)``: the path of the newly
        written UTF-8 text file and the extracted text itself.
        On failure, ``(None, error_message)``; no exception propagates.
    """
    try:
        doc = fitz.open(pdf_file)
        try:
            # Concatenate the text of every page, in page order.
            text_content = "".join(
                doc.load_page(page_num).get_text()
                for page_num in range(len(doc))
            )
        finally:
            # Release the document handle even if a page fails to extract.
            doc.close()

        # NamedTemporaryFile creates the file securely in one step, unlike
        # the deprecated tempfile.mktemp, which only returns a name and is
        # open to a race before the file is created. delete=False keeps
        # the file on disk for the caller after the handle is closed.
        with tempfile.NamedTemporaryFile(
            mode='w', suffix='.txt', encoding='utf-8', delete=False
        ) as text_file:
            text_file.write(text_content)
            text_path = text_file.name
        return text_path, text_content
    except Exception as e:
        # Deliberate catch-all: callers receive failures as a
        # (None, message) pair instead of an exception.
        error_message = f"Error processing PDF file: {str(e)}"
        return None, error_message
def process_docx_file(docx_file) -> Tuple[Optional[str], str]:
    """Extract the paragraph text of a Word document into a temp ``.txt`` file.

    Only ``doc.paragraphs`` is read; each paragraph's text is followed by
    a newline.

    Args:
        docx_file: Whatever ``docx.Document`` accepts as its argument
            (presumably a path or file-like object -- confirm against
            the caller).

    Returns:
        On success, ``(text_path, text_content)``: the path of the newly
        written UTF-8 text file and the extracted text itself.
        On failure, ``(None, error_message)``; no exception propagates.
    """
    try:
        doc = docx.Document(docx_file)
        # One line per paragraph, each terminated by "\n".
        text_content = "".join(f"{para.text}\n" for para in doc.paragraphs)

        # NamedTemporaryFile creates the file securely in one step, unlike
        # the deprecated tempfile.mktemp, which only returns a name and is
        # open to a race before the file is created. delete=False keeps
        # the file on disk for the caller after the handle is closed.
        with tempfile.NamedTemporaryFile(
            mode='w', suffix='.txt', encoding='utf-8', delete=False
        ) as text_file:
            text_file.write(text_content)
            text_path = text_file.name
        return text_path, text_content
    except Exception as e:
        # Deliberate catch-all: callers receive failures as a
        # (None, message) pair instead of an exception.
        error_message = f"Error processing Word document: {str(e)}"
        return None, error_message