File size: 1,319 Bytes
b0ba8c2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
import os
import fitz 
import docx
import tempfile
from typing import Tuple, Optional

def process_pdf_file(pdf_file) -> Tuple[Optional[str], str]:
    """Extract the text of a PDF and save it to a temporary ``.txt`` file.

    Args:
        pdf_file: Value passed straight to ``fitz.open``.

    Returns:
        On success, ``(text_path, text_content)``: the path of the newly
        created UTF-8 text file and the extracted text itself. The file is
        not deleted automatically; the caller is responsible for removing it.
        On failure, ``(None, error_message)``.
    """
    try:
        # Extract first so a failed parse never leaves a stray temp file,
        # and use the document as a context manager so it is always closed.
        with fitz.open(pdf_file) as doc:
            text_content = "".join(page.get_text() for page in doc)

        # NamedTemporaryFile creates the file atomically, unlike the
        # deprecated, race-prone tempfile.mktemp which only reserves a name.
        with tempfile.NamedTemporaryFile(
            mode='w', suffix='.txt', encoding='utf-8', delete=False
        ) as text_file:
            text_file.write(text_content)
            text_path = text_file.name

        return text_path, text_content

    except Exception as e:
        # Errors are reported through the return value rather than raised;
        # callers detect failure by checking for a None path.
        error_message = f"Error processing PDF file: {str(e)}"
        return None, error_message

def process_docx_file(docx_file) -> Tuple[Optional[str], str]:
    """Extract the paragraph text of a Word document into a temporary ``.txt`` file.

    Each paragraph's text is followed by a newline in the output.

    Args:
        docx_file: Value passed straight to ``docx.Document``.

    Returns:
        On success, ``(text_path, text_content)``: the path of the newly
        created UTF-8 text file and the extracted text itself. The file is
        not deleted automatically; the caller is responsible for removing it.
        On failure, ``(None, error_message)``.
    """
    try:
        # Extract first so a failed parse never leaves a stray temp file.
        doc = docx.Document(docx_file)
        text_content = "".join(para.text + "\n" for para in doc.paragraphs)

        # NamedTemporaryFile creates the file atomically, unlike the
        # deprecated, race-prone tempfile.mktemp which only reserves a name.
        with tempfile.NamedTemporaryFile(
            mode='w', suffix='.txt', encoding='utf-8', delete=False
        ) as text_file:
            text_file.write(text_content)
            text_path = text_file.name

        return text_path, text_content

    except Exception as e:
        # Errors are reported through the return value rather than raised;
        # callers detect failure by checking for a None path.
        error_message = f"Error processing Word document: {str(e)}"
        return None, error_message