Spaces:

MrSimple01
/

SimpleLearn_2

Running

File size: 2,350 Bytes

4cc0ea8
 
bcf9fd7
 
 
 
3a765ef
bcf9fd7
4cc0ea8

import docx
import PyPDF2
import os
import re
import json
import time
import tempfile
from typing import Dict, Any, List, Optional

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, each page followed by a newline.

    Args:
        pdf_path: Filesystem path of the PDF file to read.

    Returns:
        The page texts concatenated in page order, with one newline
        appended after each page.

    Raises:
        Exception: If the file cannot be opened or its text cannot be
            extracted. The underlying error is attached as ``__cause__``.
    """
    try:
        with open(pdf_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            # Join once instead of growing a string page by page. The join is
            # evaluated before the ``with`` block exits, so every page is read
            # while the file is still open.
            # ``or ""`` is defensive: a page whose extraction yields None is
            # treated as empty instead of aborting the whole document.
            return "".join(
                (page.extract_text() or "") + "\n" for page in reader.pages
            )
    except Exception as e:
        # Same exception type and message as before; the cause is now chained.
        raise Exception(f"Error extracting text from PDF: {str(e)}") from e

def extract_text_from_docx(docx_path):
    """Return the paragraph texts of a DOCX file joined by newlines.

    Only ``doc.paragraphs`` is read; nothing else in the document is
    consulted by this function.

    Args:
        docx_path: Path of the DOCX file, passed straight to ``docx.Document``.

    Returns:
        The text of each paragraph, in order, separated by a newline.

    Raises:
        Exception: If the document cannot be opened or read. The underlying
            error is attached as ``__cause__``.
    """
    try:
        doc = docx.Document(docx_path)
        return "\n".join(paragraph.text for paragraph in doc.paragraphs)
    except Exception as e:
        # Same exception type and message as before; the cause is now chained.
        raise Exception(f"Error extracting text from DOCX: {str(e)}") from e

def extract_text_from_txt(txt_path):
    """Return the full contents of a UTF-8 encoded text file.

    Args:
        txt_path: Filesystem path of the text file to read.

    Returns:
        The decoded file contents. A leading UTF-8 byte-order mark, if
        present, is not included in the result.

    Raises:
        Exception: If the file cannot be opened or decoded. The underlying
            error is attached as ``__cause__``.
    """
    try:
        # 'utf-8-sig' decodes plain UTF-8 exactly like 'utf-8' but also drops
        # a leading BOM, which would otherwise leak into the text as '\ufeff'.
        with open(txt_path, 'r', encoding='utf-8-sig') as file:
            return file.read()
    except Exception as e:
        # Same exception type and message as before; the cause is now chained.
        raise Exception(f"Error extracting text from TXT: {str(e)}") from e

def _remove_if_exists(path):
    """Delete *path* if it is set, ignoring filesystem errors (best-effort cleanup)."""
    if path is None:
        return
    try:
        os.remove(path)
    except OSError:
        pass


def process_document(document_path, gemini_api_key, language, content_type):
    """Extract text from an uploaded document and pass it to ``analyze_document``.

    Args:
        document_path: Despite the name, an uploaded-file object rather than a
            path string: the code reads its ``.name`` attribute (used only for
            the file extension) and calls its ``.read()`` method, whose result
            is written to a binary file.
        gemini_api_key: Forwarded unchanged to ``analyze_document``.
        language: Forwarded unchanged to ``analyze_document``.
        content_type: Forwarded unchanged to ``analyze_document``.

    Returns:
        A 5-tuple ``(status, text_file_path, formatted_output, txt_path,
        json_path)``. On success ``status`` is a fixed success message and
        ``text_file_path`` is a temporary ``.txt`` file holding the extracted
        text; the remaining three values come from ``analyze_document``.
        On any failure the tuple is ``(error_message, None, error_message,
        None, None)`` -- errors are reported, never raised.
    """
    source_copy_path = None
    text_file_path = None
    try:
        original_suffix = os.path.splitext(document_path.name)[-1]
        file_extension = original_suffix.lower()

        # Reject unsupported types up front, before the upload is read or
        # copied to disk.
        if file_extension not in ('.pdf', '.docx', '.txt'):
            raise Exception(f"Unsupported file type: {file_extension}")

        # mkstemp creates the file atomically; the deprecated mktemp only
        # returns a name, leaving a window for another process to claim it.
        fd, source_copy_path = tempfile.mkstemp(suffix=original_suffix)
        with os.fdopen(fd, 'wb') as f:
            f.write(document_path.read())

        if file_extension == '.pdf':
            text = extract_text_from_pdf(source_copy_path)
        elif file_extension == '.docx':
            text = extract_text_from_docx(source_copy_path)
        else:
            text = extract_text_from_txt(source_copy_path)

        # This file is handed back to the caller, so it is kept on success.
        fd, text_file_path = tempfile.mkstemp(suffix='.txt')
        with os.fdopen(fd, 'w', encoding='utf-8') as f:
            f.write(text)

        formatted_output, json_path, txt_path = analyze_document(
            text,
            gemini_api_key,
            language,
            content_type
        )

        return "Document processed successfully", text_file_path, formatted_output, txt_path, json_path
    except Exception as e:
        # The text file's path is not returned on failure, so remove the file
        # rather than leave it orphaned in the temp directory.
        _remove_if_exists(text_file_path)
        error_message = f"Error processing document: {str(e)}"
        return error_message, None, error_message, None, None
    finally:
        # The copy of the upload is only needed during extraction.
        _remove_if_exists(source_copy_path)