Spaces:
Running
Running
import docx | |
import PyPDF2 | |
import os | |
import re | |
import json | |
import time | |
import tempfile | |
from typing import Dict, Any, List, Optional | |
from src.quiz_processing import analyze_document | |
def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF file.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The concatenated page texts, each page followed by a newline.

    Raises:
        Exception: If the file cannot be opened or parsed. The message is
            prefixed with "Error extracting text from PDF:" and the
            original error is chained as ``__cause__``.
    """
    try:
        with open(pdf_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            # Defensive: treat a page that yields no text as empty so a
            # single odd page cannot abort the whole document.
            return "".join(
                (page.extract_text() or "") + "\n" for page in reader.pages
            )
    except Exception as e:
        # Chain the cause so the underlying traceback is not lost.
        raise Exception(f"Error extracting text from PDF: {str(e)}") from e
def extract_text_from_docx(docx_path):
    """Extract the paragraph text of a DOCX file.

    Only ``doc.paragraphs`` is read, so the result contains the text of
    those paragraphs joined with newlines.

    Args:
        docx_path: Path of the DOCX file to read.

    Returns:
        The paragraph texts joined by newline characters.

    Raises:
        Exception: If the document cannot be opened or read. The message
            is prefixed with "Error extracting text from DOCX:" and the
            original error is chained as ``__cause__``.
    """
    try:
        doc = docx.Document(docx_path)
        return "\n".join(paragraph.text for paragraph in doc.paragraphs)
    except Exception as e:
        # Chain the cause so the underlying traceback is not lost.
        raise Exception(f"Error extracting text from DOCX: {str(e)}") from e
def extract_text_from_txt(txt_path):
    """Read a UTF-8 text file and return its contents.

    Args:
        txt_path: Path of the text file to read.

    Returns:
        The decoded file contents, without a leading byte-order mark.

    Raises:
        Exception: If the file cannot be opened or decoded. The message
            is prefixed with "Error extracting text from TXT:" and the
            original error is chained as ``__cause__``.
    """
    try:
        # 'utf-8-sig' drops a leading BOM when present and otherwise
        # decodes exactly like 'utf-8', so the BOM never leaks into the text.
        with open(txt_path, 'r', encoding='utf-8-sig') as file:
            return file.read()
    except Exception as e:
        # Chain the cause so the underlying traceback is not lost.
        raise Exception(f"Error extracting text from TXT: {str(e)}") from e
def process_document(document_path, gemini_api_key, language, content_type):
    """Extract the text of a document and pass it to ``analyze_document``.

    Args:
        document_path: Either a filesystem path (``str`` or
            ``os.PathLike``) or an uploaded-file object. Upload objects
            must expose ``name`` (used only to determine the extension)
            and either a ``read()`` method or a ``file`` attribute that
            provides ``chunks()``.
        gemini_api_key: Forwarded unchanged to ``analyze_document``.
        language: Forwarded unchanged to ``analyze_document``.
        content_type: Forwarded unchanged to ``analyze_document``.

    Returns:
        A 5-tuple ``(status, text_file_path, formatted_output, txt_path,
        json_path)``. On success ``status`` is "Document processed
        successfully" and ``text_file_path`` is a temporary ``.txt`` file
        holding the extracted text. On any failure the tuple is
        ``(error_message, None, error_message, None, None)``; no
        exception propagates to the caller.
    """
    upload_copy = None      # temp copy of an uploaded file; always removed
    text_file_path = None   # handed to the caller on success only
    try:
        # Resolve the input kind first, so plain paths never hit `.name`.
        if isinstance(document_path, (str, os.PathLike)):
            source_path = os.fsdecode(document_path)
            source_name = source_path
        elif hasattr(document_path, 'read') or hasattr(document_path, 'file'):
            source_path = None
            source_name = str(getattr(document_path, 'name', '') or '')
        else:
            raise Exception("Unsupported document_path type")

        # Validate the extension before writing anything to disk.
        file_extension = os.path.splitext(source_name)[-1].lower()
        if file_extension not in ('.pdf', '.docx', '.txt'):
            raise Exception(f"Unsupported file type: {file_extension}")

        if source_path is None:
            # mkstemp creates the file atomically, unlike the deprecated,
            # race-prone tempfile.mktemp.
            fd, upload_copy = tempfile.mkstemp(suffix=file_extension)
            with os.fdopen(fd, 'wb') as f:
                if hasattr(document_path, 'read'):
                    f.write(document_path.read())
                else:
                    # NOTE(review): presumably a web-framework upload
                    # wrapper whose `.file` supports chunks() -- confirm
                    # against the actual caller.
                    for chunk in document_path.file.chunks():
                        f.write(chunk)
            source_path = upload_copy

        if file_extension == '.pdf':
            text = extract_text_from_pdf(source_path)
        elif file_extension == '.docx':
            text = extract_text_from_docx(source_path)
        else:
            text = extract_text_from_txt(source_path)

        fd, text_file_path = tempfile.mkstemp(suffix='.txt')
        with os.fdopen(fd, 'w', encoding='utf-8') as f:
            f.write(text)

        formatted_output, json_path, txt_path = analyze_document(
            text, gemini_api_key, language, content_type
        )
        return (
            "Document processed successfully",
            text_file_path,
            formatted_output,
            txt_path,
            json_path,
        )
    except Exception as e:
        # The caller receives None for the text file on failure, so do not
        # leave an orphaned temp file behind (best-effort cleanup).
        if text_file_path is not None:
            try:
                os.remove(text_file_path)
            except OSError:
                pass
        error_message = f"Error processing document: {str(e)}"
        return error_message, None, error_message, None, None
    finally:
        # Remove only the copy created here; a caller-supplied path is
        # never deleted.
        if upload_copy is not None:
            try:
                os.remove(upload_copy)
            except OSError:
                pass