import json
import os
import re
import tempfile
import time
from typing import Any, Dict, List, Optional

import docx
import PyPDF2

from src.quiz_processing import analyze_document


def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, each page followed by a newline.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The concatenated page texts; each page's text is terminated by "\\n".

    Raises:
        Exception: If the file cannot be opened or parsed. The original error
            is chained as ``__cause__``.
    """
    try:
        with open(pdf_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            # A page without extractable text must not break the
            # concatenation, so a missing result is treated as "".
            return "".join(
                (page.extract_text() or "") + "\n" for page in reader.pages
            )
    except Exception as e:
        raise Exception(f"Error extracting text from PDF: {str(e)}") from e


def extract_text_from_docx(docx_path):
    """Return the paragraphs of a DOCX document joined by newlines.

    Args:
        docx_path: Path of the DOCX file to read.

    Returns:
        The text of all paragraphs in ``doc.paragraphs``, separated by "\\n".

    Raises:
        Exception: If the document cannot be opened or read. The original
            error is chained as ``__cause__``.
    """
    try:
        doc = docx.Document(docx_path)
        return "\n".join(paragraph.text for paragraph in doc.paragraphs)
    except Exception as e:
        raise Exception(f"Error extracting text from DOCX: {str(e)}") from e


def extract_text_from_txt(txt_path):
    """Return the full content of a UTF-8 encoded text file.

    Args:
        txt_path: Path of the text file to read.

    Returns:
        The decoded file content.

    Raises:
        Exception: If the file cannot be opened or decoded. The original
            error is chained as ``__cause__``.
    """
    try:
        with open(txt_path, 'r', encoding='utf-8') as file:
            return file.read()
    except Exception as e:
        raise Exception(f"Error extracting text from TXT: {str(e)}") from e


def _remove_if_exists(path):
    """Delete *path*, ignoring the case where it is already gone or locked."""
    try:
        os.remove(path)
    except OSError:
        # Best-effort cleanup: a failed delete must not mask the real result.
        pass


def _copy_upload_to_temp_file(upload, suffix):
    """Copy an uploaded / file-like object into a new temp file; return its path.

    The file is created with ``tempfile.mkstemp`` so that creation is atomic
    (unlike the deprecated ``tempfile.mktemp``). The caller owns the returned
    path and is responsible for deleting it.
    """
    has_read = hasattr(upload, 'read')
    if not has_read and not hasattr(upload, 'file'):
        raise Exception("Unsupported document_path type")

    fd, temp_path = tempfile.mkstemp(suffix=suffix)
    try:
        with os.fdopen(fd, 'wb') as out:
            if has_read:
                # Plain file-like object exposing read().
                out.write(upload.read())
            else:
                # Upload wrapper whose underlying ``file`` exposes chunks().
                for chunk in upload.file.chunks():
                    out.write(chunk)
    except Exception:
        # Do not leave a partially written temp file behind.
        _remove_if_exists(temp_path)
        raise
    return temp_path


def _extract_text(path, file_extension):
    """Dispatch to the extractor matching *file_extension* (lower-cased)."""
    extractors = {
        '.pdf': extract_text_from_pdf,
        '.docx': extract_text_from_docx,
        '.txt': extract_text_from_txt,
    }
    extractor = extractors.get(file_extension)
    if extractor is None:
        raise Exception(f"Unsupported file type: {file_extension}")
    return extractor(path)


def process_document(document_path, gemini_api_key, language, content_type):
    """Extract a document's text and pass it to ``analyze_document``.

    Args:
        document_path: Either a filesystem path (``str`` or ``os.PathLike``)
            or an uploaded / file-like object that has a ``name`` attribute
            and either a ``read()`` method or a ``file`` attribute exposing
            ``chunks()``.
        gemini_api_key: Forwarded unchanged to ``analyze_document``.
        language: Forwarded unchanged to ``analyze_document``.
        content_type: Forwarded unchanged to ``analyze_document``.

    Returns:
        A 5-tuple ``(status_message, text_file_path, formatted_output,
        txt_path, json_path)``. ``text_file_path`` is a temp ``.txt`` file
        holding the extracted text; the last three values come from
        ``analyze_document``. On any failure the tuple is
        ``(error_message, None, error_message, None, None)``; this function
        does not raise for ordinary errors.
    """
    uploaded_copy_path = None
    text_file_path = None
    try:
        if isinstance(document_path, (str, os.PathLike)):
            # A path has no ``.name`` to inspect (for str), so it must be
            # handled before touching attributes of upload objects.
            source_path = os.fspath(document_path)
            file_extension = os.path.splitext(source_path)[-1].lower()
        else:
            file_extension = os.path.splitext(document_path.name)[-1].lower()
            uploaded_copy_path = _copy_upload_to_temp_file(
                document_path, file_extension
            )
            source_path = uploaded_copy_path

        text = _extract_text(source_path, file_extension)

        # delete=False: the path is handed back to the caller, so the file
        # must outlive this function.
        with tempfile.NamedTemporaryFile(
            'w', suffix='.txt', encoding='utf-8', delete=False
        ) as text_file:
            text_file_path = text_file.name
            text_file.write(text)

        formatted_output, json_path, txt_path = analyze_document(
            text, gemini_api_key, language, content_type
        )

        return (
            "Document processed successfully",
            text_file_path,
            formatted_output,
            txt_path,
            json_path,
        )
    except Exception as e:
        # The text file path is not returned on failure, so nobody else
        # could clean it up.
        if text_file_path is not None:
            _remove_if_exists(text_file_path)
        error_message = f"Error processing document: {str(e)}"
        return error_message, None, error_message, None, None
    finally:
        # The temporary copy of an upload is only needed during extraction.
        if uploaded_copy_path is not None:
            _remove_if_exists(uploaded_copy_path)