# app.py
"""Answer questions about the text content of PDF, DOCX, PPTX and XLSX files.

Each supported format has an ``extract_text_from_*`` function that returns the
document's text (or an ``"Error reading ..."`` message on failure).
``answer_question_from_doc`` picks the extractor from the file extension and
feeds the extracted text to an extractive question-answering pipeline.
"""

import os

import fitz  # PyMuPDF for PDFs
import easyocr  # OCR for images
import openpyxl  # XLSX processing
import pptx  # PPTX processing
import docx  # DOCX processing
from transformers import pipeline

# Initialize AI Models
qa_model = pipeline("question-answering", model="deepset/roberta-base-squad2")
# NOTE(review): `reader` is not used by any function visible in this module;
# it is kept because other code may import it from here.
reader = easyocr.Reader(['en', 'fr'])  # OCR for English & French


# ---- TEXT EXTRACTION FUNCTIONS ----

def _read_pdf(path):
    """Return the text of every page of the PDF at *path*; raise on failure."""
    with fitz.open(path) as doc:
        # The join consumes the generator while the document is still open.
        return "\n".join(page.get_text("text") for page in doc)


def _read_docx(path):
    """Return the non-blank paragraphs of the DOCX at *path*; raise on failure."""
    document = docx.Document(path)
    return "\n".join(p.text for p in document.paragraphs if p.text.strip())


def _read_pptx(path):
    """Return the text of every text-bearing shape in the PPTX at *path*."""
    presentation = pptx.Presentation(path)
    lines = []
    for slide in presentation.slides:
        for shape in slide.shapes:
            if hasattr(shape, "text"):
                lines.append(shape.text)
    return "\n".join(lines)


def _read_xlsx(path):
    """Return one line of space-separated cell values per non-empty row."""
    workbook = openpyxl.load_workbook(path)
    lines = []
    # `worksheets` excludes chartsheets, which have no `iter_rows`.
    for sheet in workbook.worksheets:
        for row in sheet.iter_rows(values_only=True):
            # Filter on None/"" rather than truthiness so that legitimate
            # values such as 0 and False are kept.
            cells = [str(cell) for cell in row if cell is not None and cell != ""]
            if cells:  # skip fully empty rows instead of emitting blank lines
                lines.append(" ".join(cells))
    return "\n".join(lines)


def extract_text_from_pdf(pdf_file):
    """Extract the text of a PDF.

    Args:
        pdf_file: Path of the PDF file.

    Returns:
        The page texts joined with newlines, or ``"Error reading PDF: ..."``
        if the file could not be read.
    """
    try:
        return _read_pdf(pdf_file)
    except Exception as e:
        return f"Error reading PDF: {e}"


def extract_text_from_docx(docx_file):
    """Extract the text of a DOCX document.

    Args:
        docx_file: Path of the DOCX file.

    Returns:
        The non-blank paragraphs joined with newlines, or
        ``"Error reading DOCX: ..."`` if the file could not be read.
    """
    try:
        return _read_docx(docx_file)
    except Exception as e:
        # Same error contract as the PDF/PPTX/XLSX extractors.
        return f"Error reading DOCX: {e}"


def extract_text_from_pptx(pptx_file):
    """Extract the text of a PPTX presentation.

    Args:
        pptx_file: Path of the PPTX file.

    Returns:
        The text of each text-bearing shape joined with newlines, or
        ``"Error reading PPTX: ..."`` if the file could not be read.
    """
    try:
        return _read_pptx(pptx_file)
    except Exception as e:
        return f"Error reading PPTX: {e}"


def extract_text_from_xlsx(xlsx_file):
    """Extract the cell values of an XLSX workbook.

    Args:
        xlsx_file: Path of the XLSX file.

    Returns:
        One line per non-empty row (cell values separated by spaces), joined
        with newlines, or ``"Error reading XLSX: ..."`` if the file could not
        be read.
    """
    try:
        return _read_xlsx(xlsx_file)
    except Exception as e:
        return f"Error reading XLSX: {e}"


# Extension -> (label used in error messages, reader that raises on failure).
_READERS = {
    ".pdf": ("PDF", _read_pdf),
    ".docx": ("DOCX", _read_docx),
    ".pptx": ("PPTX", _read_pptx),
    ".xlsx": ("XLSX", _read_xlsx),
}


# ---- MAIN QA FUNCTION ----

def answer_question_from_doc(file, question):
    """Answer *question* using the text extracted from *file*.

    Args:
        file: Either a path (``str`` / ``os.PathLike``) or an object whose
            ``name`` attribute holds the path of the document.
        question: The question to ask about the document.

    Returns:
        The answer string produced by the QA model, or a human-readable
        message when no file was given, the format is unsupported, the file
        could not be read, it contains no text, or the model failed.
    """
    if file is None:
        return "No file provided."

    # Path objects also have a `.name`, but it is only the basename, so real
    # paths must be handled before falling back to the `.name` attribute.
    if isinstance(file, (str, os.PathLike)):
        path = os.fspath(file)
    else:
        path = file.name

    # splitext yields "" for names without a dot, so an extension-less file
    # called "pdf" is no longer mistaken for a PDF.
    ext = os.path.splitext(path)[1].lower()
    entry = _READERS.get(ext)
    if entry is None:
        return "Unsupported file format."

    label, read = entry
    try:
        context = read(path)
    except Exception as e:
        # Report the failure directly; the error text must never be used as
        # the QA context.
        return f"Error reading {label}: {e}"

    if not context.strip():
        return "No text found in the document."

    try:
        result = qa_model({"question": question, "context": context})
        return result["answer"]
    except Exception as e:
        return f"Error generating answer: {e}"