File size: 2,348 Bytes
811b0b3
5b863e3
 
 
 
 
 
04626e2
5b863e3
50195a6
5f2bd1b
04626e2
5b863e3
04626e2
5b863e3
 
 
 
 
 
 
 
04626e2
 
 
5b863e3
04626e2
 
 
5b863e3
 
 
 
 
 
 
 
04626e2
 
 
 
5b863e3
 
 
 
 
 
 
 
04626e2
 
811b0b3
5b863e3
 
811b0b3
5b863e3
 
 
 
 
 
 
 
 
811b0b3
04626e2
5b863e3
811b0b3
04626e2
5f2bd1b
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
# app.py
import fitz  # PyMuPDF for PDFs
import easyocr  # OCR for images
import openpyxl  # XLSX processing
import pptx  # PPTX processing
import docx  # DOCX processing
from transformers import pipeline

# Initialize AI Models
# Both objects are created once, at import time, as module-level globals.
# Question-answering pipeline loaded from the "deepset/roberta-base-squad2"
# checkpoint; called by answer_question_from_doc with a question/context pair.
qa_model = pipeline("question-answering", model="deepset/roberta-base-squad2")
# EasyOCR reader configured for English and French.
# NOTE(review): no function visible in this section references `reader` --
# confirm it is used elsewhere before relying on or removing it.
reader = easyocr.Reader(['en', 'fr'])  # OCR for English & French

# ---- TEXT EXTRACTION FUNCTIONS ----
def extract_text_from_pdf(pdf_file):
    """Return the plain text of every page in a PDF, one page per line group.

    Args:
        pdf_file: Path (or whatever ``fitz.open`` accepts) of the PDF to read.

    Returns:
        The page texts joined with newlines, or an
        ``"Error reading PDF: ..."`` message if anything raised while the
        document was being opened or read.
    """
    try:
        with fitz.open(pdf_file) as document:
            page_texts = [page.get_text("text") for page in document]
    except Exception as exc:
        # Failures are reported as text rather than propagated.
        return f"Error reading PDF: {exc}"
    return "\n".join(page_texts)

def extract_text_from_docx(docx_file):
    """Return the text of the non-blank paragraphs of a DOCX file.

    Only ``Document.paragraphs`` is read; paragraphs whose text is empty or
    whitespace-only are skipped.

    Args:
        docx_file: Path (or whatever ``docx.Document`` accepts) of the file.

    Returns:
        The paragraph texts joined with newlines, or an
        ``"Error reading DOCX: ..."`` message if the file could not be opened
        or parsed.
    """
    try:
        document = docx.Document(docx_file)
        paragraphs = [p.text for p in document.paragraphs if p.text.strip()]
    except Exception as exc:
        # Report failures as text, matching the PDF/PPTX/XLSX extractors,
        # instead of letting a bad upload raise out of the caller.
        return f"Error reading DOCX: {exc}"
    return "\n".join(paragraphs)

def extract_text_from_pptx(pptx_file):
    """Return the text of every text-bearing shape in a PPTX presentation.

    Args:
        pptx_file: Path (or whatever ``pptx.Presentation`` accepts) of the
            presentation.

    Returns:
        The ``text`` of each shape that has such an attribute, in slide order
        and joined with newlines, or an ``"Error reading PPTX: ..."`` message
        if anything raised while reading.
    """
    shape_texts = []
    try:
        deck = pptx.Presentation(pptx_file)
        for slide in deck.slides:
            # Shapes without a ``text`` attribute are skipped.
            shape_texts.extend(
                shape.text for shape in slide.shapes if hasattr(shape, "text")
            )
    except Exception as exc:
        return f"Error reading PPTX: {exc}"
    return "\n".join(shape_texts)

def extract_text_from_xlsx(xlsx_file):
    """Return the cell contents of every worksheet in an XLSX workbook.

    Each row becomes one output line with its non-empty cell values
    space-separated. A row with no values yields an empty line.

    Args:
        xlsx_file: Path (or whatever ``openpyxl.load_workbook`` accepts) of
            the workbook.

    Returns:
        The row lines joined with newlines, or an
        ``"Error reading XLSX: ..."`` message if anything raised while
        reading.
    """
    lines = []
    try:
        workbook = openpyxl.load_workbook(xlsx_file)
        # Iterate ``worksheets`` rather than ``sheetnames``: the latter also
        # lists chartsheets, which have no ``iter_rows`` and would make the
        # whole workbook fail.
        for worksheet in workbook.worksheets:
            for row in worksheet.iter_rows(values_only=True):
                # Skip only truly empty cells (None / ""); a plain truthiness
                # test would also discard real values such as 0 or False.
                lines.append(
                    " ".join(
                        str(value)
                        for value in row
                        if value is not None and value != ""
                    )
                )
    except Exception as exc:
        return f"Error reading XLSX: {exc}"
    return "\n".join(lines)

# ---- MAIN QA FUNCTION ----
def answer_question_from_doc(file, question):
    """Answer *question* from the text of an uploaded document.

    The extractor is chosen from the file extension (pdf, docx, pptx, xlsx),
    and the extracted text is passed to ``qa_model`` as the context.

    Args:
        file: The document to read: a path (``str`` or ``os.PathLike``) or an
            object whose ``name`` attribute holds the path. ``None`` is
            reported as a message instead of raising.
        question: The question to ask about the document.

    Returns:
        The model's answer, or a human-readable message when no file was
        given, the format is unsupported, extraction reported an error, the
        document contains no text, or the model raised.
    """
    import os  # local import: only needed to normalise path-like inputs

    if file is None:
        return "No file provided."

    # Accept plain paths as well as upload objects exposing ``.name``.
    # (A pathlib.Path also has ``.name``, but that is only the basename, so
    # path-like objects must be handled before the attribute lookup.)
    if isinstance(file, (str, os.PathLike)):
        path = os.fspath(file)
    else:
        path = file.name

    ext = path.split(".")[-1].lower()

    if ext == "pdf":
        context = extract_text_from_pdf(path)
    elif ext == "docx":
        context = extract_text_from_docx(path)
    elif ext == "pptx":
        context = extract_text_from_pptx(path)
    elif ext == "xlsx":
        context = extract_text_from_xlsx(path)
    else:
        return "Unsupported file format."

    # Extractors that catch their own failures return a string of the form
    # "Error reading <FORMAT>: ...". Surface it to the user as-is instead of
    # asking the model to answer from the error text.
    if context.startswith(f"Error reading {ext.upper()}: "):
        return context

    if not context.strip():
        return "No text found in the document."

    try:
        result = qa_model({"question": question, "context": context})
        return result["answer"]
    except Exception as e:
        return f"Error generating answer: {e}"