File size: 2,606 Bytes
5b863e3
 
 
 
 
 
2644d07
 
04626e2
5b863e3
50195a6
5f2bd1b
04626e2
5b863e3
04626e2
5b863e3
 
 
 
 
 
 
 
04626e2
 
 
5b863e3
04626e2
 
 
5b863e3
 
 
 
 
 
 
 
04626e2
 
 
 
5b863e3
 
 
 
 
 
 
 
04626e2
 
811b0b3
5b863e3
 
811b0b3
5b863e3
 
 
 
 
 
 
 
 
2644d07
04626e2
5b863e3
2644d07
04626e2
5f2bd1b
 
2644d07
 
 
 
 
 
5f2bd1b
2644d07
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import fitz  # PyMuPDF for PDFs
import easyocr  # OCR for images
import openpyxl  # XLSX processing
import pptx  # PPTX processing
import docx  # DOCX processing
from transformers import pipeline
from gtts import gTTS
import tempfile

# Initialize AI Models
# Both objects are created once, at import time, and shared by every call below.
# Extractive question-answering pipeline using the "deepset/roberta-base-squad2" checkpoint.
qa_model = pipeline("question-answering", model="deepset/roberta-base-squad2")
# NOTE(review): `reader` is not referenced by any function visible in this
# section — presumably intended for image/scanned-page OCR; confirm it is used elsewhere.
reader = easyocr.Reader(['en', 'fr'])  # OCR for English & French

# ---- TEXT EXTRACTION FUNCTIONS ----
def extract_text_from_pdf(pdf_file):
    """Return the plain text of every page of a PDF, one page per line group.

    Args:
        pdf_file: Value handed to ``fitz.open`` (the caller in this file
            passes a filesystem path).

    Returns:
        The page texts joined with newlines, or the string
        ``"Error reading PDF: <exception>"`` if opening or reading fails.
    """
    try:
        with fitz.open(pdf_file) as document:
            # Collect the text layer of each page in document order.
            page_texts = [page.get_text("text") for page in document]
    except Exception as e:
        # Failures are reported through the return value, not raised.
        return f"Error reading PDF: {e}"
    return "\n".join(page_texts)

def extract_text_from_docx(docx_file):
    """Return the text of all non-blank paragraphs of a DOCX file.

    Args:
        docx_file: Value handed to ``docx.Document`` (the caller in this
            file passes a filesystem path).

    Returns:
        The paragraph texts joined with newlines (whitespace-only paragraphs
        are skipped), or the string ``"Error reading DOCX: <exception>"`` if
        the document cannot be opened or read. Reporting failures through the
        return value matches the PDF, PPTX and XLSX extractors in this file.
    """
    try:
        document = docx.Document(docx_file)
        paragraphs = [p.text for p in document.paragraphs if p.text.strip()]
    except Exception as e:
        return f"Error reading DOCX: {e}"
    return "\n".join(paragraphs)

def extract_text_from_pptx(pptx_file):
    """Return the text of every text-bearing shape in a PPTX presentation.

    Args:
        pptx_file: Value handed to ``pptx.Presentation`` (the caller in this
            file passes a filesystem path).

    Returns:
        The ``text`` of each shape that exposes one, in slide order, joined
        with newlines; or the string ``"Error reading PPTX: <exception>"`` if
        loading or traversing the presentation fails.
    """
    try:
        deck = pptx.Presentation(pptx_file)
        # Shapes without a ``text`` attribute are skipped.
        shape_texts = [
            shape.text
            for slide in deck.slides
            for shape in slide.shapes
            if hasattr(shape, "text")
        ]
    except Exception as e:
        # Failures are reported through the return value, not raised.
        return f"Error reading PPTX: {e}"
    return "\n".join(shape_texts)

def extract_text_from_xlsx(xlsx_file):
    """Return the cell values of every sheet of an XLSX workbook as text.

    Each row becomes one line whose non-empty cells are converted with
    ``str`` and joined by single spaces. Rows with no values yield an empty
    line.

    Args:
        xlsx_file: Value handed to ``openpyxl.load_workbook`` (the caller in
            this file passes a filesystem path).

    Returns:
        The row lines joined with newlines, or the string
        ``"Error reading XLSX: <exception>"`` if loading or reading fails.
    """
    lines = []
    try:
        wb = openpyxl.load_workbook(xlsx_file)
        for sheet in wb.sheetnames:
            ws = wb[sheet]
            for row in ws.iter_rows(values_only=True):
                # Only skip genuinely empty cells. A plain truthiness test
                # would also discard real values such as 0, 0.0 and False.
                lines.append(
                    " ".join(
                        str(cell)
                        for cell in row
                        if cell is not None and cell != ""
                    )
                )
    except Exception as e:
        return f"Error reading XLSX: {e}"
    return "\n".join(lines)

# ---- MAIN QA FUNCTION ----
def answer_question_from_doc(file, question):
    """Answer a question about an uploaded document and speak the answer.

    The document text is extracted according to the file extension
    (pdf, docx, pptx, xlsx), passed to ``qa_model`` together with the
    question, and the resulting answer is converted to an MP3 with gTTS.

    Args:
        file: Either an object exposing the path as ``.name`` or a plain
            path string. ``None`` is reported as a missing upload.
        question: The question to ask about the document.

    Returns:
        A ``(message, audio_path)`` tuple. ``message`` is the answer, or a
        human-readable error/status message. ``audio_path`` is the path of a
        temporary ``.mp3`` file, or ``None`` when no audio was produced
        (any error, or speech synthesis failed after the answer was found).
    """
    import os  # local import: only needed for temp-file cleanup below

    if file is None:
        return "Please upload a file.", None

    # Accept both file-like wrappers (with .name) and bare path strings.
    path = str(getattr(file, "name", file))
    ext = path.rsplit(".", 1)[-1].lower()
    if ext not in ("pdf", "docx", "pptx", "xlsx"):
        return "Unsupported file format.", None

    if not question or not question.strip():
        return "Please enter a question.", None

    extractors = {
        "pdf": extract_text_from_pdf,
        "docx": extract_text_from_docx,
        "pptx": extract_text_from_pptx,
        "xlsx": extract_text_from_xlsx,
    }
    error_prefix = f"Error reading {ext.upper()}: "
    try:
        context = extractors[ext](path)
    except Exception as e:
        # An extractor that raises instead of returning an error string
        # must not crash the caller.
        return f"{error_prefix}{e}", None

    # The extractors signal failure by returning an "Error reading ..."
    # string. Surface it to the user instead of feeding it to the QA model
    # as if it were document text.
    # NOTE(review): a document whose own text starts with this exact prefix
    # would be misreported; acceptable given the extractors' contract.
    if context.startswith(error_prefix):
        return context, None

    if not context.strip():
        return "No text found in the document.", None

    try:
        result = qa_model({"question": question, "context": context})
        answer = result["answer"]
    except Exception as e:
        return f"Error generating answer: {e}", None

    if not answer or not answer.strip():
        # There is nothing to speak, so skip synthesis entirely.
        return "No answer found in the document.", None

    # Speech synthesis is best-effort: a failure here must not discard an
    # answer that was already computed.
    audio_path = None
    try:
        tts = gTTS(text=answer)
        # Reserve a unique filename and close the handle before saving, so
        # the file is not opened twice (which fails on Windows).
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
            audio_path = tmp.name
        tts.save(audio_path)
    except Exception:
        if audio_path is not None:
            try:
                os.remove(audio_path)  # don't leave an empty temp file behind
            except OSError:
                pass
        return answer, None
    return answer, audio_path