File size: 2,393 Bytes
4f031a5
 
 
 
 
 
 
 
5b863e3
2644d07
 
e540abd
 
2a21b95
e540abd
50195a6
e540abd
04626e2
 
5b863e3
 
4f031a5
5b863e3
 
04626e2
 
 
e540abd
04626e2
 
5b863e3
e540abd
4f031a5
5b863e3
 
04626e2
 
5b863e3
 
4f031a5
5b863e3
 
04626e2
5b863e3
e540abd
 
 
 
4f031a5
811b0b3
5b863e3
e540abd
5b863e3
e540abd
5b863e3
e540abd
5b863e3
e540abd
5b863e3
2644d07
04626e2
5b863e3
2644d07
04626e2
5f2bd1b
 
2644d07
e540abd
2644d07
 
4f031a5
5f2bd1b
2644d07
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
### ✅ app.py — Document QA Backend (Cleaned)
from fastapi import FastAPI
from fastapi.responses import FileResponse, JSONResponse
import fitz  # PyMuPDF
import easyocr
import openpyxl
import pptx
import docx
from transformers import pipeline
from gtts import gTTS
import tempfile
import os

# FastAPI application object. No routes are registered on it in the visible
# part of this file.
app = FastAPI()

# Both models below are constructed once, at import time.
# Question-answering pipeline backed by the deepset/roberta-base-squad2 checkpoint.
qa_model = pipeline("question-answering", model="deepset/roberta-base-squad2")
# EasyOCR reader configured for English and French.
# NOTE(review): `reader` is not referenced by any function visible here — confirm it is still needed.
reader = easyocr.Reader(['en', 'fr'])

def extract_text_from_pdf(pdf_file):
    """Return the plain text of every page in a PDF, one page per line group.

    Args:
        pdf_file: Path (or other source accepted by ``fitz.open``) of the PDF.

    Returns:
        The page texts joined with newlines. If anything goes wrong while
        opening or reading the document, a string of the form
        ``"Error reading PDF: <reason>"`` is returned instead of raising.
    """
    try:
        with fitz.open(pdf_file) as document:
            # Collect the text while the document is still open.
            page_texts = [page.get_text("text") for page in document]
        return "\n".join(page_texts)
    except Exception as err:
        return f"Error reading PDF: {err}"

def extract_text_from_docx(docx_file):
    """Return the text of all non-blank paragraphs in a DOCX file.

    Only body paragraphs (``doc.paragraphs``) are read.

    Args:
        docx_file: Path or file-like object accepted by ``docx.Document``.

    Returns:
        The non-blank paragraph texts joined with newlines. On failure a
        string of the form ``"Error reading DOCX: <reason>"`` is returned
        instead of raising, matching the PDF, PPTX and XLSX extractors in
        this file.
    """
    try:
        doc = docx.Document(docx_file)
        return "\n".join(p.text for p in doc.paragraphs if p.text.strip())
    except Exception as e:
        # Same best-effort contract as the sibling extractors: report the
        # problem as text rather than propagating the exception.
        return f"Error reading DOCX: {e}"

def extract_text_from_pptx(pptx_file):
    """Return the text of every shape that exposes a ``text`` attribute.

    Slides are visited in order, and within each slide the shapes are visited
    in the order ``slide.shapes`` yields them.

    Args:
        pptx_file: Path or file-like object accepted by ``pptx.Presentation``.

    Returns:
        The shape texts joined with newlines. If the presentation cannot be
        opened or read, a string of the form ``"Error reading PPTX: <reason>"``
        is returned instead of raising.
    """
    try:
        presentation = pptx.Presentation(pptx_file)
        shape_texts = []
        for slide in presentation.slides:
            for shape in slide.shapes:
                # Shapes without a ``text`` attribute are skipped.
                if hasattr(shape, "text"):
                    shape_texts.append(shape.text)
        return "\n".join(shape_texts)
    except Exception as err:
        return f"Error reading PPTX: {err}"

def extract_text_from_xlsx(xlsx_file):
    """Return the cell values of every worksheet as text, one row per line.

    Within a row, non-empty cell values are converted with ``str`` and joined
    with single spaces. Rows from all worksheets are joined with newlines in
    workbook order.

    Args:
        xlsx_file: Path or file-like object accepted by
            ``openpyxl.load_workbook``.

    Returns:
        The extracted text. On failure a string of the form
        ``"Error reading XLSX: <reason>"`` is returned instead of raising.
    """
    try:
        wb = openpyxl.load_workbook(xlsx_file)
        lines = []
        # Iterate wb.worksheets rather than wb.sheetnames: the latter also
        # lists chart sheets, which have no iter_rows() and would make the
        # whole workbook fail.
        for sheet in wb.worksheets:
            for row in sheet.iter_rows(values_only=True):
                # Skip only truly empty cells (None / ""), so that legitimate
                # falsy values such as 0, 0.0 and False are kept.
                values = [
                    str(cell)
                    for cell in row
                    if cell is not None and cell != ""
                ]
                lines.append(" ".join(values))
        return "\n".join(lines)
    except Exception as e:
        return f"Error reading XLSX: {e}"

def _remove_quietly(path):
    """Delete *path*, tolerating the case where it cannot be removed."""
    try:
        os.remove(path)
    except OSError:
        # Best-effort cleanup of a temporary file; a failure here must not
        # mask the result (or the exception) of the main operation.
        pass


def answer_question_from_doc(file, question):
    """Answer *question* from an uploaded document and synthesize the answer as speech.

    The upload is copied to a private temporary file, its text is extracted
    according to the filename extension (pdf, docx, pptx or xlsx), the QA
    model is run over that text, and the answer is saved as an MP3 with gTTS.

    Args:
        file: Upload object exposing ``filename`` and ``read()``.
            NOTE(review): ``read()`` is called synchronously and its result is
            written as-is; Starlette's ``UploadFile.read`` is a coroutine —
            confirm what the caller actually passes.
        question: Question text handed to the QA model.

    Returns:
        A tuple ``(answer, audio_path)`` on success, where ``audio_path`` is
        an MP3 file that the caller is responsible for deleting. On any
        failure the tuple is ``(message, None)``.
    """
    filename = file.filename or ""
    # Only text after the last dot counts as an extension; a name with no dot
    # has none (previously a file literally named "pdf" was treated as a PDF).
    _, dot, tail = filename.rpartition(".")
    ext = tail.lower() if dot else ""

    # Reject unsupported formats before reading or storing the upload.
    if ext not in ("pdf", "docx", "pptx", "xlsx"):
        return "Unsupported file format.", None

    # The client-supplied filename is untrusted, so it is never used to build
    # a path (e.g. "../../x.pdf" would escape the temp directory). Only the
    # whitelisted extension is kept, as the suffix of a random temp name.
    upload = tempfile.NamedTemporaryFile(delete=False, suffix=f".{ext}")
    file_path = upload.name
    try:
        with upload:
            upload.write(file.read())

        if ext == "pdf":
            context = extract_text_from_pdf(file_path)
        elif ext == "docx":
            context = extract_text_from_docx(file_path)
        elif ext == "pptx":
            context = extract_text_from_pptx(file_path)
        else:
            context = extract_text_from_xlsx(file_path)
    finally:
        # The stored copy is only needed for extraction; do not leave it behind.
        _remove_quietly(file_path)

    if not context.strip():
        return "No text found in the document.", None

    # The extractors report failures as "Error reading <FMT>: ..." strings.
    # Surface those directly instead of feeding them to the QA model as if
    # they were document text.
    if context.startswith(f"Error reading {ext.upper()}: "):
        return context, None

    try:
        result = qa_model({"question": question, "context": context})
        answer = result["answer"]
        tts = gTTS(answer)
        # Reserve a unique path and close the handle before gTTS writes to it
        # by name; an open NamedTemporaryFile cannot be reopened on Windows.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
            audio_path = tmp.name
        try:
            tts.save(audio_path)
        except Exception:
            # Do not leave an empty/partial MP3 behind when synthesis fails.
            _remove_quietly(audio_path)
            raise
        return answer, audio_path
    except Exception as e:
        return f"Error generating answer: {e}", None