Spaces:
Running
Running
File size: 2,393 Bytes
4f031a5 5b863e3 2644d07 e540abd 2a21b95 e540abd 50195a6 e540abd 04626e2 5b863e3 4f031a5 5b863e3 04626e2 e540abd 04626e2 5b863e3 e540abd 4f031a5 5b863e3 04626e2 5b863e3 4f031a5 5b863e3 04626e2 5b863e3 e540abd 4f031a5 811b0b3 5b863e3 e540abd 5b863e3 e540abd 5b863e3 e540abd 5b863e3 e540abd 5b863e3 2644d07 04626e2 5b863e3 2644d07 04626e2 5f2bd1b 2644d07 e540abd 2644d07 4f031a5 5f2bd1b 2644d07 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 |
### app.py — Document QA Backend (Cleaned)
from fastapi import FastAPI
from fastapi.responses import FileResponse, JSONResponse
import fitz # PyMuPDF
import easyocr
import openpyxl
import pptx
import docx
from transformers import pipeline
from gtts import gTTS
import tempfile
import os
# FastAPI application object for this module. No routes are registered on it
# in the visible part of this file.
app = FastAPI()
# Hugging Face "question-answering" pipeline backed by the
# deepset/roberta-base-squad2 checkpoint. It is constructed once at import
# time and reused by answer_question_from_doc for every request.
qa_model = pipeline("question-answering", model="deepset/roberta-base-squad2")
# EasyOCR reader configured with the 'en' and 'fr' language codes. It is not
# referenced anywhere in the visible part of this file.
# NOTE(review): presumably intended for scanned/image input — confirm it is
# still needed, since it is also built at import time.
reader = easyocr.Reader(['en', 'fr'])
def extract_text_from_pdf(pdf_file):
    """Return the plain text of every page of a PDF, joined by newlines.

    Args:
        pdf_file: Value handed straight to ``fitz.open`` (the caller in this
            file passes a filesystem path).

    Returns:
        The text of all pages separated by ``"\\n"``. If anything goes wrong
        while opening or reading the document, no exception is raised;
        instead the string ``"Error reading PDF: <details>"`` is returned.
    """
    try:
        with fitz.open(pdf_file) as document:
            page_texts = []
            for page in document:
                page_texts.append(page.get_text("text"))
            combined = "\n".join(page_texts)
    except Exception as err:
        # Failures are reported in-band as text rather than raised.
        return f"Error reading PDF: {err}"
    return combined
def extract_text_from_docx(docx_file):
    """Return the non-blank paragraphs of a Word document, joined by newlines.

    Only ``doc.paragraphs`` is read; paragraphs whose text is empty or
    whitespace-only are skipped.

    Args:
        docx_file: Value handed straight to ``docx.Document`` (the caller in
            this file passes a filesystem path).

    Returns:
        The paragraph texts separated by ``"\\n"``. On any failure the string
        ``"Error reading DOCX: <details>"`` is returned instead of raising,
        matching the behaviour of the PDF, PPTX and XLSX extractors in this
        module.
    """
    try:
        doc = docx.Document(docx_file)
        return "\n".join(p.text for p in doc.paragraphs if p.text.strip())
    except Exception as e:
        # Same in-band error convention as the sibling extract_text_from_*
        # helpers, so a corrupt or missing file does not crash the caller.
        return f"Error reading DOCX: {e}"
def extract_text_from_pptx(pptx_file):
    """Return the text of every text-bearing shape in a presentation.

    Slides are visited in order, and within each slide every shape exposing
    a ``text`` attribute contributes one entry.

    Args:
        pptx_file: Value handed straight to ``pptx.Presentation`` (the caller
            in this file passes a filesystem path).

    Returns:
        The shape texts separated by ``"\\n"``. If anything goes wrong, no
        exception is raised; the string ``"Error reading PPTX: <details>"``
        is returned instead.
    """
    try:
        deck = pptx.Presentation(pptx_file)
        fragments = []
        for slide in deck.slides:
            for shape in slide.shapes:
                # Shapes without a ``text`` attribute are skipped.
                if hasattr(shape, "text"):
                    fragments.append(shape.text)
        return "\n".join(fragments)
    except Exception as err:
        return f"Error reading PPTX: {err}"
def extract_text_from_xlsx(xlsx_file):
    """Return the cell values of every sheet in a workbook as text.

    Each worksheet row becomes one output line, with its non-empty cell
    values converted via ``str`` and separated by single spaces. Rows from
    all sheets (in ``wb.sheetnames`` order) are joined by newlines.

    Args:
        xlsx_file: Value handed straight to ``openpyxl.load_workbook`` (the
            caller in this file passes a filesystem path).

    Returns:
        The workbook text. On any failure the string
        ``"Error reading XLSX: <details>"`` is returned instead of raising.
    """
    try:
        wb = openpyxl.load_workbook(xlsx_file)
        lines = []
        for sheet in wb.sheetnames:
            for row in wb[sheet].iter_rows(values_only=True):
                # Skip only genuinely empty cells. A plain truthiness test
                # would also discard legitimate values such as 0, 0.0 and
                # False, silently losing data from numeric sheets.
                lines.append(
                    " ".join(
                        str(cell)
                        for cell in row
                        if cell is not None and cell != ""
                    )
                )
        return "\n".join(lines)
    except Exception as e:
        return f"Error reading XLSX: {e}"
def _remove_quietly(path):
    """Delete ``path`` if possible; cleanup is best-effort and never raises."""
    try:
        os.remove(path)
    except OSError:
        pass


def answer_question_from_doc(file, question):
    """Answer ``question`` from an uploaded document and synthesize the answer.

    The upload is copied to a private temporary file, its text is extracted
    according to the filename extension (pdf, docx, pptx or xlsx), the QA
    pipeline is run over that text, and the answer is rendered to an MP3
    with gTTS.

    Args:
        file: Upload object exposing ``filename`` and a ``read()`` method
            that returns the raw bytes.
        question: The question to ask about the document.

    Returns:
        A ``(text, audio_path)`` tuple. On success ``text`` is the answer and
        ``audio_path`` is the path of a temporary ``.mp3`` file, which is NOT
        deleted here; the caller owns it. On any failure ``text`` is a
        human-readable message and ``audio_path`` is ``None``.
    """
    # A missing/None filename is treated like any other unsupported upload.
    filename = file.filename or ""
    ext = filename.rsplit(".", 1)[-1].lower()

    # Validate before touching the disk so unsupported uploads are never
    # written anywhere.
    if ext not in ("pdf", "docx", "pptx", "xlsx"):
        return "Unsupported file format.", None

    # NOTE(review): read() is called synchronously. If callers pass a
    # FastAPI/Starlette UploadFile, its read() is a coroutine and would need
    # to be awaited (or ``file.file.read()`` used) — confirm against callers.
    payload = file.read()

    # Never build the path from the client-supplied filename: a name such as
    # "../../x.pdf" would escape the temp directory, and equal names from
    # concurrent requests would overwrite each other. Only the validated
    # extension is reused, as the suffix of a uniquely named temp file.
    with tempfile.NamedTemporaryFile(delete=False, suffix=f".{ext}") as upload:
        upload.write(payload)
        file_path = upload.name

    try:
        if ext == "pdf":
            context = extract_text_from_pdf(file_path)
        elif ext == "docx":
            context = extract_text_from_docx(file_path)
        elif ext == "pptx":
            context = extract_text_from_pptx(file_path)
        else:
            context = extract_text_from_xlsx(file_path)
    finally:
        # The on-disk copy is only needed during extraction.
        _remove_quietly(file_path)

    # The extractors report failures in-band as "Error reading <EXT>: ...".
    # Surface that message instead of letting the QA model "answer" from it.
    if context.startswith(f"Error reading {ext.upper()}: "):
        return context, None

    if not context.strip():
        return "No text found in the document.", None

    audio_path = None
    try:
        result = qa_model({"question": question, "context": context})
        answer = result["answer"]
        tts = gTTS(answer)
        # Reserve a unique .mp3 path and close the handle before gTTS writes
        # to it by name.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
            audio_path = tmp.name
        tts.save(audio_path)
        return answer, audio_path
    except Exception as e:
        # Do not leave a half-written audio file behind on failure.
        if audio_path is not None:
            _remove_quietly(audio_path)
        return f"Error generating answer: {e}", None
|