qtAnswering / app.py
ikraamkb's picture
Update app.py
2644d07 verified
raw
history blame
2.61 kB
import fitz # PyMuPDF for PDFs
import easyocr # OCR for images
import openpyxl # XLSX processing
import pptx # PPTX processing
import docx # DOCX processing
from transformers import pipeline
from gtts import gTTS
import tempfile
# Initialize AI models once, at import time, so every request reuses them.
# Question-answering pipeline loaded from the deepset/roberta-base-squad2 checkpoint.
qa_model = pipeline("question-answering", model="deepset/roberta-base-squad2")
# NOTE(review): `reader` is not referenced in the visible part of this file — confirm it is still needed.
reader = easyocr.Reader(['en', 'fr']) # OCR for English & French
# ---- TEXT EXTRACTION FUNCTIONS ----
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of a PDF, joined by newlines.

    Args:
        pdf_file: Value handed straight to ``fitz.open`` (the caller in this
            file passes a filesystem path).

    Returns:
        The per-page texts joined with a newline, or a string starting with
        ``"Error reading PDF: "`` if anything raised while reading.
    """
    try:
        with fitz.open(pdf_file) as document:
            page_texts = [page.get_text("text") for page in document]
    except Exception as e:
        # Failures are reported as text rather than raised; callers get a str either way.
        return f"Error reading PDF: {e}"
    return "\n".join(page_texts)
def extract_text_from_docx(docx_file):
    """Return the non-blank paragraphs of a DOCX file, joined by newlines.

    Args:
        docx_file: Value handed straight to ``docx.Document`` (the caller in
            this file passes a filesystem path).

    Returns:
        The text of every paragraph that contains non-whitespace characters,
        joined with a newline, or a string starting with
        ``"Error reading DOCX: "`` if anything raised while reading.
    """
    try:
        document = docx.Document(docx_file)
        paragraphs = [p.text for p in document.paragraphs if p.text.strip()]
    except Exception as e:
        # Same convention as the PDF/PPTX/XLSX extractors: report the failure
        # as text instead of letting a corrupt upload crash the caller.
        return f"Error reading DOCX: {e}"
    return "\n".join(paragraphs)
def extract_text_from_pptx(pptx_file):
    """Return the text of every text-bearing shape in a PPTX, joined by newlines.

    Args:
        pptx_file: Value handed straight to ``pptx.Presentation`` (the caller
            in this file passes a filesystem path).

    Returns:
        The ``.text`` of each shape that has such an attribute, in slide
        order, joined with a newline, or a string starting with
        ``"Error reading PPTX: "`` if anything raised while reading.
    """
    shape_texts = []
    try:
        deck = pptx.Presentation(pptx_file)
        for slide in deck.slides:
            # Only shapes exposing a `text` attribute contribute.
            shape_texts.extend(
                shape.text for shape in slide.shapes if hasattr(shape, "text")
            )
    except Exception as e:
        return f"Error reading PPTX: {e}"
    return "\n".join(shape_texts)
def extract_text_from_xlsx(xlsx_file):
    """Return the cell values of every worksheet in an XLSX as text.

    Each spreadsheet row becomes one line: its non-empty cell values are
    converted with ``str`` and joined by single spaces.

    Args:
        xlsx_file: Value handed straight to ``openpyxl.load_workbook`` (the
            caller in this file passes a filesystem path).

    Returns:
        The row lines joined with a newline, or a string starting with
        ``"Error reading XLSX: "`` if anything raised while reading.
    """
    lines = []
    try:
        workbook = openpyxl.load_workbook(xlsx_file)
        # `worksheets` skips chartsheets, which have no rows to iterate.
        for worksheet in workbook.worksheets:
            for row in worksheet.iter_rows(values_only=True):
                # Compare against None rather than truthiness so that real
                # values such as 0, 0.0 and False are not silently dropped.
                lines.append(
                    " ".join(str(cell) for cell in row if cell is not None)
                )
    except Exception as e:
        return f"Error reading XLSX: {e}"
    return "\n".join(lines)
# ---- MAIN QA FUNCTION ----
def answer_question_from_doc(file, question):
    """Answer a question about an uploaded document and voice the answer.

    Args:
        file: Uploaded-file object whose ``.name`` is the document's path,
            or ``None`` when nothing was uploaded.
        question: The question to ask about the document's text.

    Returns:
        A ``(message, audio_path)`` tuple. ``message`` is the answer, or a
        human-readable explanation when no answer could be produced.
        ``audio_path`` is the path of an MP3 rendering of the answer, or
        ``None`` when there is no answer or speech synthesis failed.
    """
    # Local import: only needed to clean up after a failed speech synthesis.
    import os

    if file is None:
        return "Please upload a document.", None

    ext = file.name.split(".")[-1].lower()
    if ext not in ("pdf", "docx", "pptx", "xlsx"):
        return "Unsupported file format.", None

    if not question or not question.strip():
        return "Please enter a question.", None

    extractors = {
        "pdf": extract_text_from_pdf,
        "docx": extract_text_from_docx,
        "pptx": extract_text_from_pptx,
        "xlsx": extract_text_from_xlsx,
    }
    error_prefix = f"Error reading {ext.upper()}: "
    try:
        context = extractors[ext](file.name)
    except Exception as e:
        # An extractor that raises is reported the same way as one that
        # returns its own error text.
        return f"{error_prefix}{e}", None

    # The extractors signal failure by returning text with this prefix;
    # surface it to the user instead of asking the model questions about it.
    if context.startswith(error_prefix):
        return context, None

    if not context.strip():
        return "No text found in the document.", None

    try:
        result = qa_model(question=question, context=context)
        answer = result["answer"]
    except Exception as e:
        return f"Error generating answer: {e}", None

    if not answer.strip():
        # Nothing to show or to speak.
        return "No answer found in the document.", None

    # Speech is best-effort: a synthesis failure must not discard the answer.
    audio_path = None
    try:
        # Reserve a path and close the handle before gTTS writes to it.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
            audio_path = tmp.name
        gTTS(text=answer).save(audio_path)
    except Exception:
        if audio_path is not None:
            try:
                os.remove(audio_path)  # do not leave an empty/partial MP3 behind
            except OSError:
                pass
        audio_path = None
    return answer, audio_path