Spaces:

ikraamkb
/

qtAnswering

Running

App Files Community

ikraamkb committed on Apr 12

Commit

4f031a5

verified ·

1 Parent(s): 7abb419

Update app.py

Browse files

Files changed (1) hide show

app.py +13 -44

app.py CHANGED Viewed

@@ -1,10 +1,11 @@
-from fastapi import FastAPI, UploadFile, Form
-from fastapi.responses import JSONResponse, FileResponse
-import fitz  # PyMuPDF for PDFs
-import easyocr  # OCR for images
-import openpyxl  # XLSX processing
-import pptx  # PPTX processing
-import docx  # DOCX processing
 from transformers import pipeline
 from gtts import gTTS
 import tempfile
@@ -12,56 +13,40 @@ import os
 app = FastAPI()
-# Load AI models
 qa_model = pipeline("question-answering", model="deepset/roberta-base-squad2")
 reader = easyocr.Reader(['en', 'fr'])
-# Text Extraction
 def extract_text_from_pdf(pdf_file):
-    text = []
     try:
         with fitz.open(pdf_file) as doc:
-            for page in doc:
-                text.append(page.get_text("text"))
     except Exception as e:
         return f"Error reading PDF: {e}"
-    return "\n".join(text)
 def extract_text_from_docx(docx_file):
     doc = docx.Document(docx_file)
     return "\n".join(p.text for p in doc.paragraphs if p.text.strip())
 def extract_text_from_pptx(pptx_file):
-    text = []
     try:
         prs = pptx.Presentation(pptx_file)
-        for slide in prs.slides:
-            for shape in slide.shapes:
-                if hasattr(shape, "text"):
-                    text.append(shape.text)
     except Exception as e:
         return f"Error reading PPTX: {e}"
-    return "\n".join(text)
 def extract_text_from_xlsx(xlsx_file):
-    text = []
     try:
         wb = openpyxl.load_workbook(xlsx_file)
-        for sheet in wb.sheetnames:
-            ws = wb[sheet]
-            for row in ws.iter_rows(values_only=True):
-                text.append(" ".join(str(cell) for cell in row if cell))
     except Exception as e:
         return f"Error reading XLSX: {e}"
-    return "\n".join(text)
-# Main QA logic
 def answer_question_from_doc(file, question):
     ext = file.filename.split(".")[-1].lower()
     file_path = f"/tmp/{file.filename}"
     with open(file_path, "wb") as f:
-        f.write(file.file.read())
     if ext == "pdf":
         context = extract_text_from_pdf(file_path)
@@ -83,22 +68,6 @@ def answer_question_from_doc(file, question):
         tts = gTTS(answer)
         with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
             tts.save(tmp.name)
-            audio_path = tmp.name
-        return answer, audio_path
     except Exception as e:
         return f"Error generating answer: {e}", None
-# API route for prediction
-@app.post("/predict")
-async def predict(file: UploadFile, question: str = Form(...)):
-    answer, audio_path = answer_question_from_doc(file, question)
-    if audio_path:
-        return JSONResponse(content={"answer": answer, "audio": f"/audio/{os.path.basename(audio_path)}"})
-    else:
-        return JSONResponse(content={"answer": answer})
-# Route to serve audio
-@app.get("/audio/{filename}")
-async def get_audio(filename: str):
-    file_path = os.path.join(tempfile.gettempdir(), filename)
-    return FileResponse(path=file_path, media_type="audio/mpeg")

+### ✅ app.py — Document QA Backend (Cleaned)
+from fastapi import FastAPI
+from fastapi.responses import FileResponse, JSONResponse
+import fitz  # PyMuPDF
+import easyocr
+import openpyxl
+import pptx
+import docx
 from transformers import pipeline
 from gtts import gTTS
 import tempfile
 app = FastAPI()
 qa_model = pipeline("question-answering", model="deepset/roberta-base-squad2")
 reader = easyocr.Reader(['en', 'fr'])
 def extract_text_from_pdf(pdf_file):
     try:
         with fitz.open(pdf_file) as doc:
+            return "\n".join(page.get_text("text") for page in doc)
     except Exception as e:
         return f"Error reading PDF: {e}"
 def extract_text_from_docx(docx_file):
     doc = docx.Document(docx_file)
     return "\n".join(p.text for p in doc.paragraphs if p.text.strip())
 def extract_text_from_pptx(pptx_file):
     try:
         prs = pptx.Presentation(pptx_file)
+        return "\n".join(shape.text for slide in prs.slides for shape in slide.shapes if hasattr(shape, "text"))
     except Exception as e:
         return f"Error reading PPTX: {e}"
 def extract_text_from_xlsx(xlsx_file):
     try:
         wb = openpyxl.load_workbook(xlsx_file)
+        return "\n".join(" ".join(str(cell) for cell in row if cell) for sheet in wb.sheetnames for row in wb[sheet].iter_rows(values_only=True))
     except Exception as e:
         return f"Error reading XLSX: {e}"
 def answer_question_from_doc(file, question):
     ext = file.filename.split(".")[-1].lower()
     file_path = f"/tmp/{file.filename}"
     with open(file_path, "wb") as f:
+        f.write(file.read())
     if ext == "pdf":
         context = extract_text_from_pdf(file_path)
         tts = gTTS(answer)
         with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
             tts.save(tmp.name)
+            return answer, tmp.name
     except Exception as e:
         return f"Error generating answer: {e}", None