Spaces:
Running
Running
File size: 2,606 Bytes
5b863e3 2644d07 04626e2 5b863e3 50195a6 5f2bd1b 04626e2 5b863e3 04626e2 5b863e3 04626e2 5b863e3 04626e2 5b863e3 04626e2 5b863e3 04626e2 811b0b3 5b863e3 811b0b3 5b863e3 2644d07 04626e2 5b863e3 2644d07 04626e2 5f2bd1b 2644d07 5f2bd1b 2644d07 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 |
import fitz # PyMuPDF for PDFs
import easyocr # OCR for images
import openpyxl # XLSX processing
import pptx # PPTX processing
import docx # DOCX processing
from transformers import pipeline
from gtts import gTTS
import tempfile
# Initialize AI models once at module import so the functions below can reuse them.
# Question-answering pipeline backed by the "deepset/roberta-base-squad2" checkpoint.
qa_model = pipeline("question-answering", model="deepset/roberta-base-squad2")
# EasyOCR reader configured for English and French text.
# NOTE(review): `reader` is not referenced by the functions visible in this
# section -- confirm it is used elsewhere before relying on or removing it.
reader = easyocr.Reader(['en', 'fr']) # OCR for English & French
# ---- TEXT EXTRACTION FUNCTIONS ----
def extract_text_from_pdf(pdf_file):
    """Return the plain text of every page of a PDF, joined by newlines.

    The document is opened with PyMuPDF (``fitz``) and each page is read
    with ``get_text("text")``. Failures are not raised: any exception is
    reported by returning a string of the form ``"Error reading PDF: ..."``.
    """
    try:
        with fitz.open(pdf_file) as document:
            page_texts = [page.get_text("text") for page in document]
    except Exception as exc:
        return f"Error reading PDF: {exc}"
    return "\n".join(page_texts)
def extract_text_from_docx(docx_file):
    """Return the non-blank paragraphs of a DOCX file, joined by newlines.

    Paragraphs whose text is empty or whitespace-only are skipped.

    Like the other extractors in this file, failures are not raised: any
    exception while opening or reading the document is reported by returning
    a string of the form ``"Error reading DOCX: ..."``.
    """
    try:
        document = docx.Document(docx_file)
        paragraphs = [p.text for p in document.paragraphs if p.text.strip()]
    except Exception as e:
        # Match the PDF/PPTX/XLSX extractors, which report errors as text
        # instead of letting them propagate to the caller.
        return f"Error reading DOCX: {e}"
    return "\n".join(paragraphs)
def extract_text_from_pptx(pptx_file):
    """Return the text of every text-bearing shape in a PPTX, joined by newlines.

    Slides are visited in order and, on each slide, every shape that exposes
    a ``text`` attribute contributes its text. Failures are not raised: any
    exception is reported by returning ``"Error reading PPTX: ..."``.
    """
    shape_texts = []
    try:
        deck = pptx.Presentation(pptx_file)
        for slide in deck.slides:
            shape_texts.extend(
                shape.text for shape in slide.shapes if hasattr(shape, "text")
            )
    except Exception as exc:
        return f"Error reading PPTX: {exc}"
    return "\n".join(shape_texts)
def extract_text_from_xlsx(xlsx_file):
    """Return the cell values of an XLSX workbook as newline-separated rows.

    Every worksheet is read row by row; each row becomes one line made of its
    non-empty cell values converted with ``str`` and separated by spaces.

    Failures are not raised: any exception is reported by returning a string
    of the form ``"Error reading XLSX: ..."``.
    """
    lines = []
    try:
        workbook = openpyxl.load_workbook(xlsx_file)
        # Iterate `worksheets` rather than `sheetnames`: the latter also lists
        # chart sheets, which have no rows to iterate and would make the whole
        # workbook fail.
        for worksheet in workbook.worksheets:
            for row in worksheet.iter_rows(values_only=True):
                # Skip only truly empty cells (None / ""); a plain truthiness
                # test would also drop legitimate values such as 0 and False.
                lines.append(
                    " ".join(str(cell) for cell in row if cell not in (None, ""))
                )
    except Exception as e:
        return f"Error reading XLSX: {e}"
    return "\n".join(lines)
# ---- MAIN QA FUNCTION ----
def _synthesize_answer_audio(text):
    """Render `text` to a temporary MP3 with gTTS and return the file path.

    Returns None when speech synthesis fails, so that a text answer can still
    be delivered without audio.
    """
    import os  # local import: only needed to clean up after a failed save

    # Reserve a path and close the handle before gTTS writes to it: a
    # NamedTemporaryFile that is still open cannot reliably be reopened by
    # name on every platform (notably Windows).
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
        audio_path = tmp.name
    try:
        gTTS(text=text).save(audio_path)
    except Exception:
        # Best effort: speech is optional. Remove the empty placeholder file
        # and let the caller fall back to a text-only answer.
        try:
            os.remove(audio_path)
        except OSError:
            pass
        return None
    return audio_path


def answer_question_from_doc(file, question):
    """Answer a question from the text of a document and voice the answer.

    Args:
        file: The document, either a filesystem path string or an object
            whose ``name`` attribute is the path. The type is chosen from the
            file extension (case-insensitive): pdf, docx, pptx or xlsx.
        question: The question to ask about the document text.

    Returns:
        A ``(message, audio_path)`` tuple. On success ``message`` is the
        answer and ``audio_path`` is the path of an MP3 of it, or None if
        speech synthesis failed. On failure ``message`` describes the problem
        and ``audio_path`` is None.
    """
    if file is None:
        return "No file provided.", None

    # Accept both plain path strings and file-like objects exposing `.name`.
    path = file if isinstance(file, str) else file.name
    ext = path.rsplit(".", 1)[-1].lower()

    try:
        if ext == "pdf":
            context = extract_text_from_pdf(path)
        elif ext == "docx":
            context = extract_text_from_docx(path)
        elif ext == "pptx":
            context = extract_text_from_pptx(path)
        elif ext == "xlsx":
            context = extract_text_from_xlsx(path)
        else:
            return "Unsupported file format.", None
    except Exception as e:
        # Guard against an extractor that raises instead of returning an
        # error string.
        return f"Error reading {ext.upper()}: {e}", None

    # The extractors report failures by returning "Error reading <TYPE>: ..."
    # rather than raising. Surface that message directly instead of feeding
    # it to the QA model as if it were document text.
    if context.startswith(f"Error reading {ext.upper()}: "):
        return context, None

    if not context.strip():
        return "No text found in the document.", None

    try:
        result = qa_model({"question": question, "context": context})
        answer = result["answer"]
    except Exception as e:
        return f"Error generating answer: {e}", None

    if not answer.strip():
        return "No answer found in the document.", None

    # Audio is produced separately so a text-to-speech failure does not
    # discard an answer that was generated successfully.
    return answer, _synthesize_answer_audio(answer)
|