Spaces:
Running
Running
File size: 3,755 Bytes
a768964 04626e2 5b863e3 04626e2 5b863e3 04626e2 5b863e3 04626e2 5b863e3 50195a6 5b863e3 5f2bd1b 04626e2 5b863e3 04626e2 401f74e 5b863e3 04626e2 401f74e 04626e2 5b863e3 04626e2 401f74e 04626e2 5b863e3 04626e2 401f74e 04626e2 5b863e3 04626e2 5b863e3 401f74e 5b863e3 401f74e 04626e2 5b863e3 401f74e 04626e2 5f2bd1b 04626e2 5f2bd1b 5b863e3 5f2bd1b 5b863e3 68df520 04626e2 df1ed5e 5b863e3 df1ed5e 5976e32 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 |
import gradio as gr
import fitz # PyMuPDF for PDFs
import easyocr # OCR for images
import openpyxl # XLSX processing
import pptx # PPTX processing
import docx # DOCX processing
import json # Exporting results
from deep_translator import GoogleTranslator
from transformers import pipeline
from fastapi import FastAPI
from starlette.responses import RedirectResponse
# ---- APP & MODEL INITIALIZATION (module-level side effects) ----
# FastAPI instance; Gradio is mounted onto it further down.
app = FastAPI()
# Extractive QA model: answers a question given a text context.
qa_model = pipeline("question-answering", model="deepset/roberta-base-squad2")
# NOTE(review): image_captioning and reader are initialized here but not
# referenced anywhere in this chunk — presumably used by an
# answer_question_from_image defined elsewhere; confirm before removing.
image_captioning = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning")
reader = easyocr.Reader(['en', 'fr'])  # OCR for English & French
# ---- TEXT EXTRACTION FUNCTIONS ----
def extract_text_from_pdf(pdf_file):
    """Return the text of every page in a PDF, joined with newlines.

    On any failure (unreadable/corrupt file), returns a human-readable
    error string instead of raising, matching the other extractors.
    """
    try:
        with fitz.open(pdf_file) as doc:
            pages = [page.get_text("text") for page in doc]
    except Exception as e:
        return f"Error reading PDF: {e}"
    return "\n".join(pages)
def extract_text_from_docx(docx_file):
    """Return all non-empty paragraph text from a DOCX file.

    Fix: wrapped in try/except so a corrupt or unreadable file returns
    an error string — consistent with the other extract_text_from_*
    helpers — instead of raising out of the request handler.
    """
    try:
        doc = docx.Document(docx_file)
        # Skip whitespace-only paragraphs, as the original did.
        return "\n".join(p.text for p in doc.paragraphs if p.text.strip())
    except Exception as e:
        return f"Error reading DOCX: {e}"
def extract_text_from_pptx(pptx_file):
    """Return the text of every text-bearing shape on every slide.

    Returns an error string (not an exception) if the file cannot be
    parsed, matching the other extract_text_from_* helpers.
    """
    collected = []
    try:
        deck = pptx.Presentation(pptx_file)
        for slide in deck.slides:
            # Only shapes with a .text attribute carry extractable text.
            collected.extend(
                shape.text for shape in slide.shapes if hasattr(shape, "text")
            )
    except Exception as e:
        return f"Error reading PPTX: {e}"
    return "\n".join(collected)
def extract_text_from_xlsx(xlsx_file):
    """Return the cell contents of every sheet, one row per line.

    Cells in a row are stringified and space-joined; rows from all
    sheets are newline-joined. Returns an error string on failure.

    Fix: the original filtered cells with truthiness (`if cell`), which
    silently dropped legitimate values 0, 0.0 and False; only None
    (genuinely empty cells) should be skipped.
    """
    text = []
    try:
        wb = openpyxl.load_workbook(xlsx_file)
        for sheet_name in wb.sheetnames:
            ws = wb[sheet_name]
            for row in ws.iter_rows(values_only=True):
                # `is not None`, not truthiness: keep 0 / False cells.
                text.append(" ".join(str(cell) for cell in row if cell is not None))
    except Exception as e:
        return f"Error reading XLSX: {e}"
    return "\n".join(text)
# ---- MAIN PROCESSING FUNCTIONS ----
def answer_question_from_doc(file, question):
"""Process document and answer a question based on its content."""
ext = file.name.split(".")[-1].lower()
if ext == "pdf":
context = extract_text_from_pdf(file.name)
elif ext == "docx":
context = extract_text_from_docx(file.name)
elif ext == "pptx":
context = extract_text_from_pptx(file.name)
elif ext == "xlsx":
context = extract_text_from_xlsx(file.name)
else:
return """Unsupported file format."""
if not context.strip():
return """No text found in the document."""
# Generate answer using QA pipeline correctly
try:
result = qa_model({"question": question, "context": context})
return result["answer"]
except Exception as e:
return f"Error generating answer: {e}"
try:
result = qa_model({"question": question, "context": img_text})
return result["answer"]
except Exception as e:
return f"Error generating answer: {e}"
# ---- GRADIO UI: image question-answering tab ----
with gr.Blocks() as img_interface:
    gr.Markdown("## 🖼️ Image Question Answering")
    image_input = gr.Image(label="Upload an Image")
    img_question_input = gr.Textbox(label="Ask a question")
    img_answer_output = gr.Textbox(label="Answer")
    image_submit = gr.Button("Get Answer")
    # NOTE(review): answer_question_from_image is not defined anywhere in
    # this chunk — confirm it exists elsewhere in the file, otherwise this
    # line raises NameError when the module is imported.
    image_submit.click(answer_question_from_image, inputs=[image_input, img_question_input], outputs=img_answer_output)
# ---- MOUNT GRADIO APP ----
# Fix: gr.TabbedInterface expects a *list* of interfaces and a *list* of
# tab titles; the original passed bare values, which fails at startup.
demo = gr.TabbedInterface([img_interface], ["Image QA"])
# Serve the Gradio UI at the FastAPI root.
app = gr.mount_gradio_app(app, demo, path="/")
@app.get("/")
def home():
    """Redirect the root path to the Gradio UI."""
    # NOTE(review): Gradio is already mounted at "/" above, so this route
    # is shadowed and likely never reached; and if it were reached,
    # redirecting "/" to "/" would loop forever. Confirm the intended
    # layout (e.g. mount Gradio at "/gradio" and redirect there).
    return RedirectResponse(url="/")
|