Spaces:

ikraamkb
/

qtAnswering

Running

App Files Community

ikraamkb committed on Mar 29

Commit

29f5581

verified ·

1 Parent(s): d51b69d

Update app.py

Browse files

Files changed (1) hide show

app.py +50 -36

app.py CHANGED Viewed

@@ -1,30 +1,33 @@
 import gradio as gr
 import numpy as np
 import fitz  # PyMuPDF
-import tika
 import torch
 from fastapi import FastAPI
 from transformers import pipeline
 from PIL import Image
-from io import BytesIO
 from starlette.responses import RedirectResponse
-from tika import parser
 from openpyxl import load_workbook
-# Initialize Tika for DOCX & PPTX parsing (Ensure Java is installed)
-tika.initVM()
 # Initialize FastAPI
 app = FastAPI()
-# Load models
 device = "cuda" if torch.cuda.is_available() else "cpu"
-qa_pipeline = pipeline("text-generation", model="TinyLlama/TinyLlama-1.1B-Chat-v1.0", device=device)
-image_captioning_pipeline = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning")
 ALLOWED_EXTENSIONS = {"pdf", "docx", "pptx", "xlsx"}
-# ✅ Function to Validate File Type
 def validate_file_type(file):
     if hasattr(file, "name"):
         ext = file.name.split(".")[-1].lower()
@@ -34,33 +37,38 @@ def validate_file_type(file):
     return "❌ Invalid file format!"
 # ✅ Extract Text from PDF
-def extract_text_from_pdf(file):
-    with fitz.open(file.name) as doc:
-        return "\n".join([page.get_text() for page in doc])
-# ✅ Extract Text from DOCX & PPTX using Tika
-def extract_text_with_tika(file):
-    return parser.from_file(file.name)["content"]
 # ✅ Extract Text from Excel
-def extract_text_from_excel(file):
-    wb = load_workbook(file.name, data_only=True)
-    text = []
-    for sheet in wb.worksheets:
-        for row in sheet.iter_rows(values_only=True):
-            text.append(" ".join(str(cell) for cell in row if cell))
-    return "\n".join(text)
-# ✅ Truncate Long Text for Model
-def truncate_text(text, max_length=2048):
-    return text[:max_length] if len(text) > max_length else text
 # ✅ Answer Questions from Image or Document
-def answer_question(file, question: str):
     if isinstance(file, np.ndarray):  # Image Processing
         image = Image.fromarray(file)
-        caption = image_captioning_pipeline(image)[0]['generated_text']
-        response = qa_pipeline(f"Question: {question}\nContext: {caption}")
         return response[0]["generated_text"]
     validation_error = validate_file_type(file)
@@ -69,13 +77,15 @@ def answer_question(file, question: str):
     file_ext = file.name.split(".")[-1].lower()
-    # Extract Text from Supported Documents
     if file_ext == "pdf":
-        text = extract_text_from_pdf(file)
-    elif file_ext in ["docx", "pptx"]:
-        text = extract_text_with_tika(file)
     elif file_ext == "xlsx":
-        text = extract_text_from_excel(file)
     else:
         return "❌ Unsupported file format!"
@@ -83,7 +93,11 @@ def answer_question(file, question: str):
         return "⚠️ No text extracted from the document."
     truncated_text = truncate_text(text)
-    response = qa_pipeline(f"Question: {question}\nContext: {truncated_text}")
     return response[0]["generated_text"]

 import gradio as gr
 import numpy as np
 import fitz  # PyMuPDF
 import torch
+import asyncio
 from fastapi import FastAPI
 from transformers import pipeline
 from PIL import Image
 from starlette.responses import RedirectResponse
 from openpyxl import load_workbook
+from docx import Document
+from pptx import Presentation
 # Initialize FastAPI
 app = FastAPI()
+# Use GPU if available
 device = "cuda" if torch.cuda.is_available() else "cpu"
+# Build each pipeline on demand (note: a new pipeline is constructed on every call; nothing is cached)
+def get_qa_pipeline():
+    return pipeline("text-generation", model="TinyLlama/TinyLlama-1.1B-Chat-v1.0", device=device, torch_dtype=torch.float16)
+def get_image_captioning_pipeline():
+    return pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning")
 ALLOWED_EXTENSIONS = {"pdf", "docx", "pptx", "xlsx"}
+MAX_INPUT_LENGTH = 1024  # Limit input length for faster processing
+# ✅ Validate File Type
 def validate_file_type(file):
     if hasattr(file, "name"):
         ext = file.name.split(".")[-1].lower()
     return "❌ Invalid file format!"
 # ✅ Extract Text from PDF
+async def extract_text_from_pdf(file):
+    loop = asyncio.get_event_loop()
+    return await loop.run_in_executor(None, lambda: "\n".join([page.get_text() for page in fitz.open(file.name)]))
+# ✅ Extract Text from DOCX
+async def extract_text_from_docx(file):
+    loop = asyncio.get_event_loop()
+    return await loop.run_in_executor(None, lambda: "\n".join([p.text for p in Document(file).paragraphs]))
+# ✅ Extract Text from PPTX
+async def extract_text_from_pptx(file):
+    loop = asyncio.get_event_loop()
+    return await loop.run_in_executor(None, lambda: "\n".join([shape.text for slide in Presentation(file).slides for shape in slide.shapes if hasattr(shape, "text")]))
 # ✅ Extract Text from Excel
+async def extract_text_from_excel(file):
+    loop = asyncio.get_event_loop()
+    return await loop.run_in_executor(None, lambda: "\n".join([" ".join(str(cell) for cell in row if cell) for sheet in load_workbook(file.name, data_only=True).worksheets for row in sheet.iter_rows(values_only=True)]))
+# ✅ Truncate Long Text
+def truncate_text(text):
+    return text[:MAX_INPUT_LENGTH] if len(text) > MAX_INPUT_LENGTH else text
 # ✅ Answer Questions from Image or Document
+async def answer_question(file, question: str):
     if isinstance(file, np.ndarray):  # Image Processing
         image = Image.fromarray(file)
+        image_captioning = get_image_captioning_pipeline()
+        caption = image_captioning(image)[0]['generated_text']
+        qa = get_qa_pipeline()
+        response = qa(f"Question: {question}\nContext: {caption}")
         return response[0]["generated_text"]
     validation_error = validate_file_type(file)
     file_ext = file.name.split(".")[-1].lower()
+    # Extract text asynchronously
     if file_ext == "pdf":
+        text = await extract_text_from_pdf(file)
+    elif file_ext == "docx":
+        text = await extract_text_from_docx(file)
+    elif file_ext == "pptx":
+        text = await extract_text_from_pptx(file)
     elif file_ext == "xlsx":
+        text = await extract_text_from_excel(file)
     else:
         return "❌ Unsupported file format!"
         return "⚠️ No text extracted from the document."
     truncated_text = truncate_text(text)
+    # Run QA model asynchronously
+    loop = asyncio.get_event_loop()
+    qa = get_qa_pipeline()
+    response = await loop.run_in_executor(None, qa, f"Question: {question}\nContext: {truncated_text}")
     return response[0]["generated_text"]