Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -122,38 +122,32 @@ if __name__ == "__main__":
|
|
122 |
import gradio as gr
|
123 |
import uvicorn
|
124 |
import numpy as np
|
125 |
-
import pymupdf
|
126 |
import tika
|
127 |
import torch
|
128 |
from fastapi import FastAPI
|
129 |
-
from transformers import pipeline
|
130 |
from PIL import Image
|
131 |
from io import BytesIO
|
132 |
from starlette.responses import RedirectResponse
|
133 |
from tika import parser
|
134 |
from openpyxl import load_workbook
|
135 |
from pptx import Presentation
|
136 |
-
import easyocr
|
137 |
import os
|
138 |
|
139 |
-
tika.initVM()
|
140 |
-
|
141 |
app = FastAPI()
|
142 |
|
143 |
-
# Load models
|
144 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
145 |
qa_pipeline = pipeline("text-generation", model="TinyLlama/TinyLlama-1.1B-Chat-v1.0", device=device)
|
146 |
image_captioning_pipeline = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
|
147 |
|
148 |
-
|
149 |
-
reader = easyocr.Reader(["en"])
|
150 |
-
|
151 |
-
ALLOWED_EXTENSIONS = {"pdf", "docx", "pptx", "xlsx", "png", "jpg", "jpeg"}
|
152 |
|
153 |
def validate_file_type(file):
|
154 |
-
if file
|
155 |
return "β No file uploaded!"
|
156 |
-
if isinstance(file, str):
|
157 |
return None
|
158 |
if hasattr(file, "name"):
|
159 |
ext = file.name.split(".")[-1].lower()
|
@@ -162,7 +156,6 @@ def validate_file_type(file):
|
|
162 |
return None
|
163 |
return "β Invalid file format!"
|
164 |
|
165 |
-
# ✅ Extract Text from PDF
|
166 |
def extract_text_from_pdf(file_bytes):
|
167 |
try:
|
168 |
doc = pymupdf.open(stream=file_bytes, filetype="pdf")
|
@@ -170,7 +163,6 @@ def extract_text_from_pdf(file_bytes):
|
|
170 |
except Exception as e:
|
171 |
return f"β PDF Error: {str(e)}"
|
172 |
|
173 |
-
# ✅ Extract Text from DOCX & PPTX using Tika
|
174 |
def extract_text_with_tika(file_bytes):
|
175 |
try:
|
176 |
parsed = parser.from_buffer(file_bytes)
|
@@ -178,7 +170,6 @@ def extract_text_with_tika(file_bytes):
|
|
178 |
except Exception as e:
|
179 |
return f"β Tika Error: {str(e)}"
|
180 |
|
181 |
-
# ✅ Extract Text from Excel
|
182 |
def extract_text_from_excel(file_bytes):
|
183 |
try:
|
184 |
wb = load_workbook(BytesIO(file_bytes), data_only=True)
|
@@ -190,7 +181,6 @@ def extract_text_from_excel(file_bytes):
|
|
190 |
except Exception as e:
|
191 |
return f"β Excel Error: {str(e)}"
|
192 |
|
193 |
-
# ✅ Extract Text from PPTX
|
194 |
def extract_text_from_pptx(file_bytes):
|
195 |
try:
|
196 |
ppt = Presentation(BytesIO(file_bytes))
|
@@ -203,41 +193,15 @@ def extract_text_from_pptx(file_bytes):
|
|
203 |
except Exception as e:
|
204 |
return f"β PPTX Error: {str(e)}"
|
205 |
|
206 |
-
|
207 |
-
|
208 |
-
try:
|
209 |
-
image = Image.open(image_file).convert("RGB")
|
210 |
-
np_image = np.array(image)
|
211 |
-
|
212 |
-
if np_image.std() < 10: # Low contrast check
|
213 |
-
return "β οΈ No meaningful content detected in the image."
|
214 |
-
|
215 |
-
result = reader.readtext(np_image)
|
216 |
-
return " ".join([res[1] for res in result]) if result else "β οΈ No text found."
|
217 |
-
except Exception as e:
|
218 |
-
return f"β Image OCR Error: {str(e)}"
|
219 |
-
|
220 |
-
# ✅ Truncate Long Text for Model
|
221 |
-
def truncate_text(text, max_tokens=450):
|
222 |
-
tokens = tokenizer.tokenize(text)
|
223 |
-
return tokenizer.convert_tokens_to_string(tokens[:max_tokens])
|
224 |
|
225 |
-
# ✅ Answer Questions from Image or Document
|
226 |
def answer_question(file, question: str):
|
227 |
try:
|
228 |
-
# ✅ Handle Image Files (Gradio sends images as NumPy arrays)
|
229 |
-
if isinstance(file, np.ndarray):
|
230 |
-
image = Image.fromarray(file)
|
231 |
-
caption = image_captioning_pipeline(image)[0]['generated_text']
|
232 |
-
response = qa_pipeline(f"Question: {question}\nContext: {caption}")
|
233 |
-
return response[0]["generated_text"]
|
234 |
-
|
235 |
-
# ✅ Validate File
|
236 |
validation_error = validate_file_type(file)
|
237 |
if validation_error:
|
238 |
return validation_error
|
239 |
|
240 |
-
# ✅ Read File Bytes
|
241 |
file_bytes = None
|
242 |
file_ext = None
|
243 |
|
@@ -246,24 +210,21 @@ def answer_question(file, question: str):
|
|
246 |
with open(file, "rb") as f:
|
247 |
file_bytes = f.read()
|
248 |
elif hasattr(file, "read"):
|
249 |
-
file_ext = file.name.split(".")[-1].lower()
|
250 |
file_bytes = file.read()
|
251 |
else:
|
252 |
-
return "β Unexpected file type received!"
|
253 |
|
254 |
-
# ✅ Extract Text Based on File Type
|
255 |
if file_ext == "pdf":
|
256 |
text = extract_text_from_pdf(file_bytes)
|
257 |
elif file_ext in ["docx", "pptx"]:
|
258 |
text = extract_text_with_tika(file_bytes)
|
259 |
elif file_ext == "xlsx":
|
260 |
text = extract_text_from_excel(file_bytes)
|
261 |
-
elif file_ext in ["png", "jpg", "jpeg"]:
|
262 |
-
text = extract_text_from_image(BytesIO(file_bytes))
|
263 |
else:
|
264 |
return f"β Unsupported file format: {file_ext}"
|
265 |
|
266 |
-
if not text or "
|
267 |
return f"β οΈ No text extracted. Error: {text}"
|
268 |
|
269 |
truncated_text = truncate_text(text)
|
@@ -274,17 +235,15 @@ def answer_question(file, question: str):
|
|
274 |
except Exception as e:
|
275 |
return f"β Processing Error: {str(e)}"
|
276 |
|
277 |
-
# ✅ Gradio Interface
|
278 |
with gr.Blocks() as demo:
|
279 |
-
gr.Markdown("## π AI-Powered Document
|
280 |
with gr.Row():
|
281 |
-
file_input = gr.File(label="Upload Document
|
282 |
question_input = gr.Textbox(label="Ask a Question", placeholder="What is this document about?")
|
283 |
answer_output = gr.Textbox(label="Answer")
|
284 |
submit_btn = gr.Button("Get Answer")
|
285 |
submit_btn.click(answer_question, inputs=[file_input, question_input], outputs=answer_output)
|
286 |
|
287 |
-
# ✅ Mount Gradio with FastAPI
|
288 |
app = gr.mount_gradio_app(app, demo, path="/")
|
289 |
|
290 |
@app.get("/")
|
|
|
122 |
import gradio as gr
|
123 |
import uvicorn
|
124 |
import numpy as np
|
125 |
+
import pymupdf # PyMuPDF
|
126 |
import tika
|
127 |
import torch
|
128 |
from fastapi import FastAPI
|
129 |
+
from transformers import pipeline
|
130 |
from PIL import Image
|
131 |
from io import BytesIO
|
132 |
from starlette.responses import RedirectResponse
|
133 |
from tika import parser
|
134 |
from openpyxl import load_workbook
|
135 |
from pptx import Presentation
|
|
|
136 |
import os
|
137 |
|
138 |
+
tika.initVM()
|
|
|
139 |
app = FastAPI()
|
140 |
|
|
|
141 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
142 |
qa_pipeline = pipeline("text-generation", model="TinyLlama/TinyLlama-1.1B-Chat-v1.0", device=device)
|
143 |
image_captioning_pipeline = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
|
144 |
|
145 |
+
ALLOWED_EXTENSIONS = {"pdf", "docx", "pptx", "xlsx"}
|
|
|
|
|
|
|
146 |
|
147 |
def validate_file_type(file):
|
148 |
+
if not file:
|
149 |
return "β No file uploaded!"
|
150 |
+
if isinstance(file, str) and os.path.exists(file):
|
151 |
return None
|
152 |
if hasattr(file, "name"):
|
153 |
ext = file.name.split(".")[-1].lower()
|
|
|
156 |
return None
|
157 |
return "β Invalid file format!"
|
158 |
|
|
|
159 |
def extract_text_from_pdf(file_bytes):
|
160 |
try:
|
161 |
doc = pymupdf.open(stream=file_bytes, filetype="pdf")
|
|
|
163 |
except Exception as e:
|
164 |
return f"β PDF Error: {str(e)}"
|
165 |
|
|
|
166 |
def extract_text_with_tika(file_bytes):
|
167 |
try:
|
168 |
parsed = parser.from_buffer(file_bytes)
|
|
|
170 |
except Exception as e:
|
171 |
return f"β Tika Error: {str(e)}"
|
172 |
|
|
|
173 |
def extract_text_from_excel(file_bytes):
|
174 |
try:
|
175 |
wb = load_workbook(BytesIO(file_bytes), data_only=True)
|
|
|
181 |
except Exception as e:
|
182 |
return f"β Excel Error: {str(e)}"
|
183 |
|
|
|
184 |
def extract_text_from_pptx(file_bytes):
|
185 |
try:
|
186 |
ppt = Presentation(BytesIO(file_bytes))
|
|
|
193 |
except Exception as e:
|
194 |
return f"β PPTX Error: {str(e)}"
|
195 |
|
196 |
+
def truncate_text(text, max_length=2048):
    """Return *text* limited to at most ``max_length`` characters.

    The cut is made on characters, not on model tokens.

    Args:
        text: The text to shorten.
        max_length: Maximum number of characters to keep.

    Returns:
        ``text`` unchanged when it is already short enough, otherwise its
        first ``max_length`` characters.
    """
    # Slicing already yields the whole string when it is shorter than the
    # limit, so no explicit length comparison is needed.
    return text[:max_length]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
198 |
|
|
|
199 |
def answer_question(file, question: str):
|
200 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
201 |
validation_error = validate_file_type(file)
|
202 |
if validation_error:
|
203 |
return validation_error
|
204 |
|
|
|
205 |
file_bytes = None
|
206 |
file_ext = None
|
207 |
|
|
|
210 |
with open(file, "rb") as f:
|
211 |
file_bytes = f.read()
|
212 |
elif hasattr(file, "read"):
|
213 |
+
file_ext = file.name.split(".")[-1].lower()
|
214 |
file_bytes = file.read()
|
215 |
else:
|
216 |
+
return f"β Unexpected file type received! Type: {type(file)}"
|
217 |
|
|
|
218 |
if file_ext == "pdf":
|
219 |
text = extract_text_from_pdf(file_bytes)
|
220 |
elif file_ext in ["docx", "pptx"]:
|
221 |
text = extract_text_with_tika(file_bytes)
|
222 |
elif file_ext == "xlsx":
|
223 |
text = extract_text_from_excel(file_bytes)
|
|
|
|
|
224 |
else:
|
225 |
return f"β Unsupported file format: {file_ext}"
|
226 |
|
227 |
+
if not text or "β" in text:
|
228 |
return f"β οΈ No text extracted. Error: {text}"
|
229 |
|
230 |
truncated_text = truncate_text(text)
|
|
|
235 |
except Exception as e:
|
236 |
return f"β Processing Error: {str(e)}"
|
237 |
|
|
|
238 |
with gr.Blocks() as demo:
|
239 |
+
gr.Markdown("## π AI-Powered Document QA")
|
240 |
with gr.Row():
|
241 |
+
file_input = gr.File(label="Upload Document")
|
242 |
question_input = gr.Textbox(label="Ask a Question", placeholder="What is this document about?")
|
243 |
answer_output = gr.Textbox(label="Answer")
|
244 |
submit_btn = gr.Button("Get Answer")
|
245 |
submit_btn.click(answer_question, inputs=[file_input, question_input], outputs=answer_output)
|
246 |
|
|
|
247 |
app = gr.mount_gradio_app(app, demo, path="/")
|
248 |
|
249 |
@app.get("/")
|