Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -122,22 +122,22 @@ if __name__ == "__main__":
|
|
122 |
import gradio as gr
|
123 |
import uvicorn
|
124 |
import numpy as np
|
125 |
-
import
|
126 |
import tika
|
127 |
import torch
|
128 |
from fastapi import FastAPI
|
129 |
-
from transformers import pipeline
|
130 |
from PIL import Image
|
131 |
from io import BytesIO
|
132 |
from starlette.responses import RedirectResponse
|
133 |
from tika import parser
|
134 |
from openpyxl import load_workbook
|
|
|
|
|
135 |
import os
|
136 |
-
import pymupdf
|
137 |
-
# Initialize Tika for DOCX & PPTX parsing
|
138 |
-
tika.initVM()
|
139 |
|
140 |
-
|
|
|
141 |
app = FastAPI()
|
142 |
|
143 |
# Load models
|
@@ -145,13 +145,15 @@ device = "cuda" if torch.cuda.is_available() else "cpu"
|
|
145 |
qa_pipeline = pipeline("text-generation", model="TinyLlama/TinyLlama-1.1B-Chat-v1.0", device=device)
|
146 |
image_captioning_pipeline = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
|
147 |
|
148 |
-
|
|
|
|
|
|
|
149 |
|
150 |
-
# ✅ Function to Validate File Type
|
151 |
def validate_file_type(file):
|
152 |
if file is None:
|
153 |
return "β No file uploaded!"
|
154 |
-
if isinstance(file, str):
|
155 |
return None
|
156 |
if hasattr(file, "name"):
|
157 |
ext = file.name.split(".")[-1].lower()
|
@@ -160,23 +162,20 @@ def validate_file_type(file):
|
|
160 |
return None
|
161 |
return "β Invalid file format!"
|
162 |
|
163 |
-
# ✅ Extract Text from PDF
|
164 |
# ✅ Extract Text from PDF
|
165 |
def extract_text_from_pdf(file_bytes):
|
166 |
try:
|
167 |
-
doc = pymupdf.open(stream=file_bytes, filetype="pdf")
|
168 |
-
return "\n".join([page.get_text() for page in doc])
|
169 |
except Exception as e:
|
170 |
-
print(f"β PDF Extraction Error: {e}") # Log error
|
171 |
return f"β PDF Error: {str(e)}"
|
172 |
|
173 |
# ✅ Extract Text from DOCX & PPTX using Tika
|
174 |
def extract_text_with_tika(file_bytes):
|
175 |
try:
|
176 |
parsed = parser.from_buffer(file_bytes)
|
177 |
-
return parsed
|
178 |
except Exception as e:
|
179 |
-
print(f"β Tika Extraction Error: {e}") # Log error
|
180 |
return f"β Tika Error: {str(e)}"
|
181 |
|
182 |
# ✅ Extract Text from Excel
|
@@ -187,14 +186,41 @@ def extract_text_from_excel(file_bytes):
|
|
187 |
for sheet in wb.worksheets:
|
188 |
for row in sheet.iter_rows(values_only=True):
|
189 |
text.append(" ".join(str(cell) for cell in row if cell))
|
190 |
-
return "\n".join(text)
|
191 |
except Exception as e:
|
192 |
-
print(f"β Excel Extraction Error: {e}") # Log error
|
193 |
return f"β Excel Error: {str(e)}"
|
194 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
195 |
# ✅ Truncate Long Text for Model
|
196 |
-
def truncate_text(text,
|
197 |
-
|
|
|
198 |
|
199 |
# ✅ Answer Questions from Image or Document
|
200 |
def answer_question(file, question: str):
|
@@ -211,24 +237,19 @@ def answer_question(file, question: str):
|
|
211 |
if validation_error:
|
212 |
return validation_error
|
213 |
|
214 |
-
# β
|
215 |
file_bytes = None
|
216 |
file_ext = None
|
217 |
|
218 |
-
if isinstance(file, str)
|
219 |
-
|
220 |
-
|
221 |
-
|
222 |
-
|
223 |
-
else:
|
224 |
-
return f"β Error: File path does not exist! Path: {file}"
|
225 |
-
|
226 |
-
elif hasattr(file, "read"): # If it's a file-like object
|
227 |
file_ext = file.name.split(".")[-1].lower() if hasattr(file, "name") else None
|
228 |
file_bytes = file.read()
|
229 |
-
|
230 |
else:
|
231 |
-
return
|
232 |
|
233 |
# ✅ Extract Text Based on File Type
|
234 |
if file_ext == "pdf":
|
@@ -237,11 +258,12 @@ def answer_question(file, question: str):
|
|
237 |
text = extract_text_with_tika(file_bytes)
|
238 |
elif file_ext == "xlsx":
|
239 |
text = extract_text_from_excel(file_bytes)
|
|
|
|
|
240 |
else:
|
241 |
return f"β Unsupported file format: {file_ext}"
|
242 |
|
243 |
-
|
244 |
-
if not text or "β" in text:
|
245 |
return f"β οΈ No text extracted. Error: {text}"
|
246 |
|
247 |
truncated_text = truncate_text(text)
|
@@ -250,19 +272,15 @@ def answer_question(file, question: str):
|
|
250 |
return response[0]["generated_text"]
|
251 |
|
252 |
except Exception as e:
|
253 |
-
print(f"β General Processing Error: {e}") # Log error to console
|
254 |
return f"β Processing Error: {str(e)}"
|
255 |
|
256 |
-
# ✅ Gradio Interface
|
257 |
with gr.Blocks() as demo:
|
258 |
gr.Markdown("## π AI-Powered Document & Image QA")
|
259 |
-
|
260 |
with gr.Row():
|
261 |
file_input = gr.File(label="Upload Document / Image")
|
262 |
question_input = gr.Textbox(label="Ask a Question", placeholder="What is this document about?")
|
263 |
-
|
264 |
answer_output = gr.Textbox(label="Answer")
|
265 |
-
|
266 |
submit_btn = gr.Button("Get Answer")
|
267 |
submit_btn.click(answer_question, inputs=[file_input, question_input], outputs=answer_output)
|
268 |
|
@@ -273,6 +291,5 @@ app = gr.mount_gradio_app(app, demo, path="/")
|
|
273 |
def home():
|
274 |
return RedirectResponse(url="/")
|
275 |
|
276 |
-
# ✅ Run FastAPI + Gradio
|
277 |
if __name__ == "__main__":
|
278 |
uvicorn.run(app, host="0.0.0.0", port=7860)
|
|
|
122 |
import gradio as gr
|
123 |
import uvicorn
|
124 |
import numpy as np
|
125 |
+
import pymupdf
|
126 |
import tika
|
127 |
import torch
|
128 |
from fastapi import FastAPI
|
129 |
+
from transformers import pipeline, AutoTokenizer
|
130 |
from PIL import Image
|
131 |
from io import BytesIO
|
132 |
from starlette.responses import RedirectResponse
|
133 |
from tika import parser
|
134 |
from openpyxl import load_workbook
|
135 |
+
from pptx import Presentation
|
136 |
+
import easyocr
|
137 |
import os
|
|
|
|
|
|
|
138 |
|
139 |
+
tika.initVM()
|
140 |
+
|
141 |
app = FastAPI()
|
142 |
|
143 |
# Load models
|
|
|
145 |
# Checkpoint shared by the text-generation pipeline and its tokenizer.
_QA_MODEL_ID = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# Text-generation model; placed on the device selected earlier in the file.
qa_pipeline = pipeline(
    task="text-generation",
    model=_QA_MODEL_ID,
    device=device,
)

# Image-captioning model (no explicit device is passed for this one).
image_captioning_pipeline = pipeline(
    task="image-to-text",
    model="Salesforce/blip-image-captioning-base",
)

# Stand-alone tokenizer for the QA checkpoint, used when truncating text.
tokenizer = AutoTokenizer.from_pretrained(_QA_MODEL_ID)

# English-only OCR reader.
reader = easyocr.Reader(["en"])

# File extensions (lower-case, without the dot) accepted for upload.
ALLOWED_EXTENSIONS = {
    "pdf",
    "docx",
    "pptx",
    "xlsx",
    "png",
    "jpg",
    "jpeg",
}
|
152 |
|
|
|
153 |
def validate_file_type(file):
|
154 |
if file is None:
|
155 |
return "β No file uploaded!"
|
156 |
+
if isinstance(file, str):
|
157 |
return None
|
158 |
if hasattr(file, "name"):
|
159 |
ext = file.name.split(".")[-1].lower()
|
|
|
162 |
return None
|
163 |
return "β Invalid file format!"
|
164 |
|
|
|
165 |
# ✅ Extract Text from PDF
|
166 |
def extract_text_from_pdf(file_bytes):
    """Extract the plain text of every page of an in-memory PDF.

    Args:
        file_bytes: Raw bytes of the PDF document.

    Returns:
        The text of all pages joined with newlines, or an error-message
        string if the document cannot be opened or read. A string is
        returned in every case; no exception propagates to the caller.
    """
    try:
        # Open the document in a ``with`` block so it is closed as soon as
        # the text has been collected, even if a page fails to parse.
        with pymupdf.open(stream=file_bytes, filetype="pdf") as doc:
            return "\n".join(page.get_text("text") for page in doc)
    except Exception as e:
        return f"β PDF Error: {e}"
|
172 |
|
173 |
# ✅ Extract Text from DOCX & PPTX using Tika
|
174 |
def extract_text_with_tika(file_bytes):
    """Extract text from an in-memory document using Apache Tika.

    Args:
        file_bytes: Raw bytes of the document to parse.

    Returns:
        The extracted text with surrounding whitespace stripped, the
        "no text found" warning string when Tika yields no content, or an
        error-message string if parsing raises.
    """
    try:
        parsed = parser.from_buffer(file_bytes)
        # The default passed to dict.get() only covers a *missing* key. The
        # "content" key can be present with a None value (tika-python does
        # this when nothing could be extracted), and calling .strip() on it
        # would raise. Normalise None to "" before stripping.
        content = (parsed.get("content") or "").strip()
        return content if content else "β οΈ No text found."
    except Exception as e:
        return f"β Tika Error: {e}"
|
180 |
|
181 |
# ✅ Extract Text from Excel
|
|
|
186 |
for sheet in wb.worksheets:
|
187 |
for row in sheet.iter_rows(values_only=True):
|
188 |
text.append(" ".join(str(cell) for cell in row if cell))
|
189 |
+
return "\n".join(text) if text else "β οΈ No text found."
|
190 |
except Exception as e:
|
|
|
191 |
return f"β Excel Error: {str(e)}"
|
192 |
|
193 |
+
# ✅ Extract Text from PPTX
|
194 |
+
def extract_text_from_pptx(file_bytes):
    """Collect the text of every shape on every slide of a PPTX file.

    Args:
        file_bytes: Raw bytes of the presentation.

    Returns:
        The non-blank shape texts joined with newlines, the "no text found"
        warning string when no shape carries any text, or an error-message
        string if the presentation cannot be read.
    """
    try:
        ppt = Presentation(BytesIO(file_bytes))
        text = []
        for slide in ppt.slides:
            for shape in slide.shapes:
                # Not every shape exposes ``text`` (pictures, for example),
                # hence the defaulted lookup.
                shape_text = getattr(shape, "text", "")
                # Skip blank placeholders: otherwise a deck made only of
                # empty text boxes would return blank lines instead of the
                # "no text found" warning below.
                if shape_text and shape_text.strip():
                    text.append(shape_text)
        return "\n".join(text) if text else "β οΈ No text found."
    except Exception as e:
        return f"β PPTX Error: {e}"
|
205 |
+
|
206 |
+
# ✅ Extract Text from Image using OCR
|
207 |
+
def extract_text_from_image(image_file):
    """Run OCR over an image and return the recognised text.

    Args:
        image_file: Anything ``Image.open`` accepts (the visible caller
            passes a ``BytesIO`` wrapping the uploaded bytes).

    Returns:
        The recognised fragments joined by single spaces; a warning string
        when the image is nearly uniform or OCR finds nothing; or an
        error-message string if any step raises.
    """
    try:
        pixels = np.array(Image.open(image_file).convert("RGB"))

        # A pixel standard deviation below 10 is treated as "nothing to
        # read", and OCR is skipped.
        if pixels.std() < 10:
            return "β οΈ No meaningful content detected in the image."

        detections = reader.readtext(pixels)
        if not detections:
            return "β οΈ No text found."

        # The second element of each detection is what gets joined as text.
        return " ".join(entry[1] for entry in detections)
    except Exception as e:
        return f"β Image OCR Error: {str(e)}"
|
219 |
+
|
220 |
# ✅ Truncate Long Text for Model
|
221 |
+
def truncate_text(text, max_tokens=450):
    """Limit ``text`` to at most ``max_tokens`` tokenizer tokens.

    Args:
        text: The text to shorten.
        max_tokens: Maximum number of tokens to keep. Negative values are
            treated as 0.

    Returns:
        ``text`` unchanged when it already fits the budget; otherwise the
        string rebuilt from its first ``max_tokens`` tokens.
    """
    # Clamp so that a negative budget cannot turn the slice below into
    # "drop tokens from the end".
    limit = max(max_tokens, 0)
    tokens = tokenizer.tokenize(text)
    if len(tokens) <= limit:
        # Nothing to cut: return the input verbatim rather than pushing it
        # through an unnecessary tokenize/detokenize round-trip.
        return text
    return tokenizer.convert_tokens_to_string(tokens[:limit])
|
224 |
|
225 |
# ✅ Answer Questions from Image or Document
|
226 |
def answer_question(file, question: str):
|
|
|
237 |
if validation_error:
|
238 |
return validation_error
|
239 |
|
240 |
+
# ✅ Read File Bytes
|
241 |
file_bytes = None
|
242 |
file_ext = None
|
243 |
|
244 |
+
if isinstance(file, str) and os.path.exists(file):
|
245 |
+
file_ext = file.split(".")[-1].lower()
|
246 |
+
with open(file, "rb") as f:
|
247 |
+
file_bytes = f.read()
|
248 |
+
elif hasattr(file, "read"):
|
|
|
|
|
|
|
|
|
249 |
file_ext = file.name.split(".")[-1].lower() if hasattr(file, "name") else None
|
250 |
file_bytes = file.read()
|
|
|
251 |
else:
|
252 |
+
return "β Unexpected file type received!"
|
253 |
|
254 |
# ✅ Extract Text Based on File Type
|
255 |
if file_ext == "pdf":
|
|
|
258 |
text = extract_text_with_tika(file_bytes)
|
259 |
elif file_ext == "xlsx":
|
260 |
text = extract_text_from_excel(file_bytes)
|
261 |
+
elif file_ext in ["png", "jpg", "jpeg"]:
|
262 |
+
text = extract_text_from_image(BytesIO(file_bytes))
|
263 |
else:
|
264 |
return f"β Unsupported file format: {file_ext}"
|
265 |
|
266 |
+
if not text or "β οΈ" in text:
|
|
|
267 |
return f"β οΈ No text extracted. Error: {text}"
|
268 |
|
269 |
truncated_text = truncate_text(text)
|
|
|
272 |
return response[0]["generated_text"]
|
273 |
|
274 |
except Exception as e:
|
|
|
275 |
return f"β Processing Error: {str(e)}"
|
276 |
|
277 |
+
# ✅ Gradio Interface
|
278 |
# NOTE(review): the original indentation was lost in extraction; the nesting
# below (inputs side by side in one row, output and button underneath) is
# reconstructed - confirm against the real file.
with gr.Blocks() as demo:
    gr.Markdown(value="## π AI-Powered Document & Image QA")

    # Upload widget and question box.
    with gr.Row():
        file_input = gr.File(label="Upload Document / Image")
        question_input = gr.Textbox(
            label="Ask a Question",
            placeholder="What is this document about?",
        )

    answer_output = gr.Textbox(label="Answer")

    # Clicking the button sends (file, question) to answer_question and
    # shows the returned string in the answer box.
    submit_btn = gr.Button(value="Get Answer")
    submit_btn.click(
        fn=answer_question,
        inputs=[file_input, question_input],
        outputs=answer_output,
    )
|
286 |
|
|
|
291 |
def home():
|
292 |
return RedirectResponse(url="/")
|
293 |
|
|
|
294 |
if __name__ == "__main__":
    # Start the ASGI server for ``app`` on all interfaces, port 7860, when
    # the module is executed directly.
    uvicorn.run(
        app,
        host="0.0.0.0",
        port=7860,
    )
|