ikraamkb committed on
Commit
5ebce4d
·
verified ·
1 Parent(s): 96dbdf2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +56 -39
app.py CHANGED
@@ -122,22 +122,22 @@ if __name__ == "__main__":
122
  import gradio as gr
123
  import uvicorn
124
  import numpy as np
125
- import fitz # PyMuPDF
126
  import tika
127
  import torch
128
  from fastapi import FastAPI
129
- from transformers import pipeline
130
  from PIL import Image
131
  from io import BytesIO
132
  from starlette.responses import RedirectResponse
133
  from tika import parser
134
  from openpyxl import load_workbook
 
 
135
  import os
136
- import pymupdf
137
- # Initialize Tika for DOCX & PPTX parsing
138
- tika.initVM()
139
 
140
- # Initialize FastAPI
 
141
  app = FastAPI()
142
 
143
  # Load models
@@ -145,13 +145,15 @@ device = "cuda" if torch.cuda.is_available() else "cpu"
145
  qa_pipeline = pipeline("text-generation", model="TinyLlama/TinyLlama-1.1B-Chat-v1.0", device=device)
146
  image_captioning_pipeline = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
147
 
148
- ALLOWED_EXTENSIONS = {"pdf", "docx", "pptx", "xlsx"}
 
 
 
149
 
150
- # βœ… Function to Validate File Type
151
  def validate_file_type(file):
152
  if file is None:
153
  return "❌ No file uploaded!"
154
- if isinstance(file, str): # Text-based input (NamedString)
155
  return None
156
  if hasattr(file, "name"):
157
  ext = file.name.split(".")[-1].lower()
@@ -160,23 +162,20 @@ def validate_file_type(file):
160
  return None
161
  return "❌ Invalid file format!"
162
 
163
- # βœ… Extract Text from PDF
164
  # βœ… Extract Text from PDF
165
  def extract_text_from_pdf(file_bytes):
166
  try:
167
- doc = pymupdf.open(stream=file_bytes, filetype="pdf") # Use pymupdf.open()
168
- return "\n".join([page.get_text() for page in doc])
169
  except Exception as e:
170
- print(f"❌ PDF Extraction Error: {e}") # Log error
171
  return f"❌ PDF Error: {str(e)}"
172
 
173
  # βœ… Extract Text from DOCX & PPTX using Tika
174
  def extract_text_with_tika(file_bytes):
175
  try:
176
  parsed = parser.from_buffer(file_bytes)
177
- return parsed["content"]
178
  except Exception as e:
179
- print(f"❌ Tika Extraction Error: {e}") # Log error
180
  return f"❌ Tika Error: {str(e)}"
181
 
182
  # βœ… Extract Text from Excel
@@ -187,14 +186,41 @@ def extract_text_from_excel(file_bytes):
187
  for sheet in wb.worksheets:
188
  for row in sheet.iter_rows(values_only=True):
189
  text.append(" ".join(str(cell) for cell in row if cell))
190
- return "\n".join(text)
191
  except Exception as e:
192
- print(f"❌ Excel Extraction Error: {e}") # Log error
193
  return f"❌ Excel Error: {str(e)}"
194
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
195
  # βœ… Truncate Long Text for Model
196
- def truncate_text(text, max_length=2048):
197
- return text[:max_length] if len(text) > max_length else text
 
198
 
199
  # βœ… Answer Questions from Image or Document
200
  def answer_question(file, question: str):
@@ -211,24 +237,19 @@ def answer_question(file, question: str):
211
  if validation_error:
212
  return validation_error
213
 
214
- # βœ… Determine File Path or Read Bytes
215
  file_bytes = None
216
  file_ext = None
217
 
218
- if isinstance(file, str): # Gradio sometimes passes a file path string
219
- if os.path.exists(file): # If it's a valid file path
220
- file_ext = file.split(".")[-1].lower()
221
- with open(file, "rb") as f:
222
- file_bytes = f.read()
223
- else:
224
- return f"❌ Error: File path does not exist! Path: {file}"
225
-
226
- elif hasattr(file, "read"): # If it's a file-like object
227
  file_ext = file.name.split(".")[-1].lower() if hasattr(file, "name") else None
228
  file_bytes = file.read()
229
-
230
  else:
231
- return f"❌ Unexpected file type received! Type: {type(file)}"
232
 
233
  # βœ… Extract Text Based on File Type
234
  if file_ext == "pdf":
@@ -237,11 +258,12 @@ def answer_question(file, question: str):
237
  text = extract_text_with_tika(file_bytes)
238
  elif file_ext == "xlsx":
239
  text = extract_text_from_excel(file_bytes)
 
 
240
  else:
241
  return f"❌ Unsupported file format: {file_ext}"
242
 
243
- # βœ… Validate Extraction
244
- if not text or "❌" in text:
245
  return f"⚠️ No text extracted. Error: {text}"
246
 
247
  truncated_text = truncate_text(text)
@@ -250,19 +272,15 @@ def answer_question(file, question: str):
250
  return response[0]["generated_text"]
251
 
252
  except Exception as e:
253
- print(f"❌ General Processing Error: {e}") # Log error to console
254
  return f"❌ Processing Error: {str(e)}"
255
 
256
- # βœ… Gradio Interface (Unified for Images & Documents)
257
  with gr.Blocks() as demo:
258
  gr.Markdown("## πŸ“„ AI-Powered Document & Image QA")
259
-
260
  with gr.Row():
261
  file_input = gr.File(label="Upload Document / Image")
262
  question_input = gr.Textbox(label="Ask a Question", placeholder="What is this document about?")
263
-
264
  answer_output = gr.Textbox(label="Answer")
265
-
266
  submit_btn = gr.Button("Get Answer")
267
  submit_btn.click(answer_question, inputs=[file_input, question_input], outputs=answer_output)
268
 
@@ -273,6 +291,5 @@ app = gr.mount_gradio_app(app, demo, path="/")
273
  def home():
274
  return RedirectResponse(url="/")
275
 
276
- # βœ… Run FastAPI + Gradio
277
  if __name__ == "__main__":
278
  uvicorn.run(app, host="0.0.0.0", port=7860)
 
122
  import gradio as gr
123
  import uvicorn
124
  import numpy as np
125
+ import pymupdf
126
  import tika
127
  import torch
128
  from fastapi import FastAPI
129
+ from transformers import pipeline, AutoTokenizer
130
  from PIL import Image
131
  from io import BytesIO
132
  from starlette.responses import RedirectResponse
133
  from tika import parser
134
  from openpyxl import load_workbook
135
+ from pptx import Presentation
136
+ import easyocr
137
  import os
 
 
 
138
 
139
+ tika.initVM()
140
+
141
  app = FastAPI()
142
 
143
  # Load models
 
145
  qa_pipeline = pipeline("text-generation", model="TinyLlama/TinyLlama-1.1B-Chat-v1.0", device=device)
146
  image_captioning_pipeline = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
147
 
148
+ tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
149
+ reader = easyocr.Reader(["en"])
150
+
151
+ ALLOWED_EXTENSIONS = {"pdf", "docx", "pptx", "xlsx", "png", "jpg", "jpeg"}
152
 
 
153
  def validate_file_type(file):
154
  if file is None:
155
  return "❌ No file uploaded!"
156
+ if isinstance(file, str):
157
  return None
158
  if hasattr(file, "name"):
159
  ext = file.name.split(".")[-1].lower()
 
162
  return None
163
  return "❌ Invalid file format!"
164
 
 
165
  # ✅ Extract Text from PDF
166
  def extract_text_from_pdf(file_bytes):
167
  try:
168
+ doc = pymupdf.open(stream=file_bytes, filetype="pdf")
169
+ return "\n".join([page.get_text("text") for page in doc])
170
  except Exception as e:
 
171
  return f"❌ PDF Error: {str(e)}"
172
 
173
  # ✅ Extract Text from DOCX & PPTX using Tika
174
  def extract_text_with_tika(file_bytes):
175
  try:
176
  parsed = parser.from_buffer(file_bytes)
177
+ return parsed.get("content", "⚠️ No text found.").strip()
178
  except Exception as e:
 
179
  return f"❌ Tika Error: {str(e)}"
180
 
181
  # ✅ Extract Text from Excel
 
186
  for sheet in wb.worksheets:
187
  for row in sheet.iter_rows(values_only=True):
188
  text.append(" ".join(str(cell) for cell in row if cell))
189
+ return "\n".join(text) if text else "⚠️ No text found."
190
  except Exception as e:
 
191
  return f"❌ Excel Error: {str(e)}"
192
 
193
+ # ✅ Extract Text from PPTX
194
+ def extract_text_from_pptx(file_bytes):
195
+ try:
196
+ ppt = Presentation(BytesIO(file_bytes))
197
+ text = []
198
+ for slide in ppt.slides:
199
+ for shape in slide.shapes:
200
+ if hasattr(shape, "text"):
201
+ text.append(shape.text)
202
+ return "\n".join(text) if text else "⚠️ No text found."
203
+ except Exception as e:
204
+ return f"❌ PPTX Error: {str(e)}"
205
+
206
+ # ✅ Extract Text from Image using OCR
207
+ def extract_text_from_image(image_file):
208
+ try:
209
+ image = Image.open(image_file).convert("RGB")
210
+ np_image = np.array(image)
211
+
212
+ if np_image.std() < 10: # Low contrast check
213
+ return "⚠️ No meaningful content detected in the image."
214
+
215
+ result = reader.readtext(np_image)
216
+ return " ".join([res[1] for res in result]) if result else "⚠️ No text found."
217
+ except Exception as e:
218
+ return f"❌ Image OCR Error: {str(e)}"
219
+
220
  # ✅ Truncate Long Text for Model
221
+ def truncate_text(text, max_tokens=450):
222
+ tokens = tokenizer.tokenize(text)
223
+ return tokenizer.convert_tokens_to_string(tokens[:max_tokens])
224
 
225
  # ✅ Answer Questions from Image or Document
226
  def answer_question(file, question: str):
 
237
  if validation_error:
238
  return validation_error
239
 
240
+ # ✅ Read File Bytes
241
  file_bytes = None
242
  file_ext = None
243
 
244
+ if isinstance(file, str) and os.path.exists(file):
245
+ file_ext = file.split(".")[-1].lower()
246
+ with open(file, "rb") as f:
247
+ file_bytes = f.read()
248
+ elif hasattr(file, "read"):
 
 
 
 
249
  file_ext = file.name.split(".")[-1].lower() if hasattr(file, "name") else None
250
  file_bytes = file.read()
 
251
  else:
252
+ return "❌ Unexpected file type received!"
253
 
254
  # ✅ Extract Text Based on File Type
255
  if file_ext == "pdf":
 
258
  text = extract_text_with_tika(file_bytes)
259
  elif file_ext == "xlsx":
260
  text = extract_text_from_excel(file_bytes)
261
+ elif file_ext in ["png", "jpg", "jpeg"]:
262
+ text = extract_text_from_image(BytesIO(file_bytes))
263
  else:
264
  return f"❌ Unsupported file format: {file_ext}"
265
 
266
+ if not text or "⚠️" in text:
 
267
  return f"⚠️ No text extracted. Error: {text}"
268
 
269
  truncated_text = truncate_text(text)
 
272
  return response[0]["generated_text"]
273
 
274
  except Exception as e:
 
275
  return f"❌ Processing Error: {str(e)}"
276
 
277
+ # ✅ Gradio Interface
278
  with gr.Blocks() as demo:
279
  gr.Markdown("## πŸ“„ AI-Powered Document & Image QA")
 
280
  with gr.Row():
281
  file_input = gr.File(label="Upload Document / Image")
282
  question_input = gr.Textbox(label="Ask a Question", placeholder="What is this document about?")
 
283
  answer_output = gr.Textbox(label="Answer")
 
284
  submit_btn = gr.Button("Get Answer")
285
  submit_btn.click(answer_question, inputs=[file_input, question_input], outputs=answer_output)
286
 
 
291
  def home():
292
  return RedirectResponse(url="/")
293
 
 
294
  if __name__ == "__main__":
295
  uvicorn.run(app, host="0.0.0.0", port=7860)