ikraamkb committed on
Commit
d49960b
·
verified ·
1 Parent(s): 5ebce4d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +13 -54
app.py CHANGED
@@ -122,38 +122,32 @@ if __name__ == "__main__":
122
  import gradio as gr
123
  import uvicorn
124
  import numpy as np
125
- import pymupdf
126
  import tika
127
  import torch
128
  from fastapi import FastAPI
129
- from transformers import pipeline, AutoTokenizer
130
  from PIL import Image
131
  from io import BytesIO
132
  from starlette.responses import RedirectResponse
133
  from tika import parser
134
  from openpyxl import load_workbook
135
  from pptx import Presentation
136
- import easyocr
137
  import os
138
 
139
- tika.initVM()
140
-
141
  app = FastAPI()
142
 
143
- # Load models
144
  device = "cuda" if torch.cuda.is_available() else "cpu"
145
  qa_pipeline = pipeline("text-generation", model="TinyLlama/TinyLlama-1.1B-Chat-v1.0", device=device)
146
  image_captioning_pipeline = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
147
 
148
- tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
149
- reader = easyocr.Reader(["en"])
150
-
151
- ALLOWED_EXTENSIONS = {"pdf", "docx", "pptx", "xlsx", "png", "jpg", "jpeg"}
152
 
153
  def validate_file_type(file):
154
- if file is None:
155
  return "❌ No file uploaded!"
156
- if isinstance(file, str):
157
  return None
158
  if hasattr(file, "name"):
159
  ext = file.name.split(".")[-1].lower()
@@ -162,7 +156,6 @@ def validate_file_type(file):
162
  return None
163
  return "❌ Invalid file format!"
164
 
165
- # ✅ Extract Text from PDF
166
  def extract_text_from_pdf(file_bytes):
167
  try:
168
  doc = pymupdf.open(stream=file_bytes, filetype="pdf")
@@ -170,7 +163,6 @@ def extract_text_from_pdf(file_bytes):
170
  except Exception as e:
171
  return f"❌ PDF Error: {str(e)}"
172
 
173
- # ✅ Extract Text from DOCX & PPTX using Tika
174
  def extract_text_with_tika(file_bytes):
175
  try:
176
  parsed = parser.from_buffer(file_bytes)
@@ -178,7 +170,6 @@ def extract_text_with_tika(file_bytes):
178
  except Exception as e:
179
  return f"❌ Tika Error: {str(e)}"
180
 
181
- # ✅ Extract Text from Excel
182
  def extract_text_from_excel(file_bytes):
183
  try:
184
  wb = load_workbook(BytesIO(file_bytes), data_only=True)
@@ -190,7 +181,6 @@ def extract_text_from_excel(file_bytes):
190
  except Exception as e:
191
  return f"❌ Excel Error: {str(e)}"
192
 
193
- # ✅ Extract Text from PPTX
194
  def extract_text_from_pptx(file_bytes):
195
  try:
196
  ppt = Presentation(BytesIO(file_bytes))
@@ -203,41 +193,15 @@ def extract_text_from_pptx(file_bytes):
203
  except Exception as e:
204
  return f"❌ PPTX Error: {str(e)}"
205
 
206
- # ✅ Extract Text from Image using OCR
207
- def extract_text_from_image(image_file):
208
- try:
209
- image = Image.open(image_file).convert("RGB")
210
- np_image = np.array(image)
211
-
212
- if np_image.std() < 10: # Low contrast check
213
- return "⚠️ No meaningful content detected in the image."
214
-
215
- result = reader.readtext(np_image)
216
- return " ".join([res[1] for res in result]) if result else "⚠️ No text found."
217
- except Exception as e:
218
- return f"❌ Image OCR Error: {str(e)}"
219
-
220
- # ✅ Truncate Long Text for Model
221
- def truncate_text(text, max_tokens=450):
222
- tokens = tokenizer.tokenize(text)
223
- return tokenizer.convert_tokens_to_string(tokens[:max_tokens])
224
 
225
- # ✅ Answer Questions from Image or Document
226
  def answer_question(file, question: str):
227
  try:
228
- # ✅ Handle Image Files (Gradio sends images as NumPy arrays)
229
- if isinstance(file, np.ndarray):
230
- image = Image.fromarray(file)
231
- caption = image_captioning_pipeline(image)[0]['generated_text']
232
- response = qa_pipeline(f"Question: {question}\nContext: {caption}")
233
- return response[0]["generated_text"]
234
-
235
- # ✅ Validate File
236
  validation_error = validate_file_type(file)
237
  if validation_error:
238
  return validation_error
239
 
240
- # ✅ Read File Bytes
241
  file_bytes = None
242
  file_ext = None
243
 
@@ -246,24 +210,21 @@ def answer_question(file, question: str):
246
  with open(file, "rb") as f:
247
  file_bytes = f.read()
248
  elif hasattr(file, "read"):
249
- file_ext = file.name.split(".")[-1].lower() if hasattr(file, "name") else None
250
  file_bytes = file.read()
251
  else:
252
- return "❌ Unexpected file type received!"
253
 
254
- # ✅ Extract Text Based on File Type
255
  if file_ext == "pdf":
256
  text = extract_text_from_pdf(file_bytes)
257
  elif file_ext in ["docx", "pptx"]:
258
  text = extract_text_with_tika(file_bytes)
259
  elif file_ext == "xlsx":
260
  text = extract_text_from_excel(file_bytes)
261
- elif file_ext in ["png", "jpg", "jpeg"]:
262
- text = extract_text_from_image(BytesIO(file_bytes))
263
  else:
264
  return f"❌ Unsupported file format: {file_ext}"
265
 
266
- if not text or "⚠️" in text:
267
  return f"⚠️ No text extracted. Error: {text}"
268
 
269
  truncated_text = truncate_text(text)
@@ -274,17 +235,15 @@ def answer_question(file, question: str):
274
  except Exception as e:
275
  return f"❌ Processing Error: {str(e)}"
276
 
277
- # ✅ Gradio Interface
278
  with gr.Blocks() as demo:
279
- gr.Markdown("## 📄 AI-Powered Document & Image QA")
280
  with gr.Row():
281
- file_input = gr.File(label="Upload Document / Image")
282
  question_input = gr.Textbox(label="Ask a Question", placeholder="What is this document about?")
283
  answer_output = gr.Textbox(label="Answer")
284
  submit_btn = gr.Button("Get Answer")
285
  submit_btn.click(answer_question, inputs=[file_input, question_input], outputs=answer_output)
286
 
287
- # ✅ Mount Gradio with FastAPI
288
  app = gr.mount_gradio_app(app, demo, path="/")
289
 
290
  @app.get("/")
 
122
  import gradio as gr
123
  import uvicorn
124
  import numpy as np
125
+ import pymupdf # PyMuPDF
126
  import tika
127
  import torch
128
  from fastapi import FastAPI
129
+ from transformers import pipeline
130
  from PIL import Image
131
  from io import BytesIO
132
  from starlette.responses import RedirectResponse
133
  from tika import parser
134
  from openpyxl import load_workbook
135
  from pptx import Presentation
 
136
  import os
137
 
138
+ tika.initVM()
 
139
  app = FastAPI()
140
 
 
141
  device = "cuda" if torch.cuda.is_available() else "cpu"
142
  qa_pipeline = pipeline("text-generation", model="TinyLlama/TinyLlama-1.1B-Chat-v1.0", device=device)
143
  image_captioning_pipeline = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
144
 
145
+ ALLOWED_EXTENSIONS = {"pdf", "docx", "pptx", "xlsx"}
 
 
 
146
 
147
  def validate_file_type(file):
148
+ if not file:
149
  return "❌ No file uploaded!"
150
+ if isinstance(file, str) and os.path.exists(file):
151
  return None
152
  if hasattr(file, "name"):
153
  ext = file.name.split(".")[-1].lower()
 
156
  return None
157
  return "❌ Invalid file format!"
158
 
 
159
  def extract_text_from_pdf(file_bytes):
160
  try:
161
  doc = pymupdf.open(stream=file_bytes, filetype="pdf")
 
163
  except Exception as e:
164
  return f"❌ PDF Error: {str(e)}"
165
 
 
166
  def extract_text_with_tika(file_bytes):
167
  try:
168
  parsed = parser.from_buffer(file_bytes)
 
170
  except Exception as e:
171
  return f"❌ Tika Error: {str(e)}"
172
 
 
173
  def extract_text_from_excel(file_bytes):
174
  try:
175
  wb = load_workbook(BytesIO(file_bytes), data_only=True)
 
181
  except Exception as e:
182
  return f"❌ Excel Error: {str(e)}"
183
 
 
184
  def extract_text_from_pptx(file_bytes):
185
  try:
186
  ppt = Presentation(BytesIO(file_bytes))
 
193
  except Exception as e:
194
  return f"❌ PPTX Error: {str(e)}"
195
 
196
+ def truncate_text(text, max_length=2048):
197
+ return text[:max_length] if len(text) > max_length else text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
198
 
 
199
  def answer_question(file, question: str):
200
  try:
 
 
 
 
 
 
 
 
201
  validation_error = validate_file_type(file)
202
  if validation_error:
203
  return validation_error
204
 
 
205
  file_bytes = None
206
  file_ext = None
207
 
 
210
  with open(file, "rb") as f:
211
  file_bytes = f.read()
212
  elif hasattr(file, "read"):
213
+ file_ext = file.name.split(".")[-1].lower()
214
  file_bytes = file.read()
215
  else:
216
+ return f"❌ Unexpected file type received! Type: {type(file)}"
217
 
 
218
  if file_ext == "pdf":
219
  text = extract_text_from_pdf(file_bytes)
220
  elif file_ext in ["docx", "pptx"]:
221
  text = extract_text_with_tika(file_bytes)
222
  elif file_ext == "xlsx":
223
  text = extract_text_from_excel(file_bytes)
 
 
224
  else:
225
  return f"❌ Unsupported file format: {file_ext}"
226
 
227
+ if not text or "❌" in text:
228
  return f"⚠️ No text extracted. Error: {text}"
229
 
230
  truncated_text = truncate_text(text)
 
235
  except Exception as e:
236
  return f"❌ Processing Error: {str(e)}"
237
 
 
238
  with gr.Blocks() as demo:
239
+ gr.Markdown("## 📄 AI-Powered Document QA")
240
  with gr.Row():
241
+ file_input = gr.File(label="Upload Document")
242
  question_input = gr.Textbox(label="Ask a Question", placeholder="What is this document about?")
243
  answer_output = gr.Textbox(label="Answer")
244
  submit_btn = gr.Button("Get Answer")
245
  submit_btn.click(answer_question, inputs=[file_input, question_input], outputs=answer_output)
246
 
 
247
  app = gr.mount_gradio_app(app, demo, path="/")
248
 
249
  @app.get("/")