ikraamkb committed on
Commit
1f136e0
·
verified ·
1 Parent(s): 2553b67

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +36 -31
app.py CHANGED
@@ -165,6 +165,7 @@ def extract_text_from_pdf(file_bytes):
165
  doc = fitz.open(stream=file_bytes, filetype="pdf")
166
  return "\n".join([page.get_text() for page in doc])
167
  except Exception as e:
 
168
  return f"❌ PDF Error: {str(e)}"
169
 
170
  # ✅ Extract Text from DOCX & PPTX using Tika
@@ -173,6 +174,7 @@ def extract_text_with_tika(file_bytes):
173
  parsed = parser.from_buffer(file_bytes)
174
  return parsed["content"]
175
  except Exception as e:
 
176
  return f"❌ Tika Error: {str(e)}"
177
 
178
  # ✅ Extract Text from Excel
@@ -185,6 +187,7 @@ def extract_text_from_excel(file_bytes):
185
  text.append(" ".join(str(cell) for cell in row if cell))
186
  return "\n".join(text)
187
  except Exception as e:
 
188
  return f"❌ Excel Error: {str(e)}"
189
 
190
  # ✅ Truncate Long Text for Model
@@ -193,48 +196,50 @@ def truncate_text(text, max_length=2048):
193
 
194
  # ✅ Answer Questions from Image or Document
195
  def answer_question(file, question: str):
196
- # βœ… Image Processing (Gradio sends images as NumPy arrays)
197
- if isinstance(file, np.ndarray):
198
- image = Image.fromarray(file)
199
- caption = image_captioning_pipeline(image)[0]['generated_text']
200
- response = qa_pipeline(f"Question: {question}\nContext: {caption}")
201
- return response[0]["generated_text"]
202
-
203
- # βœ… Validate File
204
- validation_error = validate_file_type(file)
205
- if validation_error:
206
- return validation_error
207
-
208
- # βœ… Read File Bytes Properly
209
  try:
 
 
 
 
 
 
 
 
 
 
 
 
 
210
  if hasattr(file, "read"): # Gradio passes file objects
211
  file_bytes = file.read()
212
  elif isinstance(file, bytes): # Direct bytes input
213
  file_bytes = file
214
  else:
215
- return "❌ Could not read file content!"
216
- except Exception as e:
217
- return f"❌ File Read Error: {str(e)}"
218
 
219
- # βœ… Get File Extension
220
- file_ext = file.name.split(".")[-1].lower() if hasattr(file, "name") else None
221
 
222
- # βœ… Extract Text from Supported Documents
223
- text = None
224
- if file_ext == "pdf":
225
- text = extract_text_from_pdf(file_bytes)
226
- elif file_ext in ["docx", "pptx"]:
227
- text = extract_text_with_tika(file_bytes)
228
- elif file_ext == "xlsx":
229
- text = extract_text_from_excel(file_bytes)
230
 
231
- if not text or "❌" in text:
232
- return f"⚠️ No text extracted. Error: {text}"
233
 
234
- truncated_text = truncate_text(text)
235
- response = qa_pipeline(f"Question: {question}\nContext: {truncated_text}")
236
 
237
- return response[0]["generated_text"]
 
 
 
 
238
 
239
  # ✅ Gradio Interface (Unified for Images & Documents)
240
  with gr.Blocks() as demo:
 
165
  doc = fitz.open(stream=file_bytes, filetype="pdf")
166
  return "\n".join([page.get_text() for page in doc])
167
  except Exception as e:
168
+ print(f"❌ PDF Extraction Error: {e}") # Log error
169
  return f"❌ PDF Error: {str(e)}"
170
 
171
  # ✅ Extract Text from DOCX & PPTX using Tika
 
174
  parsed = parser.from_buffer(file_bytes)
175
  return parsed["content"]
176
  except Exception as e:
177
+ print(f"❌ Tika Extraction Error: {e}") # Log error
178
  return f"❌ Tika Error: {str(e)}"
179
 
180
  # ✅ Extract Text from Excel
 
187
  text.append(" ".join(str(cell) for cell in row if cell))
188
  return "\n".join(text)
189
  except Exception as e:
190
+ print(f"❌ Excel Extraction Error: {e}") # Log error
191
  return f"❌ Excel Error: {str(e)}"
192
 
193
  # ✅ Truncate Long Text for Model
 
196
 
197
  # ✅ Answer Questions from Image or Document
198
  def answer_question(file, question: str):
 
 
 
 
 
 
 
 
 
 
 
 
 
199
  try:
200
+ # βœ… Image Processing (Gradio sends images as NumPy arrays)
201
+ if isinstance(file, np.ndarray):
202
+ image = Image.fromarray(file)
203
+ caption = image_captioning_pipeline(image)[0]['generated_text']
204
+ response = qa_pipeline(f"Question: {question}\nContext: {caption}")
205
+ return response[0]["generated_text"]
206
+
207
+ # βœ… Validate File
208
+ validation_error = validate_file_type(file)
209
+ if validation_error:
210
+ return validation_error
211
+
212
+ # βœ… Read File Bytes Properly
213
  if hasattr(file, "read"): # Gradio passes file objects
214
  file_bytes = file.read()
215
  elif isinstance(file, bytes): # Direct bytes input
216
  file_bytes = file
217
  else:
218
+ raise ValueError("Unexpected file type received!")
 
 
219
 
220
+ # βœ… Get File Extension
221
+ file_ext = file.name.split(".")[-1].lower() if hasattr(file, "name") else None
222
 
223
+ # βœ… Extract Text from Supported Documents
224
+ text = None
225
+ if file_ext == "pdf":
226
+ text = extract_text_from_pdf(file_bytes)
227
+ elif file_ext in ["docx", "pptx"]:
228
+ text = extract_text_with_tika(file_bytes)
229
+ elif file_ext == "xlsx":
230
+ text = extract_text_from_excel(file_bytes)
231
 
232
+ if not text or "❌" in text:
233
+ return f"⚠️ No text extracted. Error: {text}"
234
 
235
+ truncated_text = truncate_text(text)
236
+ response = qa_pipeline(f"Question: {question}\nContext: {truncated_text}")
237
 
238
+ return response[0]["generated_text"]
239
+
240
+ except Exception as e:
241
+ print(f"❌ General Processing Error: {e}") # Log error
242
+ return f"❌ Processing Error: {str(e)}"
243
 
244
  # ✅ Gradio Interface (Unified for Images & Documents)
245
  with gr.Blocks() as demo: