ikraamkb committed on
Commit
2553b67
·
verified ·
1 Parent(s): 669e074

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +29 -20
app.py CHANGED
@@ -160,31 +160,32 @@ def validate_file_type(file):
160
  return "❌ Invalid file format!"
161
 
162
  # ✅ Extract Text from PDF
163
- def extract_text_from_pdf(file):
164
  try:
165
- doc = fitz.open(stream=file, filetype="pdf")
166
  return "\n".join([page.get_text() for page in doc])
167
- except Exception:
168
- return None
169
 
170
  # ✅ Extract Text from DOCX & PPTX using Tika
171
- def extract_text_with_tika(file):
172
  try:
173
- return parser.from_buffer(file)["content"]
174
- except Exception:
175
- return None
 
176
 
177
  # ✅ Extract Text from Excel
178
- def extract_text_from_excel(file):
179
  try:
180
- wb = load_workbook(BytesIO(file), data_only=True)
181
  text = []
182
  for sheet in wb.worksheets:
183
  for row in sheet.iter_rows(values_only=True):
184
  text.append(" ".join(str(cell) for cell in row if cell))
185
  return "\n".join(text)
186
- except Exception:
187
- return None
188
 
189
  # ✅ Truncate Long Text for Model
190
  def truncate_text(text, max_length=2048):
@@ -192,25 +193,33 @@ def truncate_text(text, max_length=2048):
192
 
193
  # ✅ Answer Questions from Image or Document
194
  def answer_question(file, question: str):
195
- # Image Processing (Gradio sends images as NumPy arrays)
196
  if isinstance(file, np.ndarray):
197
  image = Image.fromarray(file)
198
  caption = image_captioning_pipeline(image)[0]['generated_text']
199
  response = qa_pipeline(f"Question: {question}\nContext: {caption}")
200
  return response[0]["generated_text"]
201
 
202
- # Validate File
203
  validation_error = validate_file_type(file)
204
  if validation_error:
205
  return validation_error
206
 
207
  # ✅ Read File Bytes Properly
 
 
 
 
 
 
 
 
 
 
 
208
  file_ext = file.name.split(".")[-1].lower() if hasattr(file, "name") else None
209
- file_bytes = file.read() if hasattr(file, "read") else None
210
- if not file_bytes:
211
- return "❌ Could not read file content!"
212
 
213
- # Extract Text from Supported Documents
214
  text = None
215
  if file_ext == "pdf":
216
  text = extract_text_from_pdf(file_bytes)
@@ -219,8 +228,8 @@ def answer_question(file, question: str):
219
  elif file_ext == "xlsx":
220
  text = extract_text_from_excel(file_bytes)
221
 
222
- if not text:
223
- return "⚠️ No text extracted from the document."
224
 
225
  truncated_text = truncate_text(text)
226
  response = qa_pipeline(f"Question: {question}\nContext: {truncated_text}")
 
160
  return "❌ Invalid file format!"
161
 
162
  # ✅ Extract Text from PDF
163
+ def extract_text_from_pdf(file_bytes):
164
  try:
165
+ doc = fitz.open(stream=file_bytes, filetype="pdf")
166
  return "\n".join([page.get_text() for page in doc])
167
+ except Exception as e:
168
+ return f"❌ PDF Error: {str(e)}"
169
 
170
  # ✅ Extract Text from DOCX & PPTX using Tika
171
+ def extract_text_with_tika(file_bytes):
172
  try:
173
+ parsed = parser.from_buffer(file_bytes)
174
+ return parsed["content"]
175
+ except Exception as e:
176
+ return f"❌ Tika Error: {str(e)}"
177
 
178
  # ✅ Extract Text from Excel
179
+ def extract_text_from_excel(file_bytes):
180
  try:
181
+ wb = load_workbook(BytesIO(file_bytes), data_only=True)
182
  text = []
183
  for sheet in wb.worksheets:
184
  for row in sheet.iter_rows(values_only=True):
185
  text.append(" ".join(str(cell) for cell in row if cell))
186
  return "\n".join(text)
187
+ except Exception as e:
188
+ return f"❌ Excel Error: {str(e)}"
189
 
190
  # ✅ Truncate Long Text for Model
191
  def truncate_text(text, max_length=2048):
 
193
 
194
  # ✅ Answer Questions from Image or Document
195
  def answer_question(file, question: str):
196
+ # ✅ Image Processing (Gradio sends images as NumPy arrays)
197
  if isinstance(file, np.ndarray):
198
  image = Image.fromarray(file)
199
  caption = image_captioning_pipeline(image)[0]['generated_text']
200
  response = qa_pipeline(f"Question: {question}\nContext: {caption}")
201
  return response[0]["generated_text"]
202
 
203
+ # ✅ Validate File
204
  validation_error = validate_file_type(file)
205
  if validation_error:
206
  return validation_error
207
 
208
  # ✅ Read File Bytes Properly
209
+ try:
210
+ if hasattr(file, "read"): # Gradio passes file objects
211
+ file_bytes = file.read()
212
+ elif isinstance(file, bytes): # Direct bytes input
213
+ file_bytes = file
214
+ else:
215
+ return "❌ Could not read file content!"
216
+ except Exception as e:
217
+ return f"❌ File Read Error: {str(e)}"
218
+
219
+ # ✅ Get File Extension
220
  file_ext = file.name.split(".")[-1].lower() if hasattr(file, "name") else None
 
 
 
221
 
222
+ # ✅ Extract Text from Supported Documents
223
  text = None
224
  if file_ext == "pdf":
225
  text = extract_text_from_pdf(file_bytes)
 
228
  elif file_ext == "xlsx":
229
  text = extract_text_from_excel(file_bytes)
230
 
231
+ if not text or "❌" in text:
232
+ return f"⚠️ No text extracted. Error: {text}"
233
 
234
  truncated_text = truncate_text(text)
235
  response = qa_pipeline(f"Question: {question}\nContext: {truncated_text}")