ikraamkb committed on
Commit
5535b2b
·
verified ·
1 Parent(s): d49960b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +1 -135
app.py CHANGED
@@ -1,4 +1,4 @@
1
- """import gradio as gr
2
  import uvicorn
3
  import numpy as np
4
  import fitz # PyMuPDF
@@ -118,137 +118,3 @@ def home():
118
  # ✅ Run FastAPI + Gradio
119
  if __name__ == "__main__":
120
  uvicorn.run(app, host="0.0.0.0", port=7860)
121
- """
122
- import gradio as gr
123
- import uvicorn
124
- import numpy as np
125
- import pymupdf # PyMuPDF
126
- import tika
127
- import torch
128
- from fastapi import FastAPI
129
- from transformers import pipeline
130
- from PIL import Image
131
- from io import BytesIO
132
- from starlette.responses import RedirectResponse
133
- from tika import parser
134
- from openpyxl import load_workbook
135
- from pptx import Presentation
136
- import os
137
-
138
# Initialise Tika before any parsing call is made.
tika.initVM()
app = FastAPI()

# Run the models on the GPU when torch can see one, otherwise on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
qa_pipeline = pipeline(
    "text-generation",
    model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    device=device,
)
# Place the captioning model on the same device as the QA model; previously
# it ignored `device` and always used the pipeline default.
image_captioning_pipeline = pipeline(
    "image-to-text",
    model="Salesforce/blip-image-captioning-base",
    device=device,
)

# Lower-case file extensions (without the leading dot) accepted for upload.
ALLOWED_EXTENSIONS = {"pdf", "docx", "pptx", "xlsx"}
146
-
147
def validate_file_type(file):
    """Check that an uploaded file exists and has a supported extension.

    Args:
        file: Either a filesystem path (``str``) or an object exposing a
            ``name`` attribute that holds the file name.

    Returns:
        ``None`` when the file is acceptable, otherwise a user-facing error
        message string.
    """
    if not file:
        return "❌ No file uploaded!"

    if isinstance(file, str):
        # A string is only meaningful as a path to an existing file.
        if not os.path.exists(file):
            return "❌ Invalid file format!"
        name = file
    elif hasattr(file, "name"):
        name = file.name
    else:
        return "❌ Invalid file format!"

    # Paths and file objects go through the same allow-list check; previously
    # string paths bypassed it. splitext also copes with names that have no
    # extension, where split(".")[-1] would return the whole name.
    ext = os.path.splitext(str(name))[1].lstrip(".").lower()
    if ext not in ALLOWED_EXTENSIONS:
        return f"❌ Unsupported file format: {ext or 'unknown'}"
    return None
158
-
159
def extract_text_from_pdf(file_bytes):
    """Extract the plain text of every page of a PDF held in memory.

    Args:
        file_bytes: Raw bytes of the PDF file.

    Returns:
        The page texts joined by newlines, or a string starting with
        ``"❌ PDF Error:"`` if the document could not be read.
    """
    try:
        # The context manager closes the document even if a page fails to
        # render; the original left it open.
        with pymupdf.open(stream=file_bytes, filetype="pdf") as doc:
            return "\n".join(page.get_text("text") for page in doc)
    except Exception as e:
        return f"❌ PDF Error: {e}"
165
-
166
def extract_text_with_tika(file_bytes):
    """Extract text from a document buffer using Apache Tika.

    Args:
        file_bytes: Raw bytes of the document.

    Returns:
        The extracted text with surrounding whitespace removed,
        ``"⚠️ No text found."`` when Tika yields no content, or a string
        starting with ``"❌ Tika Error:"`` if parsing raised.
    """
    try:
        parsed = parser.from_buffer(file_bytes)
        # The "content" key can be present with a None value, in which case
        # dict.get's default is not used; normalise to "" before stripping so
        # a textless document is not reported as an error.
        content = (parsed.get("content") or "").strip()
        return content if content else "⚠️ No text found."
    except Exception as e:
        return f"❌ Tika Error: {e}"
172
-
173
def extract_text_from_excel(file_bytes):
    """Extract cell values from every worksheet of an XLSX workbook.

    Args:
        file_bytes: Raw bytes of the ``.xlsx`` file.

    Returns:
        One line per non-empty row (cell values separated by spaces),
        ``"⚠️ No text found."`` when the workbook has no values, or a string
        starting with ``"❌ Excel Error:"`` if the workbook could not be read.
    """
    try:
        wb = load_workbook(BytesIO(file_bytes), data_only=True)
        lines = []
        for sheet in wb.worksheets:
            for row in sheet.iter_rows(values_only=True):
                # Only skip genuinely empty cells; a plain truthiness test
                # would also discard legitimate values such as 0 and False.
                cells = [str(cell) for cell in row if cell is not None and cell != ""]
                # Skip empty rows so they neither add blank lines nor hide
                # the "no text" case.
                if cells:
                    lines.append(" ".join(cells))
        return "\n".join(lines) if lines else "⚠️ No text found."
    except Exception as e:
        return f"❌ Excel Error: {e}"
183
-
184
def extract_text_from_pptx(file_bytes):
    """Extract the text of every text-bearing shape in a PPTX presentation.

    Args:
        file_bytes: Raw bytes of the ``.pptx`` file.

    Returns:
        The shape texts joined by newlines, ``"⚠️ No text found."`` when no
        shape carries text, or a string starting with ``"❌ PPTX Error:"`` if
        the presentation could not be read.
    """
    try:
        ppt = Presentation(BytesIO(file_bytes))
        lines = []
        for slide in ppt.slides:
            for shape in slide.shapes:
                # Skip shapes without text and empty placeholders, so a deck
                # with only blank shapes reaches the "no text" fallback.
                if hasattr(shape, "text") and shape.text.strip():
                    lines.append(shape.text)
        return "\n".join(lines) if lines else "⚠️ No text found."
    except Exception as e:
        return f"❌ PPTX Error: {e}"
195
-
196
def truncate_text(text, max_length=2048):
    """Return at most ``max_length`` leading characters of ``text``.

    Args:
        text: The string to shorten.
        max_length: Maximum number of characters to keep. Negative values
            are treated as 0 instead of trimming characters from the end.

    Returns:
        ``text`` unchanged when it is already short enough, otherwise its
        first ``max_length`` characters.
    """
    # Slicing already returns the whole string when it is shorter than the
    # limit, so no explicit length comparison is needed.
    return text[:max(max_length, 0)]
198
-
199
def answer_question(file, question: str):
    """Answer a question about an uploaded document.

    The file is validated, read into memory, converted to text by the
    extractor matching its extension, truncated, and passed together with the
    question to the text-generation pipeline.

    Args:
        file: A filesystem path (``str``) or a file-like object exposing
            ``name`` and ``read()``.
        question: The question to ask about the document.

    Returns:
        The generated answer, or a user-facing message starting with ``❌`` or
        ``⚠️`` when validation, extraction or generation fails.
    """
    try:
        validation_error = validate_file_type(file)
        if validation_error:
            return validation_error

        # Do not spend time on extraction and generation for a blank question.
        if not question or not question.strip():
            return "❌ Please enter a question!"

        if isinstance(file, str) and os.path.exists(file):
            source_name = file
            with open(file, "rb") as f:
                file_bytes = f.read()
        elif hasattr(file, "read"):
            source_name = file.name
            file_bytes = file.read()
        else:
            return f"❌ Unexpected file type received! Type: {type(file)}"

        # splitext handles names without an extension and dots in directories.
        file_ext = os.path.splitext(str(source_name))[1].lstrip(".").lower()

        if file_ext == "pdf":
            text = extract_text_from_pdf(file_bytes)
        elif file_ext in ("docx", "pptx"):
            text = extract_text_with_tika(file_bytes)
        elif file_ext == "xlsx":
            text = extract_text_from_excel(file_bytes)
        else:
            return f"❌ Unsupported file format: {file_ext}"

        # Extractors report failures as messages that START with a marker.
        # Checking only the prefix avoids rejecting documents that merely
        # contain "❌", and also stops the "no text found" placeholder from
        # being sent to the model as if it were document content.
        if not text or text.startswith(("❌", "\u26a0")):
            return f"⚠️ No text extracted. Error: {text}"

        truncated_text = truncate_text(text)
        # return_full_text=False keeps the prompt (question and context) out
        # of the answer shown to the user.
        response = qa_pipeline(
            f"Question: {question}\nContext: {truncated_text}",
            return_full_text=False,
        )

        return response[0]["generated_text"].strip()

    except Exception as e:
        return f"❌ Processing Error: {e}"
237
-
238
# Gradio front end: upload a document, ask a question, show the answer.
with gr.Blocks() as demo:
    gr.Markdown("## 📄 AI-Powered Document QA")
    with gr.Row():
        file_input = gr.File(label="Upload Document")
        question_input = gr.Textbox(label="Ask a Question", placeholder="What is this document about?")
    answer_output = gr.Textbox(label="Answer")
    submit_btn = gr.Button("Get Answer")
    submit_btn.click(answer_question, inputs=[file_input, question_input], outputs=answer_output)

# Serve the Gradio UI from the FastAPI app's root path.
# NOTE: the former `home` route, which redirected "/" to "/", has been removed.
# It was registered after the Gradio mount at "/", so it was shadowed by the
# mount, and if it had ever been reached it would have redirected to itself.
app = gr.mount_gradio_app(app, demo, path="/")

if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860)
 
1
+ import gradio as gr
2
  import uvicorn
3
  import numpy as np
4
  import fitz # PyMuPDF
 
118
  # ✅ Run FastAPI + Gradio
119
  if __name__ == "__main__":
120
  uvicorn.run(app, host="0.0.0.0", port=7860)