ikraamkb committed on
Commit
ebf76ba
·
verified ·
1 Parent(s): da390cd

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +137 -1
app.py CHANGED
@@ -1,4 +1,4 @@
1
- from fastapi import FastAPI, File, UploadFile
2
  import fitz # PyMuPDF for PDF parsing
3
  from tika import parser # Apache Tika for document parsing
4
  import openpyxl
@@ -127,6 +127,142 @@ doc_interface = gr.Interface(fn=answer_question_from_document, inputs=[gr.File()
127
  demo = gr.TabbedInterface([doc_interface], ["Document QA"])
128
  app = gr.mount_gradio_app(app, demo, path="/")
129
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
130
  @app.get("/")
131
  def home():
132
  return RedirectResponse(url="/")
 
1
+ """from fastapi import FastAPI, File, UploadFile
2
  import fitz # PyMuPDF for PDF parsing
3
  from tika import parser # Apache Tika for document parsing
4
  import openpyxl
 
127
  demo = gr.TabbedInterface([doc_interface], ["Document QA"])
128
  app = gr.mount_gradio_app(app, demo, path="/")
129
 
130
+ @app.get("/")
131
+ def home():
132
+ return RedirectResponse(url="/")
133
+ """
134
+ from fastapi import FastAPI, File, UploadFile
135
+ import fitz # PyMuPDF for PDF parsing
136
+ import openpyxl
137
+ from pptx import Presentation
138
+ import torch
139
+ from torchvision import transforms
140
+ from torchvision.models.detection import fasterrcnn_resnet50_fpn
141
+ from PIL import Image
142
+ from transformers import pipeline
143
+ import gradio as gr
144
+ from fastapi.responses import RedirectResponse
145
+ import numpy as np
146
+ import docx
147
+
148
# Initialize FastAPI
print("πŸš€ FastAPI server is starting...")
app = FastAPI()

# Load the text-generation model used for question answering.
# The pipeline created below loads TinyLlama-1.1B-Chat, not DeepSeek-V2-Chat.
# NOTE(review): AutoModelForCausalLM and AutoTokenizer are imported here but
# are not referenced anywhere in the visible code -- confirm before removing.
from transformers import AutoModelForCausalLM, AutoTokenizer

# Preload Hugging Face model
# device=-1 selects the CPU according to the transformers pipeline docs.
print(f"πŸ”„ Loading models")
qa_pipeline = pipeline("text-generation", model="TinyLlama/TinyLlama-1.1B-Chat-v1.0", device=-1)

# Load Pretrained Object Detection Model (Torchvision)
# NOTE(review): `model` and `transform` are not used by any function in the
# visible code, yet the detector is still loaded at import time -- confirm
# whether image analysis is still planned.
from torchvision.models.detection import FasterRCNN_ResNet50_FPN_Weights
weights = FasterRCNN_ResNet50_FPN_Weights.DEFAULT
model = fasterrcnn_resnet50_fpn(weights=weights)
model.eval()  # switch the detector to evaluation mode

# Image Transformations
transform = transforms.Compose([
    transforms.ToTensor()
])
169
+
170
# Allowed File Extensions
ALLOWED_EXTENSIONS = {"pdf", "docx", "pptx", "xlsx"}


def validate_file_type(file):
    """Check that an uploaded file has a supported extension.

    Args:
        file: The upload to check: an object exposing its path as a
            ``name`` attribute, a plain path string, or ``None`` when
            nothing was uploaded.

    Returns:
        ``None`` when the extension is in ``ALLOWED_EXTENSIONS``;
        otherwise a human-readable error message.
    """
    # An empty upload widget hands over None; report it instead of crashing
    # on the attribute access below.
    if file is None:
        return "No file uploaded."
    # Fall back to the value itself so plain path strings are accepted too.
    filename = str(getattr(file, "name", file))
    ext = filename.split(".")[-1].lower()
    print(f"πŸ” Validating file type: {ext}")
    if ext not in ALLOWED_EXTENSIONS:
        return f"❌ Unsupported file format: {ext}"
    return None
179
+
180
# Truncate text to a maximum number of whitespace-separated words
def truncate_text(text, max_tokens=450):
    """Keep at most ``max_tokens`` whitespace-separated words of ``text``.

    "Tokens" here are plain words produced by ``str.split()``, not model
    tokens. Runs of whitespace (including newlines) collapse to single
    spaces in the result.

    Args:
        text: The text to shorten.
        max_tokens: Maximum number of words to keep. Zero or a negative
            value yields an empty string.

    Returns:
        The leading words of ``text`` joined by single spaces.
    """
    # A negative limit would otherwise slice from the end (words[:-k]) and
    # keep almost the whole text, so clamp it to zero.
    limit = max(max_tokens, 0)
    truncated = " ".join(text.split()[:limit])
    print(f"βœ‚οΈ Truncated text to {max_tokens} tokens.")
    return truncated
186
+
187
+ # Document Text Extraction Functions
188
def extract_text_from_pdf(pdf_file):
    """Extract the plain text of every page of a PDF.

    Args:
        pdf_file: The PDF to read, passed straight to ``fitz.open``.

    Returns:
        The page texts joined by newlines, a "No text found." notice when
        the document yields no text, or an error message string if reading
        fails. Failures are reported in the return value, never raised.
    """
    try:
        print("πŸ“„ Extracting text from PDF...")
        # The context manager closes the document even when a page fails
        # to parse, so the file handle is not leaked.
        with fitz.open(pdf_file) as doc:
            text = "\n".join(page.get_text("text") for page in doc)
        print("βœ… PDF text extraction completed.")
        return text if text else "⚠️ No text found."
    except Exception as e:
        return f"❌ Error reading PDF: {str(e)}"
197
+
198
def extract_text_from_docx(docx_file):
    """Extract the paragraph text of a Word document.

    Args:
        docx_file: The document to read, passed straight to
            ``docx.Document``.

    Returns:
        The paragraph texts joined by newlines, a "No text found." notice
        when that is empty, or an error message string if reading fails.
    """
    try:
        print("πŸ“ Extracting text from DOCX...")
        paragraphs = docx.Document(docx_file).paragraphs
        content = "\n".join(paragraph.text for paragraph in paragraphs)
        print("βœ… DOCX text extraction completed.")
    except Exception as err:
        return f"❌ Error reading DOCX: {err}"
    return content or "⚠️ No text found."
207
+
208
def extract_text_from_pptx(pptx_file):
    """Extract the text of every text-bearing shape in a presentation.

    Args:
        pptx_file: The presentation to read, passed straight to
            ``Presentation``.

    Returns:
        The shape texts joined by newlines (slide order, then shape
        order), a "No text found." notice when no shape exposes text, or
        an error message string if reading fails.
    """
    try:
        print("πŸ“Š Extracting text from PPTX...")
        deck = Presentation(pptx_file)
        # Only shapes that expose a `text` attribute contribute.
        chunks = [
            shape.text
            for slide in deck.slides
            for shape in slide.shapes
            if hasattr(shape, "text")
        ]
        print("βœ… PPTX text extraction completed.")
        if not chunks:
            return "⚠️ No text found."
        return "\n".join(chunks)
    except Exception as err:
        return f"❌ Error reading PPTX: {err}"
221
+
222
def extract_text_from_excel(excel_file):
    """Extract cell values from every worksheet of an Excel workbook.

    Each row with at least one value becomes one line, with the values
    separated by single spaces. Empty cells are skipped rather than being
    rendered as the literal text "None".

    Args:
        excel_file: The workbook to read, passed straight to
            ``openpyxl.load_workbook``.

    Returns:
        The row lines joined by newlines, a "No text found." notice when
        the workbook holds no values, or an error message string if
        reading fails.
    """
    try:
        print("πŸ“Š Extracting text from Excel...")
        wb = openpyxl.load_workbook(excel_file, read_only=True)
        try:
            text = []
            for sheet in wb.worksheets:
                for row in sheet.iter_rows(values_only=True):
                    cells = [str(cell) for cell in row if cell is not None]
                    if cells:
                        text.append(" ".join(cells))
        finally:
            # Read-only workbooks keep the file open until closed explicitly.
            wb.close()
        print("βœ… Excel text extraction completed.")
        return "\n".join(text) if text else "⚠️ No text found."
    except Exception as e:
        return f"❌ Error reading Excel: {str(e)}"
234
+
235
def answer_question_from_document(file, question):
    """Answer a question using the text of an uploaded document.

    The file type is validated, its text is extracted with the matching
    ``extract_text_from_*`` helper, the text is shortened with
    ``truncate_text`` and the prompt is passed to ``qa_pipeline``.

    Args:
        file: The uploaded document (an object exposing its path as
            ``name``), or ``None`` when nothing was uploaded.
        question: The question to ask about the document.

    Returns:
        The ``generated_text`` of the first pipeline result, or a
        human-readable message when the upload is missing, unsupported,
        unreadable or contains no text.
    """
    print("πŸ“‚ Processing document for QA...")
    if file is None:
        return "No file uploaded."
    validation_error = validate_file_type(file)
    if validation_error:
        return validation_error

    extractors = {
        "pdf": extract_text_from_pdf,
        "docx": extract_text_from_docx,
        "pptx": extract_text_from_pptx,
        "xlsx": extract_text_from_excel,
    }
    file_ext = file.name.split(".")[-1].lower()
    extractor = extractors.get(file_ext)
    if extractor is None:
        return "❌ Unsupported file format!"

    text = extractor(file)
    if not text:
        return "⚠️ No text extracted from the document."
    # The extractors report failures as ordinary (truthy) strings. Return
    # those to the user instead of passing them to the model as if they
    # were the document's content.
    if text == "⚠️ No text found." or text.startswith("❌ Error reading "):
        return text

    truncated_text = truncate_text(text)
    print("πŸ€– Generating response...")
    response = qa_pipeline(f"Question: {question}\nContext: {truncated_text}")
    print("βœ… AI response generated.")
    return response[0]["generated_text"]
258
+
259
print("βœ… Models loaded successfully.")

# Gradio form: a file upload and a question box in, plain text out.
doc_interface = gr.Interface(fn=answer_question_from_document, inputs=[gr.File(), gr.Textbox()], outputs="text")

# Wrap the form in a single-tab interface and mount it on the FastAPI app
# at the root path.
demo = gr.TabbedInterface([doc_interface], ["Document QA"])
app = gr.mount_gradio_app(app, demo, path="/")

# NOTE(review): this route is registered after Gradio is mounted at the same
# path ("/") and redirects to its own URL. It is probably never reached, and
# would redirect to itself if it were -- confirm and consider removing it.
@app.get("/")
def home():
    """Redirect requests for the root URL to "/"."""
    return RedirectResponse(url="/")