Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -1,4 +1,4 @@
|
|
1 |
-
|
2 |
import uvicorn
|
3 |
import numpy as np
|
4 |
import fitz # PyMuPDF
|
@@ -118,137 +118,3 @@ def home():
|
|
118 |
# ✅ Run FastAPI + Gradio
|
119 |
if __name__ == "__main__":
|
120 |
uvicorn.run(app, host="0.0.0.0", port=7860)
|
121 |
-
"""
|
122 |
-
import gradio as gr
|
123 |
-
import uvicorn
|
124 |
-
import numpy as np
|
125 |
-
import pymupdf # PyMuPDF
|
126 |
-
import tika
|
127 |
-
import torch
|
128 |
-
from fastapi import FastAPI
|
129 |
-
from transformers import pipeline
|
130 |
-
from PIL import Image
|
131 |
-
from io import BytesIO
|
132 |
-
from starlette.responses import RedirectResponse
|
133 |
-
from tika import parser
|
134 |
-
from openpyxl import load_workbook
|
135 |
-
from pptx import Presentation
|
136 |
-
import os
|
137 |
-
|
138 |
-
tika.initVM()
|
139 |
-
app = FastAPI()
|
140 |
-
|
141 |
-
device = "cuda" if torch.cuda.is_available() else "cpu"
|
142 |
-
qa_pipeline = pipeline("text-generation", model="TinyLlama/TinyLlama-1.1B-Chat-v1.0", device=device)
|
143 |
-
image_captioning_pipeline = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
|
144 |
-
|
145 |
-
ALLOWED_EXTENSIONS = {"pdf", "docx", "pptx", "xlsx"}
|
146 |
-
|
147 |
-
def validate_file_type(file):
|
148 |
-
if not file:
|
149 |
-
return "β No file uploaded!"
|
150 |
-
if isinstance(file, str) and os.path.exists(file):
|
151 |
-
return None
|
152 |
-
if hasattr(file, "name"):
|
153 |
-
ext = file.name.split(".")[-1].lower()
|
154 |
-
if ext not in ALLOWED_EXTENSIONS:
|
155 |
-
return f"β Unsupported file format: {ext}"
|
156 |
-
return None
|
157 |
-
return "β Invalid file format!"
|
158 |
-
|
159 |
-
def extract_text_from_pdf(file_bytes):
|
160 |
-
try:
|
161 |
-
doc = pymupdf.open(stream=file_bytes, filetype="pdf")
|
162 |
-
return "\n".join([page.get_text("text") for page in doc])
|
163 |
-
except Exception as e:
|
164 |
-
return f"β PDF Error: {str(e)}"
|
165 |
-
|
166 |
-
def extract_text_with_tika(file_bytes):
|
167 |
-
try:
|
168 |
-
parsed = parser.from_buffer(file_bytes)
|
169 |
-
return parsed.get("content", "β οΈ No text found.").strip()
|
170 |
-
except Exception as e:
|
171 |
-
return f"β Tika Error: {str(e)}"
|
172 |
-
|
173 |
-
def extract_text_from_excel(file_bytes):
|
174 |
-
try:
|
175 |
-
wb = load_workbook(BytesIO(file_bytes), data_only=True)
|
176 |
-
text = []
|
177 |
-
for sheet in wb.worksheets:
|
178 |
-
for row in sheet.iter_rows(values_only=True):
|
179 |
-
text.append(" ".join(str(cell) for cell in row if cell))
|
180 |
-
return "\n".join(text) if text else "β οΈ No text found."
|
181 |
-
except Exception as e:
|
182 |
-
return f"β Excel Error: {str(e)}"
|
183 |
-
|
184 |
-
def extract_text_from_pptx(file_bytes):
|
185 |
-
try:
|
186 |
-
ppt = Presentation(BytesIO(file_bytes))
|
187 |
-
text = []
|
188 |
-
for slide in ppt.slides:
|
189 |
-
for shape in slide.shapes:
|
190 |
-
if hasattr(shape, "text"):
|
191 |
-
text.append(shape.text)
|
192 |
-
return "\n".join(text) if text else "β οΈ No text found."
|
193 |
-
except Exception as e:
|
194 |
-
return f"β PPTX Error: {str(e)}"
|
195 |
-
|
196 |
-
def truncate_text(text, max_length=2048):
|
197 |
-
return text[:max_length] if len(text) > max_length else text
|
198 |
-
|
199 |
-
def answer_question(file, question: str):
|
200 |
-
try:
|
201 |
-
validation_error = validate_file_type(file)
|
202 |
-
if validation_error:
|
203 |
-
return validation_error
|
204 |
-
|
205 |
-
file_bytes = None
|
206 |
-
file_ext = None
|
207 |
-
|
208 |
-
if isinstance(file, str) and os.path.exists(file):
|
209 |
-
file_ext = file.split(".")[-1].lower()
|
210 |
-
with open(file, "rb") as f:
|
211 |
-
file_bytes = f.read()
|
212 |
-
elif hasattr(file, "read"):
|
213 |
-
file_ext = file.name.split(".")[-1].lower()
|
214 |
-
file_bytes = file.read()
|
215 |
-
else:
|
216 |
-
return f"β Unexpected file type received! Type: {type(file)}"
|
217 |
-
|
218 |
-
if file_ext == "pdf":
|
219 |
-
text = extract_text_from_pdf(file_bytes)
|
220 |
-
elif file_ext in ["docx", "pptx"]:
|
221 |
-
text = extract_text_with_tika(file_bytes)
|
222 |
-
elif file_ext == "xlsx":
|
223 |
-
text = extract_text_from_excel(file_bytes)
|
224 |
-
else:
|
225 |
-
return f"β Unsupported file format: {file_ext}"
|
226 |
-
|
227 |
-
if not text or "β" in text:
|
228 |
-
return f"β οΈ No text extracted. Error: {text}"
|
229 |
-
|
230 |
-
truncated_text = truncate_text(text)
|
231 |
-
response = qa_pipeline(f"Question: {question}\nContext: {truncated_text}")
|
232 |
-
|
233 |
-
return response[0]["generated_text"]
|
234 |
-
|
235 |
-
except Exception as e:
|
236 |
-
return f"β Processing Error: {str(e)}"
|
237 |
-
|
238 |
-
with gr.Blocks() as demo:
|
239 |
-
gr.Markdown("## π AI-Powered Document QA")
|
240 |
-
with gr.Row():
|
241 |
-
file_input = gr.File(label="Upload Document")
|
242 |
-
question_input = gr.Textbox(label="Ask a Question", placeholder="What is this document about?")
|
243 |
-
answer_output = gr.Textbox(label="Answer")
|
244 |
-
submit_btn = gr.Button("Get Answer")
|
245 |
-
submit_btn.click(answer_question, inputs=[file_input, question_input], outputs=answer_output)
|
246 |
-
|
247 |
-
app = gr.mount_gradio_app(app, demo, path="/")
|
248 |
-
|
249 |
-
@app.get("/")
|
250 |
-
def home():
|
251 |
-
return RedirectResponse(url="/")
|
252 |
-
|
253 |
-
if __name__ == "__main__":
|
254 |
-
uvicorn.run(app, host="0.0.0.0", port=7860)
|
|
|
1 |
+
import gradio as gr
|
2 |
import uvicorn
|
3 |
import numpy as np
|
4 |
import fitz # PyMuPDF
|
|
|
118 |
# ✅ Run FastAPI + Gradio
|
119 |
if __name__ == "__main__":
|
120 |
uvicorn.run(app, host="0.0.0.0", port=7860)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|