Spaces:
Update appImage.py
appImage.py +2 -84
appImage.py
CHANGED
@@ -1,5 +1,4 @@
-
-### ✅ appImage.py — Image QA Backend (Cleaned)
+
 from fastapi import FastAPI
 from fastapi.responses import RedirectResponse, JSONResponse, FileResponse
 import os
@@ -58,85 +57,4 @@ def answer_question_from_image(image, question):
 
 @app.get("/")
 def home():
-    return RedirectResponse(url="/templates/home.html")
-from fastapi import FastAPI
-from fastapi.responses import RedirectResponse, JSONResponse, FileResponse
-import os
-from PIL import Image
-from transformers import ViltProcessor, ViltForQuestionAnswering, AutoProcessor, AutoModelForCausalLM
-from gtts import gTTS
-import easyocr
-import torch
-import tempfile
-import numpy as np
-from io import BytesIO
-
-app = FastAPI()
-
-# Initialize models with optimized settings
-vqa_processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
-vqa_model = ViltForQuestionAnswering.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
-
-# Load GIT model with performance optimizations
-git_processor = AutoProcessor.from_pretrained("microsoft/git-large-coco")
-git_model = AutoModelForCausalLM.from_pretrained(
-    "microsoft/git-large-coco",
-    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
-    device_map="auto"
-)
-
-reader = easyocr.Reader(['en', 'fr'], gpu=torch.cuda.is_available())
-
-def classify_question(question: str):
-    """Optimized question classification"""
-    q = question.lower()
-    if any(w in q for w in ["text", "say", "written", "read"]):
-        return "ocr"
-    if any(w in q for w in ["caption", "describe", "what is in the image"]):
-        return "caption"
-    return "vqa"
-
-@torch.inference_mode()
-def generate_caption(image):
-    """Optimized caption generation with GIT model"""
-    try:
-        inputs = git_processor(images=image, return_tensors="pt").to(git_model.device)
-        outputs = git_model.generate(**inputs, max_length=50)
-        return git_processor.batch_decode(outputs, skip_special_tokens=True)[0]
-    except Exception as e:
-        print(f"Caption generation error: {e}")
-        return "Could not generate caption"
-
-def answer_question_from_image(image, question):
-    if image is None or not question.strip():
-        return "Please upload an image and ask a question.", None
-
-    mode = classify_question(question)
-
-    try:
-        if mode == "ocr":
-            result = reader.readtext(np.array(image))
-            answer = " ".join([entry[1] for entry in result]) or "No readable text found."
-
-        elif mode == "caption":
-            answer = generate_caption(image)
-
-        else:  # VQA mode
-            inputs = vqa_processor(image, question, return_tensors="pt")
-            with torch.no_grad():
-                outputs = vqa_model(**inputs)
-            predicted_id = outputs.logits.argmax(-1).item()
-            answer = vqa_model.config.id2label[predicted_id]
-
-        # Generate audio response
-        tts = gTTS(text=answer)
-        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
-            tts.save(tmp.name)
-        return answer, tmp.name
-
-    except Exception as e:
-        return f"Error: {e}", None
-
-@app.get("/")
-def home():
-    return RedirectResponse(url="/templates/home.html")
+    return RedirectResponse(url="/templates/home.html")
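
A few notes on the module that survives this cleanup. Since the removed block was a verbatim duplicate, the diff above doubles as a reference for the retained code. The keyword router dispatches on substrings of the lowercased question; a minimal sketch of its behavior, reusing `classify_question` exactly as defined in the file:

```python
# Same keyword heuristic as classify_question in appImage.py.
def classify_question(question: str) -> str:
    q = question.lower()
    if any(w in q for w in ["text", "say", "written", "read"]):
        return "ocr"
    if any(w in q for w in ["caption", "describe", "what is in the image"]):
        return "caption"
    return "vqa"

print(classify_question("What does the sign say?"))   # -> "ocr" ("say" matches)
print(classify_question("Describe this photo"))       # -> "caption" ("describe" matches)
print(classify_question("How many dogs are there?"))  # -> "vqa" (fallback)
```

Because the match is plain substring containment, unrelated words can trigger a branch: "already", for instance, contains "read" and would route a question to OCR.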
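`generate_caption` follows the usual GIT generate-then-decode pattern. Loading with `device_map="auto"`, as the file does, additionally requires the `accelerate` package; the sketch below loads on CPU in fp32 to stay dependency-light (the checkpoint name is the one in the diff, the image path is a placeholder):

```python
import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoProcessor

processor = AutoProcessor.from_pretrained("microsoft/git-large-coco")
model = AutoModelForCausalLM.from_pretrained("microsoft/git-large-coco")  # CPU, fp32

image = Image.open("example.jpg").convert("RGB")  # placeholder path
pixel_values = processor(images=image, return_tensors="pt").pixel_values
with torch.no_grad():
    generated_ids = model.generate(pixel_values=pixel_values, max_length=50)
print(processor.batch_decode(generated_ids, skip_special_tokens=True)[0])
```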
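The VQA branch is the standard ViLT classification recipe: encode image plus question, take the argmax over the answer logits, and map it through `id2label`. Isolated as a runnable sketch (same checkpoint as the diff, placeholder image path):

```python
import torch
from PIL import Image
from transformers import ViltProcessor, ViltForQuestionAnswering

processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
model = ViltForQuestionAnswering.from_pretrained("dandelin/vilt-b32-finetuned-vqa")

image = Image.open("example.jpg").convert("RGB")  # placeholder path
inputs = processor(image, "How many dogs are there?", return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits
print(model.config.id2label[logits.argmax(-1).item()])
```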
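One behavior worth flagging: `tempfile.NamedTemporaryFile(delete=False)` means every answered question leaves an .mp3 on disk. If the returned path is later served with `FileResponse`, Starlette's background task can delete the file after the response is sent; a sketch under that assumption (the `/audio` route itself does not appear in this diff):

```python
import os
from fastapi import FastAPI
from fastapi.responses import FileResponse
from starlette.background import BackgroundTask

app = FastAPI()

@app.get("/audio")  # hypothetical route, not part of appImage.py as shown
def audio(path: str):
    # Remove the temporary .mp3 once it has been streamed to the client.
    return FileResponse(path, media_type="audio/mpeg",
                        background=BackgroundTask(os.remove, path))
```

A real route would validate `path` against the temp directory rather than accept an arbitrary filesystem path.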
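Finally, the diff keeps only the `/` redirect, so whatever exposes `answer_question_from_image` over HTTP lives outside this hunk. One plausible wiring, purely as a hedged sketch (the `/ask` route, its parameter names, and the JSON shape are assumptions, not part of this commit):

```python
from io import BytesIO
from fastapi import File, Form, UploadFile
from fastapi.responses import JSONResponse
from PIL import Image

# Assumes the app instance and answer_question_from_image from appImage.py.
@app.post("/ask")  # hypothetical route
async def ask(file: UploadFile = File(...), question: str = Form(...)):
    image = Image.open(BytesIO(await file.read())).convert("RGB")
    answer, audio_path = answer_question_from_image(image, question)
    return JSONResponse({"answer": answer, "audio": audio_path})
```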