ikraamkb committed on
Commit
c330600
·
verified ·
1 Parent(s): f94fa3b

Sound works now; human-like answers

Browse files
Files changed (1) hide show
  1. app.py +78 -1
app.py CHANGED
@@ -1,4 +1,4 @@
1
- from fastapi import FastAPI, UploadFile, Form
2
  from fastapi.responses import RedirectResponse, FileResponse, JSONResponse
3
  import os
4
  import shutil
@@ -60,6 +60,83 @@ gui = gr.Interface(
60
 
61
  app = gr.mount_gradio_app(app, gui, path="/")
62
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
  @app.get("/")
64
  def home():
65
  return RedirectResponse(url="/")
 
1
+ """from fastapi import FastAPI, UploadFile, Form
2
  from fastapi.responses import RedirectResponse, FileResponse, JSONResponse
3
  import os
4
  import shutil
 
60
 
61
  app = gr.mount_gradio_app(app, gui, path="/")
62
 
63
+ @app.get("/")
64
+ def home():
65
+ return RedirectResponse(url="/") """
66
+ from fastapi import FastAPI, UploadFile, Form
67
+ from fastapi.responses import RedirectResponse, FileResponse, JSONResponse
68
+ import os
69
+ import shutil
70
+ from PIL import Image
71
+ from transformers import ViltProcessor, ViltForQuestionAnswering, pipeline
72
+ from gtts import gTTS
73
+ import torch
74
+ import tempfile
75
+ import gradio as gr
76
+
77
# FastAPI application object; the Gradio UI is mounted onto it later in this file.
app = FastAPI()

# Visual question answering: the processor and the model are loaded from the
# same pretrained checkpoint.
_VQA_CHECKPOINT = "dandelin/vilt-b32-finetuned-vqa"
vqa_processor = ViltProcessor.from_pretrained(_VQA_CHECKPOINT)
vqa_model = ViltForQuestionAnswering.from_pretrained(_VQA_CHECKPOINT)

# Text-generation pipeline used to rewrite short VQA answers as sentences.
gpt_rewriter = pipeline(task="text-generation", model="EleutherAI/gpt-neo-1.3B")
85
+
86
def rewrite_answer(question: str, short_answer: str):
    """Expand a short VQA answer into a full sentence with the GPT rewriter.

    Args:
        question: The question that was asked about the image.
        short_answer: The short label predicted by the VQA model.

    Returns:
        The text generated after the instruction marker, stripped of
        surrounding whitespace. Falls back to ``short_answer`` when
        generation fails or produces no text.
    """
    import logging  # local import: the module does not import logging at top level

    marker = "Respond with a full sentence:"
    prompt = f"Q: {question}\nA: {short_answer}\n\n{marker}"
    try:
        # max_new_tokens bounds only the generated continuation. The previous
        # max_length=50 also counted the prompt tokens, so a longer question
        # left little or no room for the model to generate anything.
        result = gpt_rewriter(prompt, max_new_tokens=50, do_sample=False)
        full_sentence = result[0]["generated_text"].split(marker)[-1].strip()
    except Exception:
        # Deliberate best-effort: rewriting is cosmetic, so the raw answer is
        # still returned, but the failure is logged instead of being hidden.
        logging.getLogger(__name__).exception(
            "Answer rewriting failed; returning the short answer"
        )
        return short_answer

    # An empty generation would give the user (and the TTS step) nothing.
    return full_sentence or short_answer
94
+
95
def answer_question_from_image(image, question):
    """Answer a question about an image and synthesize the answer as speech.

    Args:
        image: Image to query (a PIL image when called from the Gradio UI),
            or ``None`` when nothing was uploaded.
        question: Question about the image. ``None`` or a blank string is
            treated as missing.

    Returns:
        A ``(text, audio_path)`` tuple. ``text`` is the rewritten answer, a
        prompt asking for input, or the answer followed by an audio error
        message. ``audio_path`` is the path of a temporary ``.mp3`` file, or
        ``None`` when no audio was produced.
    """
    # `question or ""` guards against None, which has no .strip().
    if image is None or not (question or "").strip():
        return "Please upload an image and ask a question.", None

    # Non-RGB inputs (RGBA, palette, greyscale) are converted so the processor
    # always receives three colour channels. Objects without a `mode`
    # attribute are passed through untouched.
    if getattr(image, "mode", "RGB") != "RGB":
        image = image.convert("RGB")

    # Run the VQA model and map the top-scoring logit to its label.
    inputs = vqa_processor(image, question, return_tensors="pt")
    with torch.no_grad():
        outputs = vqa_model(**inputs)
    predicted_id = outputs.logits.argmax(-1).item()
    short_answer = vqa_model.config.id2label[predicted_id]

    # Rewrite the short label as a full sentence.
    full_answer = rewrite_answer(question, short_answer)

    # Generate TTS audio.
    audio_path = None
    try:
        tts = gTTS(text=full_answer)
        # Reserve a unique path and close the handle before gTTS writes to it:
        # a file that is still open cannot be reopened by name on Windows.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
            audio_path = tmp.name
        tts.save(audio_path)
    except Exception as e:
        # delete=False means nothing else cleans up a partially written file.
        if audio_path is not None:
            try:
                os.remove(audio_path)
            except OSError:
                pass
        return f"Answer: {full_answer}\n\n⚠️ Audio generation error: {e}", None

    return full_answer, audio_path
119
+
120
def process_image_question(image: Image.Image, question: str):
    """Gradio callback: forward the inputs to ``answer_question_from_image``.

    Returns the ``(answer, audio_path)`` pair produced by that function.
    """
    text, speech_file = answer_question_from_image(image, question)
    return (text, speech_file)
123
+
124
# Input widgets: the image to inspect and the question to ask about it.
_image_input = gr.Image(type="pil", label="Upload Image")
_question_input = gr.Textbox(
    lines=2,
    placeholder="Ask a question about the image...",
    label="Question",
)

# Output widgets: the answer as text and as an audio file path.
_answer_output = gr.Textbox(label="Answer", lines=5)
_audio_output = gr.Audio(label="Answer (Audio)", type="filepath")

gui = gr.Interface(
    fn=process_image_question,
    inputs=[_image_input, _question_input],
    outputs=[_answer_output, _audio_output],
    title="🧠 Image QA with Voice",
    description="Upload an image and ask a question. You'll get a detailed text + spoken answer.",
)

# Mount the Gradio UI on the FastAPI app at the root path.
app = gr.mount_gradio_app(app, gui, path="/")
139
+
140
  @app.get("/")
141
  def home():
142
  return RedirectResponse(url="/")