ikraamkb committed
Commit 5e640af · verified · 1 Parent(s): 6ce7adf

Update appImage.py

Files changed (1)
  1. appImage.py +61 -45
appImage.py CHANGED
@@ -1,60 +1,72 @@
- """from fastapi import FastAPI
- from fastapi.responses import RedirectResponse
- import gradio as gr
- from transformers import pipeline, ViltProcessor, ViltForQuestionAnswering, AutoTokenizer, AutoModelForCausalLM
+ """
+ ### ✅ appImage.py — Image QA Backend (Cleaned)
+ from fastapi import FastAPI
+ from fastapi.responses import RedirectResponse, JSONResponse, FileResponse
+ import os
  from PIL import Image
+ from transformers import ViltProcessor, ViltForQuestionAnswering, pipeline
+ from gtts import gTTS
+ import easyocr
  import torch
- import fitz # PyMuPDF for PDF
+ import tempfile
+ import numpy as np
+ from io import BytesIO
+
  app = FastAPI()

- # ========== Image QA Setup ==========
  vqa_processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
  vqa_model = ViltForQuestionAnswering.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
- def answer_question_from_image(image, question):
-     if image is None or not question.strip():
-         return "Please upload an image and ask a question."
-     inputs = vqa_processor(image, question, return_tensors="pt")
-     with torch.no_grad():
-         outputs = vqa_model(**inputs)
-     predicted_id = outputs.logits.argmax(-1).item()
-     return vqa_model.config.id2label[predicted_id]
- # ========== Gradio Interfaces ==========
-
- img_interface = gr.Interface(
-     fn=answer_question_from_image,
-     inputs=[gr.Image(label="Upload Image"), gr.Textbox(label="Ask a Question")],
-     outputs="text",
-     title="Image Question Answering"
- )
- # ========== Combine and Mount ==========
- demo = gr.TabbedInterface( img_interface , "Image QA")
- app = gr.mount_gradio_app(app, demo, path="/")
- @app.get("/")
- def root():
-     return RedirectResponse(url="/") """
- """from transformers import ViltProcessor, ViltForQuestionAnswering
- import torch
+ captioner = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning")
+ reader = easyocr.Reader(['en', 'fr'])

- # Load image QA model once
- vqa_processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
- vqa_model = ViltForQuestionAnswering.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
+ def classify_question(question: str):
+     q = question.lower()
+     if any(w in q for w in ["text", "say", "written", "read"]):
+         return "ocr"
+     if any(w in q for w in ["caption", "describe", "what is in the image"]):
+         return "caption"
+     return "vqa"

  def answer_question_from_image(image, question):
      if image is None or not question.strip():
-         return "Please upload an image and ask a question."
-
-     inputs = vqa_processor(image, question, return_tensors="pt")
-     with torch.no_grad():
-         outputs = vqa_model(**inputs)
-
-     predicted_id = outputs.logits.argmax(-1).item()
-     return vqa_model.config.id2label[predicted_id]"""
- ### ✅ appImage.py — Image QA Backend (Cleaned)
+         return "Please upload an image and ask a question.", None
+
+     mode = classify_question(question)
+
+     try:
+         if mode == "ocr":
+             result = reader.readtext(np.array(image))
+             answer = " ".join([entry[1] for entry in result]) or "No readable text found."
+
+         elif mode == "caption":
+             answer = captioner(image)[0]['generated_text']
+
+         else:
+             inputs = vqa_processor(image, question, return_tensors="pt")
+             with torch.no_grad():
+                 outputs = vqa_model(**inputs)
+             predicted_id = outputs.logits.argmax(-1).item()
+             answer = vqa_model.config.id2label[predicted_id]
+
+         tts = gTTS(text=answer)
+         with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
+             tts.save(tmp.name)
+         return answer, tmp.name
+
+     except Exception as e:
+         return f"Error: {e}", None
+
+ @app.get("/")
+ def home():
+     return RedirectResponse(url="/templates/home.html")"""
  from fastapi import FastAPI
  from fastapi.responses import RedirectResponse, JSONResponse, FileResponse
  import os
  from PIL import Image
- from transformers import ViltProcessor, ViltForQuestionAnswering, pipeline
+ from transformers import (
+     ViltProcessor, ViltForQuestionAnswering,
+     AutoProcessor, GitForCausalLM
+ )
  from gtts import gTTS
  import easyocr
  import torch
@@ -64,9 +76,11 @@ from io import BytesIO

  app = FastAPI()

+ # Models
  vqa_processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
  vqa_model = ViltForQuestionAnswering.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
- captioner = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning")
+ caption_processor = AutoProcessor.from_pretrained("microsoft/git-large-coco")
+ caption_model = GitForCausalLM.from_pretrained("microsoft/git-large-coco")
  reader = easyocr.Reader(['en', 'fr'])

  def classify_question(question: str):
@@ -89,7 +103,9 @@ def answer_question_from_image(image, question):
              answer = " ".join([entry[1] for entry in result]) or "No readable text found."

          elif mode == "caption":
-             answer = captioner(image)[0]['generated_text']
+             image_tensor = caption_processor(images=image, return_tensors="pt").pixel_values
+             generated_ids = caption_model.generate(image_tensor, max_length=64)
+             answer = caption_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

          else:
              inputs = vqa_processor(image, question, return_tensors="pt")
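
For reference, a minimal standalone sketch of the captioning path this commit switches to: GIT (microsoft/git-large-coco) driven through AutoProcessor and GitForCausalLM instead of the previous nlpconnect/vit-gpt2-image-captioning pipeline. The image path is a placeholder and the keyword-style generate call is used here for clarity; neither is part of the repository.

# Hypothetical standalone check of the new "caption" branch (not in the repo).
from PIL import Image
import torch
from transformers import AutoProcessor, GitForCausalLM

caption_processor = AutoProcessor.from_pretrained("microsoft/git-large-coco")
caption_model = GitForCausalLM.from_pretrained("microsoft/git-large-coco")

image = Image.open("example.jpg").convert("RGB")  # placeholder input image

# Same preprocess/generate/decode sequence as the new caption branch in the diff
pixel_values = caption_processor(images=image, return_tensors="pt").pixel_values
with torch.no_grad():
    generated_ids = caption_model.generate(pixel_values=pixel_values, max_length=64)
caption = caption_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(caption)

Outside the caption branch, answer_question_from_image is unchanged by this commit; only the captioning backend moves from the vit-gpt2 pipeline to GIT.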