preston-cell committed
Commit d0aa231 · verified · 1 Parent(s): f4f3543

Update app.py

Files changed (1)
  1. app.py +23 -56
app.py CHANGED
@@ -1,39 +1,34 @@
 import gradio as gr
 from transformers import (
     pipeline,
-    AutoProcessor,
-    AutoModelForCausalLM,
     AutoTokenizer,
+    AutoModelForCausalLM,
     GenerationConfig,
     set_seed
 )
-from datasets import load_dataset
 import torch
 import numpy as np
+import pytesseract
+from PIL import Image
+from datasets import load_dataset
 
 set_seed(42)
 
-# Device and dtype
+# Device
 device = "cuda" if torch.cuda.is_available() else "cpu"
-dtype = torch.float16 if torch.cuda.is_available() else torch.float32
 
-# Load image captioning model
+# Image Captioning (BLIP)
 caption_model = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
 
-# Load SpeechT5 text-to-speech model
+# Text-to-Speech without speaker embeddings
 synthesiser = pipeline("text-to-speech", model="microsoft/speecht5_tts")
 
-# Load OCR model (Florence-2)
-ocr_model = AutoModelForCausalLM.from_pretrained(
-    "microsoft/Florence-2-large", torch_dtype=dtype, trust_remote_code=True
-).to(device)
-ocr_processor = AutoProcessor.from_pretrained("microsoft/Florence-2-large", trust_remote_code=True)
-
-# Load Doge-320M-Instruct for context generation
+# Doge-320M-Instruct for Context Generation
 doge_tokenizer = AutoTokenizer.from_pretrained("SmallDoge/Doge-320M-Instruct")
 doge_model = AutoModelForCausalLM.from_pretrained(
     "SmallDoge/Doge-320M-Instruct", trust_remote_code=True
 ).to(device)
+
 doge_generation_config = GenerationConfig(
     max_new_tokens=100,
     use_cache=True,
@@ -43,44 +38,22 @@ doge_generation_config = GenerationConfig(
     repetition_penalty=1.0
 )
 
-# Load speaker embedding with fallback
-speaker_embedding = None
-embedding_data = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
-
-for entry in embedding_data:
-    vec = entry["xvector"]
-    if len(vec) >= 600:
-        speaker_embedding = torch.tensor(vec[:600], dtype=torch.float32).unsqueeze(0)
-        break
-
-# Fallback: use a zero vector if none found
-if speaker_embedding is None:
-    print("⚠️ No suitable speaker embedding found. Using default 600-dim zero vector.")
-    speaker_embedding = torch.zeros(1, 600, dtype=torch.float32)
-
-# Ensure correct shape
-assert speaker_embedding.shape == (1, 600), f"Expected shape (1, 600), got {speaker_embedding.shape}"
-
+def extract_text_with_tesseract(image):
+    return pytesseract.image_to_string(image)
 
 def process_image(image):
     try:
-        # 1. Caption the image
+        # 1. Caption
         caption = caption_model(image)[0]['generated_text']
 
-        # 2. OCR with Florence-2
-        inputs = ocr_processor(text="<OCR>", images=image, return_tensors="pt").to(device, dtype)
-        generated_ids = ocr_model.generate(
-            input_ids=inputs["input_ids"],
-            pixel_values=inputs["pixel_values"],
-            max_new_tokens=4096,
-            num_beams=3,
-            do_sample=False
-        )
-        extracted_text = ocr_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+        # 2. OCR
+        extracted_text = extract_text_with_tesseract(image)
 
-        # 3. Prompt Doge model for context generation
-        prompt = f"Determine the context of this image based on the caption and extracted text.\nCaption: {caption}\nExtracted text: {extracted_text}\nContext:"
-        prompt = prompt[:600] # Prevent tensor mismatch error
+        # 3. Context with Doge (truncate input)
+        prompt = (
+            f"Determine the context of this image.\n"
+            f"Caption: {caption[:200]}\nExtracted text: {extracted_text[:200]}\nContext:"
+        )
         conversation = [{"role": "user", "content": prompt}]
         doge_inputs = doge_tokenizer.apply_chat_template(
             conversation=conversation,
@@ -94,12 +67,8 @@ def process_image(image):
         )
         context = doge_tokenizer.decode(doge_output[0], skip_special_tokens=True).strip()
 
-        # 4. Convert context to speech
-        speech = synthesiser(
-            context,
-            forward_params={"speaker_embeddings": speaker_embedding}
-        )
-
+        # 4. Text-to-Speech (no embeddings)
+        speech = synthesiser(context)
         audio = np.array(speech["audio"])
         rate = speech["sampling_rate"]
 
@@ -108,8 +77,6 @@ def process_image(image):
     except Exception as e:
         return None, f"Error: {str(e)}", "", ""
 
-
-# Gradio Interface
 iface = gr.Interface(
     fn=process_image,
     inputs=gr.Image(type='pil', label="Upload an Image"),
@@ -119,8 +86,8 @@ iface = gr.Interface(
         gr.Textbox(label="Extracted Text (OCR)"),
         gr.Textbox(label="Generated Context")
     ],
-    title="SeeSay Contextualizer with Doge-320M",
-    description="Upload an image to caption it, extract text, generate context, and hear the result as speech."
+    title="SeeSay Contextualizer (Optimized)",
+    description="Upload an image to generate a caption, extract text (OCR), generate context, and hear it spoken."
 )
 
 iface.launch(share=True)
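
The commit drops the Florence-2 OCR model and routes text extraction through Tesseract instead. A minimal standalone sketch of that path, assuming the Tesseract binary is installed alongside the pytesseract package (the file name sample.png is only a placeholder):

import pytesseract
from PIL import Image

def extract_text_with_tesseract(image):
    # Run Tesseract OCR on a PIL image and return the recognized text.
    return pytesseract.image_to_string(image)

# Placeholder usage; any local image file works here.
img = Image.open("sample.png").convert("RGB")
print(extract_text_with_tesseract(img))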
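The lines between the conversation=conversation, argument and the closing parenthesis shown just before doge_tokenizer.decode(...) fall outside the hunks, so the tokenization options and the doge_model.generate(...) call are not visible in this diff. A plausible wiring, reusing the objects defined in app.py and assuming the chat template is tokenized straight to tensors and decoded with the GenerationConfig defined at the top of the file:

# Sketch only; these argument names are assumptions, not the file's actual code.
doge_inputs = doge_tokenizer.apply_chat_template(
    conversation=conversation,
    tokenize=True,
    return_tensors="pt",
    add_generation_prompt=True
).to(device)

doge_output = doge_model.generate(
    doge_inputs,
    generation_config=doge_generation_config
)
context = doge_tokenizer.decode(doge_output[0], skip_special_tokens=True).strip()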
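With the speaker-embedding plumbing removed, the text-to-speech pipeline is called with only the context string and returns a dict carrying the waveform and sample rate. The remaining audio handling just converts that into the (rate, array) tuple a gr.Audio output accepts; a minimal sketch of that conversion (the helper name is illustrative, not from app.py):

import numpy as np

def to_gradio_audio(speech):
    # `speech` is the dict returned by the "text-to-speech" pipeline,
    # e.g. {"audio": <numpy array>, "sampling_rate": 16000}.
    audio = np.array(speech["audio"])
    rate = speech["sampling_rate"]
    # gr.Audio accepts a (sample_rate, waveform) tuple.
    return rate, audio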