preston-cell committed
Commit a483c36 · verified · 1 Parent(s): dbabbd4

Update app.py

Files changed (1)
  1. app.py +36 -39
app.py CHANGED
@@ -5,39 +5,36 @@ from transformers import (
     AutoModelForCausalLM,
     AutoTokenizer,
     GenerationConfig,
+    TextStreamer,
     set_seed
 )
 from datasets import load_dataset
 import torch
 import numpy as np
 
-# Set seed for reproducibility
 set_seed(42)
 
-# Load BLIP model for image captioning
+# Device and dtype
+device = "cuda" if torch.cuda.is_available() else "cpu"
+dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+
+# Load image captioning model
 caption_model = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
 
-# Load SpeechT5 model for text-to-speech
+# Load SpeechT5 text-to-speech model
 synthesiser = pipeline("text-to-speech", model="microsoft/speecht5_tts")
 
-# Load Florence-2 model for OCR
-ocr_device = "cuda" if torch.cuda.is_available() else "cpu"
-ocr_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+# Load OCR model (Florence-2)
 ocr_model = AutoModelForCausalLM.from_pretrained(
-    "microsoft/Florence-2-large",
-    torch_dtype=ocr_dtype,
-    trust_remote_code=True
-).to(ocr_device)
+    "microsoft/Florence-2-large", torch_dtype=dtype, trust_remote_code=True
+).to(device)
 ocr_processor = AutoProcessor.from_pretrained("microsoft/Florence-2-large", trust_remote_code=True)
 
-# Load Doge-320M-Instruct model for context generation
+# Load Doge-320M-Instruct for context generation
 doge_tokenizer = AutoTokenizer.from_pretrained("SmallDoge/Doge-320M-Instruct")
 doge_model = AutoModelForCausalLM.from_pretrained(
-    "SmallDoge/Doge-320M-Instruct",
-    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
-    trust_remote_code=True
-).to("cuda" if torch.cuda.is_available() else "cpu")
-
+    "SmallDoge/Doge-320M-Instruct", trust_remote_code=True
+).to(device)
 doge_generation_config = GenerationConfig(
     max_new_tokens=100,
     use_cache=True,
@@ -47,26 +44,26 @@ doge_generation_config = GenerationConfig(
     repetition_penalty=1.0
 )
 
-# Load and pad/truncate speaker embedding to exactly 600 dimensions
-embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
-raw_vec = embeddings_dataset[0]["xvector"]
-
-# Force embedding to 600 dimensions
-if len(raw_vec) > 600:
-    raw_vec = raw_vec[:600]
-elif len(raw_vec) < 600:
-    raw_vec = raw_vec + [0.0] * (600 - len(raw_vec))
-
-speaker_embedding = torch.tensor(raw_vec, dtype=torch.float32).unsqueeze(0)  # shape [1, 600]
-assert speaker_embedding.shape == (1, 600), f"Speaker embedding shape is {speaker_embedding.shape}, expected (1, 600)"
+# Load speaker embedding with exactly 600 values
+speaker_embedding = None
+embedding_data = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
+for entry in embedding_data:
+    vec = entry["xvector"]
+    if len(vec) >= 600:
+        speaker_embedding = torch.tensor(vec[:600], dtype=torch.float32).unsqueeze(0)  # Shape: [1, 600]
+        break
+if speaker_embedding is None:
+    raise ValueError("No suitable speaker embedding of at least 600 dimensions found.")
+assert speaker_embedding.shape == (1, 600), f"Expected shape (1, 600), got {speaker_embedding.shape}"
+
 
 def process_image(image):
     try:
-        # Step 1: Generate caption
+        # 1. Caption the image
         caption = caption_model(image)[0]['generated_text']
 
-        # Step 2: OCR to extract text
-        inputs = ocr_processor(text="<OCR>", images=image, return_tensors="pt").to(ocr_device, ocr_dtype)
+        # 2. OCR with Florence-2
+        inputs = ocr_processor(text="<OCR>", images=image, return_tensors="pt").to(device, dtype)
         generated_ids = ocr_model.generate(
             input_ids=inputs["input_ids"],
             pixel_values=inputs["pixel_values"],
@@ -76,23 +73,23 @@ def process_image(image):
         )
         extracted_text = ocr_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
 
-        # Step 3: Generate context using Doge model
+        # 3. Prompt Doge model for context generation
         prompt = f"Determine the context of this image based on the caption and extracted text.\nCaption: {caption}\nExtracted text: {extracted_text}\nContext:"
+        prompt = prompt[:600]  # Ensure prompt isn't too long
         conversation = [{"role": "user", "content": prompt}]
         doge_inputs = doge_tokenizer.apply_chat_template(
             conversation=conversation,
             tokenize=True,
             return_tensors="pt"
-        ).to(doge_model.device)
+        ).to(device)
 
-        doge_outputs = doge_model.generate(
-            doge_inputs,
+        doge_output = doge_model.generate(
+            input_ids=doge_inputs,
             generation_config=doge_generation_config
         )
-
-        context = doge_tokenizer.decode(doge_outputs[0], skip_special_tokens=True).strip()
-
-        # Step 4: Convert context to speech
+        context = doge_tokenizer.decode(doge_output[0], skip_special_tokens=True).strip()
+
+        # 4. Convert context to speech
         speech = synthesiser(
             context,
             forward_params={"speaker_embeddings": speaker_embedding}
@@ -118,7 +115,7 @@ iface = gr.Interface(
         gr.Textbox(label="Generated Context")
     ],
     title="SeeSay Contextualizer with Doge-320M",
-    description="Upload an image to generate a caption, extract text (OCR), generate context using Doge, and turn it into speech using SpeechT5."
+    description="Upload an image to caption it, extract text, generate context, and hear the result as speech."
 )
 
-iface.launch()
+iface.launch(share=True)
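
A minimal standalone sketch (not part of the commit) that mirrors the speaker-embedding selection added above, assuming only the datasets and torch packages are installed. It reports whether the validation split actually contains an xvector with at least 600 values, since the new loop in app.py raises a ValueError otherwise; the 600-value threshold is the app's own assumption, not a requirement of the dataset.

# Standalone check of the speaker-embedding selection introduced in this commit.
# Mirrors the new loop in app.py outside the Gradio app.
import torch
from datasets import load_dataset

embedding_data = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")

speaker_embedding = None
for entry in embedding_data:
    vec = entry["xvector"]
    if len(vec) >= 600:
        speaker_embedding = torch.tensor(vec[:600], dtype=torch.float32).unsqueeze(0)
        break

if speaker_embedding is None:
    # No qualifying vector: report the lengths actually present in a small sample.
    observed = {len(entry["xvector"]) for entry in embedding_data.select(range(50))}
    print(f"No xvector with >= 600 values; observed lengths: {sorted(observed)}")
else:
    print(f"Selected speaker embedding shape: {tuple(speaker_embedding.shape)}")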