preston-cell committed on
Commit
5a2766f
·
verified ·
1 Parent(s): 84402c4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +10 -3
app.py CHANGED
@@ -47,10 +47,17 @@ doge_generation_config = GenerationConfig(
47
  repetition_penalty=1.0
48
  )
49
 
50
- # Load and trim speaker embedding for SpeechT5 (must be size 600)
51
  embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
52
- speaker_embedding = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)[:, :600]
 
 
 
 
 
53
 
 
 
54
 
55
  def process_image(image):
56
  try:
@@ -109,7 +116,7 @@ iface = gr.Interface(
109
  gr.Textbox(label="Extracted Text (OCR)"),
110
  gr.Textbox(label="Generated Context")
111
  ],
112
- title="SeeSay Contextualizer with Doge-320M",
113
  description="Upload an image to generate a caption, extract text (OCR), generate context using Doge, and turn it into speech using SpeechT5."
114
  )
115
 
 
47
  repetition_penalty=1.0
48
  )
49
 
50
+ # Load and normalize speaker embedding to 600 dimensions
51
  embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
52
+ speaker_embedding = None
53
+ for i in range(len(embeddings_dataset)):
54
+ vec = embeddings_dataset[i]["xvector"]
55
+ if len(vec) >= 600:
56
+ speaker_embedding = torch.tensor(vec[:600]).unsqueeze(0)
57
+ break
58
 
59
+ if speaker_embedding is None:
60
+ raise ValueError("No suitable speaker embedding of at least 600 dimensions found.")
61
 
62
  def process_image(image):
63
  try:
 
116
  gr.Textbox(label="Extracted Text (OCR)"),
117
  gr.Textbox(label="Generated Context")
118
  ],
119
+ title="SeeSay",
120
  description="Upload an image to generate a caption, extract text (OCR), generate context using Doge, and turn it into speech using SpeechT5."
121
  )
122