preston-cell committed on
Commit
6a2189c
·
verified ·
1 Parent(s): 5a2766f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +5 -11
app.py CHANGED
@@ -47,17 +47,11 @@ doge_generation_config = GenerationConfig(
47
  repetition_penalty=1.0
48
  )
49
 
50
- # Load and normalize speaker embedding to 600 dimensions
51
  embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
52
- speaker_embedding = None
53
- for i in range(len(embeddings_dataset)):
54
- vec = embeddings_dataset[i]["xvector"]
55
- if len(vec) >= 600:
56
- speaker_embedding = torch.tensor(vec[:600]).unsqueeze(0)
57
- break
58
-
59
- if speaker_embedding is None:
60
- raise ValueError("No suitable speaker embedding of at least 600 dimensions found.")
61
 
62
  def process_image(image):
63
  try:
@@ -116,7 +110,7 @@ iface = gr.Interface(
116
  gr.Textbox(label="Extracted Text (OCR)"),
117
  gr.Textbox(label="Generated Context")
118
  ],
119
- title="SeeSay",
120
  description="Upload an image to generate a caption, extract text (OCR), generate context using Doge, and turn it into speech using SpeechT5."
121
  )
122
 
 
47
  repetition_penalty=1.0
48
  )
49
 
50
+ # Load and pad/truncate speaker embedding to exactly 600 dimensions
51
  embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
52
+ vec = embeddings_dataset[0]["xvector"]
53
+ vec = vec[:600] if len(vec) >= 600 else vec + [0.0] * (600 - len(vec))
54
+ speaker_embedding = torch.tensor(vec, dtype=torch.float32).unsqueeze(0)
 
 
 
 
 
 
55
 
56
  def process_image(image):
57
  try:
 
110
  gr.Textbox(label="Extracted Text (OCR)"),
111
  gr.Textbox(label="Generated Context")
112
  ],
113
+ title="SeeSay Contextualizer with Doge-320M",
114
  description="Upload an image to generate a caption, extract text (OCR), generate context using Doge, and turn it into speech using SpeechT5."
115
  )
116