Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -47,17 +47,11 @@ doge_generation_config = GenerationConfig(
|
|
47 |
repetition_penalty=1.0
|
48 |
)
|
49 |
|
50 |
-
# Load and
|
51 |
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
if len(vec) >= 600:
|
56 |
-
speaker_embedding = torch.tensor(vec[:600]).unsqueeze(0)
|
57 |
-
break
|
58 |
-
|
59 |
-
if speaker_embedding is None:
|
60 |
-
raise ValueError("No suitable speaker embedding of at least 600 dimensions found.")
|
61 |
|
62 |
def process_image(image):
|
63 |
try:
|
@@ -116,7 +110,7 @@ iface = gr.Interface(
|
|
116 |
gr.Textbox(label="Extracted Text (OCR)"),
|
117 |
gr.Textbox(label="Generated Context")
|
118 |
],
|
119 |
-
title="SeeSay",
|
120 |
description="Upload an image to generate a caption, extract text (OCR), generate context using Doge, and turn it into speech using SpeechT5."
|
121 |
)
|
122 |
|
|
|
47 |
repetition_penalty=1.0
|
48 |
)
|
49 |
|
50 |
+
# Load and pad/truncate speaker embedding to exactly 600 dimensions
|
51 |
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
|
52 |
+
vec = embeddings_dataset[0]["xvector"]
|
53 |
+
vec = vec[:600] if len(vec) >= 600 else vec + [0.0] * (600 - len(vec))
|
54 |
+
speaker_embedding = torch.tensor(vec, dtype=torch.float32).unsqueeze(0)
|
|
|
|
|
|
|
|
|
|
|
|
|
55 |
|
56 |
def process_image(image):
|
57 |
try:
|
|
|
110 |
gr.Textbox(label="Extracted Text (OCR)"),
|
111 |
gr.Textbox(label="Generated Context")
|
112 |
],
|
113 |
+
title="SeeSay Contextualizer with Doge-320M",
|
114 |
description="Upload an image to generate a caption, extract text (OCR), generate context using Doge, and turn it into speech using SpeechT5."
|
115 |
)
|
116 |
|