Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -47,10 +47,17 @@ doge_generation_config = GenerationConfig(
|
|
47 |
repetition_penalty=1.0
|
48 |
)
|
49 |
|
50 |
-
# Load and
|
51 |
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
|
52 |
-
speaker_embedding =
|
|
|
|
|
|
|
|
|
|
|
53 |
|
|
|
|
|
54 |
|
55 |
def process_image(image):
|
56 |
try:
|
@@ -109,7 +116,7 @@ iface = gr.Interface(
|
|
109 |
gr.Textbox(label="Extracted Text (OCR)"),
|
110 |
gr.Textbox(label="Generated Context")
|
111 |
],
|
112 |
-
title="SeeSay
|
113 |
description="Upload an image to generate a caption, extract text (OCR), generate context using Doge, and turn it into speech using SpeechT5."
|
114 |
)
|
115 |
|
|
|
47 |
repetition_penalty=1.0
|
48 |
)
|
49 |
|
50 |
+
# Load a speaker embedding (x-vector) used to condition SpeechT5's voice.
#
# The CMU ARCTIC x-vectors are 512-dimensional (per the dataset card), which
# is also SpeechT5's default `speaker_embedding_dim`. The previous check
# required at least 600 dimensions, which no row can satisfy, so the module
# always raised ValueError at import time. Adjust SPEAKER_EMBEDDING_DIM only
# if the TTS checkpoint in use is configured with a different size.
SPEAKER_EMBEDDING_DIM = 512

embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")

speaker_embedding = None
for row in embeddings_dataset:
    xvector = row["xvector"]
    if len(xvector) >= SPEAKER_EMBEDDING_DIM:
        # Shape (1, SPEAKER_EMBEDDING_DIM): unsqueeze adds the leading batch axis.
        speaker_embedding = torch.tensor(xvector[:SPEAKER_EMBEDDING_DIM]).unsqueeze(0)
        break

# Fail fast at startup rather than on the first synthesis request.
if speaker_embedding is None:
    raise ValueError(
        f"No suitable speaker embedding of at least {SPEAKER_EMBEDDING_DIM} dimensions found."
    )
|
61 |
|
62 |
def process_image(image):
|
63 |
try:
|
|
|
116 |
gr.Textbox(label="Extracted Text (OCR)"),
|
117 |
gr.Textbox(label="Generated Context")
|
118 |
],
|
119 |
+
title="SeeSay",
|
120 |
description="Upload an image to generate a caption, extract text (OCR), generate context using Doge, and turn it into speech using SpeechT5."
|
121 |
)
|
122 |
|