drewThomasson committed on
Commit
7dcf55b
·
verified ·
1 Parent(s): 5a597ea

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +4 -30
app.py CHANGED
@@ -8,14 +8,11 @@ import spaces
8
  import torch
9
  import torchaudio
10
  from generator import Segment, load_csm_1b
11
- from huggingface_hub import hf_hub_download, login
12
  from watermarking import watermark
13
 
14
- api_key = os.getenv("HF_TOKEN")
15
  gpu_timeout = int(os.getenv("GPU_TIMEOUT", 60))
16
- CSM_1B_HF_WATERMARK = list(map(int, os.getenv("WATERMARK_KEY").split(" ")))
17
-
18
- login(token=api_key)
19
 
20
  SPACE_INTRO_TEXT = """\
21
  # Sesame CSM 1B
@@ -24,12 +21,6 @@ Generate from CSM 1B (Conversational Speech Model).
24
  Code is available on GitHub: [SesameAILabs/csm](https://github.com/SesameAILabs/csm).
25
  Checkpoint is [hosted on HuggingFace](https://huggingface.co/sesame/csm-1b).
26
 
27
- Try out our interactive demo [sesame.com/voicedemo](https://www.sesame.com/voicedemo),
28
- this uses a fine-tuned variant of CSM.
29
-
30
- The model has some capacity for non-English languages due to data contamination in the training
31
- data, but it is likely not to perform well.
32
-
33
  ---
34
 
35
  """
@@ -87,20 +78,6 @@ SPEAKER_PROMPTS = {
87
  ),
88
  "audio": "prompts/read_speech_b.wav",
89
  },
90
- "read_speech_c": {
91
- "text": (
92
- "All passed so quickly, there was so much going on around him, the Tree quite forgot "
93
- "to look to himself."
94
- ),
95
- "audio": "prompts/read_speech_c.wav",
96
- },
97
- "read_speech_d": {
98
- "text": (
99
- "Suddenly I was back in the old days Before you felt we ought to drift apart. It was "
100
- "some trick-the way your eyebrows raise."
101
- ),
102
- "audio": "prompts/read_speech_d.wav",
103
- },
104
  }
105
 
106
  device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -156,7 +133,7 @@ def infer(
156
  audio_prompt_speaker_b,
157
  gen_conversation_input,
158
  ) -> tuple[np.ndarray, int]:
159
- # Estimate token limit, otherwise failure might happen after many utterances have been generated.
160
  if len(gen_conversation_input.strip() + text_prompt_speaker_a.strip() + text_prompt_speaker_b.strip()) >= 2000:
161
  raise gr.Error("Prompts and conversation too long.", duration=30)
162
 
@@ -202,10 +179,7 @@ def _infer(
202
  audio_tensors = [segment.audio for segment in generated_segments]
203
  audio_tensor = torch.cat(audio_tensors, dim=0)
204
 
205
- # This applies an imperceptible watermark to identify audio as AI-generated.
206
- # Watermarking ensures transparency, dissuades misuse, and enables traceability.
207
- # Please be a responsible AI citizen and keep the watermarking in place.
208
- # If using CSM 1B in another application, use your own private key and keep it secret.
209
  audio_tensor, wm_sample_rate = watermark(
210
  generator._watermarker, audio_tensor, generator.sample_rate, CSM_1B_HF_WATERMARK
211
  )
 
8
  import torch
9
  import torchaudio
10
  from generator import Segment, load_csm_1b
 
11
  from watermarking import watermark
12
 
13
+ # Simplified environment variable handling
14
  gpu_timeout = int(os.getenv("GPU_TIMEOUT", 60))
15
+ CSM_1B_HF_WATERMARK = list(map(int, os.getenv("WATERMARK_KEY", "0 0 0").split(" ")))
 
 
16
 
17
  SPACE_INTRO_TEXT = """\
18
  # Sesame CSM 1B
 
21
  Code is available on GitHub: [SesameAILabs/csm](https://github.com/SesameAILabs/csm).
22
  Checkpoint is [hosted on HuggingFace](https://huggingface.co/sesame/csm-1b).
23
 
 
 
 
 
 
 
24
  ---
25
 
26
  """
 
78
  ),
79
  "audio": "prompts/read_speech_b.wav",
80
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
  }
82
 
83
  device = "cuda" if torch.cuda.is_available() else "cpu"
 
133
  audio_prompt_speaker_b,
134
  gen_conversation_input,
135
  ) -> tuple[np.ndarray, int]:
136
+ # Estimate token limit
137
  if len(gen_conversation_input.strip() + text_prompt_speaker_a.strip() + text_prompt_speaker_b.strip()) >= 2000:
138
  raise gr.Error("Prompts and conversation too long.", duration=30)
139
 
 
179
  audio_tensors = [segment.audio for segment in generated_segments]
180
  audio_tensor = torch.cat(audio_tensors, dim=0)
181
 
182
+ # Watermarking
 
 
 
183
  audio_tensor, wm_sample_rate = watermark(
184
  generator._watermarker, audio_tensor, generator.sample_rate, CSM_1B_HF_WATERMARK
185
  )