Spaces:
Sleeping
Sleeping
Create run_csm.py
Browse files- run_csm.py +117 -0
run_csm.py
ADDED
@@ -0,0 +1,117 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import torch
|
3 |
+
import torchaudio
|
4 |
+
from huggingface_hub import hf_hub_download
|
5 |
+
from generator import load_csm_1b, Segment
|
6 |
+
from dataclasses import dataclass
|
7 |
+
|
8 |
+
# Disable Triton compilation
|
9 |
+
os.environ["NO_TORCH_COMPILE"] = "1"
|
10 |
+
|
11 |
+
# Default prompts are available at https://hf.co/sesame/csm-1b
|
12 |
+
prompt_filepath_conversational_a = hf_hub_download(
|
13 |
+
repo_id="sesame/csm-1b",
|
14 |
+
filename="prompts/conversational_a.wav"
|
15 |
+
)
|
16 |
+
prompt_filepath_conversational_b = hf_hub_download(
|
17 |
+
repo_id="sesame/csm-1b",
|
18 |
+
filename="prompts/conversational_b.wav"
|
19 |
+
)
|
20 |
+
|
21 |
+
SPEAKER_PROMPTS = {
|
22 |
+
"conversational_a": {
|
23 |
+
"text": (
|
24 |
+
"like revising for an exam I'd have to try and like keep up the momentum because I'd "
|
25 |
+
"start really early I'd be like okay I'm gonna start revising now and then like "
|
26 |
+
"you're revising for ages and then I just like start losing steam I didn't do that "
|
27 |
+
"for the exam we had recently to be fair that was a more of a last minute scenario "
|
28 |
+
"but like yeah I'm trying to like yeah I noticed this yesterday that like Mondays I "
|
29 |
+
"sort of start the day with this not like a panic but like a"
|
30 |
+
),
|
31 |
+
"audio": prompt_filepath_conversational_a
|
32 |
+
},
|
33 |
+
"conversational_b": {
|
34 |
+
"text": (
|
35 |
+
"like a super Mario level. Like it's very like high detail. And like, once you get "
|
36 |
+
"into the park, it just like, everything looks like a computer game and they have all "
|
37 |
+
"these, like, you know, if, if there's like a, you know, like in a Mario game, they "
|
38 |
+
"will have like a question block. And if you like, you know, punch it, a coin will "
|
39 |
+
"come out. So like everyone, when they come into the park, they get like this little "
|
40 |
+
"bracelet and then you can go punching question blocks around."
|
41 |
+
),
|
42 |
+
"audio": prompt_filepath_conversational_b
|
43 |
+
}
|
44 |
+
}
|
45 |
+
|
46 |
+
def load_prompt_audio(audio_path: str, target_sample_rate: int) -> torch.Tensor:
|
47 |
+
audio_tensor, sample_rate = torchaudio.load(audio_path)
|
48 |
+
audio_tensor = audio_tensor.squeeze(0)
|
49 |
+
# Resample is lazy so we can always call it
|
50 |
+
audio_tensor = torchaudio.functional.resample(
|
51 |
+
audio_tensor, orig_freq=sample_rate, new_freq=target_sample_rate
|
52 |
+
)
|
53 |
+
return audio_tensor
|
54 |
+
|
55 |
+
def prepare_prompt(text: str, speaker: int, audio_path: str, sample_rate: int) -> Segment:
|
56 |
+
audio_tensor = load_prompt_audio(audio_path, sample_rate)
|
57 |
+
return Segment(text=text, speaker=speaker, audio=audio_tensor)
|
58 |
+
|
59 |
+
def _select_device() -> str:
    """Return the compute device string, preferring CUDA.

    MPS is deliberately not considered because of its float64 limitations
    (see the original inline note).
    """
    return "cuda" if torch.cuda.is_available() else "cpu"


def main():
    """Generate a short two-speaker conversation with CSM-1B and save it.

    Loads the model, conditions it on the two reference prompts declared in
    SPEAKER_PROMPTS, generates each scripted utterance in turn (feeding prior
    generations back as context), and writes the concatenated audio to
    ``full_conversation.wav`` in the current directory.
    """
    device = _select_device()
    print(f"Using device: {device}")

    # Load model
    generator = load_csm_1b(device)

    # Prepare prompts: speaker ids 0 and 1 here must match the speaker_id
    # values used in the conversation below.
    prompt_segments = [
        prepare_prompt(
            SPEAKER_PROMPTS[prompt_name]["text"],
            speaker_id,
            SPEAKER_PROMPTS[prompt_name]["audio"],
            generator.sample_rate,
        )
        for speaker_id, prompt_name in enumerate(
            ("conversational_a", "conversational_b")
        )
    ]

    # Scripted conversation to synthesize.
    conversation = [
        {"text": "Hey how are you doing?", "speaker_id": 0},
        {"text": "Pretty good, pretty good. How about you?", "speaker_id": 1},
        {"text": "I'm great! So happy to be speaking with you today.", "speaker_id": 0},
        {"text": "Me too! This is some cool stuff, isn't it?", "speaker_id": 1}
    ]

    # Generate each utterance, accumulating prior turns as context so later
    # lines stay prosodically consistent with earlier ones.
    generated_segments = []
    for utterance in conversation:
        print(f"Generating: {utterance['text']}")
        audio_tensor = generator.generate(
            text=utterance['text'],
            speaker=utterance['speaker_id'],
            context=prompt_segments + generated_segments,
            max_audio_length_ms=10_000,
        )
        generated_segments.append(
            Segment(
                text=utterance['text'],
                speaker=utterance['speaker_id'],
                audio=audio_tensor,
            )
        )

    # Concatenate all generations into one waveform and write it out.
    # torchaudio.save expects (channels, samples), hence the unsqueeze.
    all_audio = torch.cat([seg.audio for seg in generated_segments], dim=0)
    torchaudio.save(
        "full_conversation.wav",
        all_audio.unsqueeze(0).cpu(),
        generator.sample_rate
    )
    print("Successfully generated full_conversation.wav")


if __name__ == "__main__":
    main()