Spaces: Running on Zero

switch to loading with from_pretrained

Files changed:
- app.py +1 -2
- generator.py +4 -12
- models.py +17 -3
app.py
CHANGED
@@ -102,8 +102,7 @@ SPEAKER_PROMPTS = {
 }
 
 device = "cuda" if torch.cuda.is_available() else "cpu"
-model_path = hf_hub_download(repo_id="sesame/csm-1b", filename="ckpt.pt")
-generator = load_csm_1b(model_path, device)
+generator = load_csm_1b(device=device)
 
 
 @spaces.GPU(duration=gpu_timeout)
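Note: the call site no longer resolves a checkpoint path; the download now happens inside load_csm_1b via the Hub mixin. A minimal sketch of loading the model class directly, assuming the standard huggingface_hub mixin options (the revision, cache_dir, and token arguments are generic from_pretrained options, not something this commit adds):

import torch

from models import Model

# from_pretrained caches the download under HF_HOME, so a Space restart
# reuses the local copy instead of re-fetching the weights.
model = Model.from_pretrained(
    "sesame/csm-1b",
    revision="main",             # hypothetical pin; a commit hash also works
    cache_dir="/data/hf-cache",  # hypothetical cache location
    token=None,                  # supply an HF token here if the repo is gated
)
model.to(device="cuda" if torch.cuda.is_available() else "cpu", dtype=torch.bfloat16)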
generator.py
CHANGED
@@ -5,7 +5,7 @@ from typing import List, Tuple
 import torch
 import torchaudio
 from huggingface_hub import hf_hub_download
-from models import Model, ModelArgs
+from models import Model
 from moshi.models import loaders
 from tokenizers.processors import TemplateProcessing
 from transformers import AutoTokenizer
@@ -166,17 +166,9 @@ class Generator:
         return audio
 
 
-def load_csm_1b(ckpt_path: str = "ckpt.pt", device: str = "cuda") -> Generator:
-    model_args = ModelArgs(
-        backbone_flavor="llama-1B",
-        decoder_flavor="llama-100M",
-        text_vocab_size=128256,
-        audio_vocab_size=2051,
-        audio_num_codebooks=32,
-    )
-    model = Model(model_args).to(device=device, dtype=torch.bfloat16)
-    state_dict = torch.load(ckpt_path)
-    model.load_state_dict(state_dict)
+def load_csm_1b(device: str = "cuda") -> Generator:
+    model = Model.from_pretrained("sesame/csm-1b")
+    model.to(device=device, dtype=torch.bfloat16)
 
     generator = Generator(model)
     return generator
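Note: with checkpoint handling gone, the loader's public surface is just a device string. A minimal usage sketch; the generate() signature and sample_rate attribute follow the upstream csm README rather than anything shown in this diff, so treat them as assumptions:

import torchaudio

from generator import load_csm_1b

generator = load_csm_1b(device="cuda")

# Generate a short utterance for speaker 0 with no conversational context.
audio = generator.generate(
    text="Hello from Sesame.",
    speaker=0,
    context=[],
    max_audio_length_ms=10_000,
)
torchaudio.save("audio.wav", audio.unsqueeze(0).cpu(), generator.sample_rate)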
models.py
CHANGED
@@ -1,8 +1,9 @@
-from dataclasses import dataclass
+from dataclasses import asdict, dataclass
 
 import torch
 import torch.nn as nn
 import torchtune
+from huggingface_hub import PyTorchModelHubMixin
 from torchtune.models import llama3_2
 
 
@@ -95,7 +96,20 @@ class ModelArgs:
     audio_num_codebooks: int
 
 
-class Model(nn.Module):
+class Model(
+    nn.Module,
+    PyTorchModelHubMixin,
+    repo_url="https://github.com/SesameAILabs/csm",
+    pipeline_tag="text-to-speech",
+    license="apache-2.0",
+    coders={
+        # Tells the class how to serialize and deserialize config.json
+        ModelArgs: (
+            lambda x: asdict(x),  # Encoder: how to convert a `ModelArgs` to a valid jsonable value?
+            lambda data: ModelArgs(**data),  # Decoder: how to reconstruct a `ModelArgs` from a dictionary?
+        )
+    }
+):
     def __init__(self, args: ModelArgs):
         super().__init__()
         self.args = args
@@ -110,7 +124,7 @@ class Model(nn.Module):
         self.codebook0_head = nn.Linear(backbone_dim, args.audio_vocab_size, bias=False)
         self.audio_head = nn.Parameter(torch.empty(args.audio_num_codebooks - 1, decoder_dim, args.audio_vocab_size))
 
-    def setup_caches(self, max_batch_size: int) -> torch.Tensor:
+    def setup_caches(self, max_batch_size: int) -> None:
         """Setup KV caches and return a causal mask."""
         dtype = next(self.parameters()).dtype
         device = next(self.parameters()).device
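Note: the coders entry is what makes the dataclass config round-trip: on save, the encoder (asdict) flattens ModelArgs into config.json; on load, the decoder rebuilds a ModelArgs and the mixin passes it to __init__. A minimal sketch of that round-trip, assuming a hypothetical local directory (the save_pretrained call is illustrative, not part of this commit):

from models import Model, ModelArgs

args = ModelArgs(
    backbone_flavor="llama-1B",
    decoder_flavor="llama-100M",
    text_vocab_size=128256,
    audio_vocab_size=2051,
    audio_num_codebooks=32,
)
model = Model(args)

# save_pretrained writes the weights plus a config.json produced by the
# ModelArgs encoder; from_pretrained reads config.json back, decodes it
# into a ModelArgs, and calls Model(args) with it.
model.save_pretrained("./csm-1b-local")            # hypothetical directory
reloaded = Model.from_pretrained("./csm-1b-local")
assert reloaded.args == args  # dataclass equality confirms the round-trip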