thomwolf HF Staff committed on
Commit
a09ccc4
·
1 Parent(s): 33ddae1
Files changed (3) hide show
  1. app.py +97 -3
  2. hello.mp3 +0 -0
  3. requirements.txt +1 -0
app.py CHANGED
@@ -1,7 +1,101 @@
1
  import gradio as gr
 
 
2
 
3
- def greet(name):
4
- return "Hello " + name + "!!"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
- demo = gr.Interface(fn=greet, inputs="text", outputs="text")
7
  demo.launch()
 
1
  import gradio as gr
2
+ import random
3
+ import time
4
 
5
+ from huggingface_hub import hf_hub_download
6
+ import numpy as np
7
+ import sphn
8
+ import torch
9
+
10
+ from moshi.models import loaders
11
+
12
+
13
def seed_all(seed):
    """Seed every random number generator used by the app.

    Seeds PyTorch (CPU and, when available, all CUDA devices), Python's
    ``random`` module and NumPy's global RNG, then switches cuDNN to
    deterministic mode with autotuning disabled.

    Args:
        seed: Integer seed applied to all generators.
    """
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        # manual_seed_all covers every visible GPU, including the current one,
        # so a separate torch.cuda.manual_seed call is not needed.
        torch.cuda.manual_seed_all(seed)
    random.seed(seed)
    np.random.seed(seed)
    # Trade some speed for run-to-run reproducibility of cuDNN kernels.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
22
+
23
+
24
# Fix all RNG seeds once at import time so repeated runs behave the same.
seed_all(42424242)

# Same availability check that seed_all() uses for its CUDA seeding.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Number of codebooks the Mimi model is loaded with.
num_codebooks = 32

# Download the Mimi checkpoint from the Hub and load it in eval mode.
print("loading mimi")
model_file = hf_hub_download(
    loaders.DEFAULT_REPO, "tokenizer-e351c8d8-checkpoint125.safetensors"
)

mimi = loaders.get_mimi(model_file, device, num_codebooks=num_codebooks)
mimi.eval()
print("mimi loaded")
35
+
36
+
37
def mimi_streaming_test(input_wave, max_duration_sec=10.0):
    """Round-trip an audio file through Mimi at several codebook depths.

    The audio is read, resampled to the model's sample rate, truncated to
    ``max_duration_sec``, encoded one frame-sized chunk at a time, and then
    decoded using only the first 1, 2, 4 and 32 codebooks.

    Args:
        input_wave: Path of the audio file to process.
        max_duration_sec: Maximum amount of audio, in seconds, that is encoded.

    Returns:
        A list of ``(sample_rate, waveform)`` tuples, one per codebook depth,
        in the order 1, 2, 4, 32. Each waveform is a NumPy array.

    Raises:
        gr.Error: If no audio was provided, or if the audio did not yield any
            codes (for example because it is empty).
    """
    if input_wave is None:
        raise gr.Error("Please record or upload an audio clip first.")

    sample_rate = mimi.sample_rate
    # Number of PCM samples that make up one Mimi frame.
    pcm_chunk_size = int(sample_rate / mimi.frame_rate)

    sample_pcm, sample_sr = sphn.read(input_wave)
    print("loaded pcm", sample_pcm.shape, sample_sr)
    sample_pcm = sphn.resample(
        sample_pcm, src_sample_rate=sample_sr, dst_sample_rate=sample_rate
    )
    sample_pcm = torch.tensor(sample_pcm, device=device)

    # Slicing past the end is a no-op, so no explicit length check is needed.
    max_duration_len = int(sample_rate * max_duration_sec)
    sample_pcm = sample_pcm[..., :max_duration_len]

    # Downmix multi-channel uploads to mono. Assumes the array is laid out as
    # (channels, samples) and that Mimi takes a single channel (the decoded
    # output below is read as pcm[0, 0]) -- TODO confirm against moshi docs.
    if sample_pcm.dim() == 2 and sample_pcm.shape[0] > 1:
        sample_pcm = sample_pcm.mean(dim=0, keepdim=True)
    print("resampled pcm", sample_pcm.shape, sample_rate)

    # Add the leading batch dimension.
    sample_pcm = sample_pcm[None]

    print("streaming encoding...")
    start_time = time.time()
    all_codes = []
    total_len = sample_pcm.shape[-1]
    # NOTE(review): every chunk is passed to mimi.encode on its own; no
    # streaming context is entered here -- confirm this is intended.
    with torch.no_grad():
        for start_idx in range(0, total_len, pcm_chunk_size):
            chunk = sample_pcm[..., start_idx:start_idx + pcm_chunk_size]
            codes = mimi.encode(chunk)
            if codes.shape[-1]:
                print(start_idx, codes.shape, end="\r")
                all_codes.append(codes)

    if not all_codes:
        raise gr.Error("The audio clip is too short to be encoded.")

    all_codes_th = torch.cat(all_codes, dim=-1)
    print(f"codes {all_codes_th.shape} generated in {time.time() - start_time:.2f}s")

    # Decode again while keeping only the first n codebooks each time.
    pcm_list = []
    with torch.no_grad():
        for n_codebooks in (1, 2, 4, 32):
            codes_subset = all_codes_th[:, :n_codebooks, :]
            print(f"decoding {n_codebooks} codebooks, {codes_subset.shape}")
            pcm = mimi.decode(codes_subset)
            pcm_list.append((sample_rate, pcm[0, 0].cpu().numpy()))
    return pcm_list
85
+
86
+
87
# Gradio UI: one audio input, and one audio player per codebook depth
# returned by mimi_streaming_test (1, 2, 4 and 32 codebooks, in that order).
demo = gr.Interface(
    fn=mimi_streaming_test,
    inputs=gr.Audio(sources=["microphone", "upload"], type="filepath"),
    outputs=[
        gr.Audio(type="numpy", label=f"With {count} codebook{'s' if count > 1 else ''}")
        for count in (1, 2, 4, 32)
    ],
    examples=[["hello.mp3"]],
    title="Mimi tokenizer playground",
    description="Explore the quality of compression when using various number of code books in the Mimi model.",
)

demo.launch()
hello.mp3 ADDED
Binary file (5.76 kB). View file
 
requirements.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ moshi