Update main.py
main.py CHANGED
@@ -24,39 +24,24 @@ top_k = 20
 from safetensors.torch import load_file
 
 def convert_to_16_bit_wav(data):
-    # Based on: https://docs.scipy.org/doc/scipy/reference/generated/scipy.io.wavfile.write.html
-    # breakpoint()
     if data.dtype == np.float32:
-        # warnings.warn(
-        #     "Audio data is not in 16-bit integer format."
-        #     "Trying to convert to 16-bit int format."
-        # )
         data = data / np.abs(data).max()
         data = data * 32767
         data = data.astype(np.int16)
     elif data.dtype == np.int32:
-        # warnings.warn(
-        #     "Audio data is not in 16-bit integer format."
-        #     "Trying to convert to 16-bit int format."
-        # )
         data = data / 65538
         data = data.astype(np.int16)
     elif data.dtype == np.int16:
         pass
     elif data.dtype == np.uint8:
-        # warnings.warn(
-        #     "Audio data is not in 16-bit integer format."
-        #     "Trying to convert to 16-bit int format."
-        # )
         data = data * 257 - 32768
         data = data.astype(np.int16)
     else:
-        raise ValueError("Audio data cannot be converted to
+        raise ValueError("Audio data cannot be converted to 16-bit int format.")
     return data
 
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
-
 # Load the model with INT8 quantization
 model = AutoModelForCausalLM.from_pretrained(
     model_path,
@@ -71,7 +56,7 @@ ckpt_path = "audiotokenizer/SpeechTokenizer.pt"
 quantizer = SpeechTokenizer.load_from_checkpoint(config_path, ckpt_path)
 quantizer.eval()
 
-#
+# Freeze layers in the quantizer
 def freeze_entire_model(model):
     for n, p in model.named_parameters():
         p.requires_grad = False
@@ -81,7 +66,7 @@ for n, child in quantizer.named_children():
     child.to(device)
     child = freeze_entire_model(child)
 
-#
+# Create padding tokens for audio
 def get_audio_padding_tokens(quantizer):
     audio = torch.zeros((1, 1, 1)).to(device)
     codes = quantizer.encode(audio)
@@ -89,7 +74,7 @@ def get_audio_padding_tokens(quantizer):
     torch.cuda.empty_cache()
     return {"audio_tokens": codes.squeeze(1)}
 
-#
+# Decode audio from tokens
 def decode_audio(tokens, quantizer, pad_tokens, n_original_tokens):
     start = torch.nonzero(tokens == tokenizer(start_audio_token)["input_ids"][-1])
     end = torch.nonzero(tokens == tokenizer(end_audio_token)["input_ids"][-1])
@@ -112,9 +97,7 @@ def decode_audio(tokens, quantizer, pad_tokens, n_original_tokens):
     return xp
 
 
-#
-
-# Inference function for text input and audio output
+# Inference functions
 def infer_text_to_audio(text, model, tokenizer, quantizer, max_seq_length=1024, top_k=20):
     text_tokenized = tokenizer(text, return_tensors="pt")
     text_input_tokens = text_tokenized["input_ids"].to(device)
@@ -132,7 +115,6 @@ def infer_text_to_audio(text, model, tokenizer, quantizer, max_seq_length=1024,
 
     return audio_signal
 
-# Inference function for audio input and text output
 def infer_audio_to_text(audio_path, model, tokenizer, quantizer, max_seq_length=1024, top_k=20):
     audio_data, sample_rate = torchaudio.load(audio_path)
 
@@ -155,7 +137,7 @@ def infer_audio_to_text(audio_path, model, tokenizer, quantizer, max_seq_length=
 
     return decoded_text
 
-# Functions for
+# Functions for Gradio Interface
 def infer_text_to_audio_gr(text):
     audio_signal = infer_text_to_audio(text.strip().upper(), model, tokenizer, quantizer)
     return audio_signal
@@ -183,6 +165,42 @@ audio_to_text_interface = gr.Interface(
     allow_flagging='never'
 )
 
-#
+# Gradio Demo
 demo = gr.TabbedInterface([text_to_audio_interface, audio_to_text_interface], ["Text - Audio", "Audio - Text"])
+
+# Custom CSS for centered links
+custom_css = """
+<style>
+.center {
+    text-align: center;
+}
+</style>
+"""
+
+# Add Gradio description with centered links
+description = f"""
+# **Salt: Speech And Language Transformer**
+
+Welcome to the demo of **Salt**, a speech and language model. Vikhr Salt is capable of both **Text-to-Speech (T2S)** and **Speech-to-Text (S2T)** tasks, making it a versatile tool for transforming language into speech and vice versa. Built on a pre-trained large language model, Vikhr Salt incorporates audio tokens using cutting-edge techniques like **Encodec** and **SpeechTokenizer**, enabling robust performance across multiple modalities.
+
+## **🛠 Features**
+- **Text-to-Speech (T2S)**: Enter text and generate high-quality audio outputs.
+- **Speech-to-Text (S2T)**: Upload an audio file and convert it into accurate text.
+
+## **🚀 Try it out:**
+Explore the tabs to try the **Text - Audio** and **Audio - Text** modes!
+
+---
+
+<div class="center">
+### **📄 Preprint**
+[Read the paper](https://docs.google.com/document/d/1ZvV47W4BCyZM_JfDC1BKj-0ozwPck5t2yNB8jORVshI/edit?usp=sharing)
+
+### **📂 Code**
+[Explore the code](https://github.com/VikhrModels/Vikhr4o)
+</div>
+
+"""
+
+# Launch Gradio App
 demo.launch(share=True)
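As a sanity check on the conversion logic this commit finishes (the previously truncated `raise` is now a complete statement), here is a self-contained sketch: the function body is copied from the committed file, while the test harness is illustrative only. Note that the `uint8` branch depends on NumPy's integer-promotion rules (`257` does not fit in `uint8`), so the check below exercises just the `float32` path.

```python
import numpy as np

def convert_to_16_bit_wav(data):
    # Branches as committed: peak-normalize float32, rescale int32 and uint8,
    # pass int16 through unchanged, reject any other dtype.
    if data.dtype == np.float32:
        data = data / np.abs(data).max()  # scale into [-1, 1]
        data = data * 32767               # stretch to the int16 range
        data = data.astype(np.int16)
    elif data.dtype == np.int32:
        data = data / 65538               # divisor just over 2**16 keeps values inside int16
        data = data.astype(np.int16)
    elif data.dtype == np.int16:
        pass
    elif data.dtype == np.uint8:
        data = data * 257 - 32768         # intended to map [0, 255] onto [-32768, 32767]
        data = data.astype(np.int16)
    else:
        raise ValueError("Audio data cannot be converted to 16-bit int format.")
    return data

# Illustrative check: a 440 Hz float32 tone converts to full-scale int16.
tone = np.sin(2 * np.pi * 440 * np.linspace(0, 1, 16000)).astype(np.float32)
out = convert_to_16_bit_wav(tone)
assert out.dtype == np.int16 and int(np.abs(out).max()) == 32767
```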
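One loose end in the new code: `custom_css` and `description` are defined but never attached to the demo in the hunks shown. A hedged sketch of how they could be wired in with standard Gradio Blocks APIs (`gr.Blocks(css=...)` takes raw CSS rules without `<style>` tags; `render()` embeds an existing Interface). This is an illustration, not part of the commit:

```python
import gradio as gr

# Hypothetical wiring (not in this commit): surface the description and CSS
# by composing the two existing interfaces inside a Blocks layout.
with gr.Blocks(css=".center { text-align: center; }") as demo:
    gr.Markdown(description)  # `description` as defined in the committed file
    with gr.Tab("Text - Audio"):
        text_to_audio_interface.render()
    with gr.Tab("Audio - Text"):
        audio_to_text_interface.render()

demo.launch(share=True)
```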