Update app.py
app.py
CHANGED
@@ -109,46 +109,89 @@ app.layout = dbc.Container([
     dcc.Store(id='generated-audio'),
 ])
 
-def process_prompt(
-    prompt = f"{voice}: {
+def process_prompt(prompt, voice, tokenizer, device):
+    prompt = f"{voice}: {prompt}"
+    input_ids = tokenizer(prompt, return_tensors="pt").input_ids
+
+    start_token = torch.tensor([[128259]], dtype=torch.int64)
+    end_tokens = torch.tensor([[128009, 128260]], dtype=torch.int64)
+
+    modified_input_ids = torch.cat([start_token, input_ids, end_tokens], dim=1)
+    attention_mask = torch.ones_like(modified_input_ids)
+
+    return modified_input_ids.to(device), attention_mask.to(device)
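For orientation (an aside, not part of the commit): process_prompt frames the text with special-token IDs from the reserved range above the Llama 3 text vocabulary — 128259 as a start marker, then 128009 (which matches Llama 3's <|eot_id|>) and 128260 as end markers. A minimal usage sketch, assuming the tokenizer and device loaded elsewhere in app.py, with "tara" as a stand-in voice name:

# Hypothetical usage; `tokenizer` and `device` come from the app's model setup.
input_ids, attention_mask = process_prompt("Welcome back to the show.", "tara", tokenizer, device)
# Layout: [128259] + tokenizer("tara: Welcome back to the show.") + [128009, 128260]
print(input_ids.shape)              # (1, prompt_length + 3)
print(attention_mask.sum().item())  # == input_ids.shape[1]; every position is attended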
 
 def parse_output(generated_ids):
+    token_to_find = 128257
+    token_to_remove = 128258
+
+    token_indices = (generated_ids == token_to_find).nonzero(as_tuple=True)
 
+    if len(token_indices[1]) > 0:
+        last_occurrence_idx = token_indices[1][-1].item()
+        cropped_tensor = generated_ids[:, last_occurrence_idx+1:]
+    else:
+        cropped_tensor = generated_ids
 
+    processed_rows = []
+    for row in cropped_tensor:
+        masked_row = row[row != token_to_remove]
+        processed_rows.append(masked_row)
 
+    code_lists = []
+    for row in processed_rows:
+        row_length = row.size(0)
+        new_length = (row_length // 7) * 7
+        trimmed_row = row[:new_length]
+        trimmed_row = [t - 128266 for t in trimmed_row]
+        code_lists.append(trimmed_row)
+
+    return code_lists[0]
 
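To make the token arithmetic concrete (a constructed example, using only the constants above): everything after the last 128257 marker is treated as audio codes, any 128258 tokens are dropped, the tail is trimmed to whole 7-token frames, and the 128266 base offset is subtracted:

import torch

# One full 7-token frame after the 128257 marker, with a trailing 128258.
base = 128266
ids = torch.tensor([[500, 128257,
                     base, base + 4096, base + 2*4096, base + 3*4096,
                     base + 4*4096, base + 5*4096, base + 6*4096,
                     128258]])
codes = parse_output(ids)
# codes correspond to [0, 4096, 8192, 12288, 16384, 20480, 24576]:
# one de-offset frame, ready for redistribute_codes below.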
+def redistribute_codes(code_list, snac_model):
+    device = next(snac_model.parameters()).device  # Get the device of SNAC model
+
+    layer_1 = []
+    layer_2 = []
+    layer_3 = []
+    for i in range((len(code_list)+1)//7):
+        layer_1.append(code_list[7*i])
+        layer_2.append(code_list[7*i+1]-4096)
+        layer_3.append(code_list[7*i+2]-(2*4096))
+        layer_3.append(code_list[7*i+3]-(3*4096))
+        layer_2.append(code_list[7*i+4]-(4*4096))
+        layer_3.append(code_list[7*i+5]-(5*4096))
+        layer_3.append(code_list[7*i+6]-(6*4096))
+
+    codes = [
+        torch.tensor(layer_1, device=device).unsqueeze(0),
+        torch.tensor(layer_2, device=device).unsqueeze(0),
+        torch.tensor(layer_3, device=device).unsqueeze(0)
+    ]
+
+    audio_hat = snac_model.decode(codes)
+    return audio_hat.detach().squeeze().cpu().numpy()  # Always return CPU numpy array
+
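The loop above fans each 7-token frame out across SNAC's three codebook levels — one code to layer 1, two to layer 2, four to layer 3 — with each frame position carrying its own multiple-of-4096 offset. A self-contained sketch of just the de-offsetting, with the layout inferred from the loop:

# A frame whose seven tokens all decode to code 0 at their respective positions.
frame = [0, 4096, 2*4096, 3*4096, 4*4096, 5*4096, 6*4096]
layer_1 = [frame[0]]                                  # 1 coarse code per frame
layer_2 = [frame[1] - 4096, frame[4] - 4*4096]        # 2 mid codes per frame
layer_3 = [frame[2] - 2*4096, frame[3] - 3*4096,
           frame[5] - 5*4096, frame[6] - 6*4096]      # 4 fine codes per frame
assert (layer_1, layer_2, layer_3) == ([0], [0, 0], [0, 0, 0, 0])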
-    for i, silent in enumerate(is_silent):
-        if silent and silent_start is None:
-            silent_start = i
-        elif not silent and silent_start is not None:
-            if i - silent_start >= min_silence_len:
-                silent_regions.append((silent_start, i))
-                silent_start = None
-    if silent_start is not None and len(audio) - silent_start >= min_silence_len:
-        silent_regions.append((silent_start, len(audio)))
-    return silent_regions
+def detect_silence(audio, threshold=0.005, min_silence_duration=1.3):
+    sample_rate = 24000  # Adjust if your sample rate is different
+    is_silent = np.abs(audio) < threshold
+    silent_regions = np.where(is_silent)[0]
+
+    silence_starts = []
+    silence_ends = []
+
+    if len(silent_regions) > 0:
+        silence_starts.append(silent_regions[0])
+        for i in range(1, len(silent_regions)):
+            if silent_regions[i] - silent_regions[i-1] > 1:
+                silence_ends.append(silent_regions[i-1])
+                silence_starts.append(silent_regions[i])
+        silence_ends.append(silent_regions[-1])
+
+    long_silences = [(start, end) for start, end in zip(silence_starts, silence_ends)
+                     if (end - start) / sample_rate >= min_silence_duration]
+
+    return long_silences
 
def generate_audio(script_output, voice1, voice2, num_hosts, temperature, top_p, repetition_penalty, max_new_tokens):
|
197 |
try:
|
|
|
@@ -174,59 +217,26 @@ def generate_audio(script_output, voice1, voice2, num_hosts, temperature, top_p,
                 max_new_tokens=max_new_tokens,
                 num_return_sequences=1,
                 eos_token_id=128258,
-                pad_token_id=128258,
             )
 
             code_list = parse_output(generated_ids)
 
-            expected_size = 2048  # This should match the model's expected input size
-            if len(code_list) < expected_size:
-                code_list = code_list + [0] * (expected_size - len(code_list))
-            elif len(code_list) > expected_size:
-                code_list = code_list[:expected_size]
-
-            # Convert to float tensor to match bias type
-            codes_tensor = torch.tensor(code_list, dtype=torch.float32).unsqueeze(0).to(device)
-
-            #
-            paragraph_audio = snac_model(codes_tensor)
-
-            # Handle tuple output
-            if isinstance(paragraph_audio, tuple):
-                paragraph_audio = paragraph_audio[0]  # Assume the first element is the audio tensor
-
-            paragraph_audio = paragraph_audio.cpu().numpy().flatten()
-
-            # Log audio statistics
-            logger.info(f"Paragraph {i+1} audio shape: {paragraph_audio.shape}, min: {np.min(paragraph_audio)}, max: {np.max(paragraph_audio)}")
-
-            # Normalize audio to [-1, 1] range
-            paragraph_audio = paragraph_audio / np.max(np.abs(paragraph_audio))
+            paragraph_audio = redistribute_codes(code_list, snac_model)
 
+            # Add silence detection here
+            silences = detect_silence(paragraph_audio)
+            if silences:
+                # Trim the audio at the last detected silence
+                paragraph_audio = paragraph_audio[:silences[-1][1]]
 
             audio_samples.append(paragraph_audio)
 
         final_audio = np.concatenate(audio_samples)
 
-        #
-        # Convert to 16-bit PCM
-        final_audio = (final_audio * 32767).astype(np.int16)
-
-        buffer = io.BytesIO()
-        sf.write(buffer, final_audio, 24000, format='WAV', subtype='PCM_16')
-        buffer.seek(0)
-
-        # Log buffer size
-        logger.info(f"Audio buffer size: {buffer.getbuffer().nbytes} bytes")
-
-        return buffer
+        # Normalize the audio
+        final_audio = np.int16(final_audio / np.max(np.abs(final_audio)) * 32767)
 
+        return final_audio
     except Exception as e:
         logger.error(f"Error generating speech: {str(e)}")
         return None
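One caveat on the new normalization line: np.max(np.abs(final_audio)) is zero when every paragraph decodes to silence, so the division emits a RuntimeWarning and fills the buffer with NaNs before the int16 cast. A guarded variant (a sketch, not what the commit does; to_pcm16 is a hypothetical helper):

import numpy as np

def to_pcm16(audio: np.ndarray) -> np.ndarray:
    # Scale to full-scale 16-bit PCM; fall back to digital silence for an all-zero signal.
    peak = np.max(np.abs(audio))
    if peak == 0:
        return np.zeros(len(audio), dtype=np.int16)
    return np.int16(audio / peak * 32767)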
@@ -346,11 +356,16 @@ def combined_callback(generate_script_clicks, generate_audio_clicks, advanced_se
         if not script_output.strip():
             return dash.no_update, html.Div("No audio generated yet."), dash.no_update, dash.no_update, "", ""
 
-        if
+        final_audio = generate_audio(script_output, voice1, voice2, num_hosts, temperature, top_p, repetition_penalty, max_new_tokens)
 
+        if final_audio is not None:
+            # Convert to WAV format
+            buffer = io.BytesIO()
+            sf.write(buffer, final_audio, 24000, format='WAV', subtype='PCM_16')
+            buffer.seek(0)
+
             # Convert to base64 for audio playback
-            audio_base64 = base64.b64encode(
+            audio_base64 = base64.b64encode(buffer.getvalue()).decode('utf-8')
             src = f"data:audio/wav;base64,{audio_base64}"
 
             # Log audio file size
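The playback path inlines the WAV as a data URI instead of serving a file. A self-contained sketch of the same round trip (assuming 24 kHz mono int16, using the same soundfile and base64 calls as the diff):

import base64
import io

import numpy as np
import soundfile as sf

samples = np.zeros(24000, dtype=np.int16)  # 1 s of silence at 24 kHz
buffer = io.BytesIO()
sf.write(buffer, samples, 24000, format='WAV', subtype='PCM_16')
buffer.seek(0)
src = f"data:audio/wav;base64,{base64.b64encode(buffer.getvalue()).decode('utf-8')}"
# `src` can be assigned directly to the src property of a dash html.Audio component.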
@@ -367,14 +382,7 @@ def combined_callback(generate_script_clicks, generate_audio_clicks, advanced_se
         else:
             logger.error("Failed to generate audio")
             return dash.no_update, html.Div("Error generating audio"), dash.no_update, dash.no_update, "", ""
-
-    elif trigger_id == "advanced-settings-toggle":
-        return dash.no_update, dash.no_update, not is_advanced_open, dash.no_update, "", ""
-
-    elif trigger_id == "clear-btn":
-        return "", html.Div("No audio generated yet."), dash.no_update, "", "", ""
-
-    return dash.no_update, dash.no_update, dash.no_update, dash.no_update, "", ""
+    return dash.no_update, dash.no_update, dash.no_update, dash.no_update, "", ""
 
 # Run the app
 if __name__ == '__main__':
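The change above removes the advanced-settings and clear-button branches and ends the callback with a single fall-through return for unmatched triggers. For readers unfamiliar with the idiom (generic Dash, not code from this app): the trigger_id such branches test is usually derived like this:

import dash

ctx = dash.callback_context
trigger_id = ctx.triggered[0]["prop_id"].split(".")[0] if ctx.triggered else None
# e.g. "generate-audio-btn" when that button fired the combined callback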