Spaces:

ngxson
/

kokoro-podcast-generator

Running

App Files Files Community

ngxson HF Staff committed on Feb 17

Commit

16826f3

1 Parent(s): a414844

add noise for super real experience

Browse files

Files changed (5) hide show

front/src/components/PodcastGenerator.tsx +4 -0
front/src/components/ScriptMaker.tsx +1 -0
front/src/utils/prompts.ts +5 -3
front/src/utils/utils.ts +25 -0
index.html +92 -3

front/src/components/PodcastGenerator.tsx CHANGED Viewed

@@ -3,11 +3,14 @@ import { AudioPlayer } from './AudioPlayer';
 import { Podcast, PodcastTurn } from '../utils/types';
 import { parse } from 'yaml';
 import {
   generateAudio,
   joinAudio,
   loadWavAndDecode,
   pickRand,
 } from '../utils/utils';
 import openingSoundSrc from '../opening-sound.wav';
 interface GenerationStep {
@@ -144,6 +147,7 @@ export const PodcastGenerator = ({
         }
         setNumStepsDone(i + 1);
       }
       setWav(outputWav! ?? null);
     } catch (e) {
       console.error(e);

 import { Podcast, PodcastTurn } from '../utils/types';
 import { parse } from 'yaml';
 import {
+  addNoise,
   generateAudio,
   joinAudio,
   loadWavAndDecode,
   pickRand,
 } from '../utils/utils';
+// taken from https://freesound.org/people/artxmp1/sounds/660540
 import openingSoundSrc from '../opening-sound.wav';
 interface GenerationStep {
         }
         setNumStepsDone(i + 1);
       }
+      outputWav = addNoise(outputWav!, 0.002);
       setWav(outputWav! ?? null);
     } catch (e) {
       console.error(e);

front/src/components/ScriptMaker.tsx CHANGED Viewed

@@ -164,6 +164,7 @@ export const ScriptMaker = ({
           className="select select-bordered"
           value={model}
           onChange={(e) => setModel(e.target.value)}
         >
           {CONFIG.inferenceProviderModels.map((s) => (
             <option key={s} value={s}>

           className="select select-bordered"
           value={model}
           onChange={(e) => setModel(e.target.value)}
+          disabled={isGenerating || busy}
         >
           {CONFIG.inferenceProviderModels.map((s) => (
             <option key={s} value={s}>

front/src/utils/prompts.ts CHANGED Viewed

@@ -1,7 +1,7 @@
 export const getPromptGeneratePodcastScript = (content: string, note: string) =>
   `
-You are a podcast script writter. You only output content in YAML format. Given a raw unstructured content, think about a plan, then think more detailed how words can be written as pronunciations then write the podcast script in YAML format. Please also take into account the note from the podcast producer.
 Some rules:
 - Must output YAML format, must be wrapped inside mardown code block.
@@ -39,10 +39,10 @@ turns:
 [END OF EXAMPLE]
 The example above is truncated at index 1, REMEMBER TO CREATE AT LEAST 20 TURNS.
-The output text will be passed to TTS engine, make sure to be clean:
 - Write NUMBER and abbreviations as WORDS, as they are pronounced
 - For some less-common abbreviations, write the full words
-- Use ... for pauses, " and ' and ! and ? for intonation
 - IMPORTANT!! Write nicknames and names as they are pronounced. For example, "lora_rank=2" becomes "lora rank equals two", or "LoRA" becomes "Lo Ra", or "CrossEntropyLoss" becomes "Cross Entropy Loss", or "6GB" becomes "six gigabytes", "A6000" becomes "A six thousands"
 Make it engaging and have fun!
@@ -57,4 +57,6 @@ ${content}
 ${note.length < 1 ? '(No note provided)' : note}
 [END OF NOTE]
 `.trim();

 export const getPromptGeneratePodcastScript = (content: string, note: string) =>
   `
+You are a podcast script writter. You only output content in YAML format. Given a raw unstructured content, think about a detailed plan, then think more detailed how words can be written as pronunciations then write the podcast script in YAML format. Please also take into account the note from the podcast producer.
 Some rules:
 - Must output YAML format, must be wrapped inside mardown code block.
 [END OF EXAMPLE]
 The example above is truncated at index 1, REMEMBER TO CREATE AT LEAST 20 TURNS.
+The output text will be passed to TTS engine, make sure to be clean and natural:
 - Write NUMBER and abbreviations as WORDS, as they are pronounced
 - For some less-common abbreviations, write the full words
+- Use ... for pauses (IMPORTANT to add pauses), " and ' and ! and ? for intonation
 - IMPORTANT!! Write nicknames and names as they are pronounced. For example, "lora_rank=2" becomes "lora rank equals two", or "LoRA" becomes "Lo Ra", or "CrossEntropyLoss" becomes "Cross Entropy Loss", or "6GB" becomes "six gigabytes", "A6000" becomes "A six thousands"
 Make it engaging and have fun!
 ${note.length < 1 ? '(No note provided)' : note}
 [END OF NOTE]
+Now, think about a detailed plan.
 `.trim();

front/src/utils/utils.ts CHANGED Viewed

@@ -217,6 +217,31 @@ export const joinAudio = (
   return newBuffer;
 };
 ////////////////////////////////////////
 // Audio formatting utils

   return newBuffer;
 };
+export const addNoise = ( // Returns a new AudioBuffer holding the input samples with random noise added to each one.
+  audioBuffer: AudioBuffer, // source audio; only read here (via getChannelData), never written
+  magnitude: number // scale of the noise: each sample is offset by a random value between -magnitude and +magnitude
+): AudioBuffer => {
+  const { numberOfChannels, sampleRate, length } = audioBuffer;
+  const newBuffer = new AudioBuffer({ // output has the same length, channel count and sample rate as the input
+    length,
+    numberOfChannels,
+    sampleRate,
+  });
+  for (let channel = 0; channel < numberOfChannels; channel++) {
+    const inputData = audioBuffer.getChannelData(channel);
+    const outputData = newBuffer.getChannelData(channel); // written in place through this channel array
+    for (let i = 0; i < length; i++) {
+      // Draw an independent random offset for every sample of every channel:
+      // Math.random() is in [0, 1), so the offset lies in [-magnitude, +magnitude).
+      const noise = (Math.random() * 2 - 1) * magnitude;
+      outputData[i] = inputData[i] + noise; // NOTE(review): the sum is not clamped — confirm downstream code tolerates samples slightly outside the input's range
+    }
+  }
+  return newBuffer;
+};
 ////////////////////////////////////////
 // Audio formatting utils

index.html CHANGED Viewed

@@ -14792,6 +14792,23 @@ const joinAudio = (audio1, audio2, gapSeconds, overlap = "none") => {
   }
   return newBuffer;
 };
 const loadWavAndDecode = async (url) => {
   const response = await fetch(url);
   const arrayBuffer = await response.arrayBuffer();
@@ -21020,6 +21037,7 @@ const PodcastGenerator = ({
         }
         setNumStepsDone(i + 1);
       }
       setWav(outputWav ?? null);
     } catch (e) {
       console.error(e);
@@ -21120,7 +21138,7 @@ const PodcastGenerator = ({
 };
 const getPromptGeneratePodcastScript = (content, note) => `
-You are a podcast script writter. You only output content in YAML format. Given a raw unstructured content, think about a plan, then think more detailed how words can be written as pronunciations then write the podcast script in YAML format. Please also take into account the note from the podcast producer.
 Some rules:
 - Must output YAML format, must be wrapped inside mardown code block.
@@ -21158,10 +21176,10 @@ turns:
 [END OF EXAMPLE]
 The example above is truncated at index 1, REMEMBER TO CREATE AT LEAST 20 TURNS.
-The output text will be passed to TTS engine, make sure to be clean:
 - Write NUMBER and abbreviations as WORDS, as they are pronounced
 - For some less-common abbreviations, write the full words
-- Use ... for pauses, " and ' and ! and ? for intonation
 - IMPORTANT!! Write nicknames and names as they are pronounced. For example, "lora_rank=2" becomes "lora rank equals two", or "LoRA" becomes "Lo Ra", or "CrossEntropyLoss" becomes "Cross Entropy Loss", or "6GB" becomes "six gigabytes", "A6000" becomes "A six thousands"
 Make it engaging and have fun!
@@ -21176,6 +21194,8 @@ ${content}
 ${note.length < 1 ? "(No note provided)" : note}
 [END OF NOTE]
 `.trim();
 const EXAMPLES = [
   {
@@ -22655,6 +22675,7 @@ const ScriptMaker = ({
         className: "select select-bordered",
         value: model,
         onChange: (e) => setModel(e.target.value),
         children: [
           CONFIG.inferenceProviderModels.map((s) => /* @__PURE__ */ jsxRuntimeExports.jsx("option", { value: s, children: s }, s)),
           /* @__PURE__ */ jsxRuntimeExports.jsx("option", { value: "custom", children: "Custom" })
@@ -27814,6 +27835,21 @@ html {
   border-radius: var(--rounded-box, 1rem);
   background-color: var(--fallback-bc,oklch(var(--bc)/0.2));
 }
 .select {
   display: inline-flex;
   cursor: pointer;
@@ -28295,6 +28331,59 @@ html {
       0 0 0 4px var(--fallback-b1,oklch(var(--b1)/1)) inset;
   }
 }
 @keyframes rating-pop {
   0% {

   }
   return newBuffer;
 };
+const addNoise = (audioBuffer, magnitude) => { // Returns a new AudioBuffer holding the input samples with random noise (scaled by magnitude) added to each one.
+  const { numberOfChannels, sampleRate, length } = audioBuffer;
+  const newBuffer = new AudioBuffer({ // output has the same length, channel count and sample rate as the input
+    length,
+    numberOfChannels,
+    sampleRate
+  });
+  for (let channel = 0; channel < numberOfChannels; channel++) {
+    const inputData = audioBuffer.getChannelData(channel); // input is only read
+    const outputData = newBuffer.getChannelData(channel); // written in place through this channel array
+    for (let i = 0; i < length; i++) {
+      const noise = (Math.random() * 2 - 1) * magnitude; // Math.random() is in [0, 1), so the offset lies in [-magnitude, +magnitude)
+      outputData[i] = inputData[i] + noise; // NOTE(review): the sum is not clamped — confirm downstream code tolerates samples slightly outside the input's range
+    }
+  }
+  return newBuffer;
+};
 const loadWavAndDecode = async (url) => {
   const response = await fetch(url);
   const arrayBuffer = await response.arrayBuffer();
         }
         setNumStepsDone(i + 1);
       }
+      outputWav = addNoise(outputWav, 2e-3);
       setWav(outputWav ?? null);
     } catch (e) {
       console.error(e);
 };
 const getPromptGeneratePodcastScript = (content, note) => `
+You are a podcast script writter. You only output content in YAML format. Given a raw unstructured content, think about a detailed plan, then think more detailed how words can be written as pronunciations then write the podcast script in YAML format. Please also take into account the note from the podcast producer.
 Some rules:
 - Must output YAML format, must be wrapped inside mardown code block.
 [END OF EXAMPLE]
 The example above is truncated at index 1, REMEMBER TO CREATE AT LEAST 20 TURNS.
+The output text will be passed to TTS engine, make sure to be clean and natural:
 - Write NUMBER and abbreviations as WORDS, as they are pronounced
 - For some less-common abbreviations, write the full words
+- Use ... for pauses (IMPORTANT to add pauses), " and ' and ! and ? for intonation
 - IMPORTANT!! Write nicknames and names as they are pronounced. For example, "lora_rank=2" becomes "lora rank equals two", or "LoRA" becomes "Lo Ra", or "CrossEntropyLoss" becomes "Cross Entropy Loss", or "6GB" becomes "six gigabytes", "A6000" becomes "A six thousands"
 Make it engaging and have fun!
 ${note.length < 1 ? "(No note provided)" : note}
 [END OF NOTE]
+Now, think about a detailed plan.
 `.trim();
 const EXAMPLES = [
   {
         className: "select select-bordered",
         value: model,
         onChange: (e) => setModel(e.target.value),
+        disabled: isGenerating || busy,
         children: [
           CONFIG.inferenceProviderModels.map((s) => /* @__PURE__ */ jsxRuntimeExports.jsx("option", { value: s, children: s }, s)),
           /* @__PURE__ */ jsxRuntimeExports.jsx("option", { value: "custom", children: "Custom" })
   border-radius: var(--rounded-box, 1rem);
   background-color: var(--fallback-bc,oklch(var(--bc)/0.2));
 }
+.range {
+  height: 1.5rem;
+  width: 100%;
+  cursor: pointer;
+  -moz-appearance: none;
+       appearance: none;
+  -webkit-appearance: none;
+  --range-shdw: var(--fallback-bc,oklch(var(--bc)/1));
+  overflow: hidden;
+  border-radius: var(--rounded-box, 1rem);
+  background-color: transparent;
+}
+.range:focus {
+  outline: none;
+}
 .select {
   display: inline-flex;
   cursor: pointer;
       0 0 0 4px var(--fallback-b1,oklch(var(--b1)/1)) inset;
   }
 }
+.range:focus-visible::-webkit-slider-thumb {
+  --focus-shadow: 0 0 0 6px var(--fallback-b1,oklch(var(--b1)/1)) inset, 0 0 0 2rem var(--range-shdw) inset;
+}
+.range:focus-visible::-moz-range-thumb {
+  --focus-shadow: 0 0 0 6px var(--fallback-b1,oklch(var(--b1)/1)) inset, 0 0 0 2rem var(--range-shdw) inset;
+}
+.range::-webkit-slider-runnable-track {
+  height: 0.5rem;
+  width: 100%;
+  border-radius: var(--rounded-box, 1rem);
+  background-color: var(--fallback-bc,oklch(var(--bc)/0.1));
+}
+.range::-moz-range-track {
+  height: 0.5rem;
+  width: 100%;
+  border-radius: var(--rounded-box, 1rem);
+  background-color: var(--fallback-bc,oklch(var(--bc)/0.1));
+}
+.range::-webkit-slider-thumb {
+  position: relative;
+  height: 1.5rem;
+  width: 1.5rem;
+  border-radius: var(--rounded-box, 1rem);
+  border-style: none;
+  --tw-bg-opacity: 1;
+  background-color: var(--fallback-b1,oklch(var(--b1)/var(--tw-bg-opacity)));
+  appearance: none;
+  -webkit-appearance: none;
+  top: 50%;
+  color: var(--range-shdw);
+  transform: translateY(-50%);
+  --filler-size: 100rem;
+  --filler-offset: 0.6rem;
+  box-shadow: 0 0 0 3px var(--range-shdw) inset,
+      var(--focus-shadow, 0 0),
+      calc(var(--filler-size) * -1 - var(--filler-offset)) 0 0 var(--filler-size);
+}
+.range::-moz-range-thumb {
+  position: relative;
+  height: 1.5rem;
+  width: 1.5rem;
+  border-radius: var(--rounded-box, 1rem);
+  border-style: none;
+  --tw-bg-opacity: 1;
+  background-color: var(--fallback-b1,oklch(var(--b1)/var(--tw-bg-opacity)));
+  top: 50%;
+  color: var(--range-shdw);
+  --filler-size: 100rem;
+  --filler-offset: 0.5rem;
+  box-shadow: 0 0 0 3px var(--range-shdw) inset,
+      var(--focus-shadow, 0 0),
+      calc(var(--filler-size) * -1 - var(--filler-offset)) 0 0 var(--filler-size);
+}
 @keyframes rating-pop {
   0% {