// @ts-expect-error this package does not have typing
import TextLineStream from 'textlinestream';
import { Client } from '@gradio/client';
import decodeAudio from 'audio-decode';
// ponyfill for missing ReadableStream asyncIterator on Safari
import { asyncIterator } from '@sec-ant/readable-stream/ponyfill/asyncIterator';
import { CONFIG } from '../config';
// Call the TTS Space for the given text and voice; returns a URL to the generated WAV file.
export const generateAudio = async (
content: string,
voice: string,
speed: number = 1.1
): Promise<string> => {
const client = await Client.connect(CONFIG.ttsSpaceId);
const result = await client.predict('/tts', {
text: content,
voice,
speed,
});
  const data = result.data as { url: string }[];
  return data[0].url;
};
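// Example (a sketch; assumes CONFIG.ttsSpaceId points at a Gradio TTS Space
// exposing a /tts endpoint, and that 'voice-id' is replaced with a real voice):
//   const wavUrl = await generateAudio('Hello there!', 'voice-id');
//   const audioBuffer = await loadWavAndDecode(wavUrl);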
// Pick a random element from an array.
export const pickRand = <T>(arr: T[]): T => {
  return arr[Math.floor(Math.random() * arr.length)];
};
// Wrap a fetch Response carrying Server-Sent Events (SSE) into an async
// generator that yields each `data:` payload as parsed JSON.
export async function* getSSEStreamAsync(fetchResponse: Response) {
if (!fetchResponse.body) throw new Error('Response body is empty');
const lines: ReadableStream<string> = fetchResponse.body
.pipeThrough(new TextDecoderStream())
.pipeThrough(new TextLineStream());
// @ts-expect-error asyncIterator complains about type, but it should work
for await (const line of asyncIterator(lines)) {
//if (isDev) console.log({ line });
if (line.startsWith('data:') && !line.endsWith('[DONE]')) {
const data = JSON.parse(line.slice(5));
yield data;
} else if (line.startsWith('error:')) {
const data = JSON.parse(line.slice(6));
throw new Error(data.message || 'Unknown error');
}
}
}
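// Example usage (a sketch; assumes `res` streams OpenAI-style `data: {...}`
// SSE lines, e.g. from a chat completions endpoint):
//   const res = await fetch(someStreamingEndpoint, { method: 'POST', body });
//   for await (const chunk of getSSEStreamAsync(res)) {
//     console.log(chunk.choices?.[0]?.delta?.content);
//   }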
/**
* Ok now, most of the functions below are written by ChatGPT using Reasoning mode.
*/
////////////////////////////////////////
// Audio manipulation utils
export const trimSilence = (audioBuffer: AudioBuffer): AudioBuffer => {
const threshold = 0.01; // Amplitude below which a sample is considered silent.
const numChannels = audioBuffer.numberOfChannels;
const totalSamples = audioBuffer.length;
// Helper function to check if a sample at the given index is silent in all channels.
const isSilent = (index: number): boolean => {
for (let channel = 0; channel < numChannels; channel++) {
const channelData = audioBuffer.getChannelData(channel);
if (Math.abs(channelData[index]) > threshold) {
return false;
}
}
return true;
};
// Find the first non-silent sample.
let startSample = 0;
while (startSample < totalSamples && isSilent(startSample)) {
startSample++;
}
// Find the last non-silent sample.
let endSample = totalSamples - 1;
while (endSample >= startSample && isSilent(endSample)) {
endSample--;
}
// If the entire buffer is silent, return a minimal 1-sample buffer
// (an AudioBuffer cannot have a length of 0).
if (startSample >= totalSamples || endSample < startSample) {
return new AudioBuffer({
length: 1,
numberOfChannels: numChannels,
sampleRate: audioBuffer.sampleRate,
});
}
const newLength = endSample - startSample + 1;
const newBuffer = new AudioBuffer({
length: newLength,
numberOfChannels: numChannels,
sampleRate: audioBuffer.sampleRate,
});
// Copy the trimmed audio samples from the original buffer to the new buffer.
for (let channel = 0; channel < numChannels; channel++) {
const oldData = audioBuffer.getChannelData(channel);
const newData = newBuffer.getChannelData(channel);
for (let i = 0; i < newLength; i++) {
newData[i] = oldData[startSample + i];
}
}
return newBuffer;
};
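// Example (a sketch): trim a decoded clip before joining, so leading/trailing
// samples under the 0.01 amplitude threshold above are dropped:
//   const trimmed = trimSilence(await loadWavAndDecode(wavUrl));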
export const joinAudio = (
audio1: AudioBuffer,
audio2: AudioBuffer,
gapSeconds: number
): AudioBuffer => {
const sampleRate = audio1.sampleRate;
const numChannels = audio1.numberOfChannels;
// Ensure both audio buffers are compatible.
if (audio2.sampleRate !== sampleRate) {
throw new Error('Audio buffers must have the same sample rate');
}
if (audio2.numberOfChannels !== numChannels) {
throw new Error('Audio buffers must have the same number of channels');
}
  // gapSeconds > 0 inserts silence between the clips; gapSeconds < 0 blends
  // (overlaps) the end of audio1 with the beginning of audio2.
  const gapSamples = gapSeconds > 0 ? Math.round(gapSeconds * sampleRate) : 0;
  // Clamp the overlap so we never use more samples than either buffer has.
  const effectiveOverlap =
    gapSeconds < 0
      ? Math.min(
          Math.round(-gapSeconds * sampleRate),
          audio1.length,
          audio2.length
        )
      : 0;
  const newLength =
    audio1.length + audio2.length + gapSamples - effectiveOverlap;
// Create a new AudioBuffer for the joined result.
const newBuffer = new AudioBuffer({
length: newLength,
numberOfChannels: numChannels,
sampleRate: sampleRate,
});
// Process each channel.
for (let channel = 0; channel < numChannels; channel++) {
const outputData = newBuffer.getChannelData(channel);
const data1 = audio1.getChannelData(channel);
const data2 = audio2.getChannelData(channel);
    let offset = 0;
    if (effectiveOverlap > 0) {
      // Copy audio1 data up to the start of the overlapping section.
      const nonOverlapLength = audio1.length - effectiveOverlap;
      outputData.set(data1.subarray(0, nonOverlapLength), offset);
      offset += nonOverlapLength;
      // Blend the overlapping region with a linear crossfade.
      for (let i = 0; i < effectiveOverlap; i++) {
        const fadeOut = 1 - i / effectiveOverlap;
        const fadeIn = i / effectiveOverlap;
        outputData[offset + i] =
          data1[nonOverlapLength + i] * fadeOut + data2[i] * fadeIn;
      }
      offset += effectiveOverlap;
      // Append the remaining audio2 data.
      outputData.set(data2.subarray(effectiveOverlap), offset);
    } else {
      // Concatenate: copy audio1, skip over the silence gap (the buffer is
      // zero-initialized, so silence is free), then copy audio2.
      outputData.set(data1, offset);
      offset += audio1.length + gapSamples;
      outputData.set(data2, offset);
    }
}
return newBuffer;
};
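// Example (a sketch): a positive gap inserts a pause, a negative gap crossfades:
//   const withPause = joinAudio(clipA, clipB, 0.5); // 0.5 s of silence between
//   const blended = joinAudio(clipA, clipB, -0.2); // 0.2 s linear crossfade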
////////////////////////////////////////
// Audio formatting utils
export const loadWavAndDecode = async (url: string): Promise<AudioBuffer> => {
  const response = await fetch(url);
  if (!response.ok) {
    throw new Error(`Failed to fetch audio (HTTP ${response.status})`);
  }
  const arrayBuffer = await response.arrayBuffer();
  const audioBuffer = await decodeAudio(arrayBuffer);
  return audioBuffer;
};
export function audioBufferToWav(
buffer: AudioBuffer,
options: { float32?: boolean } = {}
): ArrayBuffer {
const numChannels = buffer.numberOfChannels;
const sampleRate = buffer.sampleRate;
const format = options.float32 ? 3 : 1; // 3 = IEEE float, 1 = PCM
const bitDepth = options.float32 ? 32 : 16;
const numSamples = buffer.length;
const headerLength = 44;
const bytesPerSample = bitDepth / 8;
const dataLength = numSamples * numChannels * bytesPerSample;
const bufferLength = headerLength + dataLength;
const arrayBuffer = new ArrayBuffer(bufferLength);
const view = new DataView(arrayBuffer);
let offset = 0;
function writeString(str: string) {
for (let i = 0; i < str.length; i++) {
view.setUint8(offset, str.charCodeAt(i));
offset++;
}
}
// Write WAV header
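  // Canonical 44-byte layout: bytes 0-11 RIFF chunk ("RIFF", file size - 8,
  // "WAVE"); bytes 12-35 fmt chunk ("fmt ", size 16, format tag, channel
  // count, sample rate, byte rate, block align, bits per sample);
  // bytes 36-43 data chunk header ("data", payload length in bytes).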
writeString('RIFF');
view.setUint32(offset, 36 + dataLength, true);
offset += 4;
writeString('WAVE');
writeString('fmt ');
view.setUint32(offset, 16, true);
offset += 4;
view.setUint16(offset, format, true);
offset += 2;
view.setUint16(offset, numChannels, true);
offset += 2;
view.setUint32(offset, sampleRate, true);
offset += 4;
view.setUint32(offset, sampleRate * numChannels * bytesPerSample, true);
offset += 4;
view.setUint16(offset, numChannels * bytesPerSample, true);
offset += 2;
view.setUint16(offset, bitDepth, true);
offset += 2;
writeString('data');
view.setUint32(offset, dataLength, true);
offset += 4;
// Write PCM samples: interleave channels
const channels: Float32Array[] = [];
for (let i = 0; i < numChannels; i++) {
channels.push(buffer.getChannelData(i));
}
for (let i = 0; i < numSamples; i++) {
for (let channel = 0; channel < numChannels; channel++) {
let sample = channels[channel][i];
// Clamp the sample to [-1, 1]
sample = Math.max(-1, Math.min(1, sample));
if (options.float32) {
view.setFloat32(offset, sample, true);
offset += 4;
} else {
// Convert to 16-bit PCM sample
const intSample = sample < 0 ? sample * 0x8000 : sample * 0x7fff;
view.setInt16(offset, intSample, true);
offset += 2;
}
}
}
return arrayBuffer;
}
export const blobFromAudioBuffer = (audioBuffer: AudioBuffer): Blob => {
// Using 16-bit PCM for compatibility.
const wavArrayBuffer = audioBufferToWav(audioBuffer, { float32: false });
return new Blob([wavArrayBuffer], { type: 'audio/wav' });
};
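// Example (a sketch): turn the final buffer into a playable object URL
// (remember to URL.revokeObjectURL() once it is no longer needed):
//   const url = URL.createObjectURL(blobFromAudioBuffer(finalBuffer));
//   new Audio(url).play();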