File size: 8,229 Bytes
5536e5a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b1f991d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5536e5a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
import gradio as gr
from datasets import load_dataset, Dataset, Audio, concatenate_datasets
import json
import os
from datetime import datetime
import shutil

# Directory to save recordings
AUDIO_DIR = "data/audios"
# Default sampling rate (Hz) pre-filled in the "Sync to HuggingFace" UI.
SAMPLING_RATE = 16000
os.makedirs(AUDIO_DIR, exist_ok=True)

# Module-level application state shared by every event handler below.
# NOTE(review): a plain module-global dict is shared across ALL browser
# sessions of this Gradio app — confirm single-user usage is intended.
state = {
    "sentences": [],  # list of {"id": ..., "text": ...} entries to record
    "recordings": {},  # Dictionary to store recordings by ID
    "index": 0,  # Index for navigating through sentences
    "idx": 0,  # Index for sentences (IDs)
    "json_loaded": False

}

def load_json(file):
    """Load a list of {"id", "text"} entries from an uploaded JSON file.

    Extends the sentence list, registers an (empty) recording slot for each
    id, flags the app as JSON-driven, and returns the refreshed display tuple.
    """
    with open(file.name, "r", encoding="utf-8") as handle:
        entries = json.load(handle)
    state["sentences"].extend(entries)
    # Register a fresh (empty) take list for every loaded id.
    for entry in entries:
        state["recordings"][entry["id"]] = []
    state["json_loaded"] = True
    return update_display()

def update_display():
    """Build the 9-tuple of output values shared by every event handler.

    Order matters and must match the `outputs=[...]` lists wired in the
    Blocks section: (sentence_text, audio_input, id_display, progress_text,
    record_btn visibility, prev_btn visibility, next_btn visibility,
    audio_player value, audio_player visibility).
    """
    if not state["sentences"]:
        # Nothing loaded or recorded yet: clear fields and hide all controls.
        return "No data loaded.", None, "", "", gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False)
    
    idx = state["index"]
    progress = ""
    if state["json_loaded"]:
        if idx >= len(state["sentences"]):
            # Every uploaded sentence has been visited: auto-export and finish.
            export_json()
            return "✅ All sentences recorded!\n💾 Data Exported to Json", None, "", "", gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False)

        # NOTE: `progress` is reused — first as an int counter of sentences
        # with at least one take, then rebound to the label string.
        progress = 0
        for recordings in state["recordings"].values(): 
            if len(recordings) > 0: 
                progress += 1
        progress = f"{progress} / {len(state['sentences'])} recorded"
    
    
    # Enable/Disable buttons based on the current index
    next_btn_enabled = gr.update(visible= not (state["index"] == len(state["sentences"]) - 1))
    prev_btn_enabled = gr.update(visible= not (state["index"] == 0))
    
    recordings = []
    text = ""
    # Fallback id shown in free-form mode when index is past the end of the list.
    current_id = f"s_{state['idx']}"
    if idx < len(state["sentences"]):
        current = state["sentences"][idx]
        current_id = current['id']
        text = current["text"]
        recordings = state["recordings"].get(current["id"], [])
        
    if recordings:
        # Get the most recent recording for that sentence ID
        current_recording = recordings[-1]
        current_audio = current_recording["audio"]
        audio_visibility = gr.update(visible=True)
    else:
        current_audio = None
        audio_visibility = gr.update(visible=False)

    return text, None, f"ID: {current_id}", progress, gr.update(visible=True), prev_btn_enabled, next_btn_enabled, current_audio, audio_visibility

def record_audio(audio, text):
    """Persist one take for the current sentence and advance to the next.

    Copies the recorded file into AUDIO_DIR, stores a (versioned) record in
    state["recordings"], bumps the navigation index, and returns the refreshed
    display tuple.

    Parameters:
        audio: filepath of the recorded clip (or None when nothing recorded).
        text: current contents of the sentence textbox.
    """
    # BUGFIX: the guard must only apply in JSON mode. In free-form mode a new
    # sentence is appended below, so index == len(sentences) is the normal
    # state after every submit; the original `if state["sentences"] and ...`
    # guard blocked every free-form recording after the first one.
    if state["json_loaded"] and state["index"] >= len(state["sentences"]):
        return update_display()

    if audio is None:
        gr.Warning("The audio is empty, please provide a valid audio")
        return update_display()

    if state["json_loaded"]:
        # Keep any edits the user made to the displayed sentence text.
        state["sentences"][state["index"]]["text"] = text
    else:
        # Free-form mode: create a new sentence entry with a synthetic id.
        state["sentences"].append({"id": f"s_{state['idx']}", "text": text})
        state["idx"] += 1

    sentence = state["sentences"][state["index"]]
    uid = sentence["id"]

    # %f (microseconds) prevents two takes submitted within the same second
    # from overwriting the same .wav file.
    filename = f"{uid}_{datetime.now().strftime('%Y%m%d%H%M%S%f')}.wav"
    filepath = os.path.join(AUDIO_DIR, filename)
    shutil.copy(audio, filepath)

    # Append the take under its sentence id; later takes get a versioned id.
    recordings = state["recordings"].setdefault(uid, [])
    uid_versioning = f"{uid}_v{len(recordings)}" if recordings else uid
    recordings.append({
        "id": uid_versioning,
        "text": sentence["text"],
        "audio": filepath
    })
    state["index"] += 1
    return update_display()

def export_json():
    """Flatten all recorded takes into data/tts_dataset.json.

    Returns the output path (even when nothing was written, so the wired
    gr.File output keeps receiving a value; a warning is shown in that case).
    """
    output_path = "data/tts_dataset.json"
    data = [record for records in state["recordings"].values() for record in records]
    if data: 
        # BUGFIX: write UTF-8 explicitly and keep non-ASCII sentence text
        # readable instead of \uXXXX-escaped (the original used the platform
        # default encoding and ensure_ascii=True).
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
    else:
        gr.Warning("There is no recorded data")
    return output_path

def go_previous():
    """Step back one sentence; no-op when already at the first one."""
    state["index"] = max(0, state["index"] - 1)
    return update_display()

def go_next():
    """Advance one sentence; no-op when already at the last one."""
    last_position = len(state["sentences"]) - 1
    if state["index"] < last_position:
        state["index"] += 1
    return update_display()
def push_to_hub(hub_id, is_new_dataset, sampling_rate=SAMPLING_RATE):
    """Push all recorded takes to the HuggingFace Hub as an audio dataset.

    Parameters:
        hub_id: target dataset repo id; warns and aborts when empty.
        is_new_dataset: when False, the existing `train` split is downloaded
            and the new takes are concatenated onto it before pushing.
        sampling_rate: target rate for the Audio column. BUGFIX: now has a
            default — the `push_to_hub_btn.click(...)` wiring passes only
            [hub_id, is_new_dataset], which raised TypeError with the old
            three-required-args signature.
    """
    if not hub_id:
        gr.Warning("The hub_id field is empty, please provide a relevant hub id.")
        return update_display()

    # Flatten {id: [takes, ...]} into one list of records.
    recordings = [
        {"id": version["id"], "audio": version["audio"], "text": version["text"]}
        for takes in state["recordings"].values()
        for version in takes
    ]

    dataset = Dataset.from_list(recordings)
    dataset = dataset.cast_column("audio", Audio(sampling_rate=sampling_rate))
    if not is_new_dataset:
        previous_dataset = load_dataset(hub_id, split="train")
        dataset = concatenate_datasets([previous_dataset, dataset])
    dataset.push_to_hub(hub_id)
    gr.Info("Successfully synched with the hub")  # typo fix: "Succesfully"
    return update_display()
# UI layout and event wiring. All handlers return the same 9-tuple produced
# by update_display(), so every .click/.change shares the same outputs list.
with gr.Blocks() as demo:
    gr.Markdown("""# 🗣️ TTS Dataset Recorder

Welcome to the **TTS Dataset Recorder**! This tool helps you quickly create a high-quality dataset for Text-to-Speech (TTS) models. Whether you're starting from scratch or have a pre-existing set of text data, this app lets you record audio samples and export them with the corresponding metadata.

### **How to Use?**
1. **Upload a JSON File** containing the sentences you'd like to record (or manually input them through the app).
2. **Record Audio** for each sentence. The app will automatically associate your recordings with the correct text.
3. **Export the Dataset** as a JSON file or **Sync** to HuggingFace for easy sharing and use.

### **Data Input Format**
Your JSON file should follow this structure:
```json
[
    { "id": "001", "text": "Hello, how are you?" },
    { "id": "002", "text": "This is a sample sentence." }
]
```
                """)

    with gr.Row():
        json_file = gr.File(label="Upload Sentences JSON", file_types=[".json"])
        with gr.Column():
            export_btn = gr.Button("💾 Export Metadata")
            with gr.Row():
                hub_id = gr.Textbox(label="Hub id", interactive=True)
                with gr.Row():
                    is_new_dataset = gr.Checkbox(label="New dataset", interactive=True)
                    sampling_rate = gr.Number(label="Sampling rate", value=SAMPLING_RATE, precision=0)
            push_to_hub_btn = gr.Button("🤗 Sync to HuggingFace")

    id_display = gr.Textbox(label="ID", interactive=False)
    progress_text = gr.Textbox(label="Progress", interactive=False)
    sentence_text = gr.Textbox(label="Sentence", interactive=True)
    audio_input = gr.Audio(type="filepath", label="Record your voice", interactive=True)
    record_btn = gr.Button("✅ Submit Recording")

    with gr.Row():
        prev_btn = gr.Button("⬅️ Previous")
        next_btn = gr.Button("➡️ Next")

    audio_player = gr.Audio(label="Play Recorded Audio", type="filepath")

    # audio_player appears twice on purpose: once for its value, once for its
    # visibility update (update_display returns both).
    json_file.change(load_json, inputs=json_file, outputs=[sentence_text, audio_input, id_display, progress_text, record_btn, prev_btn, next_btn, audio_player, audio_player])
    record_btn.click(record_audio, inputs=[audio_input, sentence_text], outputs=[sentence_text, audio_input, id_display, progress_text, record_btn, prev_btn, next_btn, audio_player, audio_player])
    export_btn.click(export_json, outputs=gr.File())

    prev_btn.click(go_previous, outputs=[sentence_text, audio_input, id_display, progress_text, record_btn, prev_btn, next_btn, audio_player, audio_player])
    next_btn.click(go_next, outputs=[sentence_text, audio_input, id_display, progress_text, record_btn, prev_btn, next_btn, audio_player, audio_player])

    # BUGFIX: pass the sampling_rate Number as the third input — it was
    # created above but never wired, so push_to_hub raised TypeError
    # (missing required positional argument).
    push_to_hub_btn.click(push_to_hub, inputs=[hub_id, is_new_dataset, sampling_rate], outputs=[sentence_text, audio_input, id_display, progress_text, record_btn, prev_btn, next_btn, audio_player, audio_player])

demo.launch()