ybendou's picture
Adding documentation
b1f991d verified
raw
history blame contribute delete
8.23 kB
import gradio as gr
from datasets import load_dataset, Dataset, Audio, concatenate_datasets
import json
import os
from datetime import datetime
import shutil
# Directory to save recordings
AUDIO_DIR = "data/audios"
SAMPLING_RATE = 16000  # default sampling rate (Hz) offered for Hub export
os.makedirs(AUDIO_DIR, exist_ok=True)
# Mutable module-level state shared by all Gradio callbacks (single-session app).
state = {
    "sentences": [],   # list of {"id": ..., "text": ...} dicts, in recording order
    "recordings": {},  # Dictionary to store recordings by ID (sentence id -> list of takes)
    "index": 0,        # Index for navigating through sentences
    "idx": 0,          # Index for sentences (IDs) minted in manual (no-JSON) mode
    "json_loaded": False  # True once a sentences JSON file has been uploaded
}
def load_json(file):
    """Load sentences from an uploaded JSON file into the shared state.

    Appends the file's entries to the sentence list, registers an empty
    take-list for each sentence id, then refreshes the UI.
    """
    with open(file.name, "r", encoding="utf-8") as fp:
        entries = json.load(fp)
    state["sentences"].extend(entries)
    # Each sentence id starts with an empty list of recorded takes.
    for entry in entries:
        state["recordings"][entry["id"]] = []
    state["json_loaded"] = True
    return update_display()
def update_display():
    """Compute the 9-tuple of component updates reflecting the current state.

    Returns, in the order wired to every event handler:
    (sentence text, audio-input value, id label, progress label,
     record-button visibility, prev-button visibility, next-button visibility,
     audio-player value, audio-player visibility).
    """
    if not state["sentences"]:
        return ("No data loaded.", None, "", "",
                gr.update(visible=False), gr.update(visible=False),
                gr.update(visible=False), gr.update(visible=False),
                gr.update(visible=False))
    idx = state["index"]
    progress = ""
    if state["json_loaded"]:
        if idx >= len(state["sentences"]):
            # Every sentence has been visited: persist the dataset and
            # show a completion banner with all controls hidden.
            export_json()
            return ("✅ All sentences recorded!\n💾 Data Exported to Json", None, "", "",
                    gr.update(visible=False), gr.update(visible=False),
                    gr.update(visible=False), gr.update(visible=False),
                    gr.update(visible=False))
        # Count sentence ids that have at least one take. (The old code
        # reused `progress` as both int and str and shadowed the name
        # `recordings` below; this keeps one name per meaning.)
        done = sum(1 for takes in state["recordings"].values() if takes)
        progress = f"{done} / {len(state['sentences'])} recorded"
    # Hide navigation buttons at the ends of the sentence list.
    next_btn_enabled = gr.update(visible=state["index"] != len(state["sentences"]) - 1)
    prev_btn_enabled = gr.update(visible=state["index"] != 0)
    takes = []
    text = ""
    current_id = f"s_{state['idx']}"
    if idx < len(state["sentences"]):
        current = state["sentences"][idx]
        current_id = current["id"]
        text = current["text"]
        takes = state["recordings"].get(current["id"], [])
    if takes:
        # Show the most recent take recorded for this sentence id.
        current_audio = takes[-1]["audio"]
        audio_visibility = gr.update(visible=True)
    else:
        current_audio = None
        audio_visibility = gr.update(visible=False)
    return (text, None, f"ID: {current_id}", progress,
            gr.update(visible=True), prev_btn_enabled, next_btn_enabled,
            current_audio, audio_visibility)
def record_audio(audio, text):
    """Save one take for the current sentence and advance to the next one.

    Copies the recorded file into AUDIO_DIR, stores a (possibly versioned)
    entry under the sentence id, and returns the refreshed display tuple.
    """
    # Past the end of a loaded sentence list: nothing left to record.
    if state["sentences"] and state["index"] >= len(state["sentences"]):
        return update_display()
    if audio is None:
        gr.Warning("The audio is empty, please provide a valid audio")
        return update_display()
    if state["json_loaded"]:
        # Keep any edits the user made to the displayed sentence text.
        state["sentences"][state["index"]]["text"] = text
    else:
        # Manual mode: mint a fresh sequential id for the typed sentence.
        state["sentences"].append({"id": f"s_{state['idx']}", "text": text})
        state["idx"] += 1
    sentence = state["sentences"][state["index"]]
    uid = sentence["id"]
    stamp = datetime.now().strftime('%Y%m%d%H%M%S')
    destination = os.path.join(AUDIO_DIR, f"{uid}_{stamp}.wav")
    shutil.copy(audio, destination)
    # Re-recordings of the same sentence get versioned ids (uid_v1, uid_v2, ...);
    # the first take keeps the plain uid.
    takes = state["recordings"].setdefault(uid, [])
    versioned_id = f"{uid}_v{len(takes)}" if takes else uid
    takes.append({
        "id": versioned_id,
        "text": sentence["text"],
        "audio": destination
    })
    state["index"] += 1
    return update_display()
def export_json():
    """Flatten all recorded takes into data/tts_dataset.json.

    Returns the output path unconditionally so the UI's File output always
    receives a value; warns (without writing) when nothing was recorded.
    """
    output_path = "data/tts_dataset.json"
    data = [record for records in state["recordings"].values() for record in records]
    if data:
        # Explicit utf-8 avoids the platform-dependent default encoding, and
        # ensure_ascii=False keeps non-Latin sentence text human-readable —
        # both matter for a TTS dataset's transcripts.
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
    else:
        gr.Warning("There is no recorded data")
    return output_path
def go_previous():
    """Step back one sentence, clamped at the first one."""
    state["index"] = max(state["index"] - 1, 0)
    return update_display()
def go_next():
    """Advance one sentence (no-op when already at the last one)."""
    last_position = len(state["sentences"]) - 1
    if state["index"] < last_position:
        state["index"] += 1
    return update_display()
def push_to_hub(hub_id, is_new_dataset, sampling_rate=SAMPLING_RATE):
    """Push every recorded take to the Hugging Face Hub as an audio dataset.

    Args:
        hub_id: target repo id; warns and does nothing if empty.
        is_new_dataset: when False, the new rows are appended to the repo's
            existing "train" split instead of replacing it.
        sampling_rate: rate used to cast the "audio" column. Defaults to
            SAMPLING_RATE — bug fix: the UI click handler only wires two
            inputs, so without a default the callback raised TypeError.
    """
    if hub_id:
        # Flatten the per-id take lists into one dataset row per take.
        rows = [
            {"id": take["id"], "audio": take["audio"], "text": take["text"]}
            for takes in state["recordings"].values()
            for take in takes
        ]
        dataset = Dataset.from_list(rows)
        dataset = dataset.cast_column("audio", Audio(sampling_rate=sampling_rate))
        if not is_new_dataset:
            # Append to the already-published train split.
            previous_dataset = load_dataset(hub_id, split="train")
            dataset = concatenate_datasets([previous_dataset, dataset])
        dataset.push_to_hub(hub_id)
        gr.Info("Succesfully synched with the hub")
    else:
        gr.Warning("The hub_id field is empty, please provide a relevant hub id.")
    return update_display()
# --- UI layout and event wiring ---
with gr.Blocks() as demo:
    # NOTE: the ```json fence below was previously unclosed, which broke the
    # markdown rendering of everything after it.
    gr.Markdown("""# 🗣️ TTS Dataset Recorder
Welcome to the **TTS Dataset Recorder**! This tool helps you quickly create a high-quality dataset for Text-to-Speech (TTS) models. Whether you're starting from scratch or have a pre-existing set of text data, this app lets you record audio samples and export them with the corresponding metadata.
### **How to Use?**
1. **Upload a JSON File** containing the sentences you'd like to record (or manually input them through the app).
2. **Record Audio** for each sentence. The app will automatically associate your recordings with the correct text.
3. **Export the Dataset** as a JSON file or **Sync** to HuggingFace for easy sharing and use.
### **Data Input Format**
Your JSON file should follow this structure:
```json
[
{ "id": "001", "text": "Hello, how are you?" },
{ "id": "002", "text": "This is a sample sentence." }
]
```
""")
    with gr.Row():
        json_file = gr.File(label="Upload Sentences JSON", file_types=[".json"])
        with gr.Column():
            export_btn = gr.Button("💾 Export Metadata")
            with gr.Row():
                hub_id = gr.Textbox(label="Hub id", interactive=True)
            with gr.Row():
                is_new_dataset = gr.Checkbox(label="New dataset", interactive=True)
                sampling_rate = gr.Number(label="Sampling rate", value=SAMPLING_RATE, precision=0)
            push_to_hub_btn = gr.Button("🤗 Sync to HuggingFace")
    id_display = gr.Textbox(label="ID", interactive=False)
    progress_text = gr.Textbox(label="Progress", interactive=False)
    sentence_text = gr.Textbox(label="Sentence", interactive=True)
    audio_input = gr.Audio(type="filepath", label="Record your voice", interactive=True)
    record_btn = gr.Button("✅ Submit Recording")
    with gr.Row():
        prev_btn = gr.Button("⬅️ Previous")
        next_btn = gr.Button("➡️ Next")
    audio_player = gr.Audio(label="Play Recorded Audio", type="filepath")
    # Every handler returns update_display()'s 9-tuple, so all of them share
    # one outputs list. audio_player appears twice: once for its value and
    # once for its visibility update.
    display_outputs = [sentence_text, audio_input, id_display, progress_text,
                       record_btn, prev_btn, next_btn, audio_player, audio_player]
    json_file.change(load_json, inputs=json_file, outputs=display_outputs)
    record_btn.click(record_audio, inputs=[audio_input, sentence_text], outputs=display_outputs)
    export_btn.click(export_json, outputs=gr.File())
    prev_btn.click(go_previous, outputs=display_outputs)
    next_btn.click(go_next, outputs=display_outputs)
    # Bug fix: sampling_rate was missing from inputs, so push_to_hub() was
    # invoked with only two of its three positional arguments (TypeError).
    push_to_hub_btn.click(push_to_hub,
                          inputs=[hub_id, is_new_dataset, sampling_rate],
                          outputs=display_outputs)
demo.launch()