{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "name": "DefinedCrowd x NeMo - ASR Training Tutorial", "provenance": [], "collapsed_sections": [], "toc_visible": true }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "accelerator": "GPU", "widgets": { "application/vnd.jupyter.widget-state+json": { "9a92e517bd634016ad9ba293436f8080": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "state": { "_view_name": "HBoxView", "_dom_classes": [], "_model_name": "HBoxModel", "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.5.0", "box_style": "", "layout": "IPY_MODEL_a7b8d4270e2c4373be43941e4df13ad3", "_model_module": "@jupyter-widgets/controls", "children": [ "IPY_MODEL_1d71646d5d414a34890d38ce64b9f73c", "IPY_MODEL_ca56619da74c4dfaa4df3831d93b2f58" ] } }, "a7b8d4270e2c4373be43941e4df13ad3": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": "row wrap", "width": "100%", "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": "inline-flex", "left": null } }, "1d71646d5d414a34890d38ce64b9f73c": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "state": { "_view_name": "ProgressView", "style": "IPY_MODEL_4bdab014ee6241848b562d74d56cd878", "_dom_classes": [], "description": "Validation sanity check: ", "_model_name": "FloatProgressModel", "bar_style": "info", "max": 1, "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "value": 0, "_view_count": null, "_view_module_version": "1.5.0", "orientation": "horizontal", "min": 0, "description_tooltip": null, "_model_module": "@jupyter-widgets/controls", "layout": "IPY_MODEL_bbcb1cf8bd394fa3a2b2da8c858a5521" } }, "ca56619da74c4dfaa4df3831d93b2f58": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "state": { "_view_name": "HTMLView", "style": "IPY_MODEL_95e7ab0ae4ea4ce680ded066d204afd5", "_dom_classes": [], "description": "", "_model_name": "HTMLModel", "placeholder": "​", "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "value": " 0/0 [00:00<?, ?it/s]", "_view_count": null, "_view_module_version": "1.5.0", "description_tooltip": null, "_model_module": "@jupyter-widgets/controls", "layout": "IPY_MODEL_6221c0a4048443d69f750d6c4ee8827d" } }, "4bdab014ee6241848b562d74d56cd878": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "state": { "_view_name": "StyleView", "_model_name": "ProgressStyleModel", "description_width": "initial", "_view_module": "@jupyter-widgets/base", 
"_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.2.0", "bar_color": null, "_model_module": "@jupyter-widgets/controls" } }, "bbcb1cf8bd394fa3a2b2da8c858a5521": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": "2", "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "95e7ab0ae4ea4ce680ded066d204afd5": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "state": { "_view_name": "StyleView", "_model_name": "DescriptionStyleModel", "description_width": "", "_view_module": "@jupyter-widgets/base", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.2.0", "_model_module": "@jupyter-widgets/controls" } }, "6221c0a4048443d69f750d6c4ee8827d": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "a61f308ce3044b2bb6bb69f72a59544f": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "state": { "_view_name": "HBoxView", "_dom_classes": [], "_model_name": "HBoxModel", "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.5.0", "box_style": "", "layout": "IPY_MODEL_9d8f022b712d41e3ba32c1c120ed44a7", "_model_module": "@jupyter-widgets/controls", "children": [ "IPY_MODEL_6a7d8bb3c4c444fa88f7e6edaa40ce4f", "IPY_MODEL_3dd0c629c3ea4371bbe48cff9829676d" ] } }, "9d8f022b712d41e3ba32c1c120ed44a7": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, 
"_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": "row wrap", "width": "100%", "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": "inline-flex", "left": null } }, "6a7d8bb3c4c444fa88f7e6edaa40ce4f": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "state": { "_view_name": "ProgressView", "style": "IPY_MODEL_ff18fa96c2cc42c38fab6692eb427670", "_dom_classes": [], "description": "Epoch 0: 100%", "_model_name": "FloatProgressModel", "bar_style": "success", "max": 5610, "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "value": 5610, "_view_count": null, "_view_module_version": "1.5.0", "orientation": "horizontal", "min": 0, "description_tooltip": null, "_model_module": "@jupyter-widgets/controls", "layout": "IPY_MODEL_f3a648b6aa3c4f95be08fc0a24e01b27" } }, "3dd0c629c3ea4371bbe48cff9829676d": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "state": { "_view_name": "HTMLView", "style": "IPY_MODEL_219cbf3d1c714971810d796781bd0cd2", "_dom_classes": [], "description": "", "_model_name": "HTMLModel", "placeholder": "​", "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "value": " 5610/5610 [46:36<00:00, 2.01it/s, loss=16.416, v_num=1]", "_view_count": null, "_view_module_version": "1.5.0", "description_tooltip": null, "_model_module": "@jupyter-widgets/controls", "layout": "IPY_MODEL_348dda7188ca469e90dccdc3e418eed5" } }, "ff18fa96c2cc42c38fab6692eb427670": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "state": { "_view_name": "StyleView", "_model_name": "ProgressStyleModel", "description_width": "initial", "_view_module": "@jupyter-widgets/base", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.2.0", "bar_color": null, "_model_module": "@jupyter-widgets/controls" } }, "f3a648b6aa3c4f95be08fc0a24e01b27": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": "2", "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, 
"_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "219cbf3d1c714971810d796781bd0cd2": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "state": { "_view_name": "StyleView", "_model_name": "DescriptionStyleModel", "description_width": "", "_view_module": "@jupyter-widgets/base", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.2.0", "_model_module": "@jupyter-widgets/controls" } }, "348dda7188ca469e90dccdc3e418eed5": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } } } } }, "cells": [ { "cell_type": "markdown", "metadata": { "id": "oSHYTQI9sVF-" }, "source": [ "# DefinedCrowd x NeMo - ASR Training" ] }, { "cell_type": "markdown", "metadata": { "id": "HmH3hc9TpSg_" }, "source": [ "DefinedCrowd’s core business is providing **high quality AI training data** to our customers. Our workflows can serve as standalone or end-to-end data services to build any Speech-or-Text-enabled AI architecture from scratch, to improve solutions already developed, or to evaluate models in production, all with the DefinedCrowd Quality Guarantee.\n", "\n", "NVIDIA NeMo is a toolkit built by NVIDIA for **creating conversational AI applications**. This toolkit includes collections of pre-trained modules for **Automatic Speech Recognition (ASR)**, Natural Language Processing (NLP), and Texto-to-Speech (TTS), enabling researchers and data scientists to easily compose complex neural network architectures and focus on designing their applications.\n", "\n", "In this tutorial, we want to demonstrate how to **connect DefinedCrowd Speech Workflows** to **train and improve an ASR model** using NVIDIA NeMo. The tutorial re-uses parts of a previous [ASR tutorial from NeMo](https://colab.research.google.com/github/NVIDIA/NeMo/blob/stable/tutorials/asr/ASR_with_NeMo.ipynb)." 
] }, { "cell_type": "code", "metadata": { "id": "qf11ZZoZt8Rk" }, "source": [ "# First, let's install NeMo Toolkit and dependencies to run this notebook\n", "!apt-get install -y libsndfile1 ffmpeg\n", "!pip install Cython\n", "\n", "## Install NeMo dependencies in the correct versions\n", "!pip install torchtext==0.11.0 torch==1.10.0 pytorch-lightning==1.5.0\n", "\n", "## Install NeMo\n", "!python -m pip install nemo_toolkit[all]==1.0.0b3" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "ckir-JG-X04r" }, "source": [ "## Obtaining data using DefinedCrowd API \n", "\n", "In this section, we are going to demonstrate how to connect to DefinedCrowd API in order to obtain speech collected data.\n", "\n", "\n", "For more information, visit https://developers.definedcrowd.com/" ] }, { "cell_type": "code", "metadata": { "id": "etSrLwqrH_jF" }, "source": [ "# For the demo, we will be using a sandbox environment\n", "auth_url = \"https://sandbox-auth.definedcrowd.com\"\n", "api_url = \"https://sandbox-api.definedcrowd.com\"" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "dMoEYQ1l82p6" }, "source": [ "# These variables should be obtained at the DefinedCrowd Enterprise Portal for your account.\n", "client_id = \"\"\n", "client_secret = \"\"\n", "project_id = \"\"" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "9k0qVrDjBrgp" }, "source": [ "### Authentication" ] }, { "cell_type": "code", "metadata": { "id": "sgC3KPW6sIFh", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "0b360862-ffc8-4101-f07a-2beaef1a481b" }, "source": [ "import requests, json\n", "\n", "payload = {\n", " \"client_id\": client_id,\n", " \"client_secret\": client_secret,\n", " \"grant_type\": \"client_credentials\",\n", " \"scope\": \"PublicAPIv2\",\n", "}\n", "files = []\n", "headers = {}\n", "\n", "# request the Auth 2.0 access token\n", "response = requests.request(\n", " \"POST\", f\"{auth_url}/connect/token\", headers=headers, data=payload, files=files\n", ")\n", "if response.status_code == 200:\n", " print(\"Authentication Success!\")\n", " access_token = response.json()[\"access_token\"]\n", "else:\n", " print(\"Authentication Failed\")" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "Authentication Success!\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "GRwnRikLBxRB" }, "source": [ "### Get a list of deliverables" ] }, { "cell_type": "code", "metadata": { "id": "Sabeg4eI8ZUz", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "91572157-ff0a-4b3e-c5f7-0b370a330f3c" }, "source": [ "# GET /projects/{project-id}/deliverables\n", "headers = {\"Authorization\": \"Bearer \" + access_token}\n", "response = requests.request(\n", " \"GET\", f\"{api_url}/projects/{project_id}/deliverables\", headers=headers\n", ")\n", "\n", "if response.status_code == 200:\n", " # Pretty print the response\n", " print(json.dumps(response.json(), indent=4))\n", "\n", " # Get the first deliverable id\n", " deliverable_id = response.json()[0][\"id\"]" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "[\n", " {\n", " \"projectId\": \"eb324e45-c4f9-41e7-b5cf-655aa693ae75\",\n", " \"id\": \"258f9e15-2937-4846-b9c3-3ae1164b7364\",\n", " \"type\": \"Flat\",\n", " \"fileName\": \"data_Flat_eb324e45-c4f9-41e7-b5cf-655aa693ae75_258f9e15-2937-4846-b9c3-3ae1164b7364_2021-03-22-14-34-37.zip\",\n", " \"createdTimestamp\": 
\"2021-03-22T14:34:37.8037259\",\n", " \"isPartial\": false,\n", " \"downloadCount\": 2,\n", " \"status\": \"Downloaded\"\n", " }\n", "]\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "wIDInh7gCNFl" }, "source": [ "### Download the final deliverable for a speech data collection" ] }, { "cell_type": "code", "metadata": { "id": "36Mgnm7MBo4-", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "78748e7c-87b4-454a-e148-f03d3d92a13c" }, "source": [ "# the name I want to give to my deliverable file\n", "filename = \"scripted_monologue_en_GB.zip\"\n", "\n", "# GET /projects/{project-id}/deliverables/{deliverable-id}/download\n", "headers = {\"Authorization\": \"Bearer \" + access_token}\n", "response = requests.request(\n", " \"GET\",\n", " f\"{api_url}/projects/{project_id}/deliverables/{deliverable_id}/download/\",\n", " headers=headers,\n", ")\n", "\n", "if response.status_code == 200:\n", " # save the deliverable file\n", " with open(filename, \"wb\") as fp:\n", " fp.write(response.content)\n", " print(\"Deliverable file saved with success!\")" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "Deliverable file saved with success!\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "IDco1p_0jfOc" }, "source": [ "!unzip scripted_monologue_en_GB.zip &> /dev/null\n", "!rm -f en-gb_single-scripted_Dataset.zip" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "QBnZJyyGw_Qg" }, "source": [ "## Speech Dataset\n", "\n", "In this section, we are going to analyse the data got from DefinedCrowd. The data is built of scripted speech data collected by the DefinedCrowd Neevo platform from several speakers in the UK (crowd members from DefinedCrowd).\n", "\n", "Each row of the dataset contains information about the speech prompt, crowd member, device used, and the recording. The data we find this this delivery is:\n", "\n", "**Recording**:\n", "* RecordingId\n", "* PromptId\n", "* Prompt\n", "\n", "**Audio File**:\n", "* RelativeFileName\n", "* Duration\n", "* SampleRate\n", "* BitDepth\n", "* AudioCommunicationBand\n", "* RecordingEnvironment\n", "\n", "**Crowd Member**:\n", "* SpeakerId\n", "* Gender\n", "* Age\n", "* Accent\n", "* LivingCountry\n", "\n", "**Recording Device**:\n", "* Manufacturer\n", "* DeviceType\n", "* Domain\n", "\n", "This data can be used for multiple purposes, but in this tutorial, we are going to use it for improving an existent ASR model for British speakers." ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 634 }, "id": "fqhRvz03R9V3", "outputId": "6237250f-fdbf-4ead-9be0-16593143b2eb" }, "source": [ "import pandas as pd\n", "\n", "# let's look into the metadata file\n", "dataset = pd.read_csv(\"metadata.tsv\", sep=\"\\t\", index_col=[0])\n", "dataset.head(10)" ], "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
RecordingIdPromptIdRelativeFileNamePromptDurationSpeakerIdGenderAgeManufacturerDeviceTypeAccentDomainSampleRateBitDepthAudioCommunicationBandLivingCountryNativeRecordingEnvironment
016555962864977250Audio/165559628.wavThe Avengers' extinction.00:00:02.815128209Female26AppleiPhone 6sSuffolkgeneric1600016BroadbandUnited KingdomTruesilent
116539652964940978Audio/165396529.wavand smile in pictures to make everyone feel safe?00:00:05.240422843Female31motorolamoto g(6)Hertfordshiregeneric1600016BroadbandUnited KingdomTruesilent
216546609064962327Audio/165466090.wav- (GUNSHOT) - (GROANS)00:00:03.560458727Male53XiaomiMi MIX 3 5GWest Sussexgeneric1600016BroadbandUnited KingdomTruesilent
316545060364958468Audio/165450603.wavThey had us dead to rights.00:00:02.621478075Female21AppleiPhone 6sWorcestershiregeneric1600016BroadbandUnited KingdomTruesilent
416545404264959449Audio/165454042.wavThe war is happening.00:00:03.960477240Male30samsungSM-G975FEssexgeneric1600016BroadbandUnited KingdomTruesilent
516549331964967271Audio/165493319.wavFeel her heart beat.00:00:03.200480713Male31HMD GlobalTA-1012Norfolkgeneric1600016BroadbandUnited KingdomTruesilent
616584540065000410Audio/165845400.wavIndian Ocean(Kerguelen Plateau)00:00:03.503432925Female69AppleiPhone XRScottish Borders, Thegeneric1600016BroadbandUnited KingdomTruesilent
716543502564954084Audio/165435025.wavHe's been forgotten.00:00:01.968478075Female21AppleiPhone 6sWorcestershiregeneric1600016BroadbandUnited KingdomTruesilent
816547437464963765Audio/165474374.wavand travel hundreds of miles00:00:03.711434058Female32AppleiPhone 6sCumbriageneric1600016BroadbandUnited KingdomTruesilent
916577088264995117Audio/165770882.wavsummoned to prove their noble family lines.00:00:03.839480713Male31HMD GlobalTA-1012Norfolkgeneric1600016BroadbandUnited KingdomTruesilent
\n", "
" ], "text/plain": [ " RecordingId PromptId ... Native RecordingEnvironment\n", "0 165559628 64977250 ... True silent\n", "1 165396529 64940978 ... True silent\n", "2 165466090 64962327 ... True silent\n", "3 165450603 64958468 ... True silent\n", "4 165454042 64959449 ... True silent\n", "5 165493319 64967271 ... True silent\n", "6 165845400 65000410 ... True silent\n", "7 165435025 64954084 ... True silent\n", "8 165474374 64963765 ... True silent\n", "9 165770882 64995117 ... True silent\n", "\n", "[10 rows x 18 columns]" ] }, "metadata": { "tags": [] }, "execution_count": 36 } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "wDxPEuJGSKTb", "outputId": "afed8dc4-c2f5-4cfe-c468-72ace1ed4f79" }, "source": [ "# Let's check the data for the first row\n", "dataset.iloc[0]" ], "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "RecordingId 165559628\n", "PromptId 64977250\n", "RelativeFileName Audio/165559628.wav\n", "Prompt The Avengers' extinction.\n", "Duration 00:00:02.815\n", "SpeakerId 128209\n", "Gender Female\n", "Age 26\n", "Manufacturer Apple\n", "DeviceType iPhone 6s\n", "Accent Suffolk\n", "Domain generic\n", "SampleRate 16000\n", "BitDepth 16\n", "AudioCommunicationBand Broadband\n", "LivingCountry United Kingdom\n", "Native True\n", "RecordingEnvironment silent\n", "Name: 0, dtype: object" ] }, "metadata": { "tags": [] }, "execution_count": 8 } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Uor7Ar5fRZrM", "outputId": "4885abc7-1476-4175-8e89-5a393d02ffef" }, "source": [ "# How many rows do I have?\n", "len(dataset)" ], "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "50000" ] }, "metadata": { "tags": [] }, "execution_count": 9 } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 317 }, "id": "gRnVerubR2p_", "outputId": "da3ac4e1-b234-4ce9-8957-a8f3a3b31fe3" }, "source": [ "# Let's check some examples from our dataset\n", "import librosa\n", "import IPython.display as ipd\n", "\n", "for index, row in dataset.sample(4, random_state=1).iterrows():\n", "\n", " print(f\"Prompt: {dataset.iloc[index].Prompt}\")\n", " audio_file = dataset.iloc[index].RelativeFileName\n", "\n", " # Load and listen to the audio file\n", " audio, sample_rate = librosa.load(audio_file)\n", " ipd.display(ipd.Audio(audio, rate=sample_rate))" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "Prompt: You got to be kidding me.\n" ], "name": "stdout" }, { "output_type": "display_data", "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "metadata": { "tags": [] } }, { "output_type": "stream", "text": [ "Prompt: waiting for you to finish knocking three times.\n" ], "name": "stdout" }, { "output_type": "display_data", "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "metadata": { "tags": [] } }, { "output_type": "stream", "text": [ "Prompt: And let me know if you get a hit on that malware.\n" ], "name": "stdout" }, { "output_type": "display_data", "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "metadata": { "tags": [] } }, { "output_type": "stream", "text": [ "Prompt: She had more reason than anyone in the Seven Kingdoms.\n" ], "name": "stdout" }, { "output_type": "display_data", "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "metadata": { "tags": [] } } ] 
}, { "cell_type": "markdown", "metadata": { "id": "Di2rjANlT_CA" }, "source": [ "## Data Preparation\n", "\n", "After downloading the speech data from DefinedCrowd API, we need to adapt it for the format expected by NeMo for ASR training. For this, we need to create manifests for our training and evaluation data, including each audio file's metadata.\n", "\n", "NeMo requires that we adapt our data to a [particular manifest format](https://github.com/NVIDIA/NeMo/blob/ebade85f6d10319ef59312cb2eefcba4fd298a3d/nemo/collections/asr/parts/manifest.py#L39). Each line corresponding to one audio sample, so the line count equals the number of samples represented by the manifest. A line must contain the path to an audio file, the corresponding transcript, and the audio sample duration. For example, here is what one line might look like in a NeMo-compatible manifest:\n", "```\n", "{\"audio_filepath\": \"path/to/audio.wav\", \"duration\": 3.45, \"text\": \"this is a nemo tutorial\"}\n", "```\n", "\n", "For the creation of the manifest, we will also standardize the transcripts. " ] }, { "cell_type": "code", "metadata": { "id": "nJZl_4IiVcqH" }, "source": [ "import os\n", "\n", "# Function to build a manifest\n", "def build_manifest(dataframe, manifest_path):\n", " with open(manifest_path, \"w\") as fout:\n", " for index, row in dataframe.iterrows():\n", " transcript = row[\"Prompt\"]\n", "\n", " # Our model will use lowercased data for training/testing\n", " transcript = transcript.lower()\n", "\n", " # Removing linguistic marks (they are not necessary for this demo)\n", " transcript = (\n", " transcript.replace(\"\", \"\")\n", " .replace(\"\", \"\")\n", " .replace(\"[b_s/]\", \"\")\n", " .replace(\"[uni/]\", \"\")\n", " .replace(\"[v_n/]\", \"\")\n", " .replace(\"[filler/]\", \"\")\n", " .replace('\"', \"\")\n", " .replace(\"[n_s/]\", \"\")\n", " )\n", "\n", " audio_path = row[\"RelativeFileName\"]\n", "\n", " # Get the audio duration\n", " try:\n", " duration = librosa.core.get_duration(filename=audio_path)\n", " except Exception as e:\n", " print(\"An error occurred: \", e)\n", "\n", " if os.path.exists(audio_path):\n", " # Write the metadata to the manifest\n", " metadata = {\n", " \"audio_filepath\": audio_path,\n", " \"duration\": duration,\n", " \"text\": transcript,\n", " }\n", " json.dump(metadata, fout)\n", " fout.write(\"\\n\")\n", " else:\n", " continue" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "7WBgwjZEWJNJ" }, "source": [ "### Train and Test splits\n", "\n", "In order to test the quality of our model, we need to reserve some data for model testing. We will be evaluating the model performance on this data." ] }, { "cell_type": "code", "metadata": { "id": "ZmldhUNLVchu" }, "source": [ "import json\n", "from sklearn.model_selection import train_test_split\n", "\n", "# Split 10% for testing (500 prompts) and 90% for training (4500 prompts)\n", "trainset, testset = train_test_split(dataset, test_size=0.1, random_state=1)\n", "\n", "# Build the manifests\n", "build_manifest(trainset, \"train_manifest.json\")\n", "build_manifest(testset, \"test_manifest.json\")" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "IORnKt_8txSh" }, "source": [ "## Model Configuration\n" ] }, { "cell_type": "markdown", "metadata": { "id": "-CbdWinn57Yp" }, "source": [ "In this tutorial, we'll describe how to use the QuartzNet15x5 model as a base model for fine-tuning with our data. 
, { "cell_type": "markdown", "metadata": { "id": "IORnKt_8txSh" }, "source": [ "## Model Configuration\n" ] }, { "cell_type": "markdown", "metadata": { "id": "-CbdWinn57Yp" }, "source": [ "In this tutorial, we use the QuartzNet15x5 model as the base model and fine-tune it with our data. We want to improve recognition on our dataset, so we will benchmark performance first on the base model and then on the fine-tuned version.\n", "\n", "Some of the following functions are adapted from the NeMo ASR tutorial, which can be found at [https://github.com/NVIDIA/NeMo](https://github.com/NVIDIA/NeMo)." ] }, { "cell_type": "code", "metadata": { "id": "5xHmfccBt233" }, "source": [ "# Let's import NeMo and the functions for ASR\n", "import torch\n", "import nemo\n", "import nemo.collections.asr as nemo_asr\n", "\n", "import logging\n", "from nemo.utils import _Logger\n", "\n", "# Set the log level used by NeMo\n", "logger = _Logger()\n", "logger.set_verbosity(logging.ERROR)" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "SfMnA-xDSGjD" }, "source": [ "### Training Parameters\n", "\n", "For training, NeMo uses a Python dictionary as the data structure that holds all the parameters. More information about it is available in the [NeMo ASR Config User Guide](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/asr/configs.html).\n", "\n", "For this tutorial, we will load a pre-existing file with the standard ASR configuration and change only the necessary fields." ] }, { "cell_type": "code", "metadata": { "id": "97S1IEW8O1m-" }, "source": [ "## Download the config we'll use in this example\n", "!mkdir configs\n", "!wget -P configs/ https://raw.githubusercontent.com/NVIDIA/NeMo/stable/examples/asr/conf/config.yaml &> /dev/null\n", "\n", "# --- Config Information ---#\n", "from ruamel.yaml import YAML\n", "\n", "config_path = \"./configs/config.yaml\"\n", "\n", "yaml = YAML(typ=\"safe\")\n", "with open(config_path) as f:\n", "    params = yaml.load(f)" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "4Ou9cVK17NCW" }, "source": [ "### The Base Model\n", "\n", "For our ASR model, we will use a pre-trained QuartzNet15x5 model from NVIDIA's NGC cloud. ([List of pre-trained models from NeMo](https://ngc.nvidia.com/catalog/models/nvidia:nemospeechmodels))\n", "\n", "*Description of the pre-trained model*: QuartzNet15x5 model trained on six datasets: LibriSpeech, Mozilla Common Voice (validated clips from en_1488h_2019-12-10), WSJ, Fisher, Switchboard, and NSC Singapore English. It was trained with Apex/Amp optimization level O1 for 600 epochs. The model achieves a WER of 3.79% on LibriSpeech dev-clean, and a WER of 10.05% on dev-other." ] }, { "cell_type": "code", "metadata": { "id": "KCYDvrCe1y68" }, "source": [ "# This line will download the pre-trained QuartzNet15x5 model from NVIDIA's NGC cloud and instantiate it for you\n", "quartznet = nemo_asr.models.EncDecCTCModel.from_pretrained(model_name=\"QuartzNet15x5Base-En\", strict=False)" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "Z3j6Chun037q" }, "source": [ "#### Base Model Performance\n", "\n", "The Word Error Rate (WER) is a valuable measurement tool for comparing different ASR models and for evaluating improvements within one system. To obtain the final results, we assess how the model performs on the testing set." ] }
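, { "cell_type": "markdown", "metadata": {}, "source": [ "As a quick refresher, WER is the word-level edit distance (substitutions + deletions + insertions) between the reference transcript and the model hypothesis, normalised by the number of reference words. NeMo computes this for us through the model's WER helper (used in the cells below); the following is just a minimal illustrative sketch:\n", "\n", "```python\n", "def word_error_rate(reference: str, hypothesis: str) -> float:\n", "    ref, hyp = reference.split(), hypothesis.split()\n", "    # dp[i][j] = edit distance between the first i reference words and the first j hypothesis words\n", "    dp = [[0] * (len(hyp) + 1) for _ in range(len(ref) + 1)]\n", "    for i in range(len(ref) + 1):\n", "        dp[i][0] = i\n", "    for j in range(len(hyp) + 1):\n", "        dp[0][j] = j\n", "    for i in range(1, len(ref) + 1):\n", "        for j in range(1, len(hyp) + 1):\n", "            cost = 0 if ref[i - 1] == hyp[j - 1] else 1\n", "            dp[i][j] = min(dp[i - 1][j] + 1, dp[i][j - 1] + 1, dp[i - 1][j - 1] + cost)\n", "    return dp[len(ref)][len(hyp)] / max(len(ref), 1)\n", "\n", "# 1 deletion + 1 substitution over 5 reference words -> 0.4\n", "print(word_error_rate(\"this is a nemo tutorial\", \"this is nemo tutorials\"))\n", "```" ] }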
] }, { "cell_type": "code", "metadata": { "id": "3WRVPeGC0sS5" }, "source": [ "# Let's configure our model parameters for testing\n", "\n", "# Parameters for training, validation, and testing are specified using the \n", "# train_ds, validation_ds, and test_ds sections of your configuration file\n", "\n", "# Bigger batch-size = bigger throughput\n", "params[\"model\"][\"validation_ds\"][\"batch_size\"] = 8\n", "\n", "# Setup the test data loader and make sure the model is on GPU\n", "params[\"model\"][\"validation_ds\"][\"manifest_filepath\"] = \"test_manifest.json\"\n", "quartznet.setup_test_data(test_data_config=params[\"model\"][\"validation_ds\"])\n", "\n", "# Comment this line if you don't want to use GPU acceleration\n", "_ = quartznet.cuda()" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "Z3sPG8GA6UWC" }, "source": [ "# We will be computing the Word Error Rate (WER) metric between our hypothesis and predictions.\n", "\n", "wer_numerators = []\n", "wer_denominators = []\n", "\n", "# Loop over all test batches.\n", "# Iterating over the model's `test_dataloader` will give us:\n", "# (audio_signal, audio_signal_length, transcript_tokens, transcript_length)\n", "# See the AudioToCharDataset for more details.\n", "with torch.no_grad():\n", " for test_batch in quartznet.test_dataloader():\n", " input_signal, input_signal_length, targets, targets_lengths = [x.cuda() for x in test_batch]\n", " \n", " log_probs, encoded_len, greedy_predictions = quartznet(\n", " input_signal=input_signal, \n", " input_signal_length=input_signal_length\n", " )\n", " # Notice the model has a helper object to compute WER\n", " quartznet._wer.update(greedy_predictions, targets, targets_lengths)\n", " _, wer_numerator, wer_denominator = quartznet._wer.compute()\n", " wer_numerators.append(wer_numerator.detach().cpu().numpy())\n", " wer_denominators.append(wer_denominator.detach().cpu().numpy())" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "_Z4QGXvLDub_", "outputId": "e861e867-b18b-416d-fd97-eb21209f3819" }, "source": [ "# We need to sum all numerators and denominators first. Then divide.\n", "print(f\"WER = {sum(wer_numerators)/sum(wer_denominators)*100:.2f}%\")" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "WER = 39.70%\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "bl_tzqeO7XOO" }, "source": [ "### Model Fine-tuning\n", "\n", "The base model got 39.7% of WER, which is not so good. Let's see if providing some data from the same domain and language dialects can improve our ASR model.\n", "\n", "For simplification, we are going to train for only 1 epoch using DefinedCrowd's data. 
" ] }, { "cell_type": "code", "metadata": { "id": "P1qL8NH97dzV", "colab": { "base_uri": "https://localhost:8080/", "height": 287, "referenced_widgets": [ "9a92e517bd634016ad9ba293436f8080", "a7b8d4270e2c4373be43941e4df13ad3", "1d71646d5d414a34890d38ce64b9f73c", "ca56619da74c4dfaa4df3831d93b2f58", "4bdab014ee6241848b562d74d56cd878", "bbcb1cf8bd394fa3a2b2da8c858a5521", "95e7ab0ae4ea4ce680ded066d204afd5", "6221c0a4048443d69f750d6c4ee8827d", "a61f308ce3044b2bb6bb69f72a59544f", "9d8f022b712d41e3ba32c1c120ed44a7", "6a7d8bb3c4c444fa88f7e6edaa40ce4f", "3dd0c629c3ea4371bbe48cff9829676d", "ff18fa96c2cc42c38fab6692eb427670", "f3a648b6aa3c4f95be08fc0a24e01b27", "219cbf3d1c714971810d796781bd0cd2", "348dda7188ca469e90dccdc3e418eed5" ] }, "outputId": "67209ee3-5161-40dc-a179-83d8219c3d71" }, "source": [ "import pytorch_lightning as pl\n", "from omegaconf import DictConfig\n", "import copy\n", "\n", "# Before training we need to \n", "\n", "# provide the train manifest for training\n", "params[\"model\"][\"train_ds\"][\"manifest_filepath\"] = \"train_manifest.json\"\n", "\n", "# Use the smaller learning rate for fine-tunning\n", "new_opt = copy.deepcopy(params[\"model\"][\"optim\"])\n", "new_opt[\"lr\"] = 0.001\n", "quartznet.setup_optimization(optim_config=DictConfig(new_opt))\n", "\n", "# Batch size will depend on the GPU memory available\n", "params[\"model\"][\"train_ds\"][\"batch_size\"] = 8\n", "\n", "# Point to the data we'll use for fine-tuning as the training set\n", "quartznet.setup_training_data(train_data_config=params[\"model\"][\"train_ds\"])\n", "\n", "# clean torch cache\n", "torch.cuda.empty_cache()\n", "\n", "# And now we can create a PyTorch Lightning trainer.\n", "trainer = pl.Trainer(devices=1, accelerator='gpu', max_epochs=1)\n", "\n", "# And the fit function will start the training\n", "trainer.fit(quartznet)" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "GPU available: True, used: True\n", "TPU available: False, using: 0 TPU cores\n", "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n", "\n", " | Name | Type | Params\n", "------------------------------------------------------------------------\n", "0 | preprocessor | AudioToMelSpectrogramPreprocessor | 0 \n", "1 | encoder | ConvASREncoder | 18.9 M\n", "2 | decoder | ConvASRDecoder | 29.7 K\n", "3 | loss | CTCLoss | 0 \n", "4 | spec_augmentation | SpectrogramAugmentation | 0 \n", "5 | _wer | WER | 0 \n" ], "name": "stderr" }, { "output_type": "display_data", "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "9a92e517bd634016ad9ba293436f8080", "version_minor": 0, "version_major": 2 }, "text/plain": [ "HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…" ] }, "metadata": { "tags": [] } }, { "output_type": "stream", "text": [ "\r" ], "name": "stdout" }, { "output_type": "display_data", "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "a61f308ce3044b2bb6bb69f72a59544f", "version_minor": 0, "version_major": 2 }, "text/plain": [ "HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…" ] }, "metadata": { "tags": [] } }, { "output_type": "stream", "text": [ "\n" ], "name": "stdout" }, { "output_type": "execute_result", "data": { "text/plain": [ "1" ] }, "metadata": { "tags": [] }, "execution_count": 31 } ] }, { "cell_type": "markdown", "metadata": { "id": "ZKKNeuH8LrLv" }, "source": [ "#### Fine-tunned model Performance\n", "\n", "Let's compare the final model 
, { "cell_type": "markdown", "metadata": { "id": "ZKKNeuH8LrLv" }, "source": [ "#### Fine-tuned Model Performance\n", "\n", "Let's now measure the performance of the fine-tuned model we got from training on the additional data and compare it with the base model." ] }, { "cell_type": "code", "metadata": { "id": "WuSccDRmx9oM" }, "source": [ "# Let's configure our model parameters for testing\n", "params[\"model\"][\"validation_ds\"][\"batch_size\"] = 8\n", "\n", "# Set up the test data loader and make sure the model is on GPU\n", "params[\"model\"][\"validation_ds\"][\"manifest_filepath\"] = \"test_manifest.json\"\n", "quartznet.setup_test_data(test_data_config=params[\"model\"][\"validation_ds\"])\n", "_ = quartznet.cuda()" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "dN8Blhwagy5V" }, "source": [ "# We will compute the Word Error Rate (WER) metric between our hypotheses and the reference transcripts.\n", "\n", "wer_numerators = []\n", "wer_denominators = []\n", "\n", "# Loop over all test batches.\n", "# Iterating over the model's `test_dataloader` will give us:\n", "# (audio_signal, audio_signal_length, transcript_tokens, transcript_length)\n", "# See the AudioToCharDataset for more details.\n", "with torch.no_grad():\n", "    for test_batch in quartznet.test_dataloader():\n", "        input_signal, input_signal_length, targets, targets_lengths = [x.cuda() for x in test_batch]\n", "\n", "        log_probs, encoded_len, greedy_predictions = quartznet(\n", "            input_signal=input_signal,\n", "            input_signal_length=input_signal_length\n", "        )\n", "        # Notice the model has a helper object to compute WER\n", "        quartznet._wer.update(greedy_predictions, targets, targets_lengths)\n", "        _, wer_numerator, wer_denominator = quartznet._wer.compute()\n", "        wer_numerators.append(wer_numerator.detach().cpu().numpy())\n", "        wer_denominators.append(wer_denominator.detach().cpu().numpy())" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "eW0L8UqnUtw_", "outputId": "7f3de1c0-03bb-4d62-982e-0b3e0a8f1393" }, "source": [ "# We need to sum all numerators and denominators first. Then divide.\n", "print(f\"WER = {sum(wer_numerators)/sum(wer_denominators)*100:.2f}%\")" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "WER = 24.36%\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "es1N6BIM8QQD" }, "source": [ "After fine-tuning the ASR model for a single epoch, we got a Word Error Rate (WER) of 24.36%, a clear improvement over the initial 39.7% of the base model. For better results, consider training for more epochs." ] }, { "cell_type": "markdown", "metadata": { "id": "S61sV8JI8TgP" }, "source": [ "# Conclusion\n", "\n", "In this tutorial, we demonstrated how to load speech data collected by DefinedCrowd and how to use it to train and measure the performance of an automatic speech recognition (ASR) model." ] } ] }