Spaces:
Runtime error
Runtime error
Commit
·
7c60390
1
Parent(s):
9c145c1
fixed cartesia audio concat/conversion
Browse files
app.ipynb
CHANGED
@@ -2,7 +2,7 @@
|
|
2 |
"cells": [
|
3 |
{
|
4 |
"cell_type": "code",
|
5 |
-
"execution_count":
|
6 |
"id": "3bedf0dc-8d8e-4ede-a9e6-b8f35136aa00",
|
7 |
"metadata": {},
|
8 |
"outputs": [],
|
@@ -42,7 +42,7 @@
|
|
42 |
},
|
43 |
{
|
44 |
"cell_type": "code",
|
45 |
-
"execution_count":
|
46 |
"id": "667802a7-0f36-4136-a381-e66210b20462",
|
47 |
"metadata": {},
|
48 |
"outputs": [
|
@@ -94,7 +94,7 @@
|
|
94 |
},
|
95 |
{
|
96 |
"cell_type": "code",
|
97 |
-
"execution_count":
|
98 |
"id": "7664bc24-e8a7-440d-851d-eb16dc2d69fb",
|
99 |
"metadata": {},
|
100 |
"outputs": [
|
@@ -128,7 +128,7 @@
|
|
128 |
},
|
129 |
{
|
130 |
"cell_type": "code",
|
131 |
-
"execution_count":
|
132 |
"id": "4d9863fc-969e-409b-8e20-b9c3cd2cc3e7",
|
133 |
"metadata": {},
|
134 |
"outputs": [],
|
@@ -142,7 +142,7 @@
|
|
142 |
},
|
143 |
{
|
144 |
"cell_type": "code",
|
145 |
-
"execution_count":
|
146 |
"id": "4f486d3a",
|
147 |
"metadata": {},
|
148 |
"outputs": [],
|
@@ -187,7 +187,7 @@
|
|
187 |
},
|
188 |
{
|
189 |
"cell_type": "code",
|
190 |
-
"execution_count":
|
191 |
"id": "ecb7f207-0fc2-4d19-a313-356c05776832",
|
192 |
"metadata": {},
|
193 |
"outputs": [
|
@@ -208,7 +208,7 @@
|
|
208 |
},
|
209 |
{
|
210 |
"cell_type": "code",
|
211 |
-
"execution_count":
|
212 |
"id": "e5d6cac2-0dee-42d8-9b41-184b5be9cc3f",
|
213 |
"metadata": {},
|
214 |
"outputs": [],
|
@@ -219,7 +219,7 @@
|
|
219 |
},
|
220 |
{
|
221 |
"cell_type": "code",
|
222 |
-
"execution_count":
|
223 |
"id": "b77ad8d6-3289-463c-b213-1c0cc215b141",
|
224 |
"metadata": {},
|
225 |
"outputs": [
|
@@ -248,7 +248,7 @@
|
|
248 |
},
|
249 |
{
|
250 |
"cell_type": "code",
|
251 |
-
"execution_count":
|
252 |
"id": "87fca48b-a16a-4d2b-919c-75e88e4e5eb5",
|
253 |
"metadata": {},
|
254 |
"outputs": [
|
@@ -316,7 +316,7 @@
|
|
316 |
},
|
317 |
{
|
318 |
"cell_type": "code",
|
319 |
-
"execution_count":
|
320 |
"id": "8eb7e7d5-7121-4762-b8d1-e5a9539e2b36",
|
321 |
"metadata": {},
|
322 |
"outputs": [],
|
@@ -327,7 +327,7 @@
|
|
327 |
},
|
328 |
{
|
329 |
"cell_type": "code",
|
330 |
-
"execution_count":
|
331 |
"id": "52d373be-3a79-412e-8ca2-92bb443fa52d",
|
332 |
"metadata": {},
|
333 |
"outputs": [],
|
@@ -352,7 +352,7 @@
|
|
352 |
},
|
353 |
{
|
354 |
"cell_type": "code",
|
355 |
-
"execution_count":
|
356 |
"id": "b5b29507-92bc-453d-bcc5-6402c17e9a0d",
|
357 |
"metadata": {},
|
358 |
"outputs": [],
|
@@ -372,7 +372,7 @@
|
|
372 |
},
|
373 |
{
|
374 |
"cell_type": "code",
|
375 |
-
"execution_count":
|
376 |
"id": "24674094-4d47-4e48-b591-55faabcff8df",
|
377 |
"metadata": {},
|
378 |
"outputs": [],
|
@@ -413,26 +413,22 @@
|
|
413 |
},
|
414 |
{
|
415 |
"cell_type": "code",
|
416 |
-
"execution_count":
|
417 |
"id": "e6224ae5-3792-42b2-8392-3abd42998a50",
|
418 |
"metadata": {},
|
419 |
"outputs": [],
|
420 |
"source": [
|
421 |
"#| export\n",
|
422 |
-
"def
|
423 |
"\n",
|
424 |
" # Initialize an empty AudioSegment object for concatenation\n",
|
425 |
" combined = AudioSegment.empty()\n",
|
426 |
-
" \n",
|
427 |
-
" # Write out audio file responses as individual files for debugging\n",
|
428 |
-
" # for idx, mp3_data in enumerate(mp3_files):\n",
|
429 |
-
" # with open(f'./{idx}.mp3', 'wb') as f:\n",
|
430 |
-
" # f.write(mp3_data)\n",
|
431 |
"\n",
|
432 |
" # Loop through the list of mp3 binary data\n",
|
433 |
-
" for
|
434 |
" # Convert binary data to an audio segment\n",
|
435 |
-
" audio_segment = AudioSegment.from_file(io.BytesIO(
|
|
|
436 |
" # Concatenate this segment to the combined segment\n",
|
437 |
" combined += audio_segment\n",
|
438 |
"\n",
|
@@ -456,7 +452,7 @@
|
|
456 |
},
|
457 |
{
|
458 |
"cell_type": "code",
|
459 |
-
"execution_count":
|
460 |
"id": "4691703d-ed0f-4481-8006-b2906289b780",
|
461 |
"metadata": {},
|
462 |
"outputs": [],
|
@@ -508,7 +504,7 @@
|
|
508 |
},
|
509 |
{
|
510 |
"cell_type": "code",
|
511 |
-
"execution_count":
|
512 |
"id": "3420c868-71cb-4ac6-ac65-6f02bfd841d1",
|
513 |
"metadata": {},
|
514 |
"outputs": [],
|
@@ -516,19 +512,36 @@
|
|
516 |
"#| export\n",
|
517 |
"def create_speech_cartesiaai(chunk_idx, input, model='upbeat-moon', \n",
|
518 |
" voice='248be419-c632-4f23-adf1-5324ed7dbf1d', #Hannah\n",
|
519 |
-
" websocket=False,
|
|
|
|
|
520 |
" client = cartesia.tts.CartesiaTTS()\n",
|
521 |
" \n",
|
522 |
-
" @retry(wait=wait_random_exponential(min=1, max=180), stop=stop_after_attempt(6))\n",
|
523 |
" def _create_speech_with_backoff(**kwargs):\n",
|
524 |
" return client.generate(**kwargs)\n",
|
525 |
" \n",
|
526 |
-
" response = _create_speech_with_backoff(transcript=input, model_id=model,
|
527 |
-
"
|
|
|
|
|
|
|
528 |
" client.close()\n",
|
529 |
" return chunk_idx, response[\"audio\"]"
|
530 |
]
|
531 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
532 |
{
|
533 |
"cell_type": "code",
|
534 |
"execution_count": 17,
|
@@ -537,23 +550,26 @@
|
|
537 |
"outputs": [],
|
538 |
"source": [
|
539 |
"#| export\n",
|
540 |
-
"def create_speech(input_text, provider, model='tts-1', voice='alloy',
|
|
|
|
|
541 |
"\n",
|
542 |
" #Verify auth if it is required. This is very important if this is in a HF space. DO NOT DELETE!!!\n",
|
543 |
-
" verify_authorization(profile)\n",
|
544 |
" start = datetime.now()\n",
|
545 |
-
"\n",
|
546 |
" \n",
|
547 |
" if provider == 'cartesiaai':\n",
|
548 |
" create_speech_func = create_speech_cartesiaai\n",
|
549 |
" max_chunk_size = 500\n",
|
550 |
" chunk_processing_time = 20\n",
|
551 |
" threads = CARTESIAAI_CLIENT_TTS_THREADS\n",
|
|
|
552 |
" elif provider == 'openai':\n",
|
553 |
" create_speech_func = create_speech_openai\n",
|
554 |
" max_chunk_size = 4000\n",
|
555 |
" chunk_processing_time = 60\n",
|
556 |
" threads = OPENAI_CLIENT_TTS_THREADS\n",
|
|
|
557 |
" else:\n",
|
558 |
" raise ValueError(f'Invalid argument provider: {provider}')\n",
|
559 |
" \n",
|
@@ -578,7 +594,7 @@
|
|
578 |
" progress(.9, desc=f\"Merging audio chunks... {(datetime.now()-start).seconds} seconds to process.\")\n",
|
579 |
" \n",
|
580 |
" # Concatenate the audio data from all chunks\n",
|
581 |
-
" combined_audio =
|
582 |
"\n",
|
583 |
" # Final update to the progress bar\n",
|
584 |
" progress(1, desc=f\"Processing completed... {(datetime.now()-start).seconds} seconds to process.\")\n",
|
@@ -590,17 +606,19 @@
|
|
590 |
},
|
591 |
{
|
592 |
"cell_type": "code",
|
593 |
-
"execution_count":
|
594 |
"id": "ca2c6f8c-62ed-4ac1-9c2f-e3b2bfb47e8d",
|
595 |
"metadata": {},
|
596 |
"outputs": [],
|
597 |
"source": [
|
598 |
-
"# create_speech(\"Hi. What's your name?\", provider='openai', model='tts-1', voice='alloy')"
|
|
|
|
|
599 |
]
|
600 |
},
|
601 |
{
|
602 |
"cell_type": "code",
|
603 |
-
"execution_count":
|
604 |
"id": "236dd8d3-4364-4731-af93-7dcdec6f18a1",
|
605 |
"metadata": {},
|
606 |
"outputs": [],
|
@@ -612,7 +630,7 @@
|
|
612 |
},
|
613 |
{
|
614 |
"cell_type": "code",
|
615 |
-
"execution_count":
|
616 |
"id": "0523a158-ee07-48b3-9350-ee39d4deee7f",
|
617 |
"metadata": {},
|
618 |
"outputs": [],
|
@@ -634,7 +652,7 @@
|
|
634 |
},
|
635 |
{
|
636 |
"cell_type": "code",
|
637 |
-
"execution_count":
|
638 |
"id": "f4d1ba0b-6960-4e22-8dba-7de70370753a",
|
639 |
"metadata": {},
|
640 |
"outputs": [],
|
@@ -646,7 +664,7 @@
|
|
646 |
},
|
647 |
{
|
648 |
"cell_type": "code",
|
649 |
-
"execution_count":
|
650 |
"id": "efa28cf2-548d-439f-bf2a-21a5edbf9eba",
|
651 |
"metadata": {},
|
652 |
"outputs": [],
|
@@ -654,12 +672,12 @@
|
|
654 |
"#| export\n",
|
655 |
"def update_model_choices(provider):\n",
|
656 |
" choices = get_model_choices(provider)\n",
|
657 |
-
" return gr.update(choices=choices,value=choices[0])"
|
658 |
]
|
659 |
},
|
660 |
{
|
661 |
"cell_type": "code",
|
662 |
-
"execution_count":
|
663 |
"id": "cdc1dde5-5edd-4dbf-bd11-30eb418c571d",
|
664 |
"metadata": {},
|
665 |
"outputs": [],
|
@@ -671,7 +689,7 @@
|
|
671 |
},
|
672 |
{
|
673 |
"cell_type": "code",
|
674 |
-
"execution_count":
|
675 |
"id": "035c33dd-c8e6-42b4-91d4-6bc5f1b36df3",
|
676 |
"metadata": {},
|
677 |
"outputs": [],
|
@@ -679,12 +697,12 @@
|
|
679 |
"#| export\n",
|
680 |
"def update_voice_choices(provider, model):\n",
|
681 |
" choices = get_voice_choices(provider, model)\n",
|
682 |
-
" return gr.update(choices=choices,value=choices[0])"
|
683 |
]
|
684 |
},
|
685 |
{
|
686 |
"cell_type": "code",
|
687 |
-
"execution_count":
|
688 |
"id": "e4fb3159-579b-4271-bc96-4cd1e2816eca",
|
689 |
"metadata": {},
|
690 |
"outputs": [],
|
@@ -739,7 +757,7 @@
|
|
739 |
},
|
740 |
{
|
741 |
"cell_type": "code",
|
742 |
-
"execution_count":
|
743 |
"id": "a00648a1-891b-470b-9959-f5d502055713",
|
744 |
"metadata": {},
|
745 |
"outputs": [],
|
@@ -753,7 +771,7 @@
|
|
753 |
},
|
754 |
{
|
755 |
"cell_type": "code",
|
756 |
-
"execution_count":
|
757 |
"id": "4b534fe7-4337-423e-846a-1bdb7cccc4ea",
|
758 |
"metadata": {},
|
759 |
"outputs": [
|
@@ -761,7 +779,7 @@
|
|
761 |
"name": "stdout",
|
762 |
"output_type": "stream",
|
763 |
"text": [
|
764 |
-
"Running on local URL: http://127.0.0.1:
|
765 |
"\n",
|
766 |
"To create a public link, set `share=True` in `launch()`.\n"
|
767 |
]
|
@@ -769,7 +787,7 @@
|
|
769 |
{
|
770 |
"data": {
|
771 |
"text/html": [
|
772 |
-
"<div><iframe src=\"http://127.0.0.1:
|
773 |
],
|
774 |
"text/plain": [
|
775 |
"<IPython.core.display.HTML object>"
|
@@ -782,59 +800,9 @@
|
|
782 |
"data": {
|
783 |
"text/plain": []
|
784 |
},
|
785 |
-
"execution_count":
|
786 |
"metadata": {},
|
787 |
"output_type": "execute_result"
|
788 |
-
},
|
789 |
-
{
|
790 |
-
"name": "stderr",
|
791 |
-
"output_type": "stream",
|
792 |
-
"text": [
|
793 |
-
"/Users/mathewmiller/anaconda3/envs/gradio1/lib/python3.11/site-packages/gradio/components/dropdown.py:181: UserWarning: The value passed into gr.Dropdown() is not in the list of choices. Please update the list of choices to include: $0.000 or set allow_custom_value=True.\n",
|
794 |
-
" warnings.warn(\n",
|
795 |
-
"Traceback (most recent call last):\n",
|
796 |
-
" File \"/Users/mathewmiller/anaconda3/envs/gradio1/lib/python3.11/site-packages/gradio/queueing.py\", line 532, in process_events\n",
|
797 |
-
" response = await route_utils.call_process_api(\n",
|
798 |
-
" ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
|
799 |
-
" File \"/Users/mathewmiller/anaconda3/envs/gradio1/lib/python3.11/site-packages/gradio/route_utils.py\", line 276, in call_process_api\n",
|
800 |
-
" output = await app.get_blocks().process_api(\n",
|
801 |
-
" ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
|
802 |
-
" File \"/Users/mathewmiller/anaconda3/envs/gradio1/lib/python3.11/site-packages/gradio/blocks.py\", line 1928, in process_api\n",
|
803 |
-
" result = await self.call_function(\n",
|
804 |
-
" ^^^^^^^^^^^^^^^^^^^^^^^^^\n",
|
805 |
-
" File \"/Users/mathewmiller/anaconda3/envs/gradio1/lib/python3.11/site-packages/gradio/blocks.py\", line 1500, in call_function\n",
|
806 |
-
" processed_input, progress_index, _ = special_args(\n",
|
807 |
-
" ^^^^^^^^^^^^^\n",
|
808 |
-
" File \"/Users/mathewmiller/anaconda3/envs/gradio1/lib/python3.11/site-packages/gradio/helpers.py\", line 891, in special_args\n",
|
809 |
-
" getattr(request, \"session\", {})\n",
|
810 |
-
" File \"/Users/mathewmiller/anaconda3/envs/gradio1/lib/python3.11/site-packages/gradio/route_utils.py\", line 158, in __getattr__\n",
|
811 |
-
" return self.dict_to_obj(getattr(self.request, name))\n",
|
812 |
-
" ^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
|
813 |
-
" File \"/Users/mathewmiller/anaconda3/envs/gradio1/lib/python3.11/site-packages/starlette/requests.py\", line 157, in session\n",
|
814 |
-
" \"session\" in self.scope\n",
|
815 |
-
"AssertionError: SessionMiddleware must be installed to access request.session\n",
|
816 |
-
"Traceback (most recent call last):\n",
|
817 |
-
" File \"/Users/mathewmiller/anaconda3/envs/gradio1/lib/python3.11/site-packages/gradio/queueing.py\", line 532, in process_events\n",
|
818 |
-
" response = await route_utils.call_process_api(\n",
|
819 |
-
" ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
|
820 |
-
" File \"/Users/mathewmiller/anaconda3/envs/gradio1/lib/python3.11/site-packages/gradio/route_utils.py\", line 276, in call_process_api\n",
|
821 |
-
" output = await app.get_blocks().process_api(\n",
|
822 |
-
" ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
|
823 |
-
" File \"/Users/mathewmiller/anaconda3/envs/gradio1/lib/python3.11/site-packages/gradio/blocks.py\", line 1928, in process_api\n",
|
824 |
-
" result = await self.call_function(\n",
|
825 |
-
" ^^^^^^^^^^^^^^^^^^^^^^^^^\n",
|
826 |
-
" File \"/Users/mathewmiller/anaconda3/envs/gradio1/lib/python3.11/site-packages/gradio/blocks.py\", line 1500, in call_function\n",
|
827 |
-
" processed_input, progress_index, _ = special_args(\n",
|
828 |
-
" ^^^^^^^^^^^^^\n",
|
829 |
-
" File \"/Users/mathewmiller/anaconda3/envs/gradio1/lib/python3.11/site-packages/gradio/helpers.py\", line 891, in special_args\n",
|
830 |
-
" getattr(request, \"session\", {})\n",
|
831 |
-
" File \"/Users/mathewmiller/anaconda3/envs/gradio1/lib/python3.11/site-packages/gradio/route_utils.py\", line 158, in __getattr__\n",
|
832 |
-
" return self.dict_to_obj(getattr(self.request, name))\n",
|
833 |
-
" ^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
|
834 |
-
" File \"/Users/mathewmiller/anaconda3/envs/gradio1/lib/python3.11/site-packages/starlette/requests.py\", line 157, in session\n",
|
835 |
-
" \"session\" in self.scope\n",
|
836 |
-
"AssertionError: SessionMiddleware must be installed to access request.session\n"
|
837 |
-
]
|
838 |
}
|
839 |
],
|
840 |
"source": [
|
@@ -860,7 +828,7 @@
|
|
860 |
},
|
861 |
{
|
862 |
"cell_type": "code",
|
863 |
-
"execution_count":
|
864 |
"id": "28e8d888-e790-46fa-bbac-4511b9ab796c",
|
865 |
"metadata": {},
|
866 |
"outputs": [
|
@@ -868,7 +836,7 @@
|
|
868 |
"name": "stdout",
|
869 |
"output_type": "stream",
|
870 |
"text": [
|
871 |
-
"Closing server running on port:
|
872 |
]
|
873 |
}
|
874 |
],
|
@@ -879,7 +847,7 @@
|
|
879 |
},
|
880 |
{
|
881 |
"cell_type": "code",
|
882 |
-
"execution_count":
|
883 |
"id": "afbc9699-4d16-4060-88f4-cd1251754cbd",
|
884 |
"metadata": {},
|
885 |
"outputs": [],
|
@@ -890,12 +858,13 @@
|
|
890 |
},
|
891 |
{
|
892 |
"cell_type": "code",
|
893 |
-
"execution_count":
|
894 |
"id": "0420310d-930b-4904-8bd4-3458ad8bdbd3",
|
895 |
"metadata": {},
|
896 |
"outputs": [],
|
897 |
"source": [
|
898 |
"#| hide\n",
|
|
|
899 |
"nbdev.export.nb_export('app.ipynb',lib_path='.')"
|
900 |
]
|
901 |
},
|
|
|
2 |
"cells": [
|
3 |
{
|
4 |
"cell_type": "code",
|
5 |
+
"execution_count": 30,
|
6 |
"id": "3bedf0dc-8d8e-4ede-a9e6-b8f35136aa00",
|
7 |
"metadata": {},
|
8 |
"outputs": [],
|
|
|
42 |
},
|
43 |
{
|
44 |
"cell_type": "code",
|
45 |
+
"execution_count": 1,
|
46 |
"id": "667802a7-0f36-4136-a381-e66210b20462",
|
47 |
"metadata": {},
|
48 |
"outputs": [
|
|
|
94 |
},
|
95 |
{
|
96 |
"cell_type": "code",
|
97 |
+
"execution_count": 2,
|
98 |
"id": "7664bc24-e8a7-440d-851d-eb16dc2d69fb",
|
99 |
"metadata": {},
|
100 |
"outputs": [
|
|
|
128 |
},
|
129 |
{
|
130 |
"cell_type": "code",
|
131 |
+
"execution_count": 3,
|
132 |
"id": "4d9863fc-969e-409b-8e20-b9c3cd2cc3e7",
|
133 |
"metadata": {},
|
134 |
"outputs": [],
|
|
|
142 |
},
|
143 |
{
|
144 |
"cell_type": "code",
|
145 |
+
"execution_count": 4,
|
146 |
"id": "4f486d3a",
|
147 |
"metadata": {},
|
148 |
"outputs": [],
|
|
|
187 |
},
|
188 |
{
|
189 |
"cell_type": "code",
|
190 |
+
"execution_count": 5,
|
191 |
"id": "ecb7f207-0fc2-4d19-a313-356c05776832",
|
192 |
"metadata": {},
|
193 |
"outputs": [
|
|
|
208 |
},
|
209 |
{
|
210 |
"cell_type": "code",
|
211 |
+
"execution_count": 6,
|
212 |
"id": "e5d6cac2-0dee-42d8-9b41-184b5be9cc3f",
|
213 |
"metadata": {},
|
214 |
"outputs": [],
|
|
|
219 |
},
|
220 |
{
|
221 |
"cell_type": "code",
|
222 |
+
"execution_count": 7,
|
223 |
"id": "b77ad8d6-3289-463c-b213-1c0cc215b141",
|
224 |
"metadata": {},
|
225 |
"outputs": [
|
|
|
248 |
},
|
249 |
{
|
250 |
"cell_type": "code",
|
251 |
+
"execution_count": 8,
|
252 |
"id": "87fca48b-a16a-4d2b-919c-75e88e4e5eb5",
|
253 |
"metadata": {},
|
254 |
"outputs": [
|
|
|
316 |
},
|
317 |
{
|
318 |
"cell_type": "code",
|
319 |
+
"execution_count": 9,
|
320 |
"id": "8eb7e7d5-7121-4762-b8d1-e5a9539e2b36",
|
321 |
"metadata": {},
|
322 |
"outputs": [],
|
|
|
327 |
},
|
328 |
{
|
329 |
"cell_type": "code",
|
330 |
+
"execution_count": 10,
|
331 |
"id": "52d373be-3a79-412e-8ca2-92bb443fa52d",
|
332 |
"metadata": {},
|
333 |
"outputs": [],
|
|
|
352 |
},
|
353 |
{
|
354 |
"cell_type": "code",
|
355 |
+
"execution_count": 11,
|
356 |
"id": "b5b29507-92bc-453d-bcc5-6402c17e9a0d",
|
357 |
"metadata": {},
|
358 |
"outputs": [],
|
|
|
372 |
},
|
373 |
{
|
374 |
"cell_type": "code",
|
375 |
+
"execution_count": 12,
|
376 |
"id": "24674094-4d47-4e48-b591-55faabcff8df",
|
377 |
"metadata": {},
|
378 |
"outputs": [],
|
|
|
413 |
},
|
414 |
{
|
415 |
"cell_type": "code",
|
416 |
+
"execution_count": 13,
|
417 |
"id": "e6224ae5-3792-42b2-8392-3abd42998a50",
|
418 |
"metadata": {},
|
419 |
"outputs": [],
|
420 |
"source": [
|
421 |
"#| export\n",
|
422 |
+
"def concatenate_audio(files:list, **kwargs):\n",
|
423 |
"\n",
|
424 |
" # Initialize an empty AudioSegment object for concatenation\n",
|
425 |
" combined = AudioSegment.empty()\n",
|
|
|
|
|
|
|
|
|
|
|
426 |
"\n",
|
427 |
" # Loop through the list of mp3 binary data\n",
|
428 |
+
" for data in files:\n",
|
429 |
" # Convert binary data to an audio segment\n",
|
430 |
+
" audio_segment = AudioSegment.from_file(io.BytesIO(data), **kwargs)\n",
|
431 |
+
" \n",
|
432 |
" # Concatenate this segment to the combined segment\n",
|
433 |
" combined += audio_segment\n",
|
434 |
"\n",
|
|
|
452 |
},
|
453 |
{
|
454 |
"cell_type": "code",
|
455 |
+
"execution_count": 14,
|
456 |
"id": "4691703d-ed0f-4481-8006-b2906289b780",
|
457 |
"metadata": {},
|
458 |
"outputs": [],
|
|
|
504 |
},
|
505 |
{
|
506 |
"cell_type": "code",
|
507 |
+
"execution_count": 15,
|
508 |
"id": "3420c868-71cb-4ac6-ac65-6f02bfd841d1",
|
509 |
"metadata": {},
|
510 |
"outputs": [],
|
|
|
512 |
"#| export\n",
|
513 |
"def create_speech_cartesiaai(chunk_idx, input, model='upbeat-moon', \n",
|
514 |
" voice='248be419-c632-4f23-adf1-5324ed7dbf1d', #Hannah\n",
|
515 |
+
" websocket=False, \n",
|
516 |
+
" output_format='pcm_44100', \n",
|
517 |
+
" **kwargs):\n",
|
518 |
" client = cartesia.tts.CartesiaTTS()\n",
|
519 |
" \n",
|
520 |
+
" # @retry(wait=wait_random_exponential(min=1, max=180), stop=stop_after_attempt(6))\n",
|
521 |
" def _create_speech_with_backoff(**kwargs):\n",
|
522 |
" return client.generate(**kwargs)\n",
|
523 |
" \n",
|
524 |
+
" response = _create_speech_with_backoff(transcript=input, model_id=model, \n",
|
525 |
+
" voice=client.get_voice_embedding(voice_id=voice), \n",
|
526 |
+
" websocket=websocket, \n",
|
527 |
+
" output_format=output_format, \n",
|
528 |
+
" **kwargs)\n",
|
529 |
" client.close()\n",
|
530 |
" return chunk_idx, response[\"audio\"]"
|
531 |
]
|
532 |
},
|
533 |
+
{
|
534 |
+
"cell_type": "code",
|
535 |
+
"execution_count": 16,
|
536 |
+
"id": "d0082383-9d03-4b25-b68a-080d0b28caa9",
|
537 |
+
"metadata": {},
|
538 |
+
"outputs": [],
|
539 |
+
"source": [
|
540 |
+
"# test\n",
|
541 |
+
"# create_speech_cartesiaai(1,\"Hi. What's your name?\", model='upbeat-moon',\n",
|
542 |
+
"# voice='63ff761f-c1e8-414b-b969-d1833d1c870c')"
|
543 |
+
]
|
544 |
+
},
|
545 |
{
|
546 |
"cell_type": "code",
|
547 |
"execution_count": 17,
|
|
|
550 |
"outputs": [],
|
551 |
"source": [
|
552 |
"#| export\n",
|
553 |
+
"def create_speech(input_text, provider, model='tts-1', voice='alloy', \n",
|
554 |
+
" # profile: gr.OAuthProfile|None=None, \n",
|
555 |
+
" progress=gr.Progress(), **kwargs):\n",
|
556 |
"\n",
|
557 |
" #Verify auth if it is required. This is very important if this is in a HF space. DO NOT DELETE!!!\n",
|
558 |
+
" if REQUIRE_AUTH: verify_authorization(profile)\n",
|
559 |
" start = datetime.now()\n",
|
|
|
560 |
" \n",
|
561 |
" if provider == 'cartesiaai':\n",
|
562 |
" create_speech_func = create_speech_cartesiaai\n",
|
563 |
" max_chunk_size = 500\n",
|
564 |
" chunk_processing_time = 20\n",
|
565 |
" threads = CARTESIAAI_CLIENT_TTS_THREADS\n",
|
566 |
+
" audio_file_conversion_kwargs = {'format': 'raw', 'frame_rate': 44100, 'channels': 1, 'sample_width': 2}\n",
|
567 |
" elif provider == 'openai':\n",
|
568 |
" create_speech_func = create_speech_openai\n",
|
569 |
" max_chunk_size = 4000\n",
|
570 |
" chunk_processing_time = 60\n",
|
571 |
" threads = OPENAI_CLIENT_TTS_THREADS\n",
|
572 |
+
" audio_file_conversion_kwargs = {'format': 'mp3'}\n",
|
573 |
" else:\n",
|
574 |
" raise ValueError(f'Invalid argument provider: {provider}')\n",
|
575 |
" \n",
|
|
|
594 |
" progress(.9, desc=f\"Merging audio chunks... {(datetime.now()-start).seconds} seconds to process.\")\n",
|
595 |
" \n",
|
596 |
" # Concatenate the audio data from all chunks\n",
|
597 |
+
" combined_audio = concatenate_audio(audio_data, **audio_file_conversion_kwargs)\n",
|
598 |
"\n",
|
599 |
" # Final update to the progress bar\n",
|
600 |
" progress(1, desc=f\"Processing completed... {(datetime.now()-start).seconds} seconds to process.\")\n",
|
|
|
606 |
},
|
607 |
{
|
608 |
"cell_type": "code",
|
609 |
+
"execution_count": 19,
|
610 |
"id": "ca2c6f8c-62ed-4ac1-9c2f-e3b2bfb47e8d",
|
611 |
"metadata": {},
|
612 |
"outputs": [],
|
613 |
"source": [
|
614 |
+
"# create_speech(\"Hi. What's your name?\", provider='openai', model='tts-1', voice='alloy')\n",
|
615 |
+
"# create_speech(\"Hi. What's your name?\", provider='cartesiaai', model='upbeat-moon',\n",
|
616 |
+
"# voice='63ff761f-c1e8-414b-b969-d1833d1c870c')"
|
617 |
]
|
618 |
},
|
619 |
{
|
620 |
"cell_type": "code",
|
621 |
+
"execution_count": 20,
|
622 |
"id": "236dd8d3-4364-4731-af93-7dcdec6f18a1",
|
623 |
"metadata": {},
|
624 |
"outputs": [],
|
|
|
630 |
},
|
631 |
{
|
632 |
"cell_type": "code",
|
633 |
+
"execution_count": 21,
|
634 |
"id": "0523a158-ee07-48b3-9350-ee39d4deee7f",
|
635 |
"metadata": {},
|
636 |
"outputs": [],
|
|
|
652 |
},
|
653 |
{
|
654 |
"cell_type": "code",
|
655 |
+
"execution_count": 22,
|
656 |
"id": "f4d1ba0b-6960-4e22-8dba-7de70370753a",
|
657 |
"metadata": {},
|
658 |
"outputs": [],
|
|
|
664 |
},
|
665 |
{
|
666 |
"cell_type": "code",
|
667 |
+
"execution_count": 23,
|
668 |
"id": "efa28cf2-548d-439f-bf2a-21a5edbf9eba",
|
669 |
"metadata": {},
|
670 |
"outputs": [],
|
|
|
672 |
"#| export\n",
|
673 |
"def update_model_choices(provider):\n",
|
674 |
" choices = get_model_choices(provider)\n",
|
675 |
+
" return gr.update(choices=choices,value=choices[0][1])"
|
676 |
]
|
677 |
},
|
678 |
{
|
679 |
"cell_type": "code",
|
680 |
+
"execution_count": 24,
|
681 |
"id": "cdc1dde5-5edd-4dbf-bd11-30eb418c571d",
|
682 |
"metadata": {},
|
683 |
"outputs": [],
|
|
|
689 |
},
|
690 |
{
|
691 |
"cell_type": "code",
|
692 |
+
"execution_count": 25,
|
693 |
"id": "035c33dd-c8e6-42b4-91d4-6bc5f1b36df3",
|
694 |
"metadata": {},
|
695 |
"outputs": [],
|
|
|
697 |
"#| export\n",
|
698 |
"def update_voice_choices(provider, model):\n",
|
699 |
" choices = get_voice_choices(provider, model)\n",
|
700 |
+
" return gr.update(choices=choices,value=choices[0][1])"
|
701 |
]
|
702 |
},
|
703 |
{
|
704 |
"cell_type": "code",
|
705 |
+
"execution_count": 26,
|
706 |
"id": "e4fb3159-579b-4271-bc96-4cd1e2816eca",
|
707 |
"metadata": {},
|
708 |
"outputs": [],
|
|
|
757 |
},
|
758 |
{
|
759 |
"cell_type": "code",
|
760 |
+
"execution_count": 27,
|
761 |
"id": "a00648a1-891b-470b-9959-f5d502055713",
|
762 |
"metadata": {},
|
763 |
"outputs": [],
|
|
|
771 |
},
|
772 |
{
|
773 |
"cell_type": "code",
|
774 |
+
"execution_count": 28,
|
775 |
"id": "4b534fe7-4337-423e-846a-1bdb7cccc4ea",
|
776 |
"metadata": {},
|
777 |
"outputs": [
|
|
|
779 |
"name": "stdout",
|
780 |
"output_type": "stream",
|
781 |
"text": [
|
782 |
+
"Running on local URL: http://127.0.0.1:7861\n",
|
783 |
"\n",
|
784 |
"To create a public link, set `share=True` in `launch()`.\n"
|
785 |
]
|
|
|
787 |
{
|
788 |
"data": {
|
789 |
"text/html": [
|
790 |
+
"<div><iframe src=\"http://127.0.0.1:7861/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
|
791 |
],
|
792 |
"text/plain": [
|
793 |
"<IPython.core.display.HTML object>"
|
|
|
800 |
"data": {
|
801 |
"text/plain": []
|
802 |
},
|
803 |
+
"execution_count": 28,
|
804 |
"metadata": {},
|
805 |
"output_type": "execute_result"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
806 |
}
|
807 |
],
|
808 |
"source": [
|
|
|
828 |
},
|
829 |
{
|
830 |
"cell_type": "code",
|
831 |
+
"execution_count": 29,
|
832 |
"id": "28e8d888-e790-46fa-bbac-4511b9ab796c",
|
833 |
"metadata": {},
|
834 |
"outputs": [
|
|
|
836 |
"name": "stdout",
|
837 |
"output_type": "stream",
|
838 |
"text": [
|
839 |
+
"Closing server running on port: 7861\n"
|
840 |
]
|
841 |
}
|
842 |
],
|
|
|
847 |
},
|
848 |
{
|
849 |
"cell_type": "code",
|
850 |
+
"execution_count": 30,
|
851 |
"id": "afbc9699-4d16-4060-88f4-cd1251754cbd",
|
852 |
"metadata": {},
|
853 |
"outputs": [],
|
|
|
858 |
},
|
859 |
{
|
860 |
"cell_type": "code",
|
861 |
+
"execution_count": 53,
|
862 |
"id": "0420310d-930b-4904-8bd4-3458ad8bdbd3",
|
863 |
"metadata": {},
|
864 |
"outputs": [],
|
865 |
"source": [
|
866 |
"#| hide\n",
|
867 |
+
"import nbdev\n",
|
868 |
"nbdev.export.nb_export('app.ipynb',lib_path='.')"
|
869 |
]
|
870 |
},
|
app.py
CHANGED
@@ -3,7 +3,7 @@
|
|
3 |
# %% auto 0
|
4 |
__all__ = ['secret_import_failed', 'TEMP', 'TEMP_DIR', 'providers', 'clean_text_prompt', 'OPENAI_CLIENT_TTS_THREADS',
|
5 |
'CARTESIAAI_CLIENT_TTS_THREADS', 'DEFAULT_PROVIDER', 'DEFAULT_MODEL', 'DEFAULT_VOICE', 'launch_kwargs',
|
6 |
-
'queue_kwargs', 'verify_authorization', 'split_text', '
|
7 |
'create_speech_cartesiaai', 'create_speech', 'get_input_text_len', 'get_generation_cost',
|
8 |
'get_model_choices', 'update_model_choices', 'get_voice_choices', 'update_voice_choices']
|
9 |
|
@@ -163,20 +163,16 @@ def split_text(input_text, max_length=4000, lookback=1000):
|
|
163 |
return chunks
|
164 |
|
165 |
# %% app.ipynb 21
|
166 |
-
def
|
167 |
|
168 |
# Initialize an empty AudioSegment object for concatenation
|
169 |
combined = AudioSegment.empty()
|
170 |
-
|
171 |
-
# Write out audio file responses as individual files for debugging
|
172 |
-
# for idx, mp3_data in enumerate(mp3_files):
|
173 |
-
# with open(f'./{idx}.mp3', 'wb') as f:
|
174 |
-
# f.write(mp3_data)
|
175 |
|
176 |
# Loop through the list of mp3 binary data
|
177 |
-
for
|
178 |
# Convert binary data to an audio segment
|
179 |
-
audio_segment = AudioSegment.from_file(io.BytesIO(
|
|
|
180 |
# Concatenate this segment to the combined segment
|
181 |
combined += audio_segment
|
182 |
|
@@ -212,36 +208,44 @@ def create_speech_openai(chunk_idx, input, model='tts-1', voice='alloy', speed=1
|
|
212 |
# %% app.ipynb 24
|
213 |
def create_speech_cartesiaai(chunk_idx, input, model='upbeat-moon',
|
214 |
voice='248be419-c632-4f23-adf1-5324ed7dbf1d', #Hannah
|
215 |
-
websocket=False,
|
|
|
|
|
216 |
client = cartesia.tts.CartesiaTTS()
|
217 |
|
218 |
-
@retry(wait=wait_random_exponential(min=1, max=180), stop=stop_after_attempt(6))
|
219 |
def _create_speech_with_backoff(**kwargs):
|
220 |
return client.generate(**kwargs)
|
221 |
|
222 |
-
response = _create_speech_with_backoff(transcript=input, model_id=model,
|
223 |
-
|
|
|
|
|
|
|
224 |
client.close()
|
225 |
return chunk_idx, response["audio"]
|
226 |
|
227 |
-
# %% app.ipynb
|
228 |
-
def create_speech(input_text, provider, model='tts-1', voice='alloy',
|
|
|
|
|
229 |
|
230 |
#Verify auth if it is required. This is very important if this is in a HF space. DO NOT DELETE!!!
|
231 |
-
verify_authorization(profile)
|
232 |
start = datetime.now()
|
233 |
-
|
234 |
|
235 |
if provider == 'cartesiaai':
|
236 |
create_speech_func = create_speech_cartesiaai
|
237 |
max_chunk_size = 500
|
238 |
chunk_processing_time = 20
|
239 |
threads = CARTESIAAI_CLIENT_TTS_THREADS
|
|
|
240 |
elif provider == 'openai':
|
241 |
create_speech_func = create_speech_openai
|
242 |
max_chunk_size = 4000
|
243 |
chunk_processing_time = 60
|
244 |
threads = OPENAI_CLIENT_TTS_THREADS
|
|
|
245 |
else:
|
246 |
raise ValueError(f'Invalid argument provider: {provider}')
|
247 |
|
@@ -266,7 +270,7 @@ def create_speech(input_text, provider, model='tts-1', voice='alloy', profile: g
|
|
266 |
progress(.9, desc=f"Merging audio chunks... {(datetime.now()-start).seconds} seconds to process.")
|
267 |
|
268 |
# Concatenate the audio data from all chunks
|
269 |
-
combined_audio =
|
270 |
|
271 |
# Final update to the progress bar
|
272 |
progress(1, desc=f"Processing completed... {(datetime.now()-start).seconds} seconds to process.")
|
@@ -276,11 +280,11 @@ def create_speech(input_text, provider, model='tts-1', voice='alloy', profile: g
|
|
276 |
return combined_audio
|
277 |
|
278 |
|
279 |
-
# %% app.ipynb
|
280 |
def get_input_text_len(input_text):
|
281 |
return len(input_text)
|
282 |
|
283 |
-
# %% app.ipynb
|
284 |
def get_generation_cost(input_text, tts_model_dropdown, provider):
|
285 |
text_len = len(input_text)
|
286 |
if provider == 'openai':
|
@@ -294,25 +298,25 @@ def get_generation_cost(input_text, tts_model_dropdown, provider):
|
|
294 |
raise ValueError(f'Invalid argument provider: {provider}')
|
295 |
return "${:,.3f}".format(cost)
|
296 |
|
297 |
-
# %% app.ipynb
|
298 |
def get_model_choices(provider):
|
299 |
return sorted([(v,k) for k,v in providers[provider]['models'].items()])
|
300 |
|
301 |
-
# %% app.ipynb
|
302 |
def update_model_choices(provider):
|
303 |
choices = get_model_choices(provider)
|
304 |
-
return gr.update(choices=choices,value=choices[0])
|
305 |
|
306 |
-
# %% app.ipynb
|
307 |
def get_voice_choices(provider, model):
|
308 |
return sorted([(v['name'],v['id']) for v in providers[provider]['voices'].values()])
|
309 |
|
310 |
-
# %% app.ipynb
|
311 |
def update_voice_choices(provider, model):
|
312 |
choices = get_voice_choices(provider, model)
|
313 |
-
return gr.update(choices=choices,value=choices[0])
|
314 |
|
315 |
-
# %% app.ipynb
|
316 |
with gr.Blocks(title='TTS', head='TTS', delete_cache=(3600,3600)) as app:
|
317 |
gr.Markdown("# TTS")
|
318 |
gr.Markdown("""Start typing below and then click **Go** to create the speech from your text.
|
@@ -359,13 +363,13 @@ For requests longer than allowed by the API they will be broken into chunks auto
|
|
359 |
app.load(verify_authorization, None, m)
|
360 |
|
361 |
|
362 |
-
# %% app.ipynb
|
363 |
# launch_kwargs = {'auth':('username',GRADIO_PASSWORD),
|
364 |
# 'auth_message':'Please log in to Mat\'s TTS App with username: username and password.'}
|
365 |
launch_kwargs = {}
|
366 |
queue_kwargs = {'default_concurrency_limit':10}
|
367 |
|
368 |
-
# %% app.ipynb
|
369 |
#.py launch
|
370 |
if __name__ == "__main__":
|
371 |
app.queue(**queue_kwargs)
|
|
|
3 |
# %% auto 0
|
4 |
__all__ = ['secret_import_failed', 'TEMP', 'TEMP_DIR', 'providers', 'clean_text_prompt', 'OPENAI_CLIENT_TTS_THREADS',
|
5 |
'CARTESIAAI_CLIENT_TTS_THREADS', 'DEFAULT_PROVIDER', 'DEFAULT_MODEL', 'DEFAULT_VOICE', 'launch_kwargs',
|
6 |
+
'queue_kwargs', 'verify_authorization', 'split_text', 'concatenate_audio', 'create_speech_openai',
|
7 |
'create_speech_cartesiaai', 'create_speech', 'get_input_text_len', 'get_generation_cost',
|
8 |
'get_model_choices', 'update_model_choices', 'get_voice_choices', 'update_voice_choices']
|
9 |
|
|
|
163 |
return chunks
|
164 |
|
165 |
# %% app.ipynb 21
|
166 |
+
def concatenate_audio(files:list, **kwargs):
|
167 |
|
168 |
# Initialize an empty AudioSegment object for concatenation
|
169 |
combined = AudioSegment.empty()
|
|
|
|
|
|
|
|
|
|
|
170 |
|
171 |
# Loop through the list of mp3 binary data
|
172 |
+
for data in files:
|
173 |
# Convert binary data to an audio segment
|
174 |
+
audio_segment = AudioSegment.from_file(io.BytesIO(data), **kwargs)
|
175 |
+
|
176 |
# Concatenate this segment to the combined segment
|
177 |
combined += audio_segment
|
178 |
|
|
|
208 |
# %% app.ipynb 24
|
209 |
def create_speech_cartesiaai(chunk_idx, input, model='upbeat-moon',
                             voice='248be419-c632-4f23-adf1-5324ed7dbf1d', #Hannah
                             websocket=False,
                             output_format='pcm_44100',
                             **kwargs):
    """Generate speech for one text chunk via the Cartesia AI TTS API.

    Parameters:
        chunk_idx: index of this chunk; passed through unchanged so concurrent
            callers can reassemble results in order.
        input: text to synthesize.
        model: Cartesia model id.
        voice: Cartesia voice id (default is the 'Hannah' voice).
        websocket: whether to ask the client to stream over a websocket.
        output_format: audio format requested from the API
            ('pcm_44100' = raw PCM; downstream conversion assumes this — TODO confirm).
        **kwargs: forwarded verbatim to ``client.generate``.

    Returns:
        (chunk_idx, audio_bytes) tuple, where audio_bytes is the ``"audio"``
        field of the API response.
    """
    client = cartesia.tts.CartesiaTTS()

    # Retry/backoff is currently disabled to match existing behavior; the
    # wrapper is kept so the decorator can be re-enabled without refactoring.
    # @retry(wait=wait_random_exponential(min=1, max=180), stop=stop_after_attempt(6))
    def _create_speech_with_backoff(**kwargs):
        return client.generate(**kwargs)

    try:
        response = _create_speech_with_backoff(transcript=input, model_id=model,
                                               voice=client.get_voice_embedding(voice_id=voice),
                                               websocket=websocket,
                                               output_format=output_format,
                                               **kwargs)
    finally:
        # Always release the client connection, even if generation fails,
        # so a failed chunk doesn't leak sockets under the thread pool.
        client.close()
    return chunk_idx, response["audio"]
|
227 |
|
228 |
+
# %% app.ipynb 26
|
229 |
+
def create_speech(input_text, provider, model='tts-1', voice='alloy',
|
230 |
+
# profile: gr.OAuthProfile|None=None,
|
231 |
+
progress=gr.Progress(), **kwargs):
|
232 |
|
233 |
#Verify auth if it is required. This is very important if this is in a HF space. DO NOT DELETE!!!
|
234 |
+
if REQUIRE_AUTH: verify_authorization(profile)
|
235 |
start = datetime.now()
|
|
|
236 |
|
237 |
if provider == 'cartesiaai':
|
238 |
create_speech_func = create_speech_cartesiaai
|
239 |
max_chunk_size = 500
|
240 |
chunk_processing_time = 20
|
241 |
threads = CARTESIAAI_CLIENT_TTS_THREADS
|
242 |
+
audio_file_conversion_kwargs = {'format': 'raw', 'frame_rate': 44100, 'channels': 1, 'sample_width': 2}
|
243 |
elif provider == 'openai':
|
244 |
create_speech_func = create_speech_openai
|
245 |
max_chunk_size = 4000
|
246 |
chunk_processing_time = 60
|
247 |
threads = OPENAI_CLIENT_TTS_THREADS
|
248 |
+
audio_file_conversion_kwargs = {'format': 'mp3'}
|
249 |
else:
|
250 |
raise ValueError(f'Invalid argument provider: {provider}')
|
251 |
|
|
|
270 |
progress(.9, desc=f"Merging audio chunks... {(datetime.now()-start).seconds} seconds to process.")
|
271 |
|
272 |
# Concatenate the audio data from all chunks
|
273 |
+
combined_audio = concatenate_audio(audio_data, **audio_file_conversion_kwargs)
|
274 |
|
275 |
# Final update to the progress bar
|
276 |
progress(1, desc=f"Processing completed... {(datetime.now()-start).seconds} seconds to process.")
|
|
|
280 |
return combined_audio
|
281 |
|
282 |
|
283 |
+
# %% app.ipynb 28
|
284 |
def get_input_text_len(input_text):
    """Return the character count of *input_text* (feeds the UI length display)."""
    text_length = len(input_text)
    return text_length
|
286 |
|
287 |
+
# %% app.ipynb 29
|
288 |
def get_generation_cost(input_text, tts_model_dropdown, provider):
|
289 |
text_len = len(input_text)
|
290 |
if provider == 'openai':
|
|
|
298 |
raise ValueError(f'Invalid argument provider: {provider}')
|
299 |
return "${:,.3f}".format(cost)
|
300 |
|
301 |
+
# %% app.ipynb 30
|
302 |
def get_model_choices(provider):
    """Return (label, model_id) dropdown choices for *provider*, sorted by label."""
    model_map = providers[provider]['models']
    pairs = [(label, model_id) for model_id, label in model_map.items()]
    pairs.sort()
    return pairs
|
304 |
|
305 |
+
# %% app.ipynb 31
|
306 |
def update_model_choices(provider):
    """Refresh the model dropdown for *provider*, preselecting the first model's id."""
    choices = get_model_choices(provider)
    _, first_model_id = choices[0]
    return gr.update(choices=choices, value=first_model_id)
|
309 |
|
310 |
+
# %% app.ipynb 32
|
311 |
def get_voice_choices(provider, model):
    """Return (name, voice_id) dropdown choices for *provider*, sorted by name.

    *model* is accepted for interface parity with the dropdown change event
    but does not filter the voice list.
    """
    voice_entries = providers[provider]['voices'].values()
    return sorted((entry['name'], entry['id']) for entry in voice_entries)
|
313 |
|
314 |
+
# %% app.ipynb 33
|
315 |
def update_voice_choices(provider, model):
    """Refresh the voice dropdown for *provider*/*model*, preselecting the first voice id."""
    choices = get_voice_choices(provider, model)
    _, first_voice_id = choices[0]
    return gr.update(choices=choices, value=first_voice_id)
|
318 |
|
319 |
+
# %% app.ipynb 34
|
320 |
with gr.Blocks(title='TTS', head='TTS', delete_cache=(3600,3600)) as app:
|
321 |
gr.Markdown("# TTS")
|
322 |
gr.Markdown("""Start typing below and then click **Go** to create the speech from your text.
|
|
|
363 |
app.load(verify_authorization, None, m)
|
364 |
|
365 |
|
366 |
+
# %% app.ipynb 35
|
367 |
# launch_kwargs = {'auth':('username',GRADIO_PASSWORD),
|
368 |
# 'auth_message':'Please log in to Mat\'s TTS App with username: username and password.'}
|
369 |
launch_kwargs = {}
|
370 |
queue_kwargs = {'default_concurrency_limit':10}
|
371 |
|
372 |
+
# %% app.ipynb 37
|
373 |
#.py launch
|
374 |
if __name__ == "__main__":
|
375 |
app.queue(**queue_kwargs)
|