Spaces:

sofdog
/

live-transcription-docker

Sleeping

App Files Community

Sofia Casadei committed 10 days ago

Commit

97f18ea

1 Parent(s): 382a8a5

fix: turn server config

Browse files

Files changed (2) hide show

main.py +21 -14
utils/turn_server.py +86 -81

main.py CHANGED Viewed

@@ -2,6 +2,7 @@ import os
 import logging
 import json
 import torch
 import gradio as gr
 import numpy as np
@@ -16,8 +17,6 @@ from fastrtc import (
     AlgoOptions,
     SileroVadOptions,
     audio_to_bytes,
-    get_cloudflare_turn_credentials_async,
-    get_cloudflare_turn_credentials,
 )
 from transformers import (
     AutoModelForSpeechSeq2Seq,
@@ -28,7 +27,7 @@ from transformers.utils import is_flash_attn_2_available
 from utils.logger_config import setup_logging
 from utils.device import get_device, get_torch_and_np_dtypes
-from utils.turn_server import get_rtc_credentials
 load_dotenv()
@@ -39,10 +38,10 @@ logger = logging.getLogger(__name__)
 UI_MODE = os.getenv("UI_MODE", "fastapi").lower() # gradio | fastapi
 UI_TYPE = os.getenv("UI_TYPE", "base").lower() # base | screen
 APP_MODE = os.getenv("APP_MODE", "local").lower() # local | deployed
 MODEL_ID = os.getenv("MODEL_ID", "openai/whisper-large-v3-turbo")
 LANGUAGE = os.getenv("LANGUAGE", "english").lower()
 device = get_device(force_cpu=False)
 torch_dtype, np_dtype = get_torch_and_np_dtypes(device, use_bfloat16=False)
 logger.info(f"Using device: {device}, torch_dtype: {torch_dtype}, np_dtype: {np_dtype}")
@@ -93,8 +92,8 @@ async def transcribe(audio: tuple[int, np.ndarray]):
     outputs = transcribe_pipeline(
         audio_to_bytes(audio),
-        chunk_length_s=6,
-        batch_size=1,
         generate_kwargs={
             'task': 'transcribe',
             'language': LANGUAGE,
@@ -103,8 +102,8 @@ async def transcribe(audio: tuple[int, np.ndarray]):
     )
     yield AdditionalOutputs(outputs["text"].strip())
-async def get_credentials():
-    return await get_cloudflare_turn_credentials_async(hf_token=os.getenv("HF_TOKEN"))
 logger.info("Initializing FastRTC stream")
 stream = Stream(
@@ -123,12 +122,13 @@ stream = Stream(
             threshold=0.5,
             # Final speech chunks shorter min_speech_duration_ms are thrown out (default 250)
             min_speech_duration_ms=250,
-            # Max duration of speech chunks, longer will be split (default float('inf'))
-            max_speech_duration_s=6,
             # Wait for ms at the end of each speech chunk before separating it (default 2000)
             min_silence_duration_ms=100,
             # Chunk size for VAD model. Can be 512, 1024, 1536 for 16k s.r. (default 1024)
-            window_size_samples=1024,
             # Final speech chunks are padded by speech_pad_ms each side (default 400)
             speech_pad_ms=200,
         ),
@@ -142,8 +142,8 @@ stream = Stream(
         gr.Textbox(label="Transcript"),
     ],
     additional_outputs_handler=lambda current, new: current + " " + new,
-    rtc_configuration=get_credentials if APP_MODE == "deployed" else None,
-    server_rtc_configuration=get_cloudflare_turn_credentials(ttl=360_000) if APP_MODE == "deployed" else None,
     concurrency_limit=6
 )
@@ -158,7 +158,14 @@ async def index():
     elif UI_TYPE == "screen":
         html_content = open("static/index-screen.html").read()
-    rtc_config = get_credentials if APP_MODE == "deployed" else None
     return HTMLResponse(content=html_content.replace("__RTC_CONFIGURATION__", json.dumps(rtc_config)))
 @app.get("/transcript")

 import logging
 import json
 import torch
+import asyncio
 import gradio as gr
 import numpy as np
     AlgoOptions,
     SileroVadOptions,
     audio_to_bytes,
 )
 from transformers import (
     AutoModelForSpeechSeq2Seq,
 from utils.logger_config import setup_logging
 from utils.device import get_device, get_torch_and_np_dtypes
+from utils.turn_server import get_credential_function, get_rtc_credentials
 load_dotenv()
 UI_MODE = os.getenv("UI_MODE", "fastapi").lower() # gradio | fastapi
 UI_TYPE = os.getenv("UI_TYPE", "base").lower() # base | screen
 APP_MODE = os.getenv("APP_MODE", "local").lower() # local | deployed
+TURN_SERVER_PROVIDER = os.getenv("TURN_SERVER_PROVIDER", "hf-cloudflare").lower() # hf-cloudflare | cloudflare | hf | twilio
 MODEL_ID = os.getenv("MODEL_ID", "openai/whisper-large-v3-turbo")
 LANGUAGE = os.getenv("LANGUAGE", "english").lower()
 device = get_device(force_cpu=False)
 torch_dtype, np_dtype = get_torch_and_np_dtypes(device, use_bfloat16=False)
 logger.info(f"Using device: {device}, torch_dtype: {torch_dtype}, np_dtype: {np_dtype}")
     outputs = transcribe_pipeline(
         audio_to_bytes(audio),
+        chunk_length_s=3,
+        batch_size=2,
         generate_kwargs={
             'task': 'transcribe',
             'language': LANGUAGE,
     )
     yield AdditionalOutputs(outputs["text"].strip())
+get_credentials = get_credential_function(TURN_SERVER_PROVIDER, is_async=True) if APP_MODE == "deployed" else None
+server_rtc_configuration = get_rtc_credentials(provider=TURN_SERVER_PROVIDER, ttl=360_000) if APP_MODE == "deployed" else None
 logger.info("Initializing FastRTC stream")
 stream = Stream(
             threshold=0.5,
             # Final speech chunks shorter min_speech_duration_ms are thrown out (default 250)
             min_speech_duration_ms=250,
+            # Max duration of speech chunks, longer will be split at the timestamp of the last silence
+            # that lasts more than 100ms (if any) or just before max_speech_duration_s (default float('inf'))
+            max_speech_duration_s=3,
             # Wait for ms at the end of each speech chunk before separating it (default 2000)
             min_silence_duration_ms=100,
             # Chunk size for VAD model. Can be 512, 1024, 1536 for 16k s.r. (default 1024)
+            window_size_samples=512,
             # Final speech chunks are padded by speech_pad_ms each side (default 400)
             speech_pad_ms=200,
         ),
         gr.Textbox(label="Transcript"),
     ],
     additional_outputs_handler=lambda current, new: current + " " + new,
+    rtc_configuration=get_credentials,
+    server_rtc_configuration=server_rtc_configuration,
     concurrency_limit=6
 )
     elif UI_TYPE == "screen":
         html_content = open("static/index-screen.html").read()
+    # Return the actual credentials for client-side, not the function
+    rtc_config = None
+    if APP_MODE == "deployed":
+        if asyncio.iscoroutinefunction(get_credentials):
+            rtc_config = await get_credentials()
+        else:
+            rtc_config = get_rtc_credentials(provider=TURN_SERVER_PROVIDER)
     return HTMLResponse(content=html_content.replace("__RTC_CONFIGURATION__", json.dumps(rtc_config)))
 @app.get("/transcript")

utils/turn_server.py CHANGED Viewed

@@ -1,19 +1,24 @@
 import os
-from typing import Literal, Optional, Dict, Any
 import requests
-from fastrtc import get_hf_turn_credentials, get_twilio_turn_credentials
 def get_rtc_credentials(
-    provider: Literal["hf", "twilio", "cloudflare"] = "hf",
     **kwargs
 ) -> Dict[str, Any]:
     """
     Get RTC configuration for different TURN server providers.
     Args:
-        provider: The TURN server provider to use ('hf', 'twilio', or 'cloudflare')
         **kwargs: Additional arguments passed to the specific provider's function
     Returns:
@@ -21,99 +26,99 @@ def get_rtc_credentials(
     """
     try:
         if provider == "hf":
-            return get_hf_credentials(**kwargs)
         elif provider == "twilio":
-            return get_twilio_credentials(**kwargs)
         elif provider == "cloudflare":
-            return get_cloudflare_credentials(**kwargs)
     except Exception as e:
         raise Exception(f"Failed to get RTC credentials ({provider}): {str(e)}")
-def get_hf_credentials(token: Optional[str] = None) -> Dict[str, Any]:
-    """
-    Get credentials for Hugging Face's community TURN server.
-    Required setup:
-    1. Create a Hugging Face account at huggingface.co
-    2. Visit: https://huggingface.co/spaces/fastrtc/turn-server-login
-    3. Set HF_TOKEN environment variable or pass token directly
-    """
-    token = token or os.environ.get("HF_TOKEN")
-    if not token:
-        raise ValueError("HF_TOKEN environment variable not set")
-    try:
-        return get_hf_turn_credentials(token=token)
-    except Exception as e:
-        raise Exception(f"Failed to get HF TURN credentials: {str(e)}")
-def get_twilio_credentials(
-    account_sid: Optional[str] = None,
-    auth_token: Optional[str] = None
 ) -> Dict[str, Any]:
     """
-    Get credentials for Twilio's TURN server.
-    Required setup:
-    1. Create a free Twilio account at: https://login.twilio.com/u/signup
-    2. Get your Account SID and Auth Token from the Twilio Console
-    3. Set environment variables:
-       - TWILIO_ACCOUNT_SID (or pass directly)
-       - TWILIO_AUTH_TOKEN (or pass directly)
-    """
-    account_sid = account_sid or os.environ.get("TWILIO_ACCOUNT_SID")
-    auth_token = auth_token or os.environ.get("TWILIO_AUTH_TOKEN")
-    if not account_sid or not auth_token:
-        raise ValueError("Twilio credentials not found. Set TWILIO_ACCOUNT_SID and TWILIO_AUTH_TOKEN env vars")
     try:
-        return get_twilio_turn_credentials(account_sid=account_sid, auth_token=auth_token)
     except Exception as e:
-        raise Exception(f"Failed to get Twilio TURN credentials: {str(e)}")
-def get_cloudflare_credentials(
-    key_id: Optional[str] = None,
-    api_token: Optional[str] = None,
-    ttl: int = 86400
-) -> Dict[str, Any]:
     """
-    Get credentials for Cloudflare's TURN server.
-    Required setup:
-    1. Create a free Cloudflare account
-    2. Go to Cloudflare dashboard -> Calls section
-    3. Create a TURN App and get the Turn Token ID and API Token
-    4. Set environment variables:
-       - TURN_KEY_ID
-       - TURN_KEY_API_TOKEN
     Args:
-        key_id: Cloudflare Turn Token ID (optional, will use env var if not provided)
-        api_token: Cloudflare API Token (optional, will use env var if not provided)
-        ttl: Time-to-live for credentials in seconds (default: 24 hours)
     """
-    key_id = key_id or os.environ.get("TURN_KEY_ID")
-    api_token = api_token or os.environ.get("TURN_KEY_API_TOKEN")
-    if not key_id or not api_token:
-        raise ValueError("Cloudflare credentials not found. Set TURN_KEY_ID and TURN_KEY_API_TOKEN env vars")
-    response = requests.post(
-        f"https://rtc.live.cloudflare.com/v1/turn/keys/{key_id}/credentials/generate",
-        headers={
-            "Authorization": f"Bearer {api_token}",
-            "Content-Type": "application/json",
-        },
-        json={"ttl": ttl},
-    )
-    if response.ok:
-        return {"iceServers": [response.json()["iceServers"]]}
     else:
-        raise Exception(
-            f"Failed to get Cloudflare TURN credentials: {response.status_code} {response.text}"
-        )

 import os
+from typing import Literal, Optional, Dict, Any, Callable, Awaitable
 import requests
+from fastrtc import (
+    get_hf_turn_credentials,
+    get_twilio_turn_credentials,
+    get_cloudflare_turn_credentials,
+    get_cloudflare_turn_credentials_async
+)
 def get_rtc_credentials(
+    provider: Literal["hf", "twilio", "cloudflare", "hf-cloudflare"] = "hf-cloudflare",
     **kwargs
 ) -> Dict[str, Any]:
     """
     Get RTC configuration for different TURN server providers.
     Args:
+        provider: The TURN server provider to use ('hf', 'twilio', 'cloudflare', or 'hf-cloudflare')
         **kwargs: Additional arguments passed to the specific provider's function
     Returns:
     """
     try:
         if provider == "hf":
+            # HF Community Server (Deprecated)
+            # 1. Create a Hugging Face account at huggingface.co
+            # 2. Visit: https://huggingface.co/settings/tokens to create a token
+            # 3. Set HF_TOKEN environment variable or pass token directly
+            token = kwargs.pop("token", os.environ.get("HF_TOKEN"))
+            if not token:
+                raise ValueError("HF_TOKEN environment variable not set")
+            return get_hf_turn_credentials(token=token)
         elif provider == "twilio":
+            # Twilio TURN Server
+            # 1. Create a free Twilio account at: https://login.twilio.com/u/signup
+            # 2. Get your Account SID and Auth Token from the Twilio Console
+            # 3. Set environment variables: TWILIO_ACCOUNT_SID and TWILIO_AUTH_TOKEN
+            account_sid = kwargs.pop("account_sid", os.environ.get("TWILIO_ACCOUNT_SID"))
+            auth_token = kwargs.pop("auth_token", os.environ.get("TWILIO_AUTH_TOKEN"))
+            if not account_sid or not auth_token:
+                raise ValueError("Twilio credentials not found. Set TWILIO_ACCOUNT_SID and TWILIO_AUTH_TOKEN env vars")
+            return get_twilio_turn_credentials(account_sid=account_sid, auth_token=auth_token)
         elif provider == "cloudflare":
+            # Cloudflare TURN Server
+            # 1. Create a free Cloudflare account
+            # 2. Go to Cloudflare dashboard -> Calls section
+            # 3. Create a TURN App and get the Turn Token ID and API Token
+            # 4. Set environment variables: TURN_KEY_ID and TURN_KEY_API_TOKEN
+            key_id = kwargs.pop("key_id", os.environ.get("TURN_KEY_ID"))
+            api_token = kwargs.pop("api_token", os.environ.get("TURN_KEY_API_TOKEN"))
+            ttl = kwargs.pop("ttl", 86400)
+            if not key_id or not api_token:
+                raise ValueError("Cloudflare credentials not found. Set TURN_KEY_ID and TURN_KEY_API_TOKEN env vars")
+            return get_cloudflare_turn_credentials(key_id=key_id, api_token=api_token, ttl=ttl)
+        elif provider == "hf-cloudflare":
+            # Cloudflare with Hugging Face Token (10GB free traffic per month)
+            # 1. Create a Hugging Face account at huggingface.co
+            # 2. Visit: https://huggingface.co/settings/tokens to create a token
+            # 3. Set HF_TOKEN environment variable or pass token directly
+            hf_token = kwargs.pop("hf_token", os.environ.get("HF_TOKEN"))
+            ttl = kwargs.pop("ttl", 86400)
+            if not hf_token:
+                raise ValueError("HF_TOKEN environment variable not set")
+            return get_cloudflare_turn_credentials(hf_token=hf_token, ttl=ttl)
+        else:
+            raise ValueError(f"Unknown provider: {provider}")
     except Exception as e:
         raise Exception(f"Failed to get RTC credentials ({provider}): {str(e)}")
+async def get_rtc_credentials_async(
+    provider: Literal["hf-cloudflare"] = "hf-cloudflare",
+    **kwargs
 ) -> Dict[str, Any]:
     """
+    Get RTC configuration asynchronously for different TURN server providers.
+    Args:
+        provider: Currently only supports 'hf-cloudflare'
+        **kwargs: Additional arguments passed to the specific provider's function
+    Returns:
+        Dictionary containing the RTC configuration
+    """
+    if provider != "hf-cloudflare":
+        raise NotImplementedError(f"Async credentials for {provider} not implemented")
     try:
+        # Cloudflare with Hugging Face Token (10GB free traffic per month)
+        hf_token = kwargs.pop("hf_token", os.environ.get("HF_TOKEN"))
+        ttl = kwargs.pop("ttl", 600)  # Default 10 minutes for client-side
+        if not hf_token:
+            raise ValueError("HF_TOKEN environment variable not set")
+        return await get_cloudflare_turn_credentials_async(hf_token=hf_token, ttl=ttl)
     except Exception as e:
+        raise Exception(f"Failed to get async RTC credentials: {str(e)}")
+def get_credential_function(provider: str, is_async: bool = False) -> Callable:
     """
+    Get the appropriate credential function based on provider and whether async is needed.
     Args:
+        provider: The TURN server provider
+        is_async: Whether to return an async function
+    Returns:
+        Function that returns credentials (async or sync)
     """
+    if is_async and provider == "hf-cloudflare":
+        async def get_creds():
+            return await get_rtc_credentials_async(provider=provider)
+        return get_creds
     else:
+        def get_creds():
+            return get_rtc_credentials(provider=provider)
+        return get_creds