File size: 2,756 Bytes
650a7ec
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bd507ca
650a7ec
 
 
 
 
 
c2be063
650a7ec
 
bd507ca
650a7ec
 
 
 
bd507ca
650a7ec
f5c4474
 
 
bd507ca
 
f5c4474
650a7ec
 
 
 
bd507ca
650a7ec
 
bd507ca
650a7ec
 
bd507ca
650a7ec
 
bd507ca
650a7ec
 
 
0a71e28
 
bd507ca
0a71e28
 
 
 
650a7ec
 
0a71e28
650a7ec
 
 
 
 
 
 
 
 
 
 
 
f5c4474
 
0a71e28
 
 
f5c4474
 
0a71e28
650a7ec
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import os
from pathlib import Path
from httpx import AsyncClient

import gradio as gr
import numpy as np
from dotenv import load_dotenv
from fastrtc import (
    AdditionalOutputs,
    ReplyOnPause,
    Stream,
    audio_to_bytes,
    get_turn_credentials_async,
    get_turn_credentials,
)
from gradio.utils import get_space
from languages import LANGUAGES

# Directory containing this file; handy for resolving bundled assets.
cur_dir = Path(__file__).parent

# Load environment variables (notably HF_TOKEN) from a local .env file, if present.
load_dotenv()


# Shared async HTTP client for the transcription endpoint; 30 s timeout
# allows for longer audio uploads.
client = AsyncClient(timeout=30)


async def transcribe_file(audio: tuple[int, np.ndarray], language: str) -> str:
    """Transcribe one audio clip via the hosted Whisper inference endpoint.

    Args:
        audio: ``(sample_rate, samples)`` tuple as produced by Gradio/FastRTC.
        language: Language code forwarded to the transcription API.

    Returns:
        The transcript as plain text.

    Raises:
        httpx.HTTPStatusError: If the endpoint responds with an error status.
    """
    response = await client.post(
        url="https://douatiz8x2itm3yn.us-east-1.aws.endpoints.huggingface.cloud/api/v1/audio/transcriptions",
        headers={"Authorization": f"Bearer {os.getenv('HF_TOKEN')}"},
        files={"file": audio_to_bytes(audio)},
        data={"response_format": "text", "language": language},
    )
    # Fail loudly on HTTP errors; previously an error body (e.g. an auth or
    # 5xx message) would be returned and shown to the user as a "transcript".
    response.raise_for_status()
    return response.text


async def transcribe(audio: tuple[int, np.ndarray], transcript: str, language: str):
    """Streaming callback: transcribe the latest utterance and extend the transcript.

    Args:
        audio: ``(sample_rate, samples)`` chunk captured since the last pause.
        transcript: Transcript accumulated so far (may be empty).
        language: Language code for the transcription API.

    Yields:
        AdditionalOutputs carrying the updated transcript string.
    """
    text = await transcribe_file(audio, language)
    # strip() avoids a stray leading space on the very first utterance,
    # when `transcript` is still the empty string.
    yield AdditionalOutputs(f"{transcript} {text}".strip())


# Shared textbox: wired in both as an additional input (the transcript so far)
# and as the additional output receiving the updated transcript.
transcript = gr.Textbox(label="Transcript")
stream = Stream(
    # 48 kHz is the WebRTC/Opus native rate; the original 48_100 appears to be
    # a typo conflating 48_000 and 44_100 (48,100 Hz is not a standard rate).
    ReplyOnPause(transcribe, input_sample_rate=48_000),
    modality="audio",
    mode="send",
    additional_inputs=[transcript, gr.Dropdown(choices=LANGUAGES, label="Language")],
    additional_outputs=[transcript],
    # Keep only the freshly produced transcript (b); discard the stale value (a).
    additional_outputs_handler=lambda a, b: b,
    rtc_configuration=get_turn_credentials_async,
    server_rtc_configuration=get_turn_credentials(ttl=604_800),  # 7-day TURN credential TTL
    # Cap concurrent sessions only when hosted on a HF Space; unlimited locally.
    concurrency_limit=20 if get_space() else None,
    time_limit=300,  # seconds per streaming session
    ui_args={"title": ""},
)

# Non-streaming UI: upload or record a full clip, get the complete transcript back.
iface = gr.Interface(
    fn=transcribe_file,
    inputs=[gr.Audio(label="Upload Audio", sources=["upload", "microphone"]), gr.Dropdown(choices=LANGUAGES, label="Language")],
    outputs=gr.Textbox(label="Transcript"),
)


# Top-level page: branded header plus two tabs hosting the streaming UI and
# the file-upload UI built above.
with gr.Blocks() as demo:
    # Header with the Huggy logo; the image file is served via allowed_paths below.
    gr.HTML(
        """
        <h1 style='text-align: center; display: flex; align-items: center; justify-content: center;'>
        <img src="/gradio_api/file=AV_Huggy.png" alt="Streaming Huggy" style="height: 50px; margin-right: 10px"> Really Fast Whisper
        </h1>
    """
    )
    gr.HTML(
        """
    <h2 style='text-align: center'>
    Powered by <a href="https://huggingface.co/hfendpoints/whisper-large-v3">HF Inference Endpoints</a> and <a href="https://fastrtc.org/">FastRTC</a>
    </h2>
    """
    )
    with gr.Tabs():
        with gr.Tab("Streaming"):
            gr.Markdown(
                "Grant access to the microphone and speak naturally. The transcript will be updated as you pause."
            )
            # Embed the FastRTC stream's built-in UI inside this tab.
            stream.ui.render()
        with gr.Tab("File Upload"):
            iface.render()
if __name__ == "__main__":
    # allowed_paths lets Gradio serve the logo referenced in the header HTML.
    demo.launch(allowed_paths=["AV_Huggy.png"])