File size: 4,795 Bytes
83a4e82
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6b6fa2c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83a4e82
 
 
 
6b6fa2c
83a4e82
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import gradio as gr
import subprocess
import tempfile
import os
import shutil
from logging_config import logger
from transcription_tool import TranscriptTool  # Assuming TranscriptionTool is in `transcription_tool.py`

# Shared TranscriptTool instance (original note: "smolagent transcription tool").
# It is created once when this module is imported and reused by every
# transcribe_url() call below.
transcript_tool = TranscriptTool()


def transcribe_url(url):
    """
    Transcribes audio or video from a given URL.

    Downloads the media with yt-dlp, which extracts the audio track and
    converts it to WAV, then passes the resulting file to the module-level
    TranscriptTool. Each call works inside its own temporary directory that
    is deleted when the call finishes, so concurrent requests cannot
    overwrite each other's downloads and no audio files pile up on disk.

    Args:
        url (str): The URL of the audio or video file.

    Returns:
        str: The value returned by TranscriptTool.forward for the downloaded
             audio, or an error message if the URL is missing or the
             download or transcription fails. Errors are returned as text,
             never raised.
    """
    # Treat whitespace-only input the same as an empty textbox.
    if isinstance(url, str):
        url = url.strip()
    if not url:
        return "Error: Please provide a URL."

    try:
        # A private directory per request: avoids the shared fixed file name
        # and guarantees cleanup on every exit path (success or exception).
        with tempfile.TemporaryDirectory(prefix="transcribe_url_") as temp_download_dir:
            logger.info(f"Attempting to download audio from URL: {url}")

            # yt-dlp fills in %(ext)s itself; after conversion it is "wav".
            output_template = os.path.join(temp_download_dir, "downloaded_audio.%(ext)s")

            cmd = [
                "yt-dlp",
                # Prefer an audio-only stream, but fall back to the best
                # combined format when the source has no separate audio.
                "-f", "bestaudio/best",
                "-x",                      # extract audio
                "--audio-format", "wav",   # convert the extracted audio to WAV
                "-o", output_template,
                # End of options: a URL beginning with "-" must never be
                # parsed as a yt-dlp command-line flag.
                "--",
                url,
            ]
            process = subprocess.run(cmd, check=True, capture_output=True, text=True)
            logger.info("yt-dlp stdout:\n" + process.stdout)
            logger.info("yt-dlp stderr:\n" + process.stderr)

            # Locate the produced file; sort so the choice does not depend on
            # directory listing order, and prefer the converted .wav.
            downloaded_files = sorted(
                name
                for name in os.listdir(temp_download_dir)
                if name.startswith("downloaded_audio.")
            )
            if not downloaded_files:
                raise RuntimeError("yt-dlp failed to download or convert audio.")
            wav_files = [name for name in downloaded_files if name.endswith(".wav")]
            chosen_name = wav_files[0] if wav_files else downloaded_files[0]
            local_file_path = os.path.join(temp_download_dir, chosen_name)
            logger.info(f"Downloaded audio to temporary file: {local_file_path}")

            # Transcribe while the temporary directory still exists.
            return transcript_tool.forward(local_file_path)

    except subprocess.CalledProcessError as e:
        error_message = f"yt-dlp error: {e.stderr}"
        logger.error(error_message)
        return f"An error occurred during download: {error_message}"
    except Exception as e:
        # Top-level boundary for the UI/MCP caller: log with traceback and
        # hand back a readable message instead of propagating.
        error_message = f"An unexpected error occurred: {str(e)}"
        logger.exception(error_message)
        return error_message


# Gradio page definition: two text sections, a URL box, a button and an output
# box, with the button wired to transcribe_url.
with gr.Blocks() as app:
    # Static page text, named up front so the component list below stays short.
    page_title = "# <center>gradio-transcript-mcp: Transcribe Audio/Video from URL</center>"
    page_intro = """
        This application functions as an MCP server that transcribes audio or video from a URL using OpenAI's Whisper model.
        It downloads the media, converts it to WAV, and performs the transcription.

        ### Connecting to the Hosted Server
        To connect your MCP client that supports SSE to this hosted server, add a configuration entry similar to this:
        
        ```json
        {
        "mcpServers": {
            "gradio-transcript": {
            "url": "https://bismay-gradio-transcript-mcp.hf.space/gradio_api/mcp/sse"
            }
        }
        }
        ```

        For more details on setup and MCP usage, see the [README.md](README.md).
        """
    usage_hint = "Provide a URL to transcribe audio or YT video."

    # Header and description.
    gr.Markdown(value=page_title)
    gr.Markdown(value=page_intro)

    # Input controls.
    url_input = gr.Textbox(
        label="Enter Audio/Video URL",
        placeholder="e.g., https://www.youtube.com/watch?v=dQw4w9WgXcQ",
    )
    transcribe_button = gr.Button(value="Transcribe")

    gr.Markdown(value=usage_hint)

    # Result area.
    transcription_output = gr.Textbox(label="Transcription", lines=10)

    # Button click: the textbox value goes to transcribe_url and the returned
    # string is written to the transcription box.
    transcribe_button.click(transcribe_url, [url_input], [transcription_output])

# Launch the Gradio app only when this file is executed as a script, not when
# it is imported. launch() is called with mcp_server=True; the page text in
# this file describes the app as an MCP server with an SSE endpoint.
# NOTE(review): confirm the exact effect of mcp_server against the Gradio docs.
if __name__ == "__main__":
    app.launch(mcp_server=True)