Bismay committed on
Commit
83a4e82
·
0 Parent(s):

Initial commit

Browse files
Files changed (10) hide show
  1. .gitattributes +35 -0
  2. .gitignore +158 -0
  3. README.md +113 -0
  4. app.py +103 -0
  5. ffmpeg_setup.py +59 -0
  6. logging_config.py +17 -0
  7. requirements.txt +11 -0
  8. tool_config.json +12 -0
  9. transcription.py +29 -0
  10. transcription_tool.py +100 -0
.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ pip-wheel-metadata/
24
+ share/python-wheels/
25
+ *.egg-info/
26
+ .installed.cfg
27
+ *.egg
28
+ MANIFEST
29
+
30
+ # PyInstaller
31
+ *.manifest
32
+ *.spec
33
+
34
+ # Installer logs
35
+ pip-log.txt
36
+ pip-delete-this-directory.txt
37
+
38
+ # Unit test / coverage reports
39
+ htmlcov/
40
+ .tox/
41
+ .nox/
42
+ .coverage
43
+ .coverage.*
44
+ .cache
45
+ nosetests.xml
46
+ coverage.xml
47
+ *.cover
48
+ *.py,cover
49
+ .hypothesis/
50
+ .pytest_cache/
51
+
52
+ # Translations
53
+ *.mo
54
+ *.pot
55
+
56
+ # Django stuff:
57
+ # *.log (Covered by generic *.log below)
58
+ local_settings.py
59
+ db.sqlite3
60
+ db.sqlite3-journal
61
+
62
+ # Flask stuff:
63
+ instance/
64
+ .webassets-cache
65
+
66
+ # Scrapy stuff:
67
+ .scrapy
68
+
69
+ # Sphinx documentation
70
+ docs/_build/
71
+
72
+ # PyBuilder
73
+ target/
74
+
75
+ # Jupyter Notebook
76
+ .ipynb_checkpoints
77
+
78
+ # IPython
79
+ profile_default/
80
+ ipython_config.py
81
+
82
+ # pyenv
83
+ .python-version
84
+
85
+ # PEP 582; used by PDM, Flit and potentially other packaging tools.
86
+ __pypackages__/
87
+
88
+ # Celery stuff
89
+ celerybeat-schedule
90
+ celerybeat.pid
91
+
92
+ # SageMath parsed files
93
+ *.sage.py
94
+
95
+ # Environments
96
+ .env
97
+ .venv
98
+ env/
99
+ venv/
100
+ ENV/
101
+ env.bak/
102
+ venv.bak/
103
+
104
+ # Spyder project settings
105
+ .spyderproject
106
+ .spyproject
107
+
108
+ # Rope project settings
109
+ .ropeproject
110
+
111
+ # mkdocs documentation
112
+ /site
113
+
114
+ # mypy
115
+ .mypy_cache/
116
+ .dmypy.json
117
+ dmypy.json
118
+
119
+ # Pyre type checker
120
+ .pyre/
121
+
122
+ # Hugging Face cache (often large)
123
+ .cache/huggingface/
124
+
125
+ # Gradio
126
+ .gradio/
127
+
128
+ # General cache directory (covers some cases like pytest)
129
+ .cache/
130
+
131
+ # IDE / Editor specific files
132
+ .vscode/
133
+ .idea/
134
+ *.swp
135
+ *~
136
+ *.sublime-project
137
+ *.sublime-workspace
138
+
139
+ # OS generated files
140
+ .DS_Store
141
+ Thumbs.db
142
+
143
+ # Project-specific
144
+ temp_downloads/
145
+ test.wav
146
+ # General log files
+ *.log
147
+
148
+ # Large media files potentially downloaded by yt-dlp
149
+ *.mp4
150
+ *.mkv
151
+ *.webm
152
+ *.mp3
153
+ *.m4a
154
+ *.flv
155
+ *.aac
156
+ *.ogg
157
+ *.opus
158
+ # Ignoring all .wav files, including test.wav
+ *.wav
README.md ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: TranscriptTool - Gradio MCP Server for Transcription
3
+ emoji: 💬
4
+ colorFrom: green
5
+ colorTo: green
6
+ sdk: gradio
7
+ sdk_version: 5.29.0
8
+ app_file: app.py
9
+ pinned: false
10
+ license: apache-2.0
11
+ short_description: Gradio-based MCP server to transcribe audio & video from URLs
12
+ ---
13
+
14
+ # TranscriptTool: A Gradio MCP Server for Audio/Video Transcription from URLs
15
+
16
+ ## Overview
17
+
18
+ `TranscriptTool` is a Gradio application configured to function as an MCP (Model Context Protocol) server. It is designed to transcribe audio and video from URLs into text. Using OpenAI's Whisper and `ffmpeg` (via `yt-dlp`), this server enables MCP clients (like Cline) to process multimedia inputs efficiently by downloading and converting content from a given URL. It supports robust handling, including format conversion to WAV and dynamic device selection (CPU or GPU).
19
+
20
+ The repository contains the following main components:
21
+ - **`app.py`**: The main Gradio application file that runs the MCP server.
22
+ - **`transcription_tool.py`**: The core logic for handling file conversion and calling the transcription function.
23
+ - **`transcription.py`**: Contains the implementation for Whisper transcription using the `transformers` library.
24
+ - **`tool_config.json`**: Configuration details for the `TranscriptTool`.
25
+ - **`requirements.txt`**: Lists the necessary Python dependencies.
26
+ - **`ffmpeg_setup.py`**: Script to ensure ffmpeg is available.
27
+ - **`logging_config.py`**: Configuration for logging.
28
+
29
+ ---
30
+
31
+ ## Installation
32
+
33
+ 1. Clone this repository:
34
+ ```bash
35
+ git clone https://huggingface.co/spaces/bismay/TranscriptTool
36
+ cd TranscriptTool
37
+ ```
38
+ 2. Install dependencies:
39
+ ```bash
40
+ pip install -r requirements.txt
41
+ ```
42
+ This will install the necessary libraries, including `gradio[mcp]`, `yt-dlp`, `transformers`, and `torch`.
43
+
44
+ ---
45
+ ## Usage
46
+
47
+ ### Running the Gradio App / MCP Server
48
+
49
+ To run the Gradio application which also starts the MCP server, execute:
50
+ ```bash
51
+ python app.py
52
+ ```
53
+
54
+ This will launch a local Gradio web interface and start the MCP server. The server will expose the `transcribe_url` function as an MCP tool.
55
+
56
+ ### Using as an MCP Server
57
+
58
+ When you run `python app.py`, the application starts an MCP server accessible to MCP clients.
59
+
60
+ **Exposed Tool:**
61
+
62
+ The server exposes one tool: `transcribe_url`.
63
+
64
+ * **Description:** Transcribes audio or video from a given URL. Downloads the media from the URL, converts it to WAV format, and then uses the TranscriptTool to perform the transcription in English.
65
+ * **Input:**
66
+ * `url` (string): The URL of the audio or video file.
67
+ * **Output:** (string): The transcription of the audio/video in English, or an error message if download or transcription fails.
68
+
69
+ **Connecting an MCP Client:**
70
+
71
+ The MCP server will typically be accessible at `http://127.0.0.1:7860/gradio_api/mcp/sse` when run locally. You can find the exact URL printed in your console when the Gradio app launches.
72
+
73
+ To connect an MCP client (like Cline) to this server, you need to add a configuration entry in your client's settings. The exact format depends on your client, but it generally involves specifying a name for the server and its URL.
74
+
75
+ Example configuration for a client (like Cline) that supports SSE:
76
+
77
+ ```json
78
+ {
79
+ "mcpServers": {
80
+ "localTranscript": {
81
+ "url": "http://127.0.0.1:7860/gradio_api/mcp/sse"
82
+ }
83
+ }
84
+ }
85
+ ```
86
+
87
+ *Note: If your MCP client does not directly support SSE-based servers (like some versions of Claude Desktop), you may need to use a tool like `mcp-remote` as an intermediary. Refer to your client's documentation for details.*
88
+
89
+ ### Connecting to the Hosted Server on Hugging Face Spaces
90
+
91
+ This application is also hosted on Hugging Face Spaces, providing a publicly accessible MCP server. You can connect to this hosted server using the following URL:
92
+
93
+ `https://bismay-transcripttool.hf.space/gradio_api/mcp/sse`
94
+
95
+ To connect your MCP client (like Cline) to this hosted server, add a configuration entry similar to this:
96
+
97
+ ```json
98
+ {
99
+ "mcpServers": {
100
+ "remoteTranscript": {
101
+ "url": "https://bismay-transcripttool.hf.space/gradio_api/mcp/sse"
102
+ }
103
+ }
104
+ }
105
+ ```
106
+
107
+ ---
108
+ ## License
109
+ This project is licensed under the Apache-2.0 License. See the LICENSE file for more details.
110
+
111
+ ---
112
+ ## Contributing
113
+ Contributions are welcome! Please open an issue or submit a pull request for any improvements or bug fixes.
app.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import subprocess
3
+ import tempfile
4
+ import os
5
+ import shutil
6
+ from logging_config import logger
7
+ from transcription_tool import TranscriptTool # Assuming TranscriptionTool is in `transcription_tool.py`
8
+
9
+ # smolagent transcription tool
10
+ transcript_tool = TranscriptTool()
11
+
12
+
13
def transcribe_url(url):
    """
    Transcribes audio or video from a given URL.

    Downloads the media from the URL, converts it to WAV format,
    and then uses the TranscriptTool to perform the transcription in English.

    Args:
        url (str): The URL of the audio or video file.

    Returns:
        str: The transcription of the audio/video in English, or an error message if
        download or transcription fails.
    """
    # Treat a whitespace-only string the same as an empty one.
    if isinstance(url, str):
        url = url.strip()
    if not url:
        return "Error: Please provide a URL."

    try:
        logger.info(f"Attempting to download audio from URL: {url}")
        temp_download_dir = "./temp_downloads"
        os.makedirs(temp_download_dir, exist_ok=True)

        # Each request gets its own scratch directory so that concurrent calls
        # cannot overwrite each other's "downloaded_audio.*" file, and so the
        # download is removed again once the transcription has finished.
        with tempfile.TemporaryDirectory(dir=temp_download_dir) as request_dir:
            output_template = os.path.join(request_dir, "downloaded_audio.%(ext)s")

            # yt-dlp options:
            #   -f bestaudio/best : best audio-only stream, falling back to the
            #                       best combined stream when none exists
            #   -x                : extract audio
            #   --audio-format wav: convert to wav
            #   -o                : output template
            #   --                : end of options, so a URL that starts with
            #                       "-" is never parsed as a yt-dlp option
            cmd = [
                "yt-dlp",
                "-f", "bestaudio/best",
                "-x",
                "--audio-format", "wav",
                "-o", output_template,
                "--", url,
            ]
            process = subprocess.run(cmd, check=True, capture_output=True, text=True)
            logger.info("yt-dlp stdout:\n" + process.stdout)
            logger.info("yt-dlp stderr:\n" + process.stderr)

            # Find the downloaded file; prefer the converted .wav if several
            # files with the expected stem are present.
            downloaded_files = sorted(
                f for f in os.listdir(request_dir) if f.startswith("downloaded_audio.")
            )
            if not downloaded_files:
                raise RuntimeError("yt-dlp failed to download or convert audio.")
            wav_files = [f for f in downloaded_files if f.lower().endswith(".wav")]
            local_file_path = os.path.join(request_dir, (wav_files or downloaded_files)[0])
            logger.info(f"Downloaded audio to temporary file: {local_file_path}")

            # Perform transcription while the scratch directory still exists.
            return transcript_tool.forward(local_file_path)

    except subprocess.CalledProcessError as e:
        error_message = f"yt-dlp error: {e.stderr}"
        logger.error(error_message)
        return f"An error occurred during download: {error_message}"
    except Exception as e:
        error_message = f"An unexpected error occurred: {str(e)}"
        logger.error(error_message)
        return error_message
83
+
84
+
85
# Gradio front end: one URL textbox, one button, one output textbox.
with gr.Blocks() as app:
    # Page heading followed by a short description of the tool.
    gr.Markdown("# TranscriptTool: Transcribe Audio/Video")
    gr.Markdown(
        "TranscriptTool is a smolagent tool used to transcribe audio and video files into text. This tool allows agents to process multimedia inputs efficiently. Can be used within a smolagent via the Hugging Face API."
    )

    # Input row.
    url_input = gr.Textbox(
        placeholder="e.g., https://www.youtube.com/watch?v=dQw4w9WgXcQ",
        label="Enter Audio/Video URL",
    )
    transcribe_button = gr.Button("Transcribe")

    gr.Markdown("Provide a URL to transcribe audio or video.")

    # Output area.
    transcription_output = gr.Textbox(lines=10, label="Transcription")

    # Clicking the button sends the textbox value through transcribe_url and
    # shows the returned string in the output textbox.
    transcribe_button.click(
        transcribe_url,
        inputs=[url_input],
        outputs=[transcription_output],
    )

# Only start the server when executed as a script.
if __name__ == "__main__":
    app.launch(mcp_server=True)
ffmpeg_setup.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import stat
3
+ import shutil
4
+ import subprocess
5
+ import imageio_ffmpeg
6
+ from logging_config import logger
7
+
8
+
9
def is_ffmpeg_in_path() -> bool:
    """Report whether an ``ffmpeg`` executable can be run from the current PATH.

    Runs ``ffmpeg -version`` and discards its output.

    Returns:
        bool: True if the command exits successfully; False if it cannot be
        started (missing, not executable, or any other OS-level launch error)
        or exits with a non-zero status.
    """
    try:
        subprocess.run(
            ["ffmpeg", "-version"],
            # The output is never inspected, so discard it instead of piping it.
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            check=True,
        )
    except (subprocess.CalledProcessError, OSError):
        # OSError covers FileNotFoundError as well as PermissionError and
        # similar launch failures, all of which mean "not usable".
        return False
    return True
20
+
21
+
22
def ensure_ffmpeg_in_path():
    """Make sure an ``ffmpeg`` command is runnable, using imageio-ffmpeg as a fallback.

    If ``ffmpeg`` already runs from PATH nothing is changed. Otherwise the
    binary reported by ``imageio_ffmpeg.get_ffmpeg_exe()`` is copied next to
    itself under the plain name ``ffmpeg`` (``ffmpeg.exe`` on Windows), marked
    executable, and its directory is prepended to this process's PATH.

    Raises:
        RuntimeError: If any step fails or ffmpeg still cannot be run afterwards.
    """
    if is_ffmpeg_in_path():
        logger.info("FFmpeg is already available in PATH.")
        return

    try:
        ffmpeg_path_original = imageio_ffmpeg.get_ffmpeg_exe()
        ffmpeg_dir = os.path.dirname(ffmpeg_path_original)
        binary_name = os.path.basename(ffmpeg_path_original)

        logger.info(f"imageio-ffmpeg reported path: {ffmpeg_path_original}")
        logger.info(f"Directory contents: {os.listdir(ffmpeg_dir)}")
        logger.info(f"Binary name: {binary_name}")

        # Windows only resolves the command "ffmpeg" to a file that carries an
        # executable extension, so the copy needs the ".exe" suffix there.
        expected_binary_name = "ffmpeg.exe" if os.name == "nt" else "ffmpeg"
        copied_path = os.path.join(ffmpeg_dir, expected_binary_name)

        if not os.path.exists(copied_path):
            logger.info(f"Copying {binary_name} to {expected_binary_name} in {ffmpeg_dir}.")
            shutil.copy2(ffmpeg_path_original, copied_path)
            st = os.stat(copied_path)
            os.chmod(copied_path, st.st_mode | stat.S_IEXEC)
        else:
            logger.info(f"'{copied_path}' already exists; skipping copy.")

        # Add directory to PATH. PATH may be unset entirely, in which case the
        # ffmpeg directory becomes the only entry.
        current_path = os.environ.get("PATH", "")
        if current_path:
            os.environ["PATH"] = ffmpeg_dir + os.pathsep + current_path
        else:
            os.environ["PATH"] = ffmpeg_dir
        logger.info(f"PATH updated to include: {ffmpeg_dir}")

        if is_ffmpeg_in_path():
            logger.info("FFmpeg is now accessible in PATH.")
        else:
            logger.warning("FFmpeg is still not found in PATH after attempting to add it.")
            raise RuntimeError("Failed to make FFmpeg accessible in PATH.")
    except Exception as e:
        logger.error(f"Failed to ensure FFmpeg is in PATH: {str(e)}")
        raise RuntimeError("Failed to ensure FFmpeg is in PATH.") from e
logging_config.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import sys # Import sys for console output
3
+
4
# Shared application logger; other modules import `logger` from here.
logger = logging.getLogger("transcription_logger")
logger.setLevel(logging.DEBUG)

# Formatter for the logs
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')

# Log handler that writes to the console (stdout)
console_handler = logging.StreamHandler(sys.stdout)
console_handler.setLevel(logging.DEBUG)
console_handler.setFormatter(formatter)

# logging.getLogger returns the same object for the same name, so attach the
# handler only once; otherwise re-executing this module (e.g. a reload) would
# make every message print multiple times.
if not logger.handlers:
    logger.addHandler(console_handler)

# Keep records out of the root logger so they are not emitted a second time.
logger.propagate = False
requirements.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ huggingface_hub==0.30.1
2
+ gradio==5.29.0
3
+ gradio[mcp]
4
+ pydantic==2.10.6
5
+ youtube-transcript-api==0.6.3
6
+ yt-dlp==2025.1.15
7
+ transformers==4.48.1
8
+ torch==2.2.2
9
+ imageio-ffmpeg==0.6.0
10
+ numpy==1.24.3
11
+ smolagents==1.4.1
tool_config.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "description": "\n A smolagent tool for transcribing audio and video from URLs into text. This tool utilizes Whisper for transcription \n and ffmpeg (via yt-dlp) for media conversion, enabling agents to process multimedia inputs into text by downloading \n and converting content from a given URL. It supports robust handling, including format conversion to WAV and dynamic \n device selection for optimal performance. It empowers agents to integrate audio and video transcription capabilities \n into workflows for enhanced data accessibility.",
3
+ "inputs": {
4
+ "file_path": {
5
+ "description": "Path to the audio or video file for transcription.",
6
+ "type": "string"
7
+ }
8
+ },
9
+ "name": "transcription_tool",
10
+ "output_type": "string",
11
+ "tool_class": "TranscriptionTool"
12
+ }
transcription.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from transformers import pipeline
3
+ from logging_config import logger
4
+
5
# Pipelines already built in this process, keyed by (model name, device index).
# Building one loads the model, so it is done once per key rather than once
# per transcription request.
_ASR_PIPELINES = {}


def _get_asr_pipeline(model_name: str, device_index: int):
    """Return the ASR pipeline for the given model/device, creating it on first use."""
    cache_key = (model_name, device_index)
    asr_pipeline = _ASR_PIPELINES.get(cache_key)
    if asr_pipeline is None:
        logger.info(f"Initialising Whisper ASR pipeline with model: {model_name}")
        asr_pipeline = pipeline(
            "automatic-speech-recognition",
            model=model_name,
            device=device_index,
            return_timestamps=True
        )
        _ASR_PIPELINES[cache_key] = asr_pipeline
        logger.info("Whisper ASR pipeline initialised.")
    return asr_pipeline


def run_whisper_transcription(wav_file_path: str, device: str):
    """Transcribe an audio file with the distil-whisper English model.

    This is a generator that yields exactly one string: the transcription text
    on success, or an "Error during transcription: ..." message on failure.
    Exceptions are reported through that string rather than raised.

    Args:
        wav_file_path: Path of the audio file handed to the ASR pipeline.
        device: "cuda" selects GPU 0; any other value selects the CPU.
    """
    try:
        model_name = "distil-whisper/distil-small.en"
        logger.info(f"Running pipeline on device: {device}")

        asr_pipeline = _get_asr_pipeline(model_name, 0 if device == "cuda" else -1)
        logger.info(f"Starting transcription for file: {wav_file_path}")

        # Perform transcription
        result = asr_pipeline(wav_file_path)
        transcription = result.get("text", "")
        logger.info("Transcription completed successfully.")

        yield transcription  # Yield only the transcription string
    except Exception as e:
        err_msg = f"Error during transcription: {str(e)}"
        # logger.exception records the traceback along with the message.
        logger.exception(err_msg)
        yield err_msg  # Yield only the error message string
transcription_tool.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from smolagents import Tool
2
+ import os
3
+ import tempfile
4
+ import shutil
5
+ import torch
6
+ import subprocess
7
+ from transcription import run_whisper_transcription
8
+ from logging_config import logger
9
+ from ffmpeg_setup import ensure_ffmpeg_in_path
10
+
11
+
12
class TranscriptTool(Tool):
    """smolagents tool that turns a local audio/video file into text.

    Non-WAV inputs are converted to 16 kHz mono WAV with ffmpeg, then the file
    is passed to ``run_whisper_transcription``.
    """

    name = "TranscriptTool"
    description = """
    A smolagent tool for transcribing audio and video files into text. This tool utilises Whisper for transcription
    and ffmpeg for media conversion, enabling agents to process multimedia inputs into text. The tool supports robust
    file handling, including format conversion to WAV and dynamic device selection for optimal performance.
    """
    inputs = {
        "file_path": {
            "type": "string",
            "description": "Path to the audio or video file for transcription."
        }
    }
    output_type = "string"

    def __init__(self, audio_directory=None):
        """Set up the tool.

        Args:
            audio_directory: Directory searched when a given file path does not
                exist; defaults to the current working directory.

        Raises:
            RuntimeError: Propagated from ``ensure_ffmpeg_in_path`` when ffmpeg
                cannot be made available.
        """
        super().__init__()
        ensure_ffmpeg_in_path()
        self.audio_directory = audio_directory or os.getcwd()

    def locate_audio_file(self, file_name):
        """Search ``self.audio_directory`` recursively for ``file_name``.

        Returns:
            The full path of the first match, or None if there is none.
        """
        for root, _, files in os.walk(self.audio_directory):
            if file_name in files:
                return os.path.join(root, file_name)
        return None

    def convert_audio_to_wav(self, input_file: str, output_file: str, ffmpeg_path: str) -> str:
        """Convert ``input_file`` to a 16 kHz mono WAV written to ``output_file``.

        Args:
            input_file: Path of the media file to convert.
            output_file: Path of the WAV file to create (overwritten if present).
            ffmpeg_path: Path of the ffmpeg executable to run.

        Returns:
            ``output_file``.

        Raises:
            RuntimeError: If ffmpeg exits with a non-zero status.
        """
        logger.info(f"Converting {input_file} to WAV format: {output_file}")
        cmd = [
            ffmpeg_path,
            "-y",  # Overwrite output files without asking
            "-i", input_file,
            "-ar", "16000",  # Set audio sampling rate to 16kHz
            "-ac", "1",  # Set number of audio channels to mono
            output_file
        ]
        try:
            subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            logger.info("Audio conversion to WAV completed successfully.")
            return output_file
        except subprocess.CalledProcessError as e:
            # ffmpeg output is not guaranteed to be valid UTF-8; replace bad
            # bytes so a decode failure cannot hide the real error.
            ffmpeg_error = (e.stderr or b"").decode(errors="replace")
            logger.error(f"ffmpeg error: {ffmpeg_error}")
            raise RuntimeError("Failed to convert audio to WAV.") from e

    def forward(self, file_path: str) -> str:
        """Transcribe the media file at ``file_path``.

        If the path does not exist, its base name is looked up under
        ``self.audio_directory``. The file is copied into a temporary
        directory, converted to WAV unless its name already ends in ".wav",
        and transcribed on CUDA when available, otherwise on the CPU.

        Args:
            file_path: Path to the audio or video file.

        Returns:
            The transcription text, or a string describing the error. This
            method does not raise; failures are reported in the return value.
        """
        try:
            # Locate the file if it does not exist
            logger.info(f"Attempting to transcribe file: {file_path}")
            if not os.path.exists(file_path):
                file_name = os.path.basename(file_path)
                file_path = self.locate_audio_file(file_name)
                if not file_path:
                    logger.error(f"File '{file_name}' not found in '{self.audio_directory}'.")
                    return f"Error: File '{file_name}' not found in '{self.audio_directory}'."

            with tempfile.TemporaryDirectory() as tmpdir:
                # Copy file to temp dir
                filename = os.path.basename(file_path)
                input_file_path = os.path.join(tmpdir, filename)
                shutil.copy(file_path, input_file_path)
                logger.info(f"Copied input file to temporary directory: {input_file_path}")

                # Check if already WAV, otherwise convert
                if input_file_path.lower().endswith(".wav"):
                    logger.info(f"File {filename} is already in WAV format. Skipping conversion.")
                    transcription_input_path = input_file_path
                else:
                    # Convert to wav
                    wav_file_path = os.path.join(tmpdir, "converted_audio.wav")
                    ffmpeg_path = shutil.which("ffmpeg")
                    if not ffmpeg_path:
                        logger.error("ffmpeg is not accessible in PATH.")
                        raise RuntimeError("ffmpeg is not accessible in PATH.")
                    self.convert_audio_to_wav(input_file_path, wav_file_path, ffmpeg_path)
                    transcription_input_path = wav_file_path

                device = "cuda" if torch.cuda.is_available() else "cpu"
                logger.info(f"Using device for transcription: {device}")

                # Transcribe audio. The generator yields a single string
                # (transcription or error); fall back to an empty string so
                # the declared str return type holds even if it yields nothing.
                transcription_generator = run_whisper_transcription(transcription_input_path, device)
                return next(transcription_generator, "")

        except Exception as e:
            # logger.exception keeps the traceback in the log.
            logger.exception(f"An unexpected error occurred in TranscriptTool: {str(e)}")
            return f"An error occurred: {str(e)}"