Spaces:
Running
Running
Bismay
commited on
Commit
·
83a4e82
0
Parent(s):
Initial commit
Browse files- .gitattributes +35 -0
- .gitignore +158 -0
- README.md +113 -0
- app.py +103 -0
- ffmpeg_setup.py +59 -0
- logging_config.py +17 -0
- requirements.txt +11 -0
- tool_config.json +12 -0
- transcription.py +29 -0
- transcription_tool.py +100 -0
.gitattributes
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
@@ -0,0 +1,158 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Byte-compiled / optimized / DLL files
|
2 |
+
__pycache__/
|
3 |
+
*.py[cod]
|
4 |
+
*$py.class
|
5 |
+
|
6 |
+
# C extensions
|
7 |
+
*.so
|
8 |
+
|
9 |
+
# Distribution / packaging
|
10 |
+
.Python
|
11 |
+
build/
|
12 |
+
develop-eggs/
|
13 |
+
dist/
|
14 |
+
downloads/
|
15 |
+
eggs/
|
16 |
+
.eggs/
|
17 |
+
lib/
|
18 |
+
lib64/
|
19 |
+
parts/
|
20 |
+
sdist/
|
21 |
+
var/
|
22 |
+
wheels/
|
23 |
+
pip-wheel-metadata/
|
24 |
+
share/python-wheels/
|
25 |
+
*.egg-info/
|
26 |
+
.installed.cfg
|
27 |
+
*.egg
|
28 |
+
MANIFEST
|
29 |
+
|
30 |
+
# PyInstaller
|
31 |
+
*.manifest
|
32 |
+
*.spec
|
33 |
+
|
34 |
+
# Installer logs
|
35 |
+
pip-log.txt
|
36 |
+
pip-delete-this-directory.txt
|
37 |
+
|
38 |
+
# Unit test / coverage reports
|
39 |
+
htmlcov/
|
40 |
+
.tox/
|
41 |
+
.nox/
|
42 |
+
.coverage
|
43 |
+
.coverage.*
|
44 |
+
.cache
|
45 |
+
nosetests.xml
|
46 |
+
coverage.xml
|
47 |
+
*.cover
|
48 |
+
*.py,cover
|
49 |
+
.hypothesis/
|
50 |
+
.pytest_cache/
|
51 |
+
|
52 |
+
# Translations
|
53 |
+
*.mo
|
54 |
+
*.pot
|
55 |
+
|
56 |
+
# Django stuff:
|
57 |
+
# *.log (Covered by generic *.log below)
|
58 |
+
local_settings.py
|
59 |
+
db.sqlite3
|
60 |
+
db.sqlite3-journal
|
61 |
+
|
62 |
+
# Flask stuff:
|
63 |
+
instance/
|
64 |
+
.webassets-cache
|
65 |
+
|
66 |
+
# Scrapy stuff:
|
67 |
+
.scrapy
|
68 |
+
|
69 |
+
# Sphinx documentation
|
70 |
+
docs/_build/
|
71 |
+
|
72 |
+
# PyBuilder
|
73 |
+
target/
|
74 |
+
|
75 |
+
# Jupyter Notebook
|
76 |
+
.ipynb_checkpoints
|
77 |
+
|
78 |
+
# IPython
|
79 |
+
profile_default/
|
80 |
+
ipython_config.py
|
81 |
+
|
82 |
+
# pyenv
|
83 |
+
.python-version
|
84 |
+
|
85 |
+
# PEP 582; used by PDM, Flit and potentially other packaging tools.
|
86 |
+
__pypackages__/
|
87 |
+
|
88 |
+
# Celery stuff
|
89 |
+
celerybeat-schedule
|
90 |
+
celerybeat.pid
|
91 |
+
|
92 |
+
# SageMath parsed files
|
93 |
+
*.sage.py
|
94 |
+
|
95 |
+
# Environments
|
96 |
+
.env
|
97 |
+
.venv
|
98 |
+
env/
|
99 |
+
venv/
|
100 |
+
ENV/
|
101 |
+
env.bak/
|
102 |
+
venv.bak/
|
103 |
+
|
104 |
+
# Spyder project settings
|
105 |
+
.spyderproject
|
106 |
+
.spyproject
|
107 |
+
|
108 |
+
# Rope project settings
|
109 |
+
.ropeproject
|
110 |
+
|
111 |
+
# mkdocs documentation
|
112 |
+
/site
|
113 |
+
|
114 |
+
# mypy
|
115 |
+
.mypy_cache/
|
116 |
+
.dmypy.json
|
117 |
+
dmypy.json
|
118 |
+
|
119 |
+
# Pyre type checker
|
120 |
+
.pyre/
|
121 |
+
|
122 |
+
# Hugging Face cache (often large)
|
123 |
+
.cache/huggingface/
|
124 |
+
|
125 |
+
# Gradio
|
126 |
+
.gradio/
|
127 |
+
|
128 |
+
# General cache directory (covers some cases like pytest)
|
129 |
+
.cache/
|
130 |
+
|
131 |
+
# IDE / Editor specific files
|
132 |
+
.vscode/
|
133 |
+
.idea/
|
134 |
+
*.swp
|
135 |
+
*~
|
136 |
+
*.sublime-project
|
137 |
+
*.sublime-workspace
|
138 |
+
|
139 |
+
# OS generated files
|
140 |
+
.DS_Store
|
141 |
+
Thumbs.db
|
142 |
+
|
143 |
+
# Project-specific
|
144 |
+
temp_downloads/
|
145 |
+
test.wav
|
146 |
+
# General log files
*.log
|
147 |
+
|
148 |
+
# Large media files potentially downloaded by yt-dlp (comment out any type you need to track)
|
149 |
+
*.mp4
|
150 |
+
*.mkv
|
151 |
+
*.webm
|
152 |
+
*.mp3
|
153 |
+
*.m4a
|
154 |
+
*.flv
|
155 |
+
*.aac
|
156 |
+
*.ogg
|
157 |
+
*.opus
|
158 |
+
# Ignoring all .wav files, including test.wav
*.wav
|
README.md
ADDED
@@ -0,0 +1,113 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
title: TranscriptTool - Gradio MCP Server for Transcription
|
3 |
+
emoji: 💬
|
4 |
+
colorFrom: green
|
5 |
+
colorTo: green
|
6 |
+
sdk: gradio
|
7 |
+
sdk_version: 5.29.0
|
8 |
+
app_file: app.py
|
9 |
+
pinned: false
|
10 |
+
license: apache-2.0
|
11 |
+
short_description: Gradio-based MCP server to transcribe audio & video from URLs
|
12 |
+
---
|
13 |
+
|
14 |
+
# TranscriptTool: A Gradio MCP Server for Audio/Video Transcription from URLs
|
15 |
+
|
16 |
+
## Overview
|
17 |
+
|
18 |
+
`TranscriptTool` is a Gradio application configured to function as an MCP (Model Context Protocol) server. It is designed to transcribe audio and video from URLs into text. Using OpenAI's Whisper and `ffmpeg` (via `yt-dlp`), this server enables MCP clients (like Cline) to process multimedia inputs efficiently by downloading and converting content from a given URL. It supports robust file handling, including format conversion to WAV and dynamic device selection (CPU or GPU).
|
19 |
+
|
20 |
+
The repository contains the following main components:
|
21 |
+
- **`app.py`**: The main Gradio application file that runs the MCP server.
|
22 |
+
- **`transcription_tool.py`**: The core logic for handling file conversion and calling the transcription function.
|
23 |
+
- **`transcription.py`**: Contains the implementation for Whisper transcription using the `transformers` library.
|
24 |
+
- **`tool_config.json`**: Configuration details for the `TranscriptTool`.
|
25 |
+
- **`requirements.txt`**: Lists the necessary Python dependencies.
|
26 |
+
- **`ffmpeg_setup.py`**: Script to ensure ffmpeg is available.
|
27 |
+
- **`logging_config.py`**: Configuration for logging.
|
28 |
+
|
29 |
+
---
|
30 |
+
|
31 |
+
## Installation
|
32 |
+
|
33 |
+
1. Clone this repository:
|
34 |
+
```bash
|
35 |
+
git clone https://huggingface.co/spaces/bismay/TranscriptTool
|
36 |
+
cd TranscriptTool
|
37 |
+
```
|
38 |
+
2. Install dependencies:
|
39 |
+
```bash
|
40 |
+
pip install -r requirements.txt
|
41 |
+
```
|
42 |
+
This will install the necessary libraries, including `gradio[mcp]`, `yt-dlp`, `transformers`, and `torch`.
|
43 |
+
|
44 |
+
---
|
45 |
+
## Usage
|
46 |
+
|
47 |
+
### Running the Gradio App / MCP Server
|
48 |
+
|
49 |
+
To run the Gradio application which also starts the MCP server, execute:
|
50 |
+
```bash
|
51 |
+
python app.py
|
52 |
+
```
|
53 |
+
|
54 |
+
This will launch a local Gradio web interface and start the MCP server. The server will expose the `transcribe_url` function as an MCP tool.
|
55 |
+
|
56 |
+
### Using as an MCP Server
|
57 |
+
|
58 |
+
When you run `python app.py`, the application starts an MCP server accessible to MCP clients.
|
59 |
+
|
60 |
+
**Exposed Tool:**
|
61 |
+
|
62 |
+
The server exposes one tool: `transcribe_url`.
|
63 |
+
|
64 |
+
* **Description:** Transcribes audio or video from a given URL. Downloads the media from the URL, converts it to WAV format, and then uses the TranscriptTool to perform the transcription in English.
|
65 |
+
* **Input:**
|
66 |
+
* `url` (string): The URL of the audio or video file.
|
67 |
+
* **Output:** (string): The transcription of the audio/video in English, or an error message if download or transcription fails.
|
68 |
+
|
69 |
+
**Connecting an MCP Client:**
|
70 |
+
|
71 |
+
The MCP server will typically be accessible at `http://127.0.0.1:7860/gradio_api/mcp/sse` when run locally. You can find the exact URL printed in your console when the Gradio app launches.
|
72 |
+
|
73 |
+
To connect an MCP client (like Cline) to this server, you need to add a configuration entry in your client's settings. The exact format depends on your client, but it generally involves specifying a name for the server and its URL.
|
74 |
+
|
75 |
+
Example configuration for a client (like Cline) that supports SSE:
|
76 |
+
|
77 |
+
```json
|
78 |
+
{
|
79 |
+
"mcpServers": {
|
80 |
+
"localTranscript": {
|
81 |
+
"url": "http://127.0.0.1:7860/gradio_api/mcp/sse"
|
82 |
+
}
|
83 |
+
}
|
84 |
+
}
|
85 |
+
```
|
86 |
+
|
87 |
+
*Note: If your MCP client does not directly support SSE-based servers (like some versions of Claude Desktop), you may need to use a tool like `mcp-remote` as an intermediary. Refer to your client's documentation for details.*
|
88 |
+
|
89 |
+
### Connecting to the Hosted Server on Hugging Face Spaces
|
90 |
+
|
91 |
+
This application is also hosted on Hugging Face Spaces, providing a publicly accessible MCP server. You can connect to this hosted server using the following URL:
|
92 |
+
|
93 |
+
`https://bismay-transcripttool.hf.space/gradio_api/mcp/sse`
|
94 |
+
|
95 |
+
To connect your MCP client (like Cline) to this hosted server, add a configuration entry similar to this:
|
96 |
+
|
97 |
+
```json
|
98 |
+
{
|
99 |
+
"mcpServers": {
|
100 |
+
"remoteTranscript": {
|
101 |
+
"url": "https://bismay-transcripttool.hf.space/gradio_api/mcp/sse"
|
102 |
+
}
|
103 |
+
}
|
104 |
+
}
|
105 |
+
```
|
106 |
+
|
107 |
+
---
|
108 |
+
## License
|
109 |
+
This project is licensed under the Apache-2.0 License. See the LICENSE file for more details.
|
110 |
+
|
111 |
+
---
|
112 |
+
## Contributing
|
113 |
+
Contributions are welcome! Please open an issue or submit a pull request for any improvements or bug fixes.
|
app.py
ADDED
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import subprocess
|
3 |
+
import tempfile
|
4 |
+
import os
|
5 |
+
import shutil
|
6 |
+
from logging_config import logger
|
7 |
+
from transcription_tool import TranscriptTool # Assuming TranscriptionTool is in `transcription_tool.py`
|
8 |
+
|
9 |
+
# smolagent transcription tool
|
10 |
+
transcript_tool = TranscriptTool()
|
11 |
+
|
12 |
+
|
13 |
+
def transcribe_url(url):
    """
    Transcribes audio or video from a given URL.

    Downloads the media from the URL with yt-dlp, converts it to WAV format,
    and then uses the TranscriptTool to perform the transcription in English.

    Args:
        url (str): The URL of the audio or video file.

    Returns:
        str: The transcription of the audio/video in English, or an error message if
        the URL is missing/blank or if download or transcription fails.
    """
    # Reject missing or blank input before touching the filesystem or the network.
    if not isinstance(url, str) or not url.strip():
        return "Error: Please provide a URL."
    url = url.strip()

    try:
        logger.info(f"Attempting to download audio from URL: {url}")
        temp_download_root = "./temp_downloads"
        os.makedirs(temp_download_root, exist_ok=True)

        # Every request gets its own scratch directory. A single shared
        # "downloaded_audio.*" path would let concurrent requests delete or
        # transcribe each other's files. The directory (and the download in it)
        # is removed automatically when the block exits, even on failure.
        with tempfile.TemporaryDirectory(dir=temp_download_root) as download_dir:
            output_template = os.path.join(download_dir, "downloaded_audio.%(ext)s")

            # -f bestaudio/best: prefer an audio-only stream, fall back to the best
            #                    combined stream when the source has no audio-only format
            # -x: extracts audio
            # --audio-format wav: converts to wav format
            # -o: specifies output template
            # --: end of options, so a URL starting with "-" cannot be parsed as a flag
            cmd = [
                "yt-dlp",
                "-f", "bestaudio/best",
                "-x",
                "--audio-format", "wav",
                "-o", output_template,
                "--",
                url,
            ]
            process = subprocess.run(cmd, check=True, capture_output=True, text=True)
            logger.info("yt-dlp stdout:\n" + process.stdout)
            logger.info("yt-dlp stderr:\n" + process.stderr)

            # Find the downloaded file; prefer the converted .wav over any leftover
            # intermediate file that shares the "downloaded_audio." prefix.
            downloaded_files = sorted(
                f for f in os.listdir(download_dir) if f.startswith("downloaded_audio.")
            )
            if not downloaded_files:
                raise RuntimeError("yt-dlp failed to download or convert audio.")
            wav_files = [f for f in downloaded_files if f.lower().endswith(".wav")]
            local_file_path = os.path.join(download_dir, (wav_files or downloaded_files)[0])
            logger.info(f"Downloaded audio to temporary file: {local_file_path}")

            # Transcribe while the scratch directory still exists.
            return transcript_tool.forward(local_file_path)

    except subprocess.CalledProcessError as e:
        error_message = f"yt-dlp error: {e.stderr}"
        logger.error(error_message)
        return f"An error occurred during download: {error_message}"
    except Exception as e:
        # Top-level boundary for the UI/MCP tool: report the failure as text.
        error_message = f"An unexpected error occurred: {str(e)}"
        logger.error(error_message)
        return error_message
|
83 |
+
|
84 |
+
|
85 |
+
# Gradio front end: a URL box and a button wired to transcribe_url, with the
# result shown in a read-out textbox. Components render in creation order.
with gr.Blocks() as app:
    gr.Markdown("# TranscriptTool: Transcribe Audio/Video")
    gr.Markdown(
        "TranscriptTool is a smolagent tool used to transcribe audio and video files into text. "
        "This tool allows agents to process multimedia inputs efficiently. "
        "Can be used within a smolagent via the Hugging Face API."
    )

    url_input = gr.Textbox(
        label="Enter Audio/Video URL",
        placeholder="e.g., https://www.youtube.com/watch?v=dQw4w9WgXcQ",
    )
    transcribe_button = gr.Button("Transcribe")

    gr.Markdown("Provide a URL to transcribe audio or video.")

    transcription_output = gr.Textbox(label="Transcription", lines=10)

    # Clicking the button sends the URL text to transcribe_url and displays
    # whatever string it returns (transcription or error message).
    transcribe_button.click(
        fn=transcribe_url,
        inputs=[url_input],
        outputs=[transcription_output],
    )

# Only start the server when executed as a script; mcp_server=True is passed
# so the launch also exposes the app's function over MCP.
if __name__ == "__main__":
    app.launch(mcp_server=True)
|
ffmpeg_setup.py
ADDED
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import stat
|
3 |
+
import shutil
|
4 |
+
import subprocess
|
5 |
+
import imageio_ffmpeg
|
6 |
+
from logging_config import logger
|
7 |
+
|
8 |
+
|
9 |
+
def is_ffmpeg_in_path() -> bool:
    """Return True when running ``ffmpeg -version`` succeeds, False otherwise.

    The probe's output is captured and discarded; only the outcome matters.
    """
    probe_cmd = ["ffmpeg", "-version"]
    try:
        subprocess.run(probe_cmd, capture_output=True, check=True)
    except FileNotFoundError:
        # No executable called "ffmpeg" could be found.
        return False
    except subprocess.CalledProcessError:
        # An executable was found but exited with a non-zero status.
        return False
    return True
|
20 |
+
|
21 |
+
|
22 |
+
def ensure_ffmpeg_in_path():
    """Make an ``ffmpeg`` command available on PATH, using imageio-ffmpeg if needed.

    If ``ffmpeg`` already runs, nothing is changed. Otherwise the binary reported
    by ``imageio_ffmpeg.get_ffmpeg_exe()`` is copied next to itself under the
    plain name ``ffmpeg`` (``ffmpeg.exe`` on Windows) and its directory is
    prepended to this process's PATH.

    Raises:
        RuntimeError: If ffmpeg still cannot be run after the attempt, or if any
            step of the setup fails (the original exception is chained).
    """
    if is_ffmpeg_in_path():
        logger.info("FFmpeg is already available in PATH.")
        return

    try:
        ffmpeg_path_original = imageio_ffmpeg.get_ffmpeg_exe()
        ffmpeg_dir = os.path.dirname(ffmpeg_path_original)
        binary_name = os.path.basename(ffmpeg_path_original)

        logger.info(f"imageio-ffmpeg reported path: {ffmpeg_path_original}")
        logger.info(f"Directory contents: {os.listdir(ffmpeg_dir)}")
        logger.info(f"Binary name: {binary_name}")

        # The bundled binary may not be named "ffmpeg", so provide a copy under
        # the name tools look up. Windows only resolves a bare command name to a
        # file that carries an executable extension.
        expected_binary_name = "ffmpeg.exe" if os.name == "nt" else "ffmpeg"
        copied_path = os.path.join(ffmpeg_dir, expected_binary_name)

        if not os.path.exists(copied_path):
            logger.info(f"Copying {binary_name} to {expected_binary_name} in {ffmpeg_dir}.")
            shutil.copy2(ffmpeg_path_original, copied_path)
            st = os.stat(copied_path)
            os.chmod(copied_path, st.st_mode | stat.S_IEXEC)
        else:
            logger.info(f"'{copied_path}' already exists; skipping copy.")

        # Add directory to PATH. PATH may be unset in minimal environments, so
        # do not index os.environ directly.
        current_path = os.environ.get("PATH", "")
        os.environ["PATH"] = (
            ffmpeg_dir + os.pathsep + current_path if current_path else ffmpeg_dir
        )
        logger.info(f"PATH updated to include: {ffmpeg_dir}")

        if not is_ffmpeg_in_path():
            logger.warning("FFmpeg is still not found in PATH after attempting to add it.")
            raise RuntimeError("Failed to make FFmpeg accessible in PATH.")
        logger.info("FFmpeg is now accessible in PATH.")
    except Exception as e:
        # Single failure type for callers; the root cause stays attached via "from e".
        logger.error(f"Failed to ensure FFmpeg is in PATH: {str(e)}")
        raise RuntimeError("Failed to ensure FFmpeg is in PATH.") from e
|
logging_config.py
ADDED
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import logging
import sys  # Import sys for console output

# Shared application logger; other modules do `from logging_config import logger`.
logger = logging.getLogger("transcription_logger")
logger.setLevel(logging.DEBUG)

# Formatter for the logs
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')

# Log handler that writes to the console (stdout)
console_handler = logging.StreamHandler(sys.stdout)
console_handler.setLevel(logging.DEBUG)
console_handler.setFormatter(formatter)

# logging.getLogger returns the same object for the same name, so if this module
# is executed again (importlib.reload, hot reload) an unconditional addHandler
# would stack a second handler and print every message twice.
if not logger.handlers:
    logger.addHandler(console_handler)

# Keep records out of the root logger so they are not emitted a second time there.
logger.propagate = False
|
requirements.txt
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
huggingface_hub==0.30.1
|
2 |
+
gradio==5.29.0
|
3 |
+
gradio[mcp]
|
4 |
+
pydantic==2.10.6
|
5 |
+
youtube-transcript-api==0.6.3
|
6 |
+
yt-dlp==2025.1.15
|
7 |
+
transformers==4.48.1
|
8 |
+
torch==2.2.2
|
9 |
+
imageio-ffmpeg==0.6.0
|
10 |
+
numpy==1.24.3
|
11 |
+
smolagents==1.4.1
|
tool_config.json
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"description": "\n A smolagent tool for transcribing audio and video from URLs into text. This tool utilizes Whisper for transcription \n and ffmpeg (via yt-dlp) for media conversion, enabling agents to process multimedia inputs into text by downloading \n and converting content from a given URL. It supports robust handling, including format conversion to WAV and dynamic \n device selection for optimal performance. It empowers agents to integrate audio and video transcription capabilities \n into workflows for enhanced data accessibility.",
|
3 |
+
"inputs": {
|
4 |
+
"file_path": {
|
5 |
+
"description": "Path to the audio or video file for transcription.",
|
6 |
+
"type": "string"
|
7 |
+
}
|
8 |
+
},
|
9 |
+
"name": "transcription_tool",
|
10 |
+
"output_type": "string",
|
11 |
+
"tool_class": "TranscriptionTool"
|
12 |
+
}
|
transcription.py
ADDED
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
from transformers import pipeline
|
3 |
+
from logging_config import logger
|
4 |
+
|
5 |
+
# ASR pipelines already built in this process, keyed by the device string.
# Building a pipeline loads the model, so it is done once per device rather than
# once per transcription request.
_ASR_MODEL_NAME = "distil-whisper/distil-small.en"
_ASR_PIPELINES = {}


def _get_asr_pipeline(device: str):
    """Return the cached ASR pipeline for *device*, building it on first use."""
    asr_pipeline = _ASR_PIPELINES.get(device)
    if asr_pipeline is None:
        logger.info(f"Initialising Whisper ASR pipeline with model: {_ASR_MODEL_NAME}")
        asr_pipeline = pipeline(
            "automatic-speech-recognition",
            model=_ASR_MODEL_NAME,
            # Device index 0 for "cuda"; -1 (CPU) for anything else.
            device=0 if device == "cuda" else -1,
            # NOTE(review): presumably needed for inputs longer than one Whisper
            # window; confirm against the transformers pipeline docs.
            return_timestamps=True,
        )
        _ASR_PIPELINES[device] = asr_pipeline
        logger.info("Whisper ASR pipeline initialised.")
    return asr_pipeline


def run_whisper_transcription(wav_file_path: str, device: str):
    """Transcribe an audio file and yield exactly one string.

    This is a generator: it yields the transcription text on success, or a
    message starting with "Error during transcription:" if anything fails.

    Args:
        wav_file_path: Path of the audio file handed to the ASR pipeline.
        device: "cuda" to run on GPU 0; any other value runs on CPU.

    Yields:
        str: The transcription, or an error message.
    """
    try:
        logger.info(f"Running pipeline on device: {device}")
        asr_pipeline = _get_asr_pipeline(device)
        logger.info(f"Starting transcription for file: {wav_file_path}")

        # Perform transcription
        result = asr_pipeline(wav_file_path)
        transcription = result.get("text", "")
        logger.info("Transcription completed successfully.")
    except Exception as e:
        err_msg = f"Error during transcription: {str(e)}"
        logger.error(err_msg)
        yield err_msg  # Yield only the error message string
        return

    # Yield outside the try so an exception raised by the consumer at this point
    # is not mistaken for a transcription failure.
    yield transcription  # Yield only the transcription string
|
transcription_tool.py
ADDED
@@ -0,0 +1,100 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from smolagents import Tool
|
2 |
+
import os
|
3 |
+
import tempfile
|
4 |
+
import shutil
|
5 |
+
import torch
|
6 |
+
import subprocess
|
7 |
+
from transcription import run_whisper_transcription
|
8 |
+
from logging_config import logger
|
9 |
+
from ffmpeg_setup import ensure_ffmpeg_in_path
|
10 |
+
|
11 |
+
|
12 |
+
# smolagents tool that takes a path to an audio/video file, converts it to
# 16 kHz mono WAV with ffmpeg unless it is already a .wav file, and returns the
# Whisper transcription (or an error message) as a string.
class TranscriptTool(Tool):
    name = "TranscriptTool"
    description = """
    A smolagent tool for transcribing audio and video files into text. This tool utilises Whisper for transcription
    and ffmpeg for media conversion, enabling agents to process multimedia inputs into text. The tool supports robust
    file handling, including format conversion to WAV and dynamic device selection for optimal performance.
    """
    inputs = {
        "file_path": {
            "type": "string",
            "description": "Path to the audio or video file for transcription."
        }
    }
    output_type = "string"

    def __init__(self, audio_directory=None):
        """Set up the tool.

        Args:
            audio_directory: Directory searched when a given file path does not
                exist; defaults to the current working directory.

        Raises:
            RuntimeError: Propagated from ensure_ffmpeg_in_path() if ffmpeg
                cannot be made available.
        """
        super().__init__()
        ensure_ffmpeg_in_path()
        self.audio_directory = audio_directory or os.getcwd()

    def locate_audio_file(self, file_name):
        """Search audio_directory recursively for file_name.

        Returns:
            The path of the first match found by os.walk, or None if there is none.
        """
        for root, _, files in os.walk(self.audio_directory):
            if file_name in files:
                return os.path.join(root, file_name)
        return None

    def convert_audio_to_wav(self, input_file: str, output_file: str, ffmpeg_path: str) -> str:
        """Convert input_file to a 16 kHz mono WAV at output_file using ffmpeg.

        Returns:
            output_file, once the conversion has succeeded.

        Raises:
            RuntimeError: If ffmpeg exits with a non-zero status.
        """
        logger.info(f"Converting {input_file} to WAV format: {output_file}")
        cmd = [
            ffmpeg_path,
            "-y",                # Overwrite output files without asking
            "-i", input_file,
            "-ar", "16000",      # Set audio sampling rate to 16kHz
            "-ac", "1",          # Set number of audio channels to mono
            output_file
        ]
        try:
            subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            logger.info("Audio conversion to WAV completed successfully.")
            return output_file
        except subprocess.CalledProcessError as e:
            # ffmpeg's stderr can echo file metadata that is not valid UTF-8; a
            # strict decode would raise here and hide the real conversion error.
            ffmpeg_error = e.stderr.decode(errors="replace")
            logger.error(f"ffmpeg error: {ffmpeg_error}")
            raise RuntimeError("Failed to convert audio to WAV.") from e

    def forward(self, file_path: str) -> str:
        """Transcribe the media file at file_path.

        If file_path does not exist, its base name is searched for under
        audio_directory. The file is copied into a temporary directory,
        converted to WAV unless its name already ends in ".wav", and passed to
        run_whisper_transcription.

        Returns:
            The transcription text, or a message describing the error. This
            method does not raise; failures are reported in the returned string.
        """
        try:
            # Locate the file if it does not exist
            logger.info(f"Attempting to transcribe file: {file_path}")
            if not os.path.exists(file_path):
                file_name = os.path.basename(file_path)
                file_path = self.locate_audio_file(file_name)
                if not file_path:
                    logger.error(f"File '{file_name}' not found in '{self.audio_directory}'.")
                    return f"Error: File '{file_name}' not found in '{self.audio_directory}'."

            with tempfile.TemporaryDirectory() as tmpdir:
                # Copy file to temp dir
                filename = os.path.basename(file_path)
                input_file_path = os.path.join(tmpdir, filename)
                shutil.copy(file_path, input_file_path)
                logger.info(f"Copied input file to temporary directory: {input_file_path}")

                # Check if already WAV, otherwise convert
                if input_file_path.lower().endswith(".wav"):
                    logger.info(f"File {filename} is already in WAV format. Skipping conversion.")
                    transcription_input_path = input_file_path
                else:
                    # Convert to wav
                    wav_file_path = os.path.join(tmpdir, "converted_audio.wav")
                    ffmpeg_path = shutil.which("ffmpeg")
                    if not ffmpeg_path:
                        logger.error("ffmpeg is not accessible in PATH.")
                        raise RuntimeError("ffmpeg is not accessible in PATH.")
                    self.convert_audio_to_wav(input_file_path, wav_file_path, ffmpeg_path)
                    transcription_input_path = wav_file_path

                device = "cuda" if torch.cuda.is_available() else "cpu"
                logger.info(f"Using device for transcription: {device}")

                # Transcribe audio. The generator yields a single string
                # (transcription or error). It must be consumed inside the
                # `with` block, while the temporary files still exist. The ""
                # default keeps the declared str return type even if the
                # generator were to yield nothing.
                transcription_generator = run_whisper_transcription(transcription_input_path, device)
                return next(transcription_generator, "")

        except Exception as e:
            logger.error(f"An unexpected error occurred in TranscriptTool: {str(e)}")
            return f"An error occurred: {str(e)}"
|