Spaces:
Running
Running
# tools/youtube_tool.py | |
from youtube_transcript_api import YouTubeTranscriptApi | |
from youtube_transcript_api._errors import TranscriptsDisabled, NoTranscriptFound | |
import re | |
def extract_video_id(url: str) -> str:
    """
    Extracts the 11-character video ID from a YouTube URL.

    Recognized URL shapes (searched anywhere inside ``url``):
        - ``youtube.com/watch?v=<id>``
        - ``youtu.be/<id>``
        - ``youtube.com/watch?...&v=<id>`` (``v`` after other query parameters)
        - ``youtube.com/embed/<id>``, ``/shorts/<id>``, ``/live/<id>``, ``/v/<id>``
          (the ``youtube-nocookie.com`` host is accepted for these paths too)

    Args:
        url (str): The full YouTube video URL.

    Returns:
        str: The extracted video ID.

    Raises:
        ValueError: If none of the recognized URL shapes is found in ``url``.
    """
    video_id = r"([a-zA-Z0-9_-]{11})"
    # The first two patterns are the historical ones and are tried first, so
    # any URL that was accepted before still yields exactly the same ID.
    patterns = [
        r"youtube\.com/watch\?v=" + video_id,
        r"youtu\.be/" + video_id,
        # ``v`` is not guaranteed to be the first query parameter.
        r"youtube\.com/watch\?[^#\s]*?&v=" + video_id,
        # Path-based forms used by embeds, Shorts and live streams.
        r"youtube(?:-nocookie)?\.com/(?:embed|shorts|live|v)/" + video_id,
    ]
    for pattern in patterns:
        match = re.search(pattern, url)
        if match:
            return match.group(1)
    raise ValueError("Invalid YouTube URL or unable to extract video ID.")
def get_youtube_transcript(url: str) -> str:
    """
    Returns the transcript of a YouTube video as one plain-text string.

    The video ID is taken from ``url`` with ``extract_video_id``, the
    transcript entries are fetched through
    ``YouTubeTranscriptApi.get_transcript``, and the ``"text"`` field of each
    entry is joined with single spaces. The result is stripped of surrounding
    whitespace and cut to at most 2000 characters.

    This function does not raise for failures inside the lookup: every
    exception is turned into a human-readable message that is returned in
    place of the transcript.

    Args:
        url (str): The YouTube video URL.

    Returns:
        str: Combined transcript text (at most 2000 characters) or an
        error message.
    """
    # Truncate to 2000 chars to prevent token overflow.
    char_limit = 2000
    try:
        segments = YouTubeTranscriptApi.get_transcript(extract_video_id(url))
        joined = " ".join(segment["text"] for segment in segments)
    except TranscriptsDisabled:
        return "This video has transcripts disabled."
    except NoTranscriptFound:
        return "No transcript was found for this video."
    except Exception as exc:
        # Catch-all on purpose: callers receive a message, never a traceback.
        return f"Transcript error: {exc!s}"
    return joined.strip()[:char_limit]