from smolagents.tools import Tool
from typing import Optional
import os
import time
import requests
import io
from PIL import Image
from pytubefix import YouTube
import docx
from docx.shared import Inches
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT, WD_BREAK
import google.generativeai as genai
from dotenv import load_dotenv
# Load environment variables
load_dotenv()
class TranscriptSummarizer(Tool):
description = "Summarizes a transcript and generates blog content using Google's Gemini model for summarization and Hugging Face API for image generation."
name = "transcript_summarizer"
inputs = {
'transcript': {'type': 'string', 'description': 'The transcript to summarize.'},
'language': {'type': 'string', 'description': 'The language of the transcript.', 'nullable': True}
}
output_type = "string"
    def __init__(self, *args, hf_api_key: Optional[str] = None, **kwargs):
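        """Configure the Gemini client (from GEMINI_API_KEY in the environment) and the Hugging Face image-generation endpoint."""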
super().__init__(*args, **kwargs)
# Get Gemini API key from environment variables
gemini_api_key = os.getenv("GEMINI_API_KEY")
if gemini_api_key:
# Configure the Gemini API
genai.configure(api_key=gemini_api_key)
# Set up the model
self.gemini_model = genai.GenerativeModel('gemini-2.0-flash')
else:
self.gemini_model = None
# Set up Hugging Face for image generation
self.api_url = "https://api-inference.huggingface.co/models/ZB-Tech/Text-to-Image"
self.hf_api_key = hf_api_key
self.headers = {"Authorization": f"Bearer {self.hf_api_key}"}
def query_image_api(self, payload):
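        """POST the prompt payload to the Hugging Face Inference API and return the raw response body (image bytes on success)."""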
        # Use an explicit timeout so a slow or stalled API call cannot hang the tool
        response = requests.post(self.api_url, headers=self.headers, json=payload, timeout=60)
return response.content
def summarize_with_gemini(self, text, language='en', max_tokens=1000):
"""Use Gemini to summarize text in the specified language"""
# Map language codes to full language names for better prompting
language_map = {
'en': 'English',
'hi': 'Hindi',
'es': 'Spanish',
'fr': 'French',
'de': 'German',
'it': 'Italian',
'ja': 'Japanese',
'ko': 'Korean',
'pt': 'Portuguese',
'ru': 'Russian',
'zh': 'Chinese',
'ar': 'Arabic',
'bn': 'Bengali',
'ta': 'Tamil',
'te': 'Telugu',
'mr': 'Marathi',
'gu': 'Gujarati',
'kn': 'Kannada',
'ml': 'Malayalam',
'pa': 'Punjabi',
'ur': 'Urdu'
# Add more languages as needed
}
language_name = language_map.get(language, language)
prompt = f"""
Please summarize the following transcript in a concise but comprehensive way.
Focus on the main points and key information.
IMPORTANT: The transcript is in {language_name}. Please provide the summary in the SAME LANGUAGE ({language_name}).
Do not translate to any other language. Keep the summary in the original language of the transcript.
Transcript:
{text}
"""
generation_config = {
"temperature": 0.4,
"top_p": 0.95,
"top_k": 40,
"max_output_tokens": max_tokens,
}
response = self.gemini_model.generate_content(
prompt,
generation_config=generation_config
)
return response.text
def forward(self, transcript: str, language: str = 'en') -> str:
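        """Summarize the transcript with Gemini and attach a generated image.

        Returns the summary text, optionally followed by "Image URL: <path>"
        when image generation succeeds.
        """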
try:
if not self.hf_api_key:
return "Hugging Face API key is required for image generation. Please provide it in the input field."
if not self.gemini_model:
return "Gemini API key is required for summarization. Please add it to your .env file."
transcript_length = len(transcript)
# Check if transcript is too short
if transcript_length < 100:
return "Transcript is too short to summarize."
            # For longer transcripts, split into chunks so each prompt stays a manageable size
            if transcript_length > 30000:  # conservative character threshold before chunking
chunk_size = 25000
transcript_chunks = [transcript[i:i+chunk_size] for i in range(0, len(transcript), chunk_size)]
# Summarize each chunk
chunk_summaries = []
for chunk in transcript_chunks:
chunk_summary = self.summarize_with_gemini(chunk, language=language, max_tokens=1000)
chunk_summaries.append(chunk_summary)
# Combine chunk summaries and create a final summary
combined_summary = "\n\n".join(chunk_summaries)
if len(combined_summary) > 25000:
full_summary = self.summarize_with_gemini(combined_summary, language=language, max_tokens=2000)
else:
full_summary = combined_summary
else:
# For shorter transcripts, summarize directly
full_summary = self.summarize_with_gemini(transcript, language=language, max_tokens=2000)
# Generate image based on summary
try:
                key_entities = full_summary.split()[:15]  # Use the first 15 words of the summary to seed the image prompt
image_prompt = f"Generate an image related to: {' '.join(key_entities)}, cartoon style"
image_bytes = self.query_image_api({"inputs": image_prompt})
# Check if the response is valid
if not image_bytes or len(image_bytes) < 100:
print("Warning: Received invalid or empty image response")
return full_summary # Return just the summary without image
try:
# Try to open the image
image = Image.open(io.BytesIO(image_bytes))
# Save the image
image_folder = "Image"
if not os.path.exists(image_folder):
os.makedirs(image_folder)
image_url = os.path.join(image_folder, f"image_{int(time.time())}.jpg") # Use timestamp for unique filename
image.save(image_url)
return f"{full_summary}\n\nImage URL: {image_url}" # Return the file path with summary
except Exception as img_error:
print(f"Error processing image: {str(img_error)}")
# Return just the summary if image processing fails
return full_summary
except Exception as img_gen_error:
print(f"Error generating image: {str(img_gen_error)}")
# Return just the summary if image generation fails
return full_summary
except Exception as e:
return f"An unexpected error occurred: {str(e)}"
class YouTubeTranscriptExtractor(Tool):
description = "Extracts the transcript from a YouTube video."
name = "youtube_transcript_extractor"
inputs = {'video_url': {'type': 'string', 'description': 'The URL of the YouTube video.'}}
output_type = "string" # Keep as string for compatibility with smolagents
def forward(self, video_url: str) -> str:
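        """Fetch captions for the video and return them as
        "LANGUAGE:<code>||<cleaned transcript>"; error messages use the same
        prefix so callers can always split on "||".
        """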
try:
# Create a YouTube object
yt = YouTube(video_url)
lang = 'en' # Default language
# Get the video transcript
            try:
                if 'en' in yt.captions:
                    caption = yt.captions['en']
                else:
                    # Fall back to the first available caption track, if any
                    available_captions = yt.captions.all()
                    if not available_captions:
                        return f"LANGUAGE:{lang}||No transcript available for this video."
                    caption = available_captions[0]
                transcript = caption.generate_srt_captions()
                lang = caption.code
            except Exception as e:
                return f"LANGUAGE:{lang}||An unexpected error occurred while accessing captions: {str(e)}"
# Clean up the transcript by removing timestamps and line numbers
cleaned_transcript = ""
for line in transcript.splitlines():
if not line.strip().isdigit() and "-->" not in line:
cleaned_transcript += line + "\n"
print(f"Transcript language detected: {lang}")
print("Transcript sample: ", cleaned_transcript[:200] + "..." if len(cleaned_transcript) > 200 else cleaned_transcript)
# Return both the transcript and the language as a formatted string
# Format: "LANGUAGE:lang||transcript_text"
return f"LANGUAGE:{lang}||{cleaned_transcript}"
except Exception as e:
return f"LANGUAGE:en||An unexpected error occurred: {str(e)}"
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
class TranscriptToDocx(Tool):
description = "Creates or updates a DOCX file with YouTube transcript and summary."
name = "transcript_to_docx"
inputs = {
'transcript': {'type': 'string', 'description': 'The transcript to include in the document.'},
'summary': {'type': 'string', 'description': 'The summary to include in the document.'},
'video_title': {'type': 'string', 'description': 'The title of the YouTube video.'},
'image_path': {'type': 'string', 'description': 'Path to the image to include in the document.', 'nullable': True},
'existing_docx_path': {'type': 'string', 'description': 'Path to an existing DOCX file to update.', 'nullable': True}
}
output_type = "string"
def __init__(self, *args, **kwargs):
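        """Ensure the output folder ("Documents") exists before any document is saved."""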
super().__init__(*args, **kwargs)
self.docx_folder = "Documents"
if not os.path.exists(self.docx_folder):
os.makedirs(self.docx_folder)
def forward(self, transcript: str, summary: str, video_title: str, image_path: Optional[str] = None, existing_docx_path: Optional[str] = None) -> str:
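        """Build (or extend) a DOCX containing the summary, optional image, and
        full transcript, and return the saved file path (or an error message
        string on failure).
        """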
try:
# Determine if we're creating a new document or updating an existing one
if existing_docx_path and os.path.exists(existing_docx_path):
doc = docx.Document(existing_docx_path)
                # Add a page break before appending the new content
                doc.add_paragraph().add_run().add_break(WD_BREAK.PAGE)
else:
doc = docx.Document()
# Set document properties
doc.core_properties.title = f"YouTube Transcript: {video_title}"
doc.core_properties.author = "YouTube Transcript Tool"
# Add title
title = doc.add_heading(video_title, level=1)
title.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
# Add summary section
doc.add_heading("Summary", level=2)
            doc.add_paragraph(summary)
# Add image if provided
if image_path and os.path.exists(image_path):
try:
doc.add_picture(image_path, width=Inches(6))
# Add caption for the image
caption = doc.add_paragraph("Generated image based on transcript content")
caption.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
caption.runs[0].italic = True
except Exception as img_error:
# If there's an error adding the image, just log it and continue
print(f"Error adding image to document: {str(img_error)}")
# Add transcript section
doc.add_heading("Full Transcript", level=2)
            doc.add_paragraph(transcript)
# Clean the video title for filename
            safe_title = ''.join(c for c in video_title if c.isalnum() or c in ' _-').strip()
            safe_title = safe_title.replace(' ', '_') or f"youtube_transcript_{int(time.time())}"
# Save the document
output_filename = f"{safe_title}.docx"
output_path = os.path.join(self.docx_folder, output_filename)
try:
doc.save(output_path)
print(f"Document saved successfully at: {output_path}")
return output_path
except Exception as save_error:
error_msg = f"Error saving document: {str(save_error)}"
print(error_msg)
# Try with a simpler filename as fallback
try:
fallback_path = os.path.join(self.docx_folder, f"youtube_transcript_{int(time.time())}.docx")
doc.save(fallback_path)
print(f"Document saved with fallback name at: {fallback_path}")
return fallback_path
                except Exception:
return error_msg
except Exception as e:
return f"An error occurred while creating the DOCX file: {str(e)}"