|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import logging |
|
import tempfile |
|
from pathlib import Path |
|
from typing import List, Optional |
|
|
|
import ffmpeg |
|
from PIL import Image |
|
from scenedetect import ( |
|
SceneManager, |
|
VideoManager, |
|
) |
|
from scenedetect.detectors import ( |
|
ContentDetector, |
|
) |
|
|
|
from camel.agents import ChatAgent |
|
from camel.configs import QwenConfig |
|
from camel.messages import BaseMessage |
|
from camel.models import ModelFactory, OpenAIAudioModels |
|
from camel.toolkits.base import BaseToolkit |
|
from camel.toolkits.function_tool import FunctionTool |
|
from camel.types import ModelPlatformType, ModelType |
|
from camel.utils import dependencies_required |
|
|
|
from .video_downloader_toolkit import ( |
|
VideoDownloaderToolkit, |
|
_capture_screenshot, |
|
) |
|
|
|
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Prompt template sent to the vision-language model together with the
# extracted keyframes. It is filled in with ``str.format`` and has exactly
# two placeholders:
#   {audio_transcription} - the transcript of the video's audio track
#   {question}            - the question being asked about the video
# A trailing backslash joins a long sentence with the following line, so the
# continuation lines deliberately start at column 0.
VIDEO_QA_PROMPT = """
Analyze the provided video frames and corresponding audio transcription to \
answer the given question(s) thoroughly and accurately.

Instructions:
    1. Visual Analysis:
        - Examine the video frames to identify visible entities.
        - Differentiate objects, species, or features based on key attributes \
such as size, color, shape, texture, or behavior.
        - Note significant groupings, interactions, or contextual patterns \
relevant to the analysis.

    2. Audio Integration:
        - Use the audio transcription to complement or clarify your visual \
observations.
        - Identify names, descriptions, or contextual hints in the \
transcription that help confirm or refine your visual analysis.

    3. Detailed Reasoning and Justification:
        - Provide a brief explanation of how you identified and distinguished \
each species or object.
        - Highlight specific features or contextual clues that informed \
your reasoning.

    4. Comprehensive Answer:
        - Specify the total number of distinct species or object types \
identified in the video.
        - Describe the defining characteristics and any supporting evidence \
from the video and transcription.

    5. Important Considerations:
        - Pay close attention to subtle differences that could distinguish \
similar-looking species or objects
          (e.g., juveniles vs. adults, closely related species).
        - Provide concise yet complete explanations to ensure clarity.

**Audio Transcription:**
{audio_transcription}

**Question:**
{question}
"""
|
|
|
|
|
class VideoAnalysisToolkit(BaseToolkit):
    r"""A class for analysing videos with vision-language model.

    Given a video (local path or URL) and a question, the toolkit extracts
    the audio track and scene-change keyframes, transcribes the audio, and
    asks a vision-language model to answer the question from both.

    Args:
        download_directory (Optional[str], optional): The directory where the
            video will be downloaded to. If not provided, video will be stored
            in a temporary directory and will be cleaned up after use.
            (default: :obj:`None`)
    """

    @dependencies_required("ffmpeg", "scenedetect")
    def __init__(
        self,
        download_directory: Optional[str] = None,
    ) -> None:
        # Only a directory created by this toolkit is ever removed; a
        # caller-supplied directory is left untouched.
        self._cleanup = not download_directory
        self._temp_dir: Optional["tempfile.TemporaryDirectory[str]"] = None

        if download_directory:
            directory = download_directory
        else:
            # TemporaryDirectory (rather than mkdtemp) so the directory is
            # removed by ``__del__`` or, at the latest, at interpreter exit.
            self._temp_dir = tempfile.TemporaryDirectory()
            directory = self._temp_dir.name

        self._download_directory = Path(directory).resolve()

        self.video_downloader_toolkit = VideoDownloaderToolkit(
            download_directory=str(self._download_directory)
        )

        try:
            self._download_directory.mkdir(parents=True, exist_ok=True)
        except FileExistsError as e:
            # The path exists but is not a directory.
            raise ValueError(
                f"{self._download_directory} is not a valid directory."
            ) from e
        except OSError as e:
            raise ValueError(
                f"Error creating directory {self._download_directory}: {e}"
            ) from e

        logger.info(f"Video will be downloaded to {self._download_directory}")

        self.vl_model = ModelFactory.create(
            model_platform=ModelPlatformType.QWEN,
            model_type=ModelType.QWEN_VL_MAX,
            model_config_dict=QwenConfig(temperature=0.2).as_dict(),
        )

        self.vl_agent = ChatAgent(
            model=self.vl_model, output_language="English"
        )

        self.audio_models = OpenAIAudioModels()

    def __del__(self) -> None:
        r"""Remove the temporary download directory, if one was created."""
        # ``getattr`` guards against instances whose ``__init__`` never ran
        # far enough to set the attribute.
        temp_dir = getattr(self, "_temp_dir", None)
        if temp_dir is not None:
            temp_dir.cleanup()

    def _extract_audio_from_video(
        self, video_path: str, output_format: str = "mp3"
    ) -> str:
        r"""Extract audio from the video.

        The audio file is written into the toolkit's download directory,
        named after the video file with its extension replaced by
        ``output_format``. An existing file of that name is overwritten.
        The audio codec is always ``libmp3lame``, whatever
        ``output_format`` is.

        Args:
            video_path (str): The path to the video file.
            output_format (str): The format of the audio file to be saved.
                (default: :obj:`"mp3"`)

        Returns:
            str: The path to the audio file.

        Raises:
            RuntimeError: If ffmpeg fails to extract the audio.
        """
        # Build the name with pathlib: splitting the whole path string on
        # "." breaks when a directory name contains a dot and the file has
        # no extension.
        audio_name = f"{Path(video_path).stem}.{output_format}"
        output_path = str(self._download_directory / audio_name)

        try:
            (
                ffmpeg.input(video_path)
                .output(output_path, vn=None, acodec="libmp3lame")
                # Without this, ffmpeg prompts or aborts when the audio file
                # already exists, e.g. on a second question about the same
                # video.
                .run(overwrite_output=True)
            )
        except ffmpeg.Error as e:
            raise RuntimeError(f"FFmpeg-Python failed: {e}") from e
        return output_path

    def _transcribe_audio(self, audio_path: str) -> str:
        r"""Transcribe the audio of the video.

        Args:
            audio_path (str): The path to the audio file.

        Returns:
            str: The transcript returned by the audio model.
        """
        return self.audio_models.speech_to_text(audio_path)

    def _extract_keyframes(
        self, video_path: str, num_frames: int, threshold: float = 25.0
    ) -> List[Image.Image]:
        r"""Extract keyframes from a video based on scene changes
        and return them as PIL.Image.Image objects.

        One frame is captured at the start of each detected scene, for at
        most ``num_frames`` scenes. If no scene is detected the result is
        empty.

        Args:
            video_path (str): Path to the video file.
            num_frames (int): Number of keyframes to extract.
            threshold (float): The threshold value for scene change detection.

        Returns:
            list: A list of PIL.Image.Image objects representing
                the extracted keyframes.
        """
        video_manager = VideoManager([video_path])
        scene_manager = SceneManager()
        scene_manager.add_detector(ContentDetector(threshold=threshold))

        try:
            video_manager.set_duration()
            video_manager.start()
            scene_manager.detect_scenes(video_manager)
            scenes = scene_manager.get_scene_list()
        finally:
            # Always release the underlying video capture handles.
            video_manager.release()

        # ``max`` keeps a non-positive ``num_frames`` yielding no frames.
        selected_scenes = scenes[: max(num_frames, 0)]
        keyframes: List[Image.Image] = [
            _capture_screenshot(video_path, start_time)
            for start_time, _ in selected_scenes
        ]

        if not scenes:
            logger.warning(
                "No scene changes detected in %s; no keyframes extracted.",
                video_path,
            )
        logger.debug(
            "Extracted %d keyframes from %s", len(keyframes), video_path
        )
        return keyframes

    def ask_question_about_video(
        self,
        video_path: str,
        question: str,
        num_frames: int = 28,
    ) -> str:
        r"""Ask a question about the video.

        Args:
            video_path (str): The path to the video file.
                It can be a local file or a URL (such as Youtube website).
            question (str): The question to ask about the video.
            num_frames (int): The number of frames to extract from the video.
                To be adjusted based on the length of the video.
                (default: :obj:`28`)

        Returns:
            str: The answer to the question.
        """
        from urllib.parse import urlparse

        # Anything with both a scheme and a network location is treated as a
        # URL and downloaded first; everything else is used as a local path.
        parsed_url = urlparse(video_path)
        if parsed_url.scheme and parsed_url.netloc:
            video_path = self.video_downloader_toolkit.download_video(
                video_path
            )

        audio_path = self._extract_audio_from_video(video_path)
        video_frames = self._extract_keyframes(video_path, num_frames)
        audio_transcript = self._transcribe_audio(audio_path)

        prompt = VIDEO_QA_PROMPT.format(
            audio_transcription=audio_transcript,
            question=question,
        )
        logger.debug("Video QA prompt:\n%s", prompt)

        msg = BaseMessage.make_user_message(
            role_name="User",
            content=prompt,
            image_list=video_frames,
        )

        response = self.vl_agent.step(msg)
        return response.msgs[0].content

    def get_tools(self) -> List[FunctionTool]:
        r"""Returns a list of FunctionTool objects representing the
        functions in the toolkit.

        Returns:
            List[FunctionTool]: A list of FunctionTool objects representing
                the functions in the toolkit.
        """
        return [FunctionTool(self.ask_question_about_video)]