# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
import logging
import tempfile
from pathlib import Path
from typing import List, Optional
from urllib.parse import urlparse

import ffmpeg
from PIL import Image
from scenedetect import (  # type: ignore[import-untyped]
    SceneManager,
    VideoManager,
)
from scenedetect.detectors import (  # type: ignore[import-untyped]
    ContentDetector,
)

from camel.agents import ChatAgent
from camel.configs import QwenConfig
from camel.messages import BaseMessage
from camel.models import ModelFactory, OpenAIAudioModels
from camel.toolkits.base import BaseToolkit
from camel.toolkits.function_tool import FunctionTool
from camel.types import ModelPlatformType, ModelType
from camel.utils import dependencies_required

from .video_downloader_toolkit import (
    VideoDownloaderToolkit,
    _capture_screenshot,
)

logger = logging.getLogger(__name__)

VIDEO_QA_PROMPT = """
Analyze the provided video frames and corresponding audio transcription to \
answer the given question(s) thoroughly and accurately.
Instructions:
1. Visual Analysis:
- Examine the video frames to identify visible entities.
- Differentiate objects, species, or features based on key attributes \
such as size, color, shape, texture, or behavior.
- Note significant groupings, interactions, or contextual patterns \
relevant to the analysis.
2. Audio Integration:
- Use the audio transcription to complement or clarify your visual \
observations.
- Identify names, descriptions, or contextual hints in the \
transcription that help confirm or refine your visual analysis.
3. Detailed Reasoning and Justification:
- Provide a brief explanation of how you identified and distinguished \
each species or object.
- Highlight specific features or contextual clues that informed \
your reasoning.
4. Comprehensive Answer:
- Specify the total number of distinct species or object types \
identified in the video.
- Describe the defining characteristics and any supporting evidence \
from the video and transcription.
5. Important Considerations:
- Pay close attention to subtle differences that could distinguish \
similar-looking species or objects
(e.g., juveniles vs. adults, closely related species).
- Provide concise yet complete explanations to ensure clarity.
**Audio Transcription:**
{audio_transcription}
**Question:**
{question}
"""


class VideoAnalysisToolkit(BaseToolkit):
    r"""A toolkit for analyzing videos with a vision-language model.

    Args:
        download_directory (Optional[str], optional): The directory where
            the video will be downloaded to. If not provided, the video
            will be stored in a temporary directory and cleaned up after
            use. (default: :obj:`None`)
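
    Example:
        >>> # Illustrative only; assumes Qwen and OpenAI credentials
        >>> # are configured in the environment.
        >>> toolkit = VideoAnalysisToolkit(download_directory="videos")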
"""

    @dependencies_required("ffmpeg", "scenedetect")
def __init__(
self,
download_directory: Optional[str] = None,
) -> None:
        # True when a temporary directory was created (intended for
        # cleanup after use).
        self._cleanup = download_directory is None
        self._download_directory = Path(
            download_directory or tempfile.mkdtemp()
        ).resolve()

        # Create the download directory before handing it to the
        # downloader toolkit. With `exist_ok=True`, `FileExistsError` is
        # only raised when the path exists but is not a directory.
        try:
            self._download_directory.mkdir(parents=True, exist_ok=True)
        except FileExistsError as e:
            raise ValueError(
                f"{self._download_directory} is not a valid directory."
            ) from e
        except OSError as e:
            raise ValueError(
                f"Error creating directory {self._download_directory}: {e}"
            ) from e

        self.video_downloader_toolkit = VideoDownloaderToolkit(
            download_directory=str(self._download_directory)
        )

        logger.info(f"Video will be downloaded to {self._download_directory}")

self.vl_model = ModelFactory.create(
model_platform=ModelPlatformType.QWEN,
model_type=ModelType.QWEN_VL_MAX,
model_config_dict=QwenConfig(temperature=0.2).as_dict(),
)
self.vl_agent = ChatAgent(
model=self.vl_model, output_language="English"
)
self.audio_models = OpenAIAudioModels()

    def _extract_audio_from_video(
        self, video_path: str, output_format: str = "mp3"
    ) -> str:
        r"""Extract the audio track from a video file.

        Args:
            video_path (str): The path to the video file.
            output_format (str): The format of the audio file to be saved.
                (default: :obj:`"mp3"`)

        Returns:
            str: The path to the extracted audio file.
        """
        output_path = video_path.rsplit('.', 1)[0] + f".{output_format}"
        # `vn=None` maps to ffmpeg's `-vn` flag (drop the video stream).
        # The mp3 encoder is only forced for mp3 output; for any other
        # format ffmpeg infers the codec from the file extension.
        output_kwargs: dict = {"vn": None}
        if output_format == "mp3":
            output_kwargs["acodec"] = "libmp3lame"
        try:
            (
                ffmpeg.input(video_path)
                .output(output_path, **output_kwargs)
                .overwrite_output()
                .run(quiet=True)
            )
            return output_path
        except ffmpeg.Error as e:
            raise RuntimeError(f"FFmpeg-Python failed: {e}") from e

    def _transcribe_audio(self, audio_path: str) -> str:
        r"""Transcribe the audio file at the given path to text."""
        audio_transcript = self.audio_models.speech_to_text(audio_path)
        return audio_transcript

    def _extract_keyframes(
        self, video_path: str, num_frames: int, threshold: float = 25.0
    ) -> List[Image.Image]:
        r"""Extract keyframes from a video based on scene changes and
        return them as PIL.Image.Image objects.

        Args:
            video_path (str): Path to the video file.
            num_frames (int): Maximum number of keyframes to extract.
            threshold (float): Average content change score above which
                the detector starts a new scene. (default: :obj:`25.0`)

        Returns:
            List[Image.Image]: A list of PIL.Image.Image objects
                representing the extracted keyframes.
        """
        video_manager = VideoManager([video_path])
        scene_manager = SceneManager()
        scene_manager.add_detector(ContentDetector(threshold=threshold))

        try:
            video_manager.set_duration()
            video_manager.start()
            scene_manager.detect_scenes(video_manager)
        finally:
            video_manager.release()

        scenes = scene_manager.get_scene_list()
        keyframes: List[Image.Image] = []
        for start_time, _ in scenes:
            if len(keyframes) >= num_frames:
                break
            frame = _capture_screenshot(video_path, start_time)
            keyframes.append(frame)

        if len(keyframes) < num_frames:
            logger.warning(
                f"Only {len(keyframes)} of the requested {num_frames} "
                "keyframes could be extracted from scene changes."
            )
        return keyframes

    def ask_question_about_video(
        self,
        video_path: str,
        question: str,
        # 28 is the maximum number of frames that can be passed in a
        # single message to the Qwen-VL-Max model.
        num_frames: int = 28,
    ) -> str:
        r"""Ask a question about the video.

        Args:
            video_path (str): The path to the video file.
                It can be a local file or a URL (e.g. a YouTube link).
            question (str): The question to ask about the video.
            num_frames (int): The number of keyframes to extract from the
                video, to be adjusted based on the video's length.
                (default: :obj:`28`)
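
        Example:
            >>> # Illustrative only; the video path is a placeholder.
            >>> toolkit = VideoAnalysisToolkit()
            >>> toolkit.ask_question_about_video(
            ...     "/path/to/video.mp4",
            ...     "How many distinct species appear in the video?",
            ... )
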
Returns:
str: The answer to the question.
"""
        parsed_url = urlparse(video_path)
        is_url = all([parsed_url.scheme, parsed_url.netloc])
        if is_url:
            video_path = self.video_downloader_toolkit.download_video(
                video_path
            )

        audio_path = self._extract_audio_from_video(video_path)
        video_frames = self._extract_keyframes(video_path, num_frames)
        audio_transcript = self._transcribe_audio(audio_path)

        prompt = VIDEO_QA_PROMPT.format(
            audio_transcription=audio_transcript,
            question=question,
        )
        logger.debug(prompt)

        msg = BaseMessage.make_user_message(
            role_name="User",
            content=prompt,
            image_list=video_frames,
        )
        response = self.vl_agent.step(msg)
        answer = response.msgs[0].content
        return answer

def get_tools(self) -> List[FunctionTool]:
r"""Returns a list of FunctionTool objects representing the
functions in the toolkit.
Returns:
List[FunctionTool]: A list of FunctionTool objects representing
the functions in the toolkit.
"""
return [FunctionTool(self.ask_question_about_video)]
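

# Minimal usage sketch. It assumes a local video file and valid Qwen and
# OpenAI credentials in the environment; the path and question below are
# placeholders, not part of the toolkit's API.
if __name__ == "__main__":
    toolkit = VideoAnalysisToolkit()
    answer = toolkit.ask_question_about_video(
        video_path="/path/to/video.mp4",
        question="How many distinct species appear in the video?",
    )
    print(answer)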