from typing import Any, List, Optional, Union

from PIL import Image

from camel.embeddings import BaseEmbedding
from camel.logger import get_logger

logger = get_logger(__name__)


class VisionLanguageEmbedding(BaseEmbedding[Union[str, Image.Image]]):
r"""Provides image embedding functionalities using multimodal model. |
|
|
|
Args: |
|
model_name : The model type to be used for generating embeddings. |
|
And the default value is: obj:`openai/clip-vit-base-patch32`. |
|
|
|
Raises: |
|
RuntimeError: If an unsupported model type is specified. |
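
    Example:
        An illustrative sketch; the dimension shown assumes the default
        :obj:`openai/clip-vit-base-patch32` checkpoint.

        >>> embedding = VisionLanguageEmbedding()
        >>> vector = embedding.embed_list(["a photo of a cat"])[0]
        >>> len(vector)
        512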
    """

    def __init__(
        self, model_name: str = "openai/clip-vit-base-patch32"
    ) -> None:
        r"""Initializes the :obj:`VisionLanguageEmbedding` class with a
        specified model.

        Args:
            model_name (str, optional): The version name of the model to
                use. (default: :obj:`openai/clip-vit-base-patch32`)
        """
        from transformers import AutoModel, AutoProcessor

        try:
            self.model = AutoModel.from_pretrained(model_name)
            self.processor = AutoProcessor.from_pretrained(model_name)
        except Exception as e:
            raise RuntimeError(
                f"Failed to load model '{model_name}': {e}"
            ) from e

        self.valid_processor_kwargs: List[str] = []
        self.valid_model_kwargs: List[str] = []

        try:
            # `_valid_processor_keys` is a private attribute and may be
            # missing on some processors; fall back gracefully if so.
            self.valid_processor_kwargs = (
                self.processor.image_processor._valid_processor_keys
            )
            self.valid_model_kwargs = [
                "pixel_values",
                "return_dict",
                "interpolate_pos_encoding",
            ]
        except Exception:
            logger.warning(
                "Processor does not have the expected structure; "
                "skipping valid-kwargs discovery."
            )
        self.dim: Optional[int] = None

    def embed_list(
        self, objs: List[Union[Image.Image, str]], **kwargs: Any
    ) -> List[List[float]]:
        r"""Generates embeddings for the given images or texts.

        Args:
            objs (List[Image.Image|str]): The list of images or texts
                for which to generate the embeddings.
            **kwargs (Any): Extra kwargs for the embedding pipeline. The
                recognized keys are:

                - image_processor_kwargs (dict, optional): Extra kwargs
                  passed to the image processor.
                - tokenizer_kwargs (dict, optional): Extra kwargs passed
                  to the text tokenizer (processor).
                - model_kwargs (dict, optional): Extra kwargs passed to
                  the main model.

        Returns:
            List[List[float]]: A list that represents the generated
                embedding as a list of floating-point numbers.

        Raises:
            ValueError: If an input is neither `Image.Image` nor `str`,
                or if the input list is empty.
        """
        if not objs:
            raise ValueError("Input objs list is empty.")

        image_processor_kwargs: dict = kwargs.get(
            'image_processor_kwargs', {}
        )
        tokenizer_kwargs: dict = kwargs.get('tokenizer_kwargs', {})
        model_kwargs: dict = kwargs.get('model_kwargs', {})

        result_list = []
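        # Each input is embedded one at a time: images go through
        # `get_image_features` and strings through `get_text_features`,
        # so a mixed list keeps its original order in the output.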
        for obj in objs:
            if isinstance(obj, Image.Image):
                image_input = self.processor(
                    images=obj,
                    return_tensors="pt",
                    padding=True,
                    **image_processor_kwargs,
                )
                image_feature = (
                    self.model.get_image_features(
                        **image_input, **model_kwargs
                    )
                    .squeeze(dim=0)
                    .tolist()
                )
                result_list.append(image_feature)
            elif isinstance(obj, str):
                text_input = self.processor(
                    text=obj,
                    return_tensors="pt",
                    padding=True,
                    **tokenizer_kwargs,
                )
                text_feature = (
                    self.model.get_text_features(**text_input, **model_kwargs)
                    .squeeze(dim=0)
                    .tolist()
                )
                result_list.append(text_feature)
            else:
                raise ValueError(
                    f"Input type {type(obj)!r} is neither image nor text."
                )

        self.dim = len(result_list[0])

        if any(len(result) != self.dim for result in result_list):
            raise ValueError("Dimensionality is not consistent.")

        return result_list

    def get_output_dim(self) -> int:
        r"""Returns the output dimension of the embeddings.

        Returns:
            int: The dimensionality of the embedding for the current
                model.
        """
        if self.dim is None:
            # Lazily infer the dimension by embedding a dummy string.
            text = 'dimension'
            inputs = self.processor(text=[text], return_tensors="pt")
            self.dim = self.model.get_text_features(**inputs).shape[1]
        return self.dim
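

# A minimal usage sketch, not part of the class API above. "cat.png" is
# a hypothetical local file; the printed dimension assumes the default
# openai/clip-vit-base-patch32 checkpoint.
if __name__ == "__main__":
    embedding = VisionLanguageEmbedding()
    text_vector = embedding.embed_list(["a photo of a cat"])[0]
    print(len(text_vector))  # 512 for the default CLIP model
    # image = Image.open("cat.png")  # requires an actual image file
    # image_vector = embedding.embed_list([image])[0]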