Spaces:

rote1
/

IAGO

Sleeping

App Files Files Community

IAGO / deep-swarm /camel /retrievers /auto_retriever.py

zyh-ralph

initial update

62da328 3 months ago

raw

history blame

9.6 kB

	# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
	import re
	import uuid
	from typing import (
	TYPE_CHECKING,
	Collection,
	List,
	Optional,
	Sequence,
	Tuple,
	Union,
	)

	from camel.embeddings import BaseEmbedding, OpenAIEmbedding
	from camel.retrievers.vector_retriever import VectorRetriever
	from camel.storages import (
	BaseVectorStorage,
	MilvusStorage,
	QdrantStorage,
	)
	from camel.types import StorageType
	from camel.utils import Constants

	if TYPE_CHECKING:
	from unstructured.documents.elements import Element


	class AutoRetriever:
	    r"""Facilitates the automatic retrieval of information using a
	    query-based approach with pre-defined elements.

	    For every piece of content a collection name is derived, a vector
	    storage bound to that collection is opened, the content is embedded
	    into it if the collection is still empty, and the query is then run
	    against it.

	    Attributes:
	        url_and_api_key (Optional[Tuple[str, str]]): URL and API key for
	            accessing the vector storage remotely.
	        vector_storage_local_path (Optional[str]): Local path for vector
	            storage, if applicable.
	        storage_type (Optional[StorageType]): The type of vector storage to
	            use. Defaults to `StorageType.QDRANT`.
	        embedding_model (Optional[BaseEmbedding]): Model used for embedding
	            queries and documents. Defaults to `OpenAIEmbedding()`.
	    """

	    def __init__(
	        self,
	        url_and_api_key: Optional[Tuple[str, str]] = None,
	        vector_storage_local_path: Optional[str] = None,
	        storage_type: Optional[StorageType] = None,
	        embedding_model: Optional[BaseEmbedding] = None,
	    ):
	        # Falsy arguments fall back to the documented defaults.
	        self.storage_type = storage_type or StorageType.QDRANT
	        self.embedding_model = embedding_model or OpenAIEmbedding()
	        self.vector_storage_local_path = vector_storage_local_path
	        self.url_and_api_key = url_and_api_key

	    def _initialize_vector_storage(
	        self,
	        collection_name: Optional[str] = None,
	    ) -> BaseVectorStorage:
	        r"""Sets up and returns a vector storage instance with specified
	        parameters.

	        Args:
	            collection_name (Optional[str]): Name of the collection in the
	                vector storage.

	        Returns:
	            BaseVectorStorage: Configured vector storage instance.

	        Raises:
	            ValueError: If the storage type is Milvus and no URL/API key
	                was provided, or if the storage type is unsupported.
	        """
	        if self.storage_type == StorageType.MILVUS:
	            if self.url_and_api_key is None:
	                raise ValueError(
	                    "URL and API key required for Milvus storage are not "
	                    "provided."
	                )
	            return MilvusStorage(
	                vector_dim=self.embedding_model.get_output_dim(),
	                collection_name=collection_name,
	                url_and_api_key=self.url_and_api_key,
	            )

	        if self.storage_type == StorageType.QDRANT:
	            return QdrantStorage(
	                vector_dim=self.embedding_model.get_output_dim(),
	                collection_name=collection_name,
	                path=self.vector_storage_local_path,
	                url_and_api_key=self.url_and_api_key,
	            )

	        raise ValueError(
	            f"Unsupported vector storage type: {self.storage_type}"
	        )

	    def _collection_name_generator(
	        self, content: Union[str, "Element"]
	    ) -> str:
	        r"""Generates a valid collection name from a given file path or URL.

	        The name is the first 20 ASCII alphanumeric characters of the
	        content. For an Element, its metadata file directory is used, or a
	        random UUID when that is not set. When the content holds no ASCII
	        alphanumeric character at all (for example non-Latin text), a
	        deterministic hash-based name is used instead of an empty string.

	        Args:
	            content (Union[str, Element]): Local file path, remote URL,
	                string content or Element object.

	        Returns:
	            str: A sanitized, non-empty collection name of at most 20
	                characters.
	        """
	        import hashlib

	        from unstructured.documents.elements import Element

	        if isinstance(content, Element):
	            content = content.metadata.file_directory or str(uuid.uuid4())

	        collection_name = re.sub(r'[^a-zA-Z0-9]', '', content)[:20]
	        if not collection_name:
	            # Without this fallback every such content would map to the
	            # same empty name. The hash keeps the name stable across calls
	            # so an already populated collection can be reused; the
	            # leading letter avoids a name that starts with a digit.
	            digest = hashlib.sha256(
	                content.encode("utf-8", errors="replace")
	            ).hexdigest()
	            collection_name = "c" + digest[:19]

	        return collection_name

	    @staticmethod
	    def _similarity_sort_key(info: dict) -> float:
	        r"""Returns the numeric similarity score of a retrieved record.

	        Args:
	            info (dict): A retrieved record containing the key
	                `'similarity score'`.

	        Returns:
	            float: The score as a float, or negative infinity when the
	                stored value cannot be interpreted as a number.
	        """
	        # NOTE(review): scores appear to be serialized as strings by
	        # VectorRetriever.query -- confirm. Coercing to float keeps the
	        # ordering numeric rather than lexicographic in either case.
	        try:
	            return float(info['similarity score'])
	        except (TypeError, ValueError):
	            return float("-inf")

	    def run_vector_retriever(
	        self,
	        query: str,
	        contents: Union[str, List[str], "Element", List["Element"]],
	        top_k: int = Constants.DEFAULT_TOP_K_RESULTS,
	        similarity_threshold: float = Constants.DEFAULT_SIMILARITY_THRESHOLD,
	        return_detailed_info: bool = False,
	        max_characters: int = 500,
	    ) -> dict[str, Sequence[Collection[str]]]:
	        r"""Executes the automatic vector retriever process using vector
	        storage.

	        Args:
	            query (str): Query string for information retriever.
	            contents (Union[str, List[str], Element, List[Element]]): Local
	                file paths, remote URLs, string contents or Element objects.
	            top_k (int, optional): The number of top results to return during
	                retrieve. Must be a positive integer. Defaults to
	                `DEFAULT_TOP_K_RESULTS`.
	            similarity_threshold (float, optional): The similarity threshold
	                for filtering results. Defaults to
	                `DEFAULT_SIMILARITY_THRESHOLD`.
	            return_detailed_info (bool, optional): Whether to return the
	                complete retrieved records instead of only their text.
	                Defaults to `False`.
	            max_characters (int): Max number of characters in each chunk.
	                Defaults to `500`.

	        Returns:
	            dict[str, Sequence[Collection[str]]]: A dict with the keys
	                `"Original Query"` and `"Retrieved Context"`. By default
	                the context holds only the text of each record. If
	                `return_detailed_info` is `True`, it holds the complete
	                records as returned by the retriever.

	        Raises:
	            ValueError: If `contents` is empty, or is neither a string, an
	                Element, nor a list.
	            RuntimeError: If any errors occur during the retrieve process.
	        """
	        from unstructured.documents.elements import Element

	        if not contents:
	            raise ValueError("content cannot be empty.")

	        # Normalize contents to a list
	        if isinstance(contents, (str, Element)):
	            contents = [contents]
	        elif not isinstance(contents, list):
	            raise ValueError(
	                "contents must be a string, Element, or a list of them."
	            )

	        all_retrieved_info = []
	        for content in contents:
	            # Generate a valid collection name
	            collection_name = self._collection_name_generator(content)
	            try:
	                vector_storage_instance = self._initialize_vector_storage(
	                    collection_name
	                )

	                # An empty collection has to be populated before querying;
	                # a non-empty one is reused as it is.
	                needs_processing = (
	                    vector_storage_instance.status().vector_count == 0
	                )
	                if needs_processing:
	                    # Clear the vector storage
	                    vector_storage_instance.clear()

	                vr = VectorRetriever(
	                    storage=vector_storage_instance,
	                    embedding_model=self.embedding_model,
	                )
	                if needs_processing:
	                    # Process and store the content to the vector storage
	                    vr.process(
	                        content=content, max_characters=max_characters
	                    )

	                # Retrieve info by given query from the vector storage
	                retrieved_info = vr.query(
	                    query, top_k, similarity_threshold
	                )
	                all_retrieved_info.extend(retrieved_info)
	            except Exception as e:
	                raise RuntimeError(
	                    f"Error in auto vector retriever processing: {e!s}"
	                ) from e

	        # Split records into those with and without a 'similarity score'
	        # key, so that only the scored ones take part in the ranking.
	        with_score = [
	            info for info in all_retrieved_info if 'similarity score' in info
	        ]
	        without_score = [
	            info
	            for info in all_retrieved_info
	            if 'similarity score' not in info
	        ]
	        # Rank scored records from highest to lowest; the sort is stable,
	        # so records with equal scores keep their retrieval order.
	        with_score.sort(key=self._similarity_sort_key, reverse=True)
	        # Scored records first, unscored ones after, then keep 'top_k'.
	        all_retrieved_info = (with_score + without_score)[:top_k]

	        if return_detailed_info:
	            return {
	                "Original Query": query,
	                "Retrieved Context": all_retrieved_info,
	            }

	        return {
	            "Original Query": query,
	            "Retrieved Context": [
	                item['text'] for item in all_retrieved_info
	            ],
	        }