from typing import Optional, List, Dict, Any, Union
import logging
from pinecone import Pinecone, ServerlessSpec
from open_webui.retrieval.vector.main import (
VectorDBBase,
VectorItem,
SearchResult,
GetResult,
)
from open_webui.config import (
PINECONE_API_KEY,
PINECONE_ENVIRONMENT,
PINECONE_INDEX_NAME,
PINECONE_DIMENSION,
PINECONE_METRIC,
PINECONE_CLOUD,
)
from open_webui.env import SRC_LOG_LEVELS
NO_LIMIT = 10000  # Effective "no limit": a cap that keeps top_k within Pinecone's per-query limits
BATCH_SIZE = 100 # Recommended batch size for Pinecone operations
log = logging.getLogger(__name__)
log.setLevel(SRC_LOG_LEVELS["RAG"])
class PineconeClient(VectorDBBase):
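    """Vector database adapter backed by a single Pinecone index: all Open WebUI
    collections share the index, and each vector carries a `collection_name`
    metadata field that collection-scoped operations filter on.
    """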
def __init__(self):
self.collection_prefix = "open-webui"
# Validate required configuration
self._validate_config()
# Store configuration values
self.api_key = PINECONE_API_KEY
self.environment = PINECONE_ENVIRONMENT
self.index_name = PINECONE_INDEX_NAME
self.dimension = PINECONE_DIMENSION
self.metric = PINECONE_METRIC
self.cloud = PINECONE_CLOUD
# Initialize Pinecone client
self.client = Pinecone(api_key=self.api_key)
# Create index if it doesn't exist
self._initialize_index()
def _validate_config(self) -> None:
"""Validate that all required configuration variables are set."""
missing_vars = []
if not PINECONE_API_KEY:
missing_vars.append("PINECONE_API_KEY")
if not PINECONE_ENVIRONMENT:
missing_vars.append("PINECONE_ENVIRONMENT")
if not PINECONE_INDEX_NAME:
missing_vars.append("PINECONE_INDEX_NAME")
if not PINECONE_DIMENSION:
missing_vars.append("PINECONE_DIMENSION")
if not PINECONE_CLOUD:
missing_vars.append("PINECONE_CLOUD")
if missing_vars:
raise ValueError(
f"Required configuration missing: {', '.join(missing_vars)}"
)
def _initialize_index(self) -> None:
"""Initialize the Pinecone index."""
try:
# Check if index exists
if self.index_name not in self.client.list_indexes().names():
log.info(f"Creating Pinecone index '{self.index_name}'...")
self.client.create_index(
name=self.index_name,
dimension=self.dimension,
metric=self.metric,
spec=ServerlessSpec(cloud=self.cloud, region=self.environment),
)
log.info(f"Successfully created Pinecone index '{self.index_name}'")
else:
log.info(f"Using existing Pinecone index '{self.index_name}'")
# Connect to the index
self.index = self.client.Index(self.index_name)
except Exception as e:
log.error(f"Failed to initialize Pinecone index: {e}")
raise RuntimeError(f"Failed to initialize Pinecone index: {e}")
def _create_points(
self, items: List[VectorItem], collection_name_with_prefix: str
) -> List[Dict[str, Any]]:
"""Convert VectorItem objects to Pinecone point format."""
points = []
for item in items:
# Start with any existing metadata or an empty dict
metadata = item.get("metadata", {}).copy() if item.get("metadata") else {}
# Add text to metadata if available
if "text" in item:
metadata["text"] = item["text"]
# Always add collection_name to metadata for filtering
metadata["collection_name"] = collection_name_with_prefix
point = {
"id": item["id"],
"values": item["vector"],
"metadata": metadata,
}
points.append(point)
return points
def _get_collection_name_with_prefix(self, collection_name: str) -> str:
"""Get the collection name with prefix."""
return f"{self.collection_prefix}_{collection_name}"
    def _normalize_distance(self, score: float) -> float:
        """Normalize a Pinecone score based on the metric in use."""
        if self.metric.lower() == "cosine":
            # Cosine similarity ranges from -1 to 1; map it to 0..1
            # (e.g. 1.0 -> 1.0, 0.0 -> 0.5, -1.0 -> 0.0).
            return (score + 1.0) / 2.0
        # For euclidean (smaller is better) and dotproduct (larger is better),
        # the raw score is already usable for ranking, so return it unchanged.
        return score
def _result_to_get_result(self, matches: list) -> GetResult:
"""Convert Pinecone matches to GetResult format."""
ids = []
documents = []
metadatas = []
for match in matches:
metadata = match.get("metadata", {})
ids.append(match["id"])
documents.append(metadata.get("text", ""))
metadatas.append(metadata)
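        # Wrap each list in an outer list to match GetResult's nested-list shape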
return GetResult(
**{
"ids": [ids],
"documents": [documents],
"metadatas": [metadatas],
}
)
def has_collection(self, collection_name: str) -> bool:
"""Check if a collection exists by searching for at least one item."""
collection_name_with_prefix = self._get_collection_name_with_prefix(
collection_name
)
try:
# Search for at least 1 item with this collection name in metadata
response = self.index.query(
vector=[0.0] * self.dimension, # dummy vector
top_k=1,
filter={"collection_name": collection_name_with_prefix},
include_metadata=False,
)
return len(response.matches) > 0
except Exception as e:
log.exception(
f"Error checking collection '{collection_name_with_prefix}': {e}"
)
return False
def delete_collection(self, collection_name: str) -> None:
"""Delete a collection by removing all vectors with the collection name in metadata."""
collection_name_with_prefix = self._get_collection_name_with_prefix(
collection_name
)
try:
self.index.delete(filter={"collection_name": collection_name_with_prefix})
log.info(
f"Collection '{collection_name_with_prefix}' deleted (all vectors removed)."
)
except Exception as e:
log.warning(
f"Failed to delete collection '{collection_name_with_prefix}': {e}"
)
raise
def insert(self, collection_name: str, items: List[VectorItem]) -> None:
"""Insert vectors into a collection."""
if not items:
log.warning("No items to insert")
return
collection_name_with_prefix = self._get_collection_name_with_prefix(
collection_name
)
points = self._create_points(items, collection_name_with_prefix)
# Insert in batches for better performance and reliability
for i in range(0, len(points), BATCH_SIZE):
batch = points[i : i + BATCH_SIZE]
try:
self.index.upsert(vectors=batch)
log.debug(
f"Inserted batch of {len(batch)} vectors into '{collection_name_with_prefix}'"
)
except Exception as e:
log.error(
f"Error inserting batch into '{collection_name_with_prefix}': {e}"
)
raise
log.info(
f"Successfully inserted {len(items)} vectors into '{collection_name_with_prefix}'"
)
def upsert(self, collection_name: str, items: List[VectorItem]) -> None:
"""Upsert (insert or update) vectors into a collection."""
if not items:
log.warning("No items to upsert")
return
collection_name_with_prefix = self._get_collection_name_with_prefix(
collection_name
)
points = self._create_points(items, collection_name_with_prefix)
# Upsert in batches
for i in range(0, len(points), BATCH_SIZE):
batch = points[i : i + BATCH_SIZE]
try:
self.index.upsert(vectors=batch)
log.debug(
f"Upserted batch of {len(batch)} vectors into '{collection_name_with_prefix}'"
)
except Exception as e:
log.error(
f"Error upserting batch into '{collection_name_with_prefix}': {e}"
)
raise
log.info(
f"Successfully upserted {len(items)} vectors into '{collection_name_with_prefix}'"
)
def search(
self, collection_name: str, vectors: List[List[Union[float, int]]], limit: int
) -> Optional[SearchResult]:
"""Search for similar vectors in a collection."""
if not vectors or not vectors[0]:
log.warning("No vectors provided for search")
return None
collection_name_with_prefix = self._get_collection_name_with_prefix(
collection_name
)
if limit is None or limit <= 0:
limit = NO_LIMIT
try:
            # Pinecone's query API accepts a single vector, so only the first
            # query vector is used.
            query_vector = vectors[0]
# Perform the search
query_response = self.index.query(
vector=query_vector,
top_k=limit,
include_metadata=True,
filter={"collection_name": collection_name_with_prefix},
)
if not query_response.matches:
# Return empty result if no matches
return SearchResult(
ids=[[]],
documents=[[]],
metadatas=[[]],
distances=[[]],
)
# Convert to GetResult format
get_result = self._result_to_get_result(query_response.matches)
# Calculate normalized distances based on metric
distances = [
[
self._normalize_distance(match.score)
for match in query_response.matches
]
]
return SearchResult(
ids=get_result.ids,
documents=get_result.documents,
metadatas=get_result.metadatas,
distances=distances,
)
except Exception as e:
log.error(f"Error searching in '{collection_name_with_prefix}': {e}")
return None
def query(
self, collection_name: str, filter: Dict, limit: Optional[int] = None
) -> Optional[GetResult]:
"""Query vectors by metadata filter."""
collection_name_with_prefix = self._get_collection_name_with_prefix(
collection_name
)
if limit is None or limit <= 0:
limit = NO_LIMIT
try:
# Create a zero vector for the dimension as Pinecone requires a vector
zero_vector = [0.0] * self.dimension
# Combine user filter with collection_name
pinecone_filter = {"collection_name": collection_name_with_prefix}
if filter:
pinecone_filter.update(filter)
# Perform metadata-only query
query_response = self.index.query(
vector=zero_vector,
filter=pinecone_filter,
top_k=limit,
include_metadata=True,
)
return self._result_to_get_result(query_response.matches)
except Exception as e:
log.error(f"Error querying collection '{collection_name}': {e}")
return None
def get(self, collection_name: str) -> Optional[GetResult]:
"""Get all vectors in a collection."""
collection_name_with_prefix = self._get_collection_name_with_prefix(
collection_name
)
try:
# Use a zero vector for fetching all entries
zero_vector = [0.0] * self.dimension
# Add filter to only get vectors for this collection
query_response = self.index.query(
vector=zero_vector,
top_k=NO_LIMIT,
include_metadata=True,
filter={"collection_name": collection_name_with_prefix},
)
return self._result_to_get_result(query_response.matches)
except Exception as e:
log.error(f"Error getting collection '{collection_name}': {e}")
return None
def delete(
self,
collection_name: str,
ids: Optional[List[str]] = None,
filter: Optional[Dict] = None,
) -> None:
"""Delete vectors by IDs or filter."""
collection_name_with_prefix = self._get_collection_name_with_prefix(
collection_name
)
try:
if ids:
# Delete by IDs (in batches for large deletions)
for i in range(0, len(ids), BATCH_SIZE):
batch_ids = ids[i : i + BATCH_SIZE]
                    # Pinecone does not support combining an ID list with a metadata
                    # filter, so this delete cannot be scoped to collection_name;
                    # IDs must therefore be unique across collections.
self.index.delete(ids=batch_ids)
log.debug(
f"Deleted batch of {len(batch_ids)} vectors by ID from '{collection_name_with_prefix}'"
)
log.info(
f"Successfully deleted {len(ids)} vectors by ID from '{collection_name_with_prefix}'"
)
elif filter:
# Combine user filter with collection_name
pinecone_filter = {"collection_name": collection_name_with_prefix}
if filter:
pinecone_filter.update(filter)
# Delete by metadata filter
self.index.delete(filter=pinecone_filter)
log.info(
f"Successfully deleted vectors by filter from '{collection_name_with_prefix}'"
)
else:
log.warning("No ids or filter provided for delete operation")
except Exception as e:
log.error(f"Error deleting from collection '{collection_name}': {e}")
raise
def reset(self) -> None:
"""Reset the database by deleting all collections."""
try:
self.index.delete(delete_all=True)
log.info("All vectors successfully deleted from the index.")
except Exception as e:
log.error(f"Failed to reset Pinecone index: {e}")
raise
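# Illustrative usage sketch (not executed by Open WebUI): assumes the PINECONE_*
# settings in open_webui.config point at a valid account and serverless index.
# The collection name, ID, text, and vector values below are hypothetical; items
# are passed as plain dicts, mirroring how _create_points accesses them.
if __name__ == "__main__":
    client = PineconeClient()
    client.upsert(
        "example-collection",
        [
            {
                "id": "doc-1",
                "vector": [0.1] * client.dimension,
                "text": "hello world",
                "metadata": {"source": "example.txt"},
            }
        ],
    )
    results = client.search("example-collection", [[0.1] * client.dimension], limit=1)
    print(results)
    client.delete_collection("example-collection")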