Tai Truong
fix readme
d202ada
raw
history blame contribute delete
2.82 kB
import numpy as np
from langflow.custom import Component
from langflow.io import DataInput, DropdownInput, Output
from langflow.schema import Data
class EmbeddingSimilarityComponent(Component):
    """Component that scores two embedding vectors with a selectable metric.

    The two vectors are read from the ``"embeddings"`` entry of each input
    ``Data`` object's ``data`` mapping. The metric is chosen via the
    ``similarity_metric`` dropdown.
    """

    display_name: str = "Embedding Similarity"
    description: str = "Compute selected form of similarity between two embedding vectors."
    icon = "equal"

    inputs = [
        DataInput(
            name="embedding_vectors",
            display_name="Embedding Vectors",
            info="A list containing exactly two data objects with embedding vectors to compare.",
            is_list=True,
        ),
        DropdownInput(
            name="similarity_metric",
            display_name="Similarity Metric",
            info="Select the similarity metric to use.",
            options=["Cosine Similarity", "Euclidean Distance", "Manhattan Distance"],
            value="Cosine Similarity",
        ),
    ]

    outputs = [
        Output(display_name="Similarity Data", name="similarity_data", method="compute_similarity"),
    ]

    def compute_similarity(self) -> Data:
        """Compute the selected metric between the two input embeddings.

        Returns:
            A ``Data`` object whose ``data`` holds both raw embeddings
            (``"embedding_1"``, ``"embedding_2"``) and ``"similarity_score"``:
            a one-entry dict keyed by the metric name
            (``"cosine_similarity"``, ``"euclidean_distance"`` or
            ``"manhattan_distance"``), or ``{"error": ...}`` when the score
            cannot be computed (mismatched shapes, or cosine similarity with a
            zero-magnitude embedding).

        Raises:
            ValueError: If the input does not contain exactly two items, or if
                ``similarity_metric`` is not one of the supported options.
            KeyError: If an input ``Data`` object has no ``"embeddings"`` entry.
        """
        embedding_vectors: list[Data] = self.embedding_vectors

        # The comparison is strictly pairwise.
        if len(embedding_vectors) != 2:  # noqa: PLR2004
            msg = "Exactly two embedding vectors are required."
            raise ValueError(msg)

        # Read the raw values once; they are reused verbatim in the output.
        raw_1 = embedding_vectors[0].data["embeddings"]
        raw_2 = embedding_vectors[1].data["embeddings"]
        embedding_1 = np.array(raw_1)
        embedding_2 = np.array(raw_2)

        similarity_metric = self.similarity_metric

        if embedding_1.shape != embedding_2.shape:
            similarity_score = {"error": "Embeddings must have the same dimensions."}
        elif similarity_metric == "Cosine Similarity":
            norm_product = np.linalg.norm(embedding_1) * np.linalg.norm(embedding_2)
            if norm_product == 0:
                # Dividing by a zero norm would silently produce NaN; report it
                # the same way a shape mismatch is reported.
                similarity_score = {"error": "Cosine similarity is undefined for zero-magnitude embeddings."}
            else:
                score = np.dot(embedding_1, embedding_2) / norm_product
                similarity_score = {"cosine_similarity": score}
        elif similarity_metric == "Euclidean Distance":
            score = np.linalg.norm(embedding_1 - embedding_2)
            similarity_score = {"euclidean_distance": score}
        elif similarity_metric == "Manhattan Distance":
            score = np.sum(np.abs(embedding_1 - embedding_2))
            similarity_score = {"manhattan_distance": score}
        else:
            # Without this branch an unrecognized metric would leave
            # similarity_score unbound and fail later with UnboundLocalError.
            msg = f"Unsupported similarity metric: {similarity_metric!r}"
            raise ValueError(msg)

        # Bundle the score together with the embeddings it was computed from.
        similarity_data = Data(
            data={
                "embedding_1": raw_1,
                "embedding_2": raw_2,
                "similarity_score": similarity_score,
            },
            text_key="similarity_score",
        )

        self.status = similarity_data
        return similarity_data