Fixed: tensor dtype fixed.
Browse files- src/evaluate.py +10 -16
src/evaluate.py
CHANGED
@@ -20,15 +20,6 @@ def read_pertubed_data(filename, task, lang="en"):
|
|
20 |
raise FileNotFoundError(f"File {filename} not found.")
|
21 |
return pd.read_csv(filename)
|
22 |
|
23 |
-
def compute_metrics(emb1, emb2,metric="cosine"):
|
24 |
-
"""Compute all metrics between two sets of embeddings."""
|
25 |
-
# sim = utils.cosine_similarity(emb1, emb2)
|
26 |
-
# ned = compute_ned_distance(emb1, emb2)
|
27 |
-
# ed = np.linalg.norm(emb1 - emb2, axis=1)
|
28 |
-
# dotp = np.sum(emb1 * emb2, axis=1)
|
29 |
-
if metric=="cosine":
|
30 |
-
sim = CosineMetric(emb1,emb2)
|
31 |
-
return sim
|
32 |
|
33 |
def run(args_model, dataset_name, target_lang,args_task, default_gpu="cuda", metric="cosine",save=False,batch_size=2):
|
34 |
model = LLMEmbeddings(args_model, device=default_gpu)
|
@@ -61,9 +52,15 @@ def run(args_model, dataset_name, target_lang,args_task, default_gpu="cuda", met
|
|
61 |
|
62 |
# Batch process embeddings
|
63 |
embeddings = model.encode_batch(sentences,batch_size=batch_size)
|
64 |
-
|
65 |
-
|
66 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
67 |
|
68 |
# Process embeddings based on task
|
69 |
if args_task == "anto":
|
@@ -151,6 +148,7 @@ if __name__ == "__main__":
|
|
151 |
"batch_size":2
|
152 |
}
|
153 |
else:
|
|
|
154 |
config = {
|
155 |
"args_model": "llama3",
|
156 |
"dataset_name": "mrpc",
|
@@ -161,7 +159,3 @@ if __name__ == "__main__":
|
|
161 |
|
162 |
}
|
163 |
run(**config)
|
164 |
-
|
165 |
-
|
166 |
-
# file_path = "/home/yash/ALIGN-SIM/data/perturbed_dataset/en/anto/mrpc_anto_perturbed_en.csv"
|
167 |
-
# run("llama3","mrpc_anto_perturbed_en", "anto", "cuda:2", False)
|
|
|
20 |
raise FileNotFoundError(f"File {filename} not found.")
|
21 |
return pd.read_csv(filename)
|
22 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
23 |
|
24 |
def run(args_model, dataset_name, target_lang,args_task, default_gpu="cuda", metric="cosine",save=False,batch_size=2):
|
25 |
model = LLMEmbeddings(args_model, device=default_gpu)
|
|
|
52 |
|
53 |
# Batch process embeddings
|
54 |
embeddings = model.encode_batch(sentences,batch_size=batch_size)
|
55 |
+
# Ensure embeddings are on CPU and in numpy format
|
56 |
+
if args_model == "chatgpt":
|
57 |
+
# For chatgpt, embeddings is likely a list of torch tensors
|
58 |
+
embeddings = [emb.cpu().numpy() if isinstance(emb, torch.Tensor) else emb for emb in embeddings]
|
59 |
+
embeddings = np.array(embeddings)
|
60 |
+
else:
|
61 |
+
# For other models, assume a single torch tensor
|
62 |
+
if isinstance(embeddings, torch.Tensor):
|
63 |
+
embeddings = embeddings.cpu().numpy()
|
64 |
|
65 |
# Process embeddings based on task
|
66 |
if args_task == "anto":
|
|
|
148 |
"batch_size":2
|
149 |
}
|
150 |
else:
|
151 |
+
#sentence-transformers/all-MiniLM-L6-v2
|
152 |
config = {
|
153 |
"args_model": "llama3",
|
154 |
"dataset_name": "mrpc",
|
|
|
159 |
|
160 |
}
|
161 |
run(**config)
|
|
|
|
|
|
|
|