BridgeAI-Lab
/

ALIGN-Sim

Model card Files Files and versions Community

yzm0034 committed 25 days ago

Commit

76a5b51

·

verified ·

1 Parent(s): fdfb446

Fixed: tensor dtype fixed.

Files changed (1) hide show

src/evaluate.py +10 -16

src/evaluate.py CHANGED Viewed

@@ -20,15 +20,6 @@ def read_pertubed_data(filename, task, lang="en"):
         raise FileNotFoundError(f"File {filename} not found.")
     return pd.read_csv(filename)
-def compute_metrics(emb1, emb2,metric="cosine"):
-    """Compute all metrics between two sets of embeddings."""
-    # sim = utils.cosine_similarity(emb1, emb2)
-    # ned = compute_ned_distance(emb1, emb2)
-    # ed = np.linalg.norm(emb1 - emb2, axis=1)
-    # dotp = np.sum(emb1 * emb2, axis=1)
-    if metric=="cosine":
-        sim = CosineMetric(emb1,emb2)
-    return sim
 def run(args_model, dataset_name, target_lang,args_task, default_gpu="cuda", metric="cosine",save=False,batch_size=2):
     model = LLMEmbeddings(args_model, device=default_gpu)
@@ -61,9 +52,15 @@ def run(args_model, dataset_name, target_lang,args_task, default_gpu="cuda", met
     # Batch process embeddings
     embeddings = model.encode_batch(sentences,batch_size=batch_size)
-    if args_model != "chatgpt":
-        embeddings = [emb.cpu().numpy() for emb in embeddings]
-    embeddings = np.array(embeddings)
     # Process embeddings based on task
     if args_task == "anto":
@@ -151,6 +148,7 @@ if __name__ == "__main__":
             "batch_size":2
         }
     else:
         config = {
             "args_model": "llama3",
             "dataset_name": "mrpc",
@@ -161,7 +159,3 @@ if __name__ == "__main__":
         }
     run(**config)
-    # file_path = "/home/yash/ALIGN-SIM/data/perturbed_dataset/en/anto/mrpc_anto_perturbed_en.csv"
-    # run("llama3","mrpc_anto_perturbed_en", "anto", "cuda:2", False)

         raise FileNotFoundError(f"File {filename} not found.")
     return pd.read_csv(filename)
 def run(args_model, dataset_name, target_lang,args_task, default_gpu="cuda", metric="cosine",save=False,batch_size=2):
     model = LLMEmbeddings(args_model, device=default_gpu)
     # Batch process embeddings
     embeddings = model.encode_batch(sentences,batch_size=batch_size)
+    # Ensure embeddings are on CPU and in numpy format
+    if args_model == "chatgpt":
+        # For chatgpt, embeddings is likely a list of torch tensors
+        embeddings = [emb.cpu().numpy() if isinstance(emb, torch.Tensor) else emb for emb in embeddings]
+        embeddings = np.array(embeddings)
+    else:
+        # For other models, assume a single torch tensor
+        if isinstance(embeddings, torch.Tensor):
+            embeddings = embeddings.cpu().numpy()
     # Process embeddings based on task
     if args_task == "anto":
             "batch_size":2
         }
     else:
+        #sentence-transformers/all-MiniLM-L6-v2
         config = {
             "args_model": "llama3",
             "dataset_name": "mrpc",
         }
     run(**config)