Upload modeling_esm_plusplus.py with huggingface_hub
modeling_esm_plusplus.py CHANGED (+7 -3)
@@ -629,6 +629,7 @@ class EmbeddingMixin:
         tokenizer: PreTrainedTokenizerBase,
         batch_size: int = 2,
         max_len: int = 512,
+        truncate: bool = True,
         full_embeddings: bool = False,
         embed_dtype: torch.dtype = torch.float32,
         pooling_types: List[str] = ['mean'],
@@ -680,8 +681,9 @@ class EmbeddingMixin:
         )
         >>> # embedding_dict is a dictionary mapping sequences to their embeddings as tensors for .pth or numpy arrays for sql
         """
-        sequences = list(set([seq[:max_len] for seq in sequences]))
+        sequences = list(set([seq[:max_len] if truncate else seq for seq in sequences]))
         sequences = sorted(sequences, key=len, reverse=True)
+        hidden_size = self.config.hidden_size
         collate_fn = build_collator(tokenizer)
         device = self.device
         pooler = Pooler(pooling_types) if not full_embeddings else None
@@ -712,7 +714,7 @@ class EmbeddingMixin:
                     embeddings = get_embeddings(residue_embeddings, attention_mask).cpu()
                     for seq, emb, mask in zip(seqs, embeddings, attention_mask):
                         if full_embeddings:
-                            emb = emb[mask.bool()]
+                            emb = emb[mask.bool()].reshape(-1, hidden_size)
                         c.execute("INSERT OR REPLACE INTO embeddings VALUES (?, ?)",
                                   (seq, emb.cpu().numpy().tobytes()))

@@ -742,7 +744,9 @@ class EmbeddingMixin:
                 input_ids, attention_mask = batch['input_ids'].to(device), batch['attention_mask'].to(device)
                 residue_embeddings = self._embed(input_ids, attention_mask)
                 embeddings = get_embeddings(residue_embeddings, attention_mask).to(embed_dtype).cpu()
-                for seq, emb in zip(seqs, embeddings):
+                for seq, emb, mask in zip(seqs, embeddings, attention_mask):
+                    if full_embeddings:
+                        emb = emb[mask.bool()].reshape(-1, hidden_size)
                     embeddings_dict[seq] = emb

                 if save:
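Taken together, the change makes full_embeddings=True keep only the real (unpadded) positions with an explicit (num_residues, hidden_size) shape, in both the SQLite path and the in-memory dictionary path. A toy-tensor sketch of the new masking step; the hidden_size value and random tensor are illustrative only, the real value comes from self.config.hidden_size as added in the diff:

import torch

hidden_size = 4  # illustrative; the method reads this from self.config.hidden_size
# One padded item from a batch: 6 token positions, the last 2 are padding.
emb = torch.randn(6, hidden_size)
attention_mask = torch.tensor([1, 1, 1, 1, 0, 0])

# Same operation as in the diff: drop padded positions and make the
# (num_residues, hidden_size) layout explicit before storing/returning.
trimmed = emb[attention_mask.bool()].reshape(-1, hidden_size)
print(trimmed.shape)  # torch.Size([4, 4])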
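For the SQLite path, tobytes() stores a flat buffer with no shape or dtype metadata, which is why a known hidden size matters when reading rows back. A minimal read-back sketch, assuming full_embeddings=True and float32 storage, with placeholder values for the database path and hidden size; only the embeddings table and its two-column (sequence, blob) layout are taken from the INSERT statement in the diff:

import sqlite3
import numpy as np

DB_PATH = "embeddings.db"  # placeholder path, not specified in the diff
HIDDEN_SIZE = 960          # placeholder; use model.config.hidden_size in practice
DTYPE = np.float32         # assumes the stored embeddings were float32

with sqlite3.connect(DB_PATH) as conn:
    for seq, blob in conn.execute("SELECT * FROM embeddings"):
        # Each row holds one flattened (num_residues, hidden_size) matrix.
        emb = np.frombuffer(blob, dtype=DTYPE).reshape(-1, HIDDEN_SIZE)
        print(seq[:20], emb.shape)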