AjayP13 committed
Commit c633cc6 · verified · 1 Parent(s): 2fe76e3

Update instruction_template_retriever.py

Files changed (1)
  1. instruction_template_retriever.py +155 -0
instruction_template_retriever.py CHANGED
@@ -11,6 +11,161 @@ from huggingface_hub import hf_hub_download
 from sentence_transformers import SentenceTransformer
 
 
+class GaussianCoveragePooling(torch.nn.Module):
+    def __init__(self, coverage_chunks, sigma, alpha):
+        """
+        Custom pooling layer that computes weighted mean pooling using Gaussian-based weights.
+        Args:
+            coverage_chunks (int): Number of weighted pooling operations (N).
+            sigma (float): Standard deviation for Gaussian weighting.
+            alpha (float): Weighting factor for merging with standard mean pooling.
+        """
+        super().__init__()
+        self.coverage_chunks = coverage_chunks
+        self.sigma = sigma  # Controls width of Gaussians
+        self.alpha = alpha  # Blends standard mean with weighted mean
+
+    def forward(self, features, chunk_indicators=None):
+        """
+        Computes weighted mean pooling using Gaussian-based weights.
+        Args:
+            self (GaussianCoveragePooling): The pooling module.
+            features (dict): The token embeddings and attention mask.
+            chunk_indicators (tensor[bz, 1]): Index indicators to return a specific chunk;
+                leave as None to return embeddings for all chunks. Mainly useful for
+                training; leave as None for inference.
+        """
+
+        # Get token embeddings and attention mask
+        token_embeddings = features[
+            "token_embeddings"
+        ]  # (batch_size, seq_len, hidden_dim)
+        attention_mask = (
+            features["attention_mask"].float().unsqueeze(-1)
+        )  # (batch_size, seq_len, 1)
+
+        # Get shapes and devices
+        batch_size, seq_len, hidden_dim = token_embeddings.shape
+        device = token_embeddings.device
+
+        # Compute actual sequence lengths (ignoring padding)
+        # (batch_size, 1)
+        seq_lengths = attention_mask.squeeze(-1).sum(dim=1, keepdim=True)
+        max_seq_length = int(torch.max(seq_lengths).item())
+
+        # Standard mean pooling
+        sum_embeddings = torch.sum(token_embeddings * attention_mask, dim=1)
+        sum_mask = torch.sum(attention_mask, dim=1).clamp(min=1e-9)
+        standard_mean = sum_embeddings / sum_mask  # (batch_size, hidden_dim)
+
+        # Compute chunk centers dynamically based on sequence length
+        chunk_positions = torch.linspace(0, 1, self.coverage_chunks + 2, device=device)[
+            1:-1
+        ]  # Excludes 0 and 1
+        chunk_centers = chunk_positions * seq_lengths  # (batch_size, N)
+
+        # Token positions per sequence (batch_size, seq_len)
+        token_positions = (
+            torch.arange(seq_len, device=device).float().unsqueeze(0)
+        )  # (1, seq_len)
+
+        # Compute Gaussian weights (batch_size, N, seq_len)
+        seq_lengths = seq_lengths.view(seq_lengths.shape[0], 1, 1).repeat(
+            1, self.coverage_chunks, max_seq_length
+        )
+        gaussians = torch.exp(
+            -0.5
+            * (
+                (token_positions.unsqueeze(1) - chunk_centers.unsqueeze(2))
+                / (self.sigma * seq_lengths)
+            )
+            ** 2
+        )
+
+        # Mask out padding and normalize Gaussian weights per sequence
+        # (batch_size, N, seq_len)
+        gaussians = gaussians * attention_mask.squeeze(-1).unsqueeze(1)
+
+        # Normalize against gaussian weights
+        gaussians /= gaussians.sum(dim=2, keepdim=True).clamp(min=1e-9)
+
+        # Compute weighted mean for each chunk (batch_size, N, hidden_dim)
+        weighted_means = torch.einsum(
+            "bns,bsh->bnh", gaussians.to(token_embeddings.dtype), token_embeddings
+        )
+
+        # Blend with standard mean pooling
+        # (batch_size, N, hidden_dim)
+        combined_embeddings = (1 - self.alpha) * standard_mean.unsqueeze(
+            1
+        ) + self.alpha * weighted_means
+
+        # Add an embedding for the entire document at index 0
+        # (batch_size, N+1, hidden_dim)
+        combined_embeddings = torch.cat(
+            [torch.zeros_like(combined_embeddings[:, :1]), combined_embeddings], 1
+        )
+        combined_embeddings[:, 0:1, :] = standard_mean.unsqueeze(1)
+
+        # Select the indicator if provided
+        if chunk_indicators is not None:
+            combined_embeddings = combined_embeddings[
+                torch.arange(combined_embeddings.size(0)), chunk_indicators
+            ]
+
+        # Normalize all the embeddings
+        combined_embeddings = torch.nn.functional.normalize(
+            combined_embeddings, p=2, dim=-1
+        )
+
+        # Flatten final embeddings (batch_size, hidden_dim * (N+1))
+        if chunk_indicators is None:
+            sentence_embedding = combined_embeddings.reshape(
+                batch_size, hidden_dim * (self.coverage_chunks + 1)
+            )
+        else:
+            sentence_embedding = combined_embeddings
+
+        # Return the final flattened sentence embedding
+        features["sentence_embedding"] = sentence_embedding
+        return features
+
+
+def use_gaussian_coverage_pooling(m, coverage_chunks=10, sigma=0.05, alpha=1.0):
+    """
+    Adds a custom pooling layer that computes weighted mean pooling using Gaussian-based weights.
+    Args:
+        m (SentenceTransformer): The model to add the pooling layer to.
+        coverage_chunks (int): Number of weighted pooling operations (N).
+        sigma (float): Standard deviation for Gaussian weighting.
+        alpha (float): Weighting factor for merging with standard mean pooling.
+    """
+    if isinstance(m[1], GaussianCoveragePooling):
+        m = unuse_gaussian_coverage_pooling(m)
+    word_embedding_model = m[0]
+    custom_pooling = GaussianCoveragePooling(
+        coverage_chunks=coverage_chunks, sigma=sigma, alpha=alpha
+    )
+    old_pooling = m[1]
+    new_m = m.__class__(modules=[word_embedding_model, custom_pooling])
+    new_m.old_pooling = {"old_pooling": old_pooling}
+    return new_m
+
+
+def unuse_gaussian_coverage_pooling(m):
+    """
+    Removes the custom pooling layer.
+    Args:
+        m (SentenceTransformer): The model to remove the pooling layer from.
+    """
+
+    if isinstance(m[1], GaussianCoveragePooling):
+        new_m = m.__class__(modules=[m[0], m.old_pooling["old_pooling"]])
+        return new_m
+    else:
+        return m
+
+
 class InstructionTemplateRetriever:
     FINETEMPLATES_REVISION = "831ab22c90f9da011bd972585afdf609f40fa54b"
     RETRIEVAL_EMBEDDING_NAME = "fineinstructions/matching_embedding"
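
For readers skimming the diff, here is a minimal usage sketch (not part of the commit) of how the new helpers might wrap a SentenceTransformer. The model name, input text, and module layout are illustrative assumptions, not taken from this file:

# Illustrative sketch only: assumes the repo's matching embedding loads as a plain
# SentenceTransformer whose modules are [transformer, pooling], and uses placeholder text.
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("fineinstructions/matching_embedding")

# Swap in Gaussian coverage pooling: one whole-document embedding at index 0
# plus N=10 chunk embeddings, flattened into a single vector per input.
model = use_gaussian_coverage_pooling(model, coverage_chunks=10, sigma=0.05, alpha=1.0)

emb = model.encode(["A long document to cover with chunk-level embeddings."])
# Expected flattened width: hidden_dim * (coverage_chunks + 1)

# Restore the original pooling layer when coverage pooling is no longer needed.
model = unuse_gaussian_coverage_pooling(model)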