Delete pooling_coverage.py
pooling_coverage.py +0 -160
DELETED
@@ -1,160 +0,0 @@
```python
import torch


class GaussianCoveragePooling(torch.nn.Module):
    def __init__(self, coverage_chunks, sigma, alpha):
        """
        Custom pooling layer that computes weighted mean pooling using Gaussian-based weights.

        Args:
            coverage_chunks (int): Number of weighted pooling operations (N).
            sigma (float): Standard deviation for the Gaussian weighting.
            alpha (float): Weighting factor for merging with standard mean pooling.
        """
        super().__init__()
        self.coverage_chunks = coverage_chunks
        self.sigma = sigma  # Controls the width of the Gaussians
        self.alpha = alpha  # Blends the standard mean with the weighted means

    def forward(self, features, chunk_indicators=None):
        """
        Computes weighted mean pooling using Gaussian-based weights.

        Args:
            self (GaussianCoveragePooling): The pooling layer.
            features (dict): The token embeddings and attention mask.
            chunk_indicators (torch.Tensor[batch_size], optional): Index of the chunk to
                return for each example; leave as None to return embeddings for all
                chunks. Mainly useful for training, not inference.
        """
        # Get token embeddings and attention mask
        token_embeddings = features["token_embeddings"]  # (batch_size, seq_len, hidden_dim)
        attention_mask = features["attention_mask"].float().unsqueeze(-1)  # (batch_size, seq_len, 1)

        # Get shapes and device
        batch_size, seq_len, hidden_dim = token_embeddings.shape
        device = token_embeddings.device

        # Compute actual sequence lengths (ignoring padding)
        seq_lengths = attention_mask.squeeze(-1).sum(dim=1, keepdim=True)  # (batch_size, 1)
        max_seq_length = int(torch.max(seq_lengths).item())

        # Standard mean pooling
        sum_embeddings = torch.sum(token_embeddings * attention_mask, dim=1)
        sum_mask = torch.sum(attention_mask, dim=1).clamp(min=1e-9)
        standard_mean = sum_embeddings / sum_mask  # (batch_size, hidden_dim)

        # Compute chunk centers dynamically based on sequence length
        chunk_positions = torch.linspace(
            0, 1, self.coverage_chunks + 2, device=device
        )[1:-1]  # Excludes 0 and 1
        chunk_centers = chunk_positions * seq_lengths  # (batch_size, N)

        # Token positions per sequence
        token_positions = torch.arange(seq_len, device=device).float().unsqueeze(0)  # (1, seq_len)

        # Compute Gaussian weights (batch_size, N, seq_len)
        # (assumes padding to the longest sequence in the batch, so max_seq_length == seq_len)
        seq_lengths = seq_lengths.view(seq_lengths.shape[0], 1, 1).repeat(
            1, self.coverage_chunks, max_seq_length
        )
        gaussians = torch.exp(
            -0.5
            * (
                (token_positions.unsqueeze(1) - chunk_centers.unsqueeze(2))
                / (self.sigma * seq_lengths)
            )
            ** 2
        )

        # Mask out padding (batch_size, N, seq_len)
        gaussians = gaussians * attention_mask.squeeze(-1).unsqueeze(1)

        # Normalize the Gaussian weights so they sum to 1 per chunk
        gaussians /= gaussians.sum(dim=2, keepdim=True).clamp(min=1e-9)

        # Compute the weighted mean for each chunk (batch_size, N, hidden_dim)
        weighted_means = torch.einsum(
            "bns,bsh->bnh", gaussians.to(token_embeddings.dtype), token_embeddings
        )

        # Blend with standard mean pooling (batch_size, N, hidden_dim)
        combined_embeddings = (
            1 - self.alpha
        ) * standard_mean.unsqueeze(1) + self.alpha * weighted_means

        # Add an embedding for the entire document at index 0 (batch_size, N+1, hidden_dim)
        combined_embeddings = torch.cat(
            [torch.zeros_like(combined_embeddings[:, :1]), combined_embeddings], 1
        )
        combined_embeddings[:, 0:1, :] = standard_mean.unsqueeze(1)

        # Select the indicated chunk if provided
        if chunk_indicators is not None:
            combined_embeddings = combined_embeddings[
                torch.arange(combined_embeddings.size(0)), chunk_indicators
            ]

        # Normalize all the embeddings
        combined_embeddings = torch.nn.functional.normalize(
            combined_embeddings, p=2, dim=-1
        )

        # Flatten the final embeddings (batch_size, hidden_dim * (N+1))
        if chunk_indicators is None:
            sentence_embedding = combined_embeddings.reshape(
                batch_size, hidden_dim * (self.coverage_chunks + 1)
            )
        else:
            sentence_embedding = combined_embeddings

        # Return the final flattened sentence embedding
        features["sentence_embedding"] = sentence_embedding
        return features


def use_gaussian_coverage_pooling(m, coverage_chunks=10, sigma=0.05, alpha=1.0):
    """
    Adds a custom pooling layer that computes weighted mean pooling using Gaussian-based weights.

    Args:
        m (SentenceTransformer): The model to add the pooling layer to.
        coverage_chunks (int): Number of weighted pooling operations (N).
        sigma (float): Standard deviation for the Gaussian weighting.
        alpha (float): Weighting factor for merging with standard mean pooling.
    """
    if isinstance(m[1], GaussianCoveragePooling):
        m = unuse_gaussian_coverage_pooling(m)
    word_embedding_model = m[0]
    custom_pooling = GaussianCoveragePooling(
        coverage_chunks=coverage_chunks, sigma=sigma, alpha=alpha
    )
    old_pooling = m[1]
    new_m = m.__class__(modules=[word_embedding_model, custom_pooling])
    new_m.old_pooling = {"old_pooling": old_pooling}
    return new_m


def unuse_gaussian_coverage_pooling(m):
    """
    Removes the custom pooling layer and restores the original one.

    Args:
        m (SentenceTransformer): The model to remove the pooling layer from.
    """
    if isinstance(m[1], GaussianCoveragePooling):
        new_m = m.__class__(modules=[m[0], m.old_pooling["old_pooling"]])
        return new_m
    else:
        return m
```
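For reference, a minimal usage sketch of the deleted module (not part of the original file): it assumes the sentence-transformers package, that the module is still importable as pooling_coverage, and uses sentence-transformers/all-MiniLM-L6-v2 (hidden size 384) purely as an example base checkpoint.

```python
# Hypothetical usage sketch; the base checkpoint is an arbitrary example.
from sentence_transformers import SentenceTransformer

from pooling_coverage import (
    use_gaussian_coverage_pooling,
    unuse_gaussian_coverage_pooling,
)

base = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")  # hidden_dim = 384

# Wrap the model: chunk 0 of the output is plain mean pooling over the whole
# document, chunks 1..N are Gaussian-weighted means centred at evenly spaced
# positions, so the flattened embedding has size hidden_dim * (N + 1).
model = use_gaussian_coverage_pooling(base, coverage_chunks=10, sigma=0.05, alpha=1.0)

embeddings = model.encode(["a long document ..."], convert_to_numpy=True)
print(embeddings.shape)  # (1, 4224) == (1, 384 * 11)

# Restore the original pooling layer when coverage embeddings are no longer needed.
model = unuse_gaussian_coverage_pooling(model)
```

Note that use_gaussian_coverage_pooling stores the original pooling layer on the wrapped model, which is what unuse_gaussian_coverage_pooling puts back.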
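A small self-contained shape check (again, not from the original file) exercises GaussianCoveragePooling directly on random tensors and illustrates the two output modes of forward: a flattened (N + 1)-chunk embedding when chunk_indicators is None, and one selected chunk embedding per example otherwise.

```python
# Hypothetical shape check; only torch and the deleted module are required.
import torch

from pooling_coverage import GaussianCoveragePooling

batch_size, seq_len, hidden_dim = 2, 32, 16
pooling = GaussianCoveragePooling(coverage_chunks=4, sigma=0.05, alpha=1.0)

features = {
    "token_embeddings": torch.randn(batch_size, seq_len, hidden_dim),
    "attention_mask": torch.ones(batch_size, seq_len, dtype=torch.long),
}

# Without indicators: one flattened embedding covering all N + 1 chunks.
out = pooling(dict(features))
print(out["sentence_embedding"].shape)  # torch.Size([2, 80]) == (batch, hidden_dim * (N + 1))

# With indicators: one chunk embedding per example (index 0 = whole document).
out = pooling(dict(features), chunk_indicators=torch.tensor([0, 3]))
print(out["sentence_embedding"].shape)  # torch.Size([2, 16])
```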