File size: 1,360 Bytes
70a6fd7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 |
import torch
import torch.nn as nn
import math
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    The inputs are projected with learned linear layers, split into
    ``n_heads`` heads of size ``d_model // n_heads``, attended per head,
    merged back together and passed through a final linear layer.

    Args:
        d_model: Size of the last dimension of the inputs and of the output.
        n_heads: Number of attention heads; must divide ``d_model`` evenly.

    Raises:
        AssertionError: If ``d_model`` is not divisible by ``n_heads``.
    """

    def __init__(self, d_model, n_heads):
        super().__init__()
        self.d_model = d_model
        self.n_heads = n_heads
        assert d_model % self.n_heads == 0, (
            f"d_model ({d_model}) must be divisible by n_heads ({n_heads})"
        )
        self.head_dim = d_model // n_heads
        # Creation order is kept (query, key, value, fc_out) so that seeded
        # parameter initialisation stays reproducible.
        self.query = nn.Linear(d_model, d_model)
        self.key = nn.Linear(d_model, d_model)
        self.value = nn.Linear(d_model, d_model)
        self.fc_out = nn.Linear(d_model, d_model)

    def _split_heads(self, x, batch_size):
        """Reshape ``(N, L, d_model)`` into ``(N, n_heads, L, head_dim)``."""
        return x.view(batch_size, -1, self.n_heads, self.head_dim).transpose(1, 2)

    def forward(self, query, key, value, mask=None):
        """Compute attention of ``query`` over ``key``/``value``.

        Args:
            query: Tensor whose first dimension is the batch size ``N`` and
                whose last dimension is ``d_model``.
            key: Tensor with the same batch size and last dimension
                ``d_model``.
            value: Tensor with the same batch size and last dimension
                ``d_model``; its sequence length must match ``key``.
            mask: Optional tensor broadcastable to the score tensor of shape
                ``(N, n_heads, L_query, L_key)``. Positions where
                ``mask == 0`` are excluded from attention.

        Returns:
            Tensor of shape ``(N, L_query, d_model)``.
        """
        N = query.shape[0]

        Q = self._split_heads(self.query(query), N)
        K = self._split_heads(self.key(key), N)
        V = self._split_heads(self.value(value), N)

        # Scaled dot-product scores: (N, n_heads, L_query, L_key).
        energy = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.head_dim)

        if mask is not None:
            # Use the most negative finite value of the score dtype. A fixed
            # -1e20 is not representable in float16, so masked_fill would
            # raise an overflow error in half precision / under autocast.
            # A finite value (rather than -inf) keeps fully masked rows
            # from turning into NaN after softmax.
            fill_value = torch.finfo(energy.dtype).min
            energy = energy.masked_fill(mask == 0, fill_value)

        attention = torch.softmax(energy, dim=-1)
        out = torch.matmul(attention, V)

        # Merge heads back: (N, n_heads, L_query, head_dim) -> (N, L_query, d_model).
        out = out.transpose(1, 2).contiguous().view(N, -1, self.n_heads * self.head_dim)
        return self.fc_out(out)
|