import torch
from torch import nn
from ...nets.attention_model.context import AutoContext
from ...nets.attention_model.dynamic_embedding import AutoDynamicEmbedding
from ...nets.attention_model.multi_head_attention import (
AttentionScore,
MultiHeadAttention,
)
class Decoder(nn.Module):
r"""
The decoder of the Attention Model.
.. math::
\{\log(\pmb{p}_t)\},\pi = \mathrm{Decoder}(s, \pmb{h})
First of all, precompute the keys and values for the embedding :math:`\pmb{h}`:
.. math::
\pmb{k}, \pmb{v}, \pmb{k}^\prime = W^K\pmb{h}, W^V\pmb{h}, W^{K^\prime}\pmb{h}
and the projection of the graph embedding:
.. math::
W_{gc}\bar{\pmb{h}} \quad \text{ for } \bar{\pmb{h}} = \frac{1}{N}\sum\nolimits_i \pmb{h}_i.
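In the implementation, this precomputation is done once per instance by :meth:`_precompute`:

.. code-block:: python

    graph_context = self.project_fixed_context(embeddings.mean(1)).unsqueeze(-2)
    glimpse_key, glimpse_val, logit_key = self.project_node_embeddings(
        embeddings
    ).chunk(3, dim=-1)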
Then, the decoder proceeds autoregressively.
In each decoding step, a multi-head attention glimpse followed by a single-head pointer produces the selection probability of each node.
.. math::
\begin{aligned}
\pmb{h}_{(c)} &= [\bar{\pmb{h}}, \text{Context}(s,\pmb{h})] \\
q & = W^Q \pmb{h}_{(c)} = W_{gc}\bar{\pmb{h}} + W_{sc}\text{Context}(s,\pmb{h}) \\
q_{gl} &= \mathrm{MultiHeadAttention}(q,\pmb{k},\pmb{v},\mathrm{mask}_t) \\
\pmb{p}_t &= \mathrm{Softmax}(\mathrm{AttentionScore}_{\text{clip}}(q_{gl},\pmb{k}^\prime, \mathrm{mask}_t))\\
\pi_{t} &= \mathrm{DecodingStrategy}(\pmb{p}_t) \\
\mathrm{mask}_{t+1} &= \mathrm{mask}_t.update(\pi_t).
\end{aligned}
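In the implementation, one decoding step corresponds to :meth:`advance`, roughly:

.. code-block:: python

    context = self.context(node_embeddings, state)              # Context(s, h)
    query = graph_context + self.project_step_context(context)  # q, [batch, 1, embed_dim]
    glimpse = self.glimpse(query, glimpse_K, glimpse_V, mask)   # q_gl
    logits = self.pointer(glimpse, logit_K, mask)               # clipped attention scores
    log_p = torch.log_softmax(logits, dim=-1)                   # log p_t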
.. note::
If there are dynamic node features specified by :mod:`.dynamic_embedding`,
the key and value projections are updated in each decoding step by
.. math::
\begin{aligned}
\pmb{k}_{\text{dynamic}}, \pmb{v}_{\text{dynamic}}, \pmb{k}^\prime_{\text{dynamic}} &= \mathrm{DynamicEmbedding}(s)\\
\pmb{k} &= \pmb{k} + \pmb{k}_{\text{dynamic}}\\
\pmb{v} &= \pmb{v} +\pmb{v}_{\text{dynamic}} \\
\pmb{k}^\prime &= \pmb{k}^\prime +\pmb{k}^\prime_{\text{dynamic}}.
\end{aligned}
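In code, this update amounts to (cf. :meth:`advance`):

.. code-block:: python

    glimpse_key_dyn, glimpse_val_dyn, logit_key_dyn = self.dynamic_embedding(state)
    glimpse_K = glimpse_K + glimpse_key_dyn
    glimpse_V = glimpse_V + glimpse_val_dyn
    logit_K = logit_K + logit_key_dyn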
.. seealso::
* The :math:`\text{Context}` is defined in the :mod:`.context` module.
* The :math:`\text{AttentionScore}` is defined by the :class:`.AttentionScore` class.
* The :math:`\text{MultiHeadAttention}` is defined by the :class:`.MultiHeadAttention` class.
Args:
embedding_dim: the dimension of the embedded inputs
step_context_dim: the dimension of the context :math:`\text{Context}(s,\pmb{h})`
n_heads: number of heads in the :math:`\mathrm{MultiHeadAttention}`
problem: an object defining the state and the mask-updating rule of the problem
tanh_clipping: the clipping scale of the pointer (the attention layer before the output)
Inputs: input, embeddings
* **input** : dict of inputs, for example ``{'loc': tensor, 'depot': tensor, 'demand': tensor}`` for CVRP.
* **embeddings**: [batch, graph_size, embedding_dim]
Outputs: log_ps, pi
* **log_ps**: [batch, T, graph_size]
* **pi**: [batch, T]
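Example:
    A minimal usage sketch; ``problem``, ``input`` and ``embeddings`` are
    assumed to come from the surrounding pipeline, and the dimensions are
    illustrative:

    .. code-block:: python

        decoder = Decoder(
            embedding_dim=128, step_context_dim=256,
            n_heads=8, problem=problem, tanh_clipping=10.0,
        )
        decoder.set_decode_type("greedy")
        log_ps, pi = decoder(input, embeddings)  # embeddings: [batch, graph_size, 128]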
"""
def __init__(self, embedding_dim, step_context_dim, n_heads, problem, tanh_clipping):
super().__init__()
# For each node we compute (glimpse key, glimpse value, logit key) so 3 * embedding_dim
self.project_node_embeddings = nn.Linear(embedding_dim, 3 * embedding_dim, bias=False)
self.project_fixed_context = nn.Linear(embedding_dim, embedding_dim, bias=False)
self.project_step_context = nn.Linear(step_context_dim, embedding_dim, bias=False)
self.context = AutoContext(problem.NAME, {"context_dim": step_context_dim})
self.dynamic_embedding = AutoDynamicEmbedding(
problem.NAME, {"embedding_dim": embedding_dim}
)
self.glimpse = MultiHeadAttention(embedding_dim=embedding_dim, n_heads=n_heads)
self.pointer = AttentionScore(use_tanh=True, C=tanh_clipping)
self.decode_type = None
self.problem = problem
def forward(self, input, embeddings):
outputs = []
sequences = []
state = self.problem.make_state(input)
# Compute keys, values for the glimpse and keys for the logits once as they can be reused in every step
cached_embeddings = self._precompute(embeddings)
# Perform decoding steps
while not state.all_finished():
    log_p, mask = self.advance(cached_embeddings, state)
    # Select the indices of the next nodes in the sequences, result (batch_size) long
    # Squeeze out the steps dimension
    action = self.decode(log_p.exp()[:, 0, :], mask[:, 0, :])
    state = state.update(action)
    # Collect output of step
    outputs.append(log_p[:, 0, :])
    sequences.append(action)
# Collected lists, return Tensor
return torch.stack(outputs, 1), torch.stack(sequences, 1)
def set_decode_type(self, decode_type):
r"""
Currently supported decode types:
.. code-block:: python
["greedy", "sampling"]
"""
self.decode_type = decode_type
def decode(self, probs, mask):
r"""
Execute the decoding strategy specified by ``self.decode_type``.
Inputs:
* **probs**: [batch_size, graph_size]
* **mask** (bool): [batch_size, graph_size]
Outputs:
* **idxs** (int): indices of the selected nodes, [batch_size]
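For example, for a single instance with three nodes under greedy decoding
(a hypothetical toy input; node 2 is masked as infeasible):

.. code-block:: python

    probs = torch.tensor([[0.1, 0.7, 0.2]])
    mask = torch.tensor([[False, False, True]])
    decoder.decode(probs, mask)  # -> tensor([1])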
"""
assert (probs == probs).all(), "Probs should not contain any NaNs"
if self.decode_type == "greedy":
_, selected = probs.max(1)
assert not mask.gather(
1, selected.unsqueeze(-1)
).data.any(), "Decode greedy: infeasible action has maximum probability"
elif self.decode_type == "sampling":
selected = probs.multinomial(1).squeeze(1)
# Check if sampling went OK, can go wrong due to bug on GPU
# See https://discuss.pytorch.org/t/bad-behavior-of-multinomial-function/10232
while mask.gather(1, selected.unsqueeze(-1)).data.any():
print("Sampled bad values, resampling!")
selected = probs.multinomial(1).squeeze(1)
else:
assert False, "Unknown decode type"
return selected
def _precompute(self, embeddings):
# The fixed context projection of the graph embedding is calculated only once for efficiency
graph_embed = embeddings.mean(1)
# fixed context = (batch_size, 1, embed_dim) to make broadcastable with parallel timesteps
graph_context = self.project_fixed_context(graph_embed).unsqueeze(-2)
# The projection of the node embeddings for the attention is calculated once up front
glimpse_key, glimpse_val, logit_key = self.project_node_embeddings(embeddings).chunk(
3, dim=-1
)
cache = (
    embeddings,
    graph_context,
    glimpse_key,
    glimpse_val,
    logit_key,  # a single head is used for the final logits
)
return cache
def advance(self, cached_embeddings, state):
node_embeddings, graph_context, glimpse_K, glimpse_V, logit_K = cached_embeddings
# Compute the problem-specific step context, e.g. [prev node embedding | remaining capacity]
# [batch, 1, context_dim]
context = self.context(node_embeddings, state)
step_context = self.project_step_context(context) # [batch, 1, embed_dim]
query = graph_context + step_context # [batch, 1, embed_dim]
glimpse_key_dynamic, glimpse_val_dynamic, logit_key_dynamic = self.dynamic_embedding(state)
glimpse_K = glimpse_K + glimpse_key_dynamic
glimpse_V = glimpse_V + glimpse_val_dynamic
logit_K = logit_K + logit_key_dynamic
# Compute the mask
mask = state.get_mask()
# Compute logits (unnormalized log-probabilities)
logits, _ = self.calc_logits(query, glimpse_K, glimpse_V, logit_K, mask)
# Normalize to log-probabilities and return them together with the mask
log_p = torch.log_softmax(logits, dim=-1)
return log_p, mask
def calc_logits(self, query, glimpse_K, glimpse_V, logit_K, mask):
# Compute the glimpse with multi-head attention,
# then use the glimpse as the query to compute logits for each node
# [batch, 1, embed dim]
glimpse = self.glimpse(query, glimpse_K, glimpse_V, mask)
logits = self.pointer(glimpse, logit_K, mask)
return logits, glimpse