"""
Check my blog post on attention and transformer:
https://lilianweng.github.io/lil-log/2018/06/24/attention-attention.html
Implementations that helped me:
https://github.com/Kyubyong/transformer/
https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/models/transformer.py
http://nlp.seas.harvard.edu/2018/04/01/attention.html
Author: Lilian Weng (lilian.wengweng@gmail.com)
http://lilianweng.github.io/lil-log
Oct 2018
"""
import numpy as np
import tensorflow as tf
import tensorflow.contrib as tc
import json
import os
from utils import BaseModelMixin, REPO_ROOT
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
from data import recover_sentence, START_ID, PAD_ID
class Transformer(BaseModelMixin):
"""
See the architecture spec of Transformer in:
Vaswani et al. Attention is All You Need. NIPS 2017.
"""
def __init__(self, num_heads=8, d_model=512, d_ff=2048, num_enc_layers=6, num_dec_layers=6,
drop_rate=0.1, warmup_steps=400, pos_encoding_type='sinusoid',
ls_epsilon=0.1, use_label_smoothing=True,
model_name='transformer', tf_sess_config=None, **kwargs):
"""
Args:
num_heads (int): number of heads in multi-head attention unit.
d_model (int): dimension of the embeddings and of the model's hidden representations.
d_ff (int): dimension of the feed-forward layer.
num_enc_layers (int): number of encoder layers in the encoder.
num_dec_layers (int): number of decoder layers in the decoder.
drop_rate (float): drop rate in the dropout layer.
warmup_steps (int): number of warm-up steps for the learning rate schedule.
pos_encoding_type (str): type of positional encoding, 'sinusoid' or 'embedding'.
ls_epsilon (float): epsilon in the label smoothing function.
use_label_smoothing (bool): whether to use label smoothing on the ground-truth target.
model_name (str): name of the model; used for the variable scope and the checkpoint directory.
tf_sess_config (dict): dict config used when creating a tf.session.
"""
assert d_model % num_heads == 0
assert pos_encoding_type in ('sinusoid', 'embedding')
super().__init__(model_name, tf_sess_config=tf_sess_config)
self.h = num_heads
self.d_model = d_model
self.d_ff = d_ff
self.num_enc_layers = num_enc_layers
self.num_dec_layers = num_dec_layers
# Dropout regularization: added in every sublayer before layer_norm(...) and
# applied to embedding + positional encoding.
self.drop_rate = drop_rate
# Label smoothing epsilon
self.ls_epsilon = ls_epsilon
self.use_label_smoothing = use_label_smoothing
self.pos_encoding_type = pos_encoding_type
# For computing the learning rate
self.warmup_steps = warmup_steps
self.config = dict(
num_heads=self.h,
d_model=self.d_model,
d_ff=self.d_ff,
num_enc_layers=self.num_enc_layers,
num_dec_layers=self.num_dec_layers,
drop_rate=self.drop_rate,
warmup_steps=self.warmup_steps,
ls_epsilon=self.ls_epsilon,
use_label_smoothing=self.use_label_smoothing,
pos_encoding_type=self.pos_encoding_type,
model_name=self.model_name,
tf_sess_config=self.tf_sess_config,
)
# The following variables are inputs for build_model().
self._input_id2word = None
self._target_id2word = None
self._pad_id = 0
# The following variables will be constructed in build_model().
self._learning_rate = None
self._is_training = None
self._raw_input = None
self._raw_target = None
self._output = None
self._accuracy = None
self._loss = None
self._train_op = None
self._is_init = False
self.step = 0 # training step.
def build_model(self, dataset_name, input_id2word, target_id2word,
pad_id=PAD_ID, is_training=True, **train_params):
"""
Args:
dataset_name (str): name of the training dataset.
input_id2word (list): list of source words; the order matches the one-hot encoding vectors.
target_id2word (list): list of target words; the order matches the one-hot encoding vectors.
pad_id (int): the id of '<pad>' symbol.
is_training (bool): whether the graph is built for training.
train_params (dict): keys include 'lr', 'batch_size', and 'seq_len'.
"""
assert input_id2word[pad_id] == '<pad>'
assert target_id2word[pad_id] == '<pad>'
self.config.update(dict(
dataset=dataset_name,
input_id2word=input_id2word,
target_id2word=target_id2word,
pad_id=pad_id,
train_params=train_params,
))
batch_size = train_params.get('batch_size', 32)
seq_len = train_params.get('seq_len', 20)
self._input_id2word = input_id2word
self._target_id2word = target_id2word
self._pad_id = np.int32(pad_id)
input_vocab = len(input_id2word)
target_vocab = len(target_id2word)
with tf.variable_scope(self.model_name):
self._learning_rate = tf.placeholder(tf.float32, shape=None, name='learning_rate')
self._is_training = tf.placeholder_with_default(
is_training, shape=None, name="is_training")
self._raw_input = tf.placeholder(
tf.int32, shape=[batch_size, seq_len + 1], name='raw_input')
self._raw_target = tf.placeholder(
tf.int32, shape=[batch_size, seq_len + 1], name='raw_target')
# Add the offset on the input and target sentences.
# For the input we remove the starting <s> to keep the seq len consistent.
enc_inp = self._raw_input[:, 1:]
# For the decoder input, we remove the last element, since no prediction needs
# to be made after the final token.
dec_inp = self._raw_target[:, :-1] # starts with <s>
dec_target = self._raw_target[:, 1:] # starts with the first word
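# i.e. dec_target is dec_inp shifted left by one position, so at every step the
# model is trained to predict the next word.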
dec_target_ohe = tf.one_hot(dec_target, depth=target_vocab)
if self.use_label_smoothing:
dec_target_ohe = self.label_smoothing(dec_target_ohe)
# The input mask only hides the <pad> symbol.
input_mask = self.construct_padding_mask(enc_inp)
# The target mask hides both <pad> and future words.
target_mask = self.construct_padding_mask(dec_inp)
target_mask *= self.construct_autoregressive_mask(dec_inp)
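# Combined: a decoder position may only attend to target positions that are
# neither <pad> nor in the future.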
# Input embedding + positional encoding
inp_embed = self.preprocess(enc_inp, input_vocab, "input_preprocess")
enc_out = self.encoder(inp_embed, input_mask)
# Target embedding + positional encoding
dec_inp_embed = self.preprocess(dec_inp, target_vocab, "target_preprocess")
dec_out = self.decoder(dec_inp_embed, enc_out, input_mask, target_mask)
# Make the prediction out of the decoder output.
logits = tf.layers.dense(dec_out, target_vocab)  # [batch, seq_len, target_vocab]
self._output = tf.argmax(logits, axis=-1, output_type=tf.int32)
target_not_pad = tf.cast(tf.not_equal(dec_target, self._pad_id), tf.float32)
self._accuracy = tf.reduce_sum(
tf.cast(tf.equal(self._output, dec_target), tf.float32) * target_not_pad /
tf.cast(tf.reduce_sum(target_not_pad), tf.float32)
)
self._loss = tf.reduce_mean(
tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits, labels=dec_target_ohe))
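# Adam with beta2=0.98 and epsilon=1e-9 follows Section 5.3 of the paper; the
# learning rate itself is fed in through the placeholder (see train()).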
optim = tf.train.AdamOptimizer(learning_rate=self._learning_rate,
beta1=0.9, beta2=0.98, epsilon=1e-9)
self._train_op = optim.minimize(self._loss)
with tf.variable_scope(self.model_name + '_summary'):
tf.summary.scalar('loss', self._loss)
tf.summary.scalar('accuracy', self._accuracy)
self.merged_summary = tf.summary.merge_all()
@classmethod
def load_model(cls, model_name, is_training=False):
"""Returns a Transformer object, with checkpoint loaded.
"""
config_path = os.path.join(REPO_ROOT, 'checkpoints', model_name, 'model.config.json')
with open(config_path, 'r') as fin:
cfg = json.load(fin)
model = cls(**cfg)
model.build_model(cfg['dataset'], cfg['input_id2word'], cfg['target_id2word'],
pad_id=cfg['pad_id'], is_training=is_training,
**cfg['train_params'])
# model.sess.run([tf.global_variables_initializer(), tf.local_variables_initializer()])
model.load_checkpoint()
return model
def embedding(self, inp, vocab_size, zero_pad=True):
"""When the `zero_pad` flag is on, the first row in the embedding lookup table is
fixed to be an all-zero vector, corresponding to the '<pad>' symbol."""
embed_size = self.d_model
embed_lookup = tf.get_variable("embed_lookup", [vocab_size, embed_size], tf.float32,
initializer=tf.contrib.layers.xavier_initializer())
if zero_pad:
assert self._pad_id == 0
embed_lookup = tf.concat((tf.zeros(shape=[1, self.d_model]), embed_lookup[1:, :]), 0)
out = tf.nn.embedding_lookup(embed_lookup, inp)
return out
def _positional_encoding_embedding(self, inp):
batch_size, seq_len = inp.shape.as_list()
with tf.variable_scope('positional_embedding'):
# Tile [0, 1, ..., seq_len - 1] `batch_size` times => matrix [batch, seq_len]
pos_ind = tf.tile(tf.expand_dims(tf.range(seq_len), 0), [batch_size, 1])
return self.embedding(pos_ind, seq_len, zero_pad=False) # [batch, seq_len, d_model]
def _positional_encoding_sinusoid(self, inp):
"""
PE(pos, 2i) = sin(pos / 10000^{2i/d_model})
PE(pos, 2i+1) = cos(pos / 10000^{2i/d_model})
"""
batch, seq_len = inp.shape.as_list()
with tf.variable_scope('positional_sinusoid'):
# Tile [0, 1, ..., seq_len - 1] `batch` times => matrix [batch, seq_len]
pos_ind = tf.tile(tf.expand_dims(tf.range(seq_len), 0), [batch, 1])
# Compute the arguments for sin and cos: pos / 10000^{2i/d_model})
# Each dimension is sin/cos wave, as a function of the position.
pos_enc = np.array([
[pos / np.power(10000., 2. * (i // 2) / self.d_model) for i in range(self.d_model)]
for pos in range(seq_len)
]) # [seq_len, d_model]
# Apply sin to the even columns (dim 2i) and cos to the odd columns (dim 2i+1).
pos_enc[:, 0::2] = np.sin(pos_enc[:, 0::2]) # dim 2i
pos_enc[:, 1::2] = np.cos(pos_enc[:, 1::2]) # dim 2i+1
# Convert to a tensor
lookup_table = tf.convert_to_tensor(pos_enc, dtype=tf.float32) # [seq_len, d_model]
# Force the encoding at position 0 to all zeros, mirroring the zero_pad
# behavior of the word embedding lookup.
lookup_table = tf.concat((tf.zeros(shape=[1, self.d_model]), lookup_table[1:, :]), 0)
out = tf.nn.embedding_lookup(lookup_table, pos_ind) # [batch, seq_len, d_model]
return out
def positional_encoding(self, inp):
if self.pos_encoding_type == 'sinusoid':
pos_enc = self._positional_encoding_sinusoid(inp)
else:
pos_enc = self._positional_encoding_embedding(inp)
return pos_enc
def preprocess(self, inp, inp_vocab, scope):
# Pre-processing: embedding + positional encoding
# Output shape: [batch, seq_len, d_model]
with tf.variable_scope(scope):
out = self.embedding(inp, inp_vocab, zero_pad=True) + self.positional_encoding(inp)
out = tf.layers.dropout(out, rate=self.drop_rate, training=self._is_training)
return out
def layer_norm(self, inp):
return tc.layers.layer_norm(inp, center=True, scale=True)
def scaled_dot_product_attention(self, Q, K, V, mask=None):
"""
Args:
Q (tf.tensor): of shape (h * batch, q_size, d_model/num_heads)
K (tf.tensor): of shape (h * batch, k_size, d_model/num_heads)
V (tf.tensor): of shape (h * batch, k_size, d_model/num_heads)
mask (tf.tensor): of shape (h * batch, q_size, k_size)
"""
d = self.d_model // self.h
assert d == Q.shape[-1] == K.shape[-1] == V.shape[-1]
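# Attention(Q, K, V) = softmax(Q K^T / sqrt(d_k)) V, computed independently for
# each of the h*batch slices.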
out = tf.matmul(Q, tf.transpose(K, [0, 2, 1])) # [h*batch, q_size, k_size]
out = out / tf.sqrt(tf.cast(d, tf.float32)) # scaled by sqrt(d_k)
if mask is not None:
# masking out (0.0) => setting to -inf.
out = tf.multiply(out, mask) + (1.0 - mask) * (-1e10)
out = tf.nn.softmax(out) # [h * batch, q_size, k_size]
out = tf.layers.dropout(out, rate=self.drop_rate, training=self._is_training)
out = tf.matmul(out, V)  # [h * batch, q_size, d_model/num_heads]
return out
def multihead_attention(self, query, memory=None, mask=None, scope='attn'):
"""
Args:
query (tf.tensor): of shape (batch, q_size, d_model)
memory (tf.tensor): of shape (batch, m_size, d_model)
mask (tf.tensor): shape (batch, q_size, k_size)
Returns:
a tensor of shape (bs, q_size, d_model)
"""
if memory is None:
memory = query
with tf.variable_scope(scope):
# Linearly project to d_model dimensions: [batch, q_size/k_size, d_model]
Q = tf.layers.dense(query, self.d_model, activation=tf.nn.relu)
K = tf.layers.dense(memory, self.d_model, activation=tf.nn.relu)
V = tf.layers.dense(memory, self.d_model, activation=tf.nn.relu)
# Split the matrix into multiple heads and then concatenate them to form a larger
# batch size: [h*batch, q_size/k_size, d_model/num_heads]
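# e.g. with the defaults (batch_size=32, num_heads=8, d_model=512):
# [32, len, 512] -> eight [32, len, 64] slices -> [256, len, 64].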
Q_split = tf.concat(tf.split(Q, self.h, axis=2), axis=0)
K_split = tf.concat(tf.split(K, self.h, axis=2), axis=0)
V_split = tf.concat(tf.split(V, self.h, axis=2), axis=0)
mask_split = tf.tile(mask, [self.h, 1, 1])
# Apply scaled dot product attention
out = self.scaled_dot_product_attention(Q_split, K_split, V_split, mask=mask_split)
# Merge the multi-head back to the original shape
out = tf.concat(tf.split(out, self.h, axis=0), axis=2) # [bs, q_size, d_model]
# The final linear layer and dropout.
# out = tf.layers.dense(out, self.d_model)
# out = tf.layers.dropout(out, rate=self.drop_rate, training=self._is_training)
return out
def feed_forward(self, inp, scope='ff'):
"""
Position-wise fully connected feed-forward network, applied to each position
separately and identically. It can be implemented as (linear + ReLU + linear) or
(conv1d + ReLU + conv1d).
Args:
inp (tf.tensor): shape [batch, length, d_model]
"""
out = inp
with tf.variable_scope(scope):
# out = tf.layers.dense(out, self.d_ff, activation=tf.nn.relu)
# out = tf.layers.dropout(out, rate=self.drop_rate, training=self._is_training)
# out = tf.layers.dense(out, self.d_model, activation=None)
# by default, use_bias=True
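# A conv1d with kernel_size=1 applies the same dense transformation at every
# position, which is exactly the "position-wise" property described above.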
out = tf.layers.conv1d(out, filters=self.d_ff, kernel_size=1, activation=tf.nn.relu)
out = tf.layers.conv1d(out, filters=self.d_model, kernel_size=1)
return out
def construct_padding_mask(self, inp):
"""
Args: Original input of word ids, shape [batch, seq_len]
Returns: a mask of shape [batch, seq_len, seq_len], where <pad> is 0 and others are 1s.
"""
seq_len = inp.shape.as_list()[1]
mask = tf.cast(tf.not_equal(inp, self._pad_id), tf.float32) # mask '<pad>'
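# Broadcast along a new query axis: every query position shares the same
# key-side padding mask.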
mask = tf.tile(tf.expand_dims(mask, 1), [1, seq_len, 1])
return mask
def construct_autoregressive_mask(self, target):
"""
Args: Original target of word ids, shape [batch, seq_len]
Returns: a mask of shape [batch, seq_len, seq_len].
"""
batch_size, seq_len = target.shape.as_list()
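# Lower-triangular matrix of ones: row i keeps columns 0..i, so position i can
# only attend to itself and earlier positions.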
tri_matrix = np.zeros((seq_len, seq_len))
tri_matrix[np.tril_indices(seq_len)] = 1
mask = tf.convert_to_tensor(tri_matrix, dtype=tf.float32)
masks = tf.tile(tf.expand_dims(mask, 0), [batch_size, 1, 1]) # copies
return masks
def encoder_layer(self, inp, input_mask, scope):
"""
Args:
inp: tf.tensor of shape (batch, seq_len, embed_size)
input_mask: tf.tensor of shape (batch, seq_len, seq_len)
"""
out = inp
with tf.variable_scope(scope):
# One multi-head attention sub-layer + one position-wise feed-forward sub-layer.
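# Each sub-layer uses a post-norm residual connection: layer_norm(x + sublayer(x)).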
out = self.layer_norm(out + self.multihead_attention(out, mask=input_mask))
out = self.layer_norm(out + self.feed_forward(out))
return out
def encoder(self, inp, input_mask, scope='encoder'):
"""
Args:
inp (tf.tensor): shape (batch, seq_len, embed_size)
input_mask (tf.tensor): shape (batch, seq_len, seq_len)
scope (str): name of the variable scope.
"""
out = inp # now, (batch, seq_len, embed_size)
with tf.variable_scope(scope):
for i in range(self.num_enc_layers):
out = self.encoder_layer(out, input_mask, f'enc_{i}')
return out
def decoder_layer(self, target, enc_out, input_mask, target_mask, scope):
out = target
with tf.variable_scope(scope):
out = self.layer_norm(out + self.multihead_attention(
out, mask=target_mask, scope='self_attn'))
out = self.layer_norm(out + self.multihead_attention(
out, memory=enc_out, mask=input_mask))
out = self.layer_norm(out + self.feed_forward(out))
return out
def decoder(self, target, enc_out, input_mask, target_mask, scope='decoder'):
out = target
with tf.variable_scope(scope):
for i in range(self.num_dec_layers):
out = self.decoder_layer(out, enc_out, input_mask, target_mask, f'dec_{i}')
return out
def label_smoothing(self, inp):
"""
From the paper: "... employed label smoothing of epsilon = 0.1. This hurts perplexity,
as the model learns to be more unsure, but improves accuracy and BLEU score."
Args:
inp (tf.tensor): one-hot encoding vectors, [batch, seq_len, vocab_size]
"""
vocab_size = inp.shape.as_list()[-1]
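# e.g. with epsilon=0.1 and vocab_size=100, the correct class gets
# 0.9 + 0.1/100 = 0.901 and every other class gets 0.001.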
smoothed = (1.0 - self.ls_epsilon) * inp + (self.ls_epsilon / vocab_size)
return smoothed
def init(self):
"""Call .init() before training starts.
- Initialize the variables.
- Save the model config into json file.
"""
self.sess.run([tf.global_variables_initializer(), tf.local_variables_initializer()])
self._is_init = True
self.step = 0
self.save_checkpoint() # make sure saver is created.
# Save the model config into a json.
config_path = os.path.join(self.checkpoint_dir, 'model.config.json')
with open(config_path, 'w') as fout:
json.dump(self.config, fout)
def done(self):
"""Call .done() after training is complete.
"""
self.writer.close()
self.save_checkpoint() # Final checkpoint.
def train(self, input_ids, target_ids):
"""
One train step with one mini-batch.
Args:
input_ids (np.array): same shape as raw input placeholder.
target_ids (np.array): same shape as raw target placeholder.
Returns:
A dict of some meta information, including 'loss'.
"""
assert self._is_init, "Please call .init() before training starts."
self.step += 1
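# Warm-up learning rate schedule from the paper:
# lr = d_model^-0.5 * min(step^-0.5, step * warmup_steps^-1.5)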
lr = np.power(self.d_model, -0.5) * min(
np.power(self.step, -0.5),
self.step * np.power(self.warmup_steps, -1.5)
)
train_loss, train_accu, summary, _ = self.sess.run(
[self._loss, self._accuracy, self.merged_summary, self.train_op],
feed_dict={
self._learning_rate: lr,
self.raw_input_ph: input_ids.astype(np.int32),
self.raw_target_ph: target_ids.astype(np.int32),
self.is_training_ph: True,
})
self.writer.add_summary(summary, global_step=self.step)
if self.step % 10000 == 0:
# Save a model checkpoint every 10,000 steps.
self.save_checkpoint(step=self.step)
return {'train_loss': train_loss,
'train_accuracy': train_accu,
'learning_rate': lr,
'step': self.step}
def predict(self, input_ids):
"""
Make predictions autoregressively with greedy decoding.
Args:
input_ids (np.array): same shape as raw input placeholder.
Returns:
a np.array of the same shape as the raw target placeholder.
"""
assert list(input_ids.shape) == self.raw_input_ph.shape.as_list()
batch_size, inp_seq_len = self.raw_input_ph.shape.as_list()
input_ids = input_ids.astype(np.int32)
pred_ids = np.zeros(input_ids.shape, dtype=np.int32)
pred_ids[:, 0] = START_ID
# Predict one output token at a time, autoregressively.
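# Greedy decoding: re-run the decoder on the full partial output at every step
# and keep only the newly predicted column.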
for i in range(1, inp_seq_len):
# The decoder does not output <s>
next_pred = self.sess.run(self._output, feed_dict={
self.raw_input_ph: input_ids,
self.raw_target_ph: pred_ids,
self.is_training_ph: False,
})
# Only update the i-th column in one step.
pred_ids[:, i] = next_pred[:, i - 1]
# print(f"i={i}", pred_ids)
return pred_ids
def evaluate(self, input_ids, target_ids):
"""Make a prediction and compute BLEU score.
"""
pred_ids = self.predict(input_ids)
refs = []
hypos = []
for truth, pred in zip(target_ids, pred_ids):
truth_sent = recover_sentence(truth, self._target_id2word)
pred_sent = recover_sentence(pred, self._target_id2word)
refs.append([truth_sent])
hypos.append(pred_sent)
# Print the last pair for fun.
source_sent = recover_sentence(input_ids[-1], self._input_id2word)
print("[Source]", source_sent)
print("[Truth]", truth_sent)
print("[Translated]", pred_sent)
smoothie = SmoothingFunction().method4
bleu_score = corpus_bleu(refs, hypos, smoothing_function=smoothie)
return {'bleu_score': bleu_score * 100.}
# ============================= Utils ===============================
def _check_variable(self, v, name):
if v is None:
raise ValueError(f"Call build_model() to initialize {name}.")
return v
@property
def raw_input_ph(self):
return self._check_variable(self._raw_input, 'input placeholder')
@property
def raw_target_ph(self):
return self._check_variable(self._raw_target, 'target placeholder')
@property
def is_training_ph(self):
return self._check_variable(self._is_training, 'is_training placeholder')
@property
def train_op(self):
return self._check_variable(self._train_op, 'train_op')
@property
def loss(self):
return self._check_variable(self._loss, 'loss')
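# ============================ Usage sketch =============================
# A minimal, hedged outline of how this class is meant to be driven, based only
# on the methods defined above. `load_vocab_and_batches` is a hypothetical
# helper (not part of this repo) and 'my_dataset' is just a label; substitute
# whatever produces the id2word lists and the [batch_size, seq_len + 1] int
# arrays of padded sentence ids that build_model() and train() expect.
#
#   input_id2word, target_id2word, batches = load_vocab_and_batches()  # hypothetical
#   model = Transformer(num_heads=8, d_model=512, d_ff=2048)
#   model.build_model('my_dataset', input_id2word, target_id2word,
#                     pad_id=PAD_ID, is_training=True,
#                     batch_size=32, seq_len=20)
#   model.init()
#   for input_ids, target_ids in batches:
#       info = model.train(input_ids, target_ids)
#   model.done()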