""" Check my blog post on attention and transformer: https://lilianweng.github.io/lil-log/2018/06/24/attention-attention.html Implementations that helped me: https://github.com/Kyubyong/transformer/ https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/models/transformer.py http://nlp.seas.harvard.edu/2018/04/01/attention.html Author: Lilian Weng (lilian.wengweng@gmail.com) http://lilianweng.github.io/lil-log Oct 2018 """ import numpy as np import tensorflow as tf import tensorflow.contrib as tc import json import os from utils import BaseModelMixin, REPO_ROOT from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction from data import recover_sentence, START_ID, PAD_ID class Transformer(BaseModelMixin): """ See the architecture spec of Transformer in: Vaswani et al. Attention is All You Need. NIPS 2017. """ def __init__(self, num_heads=8, d_model=512, d_ff=2048, num_enc_layers=6, num_dec_layers=6, drop_rate=0.1, warmup_steps=400, pos_encoding_type='sinusoid', ls_epsilon=0.1, use_label_smoothing=True, model_name='transformer', tf_sess_config=None, **kwargs): """ Args: num_heads (int): number of heads in multi-head attention unit. d_model (int): dimension of embedding size and the model data flow. d_ff (int): dimension of the feed-forward layer. num_enc_layers (int): number of encoder layers in the encoder. num_dec_layers (int): number of decoder layers in the decoder. drop_rate (float): drop rate in the dropout layer. warmup_steps (int) pos_encoding_type (str): type of positional encoding, 'sinusoid' or 'embedding'. ls_epsilon (float): epsilon in the label smoothing function. use_label_smoothing (bool): whether use label smoothing for the truth target. model_name (str): tf_sess_config (dict): dict config used when creating a tf.session. """ assert d_model % num_heads == 0 assert pos_encoding_type in ('sinusoid', 'embedding') super().__init__(model_name, tf_sess_config=tf_sess_config) self.h = num_heads self.d_model = d_model self.d_ff = d_ff self.num_enc_layers = num_enc_layers self.num_dec_layers = num_dec_layers # Dropout regularization: added in every sublayer before layer_norm(...) and # applied to embedding + positional encoding. self.drop_rate = drop_rate # Label smoothing epsilon self.ls_epsilon = ls_epsilon self.use_label_smoothing = use_label_smoothing self.pos_encoding_type = pos_encoding_type # For computing the learning rate self.warmup_steps = warmup_steps self.config = dict( num_heads=self.h, d_model=self.d_model, d_ff=self.d_ff, num_enc_layers=self.num_enc_layers, num_dec_layers=self.num_dec_layers, drop_rate=self.drop_rate, warmup_steps=self.warmup_steps, ls_epsilon=self.ls_epsilon, use_label_smoothing=self.use_label_smoothing, pos_encoding_type=self.pos_encoding_type, model_name=self.model_name, tf_sess_config=self.tf_sess_config, ) # The following variables are inputs for build_model(). self._input_id2word = None self._target_id2word = None self._pad_id = 0 # The following variables will be constructed in build_model(). self._learning_rate = None self._is_training = None self._raw_input = None self._raw_target = None self._output = None self._accuracy = None self._loss = None self._train_op = None self._is_init = False self.step = 0 # training step. def build_model(self, dataset_name, input_id2word, target_id2word, pad_id=PAD_ID, is_training=True, **train_params): """ Args: dataset_name (str): name of the training dataset. input_id2word (list): list of source words and the order matches ohe vectors. target_id2word (list): list of target words and the order matches ohe vectors. pad_id (int): the id of '<pad>' symbol. is_training (bool) train_params (dict): keys include 'lr', 'batch_size', and 'seq_len'. """ assert input_id2word[pad_id] == '<pad>' assert target_id2word[pad_id] == '<pad>' self.config.update(dict( dataset=dataset_name, input_id2word=input_id2word, target_id2word=target_id2word, pad_id=pad_id, train_params=train_params, )) batch_size = train_params.get('batch_size', 32) seq_len = train_params.get('seq_len', 20) self._input_id2word = input_id2word self._target_id2word = target_id2word self._pad_id = np.int32(pad_id) input_vocab = len(input_id2word) target_vocab = len(target_id2word) with tf.variable_scope(self.model_name): self._learning_rate = tf.placeholder(tf.float32, shape=None, name='learning_rate') self._is_training = tf.placeholder_with_default( is_training, shape=None, name="is_training") self._raw_input = tf.placeholder( tf.int32, shape=[batch_size, seq_len + 1], name='raw_input') self._raw_target = tf.placeholder( tf.int32, shape=[batch_size, seq_len + 1], name='raw_target') # Add the offset on the input and target sentences. # For the input we remove the starting <s> to keep the seq len consistent. enc_inp = self._raw_input[:, 1:] # For the decoder input, we remove the last element, as no more future prediction # is gonna be made based on it. dec_inp = self._raw_target[:, :-1] # starts with <s> dec_target = self._raw_target[:, 1:] # starts with the first word dec_target_ohe = tf.one_hot(dec_target, depth=target_vocab) if self.use_label_smoothing: dec_target_ohe = self.label_smoothing(dec_target_ohe) # The input mask only hides the <pad> symbol. input_mask = self.construct_padding_mask(enc_inp) # The target mask hides both <pad> and future words. target_mask = self.construct_padding_mask(dec_inp) target_mask *= self.construct_autoregressive_mask(dec_inp) # Input embedding + positional encoding inp_embed = self.preprocess(enc_inp, input_vocab, "input_preprocess") enc_out = self.encoder(inp_embed, input_mask) # Target embedding + positional encoding dec_inp_embed = self.preprocess(dec_inp, target_vocab, "target_preprocess") dec_out = self.decoder(dec_inp_embed, enc_out, input_mask, target_mask) # Make the prediction out of the decoder output. logits = tf.layers.dense(dec_out, target_vocab) # [batch, target_vocab] self._output = tf.argmax(logits, axis=-1, output_type=tf.int32) target_not_pad = tf.cast(tf.not_equal(dec_target, self._pad_id), tf.float32) self._accuracy = tf.reduce_sum( tf.cast(tf.equal(self._output, dec_target), tf.float32) * target_not_pad / tf.cast(tf.reduce_sum(target_not_pad), tf.float32) ) self._loss = tf.reduce_mean( tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits, labels=dec_target_ohe)) optim = tf.train.AdamOptimizer(learning_rate=self._learning_rate, beta1=0.9, beta2=0.98, epsilon=1e-9) self._train_op = optim.minimize(self._loss) with tf.variable_scope(self.model_name + '_summary'): tf.summary.scalar('loss', self._loss) tf.summary.scalar('accuracy', self._accuracy) self.merged_summary = tf.summary.merge_all() @classmethod def load_model(cls, model_name, is_training=False): """Returns a Transformer object, with checkpoint loaded. """ config_path = os.path.join(REPO_ROOT, 'checkpoints', model_name, 'model.config.json') with open(config_path, 'r') as fin: cfg = json.load(fin) model = cls(**cfg) model.build_model(cfg['dataset'], cfg['input_id2word'], cfg['target_id2word'], pad_id=cfg['pad_id'], is_training=is_training, **cfg['train_params']) # model.sess.run([tf.global_variables_initializer(), tf.local_variables_initializer()]) model.load_checkpoint() return model def embedding(self, inp, vocab_size, zero_pad=True): """When the `zero_pad` flag is on, the first row in the embedding lookup table is fixed to be an all-zero vector, corresponding to the '<pad>' symbol.""" embed_size = self.d_model embed_lookup = tf.get_variable("embed_lookup", [vocab_size, embed_size], tf.float32, initializer=tf.contrib.layers.xavier_initializer()) if zero_pad: assert self._pad_id == 0 embed_lookup = tf.concat((tf.zeros(shape=[1, self.d_model]), embed_lookup[1:, :]), 0) out = tf.nn.embedding_lookup(embed_lookup, inp) return out def _positional_encoding_embedding(self, inp): batch_size, seq_len = inp.shape.as_list() with tf.variable_scope('positional_embedding'): # Copy [0, 1, ..., `inp_size`] by `batch_size` times => matrix [batch, seq_len] pos_ind = tf.tile(tf.expand_dims(tf.range(seq_len), 0), [batch_size, 1]) return self.embedding(pos_ind, seq_len, zero_pad=False) # [batch, seq_len, d_model] def _positional_encoding_sinusoid(self, inp): """ PE(pos, 2i) = sin(pos / 10000^{2i/d_model}) PE(pos, 2i+1) = cos(pos / 10000^{2i/d_model}) """ batch, seq_len = inp.shape.as_list() with tf.variable_scope('positional_sinusoid'): # Copy [0, 1, ..., `inp_size`] by `batch_size` times => matrix [batch, seq_len] pos_ind = tf.tile(tf.expand_dims(tf.range(seq_len), 0), [batch, 1]) # Compute the arguments for sin and cos: pos / 10000^{2i/d_model}) # Each dimension is sin/cos wave, as a function of the position. pos_enc = np.array([ [pos / np.power(10000., 2. * (i // 2) / self.d_model) for i in range(self.d_model)] for pos in range(seq_len) ]) # [seq_len, d_model] # Apply the cosine to even columns and sin to odds. pos_enc[:, 0::2] = np.sin(pos_enc[:, 0::2]) # dim 2i pos_enc[:, 1::2] = np.cos(pos_enc[:, 1::2]) # dim 2i+1 # Convert to a tensor lookup_table = tf.convert_to_tensor(pos_enc, dtype=tf.float32) # [seq_len, d_model] if True: lookup_table = tf.concat((tf.zeros(shape=[1, self.d_model]), lookup_table[1:, :]), 0) out = tf.nn.embedding_lookup(lookup_table, pos_ind) # [batch, seq_len, d_model] return out def positional_encoding(self, inp): if self.pos_encoding_type == 'sinusoid': pos_enc = self._positional_encoding_sinusoid(inp) else: pos_enc = self._positional_encoding_embedding(inp) return pos_enc def preprocess(self, inp, inp_vocab, scope): # Pre-processing: embedding + positional encoding # Output shape: [batch, seq_len, d_model] with tf.variable_scope(scope): out = self.embedding(inp, inp_vocab, zero_pad=True) + self.positional_encoding(inp) out = tf.layers.dropout(out, rate=self.drop_rate, training=self._is_training) return out def layer_norm(self, inp): return tc.layers.layer_norm(inp, center=True, scale=True) def scaled_dot_product_attention(self, Q, K, V, mask=None): """ Args: Q (tf.tensor): of shape (h * batch, q_size, d_model) K (tf.tensor): of shape (h * batch, k_size, d_model) V (tf.tensor): of shape (h * batch, k_size, d_model) mask (tf.tensor): of shape (h * batch, q_size, k_size) """ d = self.d_model // self.h assert d == Q.shape[-1] == K.shape[-1] == V.shape[-1] out = tf.matmul(Q, tf.transpose(K, [0, 2, 1])) # [h*batch, q_size, k_size] out = out / tf.sqrt(tf.cast(d, tf.float32)) # scaled by sqrt(d_k) if mask is not None: # masking out (0.0) => setting to -inf. out = tf.multiply(out, mask) + (1.0 - mask) * (-1e10) out = tf.nn.softmax(out) # [h * batch, q_size, k_size] out = tf.layers.dropout(out, training=self._is_training) out = tf.matmul(out, V) # [h * batch, q_size, d_model] return out def multihead_attention(self, query, memory=None, mask=None, scope='attn'): """ Args: query (tf.tensor): of shape (batch, q_size, d_model) memory (tf.tensor): of shape (batch, m_size, d_model) mask (tf.tensor): shape (batch, q_size, k_size) Returns:h a tensor of shape (bs, q_size, d_model) """ if memory is None: memory = query with tf.variable_scope(scope): # Linear project to d_model dimension: [batch, q_size/k_size, d_model] Q = tf.layers.dense(query, self.d_model, activation=tf.nn.relu) K = tf.layers.dense(memory, self.d_model, activation=tf.nn.relu) V = tf.layers.dense(memory, self.d_model, activation=tf.nn.relu) # Split the matrix to multiple heads and then concatenate to have a larger # batch size: [h*batch, q_size/k_size, d_model/num_heads] Q_split = tf.concat(tf.split(Q, self.h, axis=2), axis=0) K_split = tf.concat(tf.split(K, self.h, axis=2), axis=0) V_split = tf.concat(tf.split(V, self.h, axis=2), axis=0) mask_split = tf.tile(mask, [self.h, 1, 1]) # Apply scaled dot product attention out = self.scaled_dot_product_attention(Q_split, K_split, V_split, mask=mask_split) # Merge the multi-head back to the original shape out = tf.concat(tf.split(out, self.h, axis=0), axis=2) # [bs, q_size, d_model] # The final linear layer and dropout. # out = tf.layers.dense(out, self.d_model) # out = tf.layers.dropout(out, rate=self.drop_rate, training=self._is_training) return out def feed_forwad(self, inp, scope='ff'): """ Position-wise fully connected feed-forward network, applied to each position separately and identically. It can be implemented as (linear + ReLU + linear) or (conv1d + ReLU + conv1d). Args: inp (tf.tensor): shape [batch, length, d_model] """ out = inp with tf.variable_scope(scope): # out = tf.layers.dense(out, self.d_ff, activation=tf.nn.relu) # out = tf.layers.dropout(out, rate=self.drop_rate, training=self._is_training) # out = tf.layers.dense(out, self.d_model, activation=None) # by default, use_bias=True out = tf.layers.conv1d(out, filters=self.d_ff, kernel_size=1, activation=tf.nn.relu) out = tf.layers.conv1d(out, filters=self.d_model, kernel_size=1) return out def construct_padding_mask(self, inp): """ Args: Original input of word ids, shape [batch, seq_len] Returns: a mask of shape [batch, seq_len, seq_len], where <pad> is 0 and others are 1s. """ seq_len = inp.shape.as_list()[1] mask = tf.cast(tf.not_equal(inp, self._pad_id), tf.float32) # mask '<pad>' mask = tf.tile(tf.expand_dims(mask, 1), [1, seq_len, 1]) return mask def construct_autoregressive_mask(self, target): """ Args: Original target of word ids, shape [batch, seq_len] Returns: a mask of shape [batch, seq_len, seq_len]. """ batch_size, seq_len = target.shape.as_list() tri_matrix = np.zeros((seq_len, seq_len)) tri_matrix[np.tril_indices(seq_len)] = 1 mask = tf.convert_to_tensor(tri_matrix, dtype=tf.float32) masks = tf.tile(tf.expand_dims(mask, 0), [batch_size, 1, 1]) # copies return masks def encoder_layer(self, inp, input_mask, scope): """ Args: inp: tf.tensor of shape (batch, seq_len, embed_size) input_mask: tf.tensor of shape (batch, seq_len, seq_len) """ out = inp with tf.variable_scope(scope): # One multi-head attention + one feed-forword out = self.layer_norm(out + self.multihead_attention(out, mask=input_mask)) out = self.layer_norm(out + self.feed_forwad(out)) return out def encoder(self, inp, input_mask, scope='encoder'): """ Args: inp (tf.tensor): shape (batch, seq_len, embed_size) input_mask (tf.tensor): shape (batch, seq_len, seq_len) scope (str): name of the variable scope. """ out = inp # now, (batch, seq_len, embed_size) with tf.variable_scope(scope): for i in range(self.num_enc_layers): out = self.encoder_layer(out, input_mask, f'enc_{i}') return out def decoder_layer(self, target, enc_out, input_mask, target_mask, scope): out = target with tf.variable_scope(scope): out = self.layer_norm(out + self.multihead_attention( out, mask=target_mask, scope='self_attn')) out = self.layer_norm(out + self.multihead_attention( out, memory=enc_out, mask=input_mask)) out = self.layer_norm(out + self.feed_forwad(out)) return out def decoder(self, target, enc_out, input_mask, target_mask, scope='decoder'): out = target with tf.variable_scope(scope): for i in range(self.num_enc_layers): out = self.decoder_layer(out, enc_out, input_mask, target_mask, f'dec_{i}') return out def label_smoothing(self, inp): """ From the paper: "... employed label smoothing of epsilon = 0.1. This hurts perplexity, as the model learns to be more unsure, but improves accuracy and BLEU score." Args: inp (tf.tensor): one-hot encoding vectors, [batch, seq_len, vocab_size] """ vocab_size = inp.shape.as_list()[-1] smoothed = (1.0 - self.ls_epsilon) * inp + (self.ls_epsilon / vocab_size) return smoothed def init(self): """Call .init() before training starts. - Initialize the variables. - Save the model config into json file. """ self.sess.run([tf.global_variables_initializer(), tf.local_variables_initializer()]) self._is_init = True self.step = 0 self.save_checkpoint() # make sure saver is created. # Save the model config into a json. config_path = os.path.join(self.checkpoint_dir, 'model.config.json') with open(config_path, 'w') as fout: json.dump(self.config, fout) def done(self): """Call .done() after training is complete. """ self.writer.close() self.save_checkpoint() # Final checkpoint. def train(self, input_ids, target_ids): """ One train step with one mini-batch. Args: input_ids (np.array): same shape as raw input placeholder. target_ids (np.array): same shape as raw target placeholder. Returns: A dict of some meta information, including 'loss'. """ assert self._is_init, "Please call .init() before training starts." self.step += 1 lr = np.power(self.d_model, -0.5) * min( np.power(self.step, -0.5), self.step * np.power(self.warmup_steps, -1.5) ) train_loss, train_accu, summary, _ = self.sess.run( [self._loss, self._accuracy, self.merged_summary, self.train_op], feed_dict={ self._learning_rate: lr, self.raw_input_ph: input_ids.astype(np.int32), self.raw_target_ph: target_ids.astype(np.int32), self.is_training_ph: True, }) self.writer.add_summary(summary, global_step=self.step) if self.step % 10000 == 0: # Save the model checkpoint every 1000 steps. self.save_checkpoint(step=self.step) return {'train_loss': train_loss, 'train_accuracy': train_accu, 'learning_rate': lr, 'step': self.step} def predict(self, input_ids): """ Make predict in an autoregressive way. Args: input_ids (np.array): same shape as raw input placeholder. Returns: a np.array of the same shape as the raw target placeholder. """ assert list(input_ids.shape) == self.raw_input_ph.shape.as_list() batch_size, inp_seq_len = self.raw_input_ph.shape.as_list() input_ids = input_ids.astype(np.int32) pred_ids = np.zeros(input_ids.shape, dtype=np.int32) pred_ids[:, 0] = START_ID # Predict one output a time autoregressively. for i in range(1, inp_seq_len): # The decoder does not output <s> next_pred = self.sess.run(self._output, feed_dict={ self.raw_input_ph: input_ids, self.raw_target_ph: pred_ids, self.is_training_ph: False, }) # Only update the i-th column in one step. pred_ids[:, i] = next_pred[:, i - 1] # print(f"i={i}", pred_ids) return pred_ids def evaluate(self, input_ids, target_ids): """Make a prediction and compute BLEU score. """ pred_ids = self.predict(input_ids) refs = [] hypos = [] for truth, pred in zip(target_ids, pred_ids): truth_sent = recover_sentence(truth, self._target_id2word) pred_sent = recover_sentence(pred, self._target_id2word) refs.append([truth_sent]) hypos.append(pred_sent) # Print the last pair for fun. source_sent = recover_sentence(input_ids[-1], self._input_id2word) print("[Source]", source_sent) print("[Truth]", truth_sent) print("[Translated]", pred_sent) smoothie = SmoothingFunction().method4 bleu_score = corpus_bleu(refs, hypos, smoothing_function=smoothie) return {'bleu_score': bleu_score * 100.} # ============================= Utils =============================== def _check_variable(self, v, name): if v is None: raise ValueError(f"Call build_model() to initialize {name}.") return v @property def raw_input_ph(self): return self._check_variable(self._raw_input, 'input placeholder') @property def raw_target_ph(self): return self._check_variable(self._raw_target, 'target placeholder') @property def is_training_ph(self): return self._check_variable(self._is_training, 'is_training placeholder') @property def train_op(self): return self._check_variable(self._train_op, 'train_op') @property def loss(self): return self._check_variable(self._loss, 'loss')