Commit 9bc5d37

Improve readability for encoder/decoder blocks (#353)

* Improve readability for encoder/decoder blocks
* Address review comments
* fixup naming
* Rework so as to not break any existing GCP checkpoints
* Also rename variables in colab notebooks
* Last fixup

1 parent 70ff7b8 commit 9bc5d37

13 files changed: +253 -261 lines changed
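
For context, a minimal usage sketch of the two public layers whose internals this commit renames; shapes and hyperparameters below are illustrative, not taken from the commit:

    import numpy as np
    import keras_nlp

    # Illustrative shapes and hyperparameters.
    batch_size, seq_length, hidden_dim = 2, 16, 64
    encoder_inputs = np.random.uniform(size=(batch_size, seq_length, hidden_dim)).astype("float32")
    decoder_inputs = np.random.uniform(size=(batch_size, seq_length, hidden_dim)).astype("float32")

    encoder = keras_nlp.layers.TransformerEncoder(intermediate_dim=128, num_heads=4)
    decoder = keras_nlp.layers.TransformerDecoder(intermediate_dim=128, num_heads=4)

    encoded = encoder(encoder_inputs)           # self attention block + feedforward block
    decoded = decoder(decoder_inputs, encoded)  # adds a cross attention block over `encoded`
    print(decoded.shape)                        # (2, 16, 64)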

keras_nlp/layers/transformer_decoder.py

Lines changed: 69 additions & 79 deletions
@@ -133,74 +133,70 @@ def _build(self, input_shape, has_cross_attention):
         self._built = True
         self._input_shape = input_shape
         self._has_cross_attention = has_cross_attention
-        feature_size = input_shape[-1]
-        self._attention_head_size = int(feature_size // self.num_heads)
+        # Infer the dimension of our hidden feature size from the build shape.
+        hidden_dim = input_shape[-1]
+        # Attention head size is `hidden_dim` over the number of heads.
+        head_dim = int(hidden_dim // self.num_heads)
+
+        # Self attention layers.
         self._self_attention_layer = keras.layers.MultiHeadAttention(
             num_heads=self.num_heads,
-            key_dim=self._attention_head_size,
-            value_dim=self._attention_head_size,
+            key_dim=head_dim,
             dropout=self.dropout,
             kernel_initializer=self.kernel_initializer,
             bias_initializer=self.bias_initializer,
         )
         self._self_attention_layer._build_from_signature(
-            input_shape, input_shape
+            query=input_shape,
+            value=input_shape,
         )
-
-        self._decoder_attention_layernorm = keras.layers.LayerNormalization(
+        self._self_attention_layernorm = keras.layers.LayerNormalization(
             epsilon=self.layer_norm_epsilon,
         )
+        self._self_attention_dropout = keras.layers.Dropout(
+            rate=self.dropout,
+        )

+        # Cross attention layers are optional.
         self._cross_attention_layer = None
         if has_cross_attention:
-            # Create layers for cross attention.
             self._cross_attention_layer = keras.layers.MultiHeadAttention(
                 num_heads=self.num_heads,
-                key_dim=self._attention_head_size,
-                value_dim=feature_size,
+                key_dim=head_dim,
+                value_dim=hidden_dim,
                 dropout=self.dropout,
                 kernel_initializer=self.kernel_initializer,
                 bias_initializer=self.bias_initializer,
             )
             self._cross_attention_layer._build_from_signature(
-                input_shape, input_shape
+                query=input_shape,
+                value=input_shape,
             )
-
             self._cross_attention_layernorm = keras.layers.LayerNormalization(
                 epsilon=self.layer_norm_epsilon,
             )
-
             self._cross_attention_dropout = keras.layers.Dropout(
                 rate=self.dropout,
             )

-        self._feedforward_layernorm = keras.layers.LayerNormalization(
-            epsilon=self.layer_norm_epsilon,
-        )
-
-        self._self_attention_dropout = keras.layers.Dropout(rate=self.dropout)
-
-        # First dense layer in the feedforward network, which maps input
-        # feauture size to dimension `self.intermediate_dim`.
-        self._intermediate_dense = keras.layers.Dense(
+        # Feedforward layers.
+        self._feedforward_intermediate_dense = keras.layers.Dense(
             self.intermediate_dim,
             activation=self.activation,
             kernel_initializer=self.kernel_initializer,
             bias_initializer=self.bias_initializer,
         )
-        # Second dense layer in the feedforward network, which maps input
-        # feature size back to the input feature size.
-        self._output_dense = keras.layers.Dense(
-            feature_size,
+        self._feedforward_output_dense = keras.layers.Dense(
+            hidden_dim,
             kernel_initializer=self.kernel_initializer,
             bias_initializer=self.bias_initializer,
         )
-        self._output_dropout = keras.layers.Dropout(rate=self.dropout)
-
-    def _feedforward(self, input):
-        x = self._intermediate_dense(input)
-        x = self._output_dense(x)
-        return self._output_dropout(x)
+        self._feedforward_layernorm = keras.layers.LayerNormalization(
+            epsilon=self.layer_norm_epsilon,
+        )
+        self._feedforward_dropout = keras.layers.Dropout(
+            rate=self.dropout,
+        )

     def call(
         self,

@@ -232,6 +228,7 @@ def call(
         Returns:
             A Tensor of the same shape as the `decoder_sequence`.
         """
+
         has_encoder_sequence = encoder_sequence is not None
         if not self._built:
             self._build(decoder_sequence.shape, has_encoder_sequence)

@@ -257,71 +254,64 @@ def call(
                 "This layer has been built with cross attention, but "
                 "you did not provide encoder_sequence."
             )
+
+        x = decoder_sequence  # Intermediate result.
+
+        # Compute self attention mask.
+        self_attention_mask = compute_causal_mask(decoder_sequence)
         decoder_mask = merge_padding_and_attention_mask(
             decoder_sequence, decoder_padding_mask, decoder_attention_mask
         )
-        causal_mask = tf.cast(
-            compute_causal_mask(decoder_sequence),
-            dtype=tf.int32,
-        )
-        if decoder_mask is None:
-            decoder_mask = causal_mask
-        else:
-            decoder_mask = tf.minimum(decoder_mask, causal_mask)
+        if decoder_mask is not None:
+            self_attention_mask = tf.minimum(decoder_mask, self_attention_mask)

-        residual_decoder_sequence = decoder_sequence
+        # Self attention block.
+        residual = x
         if self.normalize_first:
-            decoder_sequence = self._decoder_attention_layernorm(
-                decoder_sequence
-            )
-        # Decoder input self-attention.
-        self_attended = self._self_attention_layer(
-            decoder_sequence,
-            decoder_sequence,
-            decoder_sequence,
-            attention_mask=decoder_mask,
+            x = self._self_attention_layernorm(x)
+        x = self._self_attention_layer(
+            query=x,
+            value=x,
+            attention_mask=self_attention_mask,
         )
-        self_attended = self._self_attention_dropout(self_attended)
-        attention_output = residual_decoder_sequence + self_attended
+        x = self._self_attention_dropout(x)
+        x = x + residual
         if not self.normalize_first:
-            attention_output = self._decoder_attention_layernorm(
-                attention_output
-            )
+            x = self._self_attention_layernorm(x)

+        # Cross attention is optional.
         if self._cross_attention_layer is not None:
-            encoder_mask = merge_padding_and_attention_mask(
+            # Compute cross attention mask.
+            cross_attention_mask = merge_padding_and_attention_mask(
                 encoder_sequence, encoder_padding_mask, encoder_attention_mask
             )
-            residual_attention_output = attention_output
+
+            # Cross attention block.
+            residual = x
             if self.normalize_first:
-                attention_output = self._cross_attention_layernorm(
-                    attention_output
-                )
-            # Cross attention.
-            cross_attended = self._cross_attention_layer(
-                query=attention_output,
+                x = self._cross_attention_layernorm(x)
+            x = self._cross_attention_layer(
+                query=x,
                 value=encoder_sequence,
-                key=encoder_sequence,
-                attention_mask=encoder_mask,
-            )
-            cross_attended = self._cross_attention_dropout(
-                cross_attended,
+                attention_mask=cross_attention_mask,
             )
-            attention_output = residual_attention_output + cross_attended
+            x = self._cross_attention_dropout(x)
+            x = x + residual
             if not self.normalize_first:
-                attention_output = self._cross_attention_layernorm(
-                    attention_output
-                )
+                x = self._cross_attention_layernorm(x)

-        residual_attention_output = attention_output
+        # Feedforward block.
+        residual = x
         if self.normalize_first:
-            attention_output = self._feedforward_layernorm(attention_output)
-        # Feedforward.
-        feedforward_output = self._feedforward(attention_output)
-        feedforward_output = residual_attention_output + feedforward_output
+            x = self._feedforward_layernorm(x)
+        x = self._feedforward_intermediate_dense(x)
+        x = self._feedforward_output_dense(x)
+        x = self._feedforward_dropout(x)
+        x = x + residual
         if not self.normalize_first:
-            feedforward_output = self._feedforward_layernorm(feedforward_output)
-        return feedforward_output
+            x = self._feedforward_layernorm(x)
+
+        return x

     def get_config(self):
         config = super().get_config()
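
The refactored call() above is organized into explicit blocks (self attention, optional cross attention, feedforward) that all share one shape: optional pre-norm, sublayer, dropout, residual add, optional post-norm. A standalone sketch of that pattern, with an arbitrary dense sublayer standing in for attention or feedforward and illustrative hyperparameters:

    import tensorflow as tf
    from tensorflow import keras

    def residual_block(x, sublayer, layernorm, dropout, normalize_first):
        # Optional pre-norm, sublayer, dropout, residual add, optional post-norm:
        # the structure each block in the refactored call() follows.
        residual = x
        if normalize_first:
            x = layernorm(x)
        x = sublayer(x)
        x = dropout(x)
        x = x + residual
        if not normalize_first:
            x = layernorm(x)
        return x

    # Illustrative use with a dense sublayer.
    x = tf.random.uniform((2, 8, 16))
    out = residual_block(
        x,
        sublayer=keras.layers.Dense(16),
        layernorm=keras.layers.LayerNormalization(epsilon=1e-5),
        dropout=keras.layers.Dropout(0.1),
        normalize_first=True,
    )
    print(out.shape)  # (2, 8, 16)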

keras_nlp/layers/transformer_decoder_test.py

Lines changed: 6 additions & 10 deletions
@@ -199,11 +199,9 @@ def test_checkpointing_transformer_decoder(self):
         decoder1(decoder_sequence, encoder_sequence)
         decoder2(decoder_sequence, encoder_sequence)
         # The weights of decoder1 and decoder2 are different.
-        self.assertFalse(
-            all(
-                decoder1._output_dense.trainable_variables[0][0]
-                == decoder2._output_dense.trainable_variables[0][0]
-            )
+        self.assertNotAllClose(
+            decoder1.trainable_variables[0][0],
+            decoder2.trainable_variables[0][0],
         )
         checkpoint = tf.train.Checkpoint(decoder1)
         checkpoint2 = tf.train.Checkpoint(decoder2)

@@ -230,11 +228,9 @@ def test_checkpointing_transformer_decoder_without_cross_attention(self):
         decoder1(decoder_sequence)
         decoder2(decoder_sequence)
         # The weights of decoder1 and decoder2 are different.
-        self.assertFalse(
-            all(
-                decoder1._output_dense.trainable_variables[0][0]
-                == decoder2._output_dense.trainable_variables[0][0]
-            )
+        self.assertNotAllClose(
+            decoder1.trainable_variables[0][0],
+            decoder2.trainable_variables[0][0],
         )
         checkpoint = tf.train.Checkpoint(decoder1)
         checkpoint2 = tf.train.Checkpoint(decoder2)
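
The updated assertions compare the layers' own trainable_variables with the tf.test.TestCase helper assertNotAllClose, so renaming private sublayers such as _output_dense no longer breaks the test. A hypothetical standalone example of the same checkpoint round-trip pattern, with plain Dense layers standing in for the decoders (this is not the actual test body):

    import os
    import tempfile

    import tensorflow as tf
    from tensorflow import keras

    class CheckpointPatternTest(tf.test.TestCase):
        def test_checkpoint_round_trip(self):
            # Two independently initialized layers (stand-ins for the decoders).
            layer1 = keras.layers.Dense(4)
            layer2 = keras.layers.Dense(4)
            data = tf.random.uniform((2, 4))
            layer1(data)
            layer2(data)
            # Freshly initialized kernels should differ.
            self.assertNotAllClose(
                layer1.trainable_variables[0],
                layer2.trainable_variables[0],
            )
            # Save layer1 and restore its weights into layer2.
            checkpoint1 = tf.train.Checkpoint(layer1)
            checkpoint2 = tf.train.Checkpoint(layer2)
            path = checkpoint1.save(os.path.join(tempfile.mkdtemp(), "ckpt"))
            checkpoint2.restore(path)
            self.assertAllClose(
                layer1.trainable_variables[0],
                layer2.trainable_variables[0],
            )

    if __name__ == "__main__":
        tf.test.main()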

keras_nlp/layers/transformer_encoder.py

Lines changed: 49 additions & 41 deletions
@@ -114,53 +114,55 @@ def _build(self, input_shape):
         # Create layers based on input shape.
         self._built = True
         self._input_shape = input_shape
-        feature_size = input_shape[-1]
-        self._attention_head_size = int(feature_size // self.num_heads)
-        self._multi_head_attention_layer = keras.layers.MultiHeadAttention(
+        # Infer the dimension of our hidden feature size from the build shape.
+        hidden_dim = input_shape[-1]
+        # Attention head size is `hidden_dim` over the number of heads.
+        key_dim = int(hidden_dim // self.num_heads)
+
+        # Self attention layers.
+        self._self_attention_layer = keras.layers.MultiHeadAttention(
             num_heads=self.num_heads,
-            key_dim=self._attention_head_size,
-            value_dim=self._attention_head_size,
+            key_dim=key_dim,
             dropout=self.dropout,
             kernel_initializer=self.kernel_initializer,
             bias_initializer=self.bias_initializer,
         )
-        self._multi_head_attention_layer._build_from_signature(
-            input_shape, input_shape
+        self._self_attention_layer._build_from_signature(
+            query=input_shape,
+            value=input_shape,
         )
-
-        self._attention_layernorm = keras.layers.LayerNormalization(
+        self._self_attention_layernorm = keras.layers.LayerNormalization(
             epsilon=self.layer_norm_epsilon,
         )
+        self._self_attention_dropout = keras.layers.Dropout(
+            rate=self.dropout,
+        )
+
+        # Feedforward layers.
         self._feedforward_layernorm = keras.layers.LayerNormalization(
             epsilon=self.layer_norm_epsilon,
         )
-
-        self._attention_dropout = keras.layers.Dropout(rate=self.dropout)
-
-        self._intermediate_dense = keras.layers.Dense(
+        self._feedforward_intermediate_dense = keras.layers.Dense(
             self.intermediate_dim,
             activation=self.activation,
             kernel_initializer=self.kernel_initializer,
             bias_initializer=self.bias_initializer,
         )
-        self._output_dense = keras.layers.Dense(
-            feature_size,
+        self._feedforward_output_dense = keras.layers.Dense(
+            hidden_dim,
             kernel_initializer=self.kernel_initializer,
             bias_initializer=self.bias_initializer,
         )
-        self._output_dropout = keras.layers.Dropout(rate=self.dropout)
-
-    def _feedforward(self, input):
-        x = self._intermediate_dense(input)
-        x = self._output_dense(x)
-        return self._output_dropout(x)
+        self._feedforward_dropout = keras.layers.Dropout(
+            rate=self.dropout,
+        )

     def call(self, inputs, padding_mask=None, attention_mask=None):
         """Forward pass of the TransformerEncoder.

         Args:
             inputs: a Tensor. The input data to TransformerEncoder, should be
-                of shape [batch_size, sequence_length, feature_dim].
+                of shape [batch_size, sequence_length, hidden_dim].
             padding_mask: a boolean Tensor. It indicates if the token should be
                 masked because the token is introduced due to padding.
                 `padding_mask` should have shape [batch_size, sequence_length].

@@ -176,33 +178,39 @@ def call(self, inputs, padding_mask=None, attention_mask=None):
         if not self._built:
             self._build(inputs.shape)

-        mask = merge_padding_and_attention_mask(
-            inputs,
-            padding_mask,
-            attention_mask,
+        x = inputs  # Intermediate result.
+
+        # Compute self attention mask.
+        self_attention_mask = merge_padding_and_attention_mask(
+            inputs, padding_mask, attention_mask
         )

-        residual_inputs = inputs
+        # Self attention block.
+        residual = x
         if self.normalize_first:
-            inputs = self._attention_layernorm(inputs)
-        # Self attention.
-        attended = self._multi_head_attention_layer(
-            inputs, inputs, inputs, attention_mask=mask
+            x = self._self_attention_layernorm(x)
+        x = self._self_attention_layer(
+            query=x,
+            value=x,
+            attention_mask=self_attention_mask,
         )
-        attended = self._attention_dropout(attended)
-        attended = residual_inputs + attended
+        x = self._self_attention_dropout(x)
+        x = x + residual
         if not self.normalize_first:
-            attended = self._attention_layernorm(attended)
+            x = self._self_attention_layernorm(x)

-        residual_attended = attended
+        # Feedforward block.
+        residual = x
         if self.normalize_first:
-            attended = self._feedforward_layernorm(attended)
-        # Feedforward.
-        feedforward_output = self._feedforward(attended)
-        feedforward_output = residual_attended + feedforward_output
+            x = self._feedforward_layernorm(x)
+        x = self._feedforward_intermediate_dense(x)
+        x = self._feedforward_output_dense(x)
+        x = self._feedforward_dropout(x)
+        x = x + residual
         if not self.normalize_first:
-            feedforward_output = self._feedforward_layernorm(feedforward_output)
-        return feedforward_output
+            x = self._feedforward_layernorm(x)
+
+        return x

     def get_config(self):
         config = super().get_config()
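
A short sketch of the call signature documented above: inputs of shape [batch_size, sequence_length, hidden_dim] and an optional boolean padding_mask of shape [batch_size, sequence_length]; values and hyperparameters are illustrative:

    import numpy as np
    import keras_nlp

    # Illustrative values.
    inputs = np.random.uniform(size=(2, 6, 32)).astype("float32")
    padding_mask = np.array(
        [[1, 1, 1, 1, 0, 0],
         [1, 1, 1, 0, 0, 0]],
        dtype=bool,
    )

    encoder = keras_nlp.layers.TransformerEncoder(
        intermediate_dim=64,
        num_heads=2,
        normalize_first=True,  # pre-norm ordering of the blocks shown above
    )
    outputs = encoder(inputs, padding_mask=padding_mask)
    print(outputs.shape)  # (2, 6, 32)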
