From b2fc665668d9ead4bf2d13ed2e07b9ffe1e75ac8 Mon Sep 17 00:00:00 2001 From: janEbert Date: Mon, 12 Dec 2022 18:30:20 +0100 Subject: [PATCH 001/122] Fix `PretrainedFromHF` tokenizer with T5 training --- megatron/tokenizer/tokenizer.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py index 09304b1dd..c0356a12c 100644 --- a/megatron/tokenizer/tokenizer.py +++ b/megatron/tokenizer/tokenizer.py @@ -388,6 +388,18 @@ def eos(self): candidate = self.tokenizer.eos_token_id return self._check_token_candidate(candidate) + @property + def bos_token_id(self): + """Id of the beginning of sentence token in the vocabulary.""" + candidate = self.tokenizer.bos_token_id + return self._check_token_candidate(candidate) + + @property + def eos_token_id(self): + """Id of the end of sentence token in the vocabulary.""" + candidate = self.tokenizer.eos_token_id + return self._check_token_candidate(candidate) + @property def additional_special_tokens_ids(self): """ All the additional special tokens you may want to use (list of strings).""" From 13becf1b3a01f2b3c809ba6bd63451dd92d92de5 Mon Sep 17 00:00:00 2001 From: janEbert Date: Mon, 12 Dec 2022 18:30:49 +0100 Subject: [PATCH 002/122] Allow passing existing casual attention masks Since we create them in the T5 data loader, why not use them? --- megatron/model/fused_softmax.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/megatron/model/fused_softmax.py b/megatron/model/fused_softmax.py index 07192e2bf..973c2a384 100644 --- a/megatron/model/fused_softmax.py +++ b/megatron/model/fused_softmax.py @@ -214,8 +214,7 @@ def forward_torch_softmax(self, input, mask): if self.scale is not None: input = input * self.scale - if self.attn_mask_type == AttnMaskType.causal: - assert mask is None + if self.attn_mask_type == AttnMaskType.causal and mask is None: assert input.shape[2] == input.shape[3] mask = self.get_causal_mask(input.shape[2]) From 7f50532d9e77f995f9758e76b86a2104a1b14276 Mon Sep 17 00:00:00 2001 From: janEbert Date: Mon, 12 Dec 2022 18:33:41 +0100 Subject: [PATCH 003/122] Refactor masked LM sampling style selection Handles backward-compatibility, so the rest of the code base does not need to change. --- megatron/data/dataset_utils.py | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/megatron/data/dataset_utils.py b/megatron/data/dataset_utils.py index 3841e263e..643739d35 100644 --- a/megatron/data/dataset_utils.py +++ b/megatron/data/dataset_utils.py @@ -18,6 +18,7 @@ # https://github.com/google-research/albert/blob/master/create_pretraining_data.py # with some modifications. +from enum import Enum import math import os import time @@ -41,6 +42,11 @@ DSET_TYPES = [DSET_TYPE_BERT, DSET_TYPE_ICT, DSET_TYPE_T5] +class SamplingStyle(Enum): + POISSON = 'poisson' + GEOMETRIC = 'geometric' + + def analyze_data_prefix(data_prefix): # The data prefix should be in the format of: @@ -194,9 +200,15 @@ def create_masked_lm_predictions(tokens, favor_longer_ngram=False, do_permutation=False, geometric_dist=False, - masking_style="bert"): + masking_style="bert", + sampling_style=SamplingStyle.POISSON): """Creates the predictions for the masked LM objective. 
Note: Tokens here are vocab ids and not text tokens.""" + if not isinstance(sampling_style, SamplingStyle): + sampling_style = SamplingStyle(sampling_style) + # Backward-compatibility + if geometric_dist: + sampling_style = SamplingStyle.GEOMETRIC cand_indexes = [] # Note(mingdachen): We create a list for recording if the piece is @@ -235,7 +247,7 @@ def create_masked_lm_predictions(tokens, max(1, int(round(len(tokens) * masked_lm_prob)))) ngrams = np.arange(1, max_ngrams + 1, dtype=np.int64) - if not geometric_dist: + if sampling_style is SamplingStyle.POISSON: # Note(mingdachen): # By default, we set the probilities to favor shorter ngram sequences. pvals = 1. / np.arange(1, max_ngrams + 1) @@ -266,15 +278,17 @@ def create_masked_lm_predictions(tokens, if index in covered_indexes: continue - if not geometric_dist: + if sampling_style is SamplingStyle.POISSON: n = np_rng.choice(ngrams[:len(cand_index_set)], p=pvals[:len(cand_index_set)] / pvals[:len(cand_index_set)].sum(keepdims=True)) - else: + elif sampling_style is SamplingStyle.GEOMETRIC: # Sampling "n" from the geometric distribution and clipping it to # the max_ngrams. Using p=0.2 default from the SpanBERT paper # https://arxiv.org/pdf/1907.10529.pdf (Sec 3.1) n = min(np_rng.geometric(0.2), max_ngrams) + else: + raise ValueError('unknown sampling style') index_set = sum(cand_index_set[n - 1], []) n -= 1 From d8db18922e97e7b8bf1e45182665bc73f223bfc1 Mon Sep 17 00:00:00 2001 From: janEbert Date: Mon, 12 Dec 2022 18:36:35 +0100 Subject: [PATCH 004/122] Add more masked LM sampling styles Namely sampling from uniform and normal distributions. --- megatron/data/dataset_utils.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/megatron/data/dataset_utils.py b/megatron/data/dataset_utils.py index 643739d35..a28672b24 100644 --- a/megatron/data/dataset_utils.py +++ b/megatron/data/dataset_utils.py @@ -45,6 +45,8 @@ class SamplingStyle(Enum): POISSON = 'poisson' GEOMETRIC = 'geometric' + UNIFORM = 'uniform' + NORMAL = 'normal' def analyze_data_prefix(data_prefix): @@ -254,6 +256,8 @@ def create_masked_lm_predictions(tokens, pvals /= pvals.sum(keepdims=True) if favor_longer_ngram: pvals = pvals[::-1] + elif sampling_style is SamplingStyle.NORMAL: + normal_mean = (max_ngrams + 1) / 2 ngram_indexes = [] for idx in range(len(cand_indexes)): @@ -287,6 +291,14 @@ def create_masked_lm_predictions(tokens, # the max_ngrams. 
Using p=0.2 default from the SpanBERT paper # https://arxiv.org/pdf/1907.10529.pdf (Sec 3.1) n = min(np_rng.geometric(0.2), max_ngrams) + elif sampling_style is SamplingStyle.UNIFORM: + n = np_rng.choice(ngrams[:len(cand_index_set)]) + elif sampling_style is SamplingStyle.NORMAL: + n = round(np.clip( + np_rng.normal(loc=normal_mean), + 1, + len(cand_index_set), + )) else: raise ValueError('unknown sampling style') From 006c4e96556cd241f43e91793e8b7fab5ba0546b Mon Sep 17 00:00:00 2001 From: janEbert Date: Mon, 12 Dec 2022 18:37:42 +0100 Subject: [PATCH 005/122] Allow Prefix-LM style masked LM --- megatron/data/dataset_utils.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/megatron/data/dataset_utils.py b/megatron/data/dataset_utils.py index a28672b24..7f96ddfce 100644 --- a/megatron/data/dataset_utils.py +++ b/megatron/data/dataset_utils.py @@ -203,7 +203,8 @@ def create_masked_lm_predictions(tokens, do_permutation=False, geometric_dist=False, masking_style="bert", - sampling_style=SamplingStyle.POISSON): + sampling_style=SamplingStyle.POISSON, + prefix_lm=False): """Creates the predictions for the masked LM objective. Note: Tokens here are vocab ids and not text tokens.""" if not isinstance(sampling_style, SamplingStyle): @@ -263,6 +264,10 @@ def create_masked_lm_predictions(tokens, for idx in range(len(cand_indexes)): ngram_index = [] for n in ngrams: + if prefix_lm: + last_cand_index_index = min(idx + n - 1, len(cand_indexes) - 1) + if cand_indexes[last_cand_index_index][-1] < len(tokens) - 1: + continue ngram_index.append(cand_indexes[idx:idx + n]) ngram_indexes.append(ngram_index) From f80231789b8bdfb4e3e4d83fd2c63ca82bab4fc6 Mon Sep 17 00:00:00 2001 From: janEbert Date: Mon, 12 Dec 2022 19:46:46 +0100 Subject: [PATCH 006/122] Add UL2 pretraining for T5 model --- megatron/arguments.py | 34 ++++++ megatron/data/dataset_utils.py | 21 +++- megatron/data/ul2_dataset.py | 204 +++++++++++++++++++++++++++++++++ pretrain_ul2.py | 134 ++++++++++++++++++++++ 4 files changed, 392 insertions(+), 1 deletion(-) create mode 100644 megatron/data/ul2_dataset.py create mode 100644 pretrain_ul2.py diff --git a/megatron/arguments.py b/megatron/arguments.py index c18235a78..5fd48a7b8 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -49,6 +49,7 @@ def parse_args(extra_args_provider=None, defaults={}, parser = _add_autoresume_args(parser) parser = _add_biencoder_args(parser) parser = _add_vit_args(parser) + parser = _add_ul2_args(parser) parser = _add_logging_args(parser) parser = _add_zero_args(parser) parser = _add_memoryopt_args(parser) @@ -1024,6 +1025,39 @@ def _add_vit_args(parser): return parser +def _add_ul2_args(parser): + group = parser.add_argument_group(title="UL2") + + group.add_argument('--ul2-denoiser-ratios', nargs='+', type=float, + default=None, + help='Probability of each denoising objective to be ' + 'selected. Uniform distribution by default.') + group.add_argument('--ul2-denoisers', nargs='+', type=str, + default=['R', 'R', 'S', 'X', 'X', 'X', 'X'], + choices=['R', 'S', 'X'], + help='What type of UL2 denoising objective the other ' + 'UL2 configurations refer to.') + group.add_argument('--ul2-mean-span-lengths', nargs='+', type=float, + default=[3, 8, 0.25, 3, 8, 64, 64], + help='Mean length for sampling span lengths. 
' + 'Numbers < 1 indicate a mean length of the sequence ' + 'length times that number.') + group.add_argument('--ul2-mask-ratios', nargs='+', type=float, + default=[0.15, 0.15, 0.25, 0.5, 0.5, 0.15, 0.5], + help='Ratio of masked token in the full sequence.') + group.add_argument('--ul2-r-denoiser-token', type=str, default='[R]', + help='What token to prepend for the UL2 R-denoising ' + 'objective.') + group.add_argument('--ul2-s-denoiser-token', type=str, default='[S]', + help='What token to prepend for the UL2 S-denoising ' + 'objective.') + group.add_argument('--ul2-x-denoiser-token', type=str, default='[X]', + help='What token to prepend for the UL2 X-denoising ' + 'objective.') + + return parser + + def _add_zero_args(parser): """Text generate arguments.""" diff --git a/megatron/data/dataset_utils.py b/megatron/data/dataset_utils.py index 7f96ddfce..92f37b2f8 100644 --- a/megatron/data/dataset_utils.py +++ b/megatron/data/dataset_utils.py @@ -38,8 +38,9 @@ DSET_TYPE_BERT = 'standard_bert' DSET_TYPE_ICT = 'ict' DSET_TYPE_T5 = 't5' +DSET_TYPE_UL2 = 'ul2' -DSET_TYPES = [DSET_TYPE_BERT, DSET_TYPE_ICT, DSET_TYPE_T5] +DSET_TYPES = [DSET_TYPE_BERT, DSET_TYPE_ICT, DSET_TYPE_T5, DSET_TYPE_UL2] class SamplingStyle(Enum): @@ -553,6 +554,7 @@ def build_dataset(index, name): from megatron.data.bert_dataset import BertDataset from megatron.data.ict_dataset import ICTDataset from megatron.data.t5_dataset import T5Dataset + from megatron.data.ul2_dataset import UL2Dataset dataset = None if splits[index + 1] > splits[index]: # Get the pointer to the original doc-idx so we can set it later. @@ -591,6 +593,23 @@ def build_dataset(index, name): short_seq_prob=short_seq_prob, **kwargs ) + elif dataset_type == DSET_TYPE_UL2: + args = get_args() + dataset = UL2Dataset( + indexed_dataset=indexed_dataset, + denoiser_ratios=args.ul2_denoiser_ratios, + denoisers=args.ul2_denoisers, + mean_span_lengths=args.ul2_mean_span_lengths, + mask_ratios=args.ul2_mask_ratios, + denoiser_tokens={ + 'R': args.ul2_r_denoiser_token, + 'S': args.ul2_s_denoiser_token, + 'X': args.ul2_x_denoiser_token, + }, + max_seq_length_dec=max_seq_length_dec, + short_seq_prob=short_seq_prob, + **kwargs, + ) elif dataset_type == DSET_TYPE_BERT: dataset = BertDataset( indexed_dataset=indexed_dataset, diff --git a/megatron/data/ul2_dataset.py b/megatron/data/ul2_dataset.py new file mode 100644 index 000000000..4f2d333a1 --- /dev/null +++ b/megatron/data/ul2_dataset.py @@ -0,0 +1,204 @@ +# coding=utf-8 +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""UL2-style dataset.""" + +import numpy as np + +from megatron import get_tokenizer +from megatron.data.dataset_utils import ( + create_masked_lm_predictions, + get_samples_mapping, + SamplingStyle +) +from megatron.data.t5_dataset import pad_and_convert_to_numpy, T5Dataset + + +class UL2Dataset(T5Dataset): + + def __init__(self, name, indexed_dataset, data_prefix, + num_epochs, max_num_samples, denoiser_ratios, + denoisers, mean_span_lengths, mask_ratios, + denoiser_tokens, max_seq_length, max_seq_length_dec, + short_seq_prob, seed): + + if denoiser_ratios is None: + # Uniform distribution by default. + denoiser_ratios = [1 / len(denoisers)] * len(denoisers) + + assert ( + len(denoiser_ratios) == len(denoisers) + == len(mean_span_lengths) == len(mask_ratios) + ), ( + 'some UL2 configurations do not correspond to the amount of ' + 'denoising objectives' + ) + + super().__init__(name, indexed_dataset, data_prefix, + num_epochs, max_num_samples, None, + max_seq_length, max_seq_length_dec, + short_seq_prob, seed) + + # Params to store. + self.denoiser_ratios = [ + denoiser_ratio / sum(denoiser_ratios) + for denoiser_ratio in denoiser_ratios + ] + self.denoisers = [denoiser.upper() for denoiser in denoisers] + self.mean_span_lengths = mean_span_lengths + self.mask_ratios = mask_ratios + + # Vocab stuff. + tokenizer = get_tokenizer() + # Remove CLS token because we don't need it. + del self.cls_id + self.cls_ids = { + denoiser: tokenizer.vocab[token] + for (denoiser, token) in denoiser_tokens.items() + } + # cls_token = self.vocab_id_to_token_dict[tokenizer.cls] + # if cls_token not in self.cls_ids: + # self.cls_ids[cls_token] = tokenizer.cls + + # Filter out denoiser tokens. + self.sentinel_tokens = [ + token + for token in tokenizer.additional_special_tokens_ids + if token not in self.cls_ids.values() + ] + assert len(self.sentinel_tokens) > 0, \ + "Provide the argument --vocab-extra-ids 100 to the script" + + def __getitem__(self, idx): + + start_index, end_index, seq_length = self.samples_mapping[idx] + sample = [] + for index in range(start_index, end_index): + sample.append(self.indexed_dataset[index]) + # Note that this rng state should be numpy and not python since + # python randint is inclusive whereas the numpy one is exclusive. + np_rng = np.random.RandomState(seed=(self.seed + idx)) + return build_training_sample(sample, seq_length, + self.max_seq_length, # needed for padding + self.max_seq_length_dec, + self.vocab_id_list, + self.vocab_id_to_token_dict, + self.cls_ids, self.sep_id, + self.mask_id, self.pad_id, + self.denoiser_ratios, self.denoisers, + self.mean_span_lengths, self.mask_ratios, + np_rng, + self.bos_id, self.eos_id, + self.sentinel_tokens) + + +def build_training_sample(sample, target_seq_length, + max_seq_length, max_seq_length_dec, + vocab_id_list, vocab_id_to_token_dict, + cls_ids, sep_id, mask_id, pad_id, + denoiser_ratios, denoisers, + mean_span_lengths, mask_ratios, + np_rng, bos_id=None, + eos_id=None, sentinel_tokens=None): + """Build training sample. + + Arguments: + sample: A list of sentences in which each sentence is a list token ids. + target_seq_length: Desired sequence length. + max_seq_length: Maximum length of the sequence. All values are padded to + this length. + vocab_id_list: List of vocabulary ids. Used to pick a random id. + vocab_id_to_token_dict: A dictionary from vocab ids to text tokens. + cls_ids: Start of example ids. + sep_id: Separator id. + mask_id: Mask token id. + pad_id: Padding token id. 
+ denoiser_ratios: Probability of each denoising objective to be selected. + denoisers: What type of UL2 denoising objective the other UL2 + configurations refer to. + mean_span_lengths: Mean length for sampling span lengths. Numbers < 1 + indicate a mean length of the sequence length times that number. + mask_ratios: Ratio of masked token in the full sequence. + np_rng: Random number genenrator. Note that this rng state should be + numpy and not python since python randint is inclusive for + the opper bound whereas the numpy one is exclusive. + bos_id: start of decoder example id + eos_id: end of generation id + sentinel_tokens: unique value to be substituted for every replaced span + """ + + assert target_seq_length <= max_seq_length + + # flatten sentences into one list + tokens = [token for sentence in sample for token in sentence] + + # Truncate to `target_sequence_length`. + max_num_tokens = target_seq_length + truncated = len(tokens) > max_num_tokens + tokens = tokens[:max_num_tokens] + + # Denoiser selection + denoiser_index = np_rng.choice(np.arange(len(denoisers)), p=denoiser_ratios) + denoiser = denoisers[denoiser_index] + masked_lm_prob = mask_ratios[denoiser_index] + mean_ngrams = mean_span_lengths[denoiser_index] + if mean_ngrams < 1: + mean_ngrams = round(len(tokens) * mean_ngrams) + max_ngrams = mean_ngrams * 2 - 1 + + # Prepend objective token. + cls_id = cls_ids.get(denoiser) + if cls_id is None: + raise ValueError('unknown denoiser') + tokens = [cls_id] + tokens + + # Masking. + max_predictions_per_seq = masked_lm_prob * len(tokens) + if denoiser == 'R' or denoiser == 'X': + sampling_style = SamplingStyle.NORMAL + prefix_lm = False + elif denoiser == 'S': + sampling_style = SamplingStyle.UNIFORM + prefix_lm = True + else: + raise ValueError('unknown denoiser') + ( + tokens, masked_positions, masked_labels, _, masked_spans, + ) = create_masked_lm_predictions( + tokens, vocab_id_list, vocab_id_to_token_dict, masked_lm_prob, + cls_id, sep_id, mask_id, max_predictions_per_seq, np_rng, + max_ngrams=max_ngrams, masking_style="t5", + sampling_style=sampling_style, prefix_lm=prefix_lm, + ) + + # Padding. + tokens_enc, tokens_dec_in, labels, enc_mask, \ + dec_mask, enc_dec_mask, loss_mask \ + = pad_and_convert_to_numpy(tokens, masked_positions, + masked_labels, pad_id, max_seq_length, + max_seq_length_dec, masked_spans, + bos_id, eos_id, sentinel_tokens) + + train_sample = { + 'text_enc': tokens_enc, + 'text_dec': tokens_dec_in, + 'labels': labels, + 'loss_mask': loss_mask, + 'truncated': int(truncated), + 'enc_mask': enc_mask, + 'dec_mask': dec_mask, + 'enc_dec_mask': enc_dec_mask, + } + return train_sample diff --git a/pretrain_ul2.py b/pretrain_ul2.py new file mode 100644 index 000000000..04b2b0dc6 --- /dev/null +++ b/pretrain_ul2.py @@ -0,0 +1,134 @@ +# coding=utf-8 +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Pretrain UL2""" + +from functools import partial + +import torch + +from megatron import ( + get_args, + get_timers, + mpu, + print_rank_0 +) +from megatron.data.dataset_utils import build_train_valid_test_datasets +from megatron.model.t5_model import T5Model +from megatron.training import pretrain +from megatron.utils import average_losses_across_data_parallel_group + + +def model_provider(pre_process=True, post_process=True): + """Build the model.""" + assert pre_process and post_process, "UL2 doesn't yet support pipelining" + + print_rank_0('building UL2 model ...') + model = T5Model(num_tokentypes=0, + parallel_output=True) + return model + + +def get_batch(data_iterator): + """Build the batch.""" + + keys = ['text_enc', 'text_dec', 'labels', 'loss_mask', + 'enc_mask', 'dec_mask', 'enc_dec_mask'] + datatype = torch.int64 + + # Broadcast data. + if data_iterator is not None: + data = next(data_iterator) + else: + data = None + data_b = mpu.broadcast_data(keys, data, datatype) + + # Unpack. + tokens_enc = data_b['text_enc'].long() + tokens_dec = data_b['text_dec'].long() + labels = data_b['labels'].long() + loss_mask = data_b['loss_mask'].float() + + enc_mask = (data_b['enc_mask'] < 0.5) + dec_mask = (data_b['dec_mask'] < 0.5) + enc_dec_mask = (data_b['enc_dec_mask'] < 0.5) + + return tokens_enc, tokens_dec, loss_mask, labels, \ + enc_mask, dec_mask, enc_dec_mask + + +def loss_func(loss_mask, output_tensor): + lm_loss_, _ = output_tensor + + lm_loss_ = lm_loss_.float() + lm_loss = torch.sum( + lm_loss_.view(-1) * loss_mask.reshape(-1)) / loss_mask.sum() + + loss = lm_loss + averaged_losses = average_losses_across_data_parallel_group([lm_loss]) + + return loss, {'lm loss': averaged_losses[0]} + + +def forward_step(data_iterator, model): + """Forward step.""" + args = get_args() + timers = get_timers() + + # Get the batch. 
+ timers('batch generator').start() + tokens_enc, tokens_dec, loss_mask, lm_labels, enc_mask, dec_mask, enc_dec_mask \ + = get_batch(data_iterator) + timers('batch generator').stop() + + # Forward model lm_labels + output_tensor = model(tokens_enc, + tokens_dec, + enc_mask, + dec_mask, + enc_dec_mask, + tokentype_ids=None, + lm_labels=lm_labels) + + return output_tensor, partial(loss_func, loss_mask) + + +def train_valid_test_datasets_provider(train_val_test_num_samples): + """Build train, valid, and test datasets.""" + args = get_args() + + print_rank_0('> building train, validation, and test datasets ' + 'for UL2 ...') + train_ds, valid_ds, test_ds = build_train_valid_test_datasets( + data_prefix=args.data_path, + data_impl=args.data_impl, + splits_string=args.split, + train_valid_test_num_samples=train_val_test_num_samples, + max_seq_length=args.encoder_seq_length, + max_seq_length_dec=args.decoder_seq_length, + masked_lm_prob=args.mask_prob, + short_seq_prob=args.short_seq_prob, + seed=args.seed, + skip_warmup=(not args.mmap_warmup), + dataset_type='ul2') + print_rank_0("> finished creating UL2 datasets ...") + + return train_ds, valid_ds, test_ds + + +if __name__ == "__main__": + + pretrain(train_valid_test_datasets_provider, model_provider, forward_step, + args_defaults={'tokenizer_type': 'BertWordPieceLowerCase'}) From deed87f769918617c896a9fba0ab063670c39491 Mon Sep 17 00:00:00 2001 From: janEbert Date: Tue, 13 Dec 2022 12:09:50 +0100 Subject: [PATCH 007/122] Refactor span merging --- megatron/data/t5_dataset.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/megatron/data/t5_dataset.py b/megatron/data/t5_dataset.py index 42110b923..f952de5d2 100644 --- a/megatron/data/t5_dataset.py +++ b/megatron/data/t5_dataset.py @@ -157,13 +157,8 @@ def build_training_sample(sample, target_seq_length, return train_sample -def pad_and_convert_to_numpy(tokens, masked_positions, - masked_labels, pad_id, - max_seq_length, max_seq_length_dec, - masked_spans=None, bos_id=None, - eos_id=None, sentinel_tokens=None): - """Pad sequences and convert them to numpy.""" - +def merge_subsequent_masks(tokens, masked_spans=None, bos_id=None, + eos_id=None, sentinel_tokens=None): sentinel_tokens = collections.deque(sentinel_tokens) t5_input = [] (t5_decoder_in, t5_decoder_out) = ([bos_id], []) @@ -189,6 +184,18 @@ def pad_and_convert_to_numpy(tokens, masked_positions, # Add the remaining tokens to the t5 input t5_input.extend(tokens[start_index:]) + return t5_input, t5_decoder_in, t5_decoder_out + + +def pad_and_convert_to_numpy(tokens, masked_positions, + masked_labels, pad_id, + max_seq_length, max_seq_length_dec, + masked_spans=None, bos_id=None, + eos_id=None, sentinel_tokens=None): + """Pad sequences and convert them to numpy.""" + + t5_input, t5_decoder_in, t5_decoder_out = merge_subsequent_masks( + tokens, masked_spans, bos_id, eos_id, sentinel_tokens) # assert (len(t5_input) - len(masked_spans)) + \ # (len(t5_decoder_in) - (len(masked_spans) + 1)) == len(tokens) From 728e076d7a73a9257d0e20f1084920de833615d8 Mon Sep 17 00:00:00 2001 From: janEbert Date: Tue, 13 Dec 2022 12:08:46 +0100 Subject: [PATCH 008/122] Support UL2 for decoder-only models --- megatron/arguments.py | 18 +++- megatron/data/dataset_utils.py | 1 + megatron/data/ul2_dataset.py | 161 ++++++++++++++++++++++++--------- megatron/enums.py | 5 + pretrain_ul2.py | 99 +++++++++++++++----- 5 files changed, 215 insertions(+), 69 deletions(-) diff --git a/megatron/arguments.py b/megatron/arguments.py index 
5fd48a7b8..9aae25cda 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -24,7 +24,7 @@ import torch import deepspeed -from megatron.enums import PositionEmbeddingType +from megatron.enums import PositionEmbeddingType, UL2ModelType import megatron from megatron.logging import log_levels @@ -311,6 +311,17 @@ def parse_args(extra_args_provider=None, defaults={}, ) args.skip_train_iteration_range = skip_train_iteration_range + args.ul2_model_type = UL2ModelType(args.ul2_model_type) + if ( + args.ul2_model_type is not UL2ModelType.ENCODER_DECODER + and args.decoder_seq_length is not None + ): + print( + f'WARNING: `--decoder_seq_length` is ignored when ' + f'`--ul2-model-type` is not ' + f'"{UL2ModelType.ENCODER_DECODER.value}"!' + ) + if args.use_bnb_optimizer: try: import bitsandbytes as bnb @@ -1028,6 +1039,11 @@ def _add_vit_args(parser): def _add_ul2_args(parser): group = parser.add_argument_group(title="UL2") + group.add_argument('--ul2-model-type', type=str, default='ED', + choices=['ED', 'ND', 'CD'], + help='What type of model to use for UL2 pretraining. ' + 'ED = encoder-decoder; ND = non-causal decoder-only; ' + 'CD = causal decoder-only') group.add_argument('--ul2-denoiser-ratios', nargs='+', type=float, default=None, help='Probability of each denoising objective to be ' diff --git a/megatron/data/dataset_utils.py b/megatron/data/dataset_utils.py index 92f37b2f8..60d4e0d90 100644 --- a/megatron/data/dataset_utils.py +++ b/megatron/data/dataset_utils.py @@ -597,6 +597,7 @@ def build_dataset(index, name): args = get_args() dataset = UL2Dataset( indexed_dataset=indexed_dataset, + model_type=args.ul2_model_type, denoiser_ratios=args.ul2_denoiser_ratios, denoisers=args.ul2_denoisers, mean_span_lengths=args.ul2_mean_span_lengths, diff --git a/megatron/data/ul2_dataset.py b/megatron/data/ul2_dataset.py index 4f2d333a1..7fc3e6f32 100644 --- a/megatron/data/ul2_dataset.py +++ b/megatron/data/ul2_dataset.py @@ -15,6 +15,8 @@ """UL2-style dataset.""" +import math + import numpy as np from megatron import get_tokenizer @@ -23,16 +25,34 @@ get_samples_mapping, SamplingStyle ) -from megatron.data.t5_dataset import pad_and_convert_to_numpy, T5Dataset +from megatron.data.t5_dataset import ( + make_history_mask, + merge_subsequent_masks, + pad_and_convert_to_numpy, + T5Dataset, +) +from megatron.enums import UL2ModelType + + +def is_decoder_only(ul2_model_type): + """Return whether we use a decoder-only model.""" + assert isinstance(ul2_model_type, UL2ModelType) + return ul2_model_type is not UL2ModelType.ENCODER_DECODER + + +def is_prefix_lm(ul2_model_type): + """Return whether we use a non-causal decoder-only model.""" + assert isinstance(ul2_model_type, UL2ModelType) + return ul2_model_type is UL2ModelType.NON_CAUSAL_DECODER class UL2Dataset(T5Dataset): def __init__(self, name, indexed_dataset, data_prefix, - num_epochs, max_num_samples, denoiser_ratios, - denoisers, mean_span_lengths, mask_ratios, - denoiser_tokens, max_seq_length, max_seq_length_dec, - short_seq_prob, seed): + num_epochs, max_num_samples, model_type, + denoiser_ratios, denoisers, mean_span_lengths, + mask_ratios, denoiser_tokens, max_seq_length, + max_seq_length_dec, short_seq_prob, seed): if denoiser_ratios is None: # Uniform distribution by default. @@ -52,6 +72,7 @@ def __init__(self, name, indexed_dataset, data_prefix, short_seq_prob, seed) # Params to store. 
+ self.model_type = model_type self.denoiser_ratios = [ denoiser_ratio / sum(denoiser_ratios) for denoiser_ratio in denoiser_ratios @@ -97,21 +118,21 @@ def __getitem__(self, idx): self.vocab_id_to_token_dict, self.cls_ids, self.sep_id, self.mask_id, self.pad_id, - self.denoiser_ratios, self.denoisers, - self.mean_span_lengths, self.mask_ratios, - np_rng, - self.bos_id, self.eos_id, - self.sentinel_tokens) + self.model_type, self.denoiser_ratios, + self.denoisers, self.mean_span_lengths, + self.mask_ratios, np_rng, self.bos_id, + self.eos_id, self.sentinel_tokens) def build_training_sample(sample, target_seq_length, max_seq_length, max_seq_length_dec, vocab_id_list, vocab_id_to_token_dict, cls_ids, sep_id, mask_id, pad_id, - denoiser_ratios, denoisers, - mean_span_lengths, mask_ratios, - np_rng, bos_id=None, - eos_id=None, sentinel_tokens=None): + model_type, denoiser_ratios, + denoisers, mean_span_lengths, + mask_ratios, np_rng, + bos_id=None, eos_id=None, + sentinel_tokens=None): """Build training sample. Arguments: @@ -125,6 +146,7 @@ def build_training_sample(sample, target_seq_length, sep_id: Separator id. mask_id: Mask token id. pad_id: Padding token id. + model_type: What type of model is used. denoiser_ratios: Probability of each denoising objective to be selected. denoisers: What type of UL2 denoising objective the other UL2 configurations refer to. @@ -139,24 +161,28 @@ def build_training_sample(sample, target_seq_length, sentinel_tokens: unique value to be substituted for every replaced span """ + # Denoiser selection + denoiser_index = np_rng.choice(np.arange(len(denoisers)), p=denoiser_ratios) + denoiser = denoisers[denoiser_index] + masked_lm_prob = mask_ratios[denoiser_index] + assert target_seq_length <= max_seq_length # flatten sentences into one list tokens = [token for sentence in sample for token in sentence] - # Truncate to `target_sequence_length`. max_num_tokens = target_seq_length - truncated = len(tokens) > max_num_tokens - tokens = tokens[:max_num_tokens] - - # Denoiser selection - denoiser_index = np_rng.choice(np.arange(len(denoisers)), p=denoiser_ratios) - denoiser = denoisers[denoiser_index] - masked_lm_prob = mask_ratios[denoiser_index] - mean_ngrams = mean_span_lengths[denoiser_index] - if mean_ngrams < 1: - mean_ngrams = round(len(tokens) * mean_ngrams) - max_ngrams = mean_ngrams * 2 - 1 + if is_decoder_only(model_type): + # Keep space for repeated `extra_id` tokens; not the most data + # efficient since we calculate this based on the maximum number + # of possible `extra_id` tokens. + safe_max_seq_len = math.floor(max_num_tokens / (1 + masked_lm_prob)) + truncated = len(tokens) > safe_max_seq_len + tokens = tokens[:safe_max_seq_len] + else: + # Truncate to `target_sequence_length`. + truncated = len(tokens) > max_num_tokens + tokens = tokens[:max_num_tokens] # Prepend objective token. cls_id = cls_ids.get(denoiser) @@ -166,6 +192,11 @@ def build_training_sample(sample, target_seq_length, # Masking. max_predictions_per_seq = masked_lm_prob * len(tokens) + mean_ngrams = mean_span_lengths[denoiser_index] + if mean_ngrams < 1: + mean_ngrams = round(len(tokens) * mean_ngrams) + max_ngrams = mean_ngrams * 2 - 1 + if denoiser == 'R' or denoiser == 'X': sampling_style = SamplingStyle.NORMAL prefix_lm = False @@ -183,22 +214,64 @@ def build_training_sample(sample, target_seq_length, sampling_style=sampling_style, prefix_lm=prefix_lm, ) - # Padding. 
- tokens_enc, tokens_dec_in, labels, enc_mask, \ - dec_mask, enc_dec_mask, loss_mask \ - = pad_and_convert_to_numpy(tokens, masked_positions, - masked_labels, pad_id, max_seq_length, - max_seq_length_dec, masked_spans, - bos_id, eos_id, sentinel_tokens) - - train_sample = { - 'text_enc': tokens_enc, - 'text_dec': tokens_dec_in, - 'labels': labels, - 'loss_mask': loss_mask, - 'truncated': int(truncated), - 'enc_mask': enc_mask, - 'dec_mask': dec_mask, - 'enc_dec_mask': enc_dec_mask, - } + if is_decoder_only(model_type): + # Concatenate to one sequence. + tokens_enc, tokens_dec_in, labels = merge_subsequent_masks( + tokens, masked_spans, bos_id, eos_id, sentinel_tokens) + + # Move EOS tokens to end of sequence. + while tokens_enc[-1] == eos_id: + del tokens_enc[-1] + tokens_dec_in.append(eos_id) + labels.append(eos_id) + + num_labels = len(labels) + + # Move BOS token to start of sequence. + tokens_dec_in = tokens_dec_in[1:] + tokens = np.concatenate([ + np.array([bos_id], dtype=np.int64), + tokens_enc, + np.array([sep_id], dtype=np.int64), + tokens_dec_in, + ]) + labels = np.concatenate([ + tokens_enc, + np.array([sep_id], dtype=np.int64), + labels, + ]) + + loss_mask = np.zeros(len(tokens), dtype=np.int64) + loss_mask[-num_labels:] = 1 + + dec_mask = make_history_mask(tokens) + if is_prefix_lm(model_type): + dec_mask[:-num_labels, :-num_labels] = 1 + + train_sample = { + 'text': tokens, + 'labels': labels, + 'loss_mask': loss_mask, + 'truncated': int(truncated), + 'dec_mask': dec_mask, + } + else: + # Padding. + tokens_enc, tokens_dec_in, labels, enc_mask, \ + dec_mask, enc_dec_mask, loss_mask \ + = pad_and_convert_to_numpy(tokens, masked_positions, + masked_labels, pad_id, max_seq_length, + max_seq_length_dec, masked_spans, + bos_id, eos_id, sentinel_tokens) + + train_sample = { + 'text_enc': tokens_enc, + 'text_dec': tokens_dec_in, + 'labels': labels, + 'loss_mask': loss_mask, + 'truncated': int(truncated), + 'enc_mask': enc_mask, + 'dec_mask': dec_mask, + 'enc_dec_mask': enc_dec_mask, + } return train_sample diff --git a/megatron/enums.py b/megatron/enums.py index 90d00a071..2961cbb66 100644 --- a/megatron/enums.py +++ b/megatron/enums.py @@ -33,3 +33,8 @@ class PositionEmbeddingType(enum.Enum): rotary = 1 absolute = 2 alibi = 3 + +class UL2ModelType(enum.Enum): + ENCODER_DECODER = 'ED' + NON_CAUSAL_DECODER = 'ND' + CAUSAL_DECODER = 'CD' diff --git a/pretrain_ul2.py b/pretrain_ul2.py index 04b2b0dc6..cab24ced0 100644 --- a/pretrain_ul2.py +++ b/pretrain_ul2.py @@ -26,26 +26,56 @@ print_rank_0 ) from megatron.data.dataset_utils import build_train_valid_test_datasets -from megatron.model.t5_model import T5Model +from megatron.data.ul2_dataset import ( + is_decoder_only as _is_decoder_only, + is_prefix_lm as _is_prefix_lm, +) +from megatron.model.gpt_model import GPTModel +from megatron.model.t5_model import T5Model, t5_position_ids from megatron.training import pretrain from megatron.utils import average_losses_across_data_parallel_group +def is_decoder_only(): + """Return whether we use a decoder-only model.""" + args = get_args() + return _is_decoder_only(args.ul2_model_type) + + +def is_prefix_lm(): + """Return whether we use a non-causal decoder-only model.""" + args = get_args() + return _is_prefix_lm(args.ul2_model_type) + + def model_provider(pre_process=True, post_process=True): """Build the model.""" assert pre_process and post_process, "UL2 doesn't yet support pipelining" print_rank_0('building UL2 model ...') - model = T5Model(num_tokentypes=0, - parallel_output=True) + if 
is_decoder_only(): + print_rank_0('Using decoder-only UL2 model.') + model = GPTModel( + num_tokentypes=0, + parallel_output=True, + pre_process=pre_process, + post_process=post_process, + prefix_lm=is_prefix_lm(), + ) + else: + print_rank_0('Using encoder-decoder UL2 model.') + model = T5Model(num_tokentypes=0, parallel_output=True) return model def get_batch(data_iterator): """Build the batch.""" - keys = ['text_enc', 'text_dec', 'labels', 'loss_mask', - 'enc_mask', 'dec_mask', 'enc_dec_mask'] + if is_decoder_only(): + keys = ['text', 'labels', 'loss_mask', 'dec_mask'] + else: + keys = ['text_enc', 'text_dec', 'labels', 'loss_mask', + 'enc_mask', 'dec_mask', 'enc_dec_mask'] datatype = torch.int64 # Broadcast data. @@ -56,21 +86,32 @@ def get_batch(data_iterator): data_b = mpu.broadcast_data(keys, data, datatype) # Unpack. - tokens_enc = data_b['text_enc'].long() - tokens_dec = data_b['text_dec'].long() - labels = data_b['labels'].long() - loss_mask = data_b['loss_mask'].float() + if is_decoder_only(): + tokens = data_b['text'].long() + labels = data_b['labels'].long() + loss_mask = data_b['loss_mask'].float() + + dec_mask = (data_b['dec_mask'] < 0.5) + return tokens, loss_mask, labels, dec_mask + else: + tokens_enc = data_b['text_enc'].long() + tokens_dec = data_b['text_dec'].long() + labels = data_b['labels'].long() + loss_mask = data_b['loss_mask'].float() - enc_mask = (data_b['enc_mask'] < 0.5) - dec_mask = (data_b['dec_mask'] < 0.5) - enc_dec_mask = (data_b['enc_dec_mask'] < 0.5) + enc_mask = (data_b['enc_mask'] < 0.5) + dec_mask = (data_b['dec_mask'] < 0.5) + enc_dec_mask = (data_b['enc_dec_mask'] < 0.5) - return tokens_enc, tokens_dec, loss_mask, labels, \ - enc_mask, dec_mask, enc_dec_mask + return tokens_enc, tokens_dec, loss_mask, labels, \ + enc_mask, dec_mask, enc_dec_mask def loss_func(loss_mask, output_tensor): - lm_loss_, _ = output_tensor + if is_decoder_only(): + lm_loss_ = output_tensor + else: + lm_loss_, _ = output_tensor lm_loss_ = lm_loss_.float() lm_loss = torch.sum( @@ -89,18 +130,28 @@ def forward_step(data_iterator, model): # Get the batch. 
timers('batch generator').start() - tokens_enc, tokens_dec, loss_mask, lm_labels, enc_mask, dec_mask, enc_dec_mask \ - = get_batch(data_iterator) + if is_decoder_only(): + (tokens, loss_mask, lm_labels, dec_mask) = get_batch(data_iterator) + else: + ( + tokens_enc, tokens_dec, loss_mask, lm_labels, + enc_mask, dec_mask, enc_dec_mask, + ) = get_batch(data_iterator) timers('batch generator').stop() # Forward model lm_labels - output_tensor = model(tokens_enc, - tokens_dec, - enc_mask, - dec_mask, - enc_dec_mask, - tokentype_ids=None, - lm_labels=lm_labels) + if is_decoder_only(): + position_ids = t5_position_ids(tokens) + output_tensor = model(tokens, position_ids, dec_mask, + labels=lm_labels) + else: + output_tensor = model(tokens_enc, + tokens_dec, + enc_mask, + dec_mask, + enc_dec_mask, + tokentype_ids=None, + lm_labels=lm_labels) return output_tensor, partial(loss_func, loss_mask) From 42ece6b836e2f2858718aa1cd0a6ee0d426fe774 Mon Sep 17 00:00:00 2001 From: janEbert Date: Tue, 13 Dec 2022 20:24:55 +0100 Subject: [PATCH 009/122] Unconditionally use safe maximum sequence length --- megatron/data/ul2_dataset.py | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/megatron/data/ul2_dataset.py b/megatron/data/ul2_dataset.py index 7fc3e6f32..508b3b455 100644 --- a/megatron/data/ul2_dataset.py +++ b/megatron/data/ul2_dataset.py @@ -172,17 +172,12 @@ def build_training_sample(sample, target_seq_length, tokens = [token for sentence in sample for token in sentence] max_num_tokens = target_seq_length - if is_decoder_only(model_type): - # Keep space for repeated `extra_id` tokens; not the most data - # efficient since we calculate this based on the maximum number - # of possible `extra_id` tokens. - safe_max_seq_len = math.floor(max_num_tokens / (1 + masked_lm_prob)) - truncated = len(tokens) > safe_max_seq_len - tokens = tokens[:safe_max_seq_len] - else: - # Truncate to `target_sequence_length`. - truncated = len(tokens) > max_num_tokens - tokens = tokens[:max_num_tokens] + # Keep space for repeated `extra_id` tokens; not the most data + # efficient since we calculate this based on the maximum number + # of possible `extra_id` tokens. + safe_max_seq_len = math.floor(max_num_tokens / (1 + masked_lm_prob)) + truncated = len(tokens) > safe_max_seq_len + tokens = tokens[:safe_max_seq_len] # Prepend objective token. cls_id = cls_ids.get(denoiser) From d18f84e5aa902ad68d527cfea215553b2b03c572 Mon Sep 17 00:00:00 2001 From: janEbert Date: Wed, 14 Dec 2022 08:55:03 +0100 Subject: [PATCH 010/122] Add custom exceptions ... which also improve error messages. --- megatron/data/t5_dataset.py | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/megatron/data/t5_dataset.py b/megatron/data/t5_dataset.py index f952de5d2..be52206ec 100644 --- a/megatron/data/t5_dataset.py +++ b/megatron/data/t5_dataset.py @@ -26,6 +26,27 @@ get_samples_mapping ) + +class LengthExceededError(ValueError): + def __init__(self, msg=None): + if msg is None: + msg = ( + 'The sequence input became too long. ' + 'Try to increase `--seq-length` or `--encoder-seq-length`.' + ) + super().__init__(msg) + + +class DecoderLengthExceededError(ValueError): + def __init__(self, msg=None): + if msg is None: + msg = ( + 'The sequence input for the decoder became too long. ' + 'Try to increase `--decoder-seq-length`.' 
+ ) + super().__init__(msg) + + class T5Dataset(torch.utils.data.Dataset): def __init__(self, name, indexed_dataset, data_prefix, @@ -205,7 +226,8 @@ def pad_and_convert_to_numpy(tokens, masked_positions, # Encoder-side padding mask. num_tokens = len(t5_input) padding_length = max_seq_length - num_tokens - assert padding_length >= 0 + if padding_length < 0: + raise LengthExceededError() assert len(masked_positions) == len(masked_labels) # Tokens.. @@ -215,7 +237,8 @@ def pad_and_convert_to_numpy(tokens, masked_positions, # Decoder-side padding mask. num_tokens_dec = len(t5_decoder_in) padding_length_dec = max_seq_length_dec - num_tokens_dec - assert padding_length_dec >= 0 + if padding_length_dec < 0: + raise DecoderLengthExceededError() filler_dec = [pad_id] * padding_length_dec tokens_dec_in = np.array(t5_decoder_in + filler_dec, dtype=np.int64) From fa5aa68b8f8980012d5c4640e79516700f346972 Mon Sep 17 00:00:00 2001 From: janEbert Date: Wed, 14 Dec 2022 08:56:34 +0100 Subject: [PATCH 011/122] Error out on too long sequences --- megatron/data/ul2_dataset.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/megatron/data/ul2_dataset.py b/megatron/data/ul2_dataset.py index 508b3b455..e29c21f15 100644 --- a/megatron/data/ul2_dataset.py +++ b/megatron/data/ul2_dataset.py @@ -26,6 +26,7 @@ SamplingStyle ) from megatron.data.t5_dataset import ( + LengthExceededError, make_history_mask, merge_subsequent_masks, pad_and_convert_to_numpy, @@ -236,6 +237,9 @@ def build_training_sample(sample, target_seq_length, labels, ]) + if max_seq_length - len(tokens) < 0: + raise LengthExceededError() + loss_mask = np.zeros(len(tokens), dtype=np.int64) loss_mask[-num_labels:] = 1 From c7d8a8ba9bb9cac0af9944c80c092a5f7339cbc0 Mon Sep 17 00:00:00 2001 From: janEbert Date: Wed, 14 Dec 2022 08:57:10 +0100 Subject: [PATCH 012/122] Remove additional sequence truncation Instead, the user should choose a larger maximum sequence length, which an error warns them about. --- megatron/data/ul2_dataset.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/megatron/data/ul2_dataset.py b/megatron/data/ul2_dataset.py index e29c21f15..4ce6cc7eb 100644 --- a/megatron/data/ul2_dataset.py +++ b/megatron/data/ul2_dataset.py @@ -173,12 +173,17 @@ def build_training_sample(sample, target_seq_length, tokens = [token for sentence in sample for token in sentence] max_num_tokens = target_seq_length - # Keep space for repeated `extra_id` tokens; not the most data - # efficient since we calculate this based on the maximum number - # of possible `extra_id` tokens. - safe_max_seq_len = math.floor(max_num_tokens / (1 + masked_lm_prob)) - truncated = len(tokens) > safe_max_seq_len - tokens = tokens[:safe_max_seq_len] + # if is_decoder_only(model_type): + # # Keep space for repeated `extra_id` tokens; not the most data + # # efficient since we calculate this based on the maximum number + # # of possible `extra_id` tokens. + # safe_max_seq_len = math.floor(max_num_tokens / (1 + masked_lm_prob)) + # truncated = len(tokens) > safe_max_seq_len + # tokens = tokens[:safe_max_seq_len] + # else: + # Truncate to `target_sequence_length`. + truncated = len(tokens) > max_num_tokens + tokens = tokens[:max_num_tokens] # Prepend objective token. cls_id = cls_ids.get(denoiser) From c722516376cd3aebb3312dda42f512a12f9679b9 Mon Sep 17 00:00:00 2001 From: janEbert Date: Wed, 14 Dec 2022 09:35:01 +0100 Subject: [PATCH 013/122] Prefer array-from-list creation Instead of concatenating arrays and lists to get a certain dtype. 
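As a rough standalone illustration of the rationale above (token id values and variable names here are made up, not taken from the patch): building the final sequence as one Python list and converting it once lets the dtype be stated in a single place, whereas concatenating a mix of small typed arrays and plain lists leaves the result dtype to NumPy's promotion rules.

import numpy as np

bos_id, sep_id = 101, 102
tokens_enc, tokens_dec_in = [5, 6, 7], [8, 9]

# Concatenating lists with small typed arrays: dtype comes from promotion.
via_concat = np.concatenate([
    np.array([bos_id], dtype=np.int64),
    tokens_enc,
    np.array([sep_id], dtype=np.int64),
    tokens_dec_in,
])

# Building one list first and converting once: dtype stated explicitly.
via_list = np.array([bos_id] + tokens_enc + [sep_id] + tokens_dec_in,
                    dtype=np.int64)

assert via_list.dtype == np.int64 and (via_concat == via_list).all()
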
--- megatron/data/ul2_dataset.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/megatron/data/ul2_dataset.py b/megatron/data/ul2_dataset.py index 4ce6cc7eb..4f80c973b 100644 --- a/megatron/data/ul2_dataset.py +++ b/megatron/data/ul2_dataset.py @@ -230,17 +230,17 @@ def build_training_sample(sample, target_seq_length, # Move BOS token to start of sequence. tokens_dec_in = tokens_dec_in[1:] - tokens = np.concatenate([ - np.array([bos_id], dtype=np.int64), - tokens_enc, - np.array([sep_id], dtype=np.int64), - tokens_dec_in, - ]) - labels = np.concatenate([ - tokens_enc, - np.array([sep_id], dtype=np.int64), - labels, - ]) + tokens = np.array(( + [bos_id] + + tokens_enc + + [sep_id] + + tokens_dec_in + ), dtype=np.int64) + labels = np.array(( + tokens_enc + + [sep_id] + + labels + ), dtype=np.int64) if max_seq_length - len(tokens) < 0: raise LengthExceededError() From 69f6e7077be8e4e7b6d8389351e6bb25d299863a Mon Sep 17 00:00:00 2001 From: janEbert Date: Mon, 2 Jan 2023 11:51:35 +0100 Subject: [PATCH 014/122] Remove redundant imports --- megatron/data/ul2_dataset.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/megatron/data/ul2_dataset.py b/megatron/data/ul2_dataset.py index 4f80c973b..0a5f9a100 100644 --- a/megatron/data/ul2_dataset.py +++ b/megatron/data/ul2_dataset.py @@ -15,14 +15,11 @@ """UL2-style dataset.""" -import math - import numpy as np from megatron import get_tokenizer from megatron.data.dataset_utils import ( create_masked_lm_predictions, - get_samples_mapping, SamplingStyle ) from megatron.data.t5_dataset import ( From f08a104b64079172d99fd856ab61e8bf68f28049 Mon Sep 17 00:00:00 2001 From: janEbert Date: Tue, 3 Jan 2023 12:02:03 +0100 Subject: [PATCH 015/122] Fix not inserting prefixes For small sequence lengths or low probability/mean ngram values, we could get `max_ngrams` < 1 and `max_predictions_per_seq` < 1, causing no masking to be done. --- megatron/data/ul2_dataset.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/megatron/data/ul2_dataset.py b/megatron/data/ul2_dataset.py index 0a5f9a100..892e94b3a 100644 --- a/megatron/data/ul2_dataset.py +++ b/megatron/data/ul2_dataset.py @@ -189,10 +189,12 @@ def build_training_sample(sample, target_seq_length, tokens = [cls_id] + tokens # Masking. - max_predictions_per_seq = masked_lm_prob * len(tokens) + # Ensure we always have at least one prediction. + max_predictions_per_seq = max(1.0, masked_lm_prob * len(tokens)) mean_ngrams = mean_span_lengths[denoiser_index] if mean_ngrams < 1: - mean_ngrams = round(len(tokens) * mean_ngrams) + # Ensure we always obtain at least one `max_ngrams`. + mean_ngrams = max(1, round(len(tokens) * mean_ngrams)) max_ngrams = mean_ngrams * 2 - 1 if denoiser == 'R' or denoiser == 'X': From d2fd03e66c832d0d10aad7523cf9289a804a12f6 Mon Sep 17 00:00:00 2001 From: janEbert Date: Tue, 3 Jan 2023 12:05:33 +0100 Subject: [PATCH 016/122] Do not insert `extra_id` tokens for PrefixLM task Now same as in the UL2 paper code snippet. 
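A toy sketch of the two layouts this change distinguishes, assuming plain lists of token ids, spans given as (start, end) index pairs, and made-up sentinel ids (the patched `merge_subsequent_masks` instead works on span objects with `index`/`label` fields and a deque of sentinel tokens): R/X-style span corruption replaces every masked span with a sentinel on the input side and repeats that sentinel before the span tokens on the target side, while S-style (PrefixLM) denoising has a single span at the tail and inserts no sentinel at all.

def span_corruption_layout(tokens, spans, sentinels):
    # Input keeps the unmasked text plus one sentinel per span; the target
    # repeats each sentinel followed by the tokens it replaced.
    inputs, targets = [], []
    cursor = 0
    for sentinel, (start, end) in zip(sentinels, spans):
        inputs += tokens[cursor:start] + [sentinel]
        targets += [sentinel] + tokens[start:end]
        cursor = end
    return inputs + tokens[cursor:], targets

def prefix_lm_layout(tokens, split):
    # S-denoising: one span at the end of the sequence, no sentinel.
    return tokens[:split], tokens[split:]

tokens = list(range(10))
print(span_corruption_layout(tokens, [(2, 4), (7, 8)], [900, 901]))
# ([0, 1, 900, 4, 5, 6, 901, 8, 9], [900, 2, 3, 901, 7])
print(prefix_lm_layout(tokens, 6))
# ([0, 1, 2, 3, 4, 5], [6, 7, 8, 9])
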
--- megatron/data/t5_dataset.py | 25 ++++++++++++++++--------- megatron/data/ul2_dataset.py | 5 +++-- 2 files changed, 19 insertions(+), 11 deletions(-) diff --git a/megatron/data/t5_dataset.py b/megatron/data/t5_dataset.py index be52206ec..31ac7bbee 100644 --- a/megatron/data/t5_dataset.py +++ b/megatron/data/t5_dataset.py @@ -179,23 +179,29 @@ def build_training_sample(sample, target_seq_length, def merge_subsequent_masks(tokens, masked_spans=None, bos_id=None, - eos_id=None, sentinel_tokens=None): - sentinel_tokens = collections.deque(sentinel_tokens) + eos_id=None, sentinel_tokens=None, prefix_lm=False): + if prefix_lm: + assert len(masked_spans) <= 1, \ + 'Received more than one masked span for PrefixLM masking' + else: + sentinel_tokens = collections.deque(sentinel_tokens) t5_input = [] (t5_decoder_in, t5_decoder_out) = ([bos_id], []) (start_index, end_index) = (0, None) for span in masked_spans: - flag = sentinel_tokens.popleft() + if not prefix_lm: + flag = sentinel_tokens.popleft() - # Append the same tokens in decoder input and output - t5_decoder_in.append(flag) + # Append the same tokens in decoder input and output + t5_decoder_in.append(flag) + t5_decoder_out.append(flag) t5_decoder_in.extend(span.label) - t5_decoder_out.append(flag) t5_decoder_out.extend(span.label) end_index = span.index[0] t5_input.extend(tokens[start_index: end_index]) - t5_input.append(flag) + if not prefix_lm: + t5_input.append(flag) # the next start index is the token after the last span token start_index = span.index[-1] + 1 @@ -212,11 +218,12 @@ def pad_and_convert_to_numpy(tokens, masked_positions, masked_labels, pad_id, max_seq_length, max_seq_length_dec, masked_spans=None, bos_id=None, - eos_id=None, sentinel_tokens=None): + eos_id=None, sentinel_tokens=None, + prefix_lm=False): """Pad sequences and convert them to numpy.""" t5_input, t5_decoder_in, t5_decoder_out = merge_subsequent_masks( - tokens, masked_spans, bos_id, eos_id, sentinel_tokens) + tokens, masked_spans, bos_id, eos_id, sentinel_tokens, prefix_lm) # assert (len(t5_input) - len(masked_spans)) + \ # (len(t5_decoder_in) - (len(masked_spans) + 1)) == len(tokens) diff --git a/megatron/data/ul2_dataset.py b/megatron/data/ul2_dataset.py index 892e94b3a..d76bdbf63 100644 --- a/megatron/data/ul2_dataset.py +++ b/megatron/data/ul2_dataset.py @@ -217,7 +217,7 @@ def build_training_sample(sample, target_seq_length, if is_decoder_only(model_type): # Concatenate to one sequence. tokens_enc, tokens_dec_in, labels = merge_subsequent_masks( - tokens, masked_spans, bos_id, eos_id, sentinel_tokens) + tokens, masked_spans, bos_id, eos_id, sentinel_tokens, prefix_lm) # Move EOS tokens to end of sequence. 
while tokens_enc[-1] == eos_id: @@ -265,7 +265,8 @@ def build_training_sample(sample, target_seq_length, = pad_and_convert_to_numpy(tokens, masked_positions, masked_labels, pad_id, max_seq_length, max_seq_length_dec, masked_spans, - bos_id, eos_id, sentinel_tokens) + bos_id, eos_id, sentinel_tokens, + prefix_lm) train_sample = { 'text_enc': tokens_enc, From daf52cc03fd0c1b6262097f6350a11059463405e Mon Sep 17 00:00:00 2001 From: janEbert Date: Tue, 3 Jan 2023 12:08:03 +0100 Subject: [PATCH 017/122] Document `max_seq_length_dec` argument --- megatron/data/t5_dataset.py | 2 ++ megatron/data/ul2_dataset.py | 2 ++ 2 files changed, 4 insertions(+) diff --git a/megatron/data/t5_dataset.py b/megatron/data/t5_dataset.py index 31ac7bbee..b3c141db0 100644 --- a/megatron/data/t5_dataset.py +++ b/megatron/data/t5_dataset.py @@ -125,6 +125,8 @@ def build_training_sample(sample, target_seq_length, target_seq_length: Desired sequence length. max_seq_length: Maximum length of the sequence. All values are padded to this length. + max_seq_length_dec: Maximum length of the decoder input sequence. All + values are padded to this length. vocab_id_list: List of vocabulary ids. Used to pick a random id. vocab_id_to_token_dict: A dictionary from vocab ids to text tokens. cls_id: Start of example id. diff --git a/megatron/data/ul2_dataset.py b/megatron/data/ul2_dataset.py index d76bdbf63..3fef81deb 100644 --- a/megatron/data/ul2_dataset.py +++ b/megatron/data/ul2_dataset.py @@ -138,6 +138,8 @@ def build_training_sample(sample, target_seq_length, target_seq_length: Desired sequence length. max_seq_length: Maximum length of the sequence. All values are padded to this length. + max_seq_length_dec: Maximum length of the decoder input sequence. All + values are padded to this length. vocab_id_list: List of vocabulary ids. Used to pick a random id. vocab_id_to_token_dict: A dictionary from vocab ids to text tokens. cls_ids: Start of example ids. From 04be5905414d18581fad443356febbd2b22ed1dd Mon Sep 17 00:00:00 2001 From: janEbert Date: Tue, 3 Jan 2023 12:08:34 +0100 Subject: [PATCH 018/122] Skip redundant computations --- megatron/data/dataset_utils.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/megatron/data/dataset_utils.py b/megatron/data/dataset_utils.py index 60d4e0d90..479063cf6 100644 --- a/megatron/data/dataset_utils.py +++ b/megatron/data/dataset_utils.py @@ -270,6 +270,10 @@ def create_masked_lm_predictions(tokens, if cand_indexes[last_cand_index_index][-1] < len(tokens) - 1: continue ngram_index.append(cand_indexes[idx:idx + n]) + if prefix_lm: + # No need to go further – we would only produce + # duplicate entries by continuing for this `idx`. + break ngram_indexes.append(ngram_index) np_rng.shuffle(ngram_indexes) From 7bc5a8779fefef0916f013c2fd6b45036849dc21 Mon Sep 17 00:00:00 2001 From: janEbert Date: Tue, 3 Jan 2023 12:08:56 +0100 Subject: [PATCH 019/122] Fix PrefixLM mean location --- megatron/data/dataset_utils.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/megatron/data/dataset_utils.py b/megatron/data/dataset_utils.py index 479063cf6..4fb010129 100644 --- a/megatron/data/dataset_utils.py +++ b/megatron/data/dataset_utils.py @@ -246,6 +246,12 @@ def create_masked_lm_predictions(tokens, if masked_lm_prob == 0: return (output_tokens, masked_lm_positions, masked_lm_labels, token_boundary) + if prefix_lm: + # Adjust probabilities so that the mean is centered at the + # correct position. + # If we do not do this, the mean is at + # `len(tokens) * masked_lm_prob / 2`. 
+ masked_lm_prob *= 2 num_to_predict = min(max_predictions_per_seq, max(1, int(round(len(tokens) * masked_lm_prob)))) From 775e99d8deeb32e6929a7d3f2bb83708759797c8 Mon Sep 17 00:00:00 2001 From: janEbert Date: Tue, 3 Jan 2023 12:37:37 +0100 Subject: [PATCH 020/122] Pad decoder-only inputs to same length --- megatron/data/ul2_dataset.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/megatron/data/ul2_dataset.py b/megatron/data/ul2_dataset.py index 3fef81deb..0a4820da5 100644 --- a/megatron/data/ul2_dataset.py +++ b/megatron/data/ul2_dataset.py @@ -231,27 +231,35 @@ def build_training_sample(sample, target_seq_length, # Move BOS token to start of sequence. tokens_dec_in = tokens_dec_in[1:] - tokens = np.array(( + tokens = ( [bos_id] + tokens_enc + [sep_id] + tokens_dec_in - ), dtype=np.int64) + ) + + # Pad and convert to NumPy. + padding_length = max_seq_length - len(tokens) + if padding_length < 0: + raise LengthExceededError() + filler = [pad_id] * padding_length + + tokens = np.array(tokens + filler, dtype=np.int64) labels = np.array(( tokens_enc + [sep_id] + labels + + filler ), dtype=np.int64) - if max_seq_length - len(tokens) < 0: - raise LengthExceededError() - loss_mask = np.zeros(len(tokens), dtype=np.int64) - loss_mask[-num_labels:] = 1 + labels_start_neg_index = -(num_labels + padding_length) + labels_end_neg_index = -padding_length if padding_length > 0 else None + loss_mask[labels_start_neg_index:labels_end_neg_index] = 1 dec_mask = make_history_mask(tokens) if is_prefix_lm(model_type): - dec_mask[:-num_labels, :-num_labels] = 1 + dec_mask[:labels_start_neg_index, :labels_start_neg_index] = 1 train_sample = { 'text': tokens, From 538c30bf6c4ec9b8d41432f3a1e86f17978e74ff Mon Sep 17 00:00:00 2001 From: janEbert Date: Tue, 3 Jan 2023 13:38:17 +0100 Subject: [PATCH 021/122] Fix decoder-only attention mask shape --- pretrain_ul2.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pretrain_ul2.py b/pretrain_ul2.py index cab24ced0..8a6d4b136 100644 --- a/pretrain_ul2.py +++ b/pretrain_ul2.py @@ -92,6 +92,7 @@ def get_batch(data_iterator): loss_mask = data_b['loss_mask'].float() dec_mask = (data_b['dec_mask'] < 0.5) + dec_mask = dec_mask.unsqueeze(1) return tokens, loss_mask, labels, dec_mask else: tokens_enc = data_b['text_enc'].long() From ba4476c75c7d2137d68b3474eab5d049a9667814 Mon Sep 17 00:00:00 2001 From: janEbert Date: Mon, 23 Jan 2023 11:48:47 +0100 Subject: [PATCH 022/122] Document index set selection for PrefixLM masking --- megatron/data/dataset_utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/megatron/data/dataset_utils.py b/megatron/data/dataset_utils.py index 4fb010129..a297608ff 100644 --- a/megatron/data/dataset_utils.py +++ b/megatron/data/dataset_utils.py @@ -272,6 +272,8 @@ def create_masked_lm_predictions(tokens, ngram_index = [] for n in ngrams: if prefix_lm: + # Select those index sets for which the final index is + # at the end of the sequence. last_cand_index_index = min(idx + n - 1, len(cand_indexes) - 1) if cand_indexes[last_cand_index_index][-1] < len(tokens) - 1: continue From 678fbdcaf9f893459445a15d89e58adf983540ad Mon Sep 17 00:00:00 2001 From: janEbert Date: Mon, 23 Jan 2023 11:50:00 +0100 Subject: [PATCH 023/122] Fix `max_ngrams` for normal sampling style Since the normal distribution is unbounded, we cannot have `max_ngrams` set to a bounded value. 
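A minimal sketch of the sampling this message refers to, with made-up numbers: the nominal `max_ngrams` only fixes the centre of the normal distribution, and the draw is clipped to however many candidate n-grams remain rather than to `max_ngrams` itself (NumPy's default standard deviation of 1.0 is assumed, as in the patched code).

import numpy as np

np_rng = np.random.RandomState(seed=0)
max_ngrams = 9                      # nominal value, used only for the mean
normal_mean = (max_ngrams + 1) / 2  # centre of the distribution
available = 25                      # candidate n-grams left at this position

# Unbounded draw, then clip into [1, available] and round to an integer.
n = int(round(np.clip(np_rng.normal(loc=normal_mean), 1, available)))
assert 1 <= n <= available
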
--- megatron/data/dataset_utils.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/megatron/data/dataset_utils.py b/megatron/data/dataset_utils.py index a297608ff..c7092f437 100644 --- a/megatron/data/dataset_utils.py +++ b/megatron/data/dataset_utils.py @@ -256,6 +256,10 @@ def create_masked_lm_predictions(tokens, num_to_predict = min(max_predictions_per_seq, max(1, int(round(len(tokens) * masked_lm_prob)))) + if sampling_style is SamplingStyle.NORMAL: + normal_mean = (max_ngrams + 1) / 2 + max_ngrams = len(tokens) - 1 + ngrams = np.arange(1, max_ngrams + 1, dtype=np.int64) if sampling_style is SamplingStyle.POISSON: # Note(mingdachen): @@ -264,8 +268,6 @@ def create_masked_lm_predictions(tokens, pvals /= pvals.sum(keepdims=True) if favor_longer_ngram: pvals = pvals[::-1] - elif sampling_style is SamplingStyle.NORMAL: - normal_mean = (max_ngrams + 1) / 2 ngram_indexes = [] for idx in range(len(cand_indexes)): From 00479e5ddfea3cc84adc49bd034d05bfeea81181 Mon Sep 17 00:00:00 2001 From: janEbert Date: Mon, 23 Jan 2023 17:00:06 +0100 Subject: [PATCH 024/122] Do not limit `max_predictions_per_seq` --- megatron/data/ul2_dataset.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/megatron/data/ul2_dataset.py b/megatron/data/ul2_dataset.py index 0a4820da5..024b83d1d 100644 --- a/megatron/data/ul2_dataset.py +++ b/megatron/data/ul2_dataset.py @@ -191,8 +191,6 @@ def build_training_sample(sample, target_seq_length, tokens = [cls_id] + tokens # Masking. - # Ensure we always have at least one prediction. - max_predictions_per_seq = max(1.0, masked_lm_prob * len(tokens)) mean_ngrams = mean_span_lengths[denoiser_index] if mean_ngrams < 1: # Ensure we always obtain at least one `max_ngrams`. @@ -202,11 +200,19 @@ def build_training_sample(sample, target_seq_length, if denoiser == 'R' or denoiser == 'X': sampling_style = SamplingStyle.NORMAL prefix_lm = False + max_predictions_per_seq = len(tokens) - 1 elif denoiser == 'S': sampling_style = SamplingStyle.UNIFORM prefix_lm = True + max_predictions_per_seq = min( + round(masked_lm_prob * len(tokens)) * 2 - 1, + len(tokens) - 1, + ) else: raise ValueError('unknown denoiser') + + # Ensure we always have at least one prediction. + max_predictions_per_seq = max(1, max_predictions_per_seq) ( tokens, masked_positions, masked_labels, _, masked_spans, ) = create_masked_lm_predictions( From 795caef6924ddfacd750e8d62421a8fa2ee2b66e Mon Sep 17 00:00:00 2001 From: janEbert Date: Mon, 23 Jan 2023 17:01:51 +0100 Subject: [PATCH 025/122] Calculate and use amount of filtered tokens Filtered means not `cls_id` or `sep_id` tokens. This slightly improves calculated statistics for long sequences and greatly for very short sequences. --- megatron/data/dataset_utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/megatron/data/dataset_utils.py b/megatron/data/dataset_utils.py index c7092f437..273c26034 100644 --- a/megatron/data/dataset_utils.py +++ b/megatron/data/dataset_utils.py @@ -219,6 +219,7 @@ def create_masked_lm_predictions(tokens, # the starting piece of current token, where 1 means true, so that # on-the-fly whole word masking is possible. 
token_boundary = [0] * len(tokens) + num_filtered_tokens = 0 for (i, token) in enumerate(tokens): if token == cls_id or token == sep_id: @@ -237,6 +238,7 @@ def create_masked_lm_predictions(tokens, cand_indexes.append([i]) if is_start_piece(vocab_id_to_token_dict[token]): token_boundary[i] = 1 + num_filtered_tokens += 1 output_tokens = list(tokens) @@ -258,7 +260,7 @@ def create_masked_lm_predictions(tokens, if sampling_style is SamplingStyle.NORMAL: normal_mean = (max_ngrams + 1) / 2 - max_ngrams = len(tokens) - 1 + max_ngrams = num_filtered_tokens - 1 ngrams = np.arange(1, max_ngrams + 1, dtype=np.int64) if sampling_style is SamplingStyle.POISSON: From 689e15f95b9075ce199f184629cb73d05c14b7b1 Mon Sep 17 00:00:00 2001 From: janEbert Date: Mon, 23 Jan 2023 17:20:38 +0100 Subject: [PATCH 026/122] Document normal sampling style --- megatron/data/dataset_utils.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/megatron/data/dataset_utils.py b/megatron/data/dataset_utils.py index 273c26034..8199a5762 100644 --- a/megatron/data/dataset_utils.py +++ b/megatron/data/dataset_utils.py @@ -259,7 +259,13 @@ def create_masked_lm_predictions(tokens, max(1, int(round(len(tokens) * masked_lm_prob)))) if sampling_style is SamplingStyle.NORMAL: + # First, we get the center of our normal distribution from + # `max_ngrams`. Keeping the meaning of `max_ngrams` this way + # plays nicely with the other probability distributions in terms + # of math. normal_mean = (max_ngrams + 1) / 2 + # However, we do not want to bound the maximum number of + # n-grams. max_ngrams = num_filtered_tokens - 1 ngrams = np.arange(1, max_ngrams + 1, dtype=np.int64) From e44d0e493903bdf0649ae28bda52290cb5d04b42 Mon Sep 17 00:00:00 2001 From: janEbert Date: Mon, 23 Jan 2023 17:14:30 +0100 Subject: [PATCH 027/122] Fix PrefixLM possible spans calculation --- megatron/data/dataset_utils.py | 63 ++++++++++++++++++++-------------- 1 file changed, 37 insertions(+), 26 deletions(-) diff --git a/megatron/data/dataset_utils.py b/megatron/data/dataset_utils.py index 8199a5762..289e845ad 100644 --- a/megatron/data/dataset_utils.py +++ b/megatron/data/dataset_utils.py @@ -248,15 +248,6 @@ def create_masked_lm_predictions(tokens, if masked_lm_prob == 0: return (output_tokens, masked_lm_positions, masked_lm_labels, token_boundary) - if prefix_lm: - # Adjust probabilities so that the mean is centered at the - # correct position. - # If we do not do this, the mean is at - # `len(tokens) * masked_lm_prob / 2`. - masked_lm_prob *= 2 - - num_to_predict = min(max_predictions_per_seq, - max(1, int(round(len(tokens) * masked_lm_prob)))) if sampling_style is SamplingStyle.NORMAL: # First, we get the center of our normal distribution from @@ -277,24 +268,44 @@ def create_masked_lm_predictions(tokens, if favor_longer_ngram: pvals = pvals[::-1] - ngram_indexes = [] - for idx in range(len(cand_indexes)): - ngram_index = [] - for n in ngrams: - if prefix_lm: - # Select those index sets for which the final index is - # at the end of the sequence. - last_cand_index_index = min(idx + n - 1, len(cand_indexes) - 1) - if cand_indexes[last_cand_index_index][-1] < len(tokens) - 1: - continue - ngram_index.append(cand_indexes[idx:idx + n]) - if prefix_lm: - # No need to go further – we would only produce - # duplicate entries by continuing for this `idx`. - break - ngram_indexes.append(ngram_index) + if prefix_lm: + # We only do one span searching loop anyway, so this does not + # matter in terms of random search. 
However, we do want to allow + # sequences greater than the mean ratio. + num_to_predict = max_predictions_per_seq + + # Find first index which is greater than the number of + # predictions. + first_gt_index = next( + ( + i + for (i, x) in enumerate(cand_indexes) + if x[0] > num_filtered_tokens - max_predictions_per_seq + ), + len(cand_indexes), + ) + # Then move one index before to get less than or equal to the + # number of predictions, handling not going below 0. + first_le_index = max(1, first_gt_index) - 1 + + tail_cand_indexes = cand_indexes[first_le_index:] + ngram_indexes = [] + for i in range(len(tail_cand_indexes)): + ngram_indexes.append(tail_cand_indexes[i:]) + ngram_indexes = [ngram_indexes] + # No need to shuffle outer list of length 1. + else: + num_to_predict = min(max_predictions_per_seq, + max(1, int(round(len(tokens) * masked_lm_prob)))) - np_rng.shuffle(ngram_indexes) + ngram_indexes = [] + for idx in range(len(cand_indexes)): + ngram_index = [] + for n in ngrams: + ngram_index.append(cand_indexes[idx:idx + n]) + ngram_indexes.append(ngram_index) + + np_rng.shuffle(ngram_indexes) (masked_lms, masked_spans) = ([], []) covered_indexes = set() From 075f05fd086a3df4da36c57c3aacefcbcf83db2f Mon Sep 17 00:00:00 2001 From: janEbert Date: Tue, 24 Jan 2023 10:10:28 +0100 Subject: [PATCH 028/122] Use binary search for PrefixLM first tail index --- megatron/data/dataset_utils.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/megatron/data/dataset_utils.py b/megatron/data/dataset_utils.py index 289e845ad..a711b5b41 100644 --- a/megatron/data/dataset_utils.py +++ b/megatron/data/dataset_utils.py @@ -18,6 +18,7 @@ # https://github.com/google-research/albert/blob/master/create_pretraining_data.py # with some modifications. +import bisect from enum import Enum import math import os @@ -276,13 +277,9 @@ def create_masked_lm_predictions(tokens, # Find first index which is greater than the number of # predictions. - first_gt_index = next( - ( - i - for (i, x) in enumerate(cand_indexes) - if x[0] > num_filtered_tokens - max_predictions_per_seq - ), - len(cand_indexes), + first_gt_index = bisect.bisect_right( + cand_indexes, + [num_filtered_tokens - max_predictions_per_seq] ) # Then move one index before to get less than or equal to the # number of predictions, handling not going below 0. From 6bc7471d8fa352864b738f026fc67dbc0cf0f4d7 Mon Sep 17 00:00:00 2001 From: janEbert Date: Tue, 24 Jan 2023 11:21:39 +0100 Subject: [PATCH 029/122] Calculate n-gram indices lazily Usually we do not iterate through all indices, so we can save quite some time if `max_ngrams` is large. --- megatron/data/dataset_utils.py | 69 ++++++++++++++++++++++------------ 1 file changed, 45 insertions(+), 24 deletions(-) diff --git a/megatron/data/dataset_utils.py b/megatron/data/dataset_utils.py index a711b5b41..d28d3d3eb 100644 --- a/megatron/data/dataset_utils.py +++ b/megatron/data/dataset_utils.py @@ -193,6 +193,36 @@ def is_start_piece(piece): return not piece.startswith("##") +def get_ngram_indices( + idx, + ngrams, + cand_indexes, + num_to_predict, + num_filtered_tokens, + prefix_lm, +): + if prefix_lm: + # Find first index which is greater than the number of + # predictions. + first_gt_index = bisect.bisect_right( + cand_indexes, + [num_filtered_tokens - num_to_predict] + ) + # Then move one index before to get less than or equal to the + # number of predictions, handling not going below 0. 
+ first_le_index = max(1, first_gt_index) - 1 + + tail_cand_indexes = cand_indexes[first_le_index:] + ngram_index = [] + for i in range(len(tail_cand_indexes)): + ngram_index.append(tail_cand_indexes[i:]) + else: + ngram_index = [] + for n in ngrams: + ngram_index.append(cand_indexes[idx:idx + n]) + return ngram_index + + def create_masked_lm_predictions(tokens, vocab_id_list, vocab_id_to_token_dict, masked_lm_prob, @@ -275,34 +305,24 @@ def create_masked_lm_predictions(tokens, # sequences greater than the mean ratio. num_to_predict = max_predictions_per_seq - # Find first index which is greater than the number of - # predictions. - first_gt_index = bisect.bisect_right( - cand_indexes, - [num_filtered_tokens - max_predictions_per_seq] - ) - # Then move one index before to get less than or equal to the - # number of predictions, handling not going below 0. - first_le_index = max(1, first_gt_index) - 1 - - tail_cand_indexes = cand_indexes[first_le_index:] - ngram_indexes = [] - for i in range(len(tail_cand_indexes)): - ngram_indexes.append(tail_cand_indexes[i:]) - ngram_indexes = [ngram_indexes] - # No need to shuffle outer list of length 1. + ngram_index_indexes = np.array([0]) else: num_to_predict = min(max_predictions_per_seq, max(1, int(round(len(tokens) * masked_lm_prob)))) - ngram_indexes = [] - for idx in range(len(cand_indexes)): - ngram_index = [] - for n in ngrams: - ngram_index.append(cand_indexes[idx:idx + n]) - ngram_indexes.append(ngram_index) + ngram_index_indexes = np.arange(len(cand_indexes)) + np_rng.shuffle(ngram_index_indexes) - np_rng.shuffle(ngram_indexes) + def get_ngram_indices_(idx): + return get_ngram_indices( + idx, + ngrams, + cand_indexes, + num_to_predict, + num_filtered_tokens, + prefix_lm, + ) + ngram_indexes = map(get_ngram_indices_, ngram_index_indexes) (masked_lms, masked_spans) = ([], []) covered_indexes = set() @@ -386,7 +406,8 @@ def create_masked_lm_predictions(tokens, label=[tokens[index] for index in index_set])) assert len(masked_lms) <= num_to_predict - np_rng.shuffle(ngram_indexes) + np_rng.shuffle(ngram_index_indexes) + ngram_indexes = map(get_ngram_indices_, ngram_index_indexes) select_indexes = set() if do_permutation: From a105f320599bc132fffab2bcb3e53929f59a6b46 Mon Sep 17 00:00:00 2001 From: janEbert Date: Tue, 24 Jan 2023 11:30:53 +0100 Subject: [PATCH 030/122] Fix code style --- megatron/data/dataset_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/data/dataset_utils.py b/megatron/data/dataset_utils.py index d28d3d3eb..bfaf6ff1a 100644 --- a/megatron/data/dataset_utils.py +++ b/megatron/data/dataset_utils.py @@ -206,7 +206,7 @@ def get_ngram_indices( # predictions. first_gt_index = bisect.bisect_right( cand_indexes, - [num_filtered_tokens - num_to_predict] + [num_filtered_tokens - num_to_predict], ) # Then move one index before to get less than or equal to the # number of predictions, handling not going below 0. 
From f0fe282abcc4d862482a68d738b6597641ee7822 Mon Sep 17 00:00:00 2001 From: janEbert Date: Tue, 24 Jan 2023 11:46:53 +0100 Subject: [PATCH 031/122] Prefer list comprehensions --- megatron/data/dataset_utils.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/megatron/data/dataset_utils.py b/megatron/data/dataset_utils.py index bfaf6ff1a..3213ebb2f 100644 --- a/megatron/data/dataset_utils.py +++ b/megatron/data/dataset_utils.py @@ -213,13 +213,12 @@ def get_ngram_indices( first_le_index = max(1, first_gt_index) - 1 tail_cand_indexes = cand_indexes[first_le_index:] - ngram_index = [] - for i in range(len(tail_cand_indexes)): - ngram_index.append(tail_cand_indexes[i:]) + ngram_index = [ + tail_cand_indexes[i:] + for i in range(len(tail_cand_indexes)) + ] else: - ngram_index = [] - for n in ngrams: - ngram_index.append(cand_indexes[idx:idx + n]) + ngram_index = [cand_indexes[idx:idx + n] for n in ngrams] return ngram_index From 11bd6db592f1723f131db9152aa2d59fcbdb86aa Mon Sep 17 00:00:00 2001 From: janEbert Date: Tue, 14 Feb 2023 11:43:41 +0100 Subject: [PATCH 032/122] Allow recognizing when UL2 is used Via an extra "private" argument. --- pretrain_ul2.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pretrain_ul2.py b/pretrain_ul2.py index 8a6d4b136..1dc425486 100644 --- a/pretrain_ul2.py +++ b/pretrain_ul2.py @@ -15,6 +15,7 @@ """Pretrain UL2""" +import argparse from functools import partial import torch @@ -180,7 +181,13 @@ def train_valid_test_datasets_provider(train_val_test_num_samples): return train_ds, valid_ds, test_ds +def extra_args_provider(parser): + parser.add_argument('--_is_ul2', default=True, help=argparse.SUPPRESS) + return parser + + if __name__ == "__main__": pretrain(train_valid_test_datasets_provider, model_provider, forward_step, + extra_args_provider=extra_args_provider, args_defaults={'tokenizer_type': 'BertWordPieceLowerCase'}) From 43eee931ffced2d1be88c7ebe3f71be770113cc5 Mon Sep 17 00:00:00 2001 From: janEbert Date: Tue, 14 Feb 2023 12:06:37 +0100 Subject: [PATCH 033/122] Support UL2 tokens for all tokenizers The GPT tokenizer does not handle the difference between UL2 tokens and other special tokens well. This should be fine as UL2 tokens being distinct from other special tokens is never assumed at the moment (although other tokenizers implement it like that). In general, `additional_special_token_ids` is new for the GPT tokenizer, so there is no backward compatibility trouble. --- megatron/tokenizer/tokenizer.py | 145 ++++++++++++++++++++++++++++---- 1 file changed, 130 insertions(+), 15 deletions(-) diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py index c0356a12c..1537e5697 100644 --- a/megatron/tokenizer/tokenizer.py +++ b/megatron/tokenizer/tokenizer.py @@ -30,17 +30,37 @@ def build_tokenizer(args): # Select and instantiate the tokenizer. 
assert args.vocab_file is not None or args.tokenizer_type == "PretrainedFromHF" + + if hasattr(args, '_is_ul2') and args._is_ul2: + ul2_denoiser_tokens = [ + args.ul2_r_denoiser_token, + args.ul2_s_denoiser_token, + args.ul2_x_denoiser_token, + ] + else: + ul2_denoiser_tokens = [] + if args.tokenizer_type == 'BertWordPieceLowerCase': - tokenizer = _BertWordPieceTokenizer(vocab_file=args.vocab_file, - lower_case=True, - vocab_extra_ids=args.vocab_extra_ids) + tokenizer = _BertWordPieceTokenizer( + vocab_file=args.vocab_file, + lower_case=True, + vocab_extra_ids=args.vocab_extra_ids, + ul2_denoiser_tokens=ul2_denoiser_tokens, + ) elif args.tokenizer_type == 'BertWordPieceCase': - tokenizer = _BertWordPieceTokenizer(vocab_file=args.vocab_file, - lower_case=False, - vocab_extra_ids=args.vocab_extra_ids) + tokenizer = _BertWordPieceTokenizer( + vocab_file=args.vocab_file, + lower_case=False, + vocab_extra_ids=args.vocab_extra_ids, + ul2_denoiser_tokens=ul2_denoiser_tokens, + ) elif args.tokenizer_type == 'GPT2BPETokenizer': assert args.merge_file is not None - tokenizer = _GPT2BPETokenizer(args.vocab_file, args.merge_file) + tokenizer = _GPT2BPETokenizer( + args.vocab_file, + args.merge_file, + ul2_denoiser_tokens=ul2_denoiser_tokens, + ) elif args.tokenizer_type == "PretrainedFromHF": assert args.tokenizer_name_or_path is not None @@ -55,7 +75,11 @@ def build_tokenizer(args): if args.rank == 0: print(" vocab file is un-used. loading tokenizer from pre-trained model") - tokenizer = _AutoTokenizer(args.tokenizer_name_or_path, vocab_extra_ids=args.vocab_extra_ids) + tokenizer = _AutoTokenizer( + args.tokenizer_name_or_path, + vocab_extra_ids=args.vocab_extra_ids, + ul2_denoiser_tokens=ul2_denoiser_tokens, + ) else: raise NotImplementedError('{} tokenizer is not ' 'implemented.'.format(args.tokenizer_type)) @@ -155,7 +179,13 @@ def mask(self): class _BertWordPieceTokenizer(AbstractTokenizer): """Original BERT wordpiece tokenizer.""" - def __init__(self, vocab_file, lower_case=True, vocab_extra_ids=0): + def __init__( + self, + vocab_file, + lower_case=True, + vocab_extra_ids=0, + ul2_denoiser_tokens=None, + ): if lower_case: name = 'BERT Lower Case' else: @@ -184,6 +214,13 @@ def __init__(self, vocab_file, lower_case=True, vocab_extra_ids=0): additional_special_tokens = [] additional_special_tokens.extend( ["".format(i) for i in range(vocab_extra_ids)]) + + if ul2_denoiser_tokens is None: + ul2_denoiser_tokens = [] + self._ul2_tokens = ul2_denoiser_tokens + for value in self._ul2_tokens: + self.add_token(value) + self.add_additional_special_tokens(additional_special_tokens) def add_token(self, token): @@ -282,16 +319,35 @@ def additional_special_tokens_ids(self): def additional_special_tokens(self, value): self._additional_special_tokens = value + @property + def ul2_token_ids(self): + return [self.vocab[k] for k in self._ul2_tokens] + class _GPT2BPETokenizer(AbstractTokenizer): """Original GPT2 BPE tokenizer.""" - def __init__(self, vocab_file, merge_file): + def __init__(self, vocab_file, merge_file, ul2_denoiser_tokens=None): name = 'GPT2 BPE' super().__init__(name) + if ul2_denoiser_tokens is None: + ul2_denoiser_tokens = [] + self._ul2_tokens = ul2_denoiser_tokens + + # Warning! `additional_special_token_ids` will also return the UL2 + # tokens here. 
+ special_tokens = self._ul2_tokens.copy() + if self._ul2_tokens: + special_tokens.append('') + self.tokenizer = GPT2Tokenizer(vocab_file, merge_file, errors='replace', - special_tokens=[], max_len=None) + special_tokens=special_tokens, + max_len=None) + if self._ul2_tokens: + self.sep_id = self.tokenizer.encoder[''] + else: + self.sep_id = None self.eod_id = self.tokenizer.encoder['<|endoftext|>'] @property @@ -312,22 +368,77 @@ def tokenize(self, text): def detokenize(self, token_ids): return self.tokenizer.decode(token_ids) + @property + def sep(self): + if self.sep_id is None: + raise AttributeError( + 'GPT tokenizer does not have a SEP token by default; ' + 'please add it to the `special_tokens`') + return self.sep_id + @property def eod(self): return self.eod_id + @property + def additional_special_tokens_ids(self): + # Warning! This will also return the UL2 tokens. + return [self.vocab[k] for k in self.tokenizer.special_tokens] + + @property + def ul2_tokens_ids(self): + return [self.vocab[k] for k in self._ul2_tokens] + class _AutoTokenizer(AbstractTokenizer): """AutoTokenizer for Hf Pretrained model loading.""" - def __init__(self, tokenizer_name_or_path, vocab_extra_ids): + def __init__( + self, + tokenizer_name_or_path, + vocab_extra_ids, + ul2_denoiser_tokens=None, + ): name = tokenizer_name_or_path super().__init__(name) hf_tokenizer_kwargs = {} + if vocab_extra_ids > 0: # TODO @thomasw21 we might need to concatenate to a pre-existing list? - hf_tokenizer_kwargs["additional_special_tokens"] = [f"" for _id in range(vocab_extra_ids)] - self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path, **hf_tokenizer_kwargs) + self._extra_id_tokens = [ + f"" for _id in range(vocab_extra_ids)] + hf_tokenizer_kwargs["additional_special_tokens"] = \ + self._extra_id_tokens + else: + self._extra_id_tokens = [] + + if ul2_denoiser_tokens is None: + ul2_denoiser_tokens = [] + self._ul2_tokens = ul2_denoiser_tokens + + if self._ul2_tokens: + additional_tokens = hf_tokenizer_kwargs.setdefault( + 'additional_special_tokens', []) + additional_tokens.extend(self._ul2_tokens) + + try: + self.tokenizer = AutoTokenizer.from_pretrained( + tokenizer_name_or_path, **hf_tokenizer_kwargs) + except ValueError as e: + # Try to catch the exception raised when we have to pass + # `extra_ids` explicitly because its default does not match. + if not ( + str(e).startswith('Both extra_ids ') + and str(e).endswith( + 'the additional_special_tokens must include the ' + 'extra_ids tokens' + ) + ): + raise e + + hf_tokenizer_kwargs['extra_ids'] = len(self._extra_id_tokens) + self.tokenizer = AutoTokenizer.from_pretrained( + tokenizer_name_or_path, **hf_tokenizer_kwargs) self.encoder = self.tokenizer.get_vocab() self.decoder = {v: k for k, v in self.encoder.items()} @@ -403,7 +514,11 @@ def eos_token_id(self): @property def additional_special_tokens_ids(self): """ All the additional special tokens you may want to use (list of strings).""" - return self.tokenizer.additional_special_tokens_ids + return [self.vocab[k] for k in self._extra_id_tokens] + + @property + def ul2_token_ids(self): + return [self.vocab[k] for k in self._ul2_tokens] @staticmethod def _check_token_candidate(candidate): From 6686f0425d9b2c29df133dad6552878dd3e15766 Mon Sep 17 00:00:00 2001 From: janEbert Date: Tue, 14 Feb 2023 12:51:21 +0100 Subject: [PATCH 034/122] Support `` tokens for GPT tokenizer With this, we also adjust the `additional_special_token_ids` to only return extra ID tokens. 
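The split this patch establishes between the sentinel ("extra ID") tokens and the UL2 denoiser tokens can be sketched with plain dictionaries. The extra-ID string format and the id values below are assumptions made for illustration; the `[R]`, `[S]` and `[X]` defaults come from the `--ul2-*-denoiser-token` arguments, and the real ids are assigned by Megatron's GPT2Tokenizer from its special-token table.

# Sketch of the intended token split; ids are invented for illustration.
vocab_extra_ids = 3
extra_id_tokens = [f'<extra_id_{i}>' for i in range(vocab_extra_ids)]  # assumed format
ul2_tokens = ['[R]', '[S]', '[X]']  # default denoiser tokens

special_tokens = {
    token: 50257 + i
    for (i, token) in enumerate(extra_id_tokens + ul2_tokens)
}

# `additional_special_tokens_ids` now returns the sentinel ids only ...
additional_special_tokens_ids = [special_tokens[t] for t in extra_id_tokens]
# ... while the denoiser ids are exposed separately via `ul2_tokens_ids`.
ul2_tokens_ids = [special_tokens[t] for t in ul2_tokens]

print(additional_special_tokens_ids)  # [50257, 50258, 50259]
print(ul2_tokens_ids)                 # [50260, 50261, 50262]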
--- megatron/tokenizer/tokenizer.py | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py index 1537e5697..746cfbf41 100644 --- a/megatron/tokenizer/tokenizer.py +++ b/megatron/tokenizer/tokenizer.py @@ -59,6 +59,7 @@ def build_tokenizer(args): tokenizer = _GPT2BPETokenizer( args.vocab_file, args.merge_file, + vocab_extra_ids=args.vocab_extra_ids, ul2_denoiser_tokens=ul2_denoiser_tokens, ) elif args.tokenizer_type == "PretrainedFromHF": @@ -327,18 +328,26 @@ def ul2_token_ids(self): class _GPT2BPETokenizer(AbstractTokenizer): """Original GPT2 BPE tokenizer.""" - def __init__(self, vocab_file, merge_file, ul2_denoiser_tokens=None): + def __init__( + self, + vocab_file, + merge_file, + vocab_extra_ids=0, + ul2_denoiser_tokens=None, + ): name = 'GPT2 BPE' super().__init__(name) + self._extra_id_tokens = [ + f"" for i in range(vocab_extra_ids)] + if ul2_denoiser_tokens is None: ul2_denoiser_tokens = [] self._ul2_tokens = ul2_denoiser_tokens - # Warning! `additional_special_token_ids` will also return the UL2 - # tokens here. - special_tokens = self._ul2_tokens.copy() + special_tokens = self._extra_id_tokens.copy() if self._ul2_tokens: + special_tokens.extend(self._ul2_tokens) special_tokens.append('') self.tokenizer = GPT2Tokenizer(vocab_file, merge_file, errors='replace', @@ -382,8 +391,7 @@ def eod(self): @property def additional_special_tokens_ids(self): - # Warning! This will also return the UL2 tokens. - return [self.vocab[k] for k in self.tokenizer.special_tokens] + return [self.vocab[k] for k in self._extra_id_tokens] @property def ul2_tokens_ids(self): From f6128c63b63deb1138e01b9e61d2c573182372c0 Mon Sep 17 00:00:00 2001 From: janEbert Date: Tue, 14 Feb 2023 13:08:49 +0100 Subject: [PATCH 035/122] Fix tokenizer vocab access --- megatron/tokenizer/tokenizer.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py index 746cfbf41..3d4e92b60 100644 --- a/megatron/tokenizer/tokenizer.py +++ b/megatron/tokenizer/tokenizer.py @@ -354,7 +354,7 @@ def __init__( special_tokens=special_tokens, max_len=None) if self._ul2_tokens: - self.sep_id = self.tokenizer.encoder[''] + self.sep_id = self.tokenizer.special_tokens[''] else: self.sep_id = None self.eod_id = self.tokenizer.encoder['<|endoftext|>'] @@ -391,11 +391,12 @@ def eod(self): @property def additional_special_tokens_ids(self): - return [self.vocab[k] for k in self._extra_id_tokens] + return [ + self.tokenizer.special_tokens[k] for k in self._extra_id_tokens] @property def ul2_tokens_ids(self): - return [self.vocab[k] for k in self._ul2_tokens] + return [self.tokenizer.special_tokens[k] for k in self._ul2_tokens] class _AutoTokenizer(AbstractTokenizer): From 8f48763fc43fb872f41de78de1feccec765a5acc Mon Sep 17 00:00:00 2001 From: janEbert Date: Tue, 14 Feb 2023 13:19:24 +0100 Subject: [PATCH 036/122] Revert inheriting from `T5Dataset` Personally, this makes the model more holistic and we never inherited correctly anyway, changing the public API. Finally, this allows usage of tokenizers without `cls_id`, which was previously redundantly queried due to the mentioned incorrect inheritance. Finally, the inheritance never saved much repetition to begin with. 
--- megatron/data/ul2_dataset.py | 44 +++++++++++++++++++++++++++--------- 1 file changed, 33 insertions(+), 11 deletions(-) diff --git a/megatron/data/ul2_dataset.py b/megatron/data/ul2_dataset.py index 024b83d1d..364069b75 100644 --- a/megatron/data/ul2_dataset.py +++ b/megatron/data/ul2_dataset.py @@ -20,6 +20,7 @@ from megatron import get_tokenizer from megatron.data.dataset_utils import ( create_masked_lm_predictions, + get_samples_mapping, SamplingStyle ) from megatron.data.t5_dataset import ( @@ -27,7 +28,6 @@ make_history_mask, merge_subsequent_masks, pad_and_convert_to_numpy, - T5Dataset, ) from megatron.enums import UL2ModelType @@ -44,8 +44,7 @@ def is_prefix_lm(ul2_model_type): return ul2_model_type is UL2ModelType.NON_CAUSAL_DECODER -class UL2Dataset(T5Dataset): - +class UL2Dataset: def __init__(self, name, indexed_dataset, data_prefix, num_epochs, max_num_samples, model_type, denoiser_ratios, denoisers, mean_span_lengths, @@ -64,12 +63,12 @@ def __init__(self, name, indexed_dataset, data_prefix, 'denoising objectives' ) - super().__init__(name, indexed_dataset, data_prefix, - num_epochs, max_num_samples, None, - max_seq_length, max_seq_length_dec, - short_seq_prob, seed) - # Params to store. + self.name = name + self.seed = seed + self.max_seq_length = max_seq_length + self.max_seq_length_dec = max_seq_length_dec + self.model_type = model_type self.denoiser_ratios = [ denoiser_ratio / sum(denoiser_ratios) @@ -79,10 +78,26 @@ def __init__(self, name, indexed_dataset, data_prefix, self.mean_span_lengths = mean_span_lengths self.mask_ratios = mask_ratios + # Dataset. + self.indexed_dataset = indexed_dataset + + # Build the samples mapping. + self.samples_mapping = get_samples_mapping( + self.indexed_dataset, + data_prefix, + num_epochs, + max_num_samples, + self.max_seq_length - 2, # account for added tokens + short_seq_prob, + self.seed, + self.name, + False, + ) + # Vocab stuff. tokenizer = get_tokenizer() - # Remove CLS token because we don't need it. - del self.cls_id + self.vocab_id_list = list(tokenizer.inv_vocab.keys()) + self.vocab_id_to_token_dict = tokenizer.inv_vocab self.cls_ids = { denoiser: tokenizer.vocab[token] for (denoiser, token) in denoiser_tokens.items() @@ -90,6 +105,11 @@ def __init__(self, name, indexed_dataset, data_prefix, # cls_token = self.vocab_id_to_token_dict[tokenizer.cls] # if cls_token not in self.cls_ids: # self.cls_ids[cls_token] = tokenizer.cls + self.sep_id = tokenizer.sep + self.mask_id = tokenizer.mask + self.pad_id = tokenizer.pad + self.bos_id = tokenizer.bos_token_id + self.eos_id = tokenizer.eos_token_id # Filter out denoiser tokens. 
self.sentinel_tokens = [ @@ -100,8 +120,10 @@ def __init__(self, name, indexed_dataset, data_prefix, assert len(self.sentinel_tokens) > 0, \ "Provide the argument --vocab-extra-ids 100 to the script" - def __getitem__(self, idx): + def __len__(self): + return self.samples_mapping.shape[0] + def __getitem__(self, idx): start_index, end_index, seq_length = self.samples_mapping[idx] sample = [] for index in range(start_index, end_index): From 7f99a12047c66ea89f7905a41e228e1688d38df0 Mon Sep 17 00:00:00 2001 From: janEbert Date: Tue, 14 Feb 2023 14:52:50 +0100 Subject: [PATCH 037/122] Fix GPT tokenizer special token handling --- megatron/data/ul2_dataset.py | 23 +++++++++++++-- megatron/tokenizer/tokenizer.py | 50 +++++++++++++++++++++++++++++++-- 2 files changed, 68 insertions(+), 5 deletions(-) diff --git a/megatron/data/ul2_dataset.py b/megatron/data/ul2_dataset.py index 364069b75..8968a8547 100644 --- a/megatron/data/ul2_dataset.py +++ b/megatron/data/ul2_dataset.py @@ -15,6 +15,8 @@ """UL2-style dataset.""" +from collections import ChainMap + import numpy as np from megatron import get_tokenizer @@ -96,10 +98,25 @@ def __init__(self, name, indexed_dataset, data_prefix, # Vocab stuff. tokenizer = get_tokenizer() - self.vocab_id_list = list(tokenizer.inv_vocab.keys()) - self.vocab_id_to_token_dict = tokenizer.inv_vocab + # Some tokenizers split their vocabularies. Here we handle both + # cases. + if ( + hasattr(tokenizer, 'tokenizer') + and hasattr(tokenizer.tokenizer, 'special_tokens_decoder') + ): + inv_vocab = ChainMap( + tokenizer.inv_vocab, + tokenizer.tokenizer.special_tokens_decoder, + ) + vocab = ChainMap( + tokenizer.vocab, tokenizer.tokenizer.special_tokens) + else: + inv_vocab = tokenizer.inv_vocab + vocab = tokenizer.vocab + self.vocab_id_list = list(inv_vocab.keys()) + self.vocab_id_to_token_dict = inv_vocab self.cls_ids = { - denoiser: tokenizer.vocab[token] + denoiser: vocab[token] for (denoiser, token) in denoiser_tokens.items() } # cls_token = self.vocab_id_to_token_dict[tokenizer.cls] diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py index 3d4e92b60..6da66ea56 100644 --- a/megatron/tokenizer/tokenizer.py +++ b/megatron/tokenizer/tokenizer.py @@ -348,15 +348,29 @@ def __init__( special_tokens = self._extra_id_tokens.copy() if self._ul2_tokens: special_tokens.extend(self._ul2_tokens) - special_tokens.append('') - + extra_ul2_tokens = [ + '', + '', + '', + '', + '', + ] + special_tokens.extend(extra_ul2_tokens) self.tokenizer = GPT2Tokenizer(vocab_file, merge_file, errors='replace', special_tokens=special_tokens, max_len=None) if self._ul2_tokens: self.sep_id = self.tokenizer.special_tokens[''] + self.mask_id = self.tokenizer.special_tokens[''] + self.pad_id = self.tokenizer.special_tokens[''] + self._bos_token_id = self.tokenizer.special_tokens[''] + self._eos_token_id = self.tokenizer.special_tokens[''] else: self.sep_id = None + self.mask_id = None + self.pad_id = None + self._bos_token_id = None + self._eos_token_id = None self.eod_id = self.tokenizer.encoder['<|endoftext|>'] @property @@ -385,6 +399,38 @@ def sep(self): 'please add it to the `special_tokens`') return self.sep_id + @property + def mask(self): + if self.mask_id is None: + raise AttributeError( + 'GPT tokenizer does not have a MASK token by default; ' + 'please add it to the `special_tokens`') + return self.mask_id + + @property + def pad(self): + if self.pad_id is None: + raise AttributeError( + 'GPT tokenizer does not have a PAD token by default; ' + 'please add it to the 
`special_tokens`') + return self.pad_id + + @property + def bos_token_id(self): + if self._bos_token_id is None: + raise AttributeError( + 'GPT tokenizer does not have a BOS token by default; ' + 'please add it to the `special_tokens`') + return self._bos_token_id + + @property + def eos_token_id(self): + if self._eos_token_id is None: + raise AttributeError( + 'GPT tokenizer does not have a EOS token by default; ' + 'please add it to the `special_tokens`') + return self._eos_token_id + @property def eod(self): return self.eod_id From 535a306963eb70cfa91fd81b22abd673f8ae739d Mon Sep 17 00:00:00 2001 From: janEbert Date: Tue, 14 Feb 2023 14:53:29 +0100 Subject: [PATCH 038/122] Do inherit from `torch.utils.data.Dataset` Removing all inheritance from the class was a bit too eager. --- megatron/data/ul2_dataset.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/megatron/data/ul2_dataset.py b/megatron/data/ul2_dataset.py index 8968a8547..2df2b332e 100644 --- a/megatron/data/ul2_dataset.py +++ b/megatron/data/ul2_dataset.py @@ -18,6 +18,7 @@ from collections import ChainMap import numpy as np +import torch from megatron import get_tokenizer from megatron.data.dataset_utils import ( @@ -46,12 +47,13 @@ def is_prefix_lm(ul2_model_type): return ul2_model_type is UL2ModelType.NON_CAUSAL_DECODER -class UL2Dataset: +class UL2Dataset(torch.utils.data.Dataset): def __init__(self, name, indexed_dataset, data_prefix, num_epochs, max_num_samples, model_type, denoiser_ratios, denoisers, mean_span_lengths, mask_ratios, denoiser_tokens, max_seq_length, max_seq_length_dec, short_seq_prob, seed): + super().__init__() if denoiser_ratios is None: # Uniform distribution by default. From db623b355e4c10e4953ed4f7fd359d26157d9443 Mon Sep 17 00:00:00 2001 From: janEbert Date: Tue, 14 Feb 2023 14:59:56 +0100 Subject: [PATCH 039/122] Add whitespace For readability. --- megatron/tokenizer/tokenizer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py index 6da66ea56..bd4b66c8a 100644 --- a/megatron/tokenizer/tokenizer.py +++ b/megatron/tokenizer/tokenizer.py @@ -356,6 +356,7 @@ def __init__( '', ] special_tokens.extend(extra_ul2_tokens) + self.tokenizer = GPT2Tokenizer(vocab_file, merge_file, errors='replace', special_tokens=special_tokens, max_len=None) From ef72280fab128d2c1c26f6007abf19ca6f11abcc Mon Sep 17 00:00:00 2001 From: janEbert Date: Tue, 14 Feb 2023 17:26:17 +0100 Subject: [PATCH 040/122] Allow selectively disabling denoiser token Could make sense in the future to even allow different tokens for same denoising objectives. (E.g. one R-denoiser has token `[R]`, other R-denoiser has `[R+]`.) --- megatron/arguments.py | 3 +++ megatron/data/ul2_dataset.py | 13 +++++++++---- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/megatron/arguments.py b/megatron/arguments.py index 9aae25cda..df81da27e 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -1063,12 +1063,15 @@ def _add_ul2_args(parser): help='Ratio of masked token in the full sequence.') group.add_argument('--ul2-r-denoiser-token', type=str, default='[R]', help='What token to prepend for the UL2 R-denoising ' + 'objective. If empty, do not prepend a token for this ' 'objective.') group.add_argument('--ul2-s-denoiser-token', type=str, default='[S]', help='What token to prepend for the UL2 S-denoising ' + 'objective. 
If empty, do not prepend a token for this ' 'objective.') group.add_argument('--ul2-x-denoiser-token', type=str, default='[X]', help='What token to prepend for the UL2 X-denoising ' + 'objective. If empty, do not prepend a token for this ' 'objective.') return parser diff --git a/megatron/data/ul2_dataset.py b/megatron/data/ul2_dataset.py index 2df2b332e..24a5e01d0 100644 --- a/megatron/data/ul2_dataset.py +++ b/megatron/data/ul2_dataset.py @@ -117,8 +117,10 @@ def __init__(self, name, indexed_dataset, data_prefix, vocab = tokenizer.vocab self.vocab_id_list = list(inv_vocab.keys()) self.vocab_id_to_token_dict = inv_vocab + # Replace empty string tokens with `None` – we want to ignore + # those. self.cls_ids = { - denoiser: vocab[token] + denoiser: vocab[token] if token else None for (denoiser, token) in denoiser_tokens.items() } # cls_token = self.vocab_id_to_token_dict[tokenizer.cls] @@ -226,10 +228,13 @@ def build_training_sample(sample, target_seq_length, tokens = tokens[:max_num_tokens] # Prepend objective token. - cls_id = cls_ids.get(denoiser) - if cls_id is None: + cls_id = cls_ids.get(denoiser, False) + if cls_id is False: raise ValueError('unknown denoiser') - tokens = [cls_id] + tokens + + # If objective token is `None`, ignore it. + if cls_id is not None: + tokens = [cls_id] + tokens # Masking. mean_ngrams = mean_span_lengths[denoiser_index] From 001b50cdae4c3fa7adf5c706ff35df2e2ea95c71 Mon Sep 17 00:00:00 2001 From: janEbert Date: Tue, 14 Feb 2023 17:28:09 +0100 Subject: [PATCH 041/122] Allow not replacing masks with sentinel tokens Backward-compatible since passing `sentinel_tokens=None` would have resulted in an error previously. --- megatron/data/t5_dataset.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/megatron/data/t5_dataset.py b/megatron/data/t5_dataset.py index b3c141db0..e4084f420 100644 --- a/megatron/data/t5_dataset.py +++ b/megatron/data/t5_dataset.py @@ -185,13 +185,16 @@ def merge_subsequent_masks(tokens, masked_spans=None, bos_id=None, if prefix_lm: assert len(masked_spans) <= 1, \ 'Received more than one masked span for PrefixLM masking' - else: + elif sentinel_tokens is not None: sentinel_tokens = collections.deque(sentinel_tokens) + + insert_mask_tokens = not prefix_lm and sentinel_tokens is not None + t5_input = [] (t5_decoder_in, t5_decoder_out) = ([bos_id], []) (start_index, end_index) = (0, None) for span in masked_spans: - if not prefix_lm: + if insert_mask_tokens: flag = sentinel_tokens.popleft() # Append the same tokens in decoder input and output @@ -202,7 +205,7 @@ def merge_subsequent_masks(tokens, masked_spans=None, bos_id=None, end_index = span.index[0] t5_input.extend(tokens[start_index: end_index]) - if not prefix_lm: + if insert_mask_tokens: t5_input.append(flag) # the next start index is the token after the last span token From 23c052f5713581726af48f8f5d093c895b98c2d6 Mon Sep 17 00:00:00 2001 From: janEbert Date: Tue, 14 Feb 2023 17:34:27 +0100 Subject: [PATCH 042/122] Support not adding mask tokens in span corruption Backward-incompatible change as we put this before an existing positional argument. 
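On the input side, the effect of disabling mask tokens is that a corrupted span is simply cut out of the sequence instead of being replaced by a sentinel. A self-contained sketch of that difference follows; the token ids and sentinel ids are invented, and the real `merge_subsequent_masks` additionally builds the matching decoder input and output sequences.

import collections

# Minimal sketch: build the corrupted model input with and without
# sentinel tokens. Spans mirror the namedtuple objects produced by
# `create_masked_lm_predictions` (an `index` list and a `label` list).
MaskedSpan = collections.namedtuple('MaskedSpan', ['index', 'label'])

def build_input(tokens, masked_spans, sentinel_tokens):
    insert_mask_tokens = sentinel_tokens is not None
    model_input = []
    start_index = 0
    for span in masked_spans:
        # Visible part before the masked span.
        model_input.extend(tokens[start_index:span.index[0]])
        if insert_mask_tokens:
            model_input.append(sentinel_tokens.popleft())
        # The next start index is the token after the last span token.
        start_index = span.index[-1] + 1
    model_input.extend(tokens[start_index:])
    return model_input

tokens = [10, 11, 12, 13, 14, 15]
spans = [MaskedSpan(index=[2, 3], label=[12, 13])]

print(build_input(tokens, spans, collections.deque([900, 901])))  # [10, 11, 900, 14, 15]
print(build_input(tokens, spans, None))                           # [10, 11, 14, 15]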
--- megatron/arguments.py | 4 ++++ megatron/data/dataset_utils.py | 3 +++ megatron/data/t5_dataset.py | 9 ++++++--- megatron/data/ul2_dataset.py | 24 ++++++++++++++---------- 4 files changed, 27 insertions(+), 13 deletions(-) diff --git a/megatron/arguments.py b/megatron/arguments.py index df81da27e..18d76e0fc 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -913,6 +913,10 @@ def __call__(self, parser, args, values, option_string=None): help='Probability of replacing a token with mask.') group.add_argument('--short-seq-prob', type=float, default=0.1, help='Probability of producing a short sequence.') + group.add_argument('--no-add-mask-tokens', action='store_false', + help='Whether not to add sentinel tokens for masked ' + 'spans in span corruption tasks.', + dest='add_mask_tokens') group.add_argument('--mmap-warmup', action='store_true', help='Warm up mmap files.') group.add_argument('--num-workers', type=int, default=2, diff --git a/megatron/data/dataset_utils.py b/megatron/data/dataset_utils.py index 3213ebb2f..381eb8cb7 100644 --- a/megatron/data/dataset_utils.py +++ b/megatron/data/dataset_utils.py @@ -636,11 +636,13 @@ def build_dataset(index, name): **kwargs ) elif dataset_type == DSET_TYPE_T5: + args = get_args() dataset = T5Dataset( indexed_dataset=indexed_dataset, masked_lm_prob=masked_lm_prob, max_seq_length_dec=max_seq_length_dec, short_seq_prob=short_seq_prob, + add_mask_tokens=args.add_mask_tokens, **kwargs ) elif dataset_type == DSET_TYPE_UL2: @@ -652,6 +654,7 @@ def build_dataset(index, name): denoisers=args.ul2_denoisers, mean_span_lengths=args.ul2_mean_span_lengths, mask_ratios=args.ul2_mask_ratios, + add_mask_tokens=args.add_mask_tokens, denoiser_tokens={ 'R': args.ul2_r_denoiser_token, 'S': args.ul2_s_denoiser_token, diff --git a/megatron/data/t5_dataset.py b/megatron/data/t5_dataset.py index e4084f420..6482d7731 100644 --- a/megatron/data/t5_dataset.py +++ b/megatron/data/t5_dataset.py @@ -52,7 +52,7 @@ class T5Dataset(torch.utils.data.Dataset): def __init__(self, name, indexed_dataset, data_prefix, num_epochs, max_num_samples, masked_lm_prob, max_seq_length, max_seq_length_dec, - short_seq_prob, seed): + short_seq_prob, add_mask_tokens, seed): # Params to store. 
self.name = name @@ -85,8 +85,11 @@ def __init__(self, name, indexed_dataset, data_prefix, self.pad_id = tokenizer.pad self.bos_id = tokenizer.bos_token_id self.eos_id = tokenizer.eos_token_id - self.sentinel_tokens = tokenizer.additional_special_tokens_ids - assert len(self.sentinel_tokens) > 0, "Provide the argument --vocab-extra-ids 100 to the script" + if add_mask_tokens: + self.sentinel_tokens = tokenizer.additional_special_tokens_ids + assert len(self.sentinel_tokens) > 0, "Provide the argument --vocab-extra-ids 100 to the script" + else: + self.sentinel_tokens = None def __len__(self): return self.samples_mapping.shape[0] diff --git a/megatron/data/ul2_dataset.py b/megatron/data/ul2_dataset.py index 24a5e01d0..66a5c8f66 100644 --- a/megatron/data/ul2_dataset.py +++ b/megatron/data/ul2_dataset.py @@ -51,8 +51,9 @@ class UL2Dataset(torch.utils.data.Dataset): def __init__(self, name, indexed_dataset, data_prefix, num_epochs, max_num_samples, model_type, denoiser_ratios, denoisers, mean_span_lengths, - mask_ratios, denoiser_tokens, max_seq_length, - max_seq_length_dec, short_seq_prob, seed): + mask_ratios, add_mask_tokens, denoiser_tokens, + max_seq_length, max_seq_length_dec, short_seq_prob, + seed): super().__init__() if denoiser_ratios is None: @@ -132,14 +133,17 @@ def __init__(self, name, indexed_dataset, data_prefix, self.bos_id = tokenizer.bos_token_id self.eos_id = tokenizer.eos_token_id - # Filter out denoiser tokens. - self.sentinel_tokens = [ - token - for token in tokenizer.additional_special_tokens_ids - if token not in self.cls_ids.values() - ] - assert len(self.sentinel_tokens) > 0, \ - "Provide the argument --vocab-extra-ids 100 to the script" + if add_mask_tokens: + # Filter out denoiser tokens. + self.sentinel_tokens = [ + token + for token in tokenizer.additional_special_tokens_ids + if token not in self.cls_ids.values() + ] + assert len(self.sentinel_tokens) > 0, \ + "Provide the argument --vocab-extra-ids 100 to the script" + else: + self.sentinel_tokens = None def __len__(self): return self.samples_mapping.shape[0] From 0f4fd3ff70891d9fcbfe0af8c33bcaafd87fd40c Mon Sep 17 00:00:00 2001 From: janEbert Date: Wed, 15 Feb 2023 09:02:56 +0100 Subject: [PATCH 043/122] Fix expected number of added tokens Was wrong for decoder-only case. --- megatron/data/ul2_dataset.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/megatron/data/ul2_dataset.py b/megatron/data/ul2_dataset.py index 66a5c8f66..844df1b30 100644 --- a/megatron/data/ul2_dataset.py +++ b/megatron/data/ul2_dataset.py @@ -87,12 +87,17 @@ def __init__(self, name, indexed_dataset, data_prefix, self.indexed_dataset = indexed_dataset # Build the samples mapping. + # Minimum number of tokens added: BOS and EOS. + min_added_tokens = 2 + if is_decoder_only(model_type): + # Here we also add a SEP token. + min_added_tokens += 1 self.samples_mapping = get_samples_mapping( self.indexed_dataset, data_prefix, num_epochs, max_num_samples, - self.max_seq_length - 2, # account for added tokens + self.max_seq_length - min_added_tokens, # account for added tokens short_seq_prob, self.seed, self.name, From da1f4e9040adb5c3ede32e8b4a758db7347ef415 Mon Sep 17 00:00:00 2001 From: janEbert Date: Thu, 16 Feb 2023 09:59:00 +0100 Subject: [PATCH 044/122] Fix non-masked data MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously the model didn't know _where_ the data was actually inserted. Now it repeats the input sequence and inserts the masked data in the correct place. 
See example in Fig. 1 of AlexaTM 20B paper (arXiv/2208.01448). --- megatron/data/t5_dataset.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/megatron/data/t5_dataset.py b/megatron/data/t5_dataset.py index 6482d7731..6cd2537a3 100644 --- a/megatron/data/t5_dataset.py +++ b/megatron/data/t5_dataset.py @@ -197,17 +197,26 @@ def merge_subsequent_masks(tokens, masked_spans=None, bos_id=None, (t5_decoder_in, t5_decoder_out) = ([bos_id], []) (start_index, end_index) = (0, None) for span in masked_spans: + end_index = span.index[0] + # The part of the sequence that is visible before the masked + # span starts. Starting from beginning or end of last masked + # span. + before_mask = tokens[start_index:end_index] + if insert_mask_tokens: flag = sentinel_tokens.popleft() # Append the same tokens in decoder input and output t5_decoder_in.append(flag) t5_decoder_out.append(flag) + elif not prefix_lm: + # Append visible part of input sequence. + t5_decoder_in.extend(before_mask) + t5_decoder_out.extend(before_mask) t5_decoder_in.extend(span.label) t5_decoder_out.extend(span.label) - end_index = span.index[0] - t5_input.extend(tokens[start_index: end_index]) + t5_input.extend(before_mask) if insert_mask_tokens: t5_input.append(flag) From 55320eaf05c007a0198c028bff626340deb796f2 Mon Sep 17 00:00:00 2001 From: janEbert Date: Thu, 16 Feb 2023 10:00:21 +0100 Subject: [PATCH 045/122] Fix unclear wording This wording was confusing and basically stated the wrong thing. The number/amount of n-grams is not bounded by `max_ngrams`, even though the variable name sounds like it. Instead, `max_ngrams` bounds n. --- megatron/data/dataset_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/data/dataset_utils.py b/megatron/data/dataset_utils.py index 381eb8cb7..3a8038e6a 100644 --- a/megatron/data/dataset_utils.py +++ b/megatron/data/dataset_utils.py @@ -285,7 +285,7 @@ def create_masked_lm_predictions(tokens, # plays nicely with the other probability distributions in terms # of math. normal_mean = (max_ngrams + 1) / 2 - # However, we do not want to bound the maximum number of + # However, we do not want to bound the maximum length of # n-grams. max_ngrams = num_filtered_tokens - 1 From 5d27b27fb6ba13a7b4b21a0cfc20aa709db9ab35 Mon Sep 17 00:00:00 2001 From: janEbert Date: Thu, 16 Feb 2023 11:00:07 +0100 Subject: [PATCH 046/122] Adjust code style It's just too ugly to leave it like the original. --- megatron/data/ul2_dataset.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/megatron/data/ul2_dataset.py b/megatron/data/ul2_dataset.py index 844df1b30..da8d86e1e 100644 --- a/megatron/data/ul2_dataset.py +++ b/megatron/data/ul2_dataset.py @@ -331,13 +331,14 @@ def build_training_sample(sample, target_seq_length, } else: # Padding. 
- tokens_enc, tokens_dec_in, labels, enc_mask, \ - dec_mask, enc_dec_mask, loss_mask \ - = pad_and_convert_to_numpy(tokens, masked_positions, - masked_labels, pad_id, max_seq_length, - max_seq_length_dec, masked_spans, - bos_id, eos_id, sentinel_tokens, - prefix_lm) + ( + tokens_enc, tokens_dec_in, labels, enc_mask, + dec_mask, enc_dec_mask, loss_mask, + ) = pad_and_convert_to_numpy(tokens, masked_positions, + masked_labels, pad_id, max_seq_length, + max_seq_length_dec, masked_spans, + bos_id, eos_id, sentinel_tokens, + prefix_lm) train_sample = { 'text_enc': tokens_enc, From 23181ab3b1e18a6e80c4737a5b289e9f92536ed8 Mon Sep 17 00:00:00 2001 From: janEbert Date: Fri, 17 Feb 2023 13:00:26 +0100 Subject: [PATCH 047/122] Fix covered index skipping --- megatron/data/dataset_utils.py | 8 +++++++- megatron/data/ul2_dataset.py | 1 + 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/megatron/data/dataset_utils.py b/megatron/data/dataset_utils.py index 3a8038e6a..f1d708aba 100644 --- a/megatron/data/dataset_utils.py +++ b/megatron/data/dataset_utils.py @@ -332,10 +332,16 @@ def get_ngram_indices_(idx): continue # Note(mingdachen): # Skip current piece if they are covered in lm masking or previous ngrams. + is_covered = False for index_set in cand_index_set[0]: for index in index_set: if index in covered_indexes: - continue + is_covered = True + break + if is_covered: + break + if is_covered: + break if sampling_style is SamplingStyle.POISSON: n = np_rng.choice(ngrams[:len(cand_index_set)], diff --git a/megatron/data/ul2_dataset.py b/megatron/data/ul2_dataset.py index da8d86e1e..6f49010ec 100644 --- a/megatron/data/ul2_dataset.py +++ b/megatron/data/ul2_dataset.py @@ -92,6 +92,7 @@ def __init__(self, name, indexed_dataset, data_prefix, if is_decoder_only(model_type): # Here we also add a SEP token. min_added_tokens += 1 + self.samples_mapping = get_samples_mapping( self.indexed_dataset, data_prefix, From 6032cc6c96cca6f43e21c1f27669a03af553214d Mon Sep 17 00:00:00 2001 From: janEbert Date: Fri, 17 Feb 2023 13:24:51 +0100 Subject: [PATCH 048/122] Prepend objective token before truncating --- megatron/data/ul2_dataset.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/megatron/data/ul2_dataset.py b/megatron/data/ul2_dataset.py index 6f49010ec..1f5e2d2e9 100644 --- a/megatron/data/ul2_dataset.py +++ b/megatron/data/ul2_dataset.py @@ -224,6 +224,15 @@ def build_training_sample(sample, target_seq_length, # flatten sentences into one list tokens = [token for sentence in sample for token in sentence] + # Prepend objective token. + cls_id = cls_ids.get(denoiser, False) + if cls_id is False: + raise ValueError('unknown denoiser') + + # If objective token is `None`, ignore it. + if cls_id is not None: + tokens = [cls_id] + tokens + max_num_tokens = target_seq_length # if is_decoder_only(model_type): # # Keep space for repeated `extra_id` tokens; not the most data @@ -237,15 +246,6 @@ def build_training_sample(sample, target_seq_length, truncated = len(tokens) > max_num_tokens tokens = tokens[:max_num_tokens] - # Prepend objective token. - cls_id = cls_ids.get(denoiser, False) - if cls_id is False: - raise ValueError('unknown denoiser') - - # If objective token is `None`, ignore it. - if cls_id is not None: - tokens = [cls_id] + tokens - # Masking. 
mean_ngrams = mean_span_lengths[denoiser_index] if mean_ngrams < 1: From c9c336f7f5fdebda16aabc581b853e5cfeb17d88 Mon Sep 17 00:00:00 2001 From: janEbert Date: Fri, 17 Feb 2023 13:25:20 +0100 Subject: [PATCH 049/122] Automatically truncate sequences for decoder-only Expecting the user to supply a sequence length greater than any data point is ridiculous. So now we greedily truncate the sequence based on the maximum amount of `extra_id`s, which wastes a lot of data. An alternative would be going a statistical route with significance attached to it; allowing the expected amount of tokens with some leeway, while handling an unlikely length excession error. This only handles the decoder-only case, while the encoder-decoder case is left as is. This is because errors are much less like for the encoder-decoder case unless massive corruption is configured or if the decoder has a smaller sequence length than the encoder. --- megatron/data/ul2_dataset.py | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/megatron/data/ul2_dataset.py b/megatron/data/ul2_dataset.py index 1f5e2d2e9..d78bc7dbb 100644 --- a/megatron/data/ul2_dataset.py +++ b/megatron/data/ul2_dataset.py @@ -16,6 +16,7 @@ """UL2-style dataset.""" from collections import ChainMap +import math import numpy as np import torch @@ -234,17 +235,21 @@ def build_training_sample(sample, target_seq_length, tokens = [cls_id] + tokens max_num_tokens = target_seq_length - # if is_decoder_only(model_type): - # # Keep space for repeated `extra_id` tokens; not the most data - # # efficient since we calculate this based on the maximum number - # # of possible `extra_id` tokens. - # safe_max_seq_len = math.floor(max_num_tokens / (1 + masked_lm_prob)) - # truncated = len(tokens) > safe_max_seq_len - # tokens = tokens[:safe_max_seq_len] - # else: - # Truncate to `target_sequence_length`. - truncated = len(tokens) > max_num_tokens - tokens = tokens[:max_num_tokens] + if is_decoder_only(model_type) and denoiser != 'S': + # Keep space for repeated `extra_id` tokens; not the most data + # efficient since we calculate this based on the maximum number + # of possible `extra_id` tokens. + safe_max_seq_len = math.floor(max_num_tokens / (1 + masked_lm_prob)) + truncated = len(tokens) > safe_max_seq_len + tokens = tokens[:safe_max_seq_len] + else: + # If we are S-denoising, we know only one `extra_id` token is + # goint to be added. + if is_decoder_only(model_type) and denoiser == 'S': + max_num_tokens -= 1 + # Truncate to `target_sequence_length`. + truncated = len(tokens) > max_num_tokens + tokens = tokens[:max_num_tokens] # Masking. 
mean_ngrams = mean_span_lengths[denoiser_index] From b8003cba16559a67e38a7132634f81b091924f37 Mon Sep 17 00:00:00 2001 From: janEbert Date: Fri, 17 Feb 2023 13:41:50 +0100 Subject: [PATCH 050/122] Fix covered span skipping fix --- megatron/data/dataset_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/data/dataset_utils.py b/megatron/data/dataset_utils.py index f1d708aba..b657a06ba 100644 --- a/megatron/data/dataset_utils.py +++ b/megatron/data/dataset_utils.py @@ -341,7 +341,7 @@ def get_ngram_indices_(idx): if is_covered: break if is_covered: - break + continue if sampling_style is SamplingStyle.POISSON: n = np_rng.choice(ngrams[:len(cand_index_set)], From e3d91a6aea771cfdbd362c549a7c8ef3fa99d51f Mon Sep 17 00:00:00 2001 From: janEbert Date: Fri, 17 Feb 2023 15:00:37 +0100 Subject: [PATCH 051/122] Make `build_index_mappings` public --- megatron/data/gpt_dataset.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/megatron/data/gpt_dataset.py b/megatron/data/gpt_dataset.py index 0db1aa2fe..071222949 100644 --- a/megatron/data/gpt_dataset.py +++ b/megatron/data/gpt_dataset.py @@ -237,7 +237,7 @@ def __init__(self, name, data_prefix, documents, indexed_dataset, assert np.max(documents) < indexed_dataset.sizes.shape[0] # Build index mappings. - self.doc_idx, self.sample_idx, self.shuffle_idx = _build_index_mappings( + self.doc_idx, self.sample_idx, self.shuffle_idx = build_index_mappings( self.name, data_prefix, documents, self.indexed_dataset.sizes, num_samples, seq_length, seed) @@ -275,7 +275,7 @@ def __getitem__(self, idx): return {'text': np.array(sample, dtype=np.int64)} -def _build_index_mappings(name, data_prefix, documents, sizes, +def build_index_mappings(name, data_prefix, documents, sizes, num_samples, seq_length, seed, cutoff_last_epoch=0.95): """Build doc-idx, sample-idx, and shuffle-idx. doc-idx: is an array (ordered) of documents to be used in training. From e61e78fa2f2890409232172e855ac8baa5724397 Mon Sep 17 00:00:00 2001 From: janEbert Date: Fri, 17 Feb 2023 15:00:49 +0100 Subject: [PATCH 052/122] Refactor getting sample --- megatron/data/gpt_dataset.py | 55 ++++++++++++++++++++---------------- 1 file changed, 30 insertions(+), 25 deletions(-) diff --git a/megatron/data/gpt_dataset.py b/megatron/data/gpt_dataset.py index 071222949..14706cc3c 100644 --- a/megatron/data/gpt_dataset.py +++ b/megatron/data/gpt_dataset.py @@ -247,34 +247,39 @@ def __len__(self): return self.sample_idx.shape[0] - 1 def __getitem__(self, idx): - # Get the shuffled index. - idx = self.shuffle_idx[idx] - # Start and end documents and offsets. - doc_index_f = self.sample_idx[idx][0] - doc_index_l = self.sample_idx[idx + 1][0] - offset_f = self.sample_idx[idx][1] - offset_l = self.sample_idx[idx + 1][1] - # If we are within the same document, just extract the chunk. - if doc_index_f == doc_index_l: - sample = self.indexed_dataset.get(self.doc_idx[doc_index_f], - offset=offset_f, - length=offset_l - offset_f + 1) - else: - # Otherwise, get the rest of the initial document. - sample_list = [self.indexed_dataset.get(self.doc_idx[doc_index_f], - offset=offset_f)] - # Loop over all in between documents and add the entire document. - for i in range(doc_index_f + 1, doc_index_l): - sample_list.append(self.indexed_dataset.get(self.doc_idx[i])) - # And finally add the relevant portion of last document. 
- sample_list.append(self.indexed_dataset.get( - self.doc_idx[doc_index_l], - length=offset_l + 1)) - sample = np.concatenate(sample_list) - + sample = get_sample(self.indexed_dataset, self.doc_idx, + self.sample_idx, self.shuffle_idx, idx) return {'text': np.array(sample, dtype=np.int64)} +def get_sample(indexed_dataset, doc_idx, sample_idx, shuffle_idx, idx): + # Get the shuffled index. + idx = shuffle_idx[idx] + # Start and end documents and offsets. + doc_index_f = sample_idx[idx][0] + doc_index_l = sample_idx[idx + 1][0] + offset_f = sample_idx[idx][1] + offset_l = sample_idx[idx + 1][1] + # If we are within the same document, just extract the chunk. + if doc_index_f == doc_index_l: + sample = indexed_dataset.get(doc_idx[doc_index_f], + offset=offset_f, + length=offset_l - offset_f + 1) + else: + # Otherwise, get the rest of the initial document. + sample_list = [indexed_dataset.get(doc_idx[doc_index_f], + offset=offset_f)] + # Loop over all in between documents and add the entire document. + for i in range(doc_index_f + 1, doc_index_l): + sample_list.append(indexed_dataset.get(doc_idx[i])) + # And finally add the relevant portion of last document. + sample_list.append(indexed_dataset.get( + doc_idx[doc_index_l], + length=offset_l + 1)) + sample = np.concatenate(sample_list) + return sample + + def build_index_mappings(name, data_prefix, documents, sizes, num_samples, seq_length, seed, cutoff_last_epoch=0.95): """Build doc-idx, sample-idx, and shuffle-idx. From c3b0a55e59d65b4031f6b52467958dcc2e40c91e Mon Sep 17 00:00:00 2001 From: janEbert Date: Fri, 17 Feb 2023 15:03:47 +0100 Subject: [PATCH 053/122] Add sample packing to T5 dataset Backward-incompatible change due to positional argument without default, inserted before another positional argument. --- megatron/arguments.py | 3 +++ megatron/data/t5_dataset.py | 51 ++++++++++++++++++++++++------------- 2 files changed, 37 insertions(+), 17 deletions(-) diff --git a/megatron/arguments.py b/megatron/arguments.py index 18d76e0fc..397dd33ff 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -917,6 +917,9 @@ def __call__(self, parser, args, values, option_string=None): help='Whether not to add sentinel tokens for masked ' 'spans in span corruption tasks.', dest='add_mask_tokens') + group.add_argument('--pack-samples', action='store_true', + help='Whether to pack samples in span corruption ' + 'datasets (T5 or UL2). GPT dataset is always packed.') group.add_argument('--mmap-warmup', action='store_true', help='Warm up mmap files.') group.add_argument('--num-workers', type=int, default=2, diff --git a/megatron/data/t5_dataset.py b/megatron/data/t5_dataset.py index 6cd2537a3..c238c0028 100644 --- a/megatron/data/t5_dataset.py +++ b/megatron/data/t5_dataset.py @@ -25,6 +25,7 @@ create_masked_lm_predictions, get_samples_mapping ) +from megatron.data.gpt_dataset import build_index_mappings, get_sample class LengthExceededError(ValueError): @@ -52,7 +53,7 @@ class T5Dataset(torch.utils.data.Dataset): def __init__(self, name, indexed_dataset, data_prefix, num_epochs, max_num_samples, masked_lm_prob, max_seq_length, max_seq_length_dec, - short_seq_prob, add_mask_tokens, seed): + short_seq_prob, add_mask_tokens, pack_samples, seed): # Params to store. self.name = name @@ -63,17 +64,25 @@ def __init__(self, name, indexed_dataset, data_prefix, # Dataset. self.indexed_dataset = indexed_dataset + self.pack_samples = pack_samples - # Build the samples mapping. 
- self.samples_mapping = get_samples_mapping(self.indexed_dataset, - data_prefix, - num_epochs, - max_num_samples, - self.max_seq_length - 2, # account for added tokens - short_seq_prob, - self.seed, - self.name, - False) + if self.pack_samples: + self.doc_idx, self.sample_idx, self.shuffle_idx = build_index_mappings( + self.name, data_prefix, self.indexed_dataset.get_doc_idx(), + self.indexed_dataset.sizes, max_num_samples, + self.max_seq_length, self.seed) + else: + # Build the samples mapping. + self.samples_mapping = get_samples_mapping(self.indexed_dataset, + data_prefix, + num_epochs, + max_num_samples, + # account for added tokens + self.max_seq_length - 2, + short_seq_prob, + self.seed, + self.name, + False) # Vocab stuff. tokenizer = get_tokenizer() @@ -92,14 +101,22 @@ def __init__(self, name, indexed_dataset, data_prefix, self.sentinel_tokens = None def __len__(self): - return self.samples_mapping.shape[0] + if self.pack_samples: + return self.sample_idx.shape[0] - 1 + else: + return self.samples_mapping.shape[0] def __getitem__(self, idx): - - start_index, end_index, seq_length = self.samples_mapping[idx] - sample = [] - for index in range(start_index, end_index): - sample.append(self.indexed_dataset[index]) + if self.pack_samples: + sample = get_sample(self.indexed_dataset, self.doc_idx, + self.sample_idx, self.shuffle_idx, idx) + seq_length = len(sample) + sample = [sample] + else: + start_index, end_index, seq_length = self.samples_mapping[idx] + sample = [] + for index in range(start_index, end_index): + sample.append(self.indexed_dataset[index]) # Note that this rng state should be numpy and not python since # python randint is inclusive whereas the numpy one is exclusive. np_rng = np.random.RandomState(seed=(self.seed + idx)) From c4d748ba39dfad16ca30e8fd0f853d29744df18e Mon Sep 17 00:00:00 2001 From: janEbert Date: Fri, 17 Feb 2023 15:05:52 +0100 Subject: [PATCH 054/122] Add sample packing to UL2 dataset --- megatron/data/ul2_dataset.py | 55 +++++++++++++++++++++++------------- 1 file changed, 36 insertions(+), 19 deletions(-) diff --git a/megatron/data/ul2_dataset.py b/megatron/data/ul2_dataset.py index d78bc7dbb..5cffa2935 100644 --- a/megatron/data/ul2_dataset.py +++ b/megatron/data/ul2_dataset.py @@ -27,6 +27,7 @@ get_samples_mapping, SamplingStyle ) +from megatron.data.gpt_dataset import build_index_mappings, get_sample from megatron.data.t5_dataset import ( LengthExceededError, make_history_mask, @@ -52,9 +53,9 @@ class UL2Dataset(torch.utils.data.Dataset): def __init__(self, name, indexed_dataset, data_prefix, num_epochs, max_num_samples, model_type, denoiser_ratios, denoisers, mean_span_lengths, - mask_ratios, add_mask_tokens, denoiser_tokens, - max_seq_length, max_seq_length_dec, short_seq_prob, - seed): + mask_ratios, add_mask_tokens, pack_samples, + denoiser_tokens, max_seq_length, max_seq_length_dec, + short_seq_prob, seed): super().__init__() if denoiser_ratios is None: @@ -86,6 +87,7 @@ def __init__(self, name, indexed_dataset, data_prefix, # Dataset. self.indexed_dataset = indexed_dataset + self.pack_samples = pack_samples # Build the samples mapping. # Minimum number of tokens added: BOS and EOS. @@ -94,17 +96,23 @@ def __init__(self, name, indexed_dataset, data_prefix, # Here we also add a SEP token. 
min_added_tokens += 1 - self.samples_mapping = get_samples_mapping( - self.indexed_dataset, - data_prefix, - num_epochs, - max_num_samples, - self.max_seq_length - min_added_tokens, # account for added tokens - short_seq_prob, - self.seed, - self.name, - False, - ) + if self.pack_samples: + self.doc_idx, self.sample_idx, self.shuffle_idx = build_index_mappings( + self.name, data_prefix, self.indexed_dataset.get_doc_idx()[:-1], + self.indexed_dataset.sizes, max_num_samples, + self.max_seq_length - min_added_tokens, self.seed) + else: + self.samples_mapping = get_samples_mapping( + self.indexed_dataset, + data_prefix, + num_epochs, + max_num_samples, + self.max_seq_length - min_added_tokens, # account for added tokens + short_seq_prob, + self.seed, + self.name, + False, + ) # Vocab stuff. tokenizer = get_tokenizer() @@ -153,13 +161,22 @@ def __init__(self, name, indexed_dataset, data_prefix, self.sentinel_tokens = None def __len__(self): - return self.samples_mapping.shape[0] + if self.pack_samples: + return self.sample_idx.shape[0] - 1 + else: + return self.samples_mapping.shape[0] def __getitem__(self, idx): - start_index, end_index, seq_length = self.samples_mapping[idx] - sample = [] - for index in range(start_index, end_index): - sample.append(self.indexed_dataset[index]) + if self.pack_samples: + sample = get_sample(self.indexed_dataset, self.doc_idx, + self.sample_idx, self.shuffle_idx, idx) + seq_length = len(sample) + sample = [sample] + else: + start_index, end_index, seq_length = self.samples_mapping[idx] + sample = [] + for index in range(start_index, end_index): + sample.append(self.indexed_dataset[index]) # Note that this rng state should be numpy and not python since # python randint is inclusive whereas the numpy one is exclusive. np_rng = np.random.RandomState(seed=(self.seed + idx)) From 689b57e3737b94e6c48704c7fa1b75e984081eb8 Mon Sep 17 00:00:00 2001 From: janEbert Date: Fri, 17 Feb 2023 15:19:46 +0100 Subject: [PATCH 055/122] Fix typo and comment placement --- megatron/data/ul2_dataset.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/megatron/data/ul2_dataset.py b/megatron/data/ul2_dataset.py index 5cffa2935..1d2a71455 100644 --- a/megatron/data/ul2_dataset.py +++ b/megatron/data/ul2_dataset.py @@ -89,13 +89,13 @@ def __init__(self, name, indexed_dataset, data_prefix, self.indexed_dataset = indexed_dataset self.pack_samples = pack_samples - # Build the samples mapping. # Minimum number of tokens added: BOS and EOS. min_added_tokens = 2 if is_decoder_only(model_type): # Here we also add a SEP token. min_added_tokens += 1 + # Build the samples mapping. if self.pack_samples: self.doc_idx, self.sample_idx, self.shuffle_idx = build_index_mappings( self.name, data_prefix, self.indexed_dataset.get_doc_idx()[:-1], @@ -261,7 +261,7 @@ def build_training_sample(sample, target_seq_length, tokens = tokens[:safe_max_seq_len] else: # If we are S-denoising, we know only one `extra_id` token is - # goint to be added. + # going to be added. if is_decoder_only(model_type) and denoiser == 'S': max_num_tokens -= 1 # Truncate to `target_sequence_length`. 
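Aside on the packing introduced in the two patches above: a toy, self-contained illustration (all lengths and offsets below are invented; nothing here is part of the series) of why the packed `enc_mask` comes out block-diagonal. The decoder-side masks are the same blocks with an additional history mask applied.

    import numpy as np

    # Two packed documents of lengths 3 and 2 in a row of maximum length 6.
    # Each document may only attend to itself; the unused tail stays masked.
    max_len = 6
    mask = np.zeros((max_len, max_len), dtype=np.int64)
    for start, length in [(0, 3), (3, 2)]:   # per-document offsets into the row
        mask[start:start + length, start:start + length] = 1
    print(mask)
    # [[1 1 1 0 0 0]
    #  [1 1 1 0 0 0]
    #  [1 1 1 0 0 0]
    #  [0 0 0 1 1 0]
    #  [0 0 0 1 1 0]
    #  [0 0 0 0 0 0]]
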
From af204e75119f6b0018b80a29dfd8eaa591841efc Mon Sep 17 00:00:00 2001 From: janEbert Date: Fri, 17 Feb 2023 15:29:34 +0100 Subject: [PATCH 056/122] Fix not supplying `--pack-samples` argument --- megatron/data/dataset_utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/megatron/data/dataset_utils.py b/megatron/data/dataset_utils.py index b657a06ba..52cdee03c 100644 --- a/megatron/data/dataset_utils.py +++ b/megatron/data/dataset_utils.py @@ -649,6 +649,7 @@ def build_dataset(index, name): max_seq_length_dec=max_seq_length_dec, short_seq_prob=short_seq_prob, add_mask_tokens=args.add_mask_tokens, + pack_samples=args.pack_samples, **kwargs ) elif dataset_type == DSET_TYPE_UL2: @@ -661,6 +662,7 @@ def build_dataset(index, name): mean_span_lengths=args.ul2_mean_span_lengths, mask_ratios=args.ul2_mask_ratios, add_mask_tokens=args.add_mask_tokens, + pack_samples=args.pack_samples, denoiser_tokens={ 'R': args.ul2_r_denoiser_token, 'S': args.ul2_s_denoiser_token, From 78eb0358159266082ca900646175aeb7ec9348ca Mon Sep 17 00:00:00 2001 From: janEbert Date: Fri, 17 Feb 2023 15:29:15 +0100 Subject: [PATCH 057/122] Add support for UL2R-style implementation --- megatron/arguments.py | 4 ++++ megatron/data/dataset_utils.py | 1 + megatron/data/ul2_dataset.py | 19 +++++++++++++------ 3 files changed, 18 insertions(+), 6 deletions(-) diff --git a/megatron/arguments.py b/megatron/arguments.py index 397dd33ff..28e246ece 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -1080,6 +1080,10 @@ def _add_ul2_args(parser): help='What token to prepend for the UL2 X-denoising ' 'objective. If empty, do not prepend a token for this ' 'objective.') + group.add_argument('--ul2-like-ul2r', action='store_true', + help='Whether to use the updated implementation as ' + 'described in the UL2R paper. This only changes the ' + 'implementation, not the objective configurations!') return parser diff --git a/megatron/data/dataset_utils.py b/megatron/data/dataset_utils.py index 52cdee03c..68400cdf9 100644 --- a/megatron/data/dataset_utils.py +++ b/megatron/data/dataset_utils.py @@ -668,6 +668,7 @@ def build_dataset(index, name): 'S': args.ul2_s_denoiser_token, 'X': args.ul2_x_denoiser_token, }, + like_ul2r=args.ul2_like_ul2r, max_seq_length_dec=max_seq_length_dec, short_seq_prob=short_seq_prob, **kwargs, diff --git a/megatron/data/ul2_dataset.py b/megatron/data/ul2_dataset.py index 1d2a71455..ada6b5d95 100644 --- a/megatron/data/ul2_dataset.py +++ b/megatron/data/ul2_dataset.py @@ -54,8 +54,8 @@ def __init__(self, name, indexed_dataset, data_prefix, num_epochs, max_num_samples, model_type, denoiser_ratios, denoisers, mean_span_lengths, mask_ratios, add_mask_tokens, pack_samples, - denoiser_tokens, max_seq_length, max_seq_length_dec, - short_seq_prob, seed): + denoiser_tokens, like_ul2r, max_seq_length, + max_seq_length_dec, short_seq_prob, seed): super().__init__() if denoiser_ratios is None: @@ -84,6 +84,7 @@ def __init__(self, name, indexed_dataset, data_prefix, self.denoisers = [denoiser.upper() for denoiser in denoisers] self.mean_span_lengths = mean_span_lengths self.mask_ratios = mask_ratios + self.like_ul2r = like_ul2r # Dataset. 
self.indexed_dataset = indexed_dataset @@ -189,8 +190,9 @@ def __getitem__(self, idx): self.mask_id, self.pad_id, self.model_type, self.denoiser_ratios, self.denoisers, self.mean_span_lengths, - self.mask_ratios, np_rng, self.bos_id, - self.eos_id, self.sentinel_tokens) + self.mask_ratios, self.like_ul2r, np_rng, + self.bos_id, self.eos_id, + self.sentinel_tokens) def build_training_sample(sample, target_seq_length, @@ -199,7 +201,7 @@ def build_training_sample(sample, target_seq_length, cls_ids, sep_id, mask_id, pad_id, model_type, denoiser_ratios, denoisers, mean_span_lengths, - mask_ratios, np_rng, + mask_ratios, like_ul2r, np_rng, bos_id=None, eos_id=None, sentinel_tokens=None): """Build training sample. @@ -224,6 +226,8 @@ def build_training_sample(sample, target_seq_length, mean_span_lengths: Mean length for sampling span lengths. Numbers < 1 indicate a mean length of the sequence length times that number. mask_ratios: Ratio of masked token in the full sequence. + like_ul2r: Whether to use the updated implementation as specified in + the UL2R paper. np_rng: Random number genenrator. Note that this rng state should be numpy and not python since python randint is inclusive for the opper bound whereas the numpy one is exclusive. @@ -276,7 +280,10 @@ def build_training_sample(sample, target_seq_length, max_ngrams = mean_ngrams * 2 - 1 if denoiser == 'R' or denoiser == 'X': - sampling_style = SamplingStyle.NORMAL + if like_ul2r: + sampling_style = SamplingStyle.UNIFORM + else: + sampling_style = SamplingStyle.NORMAL prefix_lm = False max_predictions_per_seq = len(tokens) - 1 elif denoiser == 'S': From c03eed4e68fcac9153c201f7731c0615e5fd1dbf Mon Sep 17 00:00:00 2001 From: janEbert Date: Fri, 17 Feb 2023 15:52:58 +0100 Subject: [PATCH 058/122] Fix T5 dataset packing Forgot to apply fixes here. --- megatron/data/t5_dataset.py | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/megatron/data/t5_dataset.py b/megatron/data/t5_dataset.py index c238c0028..53392aeaf 100644 --- a/megatron/data/t5_dataset.py +++ b/megatron/data/t5_dataset.py @@ -66,23 +66,27 @@ def __init__(self, name, indexed_dataset, data_prefix, self.indexed_dataset = indexed_dataset self.pack_samples = pack_samples + # Minimum number of tokens added: BOS and EOS. + min_added_tokens = 2 if self.pack_samples: self.doc_idx, self.sample_idx, self.shuffle_idx = build_index_mappings( - self.name, data_prefix, self.indexed_dataset.get_doc_idx(), + self.name, data_prefix, self.indexed_dataset.get_doc_idx()[:-1], self.indexed_dataset.sizes, max_num_samples, - self.max_seq_length, self.seed) + self.max_seq_length - min_added_tokens, self.seed) else: # Build the samples mapping. - self.samples_mapping = get_samples_mapping(self.indexed_dataset, - data_prefix, - num_epochs, - max_num_samples, - # account for added tokens - self.max_seq_length - 2, - short_seq_prob, - self.seed, - self.name, - False) + self.samples_mapping = get_samples_mapping( + self.indexed_dataset, + data_prefix, + num_epochs, + max_num_samples, + # account for added tokens + self.max_seq_length - min_added_tokens, + short_seq_prob, + self.seed, + self.name, + False, + ) # Vocab stuff. tokenizer = get_tokenizer() From 9e84f06dcea37200f1450e3468383edc7d20de73 Mon Sep 17 00:00:00 2001 From: janEbert Date: Wed, 22 Feb 2023 16:23:08 +0100 Subject: [PATCH 059/122] Refactor `get_sample` to return a list Accordingly, rename to `get_samples`. 
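
A short, hedged sketch of the new contract (it assumes the patched `megatron.data.gpt_dataset` is importable; the toy dataset only mimics the `get(idx, offset, length)` interface of the real indexed datasets, and all token values are made up):

    import numpy as np

    from megatron.data.gpt_dataset import get_samples  # as introduced above

    class ToyIndexedDataset:
        # Hypothetical stand-in for an indexed dataset of tokenized documents.
        def __init__(self, docs):
            self.docs = [np.asarray(d, dtype=np.int64) for d in docs]

        def get(self, doc_id, offset=0, length=None):
            doc = self.docs[doc_id][offset:]
            return doc if length is None else doc[:length]

    dataset = ToyIndexedDataset([[5, 6, 7, 1], [8, 9, 1], [10, 11, 12, 13, 1]])
    doc_idx = np.array([0, 1, 2])
    sample_idx = np.array([[0, 1], [2, 2]])  # one sample: doc 0 (offset 1) .. doc 2 (offset 2)
    shuffle_idx = np.array([0])

    sample_list = get_samples(dataset, doc_idx, sample_idx, shuffle_idx, 0)
    print([s.tolist() for s in sample_list])     # [[6, 7, 1], [8, 9, 1], [10, 11, 12]]
    print(np.concatenate(sample_list).tolist())  # GPT-style packed stream

The GPT dataset keeps concatenating the returned list, while the packed T5/UL2 datasets in the following patches denoise each returned document separately before packing the results into one row.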
--- megatron/data/gpt_dataset.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/megatron/data/gpt_dataset.py b/megatron/data/gpt_dataset.py index 14706cc3c..61d182445 100644 --- a/megatron/data/gpt_dataset.py +++ b/megatron/data/gpt_dataset.py @@ -247,12 +247,13 @@ def __len__(self): return self.sample_idx.shape[0] - 1 def __getitem__(self, idx): - sample = get_sample(self.indexed_dataset, self.doc_idx, - self.sample_idx, self.shuffle_idx, idx) + sample_list = get_samples(self.indexed_dataset, self.doc_idx, + self.sample_idx, self.shuffle_idx, idx) + sample = np.concatenate(sample_list) return {'text': np.array(sample, dtype=np.int64)} -def get_sample(indexed_dataset, doc_idx, sample_idx, shuffle_idx, idx): +def get_samples(indexed_dataset, doc_idx, sample_idx, shuffle_idx, idx): # Get the shuffled index. idx = shuffle_idx[idx] # Start and end documents and offsets. @@ -265,6 +266,7 @@ def get_sample(indexed_dataset, doc_idx, sample_idx, shuffle_idx, idx): sample = indexed_dataset.get(doc_idx[doc_index_f], offset=offset_f, length=offset_l - offset_f + 1) + sample_list = [sample] else: # Otherwise, get the rest of the initial document. sample_list = [indexed_dataset.get(doc_idx[doc_index_f], @@ -276,8 +278,7 @@ def get_sample(indexed_dataset, doc_idx, sample_idx, shuffle_idx, idx): sample_list.append(indexed_dataset.get( doc_idx[doc_index_l], length=offset_l + 1)) - sample = np.concatenate(sample_list) - return sample + return sample_list def build_index_mappings(name, data_prefix, documents, sizes, From 5e2b4f545b4afa823ec21da46791e81ddae51a27 Mon Sep 17 00:00:00 2001 From: janEbert Date: Wed, 22 Feb 2023 16:33:24 +0100 Subject: [PATCH 060/122] Fix T5 sample packing --- megatron/data/t5_dataset.py | 243 +++++++++++++++++++++++++++++++++--- 1 file changed, 225 insertions(+), 18 deletions(-) diff --git a/megatron/data/t5_dataset.py b/megatron/data/t5_dataset.py index 53392aeaf..a8ac07ed8 100644 --- a/megatron/data/t5_dataset.py +++ b/megatron/data/t5_dataset.py @@ -25,7 +25,7 @@ create_masked_lm_predictions, get_samples_mapping ) -from megatron.data.gpt_dataset import build_index_mappings, get_sample +from megatron.data.gpt_dataset import build_index_mappings, get_samples class LengthExceededError(ValueError): @@ -111,29 +111,78 @@ def __len__(self): return self.samples_mapping.shape[0] def __getitem__(self, idx): + # Note that this rng state should be numpy and not python since + # python randint is inclusive whereas the numpy one is exclusive. 
+ np_rng = np.random.RandomState(seed=(self.seed + idx)) if self.pack_samples: - sample = get_sample(self.indexed_dataset, self.doc_idx, - self.sample_idx, self.shuffle_idx, idx) - seq_length = len(sample) - sample = [sample] + samples = get_samples(self.indexed_dataset, self.doc_idx, + self.sample_idx, self.shuffle_idx, idx) + samples_dict = { + 'text_enc': np.empty((self.max_seq_length,), dtype=np.int64), + 'text_dec': np.empty( + (self.max_seq_length_dec,), dtype=np.int64), + 'labels': np.empty( + (self.max_seq_length_dec,), dtype=np.int64), + 'loss_mask': np.zeros( + (self.max_seq_length_dec,), dtype=np.int64), + 'truncated': 0, + 'enc_mask': np.zeros( + (self.max_seq_length, self.max_seq_length), + dtype=np.int64, + ), + 'dec_mask': np.zeros( + (self.max_seq_length_dec, self.max_seq_length_dec), + dtype=np.int64, + ), + 'enc_dec_mask': np.zeros( + (self.max_seq_length_dec, self.max_seq_length), + dtype=np.int64, + ), + } + prev_len = 0 + prev_len_dec = 0 + for sample in samples: + seq_length = len(sample) + result_sample = build_training_sample( + [sample], seq_length, + self.max_seq_length, # needed for padding + self.max_seq_length_dec, self.vocab_id_list, + self.vocab_id_to_token_dict, self.cls_id, self.sep_id, + self.mask_id, self.pad_id, self.masked_lm_prob, np_rng, + self.bos_id, self.eos_id, self.sentinel_tokens) + maybe_lens = update_samples_dict( + samples_dict, + result_sample, + self.max_seq_length, + self.max_seq_length_dec, + prev_len, + prev_len_dec, + self.pad_id, + self.eos_id, + ) + if maybe_lens is None: + # We are exceeding our sequence length already. + break + + len_enc, len_dec = maybe_lens + prev_len += len_enc + prev_len_dec += len_dec + + add_final_padding( + samples_dict, prev_len, prev_len_dec, self.pad_id) else: start_index, end_index, seq_length = self.samples_mapping[idx] sample = [] for index in range(start_index, end_index): sample.append(self.indexed_dataset[index]) - # Note that this rng state should be numpy and not python since - # python randint is inclusive whereas the numpy one is exclusive. 
- np_rng = np.random.RandomState(seed=(self.seed + idx)) - return build_training_sample(sample, seq_length, - self.max_seq_length, # needed for padding - self.max_seq_length_dec, - self.vocab_id_list, - self.vocab_id_to_token_dict, - self.cls_id, self.sep_id, - self.mask_id, self.pad_id, - self.masked_lm_prob, np_rng, - self.bos_id, self.eos_id, - self.sentinel_tokens) + samples_dict = build_training_sample( + sample, seq_length, + self.max_seq_length, # needed for padding + self.max_seq_length_dec, self.vocab_id_list, + self.vocab_id_to_token_dict, self.cls_id, self.sep_id, + self.mask_id, self.pad_id, self.masked_lm_prob, np_rng, + self.bos_id, self.eos_id, self.sentinel_tokens) + return samples_dict def build_training_sample(sample, target_seq_length, @@ -343,3 +392,161 @@ def make_history_mask_3d(block): history_mask = (arange[None, ] <= arange[:, None])[None, ] history_mask = history_mask.expand(batch, length, length) return history_mask + + +def _remove_padding(result_sample, pad_id): + # Remove padding + padding_length = ( + len(result_sample['text_enc']) + - np.argmax(result_sample['text_enc'] == pad_id) + ) + padding_length_dec = ( + len(result_sample['text_dec']) + - np.argmax(result_sample['text_dec'] == pad_id) + ) + result_sample['text_enc'] = result_sample['text_enc'][:padding_length] + for key in ['text_dec', 'labels', 'loss_mask']: + result_sample[key] = result_sample[key][:padding_length_dec] + result_sample['enc_mask'] = \ + result_sample['enc_mask'][:padding_length, :padding_length] + result_sample['enc_dec_mask'] = \ + result_sample['enc_dec_mask'][:padding_length_dec, :padding_length] + result_sample['dec_mask'] = \ + result_sample['dec_mask'][:padding_length_dec, :padding_length_dec] + + +def get_lens(key, prev_len, prev_len_dec, len_enc, len_dec): + assert key != 'enc_dec_mask' + if key in ['text_enc', 'enc_mask']: + offset = prev_len + length = len_enc + else: + offset = prev_len_dec + length = len_dec + return offset, length + + +def update_samples_dict( + samples_dict, + result_sample, + max_seq_len, + max_seq_len_dec, + prev_len, + prev_len_dec, + pad_id, + eos_id, +): + _remove_padding(result_sample, pad_id) + + len_enc = len(result_sample['text_enc']) + len_dec = len(result_sample['text_dec']) + + if ( + ( + prev_len + + len_enc + + int(result_sample['text_enc'][-1] != eos_id) + ) > max_seq_len + or ( + prev_len_dec + + len_dec + + int(result_sample['text_dec'][-1] != eos_id) + ) > max_seq_len_dec + ): + return None + + eos_added = { + 'text_enc': False, + 'text_dec': False, + 'labels': False, + } + for (key, is_enc) in zip( + ['text_enc', 'text_dec', 'labels'], + [True, False, False], + ): + curr_sample = result_sample[key] + offset, length = get_lens( + key, prev_len, prev_len_dec, len_enc, len_dec) + samples_dict[key][offset:offset + length] = curr_sample + + # Add EOS token if not present. 
+ if ( + curr_sample[-1] != eos_id + or key == 'labels' and eos_added['text_dec'] + ): + samples_dict[key][offset + length] = eos_id + eos_added[key] = True + + need_extras = { + 'loss_mask': False, + 'enc_mask': False, + 'dec_mask': False, + 'enc_dec_mask': [False, False], + } + if eos_added['text_enc']: + need_extras['enc_mask'] = True + need_extras['enc_dec_mask'][1] = True + if eos_added['text_dec']: + need_extras['loss_mask'] = True + need_extras['dec_mask'] = True + need_extras['enc_dec_mask'][0] = True + + samples_dict['loss_mask'][ + prev_len_dec:prev_len_dec + len_dec, + ] += result_sample['loss_mask'] + samples_dict['enc_mask'][ + prev_len:prev_len + len_enc, + prev_len:prev_len + len_enc, + ] += result_sample['enc_mask'] + samples_dict['dec_mask'][ + prev_len_dec:prev_len_dec + len_dec, + prev_len_dec:prev_len_dec + len_dec, + ] += result_sample['dec_mask'] + samples_dict['enc_dec_mask'][ + prev_len_dec:prev_len_dec + len_dec, + prev_len:prev_len + len_enc, + ] += result_sample['enc_dec_mask'] + + if need_extras['loss_mask']: + samples_dict['loss_mask'][prev_len_dec + len_dec] = 1 + + for key in ['enc_mask', 'dec_mask']: + if need_extras[key]: + all_samples = samples_dict[key] + offset, length = get_lens( + key, prev_len, prev_len_dec, len_enc, len_dec) + all_samples[ + offset + length, + offset:offset + length, + ] = 1 + all_samples[ + offset:offset + length, + offset + length, + ] = 1 + + if need_extras['enc_dec_mask'][0] or need_extras['enc_dec_mask'][1]: + all_samples = samples_dict['enc_dec_mask'] + if need_extras['enc_dec_mask'][0]: + all_samples[ + prev_len_dec + len_dec, + prev_len:prev_len + len_enc, + ] = 1 + elif need_extras['enc_dec_mask'][1]: + all_samples[ + prev_len_dec:prev_len_dec + len_dec, + prev_len + len_enc, + ] = 1 + samples_dict['truncated'] += result_sample['truncated'] + + if eos_added['text_enc']: + len_enc += 1 + if eos_added['text_dec']: + len_dec += 1 + + return len_enc, len_dec + + +def add_final_padding(samples_dict, prev_len, prev_len_dec, pad_id): + samples_dict['text_enc'][prev_len:] = pad_id + samples_dict['text_dec'][prev_len_dec:] = pad_id + samples_dict['labels'][prev_len_dec:] = -1 From e2a0c36d5f1d715ddd960792729ddc3112876f4e Mon Sep 17 00:00:00 2001 From: janEbert Date: Wed, 22 Feb 2023 16:35:05 +0100 Subject: [PATCH 061/122] Fix UL2 sample packing --- megatron/data/ul2_dataset.py | 223 +++++++++++++++++++++++++++++++---- 1 file changed, 197 insertions(+), 26 deletions(-) diff --git a/megatron/data/ul2_dataset.py b/megatron/data/ul2_dataset.py index ada6b5d95..e41ef8d8b 100644 --- a/megatron/data/ul2_dataset.py +++ b/megatron/data/ul2_dataset.py @@ -27,12 +27,14 @@ get_samples_mapping, SamplingStyle ) -from megatron.data.gpt_dataset import build_index_mappings, get_sample +from megatron.data.gpt_dataset import build_index_mappings, get_samples from megatron.data.t5_dataset import ( + add_final_padding, LengthExceededError, make_history_mask, merge_subsequent_masks, pad_and_convert_to_numpy, + update_samples_dict, ) from megatron.enums import UL2ModelType @@ -98,8 +100,13 @@ def __init__(self, name, indexed_dataset, data_prefix, # Build the samples mapping. 
if self.pack_samples: - self.doc_idx, self.sample_idx, self.shuffle_idx = build_index_mappings( - self.name, data_prefix, self.indexed_dataset.get_doc_idx()[:-1], + ( + self.doc_idx, + self.sample_idx, + self.shuffle_idx, + ) = build_index_mappings( + self.name, data_prefix, + self.indexed_dataset.get_doc_idx()[:-1], self.indexed_dataset.sizes, max_num_samples, self.max_seq_length - min_added_tokens, self.seed) else: @@ -108,7 +115,8 @@ def __init__(self, name, indexed_dataset, data_prefix, data_prefix, num_epochs, max_num_samples, - self.max_seq_length - min_added_tokens, # account for added tokens + # account for added tokens + self.max_seq_length - min_added_tokens, short_seq_prob, self.seed, self.name, @@ -167,39 +175,129 @@ def __len__(self): else: return self.samples_mapping.shape[0] + def _create_samples_dict(self): + if is_decoder_only(self.model_type): + samples_dict = { + 'text': np.empty((self.max_seq_length,), dtype=np.int64), + 'labels': np.empty((self.max_seq_length,), dtype=np.int64), + 'loss_mask': np.zeros((self.max_seq_length,), dtype=np.int64), + 'truncated': 0, + 'dec_mask': np.zeros( + (self.max_seq_length, self.max_seq_length), + dtype=np.int64, + ), + } + else: + samples_dict = { + 'text_enc': np.empty((self.max_seq_length,), dtype=np.int64), + 'text_dec': np.empty( + (self.max_seq_length_dec,), dtype=np.int64), + 'labels': np.empty( + (self.max_seq_length_dec,), dtype=np.int64), + 'loss_mask': np.zeros( + (self.max_seq_length_dec,), dtype=np.int64), + 'truncated': 0, + 'enc_mask': np.zeros( + (self.max_seq_length, self.max_seq_length), + dtype=np.int64, + ), + 'dec_mask': np.zeros( + (self.max_seq_length_dec, self.max_seq_length_dec), + dtype=np.int64, + ), + 'enc_dec_mask': np.zeros( + (self.max_seq_length_dec, self.max_seq_length), + dtype=np.int64, + ), + } + return samples_dict + def __getitem__(self, idx): + # Note that this rng state should be numpy and not python since + # python randint is inclusive whereas the numpy one is exclusive. + np_rng = np.random.RandomState(seed=(self.seed + idx)) + # Denoiser selection + denoiser_index = np_rng.choice( + np.arange(len(self.denoisers)), + p=self.denoiser_ratios, + ) + if self.pack_samples: - sample = get_sample(self.indexed_dataset, self.doc_idx, - self.sample_idx, self.shuffle_idx, idx) - seq_length = len(sample) - sample = [sample] + samples = get_samples(self.indexed_dataset, self.doc_idx, + self.sample_idx, self.shuffle_idx, idx) + samples_dict = self._create_samples_dict() + prev_len = 0 + prev_len_dec = 0 + for sample in samples: + seq_length = len(sample) + result_sample = build_training_sample( + [sample], seq_length, + self.max_seq_length, # needed for padding + self.max_seq_length_dec, self.vocab_id_list, + self.vocab_id_to_token_dict, self.cls_ids, self.sep_id, + self.mask_id, self.pad_id, self.model_type, denoiser_index, + self.denoisers, self.mean_span_lengths, + self.mask_ratios, self.like_ul2r, np_rng, + self.bos_id, self.eos_id, self.sentinel_tokens) + if is_decoder_only(self.model_type): + maybe_lens = update_samples_dict_decoder_only( + samples_dict, + result_sample, + self.max_seq_length, + prev_len, + self.pad_id, + self.eos_id, + ) + else: + maybe_lens = update_samples_dict( + samples_dict, + result_sample, + self.max_seq_length, + self.max_seq_length_dec, + prev_len, + prev_len_dec, + self.pad_id, + self.eos_id, + ) + if maybe_lens is None: + # We are exceeding our sequence length already. 
+ break + + if is_decoder_only(self.model_type): + len_enc = maybe_lens + else: + len_enc, len_dec = maybe_lens + prev_len_dec += len_dec + prev_len += len_enc + + if is_decoder_only(self.model_type): + samples_dict['text'][prev_len:] = self.pad_id + samples_dict['labels'][prev_len:] = -1 + else: + add_final_padding( + samples_dict, prev_len, prev_len_dec, self.pad_id) else: start_index, end_index, seq_length = self.samples_mapping[idx] sample = [] for index in range(start_index, end_index): sample.append(self.indexed_dataset[index]) - # Note that this rng state should be numpy and not python since - # python randint is inclusive whereas the numpy one is exclusive. - np_rng = np.random.RandomState(seed=(self.seed + idx)) - return build_training_sample(sample, seq_length, - self.max_seq_length, # needed for padding - self.max_seq_length_dec, - self.vocab_id_list, - self.vocab_id_to_token_dict, - self.cls_ids, self.sep_id, - self.mask_id, self.pad_id, - self.model_type, self.denoiser_ratios, - self.denoisers, self.mean_span_lengths, - self.mask_ratios, self.like_ul2r, np_rng, - self.bos_id, self.eos_id, - self.sentinel_tokens) + samples_dict = build_training_sample( + sample, seq_length, + self.max_seq_length, # needed for padding + self.max_seq_length_dec, self.vocab_id_list, + self.vocab_id_to_token_dict, self.cls_ids, self.sep_id, + self.mask_id, self.pad_id, self.model_type, denoiser_index, + self.denoisers, self.mean_span_lengths, + self.mask_ratios, self.like_ul2r, np_rng, + self.bos_id, self.eos_id, self.sentinel_tokens) + return samples_dict def build_training_sample(sample, target_seq_length, max_seq_length, max_seq_length_dec, vocab_id_list, vocab_id_to_token_dict, cls_ids, sep_id, mask_id, pad_id, - model_type, denoiser_ratios, + model_type, denoiser_index, denoisers, mean_span_lengths, mask_ratios, like_ul2r, np_rng, bos_id=None, eos_id=None, @@ -220,7 +318,7 @@ def build_training_sample(sample, target_seq_length, mask_id: Mask token id. pad_id: Padding token id. model_type: What type of model is used. - denoiser_ratios: Probability of each denoising objective to be selected. + denoiser_index: Index of selected denoising objective. denoisers: What type of UL2 denoising objective the other UL2 configurations refer to. mean_span_lengths: Mean length for sampling span lengths. 
Numbers < 1 @@ -237,7 +335,6 @@ def build_training_sample(sample, target_seq_length, """ # Denoiser selection - denoiser_index = np_rng.choice(np.arange(len(denoisers)), p=denoiser_ratios) denoiser = denoisers[denoiser_index] masked_lm_prob = mask_ratios[denoiser_index] @@ -381,3 +478,77 @@ def build_training_sample(sample, target_seq_length, 'enc_dec_mask': enc_dec_mask, } return train_sample + + +def _remove_padding(result_sample, pad_id): + # Remove padding + padding_length = ( + len(result_sample['text']) + - np.argmax(result_sample['text'] == pad_id) + ) + result_sample['text'] = result_sample['text'][:padding_length] + for key in ['labels', 'loss_mask']: + result_sample[key] = result_sample[key][:padding_length] + result_sample['dec_mask'] = \ + result_sample['dec_mask'][:padding_length, :padding_length] + + +def update_samples_dict_decoder_only( + samples_dict, + result_sample, + max_seq_len, + prev_len, + pad_id, + eos_id, +): + _remove_padding(result_sample, pad_id) + len_enc = len(result_sample['text']) + + if ( + ( + prev_len + + len_enc + + int(result_sample['text'][-1] != eos_id) + ) > max_seq_len + ): + return None + + eos_added = False + for key in ['text', 'labels']: + curr_sample = result_sample[key] + samples_dict[key][prev_len:prev_len + len_enc] = curr_sample + + # Add EOS token if not present. + if ( + curr_sample[-1] != eos_id + or key == 'labels' and eos_added + ): + samples_dict[key][prev_len + len_enc] = eos_id + eos_added = True + + samples_dict['loss_mask'][ + prev_len:prev_len + len_enc, + ] += result_sample['loss_mask'] + samples_dict['dec_mask'][ + prev_len:prev_len + len_enc, + prev_len:prev_len + len_enc, + ] += result_sample['dec_mask'] + + if eos_added: + samples_dict['loss_mask'][prev_len + len_enc] = 1 + + all_samples = samples_dict['dec_mask'] + all_samples[ + prev_len + len_enc, + prev_len:prev_len + len_enc, + ] = 1 + all_samples[ + prev_len:prev_len + len_enc, + prev_len + len_enc, + ] = 1 + + len_enc += 1 + + samples_dict['truncated'] += result_sample['truncated'] + + return len_enc From c2884c8c993ddb1eed45630a74f6c03adb181e51 Mon Sep 17 00:00:00 2001 From: janEbert Date: Wed, 22 Feb 2023 16:48:06 +0100 Subject: [PATCH 062/122] Refactor samples dict creation --- megatron/data/t5_dataset.py | 48 ++++++++++++++++++++----------------- 1 file changed, 26 insertions(+), 22 deletions(-) diff --git a/megatron/data/t5_dataset.py b/megatron/data/t5_dataset.py index a8ac07ed8..2053cbf38 100644 --- a/megatron/data/t5_dataset.py +++ b/megatron/data/t5_dataset.py @@ -110,6 +110,31 @@ def __len__(self): else: return self.samples_mapping.shape[0] + def _create_samples_dict(self): + samples_dict = { + 'text_enc': np.empty((self.max_seq_length,), dtype=np.int64), + 'text_dec': np.empty( + (self.max_seq_length_dec,), dtype=np.int64), + 'labels': np.empty( + (self.max_seq_length_dec,), dtype=np.int64), + 'loss_mask': np.zeros( + (self.max_seq_length_dec,), dtype=np.int64), + 'truncated': 0, + 'enc_mask': np.zeros( + (self.max_seq_length, self.max_seq_length), + dtype=np.int64, + ), + 'dec_mask': np.zeros( + (self.max_seq_length_dec, self.max_seq_length_dec), + dtype=np.int64, + ), + 'enc_dec_mask': np.zeros( + (self.max_seq_length_dec, self.max_seq_length), + dtype=np.int64, + ), + } + return samples_dict + def __getitem__(self, idx): # Note that this rng state should be numpy and not python since # python randint is inclusive whereas the numpy one is exclusive. 
@@ -117,28 +142,7 @@ def __getitem__(self, idx): if self.pack_samples: samples = get_samples(self.indexed_dataset, self.doc_idx, self.sample_idx, self.shuffle_idx, idx) - samples_dict = { - 'text_enc': np.empty((self.max_seq_length,), dtype=np.int64), - 'text_dec': np.empty( - (self.max_seq_length_dec,), dtype=np.int64), - 'labels': np.empty( - (self.max_seq_length_dec,), dtype=np.int64), - 'loss_mask': np.zeros( - (self.max_seq_length_dec,), dtype=np.int64), - 'truncated': 0, - 'enc_mask': np.zeros( - (self.max_seq_length, self.max_seq_length), - dtype=np.int64, - ), - 'dec_mask': np.zeros( - (self.max_seq_length_dec, self.max_seq_length_dec), - dtype=np.int64, - ), - 'enc_dec_mask': np.zeros( - (self.max_seq_length_dec, self.max_seq_length), - dtype=np.int64, - ), - } + samples_dict = self._create_samples_dict() prev_len = 0 prev_len_dec = 0 for sample in samples: From 7eb792362d2778e2e97cf8272502fb59e6372029 Mon Sep 17 00:00:00 2001 From: janEbert Date: Thu, 23 Feb 2023 08:48:03 +0100 Subject: [PATCH 063/122] Fix desired seq length Now we won't exceed the desired seq length. --- megatron/data/t5_dataset.py | 4 +++- megatron/data/ul2_dataset.py | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/megatron/data/t5_dataset.py b/megatron/data/t5_dataset.py index 2053cbf38..4fa4e7b7c 100644 --- a/megatron/data/t5_dataset.py +++ b/megatron/data/t5_dataset.py @@ -146,7 +146,9 @@ def __getitem__(self, idx): prev_len = 0 prev_len_dec = 0 for sample in samples: - seq_length = len(sample) + remaining_seq_len = self.max_seq_length - prev_len + seq_length = min(remaining_seq_len, len(sample)) + result_sample = build_training_sample( [sample], seq_length, self.max_seq_length, # needed for padding diff --git a/megatron/data/ul2_dataset.py b/megatron/data/ul2_dataset.py index e41ef8d8b..b9da28832 100644 --- a/megatron/data/ul2_dataset.py +++ b/megatron/data/ul2_dataset.py @@ -229,7 +229,9 @@ def __getitem__(self, idx): prev_len = 0 prev_len_dec = 0 for sample in samples: - seq_length = len(sample) + remaining_seq_len = self.max_seq_length - prev_len + seq_length = min(remaining_seq_len, len(sample)) + result_sample = build_training_sample( [sample], seq_length, self.max_seq_length, # needed for padding From dd4c0d0d957440eee8527f9e248d7e3da4d14b64 Mon Sep 17 00:00:00 2001 From: janEbert Date: Thu, 23 Feb 2023 08:53:51 +0100 Subject: [PATCH 064/122] Fix padding removal --- megatron/data/t5_dataset.py | 21 ++++++++------------- megatron/data/ul2_dataset.py | 11 ++++------- 2 files changed, 12 insertions(+), 20 deletions(-) diff --git a/megatron/data/t5_dataset.py b/megatron/data/t5_dataset.py index 4fa4e7b7c..c077137f5 100644 --- a/megatron/data/t5_dataset.py +++ b/megatron/data/t5_dataset.py @@ -402,23 +402,18 @@ def make_history_mask_3d(block): def _remove_padding(result_sample, pad_id): # Remove padding - padding_length = ( - len(result_sample['text_enc']) - - np.argmax(result_sample['text_enc'] == pad_id) - ) - padding_length_dec = ( - len(result_sample['text_dec']) - - np.argmax(result_sample['text_dec'] == pad_id) - ) - result_sample['text_enc'] = result_sample['text_enc'][:padding_length] + padding_start = np.argmax(result_sample['text_enc'] == pad_id) + padding_start_dec = np.argmax(result_sample['text_dec'] == pad_id) + + result_sample['text_enc'] = result_sample['text_enc'][:padding_start] for key in ['text_dec', 'labels', 'loss_mask']: - result_sample[key] = result_sample[key][:padding_length_dec] + result_sample[key] = result_sample[key][:padding_start_dec] 
result_sample['enc_mask'] = \ - result_sample['enc_mask'][:padding_length, :padding_length] + result_sample['enc_mask'][:padding_start, :padding_start] result_sample['enc_dec_mask'] = \ - result_sample['enc_dec_mask'][:padding_length_dec, :padding_length] + result_sample['enc_dec_mask'][:padding_start_dec, :padding_start] result_sample['dec_mask'] = \ - result_sample['dec_mask'][:padding_length_dec, :padding_length_dec] + result_sample['dec_mask'][:padding_start_dec, :padding_start_dec] def get_lens(key, prev_len, prev_len_dec, len_enc, len_dec): diff --git a/megatron/data/ul2_dataset.py b/megatron/data/ul2_dataset.py index b9da28832..5723f74ec 100644 --- a/megatron/data/ul2_dataset.py +++ b/megatron/data/ul2_dataset.py @@ -484,15 +484,12 @@ def build_training_sample(sample, target_seq_length, def _remove_padding(result_sample, pad_id): # Remove padding - padding_length = ( - len(result_sample['text']) - - np.argmax(result_sample['text'] == pad_id) - ) - result_sample['text'] = result_sample['text'][:padding_length] + padding_start = np.argmax(result_sample['text'] == pad_id) + result_sample['text'] = result_sample['text'][:padding_start] for key in ['labels', 'loss_mask']: - result_sample[key] = result_sample[key][:padding_length] + result_sample[key] = result_sample[key][:padding_start] result_sample['dec_mask'] = \ - result_sample['dec_mask'][:padding_length, :padding_length] + result_sample['dec_mask'][:padding_start, :padding_start] def update_samples_dict_decoder_only( From 58148f8aadbee177468d1e689f5a41408c2669a7 Mon Sep 17 00:00:00 2001 From: janEbert Date: Thu, 23 Feb 2023 09:05:55 +0100 Subject: [PATCH 065/122] Allow repeating UL2 prompt token when packing --- megatron/arguments.py | 4 ++++ megatron/data/dataset_utils.py | 1 + megatron/data/ul2_dataset.py | 10 +++++++--- 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/megatron/arguments.py b/megatron/arguments.py index 28e246ece..996f0dd4e 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -1084,6 +1084,10 @@ def _add_ul2_args(parser): help='Whether to use the updated implementation as ' 'described in the UL2R paper. 
This only changes the ' 'implementation, not the objective configurations!') + group.add_argument('--ul2-pack-repeat-prompt', action='store_true', + help='When `--pack-samples` is also given and ' + '`--ul2-pack-any` is *not* given, whether to ' + 'repeat the prompt token for each packed sample.') return parser diff --git a/megatron/data/dataset_utils.py b/megatron/data/dataset_utils.py index 68400cdf9..cfcc73ed1 100644 --- a/megatron/data/dataset_utils.py +++ b/megatron/data/dataset_utils.py @@ -669,6 +669,7 @@ def build_dataset(index, name): 'X': args.ul2_x_denoiser_token, }, like_ul2r=args.ul2_like_ul2r, + pack_repeat_prompt=args.ul2_pack_repeat_prompt, max_seq_length_dec=max_seq_length_dec, short_seq_prob=short_seq_prob, **kwargs, diff --git a/megatron/data/ul2_dataset.py b/megatron/data/ul2_dataset.py index 5723f74ec..68efb5de6 100644 --- a/megatron/data/ul2_dataset.py +++ b/megatron/data/ul2_dataset.py @@ -56,8 +56,8 @@ def __init__(self, name, indexed_dataset, data_prefix, num_epochs, max_num_samples, model_type, denoiser_ratios, denoisers, mean_span_lengths, mask_ratios, add_mask_tokens, pack_samples, - denoiser_tokens, like_ul2r, max_seq_length, - max_seq_length_dec, short_seq_prob, seed): + denoiser_tokens, like_ul2r, pack_repeat_prompt, + max_seq_length, max_seq_length_dec, short_seq_prob, seed): super().__init__() if denoiser_ratios is None: @@ -91,6 +91,7 @@ def __init__(self, name, indexed_dataset, data_prefix, # Dataset. self.indexed_dataset = indexed_dataset self.pack_samples = pack_samples + self.repeat_prompt = pack_repeat_prompt # Minimum number of tokens added: BOS and EOS. min_added_tokens = 2 @@ -228,6 +229,7 @@ def __getitem__(self, idx): samples_dict = self._create_samples_dict() prev_len = 0 prev_len_dec = 0 + cls_ids = self.cls_ids for sample in samples: remaining_seq_len = self.max_seq_length - prev_len seq_length = min(remaining_seq_len, len(sample)) @@ -236,7 +238,7 @@ def __getitem__(self, idx): [sample], seq_length, self.max_seq_length, # needed for padding self.max_seq_length_dec, self.vocab_id_list, - self.vocab_id_to_token_dict, self.cls_ids, self.sep_id, + self.vocab_id_to_token_dict, cls_ids, self.sep_id, self.mask_id, self.pad_id, self.model_type, denoiser_index, self.denoisers, self.mean_span_lengths, self.mask_ratios, self.like_ul2r, np_rng, @@ -272,6 +274,8 @@ def __getitem__(self, idx): prev_len_dec += len_dec prev_len += len_enc + if not self.repeat_prompt: + cls_ids = {self.denoisers[denoiser_index]: None} if is_decoder_only(self.model_type): samples_dict['text'][prev_len:] = self.pad_id samples_dict['labels'][prev_len:] = -1 From c41fecd07dfd3ff2dc1dd02d373cc64dbd48a4da Mon Sep 17 00:00:00 2001 From: janEbert Date: Thu, 23 Feb 2023 09:06:46 +0100 Subject: [PATCH 066/122] Allow packing different denoisers together --- megatron/arguments.py | 5 +++++ megatron/data/dataset_utils.py | 1 + megatron/data/ul2_dataset.py | 11 +++++++++-- 3 files changed, 15 insertions(+), 2 deletions(-) diff --git a/megatron/arguments.py b/megatron/arguments.py index 996f0dd4e..a60684596 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -1084,6 +1084,11 @@ def _add_ul2_args(parser): help='Whether to use the updated implementation as ' 'described in the UL2R paper. This only changes the ' 'implementation, not the objective configurations!') + group.add_argument('--ul2-pack-any', action='store_true', + help='When `--pack-samples` is also given, whether to ' + 'pack different denoisers into one sample. 
If not ' + 'given, the same denoiser is used for all packed ' + 'samples.') group.add_argument('--ul2-pack-repeat-prompt', action='store_true', help='When `--pack-samples` is also given and ' '`--ul2-pack-any` is *not* given, whether to ' diff --git a/megatron/data/dataset_utils.py b/megatron/data/dataset_utils.py index cfcc73ed1..5b9c7eba1 100644 --- a/megatron/data/dataset_utils.py +++ b/megatron/data/dataset_utils.py @@ -669,6 +669,7 @@ def build_dataset(index, name): 'X': args.ul2_x_denoiser_token, }, like_ul2r=args.ul2_like_ul2r, + pack_any=args.ul2_pack_any, pack_repeat_prompt=args.ul2_pack_repeat_prompt, max_seq_length_dec=max_seq_length_dec, short_seq_prob=short_seq_prob, diff --git a/megatron/data/ul2_dataset.py b/megatron/data/ul2_dataset.py index 68efb5de6..3bd2521a7 100644 --- a/megatron/data/ul2_dataset.py +++ b/megatron/data/ul2_dataset.py @@ -56,7 +56,7 @@ def __init__(self, name, indexed_dataset, data_prefix, num_epochs, max_num_samples, model_type, denoiser_ratios, denoisers, mean_span_lengths, mask_ratios, add_mask_tokens, pack_samples, - denoiser_tokens, like_ul2r, pack_repeat_prompt, + denoiser_tokens, like_ul2r, pack_any, pack_repeat_prompt, max_seq_length, max_seq_length_dec, short_seq_prob, seed): super().__init__() @@ -91,6 +91,7 @@ def __init__(self, name, indexed_dataset, data_prefix, # Dataset. self.indexed_dataset = indexed_dataset self.pack_samples = pack_samples + self.pack_any = pack_any self.repeat_prompt = pack_repeat_prompt # Minimum number of tokens added: BOS and EOS. @@ -274,8 +275,14 @@ def __getitem__(self, idx): prev_len_dec += len_dec prev_len += len_enc - if not self.repeat_prompt: + if not self.repeat_prompt and not self.pack_any: cls_ids = {self.denoisers[denoiser_index]: None} + + if self.pack_any: + denoiser_index = np_rng.choice( + np.arange(len(self.denoisers)), + p=self.denoiser_ratios, + ) if is_decoder_only(self.model_type): samples_dict['text'][prev_len:] = self.pad_id samples_dict['labels'][prev_len:] = -1 From 057bb47640bd22ff7a711d87319ee5f800888dd5 Mon Sep 17 00:00:00 2001 From: janEbert Date: Thu, 23 Feb 2023 09:11:24 +0100 Subject: [PATCH 067/122] Refactor sample packing functions Just pull them out of the other ones (and add separating whitespace/join lines). 
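
For readers skimming the series, the loop both extracted `_pack_samples` methods share reduces to the control flow sketched below. `pack` is a hypothetical simplification, not one of the real helpers: those additionally build the denoised sub-samples and block-diagonal masks, and the `+ 1` stands in for the EOS token they may append.

    def pack(documents, max_len):
        # Greedily fill one row: truncate each document to the remaining
        # budget and stop once the next piece (plus EOS) would overflow.
        packed, used = [], 0
        for doc in documents:
            piece = doc[:max_len - used]
            if used + len(piece) + 1 > max_len:
                break
            packed.append(piece)
            used += len(piece) + 1
        return packed, used

    print(pack([[1, 2, 3], [4, 5], [6, 7, 8, 9]], max_len=8))
    # ([[1, 2, 3], [4, 5]], 7)
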
--- megatron/data/t5_dataset.py | 76 ++++++++++---------- megatron/data/ul2_dataset.py | 135 ++++++++++++++++++----------------- 2 files changed, 111 insertions(+), 100 deletions(-) diff --git a/megatron/data/t5_dataset.py b/megatron/data/t5_dataset.py index c077137f5..93b1b49d9 100644 --- a/megatron/data/t5_dataset.py +++ b/megatron/data/t5_dataset.py @@ -135,47 +135,51 @@ def _create_samples_dict(self): } return samples_dict + def _pack_samples(self, np_rng, idx): + samples = get_samples(self.indexed_dataset, self.doc_idx, + self.sample_idx, self.shuffle_idx, idx) + samples_dict = self._create_samples_dict() + prev_len = 0 + prev_len_dec = 0 + + for sample in samples: + remaining_seq_len = self.max_seq_length - prev_len + seq_length = min(remaining_seq_len, len(sample)) + + result_sample = build_training_sample( + [sample], seq_length, + self.max_seq_length, # needed for padding + self.max_seq_length_dec, self.vocab_id_list, + self.vocab_id_to_token_dict, self.cls_id, self.sep_id, + self.mask_id, self.pad_id, self.masked_lm_prob, np_rng, + self.bos_id, self.eos_id, self.sentinel_tokens) + maybe_lens = update_samples_dict( + samples_dict, + result_sample, + self.max_seq_length, + self.max_seq_length_dec, + prev_len, + prev_len_dec, + self.pad_id, + self.eos_id, + ) + if maybe_lens is None: + # We are exceeding our sequence length already. + break + + len_enc, len_dec = maybe_lens + prev_len += len_enc + prev_len_dec += len_dec + + add_final_padding(samples_dict, prev_len, prev_len_dec, self.pad_id) + return samples_dict + def __getitem__(self, idx): # Note that this rng state should be numpy and not python since # python randint is inclusive whereas the numpy one is exclusive. np_rng = np.random.RandomState(seed=(self.seed + idx)) if self.pack_samples: - samples = get_samples(self.indexed_dataset, self.doc_idx, - self.sample_idx, self.shuffle_idx, idx) - samples_dict = self._create_samples_dict() - prev_len = 0 - prev_len_dec = 0 - for sample in samples: - remaining_seq_len = self.max_seq_length - prev_len - seq_length = min(remaining_seq_len, len(sample)) - - result_sample = build_training_sample( - [sample], seq_length, - self.max_seq_length, # needed for padding - self.max_seq_length_dec, self.vocab_id_list, - self.vocab_id_to_token_dict, self.cls_id, self.sep_id, - self.mask_id, self.pad_id, self.masked_lm_prob, np_rng, - self.bos_id, self.eos_id, self.sentinel_tokens) - maybe_lens = update_samples_dict( - samples_dict, - result_sample, - self.max_seq_length, - self.max_seq_length_dec, - prev_len, - prev_len_dec, - self.pad_id, - self.eos_id, - ) - if maybe_lens is None: - # We are exceeding our sequence length already. 
- break - - len_enc, len_dec = maybe_lens - prev_len += len_enc - prev_len_dec += len_dec - - add_final_padding( - samples_dict, prev_len, prev_len_dec, self.pad_id) + samples_dict = self._pack_samples(np_rng, idx) else: start_index, end_index, seq_length = self.samples_mapping[idx] sample = [] diff --git a/megatron/data/ul2_dataset.py b/megatron/data/ul2_dataset.py index 3bd2521a7..105a482cb 100644 --- a/megatron/data/ul2_dataset.py +++ b/megatron/data/ul2_dataset.py @@ -214,6 +214,76 @@ def _create_samples_dict(self): } return samples_dict + def _pack_samples(self, np_rng, idx, denoiser_index): + samples = get_samples(self.indexed_dataset, self.doc_idx, + self.sample_idx, self.shuffle_idx, idx) + samples_dict = self._create_samples_dict() + prev_len = 0 + prev_len_dec = 0 + cls_ids = self.cls_ids + + for sample in samples: + remaining_seq_len = self.max_seq_length - prev_len + seq_length = min(remaining_seq_len, len(sample)) + + result_sample = build_training_sample( + [sample], seq_length, + self.max_seq_length, # needed for padding + self.max_seq_length_dec, self.vocab_id_list, + self.vocab_id_to_token_dict, cls_ids, self.sep_id, + self.mask_id, self.pad_id, self.model_type, denoiser_index, + self.denoisers, self.mean_span_lengths, + self.mask_ratios, self.like_ul2r, np_rng, + self.bos_id, self.eos_id, self.sentinel_tokens) + if is_decoder_only(self.model_type): + maybe_lens = update_samples_dict_decoder_only( + samples_dict, + result_sample, + self.max_seq_length, + prev_len, + self.pad_id, + self.eos_id, + ) + else: + maybe_lens = update_samples_dict( + samples_dict, + result_sample, + self.max_seq_length, + self.max_seq_length_dec, + prev_len, + prev_len_dec, + self.pad_id, + self.eos_id, + ) + if maybe_lens is None: + # We are exceeding our sequence length already. + break + + if is_decoder_only(self.model_type): + len_enc = maybe_lens + else: + len_enc, len_dec = maybe_lens + prev_len_dec += len_dec + prev_len += len_enc + + if not self.repeat_prompt and not self.pack_any: + cls_ids = {self.denoisers[denoiser_index]: None} + + if self.pack_any: + denoiser_index = np_rng.choice( + np.arange(len(self.denoisers)), + p=self.denoiser_ratios, + ) + + if is_decoder_only(self.model_type): + samples_dict['text'][prev_len:] = self.pad_id + samples_dict['labels'][prev_len:] = -1 + else: + add_final_padding( + samples_dict, prev_len, prev_len_dec, self.pad_id) + + return samples_dict + def __getitem__(self, idx): # Note that this rng state should be numpy and not python since # python randint is inclusive whereas the numpy one is exclusive. 
@@ -225,70 +295,7 @@ def __getitem__(self, idx): ) if self.pack_samples: - samples = get_samples(self.indexed_dataset, self.doc_idx, - self.sample_idx, self.shuffle_idx, idx) - samples_dict = self._create_samples_dict() - prev_len = 0 - prev_len_dec = 0 - cls_ids = self.cls_ids - for sample in samples: - remaining_seq_len = self.max_seq_length - prev_len - seq_length = min(remaining_seq_len, len(sample)) - - result_sample = build_training_sample( - [sample], seq_length, - self.max_seq_length, # needed for padding - self.max_seq_length_dec, self.vocab_id_list, - self.vocab_id_to_token_dict, cls_ids, self.sep_id, - self.mask_id, self.pad_id, self.model_type, denoiser_index, - self.denoisers, self.mean_span_lengths, - self.mask_ratios, self.like_ul2r, np_rng, - self.bos_id, self.eos_id, self.sentinel_tokens) - if is_decoder_only(self.model_type): - maybe_lens = update_samples_dict_decoder_only( - samples_dict, - result_sample, - self.max_seq_length, - prev_len, - self.pad_id, - self.eos_id, - ) - else: - maybe_lens = update_samples_dict( - samples_dict, - result_sample, - self.max_seq_length, - self.max_seq_length_dec, - prev_len, - prev_len_dec, - self.pad_id, - self.eos_id, - ) - if maybe_lens is None: - # We are exceeding our sequence length already. - break - - if is_decoder_only(self.model_type): - len_enc = maybe_lens - else: - len_enc, len_dec = maybe_lens - prev_len_dec += len_dec - prev_len += len_enc - - if not self.repeat_prompt and not self.pack_any: - cls_ids = {self.denoisers[denoiser_index]: None} - - if self.pack_any: - denoiser_index = np_rng.choice( - np.arange(len(self.denoisers)), - p=self.denoiser_ratios, - ) - if is_decoder_only(self.model_type): - samples_dict['text'][prev_len:] = self.pad_id - samples_dict['labels'][prev_len:] = -1 - else: - add_final_padding( - samples_dict, prev_len, prev_len_dec, self.pad_id) + samples_dict = self._pack_samples(np_rng, idx, denoiser_index) else: start_index, end_index, seq_length = self.samples_mapping[idx] sample = [] From e2062b79b76537d5647ebc2b9b32c90c0979afd1 Mon Sep 17 00:00:00 2001 From: janEbert Date: Thu, 23 Feb 2023 09:36:35 +0100 Subject: [PATCH 068/122] Repeat prompt by default when packing UL2 --- megatron/arguments.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/megatron/arguments.py b/megatron/arguments.py index a60684596..7cd3a7fbe 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -1089,10 +1089,11 @@ def _add_ul2_args(parser): 'pack different denoisers into one sample. 
If not ' 'given, the same denoiser is used for all packed ' 'samples.') - group.add_argument('--ul2-pack-repeat-prompt', action='store_true', + group.add_argument('--ul2-pack-no-repeat-prompt', action='store_false', help='When `--pack-samples` is also given and ' '`--ul2-pack-any` is *not* given, whether to ' - 'repeat the prompt token for each packed sample.') + 'repeat the prompt token for each packed sample.', + dest='ul2_pack_repeat_prompt') return parser From d31b89f788bff087e39e865873922672a8b8b93f Mon Sep 17 00:00:00 2001 From: janEbert Date: Thu, 23 Feb 2023 22:55:42 +0100 Subject: [PATCH 069/122] Support pipelining for decoder-only model --- pretrain_ul2.py | 107 +++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 92 insertions(+), 15 deletions(-) diff --git a/pretrain_ul2.py b/pretrain_ul2.py index 1dc425486..cb7aa61cd 100644 --- a/pretrain_ul2.py +++ b/pretrain_ul2.py @@ -18,6 +18,8 @@ import argparse from functools import partial +import deepspeed +from deepspeed.runtime.utils import see_memory_usage import torch from megatron import ( @@ -31,7 +33,8 @@ is_decoder_only as _is_decoder_only, is_prefix_lm as _is_prefix_lm, ) -from megatron.model.gpt_model import GPTModel +from megatron.enums import AttnMaskType +from megatron.model.gpt_model import GPTModel, GPTModelPipe from megatron.model.t5_model import T5Model, t5_position_ids from megatron.training import pretrain from megatron.utils import average_losses_across_data_parallel_group @@ -51,21 +54,52 @@ def is_prefix_lm(): def model_provider(pre_process=True, post_process=True): """Build the model.""" - assert pre_process and post_process, "UL2 doesn't yet support pipelining" + args = get_args() - print_rank_0('building UL2 model ...') - if is_decoder_only(): - print_rank_0('Using decoder-only UL2 model.') - model = GPTModel( - num_tokentypes=0, - parallel_output=True, - pre_process=pre_process, - post_process=post_process, - prefix_lm=is_prefix_lm(), - ) - else: - print_rank_0('Using encoder-decoder UL2 model.') - model = T5Model(num_tokentypes=0, parallel_output=True) + see_memory_usage("Before Building Model", force=True) + with deepspeed.zero.Init(data_parallel_group=mpu.get_data_parallel_group(), + remote_device=( + None + if args.remote_device == 'none' + else args.remote_device + ), + config_dict_or_path=args.deepspeed_config, + enabled=args.zero_stage == 3, + mpu=mpu): + + print_rank_0('building UL2 model ...') + if is_decoder_only(): + print_rank_0('Using decoder-only UL2 model.') + if args.deepspeed: + args.pretrain_causal_attention = not is_prefix_lm() + model = GPTModelPipe( + num_tokentypes=0, + parallel_output=True, + attn_mask_type=( + AttnMaskType.prefix + if is_prefix_lm() + else AttnMaskType.causal + ), + ) + # This is a hack to give us a reference to + # `get_batch_pipe` from within `training.py`. + # We need to call `model.set_batch_fn` after + # `deepspeed.initialize`. 
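                # For orientation only; the snippet below is illustrative,
                # the actual call site is in `training.py` and may differ:
                #
                #     engine, optimizer, _, lr_scheduler = deepspeed.initialize(...)
                #     if isinstance(engine, deepspeed.PipelineEngine):
                #         engine.set_batch_fn(engine.module._megatron_batch_fn)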
+ model._megatron_batch_fn = get_batch_pipe + else: + model = GPTModel( + num_tokentypes=0, + parallel_output=True, + pre_process=pre_process, + post_process=post_process, + prefix_lm=is_prefix_lm(), + ) + else: + assert pre_process and post_process and not args.deepspeed, \ + "Encoder-decoder model doesn't yet support pipelining" + print_rank_0('Using encoder-decoder UL2 model.') + model = T5Model(num_tokentypes=0, parallel_output=True) + see_memory_usage("After Building Model", force=True) return model @@ -109,6 +143,49 @@ def get_batch(data_iterator): enc_mask, dec_mask, enc_dec_mask +def get_batch_pipe(data): + """Modification of `get_batch` to work on `next(data_iterator)` + instead of `data_iterator`. + """ + + if is_decoder_only(): + keys = ['text', 'labels', 'loss_mask', 'dec_mask'] + else: + keys = ['text_enc', 'text_dec', 'labels', 'loss_mask', + 'enc_mask', 'dec_mask', 'enc_dec_mask'] + datatype = torch.int64 + + # Broadcast data. + data_b = mpu.broadcast_data(keys, data, datatype) + + # Unpack. + if is_decoder_only(): + tokens = data_b['text'].long() + labels = data_b['labels'].long() + loss_mask = data_b['loss_mask'].float() + + dec_mask = (data_b['dec_mask'] < 0.5) + dec_mask = dec_mask.unsqueeze(1) + + position_ids = t5_position_ids(tokens) + return (tokens, position_ids, dec_mask), (labels, loss_mask) + else: + tokens_enc = data_b['text_enc'].long() + tokens_dec = data_b['text_dec'].long() + labels = data_b['labels'].long() + loss_mask = data_b['loss_mask'].float() + + enc_mask = (data_b['enc_mask'] < 0.5) + dec_mask = (data_b['dec_mask'] < 0.5) + enc_dec_mask = (data_b['enc_dec_mask'] < 0.5) + + # This will probably be incorrect. Need to adapt this if + # pipelining for encoder-decoder models is ever implemented (and + # implemented similarly to the GPT model). + return (tokens_enc, tokens_dec, enc_mask, dec_mask, enc_dec_mask), \ + (labels, loss_mask) + + def loss_func(loss_mask, output_tensor): if is_decoder_only(): lm_loss_ = output_tensor From 17dca4fe89eef2578cd1ca6405502b0a3f0cd783 Mon Sep 17 00:00:00 2001 From: janEbert Date: Fri, 24 Feb 2023 11:58:05 +0100 Subject: [PATCH 070/122] Fix GPT tokenizer vocab size query Did not include additional special tokens. --- megatron/tokenizer/tokenizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py index bd4b66c8a..8b4de875a 100644 --- a/megatron/tokenizer/tokenizer.py +++ b/megatron/tokenizer/tokenizer.py @@ -376,7 +376,7 @@ def __init__( @property def vocab_size(self): - return len(self.tokenizer.encoder) + return len(self.tokenizer) @property def vocab(self): From bf9b1eb5e8df8e8b7c69388fdad1166074cd5f18 Mon Sep 17 00:00:00 2001 From: janEbert Date: Fri, 24 Feb 2023 11:59:46 +0100 Subject: [PATCH 071/122] Handle possibly empty list --- megatron/data/ul2_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/data/ul2_dataset.py b/megatron/data/ul2_dataset.py index 105a482cb..7ab9936c8 100644 --- a/megatron/data/ul2_dataset.py +++ b/megatron/data/ul2_dataset.py @@ -430,7 +430,7 @@ def build_training_sample(sample, target_seq_length, tokens, masked_spans, bos_id, eos_id, sentinel_tokens, prefix_lm) # Move EOS tokens to end of sequence. 
- while tokens_enc[-1] == eos_id: + while tokens_enc and tokens_enc[-1] == eos_id: del tokens_enc[-1] tokens_dec_in.append(eos_id) labels.append(eos_id) From c4aa4cdc092c096caa2fcb37d1aac1ce488fb1d9 Mon Sep 17 00:00:00 2001 From: janEbert Date: Mon, 27 Feb 2023 19:21:28 +0100 Subject: [PATCH 072/122] Fix no newline at EOF --- megatron/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/utils.py b/megatron/utils.py index 893f58dd2..18d94221f 100644 --- a/megatron/utils.py +++ b/megatron/utils.py @@ -536,4 +536,4 @@ def dump_weights(preamble, iteration, model, optimizer, tensor=None): # hostname = socket.gethostname() # pid = os.getpid() # global_rank = torch.distributed.get_rank() - #fn = f"debug-{iteration}-pp{pp_rank}-tp{tp_rank}-dp{dp_rank}-global{global_rank}-{preamble}-{pid}.txt" \ No newline at end of file + #fn = f"debug-{iteration}-pp{pp_rank}-tp{tp_rank}-dp{dp_rank}-global{global_rank}-{preamble}-{pid}.txt" From 8d7a0dfb9de710e69688f771c98c388da7ff42f6 Mon Sep 17 00:00:00 2001 From: janEbert Date: Mon, 27 Feb 2023 19:21:58 +0100 Subject: [PATCH 073/122] Allow full prefix Prefix-LM attention sampling Useful for evaluation. --- megatron/utils.py | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/megatron/utils.py b/megatron/utils.py index 18d94221f..e27b21f61 100644 --- a/megatron/utils.py +++ b/megatron/utils.py @@ -376,15 +376,18 @@ def get_prefix_indices(data, eod_token, partial_prefix_indices, reset_attention_ :param eod_token: int, token_id used to signal end of document :param partial_prefix_indices: this agument can have multiple types: - None, it signals that all prefix indices are randomly sampled. - - List[Optional[int]], its length has to be equal to mini batch size. It stores all the indices for per row prefix. - Optional means that if set to None, we allows ourselves to sample one randomly. - - List[List[Optional[int]]], it follows the following rules: + - False, it signals that prefix indices always go to the end of document. + - List[Union[int, bool, None]], its length has to be equal to mini batch size. It stores all the indices for per row prefix. + If set to None, we allows ourselves to sample one randomly. + If set to False, the current row will be attended to completely. + - List[List[Union[int, bool, None]]], it follows the following rules: - The first dimension refers to that sample, ie len(partial_prefix_indices) == len(data) - The second dimension refers to the number of document of that sample, ie len(partial_prefix_indices[b]) == (data[b] == eod_token).sum() (+1 for the last partial document). - partial_prefix_indices have to be interleaved with eod_indices, ie eod_indices[b][d-1] < partial_prefix_indices[b][d] < eod_indices[b][d] + 1 or is None. - - Optional means that if set to None, we allows ourselves to sample one randomly. + - If set to None, we allows ourselves to sample one randomly. + - If set to False, the current document will be attended to completely. :param reset_attention_mask: bool, determines if prefixes are to be per document or per row. 
:return Depending if prefix is per document or per row, the method returns: - List[List[int]]: prefix indices for each document in case of per document prefix @@ -393,7 +396,7 @@ def get_prefix_indices(data, eod_token, partial_prefix_indices, reset_attention_ micro_batch_size, seq_length = data.size() prefix_indices = [] - assert partial_prefix_indices is None or len(partial_prefix_indices) == micro_batch_size, f"partial_prefix_indices has to be None or its length equal to {micro_batch_size}, got {len(partial_prefix_indices)}" + assert partial_prefix_indices is None or partial_prefix_indices is False or len(partial_prefix_indices) == micro_batch_size, f"partial_prefix_indices has to be None or its length equal to {micro_batch_size}, got {len(partial_prefix_indices)}" for batch_id in range(micro_batch_size): # Prefix lm per document. if reset_attention_mask: @@ -411,14 +414,16 @@ def get_prefix_indices(data, eod_token, partial_prefix_indices, reset_attention_ ) prev_index = 0 - assert partial_prefix_indices is None or len(partial_prefix_indices[batch_id]) == len(eod_indices), f"The number of prefixes has to match the number of documents, complete or partial. Got {len(partial_prefix_indices[batch_id])} prefixes and {len(eod_indices)} documents" + assert partial_prefix_indices is None or partial_prefix_indices is False or len(partial_prefix_indices[batch_id]) == len(eod_indices), f"The number of prefixes has to match the number of documents, complete or partial. Got {len(partial_prefix_indices[batch_id])} prefixes and {len(eod_indices)} documents" for doc_id, eod_index in enumerate(eod_indices): - assert partial_prefix_indices is None or isinstance(partial_prefix_indices[batch_id], list), f"Per document prefix has to store a list on indices for each row, got {partial_prefix_indices[batch_id]}" + assert partial_prefix_indices is None or partial_prefix_indices is False or isinstance(partial_prefix_indices[batch_id], list), f"Per document prefix has to store a list on indices for each row, got {partial_prefix_indices[batch_id]}" # Prefix index is defined as the first index that isn't attended by all tokens in a document if partial_prefix_indices is None or partial_prefix_indices[batch_id][doc_id] is None: # We need to randomly generate a prefix index that satisfies the interleave condition in the docstring prefix_index = randint(prev_index + 1, eod_index) + elif partial_prefix_indices is False or partial_prefix_indices[batch_id][doc_id] is False: + prefix_index = eod_index else: # We get value from partial_prefix_indices, and run validation on that value prefix_index = partial_prefix_indices[batch_id][doc_id] @@ -429,7 +434,7 @@ def get_prefix_indices(data, eod_token, partial_prefix_indices, reset_attention_ # Prefix lm per row. else: - assert partial_prefix_indices is None or isinstance(partial_prefix_indices[batch_id], int), \ + assert partial_prefix_indices is None or partial_prefix_indices is False or isinstance(partial_prefix_indices[batch_id], int), \ f"Per document prefix has to store an int for each row, got {partial_prefix_indices[batch_id]}" # Prefix index is defined as the first index that isn't attended by all previous tokens in a document @@ -437,6 +442,8 @@ def get_prefix_indices(data, eod_token, partial_prefix_indices, reset_attention_ if partial_prefix_indices is None or partial_prefix_indices[batch_id] is None: # 0 being the first prefix index makes no sense since 0 always attends to itself, and there are no other tokens before. 
prefix_index = randint(1, seq_length) + elif partial_prefix_indices is False or partial_prefix_indices[batch_id] is False: + prefix_index = seq_length else: # We get value from partial_prefix_indices, and run validation on that value prefix_index = partial_prefix_indices[batch_id] From 9bd6e1e2bf4a83ec01a1c2cd897d79efc30cecf5 Mon Sep 17 00:00:00 2001 From: janEbert Date: Mon, 27 Feb 2023 19:23:49 +0100 Subject: [PATCH 074/122] Support PrefixLM models --- tasks/eval_harness/evaluate.py | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/tasks/eval_harness/evaluate.py b/tasks/eval_harness/evaluate.py index 68dd649fd..71d9903df 100644 --- a/tasks/eval_harness/evaluate.py +++ b/tasks/eval_harness/evaluate.py @@ -24,7 +24,11 @@ from megatron.training import setup_model_and_optimizer, get_model from megatron.mpu.mappings import gather_from_tensor_model_parallel_region -from megatron.utils import get_ltor_masks_and_position_ids, unwrap_model +from megatron.utils import ( + get_ltor_masks_and_position_ids, + get_prefix_indices, + unwrap_model, +) from megatron.p2p_communication import recv_forward, send_forward import pickle import json @@ -185,13 +189,23 @@ def _collate(x): def create_model_inputs(self, tokens): args = get_args() + if args.prefix_lm: + prefix_indices = get_prefix_indices( + tokens, + self.EOT_TOKEN_ID, + partial_prefix_indices=False, + reset_attention_mask=args.reset_attention_mask + ) + else: + prefix_indices = None + attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( tokens, self.EOT_TOKEN_ID, args.reset_position_ids, args.reset_attention_mask, args.eod_mask_loss, - prefix_indices=None, + prefix_indices=prefix_indices, loss_on_targets_only=False) return (tokens, position_ids, attention_mask), (tokens, loss_mask) @@ -390,6 +404,9 @@ def tasks_args(parser): group.add_argument('--intermed_results', default = False, action='store_true', help='Whether to print & write intermediate results for each task') group.add_argument('--bootstrap_iters', type=int, default=100000, help='How many iterations to use for stderr estimation') group.add_argument('--micro_bs_multiplier', type=int, default=1, help='Increase the global batch size to remove bubble when pipeline parallel') + group.add_argument('--prefix_lm', action='store_true', + help='Whether to adjust attention masks for a PrefixLM ' + 'decoder-only model.') return parser from megatron.global_vars import _parse_args From ba4ab491afd4d1fa431dd1bf6c01be1e7f0961ac Mon Sep 17 00:00:00 2001 From: janEbert Date: Mon, 27 Feb 2023 19:24:14 +0100 Subject: [PATCH 075/122] Allow setting number of few-shot examples --- tasks/eval_harness/evaluate.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tasks/eval_harness/evaluate.py b/tasks/eval_harness/evaluate.py index 71d9903df..62b7058b7 100644 --- a/tasks/eval_harness/evaluate.py +++ b/tasks/eval_harness/evaluate.py @@ -402,6 +402,8 @@ def tasks_args(parser): help='Should the sequence length be adapted to the batch during evaluation, if in fp16 the results will be slightly different due to numerical errors but greatly speed up evaluation.') group.add_argument('--eval_fp32', default = False, action='store_true', help='Should the evaluation run in fp32') group.add_argument('--intermed_results', default = False, action='store_true', help='Whether to print & write intermediate results for each task') + group.add_argument('--num_fewshot', type=int, default=0, + help='How many examples to show.') 
group.add_argument('--bootstrap_iters', type=int, default=100000, help='How many iterations to use for stderr estimation') group.add_argument('--micro_bs_multiplier', type=int, default=1, help='Increase the global batch size to remove bubble when pipeline parallel') group.add_argument('--prefix_lm', action='store_true', @@ -434,7 +436,8 @@ def main(): tokenizer = get_tokenizer() adaptor = EvalHarnessAdaptor(model, tokenizer) - + num_fewshot = args.num_fewshot + if args.intermed_results: global_results = {"results": {}, "versions": {}} timestamp = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S') @@ -443,7 +446,7 @@ def main(): # Backup file in case of interruption during writing results_path_backup = args.results_path.replace(".json", f"_lm-eval_{iteration_id}_{timestamp}_backup.json") for task_name, task in task_dict.items(): - results = evaluator.evaluate(adaptor, {task_name: task}, False, 0, None, bootstrap_iters=args.bootstrap_iters) + results = evaluator.evaluate(adaptor, {task_name: task}, False, num_fewshot, None, bootstrap_iters=args.bootstrap_iters) global_results["results"] = {**global_results["results"], **results["results"]} global_results["versions"] = {**global_results["versions"], **results["versions"]} if mpu.is_pipeline_last_stage() and mpu.get_tensor_model_parallel_rank() == 0: @@ -453,7 +456,7 @@ def main(): with open(results_path_backup, 'w') as outfile: json.dump(global_results, outfile, indent=4) else: - global_results = evaluator.evaluate(adaptor, task_dict, False, 0, None, bootstrap_iters=args.bootstrap_iters) + global_results = evaluator.evaluate(adaptor, task_dict, False, num_fewshot, None, bootstrap_iters=args.bootstrap_iters) if mpu.is_pipeline_last_stage() and mpu.get_tensor_model_parallel_rank() == 0: print(json.dumps(global_results, indent=2)) with open(args.results_path, 'w') as outfile: From 9f531711b7a232fe11cfbffb0b4c24ac7cb07397 Mon Sep 17 00:00:00 2001 From: janEbert Date: Mon, 27 Feb 2023 19:26:15 +0100 Subject: [PATCH 076/122] Update task/dataset name "lambada" was renamed to "lambada_openai" in the upstream lm-eval-harness repo. --- examples/run_evalharness_deepspeed.md | 2 +- examples/run_evalharness_deepspeed.slurm | 2 +- examples/run_evalharness_tr11-176b-ml.slurm | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/run_evalharness_deepspeed.md b/examples/run_evalharness_deepspeed.md index 695d9d0aa..eee4d70e6 100644 --- a/examples/run_evalharness_deepspeed.md +++ b/examples/run_evalharness_deepspeed.md @@ -29,7 +29,7 @@ Also make sure `data` is not on one of the limited paritions like WORKSF. 
Then install datasets for the tasks: ``` python ./tasks/eval_harness/download.py --task_list -arc_challenge,arc_easy,boolq,copa,hellaswag,lambada,logiqa,mathqa,mc_taco,mrpc,multirc,openbookqa,piqa,prost,pubmedqa,qnli,qqp,race,rte,sciq,sst,triviaqa,webqs,wic,winogrande,wnli,wsc +arc_challenge,arc_easy,boolq,copa,hellaswag,lambada_openai,logiqa,mathqa,mc_taco,mrpc,multirc,openbookqa,piqa,prost,pubmedqa,qnli,qqp,race,rte,sciq,sst,triviaqa,webqs,wic,winogrande,wnli,wsc ``` and make sure that `export HF_DATASETS_OFFLINE=1` diff --git a/examples/run_evalharness_deepspeed.slurm b/examples/run_evalharness_deepspeed.slurm index e58ed9608..82d2ff024 100644 --- a/examples/run_evalharness_deepspeed.slurm +++ b/examples/run_evalharness_deepspeed.slurm @@ -85,7 +85,7 @@ CMD="./tasks/eval_harness/evaluate.py \ --seq-length $SEQ_LEN \ --adaptive_seq_len \ --eval_fp32 \ - --task_list arc_challenge,arc_easy,boolq,copa,hellaswag,lambada,logiqa,mathqa,mc_taco,mrpc,multirc,openbookqa,piqa,prost,pubmedqa,qnli,qqp,race,rte,sst,webqs,wic,winogrande,wnli,wsc,triviaqa,sciq \ + --task_list arc_challenge,arc_easy,boolq,copa,hellaswag,lambada_openai,logiqa,mathqa,mc_taco,mrpc,multirc,openbookqa,piqa,prost,pubmedqa,qnli,qqp,race,rte,sst,webqs,wic,winogrande,wnli,wsc,triviaqa,sciq \ $MEGATRON_REQUIRED_ARGS \ " diff --git a/examples/run_evalharness_tr11-176b-ml.slurm b/examples/run_evalharness_tr11-176b-ml.slurm index 6d4849461..273215c08 100644 --- a/examples/run_evalharness_tr11-176b-ml.slurm +++ b/examples/run_evalharness_tr11-176b-ml.slurm @@ -89,7 +89,7 @@ CMD="./tasks/eval_harness/evaluate.py \ --bf16 \ --inference \ --seq-length $SEQ_LEN \ - --task_list arc_challenge,arc_easy,boolq,copa,headqa,hellaswag,lambada,logiqa,mathqa,mc_taco,mrpc,multirc,openbookqa,piqa,prost,pubmedqa,qnli,qqp,race,rte,sciq,sst,triviaqa,webqs,wic,winogrande,wnli,wsc \ + --task_list arc_challenge,arc_easy,boolq,copa,headqa,hellaswag,lambada_openai,logiqa,mathqa,mc_taco,mrpc,multirc,openbookqa,piqa,prost,pubmedqa,qnli,qqp,race,rte,sciq,sst,triviaqa,webqs,wic,winogrande,wnli,wsc \ --deepspeed \ --deepspeed_config ds_config.json \ --bootstrap_iters 2 \ From 5b63d0b5e10dbfffc80e54611a78e856975cd787 Mon Sep 17 00:00:00 2001 From: janEbert Date: Tue, 28 Feb 2023 14:47:13 +0100 Subject: [PATCH 077/122] Do not remove last token This corrupts the targets. There is no good reason for this. --- tasks/eval_harness/evaluate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tasks/eval_harness/evaluate.py b/tasks/eval_harness/evaluate.py index 62b7058b7..66f34323e 100644 --- a/tasks/eval_harness/evaluate.py +++ b/tasks/eval_harness/evaluate.py @@ -137,7 +137,7 @@ def _collate(x): for _, context_enc, continuation_enc in chunk: # when too long to fit in context, truncate from the left inp = torch.tensor( - (context_enc + continuation_enc)[-(self.max_length + 1):][:-1] + (context_enc + continuation_enc)[-(self.max_length + 1):] , dtype=torch.long).to(self.device) inplen, = inp.shape From 639b71d26f5f2ee63cfd176be0a38ca060b0272e Mon Sep 17 00:00:00 2001 From: janEbert Date: Tue, 28 Feb 2023 14:55:54 +0100 Subject: [PATCH 078/122] Fix PrefixLM contexts Previously we always gave the whole sequence as context, when it also includes the answer. This is obviously not desired. We only want to give enough context to reach the answer. 
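A small worked sketch of the intended split, with illustrative numbers (it mirrors
the `ctxlen` computation introduced below; a `max_length` of 8 is an assumption for
the example):

    context_enc      = [11, 12, 13, 14, 15, 16]   # 6 context tokens
    continuation_enc = [21, 22, 23, 24]           # 4 continuation (answer) tokens
    max_length = 8

    total_len = len(context_enc) + len(continuation_enc)   # 10
    num_truncated = max(total_len - max_length + 1, 0)     # 3
    ctxlen = max(len(context_enc) - num_truncated, 1)      # 3: only the surviving
                                                           # context counts as prefix

With this, the prefix only covers the kept context tokens and the continuation is
left to be scored.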
--- tasks/eval_harness/evaluate.py | 38 ++++++++++++++++++++++++++++------ 1 file changed, 32 insertions(+), 6 deletions(-) diff --git a/tasks/eval_harness/evaluate.py b/tasks/eval_harness/evaluate.py index 66f34323e..3f7014a9c 100644 --- a/tasks/eval_harness/evaluate.py +++ b/tasks/eval_harness/evaluate.py @@ -133,7 +133,7 @@ def _collate(x): reord = utils.Reorderer(requests, _collate) for chunk in utils.chunks(tqdm(reord.get_reordered(), disable=disable_tqdm), self.batch_size): - inps, contlens, inplens, padding_length = [], [], [], None + inps, ctxlens, contlens, inplens, padding_length = [], [], [], [], None for _, context_enc, continuation_enc in chunk: # when too long to fit in context, truncate from the left inp = torch.tensor( @@ -141,6 +141,13 @@ def _collate(x): , dtype=torch.long).to(self.device) inplen, = inp.shape + total_len = len(context_enc) + len(continuation_enc) + if len(continuation_enc) == 0: + ctxlen = 1 + else: + num_truncated = max(total_len - self.max_length + 1, 0) + ctxlen = max(len(context_enc) - num_truncated, 1) + cont = continuation_enc # since in _collate we make sure length is descending, the longest is always the first one. @@ -157,8 +164,9 @@ def _collate(x): contlens.append(cont) inplens.append(inplen) + ctxlens.append(ctxlen) - logits = self._model_call(torch.cat(inps, dim=0)) + logits = self._model_call((torch.cat(inps, dim=0), ctxlens)) res_len += len(chunk) if logits is not None: multi_logits = F.log_softmax(logits, dim=-1).cpu() # [batch, seq, vocab] @@ -189,14 +197,24 @@ def _collate(x): def create_model_inputs(self, tokens): args = get_args() - if args.prefix_lm: + if isinstance(tokens, tuple) and len(tokens) == 2: + tokens, ctxlens = tokens + else: + ctxlens = None + + if args.prefix_lm and ctxlens is not None: prefix_indices = get_prefix_indices( tokens, self.EOT_TOKEN_ID, - partial_prefix_indices=False, + partial_prefix_indices=ctxlens, reset_attention_mask=args.reset_attention_mask ) else: + if args.prefix_lm: + print( + 'Warning: requested PrefixLM inputs, but cannot determine ' + 'prefix length – prefix is empty.' + ) prefix_indices = None attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( @@ -213,13 +231,21 @@ def create_model_inputs(self, tokens): def _model_call(self, inps): args = get_args() + if isinstance(inps, tuple) and len(inps) == 2: + inps, ctxlens = inps + else: + ctxlens = None + if args.deepspeed: self.model.set_batch_fn(self.create_model_inputs) # round up to multiple of micro_batch_size new_size = ((len(inps) + args.micro_batch_size-1) // args.micro_batch_size) * args.micro_batch_size padded = F.pad(inps, (0, 0, 0, new_size-len(inps)), value = 0) # dummy data iterator for pipelining. - data_iterator = list((torch.stack(inp) for inp in utils.chunks(padded, args.micro_batch_size))) + data_iterator = list(( + (torch.stack(inp), ctxlens) + for inp in utils.chunks(padded, args.micro_batch_size) + )) self.model.micro_batches = len(data_iterator) if self.adaptive_seq_len: @@ -253,7 +279,7 @@ def _model_call(self, inps): # Forward pass through the model. 
unwrapped_model = unwrap_model(self.model, (torchDDP, LocalDDP, Float16Module)) unwrapped_model.set_input_tensor(input_tensor) - output = self.model(*self.create_model_inputs(inps)[0]) + output = self.model(*self.create_model_inputs((inps, ctxlens))[0]) send_forward(output) if mpu.is_pipeline_last_stage(): From 127d1e49842c645f9dfbb888243b3dbf7e537be3 Mon Sep 17 00:00:00 2001 From: janEbert Date: Tue, 28 Feb 2023 14:58:58 +0100 Subject: [PATCH 079/122] Fix module refactor These models have moved into DeepSpeed but were never probably replaced here after they have been removed. --- tasks/eval_harness/evaluate.py | 2 +- tools/convert_checkpoint/deepspeed_to_megatron.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tasks/eval_harness/evaluate.py b/tasks/eval_harness/evaluate.py index 3f7014a9c..b8c623ec7 100644 --- a/tasks/eval_harness/evaluate.py +++ b/tasks/eval_harness/evaluate.py @@ -300,7 +300,7 @@ def tokenizer_encode(self, text): from megatron.initialize import initialize_megatron import megatron -from tools.convert_checkpoint.deepspeed_checkpoint import DeepSpeedCheckpoint +from deepspeed.checkpoint.deepspeed_checkpoint import DeepSpeedCheckpoint from tools.convert_checkpoint.deepspeed_to_megatron import _create_rank_checkpoint def override_args(args, override_args, skip_keys, skip_if_specified_keys): diff --git a/tools/convert_checkpoint/deepspeed_to_megatron.py b/tools/convert_checkpoint/deepspeed_to_megatron.py index 74e5ca7c9..385c22fd3 100755 --- a/tools/convert_checkpoint/deepspeed_to_megatron.py +++ b/tools/convert_checkpoint/deepspeed_to_megatron.py @@ -4,7 +4,7 @@ import os import torch from collections import OrderedDict -from .deepspeed_checkpoint import ARGS_KEY, DeepSpeedCheckpoint +from deepspeed.checkpoint.deepspeed_checkpoint import ARGS_KEY, DeepSpeedCheckpoint MODEL_KEY = 'model' ARGS_KEY = 'args' From 1bb788d00370644583ed4f4a1203fccc9222b439 Mon Sep 17 00:00:00 2001 From: janEbert Date: Tue, 28 Feb 2023 15:06:39 +0100 Subject: [PATCH 080/122] Fix possible `TypeError` When indexing into `False` or `None`. 
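The failing pattern is ordinary short-circuit evaluation; a minimal reproduction:

    partial_prefix_indices = False
    # `partial_prefix_indices is None` is False here, so the `or` falls through
    # to the right-hand operand and indexes into a bool:
    partial_prefix_indices is None or partial_prefix_indices[0] is None
    # TypeError: 'bool' object is not subscriptable

The guards below therefore check `is not False` / `is not None` before ever
subscripting.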
--- megatron/utils.py | 32 ++++++++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/megatron/utils.py b/megatron/utils.py index e27b21f61..f658da932 100644 --- a/megatron/utils.py +++ b/megatron/utils.py @@ -419,10 +419,22 @@ def get_prefix_indices(data, eod_token, partial_prefix_indices, reset_attention_ for doc_id, eod_index in enumerate(eod_indices): assert partial_prefix_indices is None or partial_prefix_indices is False or isinstance(partial_prefix_indices[batch_id], list), f"Per document prefix has to store a list on indices for each row, got {partial_prefix_indices[batch_id]}" # Prefix index is defined as the first index that isn't attended by all tokens in a document - if partial_prefix_indices is None or partial_prefix_indices[batch_id][doc_id] is None: + if ( + partial_prefix_indices is None + or ( + partial_prefix_indices is not False + and partial_prefix_indices[batch_id][doc_id] is None + ) + ): # We need to randomly generate a prefix index that satisfies the interleave condition in the docstring prefix_index = randint(prev_index + 1, eod_index) - elif partial_prefix_indices is False or partial_prefix_indices[batch_id][doc_id] is False: + elif ( + partial_prefix_indices is False + or ( + partial_prefix_indices is not None + and partial_prefix_indices[batch_id][doc_id] is False + ) + ): prefix_index = eod_index else: # We get value from partial_prefix_indices, and run validation on that value @@ -439,10 +451,22 @@ def get_prefix_indices(data, eod_token, partial_prefix_indices, reset_attention_ # Prefix index is defined as the first index that isn't attended by all previous tokens in a document prefix_index: int - if partial_prefix_indices is None or partial_prefix_indices[batch_id] is None: + if ( + partial_prefix_indices is None + or ( + partial_prefix_indices is not False + and partial_prefix_indices[batch_id] is None + ) + ): # 0 being the first prefix index makes no sense since 0 always attends to itself, and there are no other tokens before. prefix_index = randint(1, seq_length) - elif partial_prefix_indices is False or partial_prefix_indices[batch_id] is False: + elif ( + partial_prefix_indices is False + or ( + partial_prefix_indices is not None + and partial_prefix_indices[batch_id] is False + ) + ): prefix_index = seq_length else: # We get value from partial_prefix_indices, and run validation on that value From cf5965a146e9a15df941858c0c14696cc31e00b3 Mon Sep 17 00:00:00 2001 From: janEbert Date: Tue, 28 Feb 2023 16:07:04 +0100 Subject: [PATCH 081/122] Optionally add prefix tokens --- tasks/eval_harness/evaluate.py | 39 +++++++++++++++++++++++++++++----- 1 file changed, 34 insertions(+), 5 deletions(-) diff --git a/tasks/eval_harness/evaluate.py b/tasks/eval_harness/evaluate.py index b8c623ec7..fa56f67d5 100644 --- a/tasks/eval_harness/evaluate.py +++ b/tasks/eval_harness/evaluate.py @@ -48,6 +48,11 @@ def __init__(self, model, tokenizer): self.EOT_TOKEN_ID = tokenizer.eod self._max_length = args.seq_length + self._prefix_tokens = args.prefix_tokens + self._prefix_token_ids = [ + self.tokenizer.tokenizer.convert_tokens_to_ids(token) + for token in self._prefix_tokens + ] # For ds we split into mini batches and then micro batches to keep pipelining api happy. 
# With Megatron we just go to micro_batches directly @@ -78,6 +83,14 @@ def batch_size(self): def device(self): return self._device + def _prepend_prefix_token_ids(self, tokens): + if not self._prefix_token_ids: + pass + elif tokens[0] == self.EOT_TOKEN_ID: + tokens = tokens[:1] + self._prefix_token_ids + tokens[1:] + else: + tokens = self._prefix_token_ids + tokens + return tokens def loglikelihood(self, requests): new_reqs = [] @@ -136,17 +149,31 @@ def _collate(x): inps, ctxlens, contlens, inplens, padding_length = [], [], [], [], None for _, context_enc, continuation_enc in chunk: # when too long to fit in context, truncate from the left + context_len = len(context_enc) + len(self._prefix_tokens) + total_len = context_len + len(continuation_enc) + + context_num_truncated = max( + total_len - self.max_length + 1, 0) + continuation_num_truncated = max( + context_num_truncated - context_len, 0) + + context_enc = context_enc[context_num_truncated:] + continuation_enc = \ + continuation_enc[continuation_num_truncated:] + + # Add prefix token after truncation. + context_enc = self._prepend_prefix_token_ids(context_enc) + inp = torch.tensor( - (context_enc + continuation_enc)[-(self.max_length + 1):] - , dtype=torch.long).to(self.device) + context_enc + continuation_enc, + dtype=torch.long, + ).to(self.device) inplen, = inp.shape - total_len = len(context_enc) + len(continuation_enc) if len(continuation_enc) == 0: ctxlen = 1 else: - num_truncated = max(total_len - self.max_length + 1, 0) - ctxlen = max(len(context_enc) - num_truncated, 1) + ctxlen = max(context_len - context_num_truncated, 1) cont = continuation_enc @@ -435,6 +462,8 @@ def tasks_args(parser): group.add_argument('--prefix_lm', action='store_true', help='Whether to adjust attention masks for a PrefixLM ' 'decoder-only model.') + group.add_argument('--prefix_tokens', type=str, nargs='*', default=[], + help='Tokens to add at the front of the input sequence.') return parser from megatron.global_vars import _parse_args From a53823845e5af2b5b3aab813ee03256de16fb209 Mon Sep 17 00:00:00 2001 From: janEbert Date: Tue, 28 Feb 2023 17:12:57 +0100 Subject: [PATCH 082/122] Automatically add UL2 tokens At worst, these may be mapped to the wrong tokens. However, the chance that the amount of unknown tokens are as many or fewer than the few UL2 tokens is very low. And if there are more unknown tokens than UL2 tokens, we'll get errors. --- tasks/eval_harness/evaluate.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tasks/eval_harness/evaluate.py b/tasks/eval_harness/evaluate.py index fa56f67d5..0c07ebb22 100644 --- a/tasks/eval_harness/evaluate.py +++ b/tasks/eval_harness/evaluate.py @@ -1,3 +1,4 @@ +import argparse from functools import reduce from logging import logMultiprocessing import os @@ -464,6 +465,8 @@ def tasks_args(parser): 'decoder-only model.') group.add_argument('--prefix_tokens', type=str, nargs='*', default=[], help='Tokens to add at the front of the input sequence.') + # Automatically add UL2 tokens. 
+ group.add_argument('--_is_ul2', default=True, help=argparse.SUPPRESS) return parser from megatron.global_vars import _parse_args From 3a8bc356e8d5dc63a6a93f228b4744a741f29d13 Mon Sep 17 00:00:00 2001 From: janEbert Date: Wed, 1 Mar 2023 14:18:15 +0100 Subject: [PATCH 083/122] Fix context lengths batch chunking --- tasks/eval_harness/evaluate.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tasks/eval_harness/evaluate.py b/tasks/eval_harness/evaluate.py index 0c07ebb22..aec50dae3 100644 --- a/tasks/eval_harness/evaluate.py +++ b/tasks/eval_harness/evaluate.py @@ -271,8 +271,11 @@ def _model_call(self, inps): padded = F.pad(inps, (0, 0, 0, new_size-len(inps)), value = 0) # dummy data iterator for pipelining. data_iterator = list(( - (torch.stack(inp), ctxlens) - for inp in utils.chunks(padded, args.micro_batch_size) + (torch.stack(inp), ctxlen) + for (inp, ctxlen) in zip( + utils.chunks(padded, args.micro_batch_size), + utils.chunks(ctxlens, args.micro_batch_size), + ) )) self.model.micro_batches = len(data_iterator) From 6f0e33a7b6784d76846a31b1f650b3379ab47156 Mon Sep 17 00:00:00 2001 From: janEbert Date: Wed, 1 Mar 2023 19:44:31 +0100 Subject: [PATCH 084/122] Allow different models to be loaded --- tasks/eval_harness/evaluate.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/tasks/eval_harness/evaluate.py b/tasks/eval_harness/evaluate.py index aec50dae3..46e047a8b 100644 --- a/tasks/eval_harness/evaluate.py +++ b/tasks/eval_harness/evaluate.py @@ -1,5 +1,6 @@ import argparse from functools import reduce +import importlib from logging import logMultiprocessing import os import sys @@ -14,7 +15,6 @@ import torch.nn.functional as F from lm_eval.tasks import ALL_TASKS -from pretrain_gpt import model_provider import numpy as np import torch @@ -411,6 +411,10 @@ def load_ds_checkpoint_and_setup_megatron(args): # Initializing megatron will update eg. tokenizer size. Override again. override_args(args, cp_args, skip_keys, skip_if_specified) + model_provider = importlib.import_module( + f'pretrain_{args.model_name}', + ).model_provider + # print final arguments. _print_args(args) if args.deepspeed: @@ -453,6 +457,11 @@ def tasks_args(parser): """Provide extra arguments required for tasks.""" group = parser.add_argument_group(title='Evaluation options') + group.add_argument('--model_name', type=str, default="gpt", + help=( + 'Which model architecture to use (must exist as ' + '`pretrain_{model_name}.py` script).' 
+ )) group.add_argument('--task_list', type=str, default = "all", help='Either "all" or comma separated list of tasks.') group.add_argument('--results_path', type=str, default = results_path_default, help='Path to where the results will be stored.') group.add_argument('--adaptive_seq_len', default = False, action='store_true', From 9c4c71872bebf9f45a39f8c495fca669b00587eb Mon Sep 17 00:00:00 2001 From: janEbert Date: Thu, 2 Mar 2023 21:28:17 +0100 Subject: [PATCH 085/122] Fix context batch size padding --- tasks/eval_harness/evaluate.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tasks/eval_harness/evaluate.py b/tasks/eval_harness/evaluate.py index 46e047a8b..e21541916 100644 --- a/tasks/eval_harness/evaluate.py +++ b/tasks/eval_harness/evaluate.py @@ -269,6 +269,7 @@ def _model_call(self, inps): # round up to multiple of micro_batch_size new_size = ((len(inps) + args.micro_batch_size-1) // args.micro_batch_size) * args.micro_batch_size padded = F.pad(inps, (0, 0, 0, new_size-len(inps)), value = 0) + ctxlens = ctxlens + [1] * (new_size - len(ctxlens)) # dummy data iterator for pipelining. data_iterator = list(( (torch.stack(inp), ctxlen) From 754cf21a487eb2a63eaaa7f9def52d725262b639 Mon Sep 17 00:00:00 2001 From: janEbert Date: Tue, 7 Mar 2023 10:33:33 +0100 Subject: [PATCH 086/122] Add xPos embeddings --- megatron/arguments.py | 2 +- megatron/enums.py | 1 + megatron/model/positional_embeddings.py | 109 +++++++++++++++++++++++- 3 files changed, 110 insertions(+), 2 deletions(-) diff --git a/megatron/arguments.py b/megatron/arguments.py index 7cd3a7fbe..95b7adc90 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -410,7 +410,7 @@ def _add_network_size_args(parser): group.add_argument('--position-embedding-type', type=lambda x: PositionEmbeddingType[x], choices=list(PositionEmbeddingType), default=PositionEmbeddingType.absolute, - help='Define position embedding type ("absolute" | "rotary" | "alibi"). "absolute" by default.' + help='Define position embedding type ("absolute" | "rotary" | "alibi" | "xpos"). "absolute" by default.' ) group.add_argument('--glu-activation', type=str, choices=megatron.model.glu_activations.GLU_ACTIVATIONS.keys(), diff --git a/megatron/enums.py b/megatron/enums.py index 2961cbb66..c749ab915 100644 --- a/megatron/enums.py +++ b/megatron/enums.py @@ -33,6 +33,7 @@ class PositionEmbeddingType(enum.Enum): rotary = 1 absolute = 2 alibi = 3 + xpos = 4 class UL2ModelType(enum.Enum): ENCODER_DECODER = 'ED' diff --git a/megatron/model/positional_embeddings.py b/megatron/model/positional_embeddings.py index 3494f9e4e..5ddf473e4 100644 --- a/megatron/model/positional_embeddings.py +++ b/megatron/model/positional_embeddings.py @@ -48,4 +48,111 @@ def apply_rotary_pos_emb(q, k, cos, sin, offset: int = 0): def apply_rotary_pos_emb_torch(q, k, cos, sin, offset: int = 0): # jitting fails with bf16 cos, sin = cos[offset:q.shape[0] + offset, ...], sin[offset:q.shape[0] + offset, ...] 
- return (q * cos) + (rotate_half(q) * sin), (k * cos) + (rotate_half(k) * sin) \ No newline at end of file + return (q * cos) + (rotate_half(q) * sin), (k * cos) + (rotate_half(k) * sin) + + +# Original implementation adjusted from https://github.com/sunyt32/torchscale + +def fixed_pos_embedding(x, base): + seq_len, dim = x.shape + inv_freq = 1.0 / (base ** (torch.arange(0, dim) / dim)) + sinusoid_inp = ( + torch.einsum("i , j -> i j", torch.arange(0, seq_len, dtype=torch.float), inv_freq).to(x) + ) + return torch.cos(sinusoid_inp), torch.sin(sinusoid_inp) + + +class XPos(torch.nn.Module): + """ + xPos positional embeddings from https://arxiv.org/abs/2212.10554. + """ + + def __init__(self, head_dim, freq_base=10000, scale_base=512, gamma=0.4, precision=torch.half): + super().__init__() + self.scale_base = scale_base + self.register_buffer( + "scale", + ( + (torch.arange(0, head_dim, 2) + gamma * head_dim) + / ((1.0 + gamma) * head_dim) + ), + ) + self.max_seq_len_cached = None + self.precision = precision + self.freq_base = freq_base + + def forward(self, x, seq_dim=1, seq_len=None): + if seq_len is None: + seq_len = x.shape[seq_dim] + if ( + self.max_seq_len_cached is None + or (seq_len > self.max_seq_len_cached) + ): + self.max_seq_len_cached = seq_len + scale = ( + self.scale + ** ( + torch.arange(0, seq_len, 1) - seq_len // 2 + ).to(self.scale).div(self.scale_base)[:, None] + ) + cos, sin = fixed_pos_embedding(scale, self.freq_base) + self.cos_cached = cos + self.sin_cached = sin + self.scale_cached = scale + if self.precision == torch.bfloat16: + self.cos_cached = self.cos_cached.bfloat16() + self.sin_cached = self.sin_cached.bfloat16() + return ( + self.cos_cached[:seq_len], + self.sin_cached[:seq_len], + self.scale_cached[:seq_len], + ) + + +def rotate_every_two(x): + x1 = x[:, :, ::2] + x2 = x[:, :, 1::2] + x = torch.stack((-x2, x1), dim=-1) + return x.flatten(-2) # in einsum notation: rearrange(x, '... d j -> ... (d j)')\ + + +def duplicate_interleave(m): + """ + A simple version of `torch.repeat_interleave` for duplicating a matrix while interleaving the copy. + """ + dim0 = m.shape[0] + m = m.view(-1, 1) # flatten the matrix + m = m.repeat(1, 2) # repeat all elements into the 2nd dimension + m = m.view(dim0, -1) # reshape into a matrix, interleaving the copy + return m.unsqueeze(1) + + +def _apply_xpos_emb(x, cos, sin, scale): + # x is assumed to be (seq_len, batch_size, dim) here. + cos = duplicate_interleave(cos * scale) + sin = duplicate_interleave(sin * scale) + # einsum notation for lambda t: repeat(t[offset:x.shape[1]+offset,:], "n d -> () n () (d j)", j=2) + return (x * cos) + (rotate_every_two(x) * sin) + + +@torch.jit.script +def apply_xpos_emb(q, k, cos, sin, scale, offset: int = 0): + # q/k are assumed to be (seq_len, batch_size, dim) here. + cos = cos[offset:q.shape[0] + offset] + sin = sin[offset:q.shape[0] + offset] + scale = scale[offset:q.shape[0] + offset] + return ( + _apply_xpos_emb(q, cos, sin, scale), + _apply_xpos_emb(q, cos, sin, 1.0 / scale), + ) + + +def apply_xpos_emb_torch(q, k, cos, sin, scale, offset: int = 0): + # q/k are assumed to be (seq_len, batch_size, dim) here. 
+ cos = cos[offset:q.shape[0] + offset] + sin = sin[offset:q.shape[0] + offset] + scale = scale[offset:q.shape[0] + offset] + return ( + _apply_xpos_emb(q, cos, sin, scale), + _apply_xpos_emb(q, cos, sin, 1.0 / scale), + ) From 08b0eaf796a2f0d048ce9c266cedebd5eec7f51d Mon Sep 17 00:00:00 2001 From: janEbert Date: Tue, 7 Mar 2023 10:59:38 +0100 Subject: [PATCH 087/122] Add optional UL2 normal distribution scaling --- megatron/arguments.py | 3 +++ megatron/data/dataset_utils.py | 18 +++++++++++++++--- megatron/data/ul2_dataset.py | 25 ++++++++++++++++--------- 3 files changed, 34 insertions(+), 12 deletions(-) diff --git a/megatron/arguments.py b/megatron/arguments.py index 95b7adc90..160a5afb7 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -1080,6 +1080,9 @@ def _add_ul2_args(parser): help='What token to prepend for the UL2 X-denoising ' 'objective. If empty, do not prepend a token for this ' 'objective.') + group.add_argument('--ul2-scale-normal-std', action='store_true', + help='Whether to scale the standard deviation when ' + 'using a normal distribution for span length sampling.') group.add_argument('--ul2-like-ul2r', action='store_true', help='Whether to use the updated implementation as ' 'described in the UL2R paper. This only changes the ' diff --git a/megatron/data/dataset_utils.py b/megatron/data/dataset_utils.py index 5b9c7eba1..23992bbef 100644 --- a/megatron/data/dataset_utils.py +++ b/megatron/data/dataset_utils.py @@ -49,6 +49,7 @@ class SamplingStyle(Enum): GEOMETRIC = 'geometric' UNIFORM = 'uniform' NORMAL = 'normal' + UNSCALED_NORMAL = 'unscaled normal' def analyze_data_prefix(data_prefix): @@ -279,12 +280,20 @@ def create_masked_lm_predictions(tokens, return (output_tokens, masked_lm_positions, masked_lm_labels, token_boundary) - if sampling_style is SamplingStyle.NORMAL: + if ( + sampling_style is SamplingStyle.NORMAL + or sampling_style is SamplingStyle.UNSCALED_NORMAL + ): # First, we get the center of our normal distribution from # `max_ngrams`. Keeping the meaning of `max_ngrams` this way # plays nicely with the other probability distributions in terms # of math. normal_mean = (max_ngrams + 1) / 2 + normal_std = ( + math.sqrt(normal_mean) + if sampling_style is not SamplingStyle.UNSCALED_NORMAL + else 1.0 + ) # However, we do not want to bound the maximum length of # n-grams. 
max_ngrams = num_filtered_tokens - 1 @@ -354,9 +363,12 @@ def get_ngram_indices_(idx): n = min(np_rng.geometric(0.2), max_ngrams) elif sampling_style is SamplingStyle.UNIFORM: n = np_rng.choice(ngrams[:len(cand_index_set)]) - elif sampling_style is SamplingStyle.NORMAL: + elif ( + sampling_style is SamplingStyle.NORMAL + or sampling_style is SamplingStyle.UNSCALED_NORMAL + ): n = round(np.clip( - np_rng.normal(loc=normal_mean), + np_rng.normal(loc=normal_mean, scale=normal_std), 1, len(cand_index_set), )) diff --git a/megatron/data/ul2_dataset.py b/megatron/data/ul2_dataset.py index 7ab9936c8..f8e8a8470 100644 --- a/megatron/data/ul2_dataset.py +++ b/megatron/data/ul2_dataset.py @@ -56,8 +56,10 @@ def __init__(self, name, indexed_dataset, data_prefix, num_epochs, max_num_samples, model_type, denoiser_ratios, denoisers, mean_span_lengths, mask_ratios, add_mask_tokens, pack_samples, - denoiser_tokens, like_ul2r, pack_any, pack_repeat_prompt, - max_seq_length, max_seq_length_dec, short_seq_prob, seed): + denoiser_tokens, scale_normal_std, like_ul2r, + pack_any, pack_repeat_prompt, + max_seq_length, max_seq_length_dec, + short_seq_prob, seed): super().__init__() if denoiser_ratios is None: @@ -86,6 +88,7 @@ def __init__(self, name, indexed_dataset, data_prefix, self.denoisers = [denoiser.upper() for denoiser in denoisers] self.mean_span_lengths = mean_span_lengths self.mask_ratios = mask_ratios + self.scale_normal_std = scale_normal_std self.like_ul2r = like_ul2r # Dataset. @@ -233,8 +236,8 @@ def _pack_samples(self, np_rng, idx, denoiser_index): self.vocab_id_to_token_dict, cls_ids, self.sep_id, self.mask_id, self.pad_id, self.model_type, denoiser_index, self.denoisers, self.mean_span_lengths, - self.mask_ratios, self.like_ul2r, np_rng, - self.bos_id, self.eos_id, self.sentinel_tokens) + self.mask_ratios, self.scale_normal_std, self.like_ul2r, + np_rng, self.bos_id, self.eos_id, self.sentinel_tokens) if is_decoder_only(self.model_type): maybe_lens = update_samples_dict_decoder_only( samples_dict, @@ -308,8 +311,8 @@ def __getitem__(self, idx): self.vocab_id_to_token_dict, self.cls_ids, self.sep_id, self.mask_id, self.pad_id, self.model_type, denoiser_index, self.denoisers, self.mean_span_lengths, - self.mask_ratios, self.like_ul2r, np_rng, - self.bos_id, self.eos_id, self.sentinel_tokens) + self.mask_ratios, self.scale_normal_std, self.like_ul2r, + np_rng, self.bos_id, self.eos_id, self.sentinel_tokens) return samples_dict @@ -319,8 +322,8 @@ def build_training_sample(sample, target_seq_length, cls_ids, sep_id, mask_id, pad_id, model_type, denoiser_index, denoisers, mean_span_lengths, - mask_ratios, like_ul2r, np_rng, - bos_id=None, eos_id=None, + mask_ratios, scale_normal_std, like_ul2r, + np_rng, bos_id=None, eos_id=None, sentinel_tokens=None): """Build training sample. @@ -344,6 +347,8 @@ def build_training_sample(sample, target_seq_length, mean_span_lengths: Mean length for sampling span lengths. Numbers < 1 indicate a mean length of the sequence length times that number. mask_ratios: Ratio of masked token in the full sequence. + scale_normal_std: Whether to scale the standard deviation when using a + normal distribution for span length sampling. like_ul2r: Whether to use the updated implementation as specified in the UL2R paper. np_rng: Random number genenrator. 
Note that this rng state should be @@ -399,8 +404,10 @@ def build_training_sample(sample, target_seq_length, if denoiser == 'R' or denoiser == 'X': if like_ul2r: sampling_style = SamplingStyle.UNIFORM - else: + elif scale_normal_std: sampling_style = SamplingStyle.NORMAL + else: + sampling_style = SamplingStyle.UNSCALED_NORMAL prefix_lm = False max_predictions_per_seq = len(tokens) - 1 elif denoiser == 'S': From 15622d21223b2816b7a9b67b66faa1f97c943592 Mon Sep 17 00:00:00 2001 From: janEbert Date: Tue, 7 Mar 2023 17:25:30 +0100 Subject: [PATCH 088/122] Allow evaluating encoder-decoder models --- tasks/eval_harness/evaluate.py | 120 ++++++++++++++++++++++++--------- 1 file changed, 90 insertions(+), 30 deletions(-) diff --git a/tasks/eval_harness/evaluate.py b/tasks/eval_harness/evaluate.py index e21541916..3f7d29a99 100644 --- a/tasks/eval_harness/evaluate.py +++ b/tasks/eval_harness/evaluate.py @@ -22,6 +22,10 @@ from megatron import print_rank_0 from megatron import get_tokenizer from megatron import mpu +from megatron.data.t5_dataset import ( + make_attention_mask_3d, + make_history_mask_3d, +) from megatron.training import setup_model_and_optimizer, get_model from megatron.mpu.mappings import gather_from_tensor_model_parallel_region @@ -36,6 +40,7 @@ from torch.nn.parallel.distributed import DistributedDataParallel as torchDDP from megatron.model.distributed import DistributedDataParallel as LocalDDP +from megatron.model.gpt_model import GPTModelPipe from megatron.model.module import Float16Module from deepspeed.runtime.pipe import schedule @@ -55,6 +60,14 @@ def __init__(self, model, tokenizer): for token in self._prefix_tokens ] + # TODO More general check for pipelined models would be desirable. + self._is_encoder_decoder = not ( + isinstance(self.model, GPTModelPipe) + or hasattr(self.model, 'language_model') + and hasattr(self.model.language_model, 'add_decoder') + and not self.model.language_model.add_decoder + ) + # For ds we split into mini batches and then micro batches to keep pipelining api happy. # With Megatron we just go to micro_batches directly self._batch_size = args.micro_batch_size * args.micro_bs_multiplier @@ -230,31 +243,66 @@ def create_model_inputs(self, tokens): else: ctxlens = None - if args.prefix_lm and ctxlens is not None: - prefix_indices = get_prefix_indices( + # TODO Handle encoder-only + if not self._is_encoder_decoder: + if args.prefix_lm and ctxlens is not None: + prefix_indices = get_prefix_indices( + tokens, + self.EOT_TOKEN_ID, + partial_prefix_indices=ctxlens, + reset_attention_mask=args.reset_attention_mask + ) + else: + if args.prefix_lm: + print( + 'Warning: requested PrefixLM inputs, but cannot determine ' + 'prefix length – prefix is empty.' + ) + prefix_indices = None + + attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( tokens, self.EOT_TOKEN_ID, - partial_prefix_indices=ctxlens, - reset_attention_mask=args.reset_attention_mask - ) + args.reset_position_ids, + args.reset_attention_mask, + args.eod_mask_loss, + prefix_indices=prefix_indices, + loss_on_targets_only=False) + return (tokens, position_ids, attention_mask), (tokens, loss_mask) else: - if args.prefix_lm: - print( - 'Warning: requested PrefixLM inputs, but cannot determine ' - 'prefix length – prefix is empty.' 
- ) - prefix_indices = None - - attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( - tokens, - self.EOT_TOKEN_ID, - args.reset_position_ids, - args.reset_attention_mask, - args.eod_mask_loss, - prefix_indices=prefix_indices, - loss_on_targets_only=False) - - return (tokens, position_ids, attention_mask), (tokens, loss_mask) + assert ctxlens is not None + + # Split tokens to separate encoder and decoder input. + # No BOS token used with eval harness, so we do not need to + # worry about the decoder receiving it in mind of the split. + enc_tokens = torch.stack([ + F.pad(tok[:ctxlen], (0, len(tok) - ctxlen), value=0) + for (tok, ctxlen) in zip(tokens, ctxlens) + ]) + dec_tokens = torch.stack([ + F.pad(tok[ctxlen:], (0, ctxlen), value=0) + for (tok, ctxlen) in zip(tokens, ctxlens) + ]) + + enc_attn_mask = make_attention_mask_3d(enc_tokens, enc_tokens) + dec_attn_mask = make_attention_mask_3d(dec_tokens, dec_tokens) + dec_attn_mask *= make_history_mask_3d(dec_tokens) + enc_dec_attn_mask = make_attention_mask_3d(dec_tokens, enc_tokens) + + loss_mask = torch.ones( + dec_tokens.shape[:2], + device=dec_tokens.device, + dtype=dec_tokens.dtype, + ) + for (i, ctxlen) in enumerate(ctxlens): + if ctxlen != 0: + loss_mask[i, -ctxlen] = 0 + + return ( + (enc_tokens, dec_tokens, enc_attn_mask, + dec_attn_mask, enc_dec_attn_mask), + (dec_tokens, loss_mask) + ) def _model_call(self, inps): args = get_args() @@ -312,6 +360,8 @@ def _model_call(self, inps): unwrapped_model = unwrap_model(self.model, (torchDDP, LocalDDP, Float16Module)) unwrapped_model.set_input_tensor(input_tensor) output = self.model(*self.create_model_inputs((inps, ctxlens))[0]) + if isinstance(output, tuple): + output = output[0] send_forward(output) if mpu.is_pipeline_last_stage(): @@ -365,12 +415,18 @@ def load_ds_checkpoint_and_setup_megatron(args): if not os.path.exists(args.load): raise ValueError(f"checkpoint path {args.load} doesn't exit") - ds_checkpoint = DeepSpeedCheckpoint(args.load, - tp_degree=args.tensor_model_parallel_size, - pp_degree=args.pipeline_model_parallel_size) - - - cp_args = ds_checkpoint.get_args() + try: + is_ds_cp = True + ds_checkpoint = DeepSpeedCheckpoint(args.load, + tp_degree=args.tensor_model_parallel_size, + pp_degree=args.pipeline_model_parallel_size) + + cp_args = ds_checkpoint.get_args() + except AssertionError: + is_ds_cp = False + cp_path = os.path.join(args.load, 'mp_rank_00', 'model_optim_rng.pt') + state_dict = torch.load(cp_path, map_location='cpu') + cp_args = state_dict['args'] # Merge the current args with the checkpoint args. skip_keys = [ 'abort_on_unmet_fused_kernel_constraints', @@ -418,7 +474,10 @@ def load_ds_checkpoint_and_setup_megatron(args): # print final arguments. 
_print_args(args) - if args.deepspeed: + if not is_ds_cp: + model = get_model(model_provider)[0] + model.load_state_dict(state_dict['model'], strict=True) + elif args.deepspeed: # Hack #3: # Loading pipelined models in deepspeed with different TP than it was originally trained on fails @@ -501,7 +560,8 @@ def main(): task_list = ALL_TASKS if args.task_list == 'all' else args.task_list.split(',') task_dict = tasks.get_task_dict(task_list) - model.module.activation_checkpoint_interval = 0 + if hasattr(model, 'module'): + model.module.activation_checkpoint_interval = 0 model._compute_loss = False model.fwd_outputs = [] From e5a6169d415f529eeebe80f257b9d9d0efce481a Mon Sep 17 00:00:00 2001 From: janEbert Date: Wed, 8 Mar 2023 09:34:44 +0100 Subject: [PATCH 089/122] Fix not passing `scale_normal_std` --- megatron/data/dataset_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/megatron/data/dataset_utils.py b/megatron/data/dataset_utils.py index 23992bbef..18b7c03a7 100644 --- a/megatron/data/dataset_utils.py +++ b/megatron/data/dataset_utils.py @@ -680,6 +680,7 @@ def build_dataset(index, name): 'S': args.ul2_s_denoiser_token, 'X': args.ul2_x_denoiser_token, }, + scale_normal_std=args.ul2_scale_normal_std, like_ul2r=args.ul2_like_ul2r, pack_any=args.ul2_pack_any, pack_repeat_prompt=args.ul2_pack_repeat_prompt, From d583fe9d9ce5856b8a5347076f3cca1d58362a5d Mon Sep 17 00:00:00 2001 From: janEbert Date: Tue, 7 Mar 2023 18:29:50 +0100 Subject: [PATCH 090/122] Add T5-style GLU layers --- megatron/model/glu_activations.py | 122 ++++++++++++++++++++++++++++++ megatron/model/transformer.py | 41 +++++++--- 2 files changed, 151 insertions(+), 12 deletions(-) diff --git a/megatron/model/glu_activations.py b/megatron/model/glu_activations.py index c479d9683..4ffd315e5 100644 --- a/megatron/model/glu_activations.py +++ b/megatron/model/glu_activations.py @@ -4,6 +4,7 @@ from megatron import logging from megatron.model.utils import log_debug_usage +from megatron import mpu logger = logging.get_logger(__name__) @@ -38,10 +39,127 @@ def __init__(self): super().__init__(F.silu) +class _T5GLUBase(nn.Module): + def __init__( + self, + in_features, + out_features, + activation_fn=torch.sigmoid, + bias=False, + gather_output=True, + init_method=torch.nn.init.xavier_normal_, + ): + super().__init__() + self.linear = mpu.ColumnParallelLinear( + in_features, + out_features, + bias=bias, + gather_output=gather_output, + init_method=init_method) + self.nonlinear = mpu.ColumnParallelLinear( + in_features, + out_features, + bias=bias, + gather_output=gather_output, + init_method=init_method) + self.activation_fn = activation_fn + + def forward(self, x): + output = self.linear(x)[0] * self.activation_fn(self.nonlinear(x)[0]) + return output, None + + +class T5LiGLU(_T5GLUBase): + def __init__( + self, + in_features, + out_features, + bias=False, + device=None, + dtype=None, + ): + super().__init__( + in_features, + out_features, + activation_fn=nn.Identity(), + bias=bias, + device=device, + dtype=dtype, + ) + + +class T5GEGLU(_T5GLUBase): + def __init__( + self, + in_features, + out_features, + bias=False, + device=None, + dtype=None, + ): + super().__init__( + in_features, + out_features, + activation_fn=F.gelu, + bias=bias, + device=device, + dtype=dtype, + ) + + +class T5ReGLU(_T5GLUBase): + def __init__( + self, + in_features, + out_features, + bias=False, + device=None, + dtype=None, + ): + super().__init__( + in_features, + out_features, + activation_fn=F.relu, + bias=bias, + device=device, + dtype=dtype, + ) 
+ + +class T5SwiGLU(_T5GLUBase): + def __init__( + self, + in_features, + out_features, + bias=False, + device=None, + dtype=None, + ): + super().__init__( + in_features, + out_features, + activation_fn=F.silu, + bias=bias, + device=device, + dtype=dtype, + ) + + +def replaces_linear(wrapped_glu_act): + """Return whether the GLU activation wrapped by `log_debug_usage` + contains a type. + """ + return isinstance(wrapped_glu_act.__closure__[0].cell_contents, type) + + liglu = log_debug_usage(logger, "Using GLU activation: LiGLU.")(torch.jit.script(LiGLU())) geglu = log_debug_usage(logger, "Using GLU activation: GELU.")(torch.jit.script(GEGLU())) reglu = log_debug_usage(logger, "Using GLU activation: ReGLU.")(torch.jit.script(ReGLU())) swiglu = log_debug_usage(logger, "Using GLU activation: SwiGLU.")(torch.jit.script(SwiGLU())) +t5_liglu = log_debug_usage(logger, "Using GLU activation: T5LiGLU.")(T5LiGLU) +t5_geglu = log_debug_usage(logger, "Using GLU activation: T5GELU.")(T5GEGLU) +t5_reglu = log_debug_usage(logger, "Using GLU activation: T5ReGLU.")(T5ReGLU) +t5_swiglu = log_debug_usage(logger, "Using GLU activation: T5SwiGLU.")(T5SwiGLU) GLU_ACTIVATIONS = { @@ -49,4 +167,8 @@ def __init__(self): "liglu": liglu, "reglu": reglu, "swiglu": swiglu, + "t5_geglu": t5_geglu, + "t5_liglu": t5_liglu, + "t5_reglu": t5_reglu, + "t5_swiglu": t5_swiglu, } diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 03e6faaec..2118bb0ee 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -30,7 +30,7 @@ import deepspeed -from .glu_activations import GLU_ACTIVATIONS +from .glu_activations import GLU_ACTIVATIONS, replaces_linear from .positional_embeddings import RotaryEmbedding, apply_rotary_pos_emb_torch, apply_rotary_pos_emb # flags required to enable jit fusion kernels @@ -69,19 +69,34 @@ def __init__(self, init_method, output_layer_init_method): super(ParallelMLP, self).__init__() args = get_args() - # Project to ffn_hidden_size - self.dense_h_to_4h = mpu.ColumnParallelLinear( - args.hidden_size, - # GLU is a special activation that divides the dimension by a factor 2. - 2 * args.ffn_hidden_size if args.glu_activation else args.ffn_hidden_size, - gather_output=False, - init_method=init_method, - skip_bias_add=True) + if args.glu_activation: + glu_activation = GLU_ACTIVATIONS[args.glu_activation] + else: + glu_activation = None + # Project to ffn_hidden_size + if replaces_linear(glu_activation): + self.dense_h_to_4h = glu_activation( + args.hidden_size, + args.ffn_hidden_size, + gather_output=False, + init_method=init_method) + else: + self.dense_h_to_4h = mpu.ColumnParallelLinear( + args.hidden_size, + # GLU is a special activation that divides the dimension by a factor 2. + # Only the case for non-T5 GLU. 
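+                # T5-style GLUs build both projections themselves and are
+                # therefore handed the plain (un-doubled) ffn_hidden_size in
+                # the branch above.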
+ 2 * args.ffn_hidden_size if args.glu_activation else args.ffn_hidden_size, + gather_output=False, + init_method=init_method, + skip_bias_add=True) self.bias_gelu_fusion = args.bias_gelu_fusion self.activation_func = F.gelu - if args.glu_activation: - self.activation_func = GLU_ACTIVATIONS[args.glu_activation] + + if replaces_linear(glu_activation): + self.activation_func = nn.Identity() + elif glu_activation: + self.activation_func = glu_activation elif args.openai_gelu: self.activation_func = openai_gelu elif args.onnx_safe: @@ -101,7 +116,9 @@ def forward(self, hidden_states): # [s, b, 4hp] intermediate_parallel, bias_parallel = self.dense_h_to_4h(hidden_states) - if self.bias_gelu_fusion: + if bias_parallel is None: + intermediate_parallel = self.activation_func(intermediate_parallel) + elif self.bias_gelu_fusion: intermediate_parallel = \ bias_gelu_impl(intermediate_parallel, bias_parallel) else: From ad7de7ee8ab881da92527f1342f91cc43b5b9a49 Mon Sep 17 00:00:00 2001 From: janEbert Date: Thu, 9 Mar 2023 08:27:00 +0100 Subject: [PATCH 091/122] Rename xPos embedding class MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `XPos` → `XPosEmbedding` --- megatron/model/positional_embeddings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/model/positional_embeddings.py b/megatron/model/positional_embeddings.py index 5ddf473e4..8c3609c72 100644 --- a/megatron/model/positional_embeddings.py +++ b/megatron/model/positional_embeddings.py @@ -62,7 +62,7 @@ def fixed_pos_embedding(x, base): return torch.cos(sinusoid_inp), torch.sin(sinusoid_inp) -class XPos(torch.nn.Module): +class XPosEmbedding(torch.nn.Module): """ xPos positional embeddings from https://arxiv.org/abs/2212.10554. """ From 81a68f79d228b3ed035078f39c47ad3ed1925a5d Mon Sep 17 00:00:00 2001 From: janEbert Date: Thu, 9 Mar 2023 08:27:53 +0100 Subject: [PATCH 092/122] Integrate xPos embedding --- megatron/model/transformer.py | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 2118bb0ee..0724dbb09 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -31,7 +31,14 @@ import deepspeed from .glu_activations import GLU_ACTIVATIONS, replaces_linear -from .positional_embeddings import RotaryEmbedding, apply_rotary_pos_emb_torch, apply_rotary_pos_emb +from .positional_embeddings import ( + apply_rotary_pos_emb, + apply_rotary_pos_emb_torch, + apply_xpos_emb, + apply_xpos_emb_torch, + RotaryEmbedding, + XPosEmbedding, +) # flags required to enable jit fusion kernels torch._C._jit_set_profiling_mode(False) @@ -221,6 +228,11 @@ def __init__(self, init_method, if self.position_embedding_type == PositionEmbeddingType.rotary: self.rotary_emb = RotaryEmbedding(self.hidden_size_per_attention_head, precision=args.params_dtype) + elif self.position_embedding_type == PositionEmbeddingType.xpos: + self.xpos_emb = XPosEmbedding( + self.hidden_size_per_attention_head, + precision=args.params_dtype, + ) def forward(self, hidden_states, attention_mask, layer_past=None, get_key_value=False, encoder_output=None, alibi=None): @@ -308,16 +320,23 @@ def forward(self, hidden_states, attention_mask, layer_past=None, matmul_result = alibi[:output_size[0]*output_size[1], :, :output_size[3]] # Rotary embeddings - if self.position_embedding_type == PositionEmbeddingType.rotary: - apply_rotary_fn = apply_rotary_pos_emb_torch if self.bf16 else apply_rotary_pos_emb - 
+ if self.position_embedding_type in [ + PositionEmbeddingType.rotary, PositionEmbeddingType.xpos]: seq_len = key_layer.shape[0] offset = 0 if layer_past is not None and layer_past.numel() > 0: offset = layer_past[0].shape[0] seq_len += offset + + if self.position_embedding_type == PositionEmbeddingType.rotary: + apply_rotary_fn = apply_rotary_pos_emb_torch if self.bf16 else apply_rotary_pos_emb cos, sin = self.rotary_emb(value_layer, seq_len=seq_len) query_layer, key_layer = apply_rotary_fn(query_layer, key_layer, cos, sin, offset=offset) + elif self.position_embedding_type == PositionEmbeddingType.xpos: + apply_xpos_fn = apply_xpos_emb_torch if self.bf16 else apply_xpos_emb + cos, sin, scale = self.xpos_emb(value_layer, seq_len=seq_len) + query_layer, key_layer = apply_xpos_fn( + query_layer, key_layer, cos, sin, scale, offset=offset) # Raw attention scores. [b * np, sq, sk] if alibi is None: From 46e145d56d0510e0a31fa0550322251591064b4a Mon Sep 17 00:00:00 2001 From: janEbert Date: Thu, 9 Mar 2023 08:35:50 +0100 Subject: [PATCH 093/122] Handle xPos embedding --- finetune_t0_non_causal_decoder.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/finetune_t0_non_causal_decoder.py b/finetune_t0_non_causal_decoder.py index 14650a6e5..7a15bb735 100644 --- a/finetune_t0_non_causal_decoder.py +++ b/finetune_t0_non_causal_decoder.py @@ -94,7 +94,11 @@ def get_batch_pipe(data): segment_ids=segment_ids.long(), ) - if args.position_embedding_type not in [PositionEmbeddingType.alibi, PositionEmbeddingType.rotary]: + if args.position_embedding_type not in [ + PositionEmbeddingType.alibi, + PositionEmbeddingType.rotary, + PositionEmbeddingType.xpos, + ]: raise NotImplementedError("absolute positional embeddings require us to reset position_ids accordingly.") return (tokens, position_ids, attention_mask), (labels, loss_mask) From 482f0ea9b73d87b79adf6faffef4e85ec3fd8bea Mon Sep 17 00:00:00 2001 From: janEbert Date: Thu, 9 Mar 2023 09:17:34 +0100 Subject: [PATCH 094/122] Do not use bias for 2nd MLP layer if using T5 GLU As in the T5 codebase. This could have highly detrimental effects on performance of TorchScript cannot easily type-dispatch the `bias_dropout_add` function. 
--- megatron/model/transformer.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 0724dbb09..077b24763 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -113,6 +113,7 @@ def __init__(self, init_method, output_layer_init_method): self.dense_4h_to_h = mpu.RowParallelLinear( args.ffn_hidden_size, args.hidden_size, + bias=not replaces_linear(glu_activation), input_is_parallel=True, init_method=output_layer_init_method, skip_bias_add=True) @@ -442,8 +443,10 @@ def forward(self, hidden_states, attention_mask, layer_past=None, def bias_dropout_add(x, bias, residual, prob, training): - # type: (Tensor, Tensor, Tensor, float, bool) -> Tensor - out = torch.nn.functional.dropout(x + bias, p=prob, training=training) + # type: (Tensor, Optional[Tensor], Tensor, float, bool) -> Tensor + if bias is not None: + x = x + bias + out = torch.nn.functional.dropout(x, p=prob, training=training) out = residual + out return out @@ -456,13 +459,13 @@ def _bias_dropout_add(x, bias, residual, prob): @torch.jit.script def bias_dropout_add_fused_train(x, bias, residual, prob): - # type: (Tensor, Tensor, Tensor, float) -> Tensor + # type: (Tensor, Optional[Tensor], Tensor, float) -> Tensor return bias_dropout_add(x, bias, residual, prob, True) @torch.jit.script def bias_dropout_add_fused_inference(x, bias, residual, prob): - # type: (Tensor, Tensor, Tensor, float) -> Tensor + # type: (Tensor, Optional[Tensor], Tensor, float) -> Tensor return bias_dropout_add(x, bias, residual, prob, False) @@ -615,7 +618,7 @@ def forward(self, hidden_states, attention_mask, with torch.enable_grad(): output = bias_dropout_add_func( mlp_output, - mlp_bias.expand_as(residual), + mlp_bias.expand_as(residual) if mlp_bias is not None else None, residual, self.hidden_dropout) From 4385f7b6c55ca2374fc5304be42668b96297cd65 Mon Sep 17 00:00:00 2001 From: janEbert Date: Thu, 9 Mar 2023 12:34:32 +0100 Subject: [PATCH 095/122] Fix T5 GLU constructor arguments --- megatron/model/glu_activations.py | 38 ++++++++++++++++--------------- 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/megatron/model/glu_activations.py b/megatron/model/glu_activations.py index 4ffd315e5..95eb2d0c4 100644 --- a/megatron/model/glu_activations.py +++ b/megatron/model/glu_activations.py @@ -55,13 +55,15 @@ def __init__( out_features, bias=bias, gather_output=gather_output, - init_method=init_method) + init_method=init_method, + ) self.nonlinear = mpu.ColumnParallelLinear( in_features, out_features, bias=bias, gather_output=gather_output, - init_method=init_method) + init_method=init_method, + ) self.activation_fn = activation_fn def forward(self, x): @@ -75,16 +77,16 @@ def __init__( in_features, out_features, bias=False, - device=None, - dtype=None, + gather_output=True, + init_method=torch.nn.init.xavier_normal_, ): super().__init__( in_features, out_features, activation_fn=nn.Identity(), bias=bias, - device=device, - dtype=dtype, + gather_output=gather_output, + init_method=init_method, ) @@ -94,16 +96,16 @@ def __init__( in_features, out_features, bias=False, - device=None, - dtype=None, + gather_output=True, + init_method=torch.nn.init.xavier_normal_, ): super().__init__( in_features, out_features, activation_fn=F.gelu, bias=bias, - device=device, - dtype=dtype, + gather_output=gather_output, + init_method=init_method, ) @@ -113,16 +115,16 @@ def __init__( in_features, out_features, bias=False, - device=None, - dtype=None, + 
gather_output=True, + init_method=torch.nn.init.xavier_normal_, ): super().__init__( in_features, out_features, activation_fn=F.relu, bias=bias, - device=device, - dtype=dtype, + gather_output=gather_output, + init_method=init_method, ) @@ -132,16 +134,16 @@ def __init__( in_features, out_features, bias=False, - device=None, - dtype=None, + gather_output=True, + init_method=torch.nn.init.xavier_normal_, ): super().__init__( in_features, out_features, activation_fn=F.silu, bias=bias, - device=device, - dtype=dtype, + gather_output=gather_output, + init_method=init_method, ) From 2d24b13bc7ab9efc55eec9e7059f5e165a74ab3f Mon Sep 17 00:00:00 2001 From: janEbert Date: Thu, 9 Mar 2023 16:15:09 +0100 Subject: [PATCH 096/122] Refactor samples dict creation More code reuse, change some methods to functions and change their visibility. --- megatron/data/t5_dataset.py | 54 +++++++++++++++++---------------- megatron/data/ul2_dataset.py | 59 +++++++++++++----------------------- 2 files changed, 49 insertions(+), 64 deletions(-) diff --git a/megatron/data/t5_dataset.py b/megatron/data/t5_dataset.py index 93b1b49d9..9060013ff 100644 --- a/megatron/data/t5_dataset.py +++ b/megatron/data/t5_dataset.py @@ -110,35 +110,11 @@ def __len__(self): else: return self.samples_mapping.shape[0] - def _create_samples_dict(self): - samples_dict = { - 'text_enc': np.empty((self.max_seq_length,), dtype=np.int64), - 'text_dec': np.empty( - (self.max_seq_length_dec,), dtype=np.int64), - 'labels': np.empty( - (self.max_seq_length_dec,), dtype=np.int64), - 'loss_mask': np.zeros( - (self.max_seq_length_dec,), dtype=np.int64), - 'truncated': 0, - 'enc_mask': np.zeros( - (self.max_seq_length, self.max_seq_length), - dtype=np.int64, - ), - 'dec_mask': np.zeros( - (self.max_seq_length_dec, self.max_seq_length_dec), - dtype=np.int64, - ), - 'enc_dec_mask': np.zeros( - (self.max_seq_length_dec, self.max_seq_length), - dtype=np.int64, - ), - } - return samples_dict - def _pack_samples(self, np_rng, idx): samples = get_samples(self.indexed_dataset, self.doc_idx, self.sample_idx, self.shuffle_idx, idx) - samples_dict = self._create_samples_dict() + samples_dict = create_samples_dict( + self.max_seq_length, self.max_seq_length_dec) prev_len = 0 prev_len_dec = 0 @@ -404,6 +380,32 @@ def make_history_mask_3d(block): return history_mask +def create_samples_dict(max_seq_length, max_seq_length_dec): + samples_dict = { + 'text_enc': np.empty((max_seq_length,), dtype=np.int64), + 'text_dec': np.empty( + (max_seq_length_dec,), dtype=np.int64), + 'labels': np.empty( + (max_seq_length_dec,), dtype=np.int64), + 'loss_mask': np.zeros( + (max_seq_length_dec,), dtype=np.int64), + 'truncated': 0, + 'enc_mask': np.zeros( + (max_seq_length, max_seq_length), + dtype=np.int64, + ), + 'dec_mask': np.zeros( + (max_seq_length_dec, max_seq_length_dec), + dtype=np.int64, + ), + 'enc_dec_mask': np.zeros( + (max_seq_length_dec, max_seq_length), + dtype=np.int64, + ), + } + return samples_dict + + def _remove_padding(result_sample, pad_id): # Remove padding padding_start = np.argmax(result_sample['text_enc'] == pad_id) diff --git a/megatron/data/ul2_dataset.py b/megatron/data/ul2_dataset.py index f8e8a8470..371c8ad35 100644 --- a/megatron/data/ul2_dataset.py +++ b/megatron/data/ul2_dataset.py @@ -30,6 +30,7 @@ from megatron.data.gpt_dataset import build_index_mappings, get_samples from megatron.data.t5_dataset import ( add_final_padding, + create_samples_dict as t5_create_samples_dict, LengthExceededError, make_history_mask, merge_subsequent_masks, @@ -180,47 
+181,11 @@ def __len__(self): else: return self.samples_mapping.shape[0] - def _create_samples_dict(self): - if is_decoder_only(self.model_type): - samples_dict = { - 'text': np.empty((self.max_seq_length,), dtype=np.int64), - 'labels': np.empty((self.max_seq_length,), dtype=np.int64), - 'loss_mask': np.zeros((self.max_seq_length,), dtype=np.int64), - 'truncated': 0, - 'dec_mask': np.zeros( - (self.max_seq_length, self.max_seq_length), - dtype=np.int64, - ), - } - else: - samples_dict = { - 'text_enc': np.empty((self.max_seq_length,), dtype=np.int64), - 'text_dec': np.empty( - (self.max_seq_length_dec,), dtype=np.int64), - 'labels': np.empty( - (self.max_seq_length_dec,), dtype=np.int64), - 'loss_mask': np.zeros( - (self.max_seq_length_dec,), dtype=np.int64), - 'truncated': 0, - 'enc_mask': np.zeros( - (self.max_seq_length, self.max_seq_length), - dtype=np.int64, - ), - 'dec_mask': np.zeros( - (self.max_seq_length_dec, self.max_seq_length_dec), - dtype=np.int64, - ), - 'enc_dec_mask': np.zeros( - (self.max_seq_length_dec, self.max_seq_length), - dtype=np.int64, - ), - } - return samples_dict - def _pack_samples(self, np_rng, idx, denoiser_index): samples = get_samples(self.indexed_dataset, self.doc_idx, self.sample_idx, self.shuffle_idx, idx) - samples_dict = self._create_samples_dict() + samples_dict = create_samples_dict( + self.max_seq_length, self.max_seq_length_dec, self.model_type) prev_len = 0 prev_len_dec = 0 cls_ids = self.cls_ids @@ -507,6 +472,24 @@ def build_training_sample(sample, target_seq_length, return train_sample +def create_samples_dict(max_seq_length, max_seq_length_dec, model_type): + if is_decoder_only(model_type): + samples_dict = { + 'text': np.empty((max_seq_length,), dtype=np.int64), + 'labels': np.empty((max_seq_length,), dtype=np.int64), + 'loss_mask': np.zeros((max_seq_length,), dtype=np.int64), + 'truncated': 0, + 'dec_mask': np.zeros( + (max_seq_length, max_seq_length), + dtype=np.int64, + ), + } + else: + samples_dict = t5_create_samples_dict( + max_seq_length, max_seq_length_dec) + return samples_dict + + def _remove_padding(result_sample, pad_id): # Remove padding padding_start = np.argmax(result_sample['text'] == pad_id) From bd461f5f56a97155a2ae3f83fe299584ac938244 Mon Sep 17 00:00:00 2001 From: janEbert Date: Thu, 9 Mar 2023 16:16:37 +0100 Subject: [PATCH 097/122] Move callees under caller For readability. --- megatron/data/t5_dataset.py | 40 +++++++++++++------------- megatron/data/ul2_dataset.py | 56 ++++++++++++++++++------------------ 2 files changed, 48 insertions(+), 48 deletions(-) diff --git a/megatron/data/t5_dataset.py b/megatron/data/t5_dataset.py index 9060013ff..c75803311 100644 --- a/megatron/data/t5_dataset.py +++ b/megatron/data/t5_dataset.py @@ -110,6 +110,26 @@ def __len__(self): else: return self.samples_mapping.shape[0] + def __getitem__(self, idx): + # Note that this rng state should be numpy and not python since + # python randint is inclusive whereas the numpy one is exclusive. 
+ np_rng = np.random.RandomState(seed=(self.seed + idx)) + if self.pack_samples: + samples_dict = self._pack_samples(np_rng, idx) + else: + start_index, end_index, seq_length = self.samples_mapping[idx] + sample = [] + for index in range(start_index, end_index): + sample.append(self.indexed_dataset[index]) + samples_dict = build_training_sample( + sample, seq_length, + self.max_seq_length, # needed for padding + self.max_seq_length_dec, self.vocab_id_list, + self.vocab_id_to_token_dict, self.cls_id, self.sep_id, + self.mask_id, self.pad_id, self.masked_lm_prob, np_rng, + self.bos_id, self.eos_id, self.sentinel_tokens) + return samples_dict + def _pack_samples(self, np_rng, idx): samples = get_samples(self.indexed_dataset, self.doc_idx, self.sample_idx, self.shuffle_idx, idx) @@ -150,26 +170,6 @@ def _pack_samples(self, np_rng, idx): add_final_padding(samples_dict, prev_len, prev_len_dec, self.pad_id) return samples_dict - def __getitem__(self, idx): - # Note that this rng state should be numpy and not python since - # python randint is inclusive whereas the numpy one is exclusive. - np_rng = np.random.RandomState(seed=(self.seed + idx)) - if self.pack_samples: - samples_dict = self._pack_samples(np_rng, idx) - else: - start_index, end_index, seq_length = self.samples_mapping[idx] - sample = [] - for index in range(start_index, end_index): - sample.append(self.indexed_dataset[index]) - samples_dict = build_training_sample( - sample, seq_length, - self.max_seq_length, # needed for padding - self.max_seq_length_dec, self.vocab_id_list, - self.vocab_id_to_token_dict, self.cls_id, self.sep_id, - self.mask_id, self.pad_id, self.masked_lm_prob, np_rng, - self.bos_id, self.eos_id, self.sentinel_tokens) - return samples_dict - def build_training_sample(sample, target_seq_length, max_seq_length, max_seq_length_dec, diff --git a/megatron/data/ul2_dataset.py b/megatron/data/ul2_dataset.py index 371c8ad35..aaf965991 100644 --- a/megatron/data/ul2_dataset.py +++ b/megatron/data/ul2_dataset.py @@ -181,6 +181,34 @@ def __len__(self): else: return self.samples_mapping.shape[0] + def __getitem__(self, idx): + # Note that this rng state should be numpy and not python since + # python randint is inclusive whereas the numpy one is exclusive. 
+ np_rng = np.random.RandomState(seed=(self.seed + idx)) + # Denoiser selection + denoiser_index = np_rng.choice( + np.arange(len(self.denoisers)), + p=self.denoiser_ratios, + ) + + if self.pack_samples: + samples_dict = self._pack_samples(np_rng, idx, denoiser_index) + else: + start_index, end_index, seq_length = self.samples_mapping[idx] + sample = [] + for index in range(start_index, end_index): + sample.append(self.indexed_dataset[index]) + samples_dict = build_training_sample( + sample, seq_length, + self.max_seq_length, # needed for padding + self.max_seq_length_dec, self.vocab_id_list, + self.vocab_id_to_token_dict, self.cls_ids, self.sep_id, + self.mask_id, self.pad_id, self.model_type, denoiser_index, + self.denoisers, self.mean_span_lengths, + self.mask_ratios, self.scale_normal_std, self.like_ul2r, + np_rng, self.bos_id, self.eos_id, self.sentinel_tokens) + return samples_dict + def _pack_samples(self, np_rng, idx, denoiser_index): samples = get_samples(self.indexed_dataset, self.doc_idx, self.sample_idx, self.shuffle_idx, idx) @@ -252,34 +280,6 @@ def _pack_samples(self, np_rng, idx, denoiser_index): return samples_dict - def __getitem__(self, idx): - # Note that this rng state should be numpy and not python since - # python randint is inclusive whereas the numpy one is exclusive. - np_rng = np.random.RandomState(seed=(self.seed + idx)) - # Denoiser selection - denoiser_index = np_rng.choice( - np.arange(len(self.denoisers)), - p=self.denoiser_ratios, - ) - - if self.pack_samples: - samples_dict = self._pack_samples(np_rng, idx, denoiser_index) - else: - start_index, end_index, seq_length = self.samples_mapping[idx] - sample = [] - for index in range(start_index, end_index): - sample.append(self.indexed_dataset[index]) - samples_dict = build_training_sample( - sample, seq_length, - self.max_seq_length, # needed for padding - self.max_seq_length_dec, self.vocab_id_list, - self.vocab_id_to_token_dict, self.cls_ids, self.sep_id, - self.mask_id, self.pad_id, self.model_type, denoiser_index, - self.denoisers, self.mean_span_lengths, - self.mask_ratios, self.scale_normal_std, self.like_ul2r, - np_rng, self.bos_id, self.eos_id, self.sentinel_tokens) - return samples_dict - def build_training_sample(sample, target_seq_length, max_seq_length, max_seq_length_dec, From 35b2956af999271c8752f4ad976c4e390d80e18a Mon Sep 17 00:00:00 2001 From: janEbert Date: Fri, 10 Mar 2023 15:42:05 +0100 Subject: [PATCH 098/122] Handle empty context --- tasks/eval_harness/evaluate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tasks/eval_harness/evaluate.py b/tasks/eval_harness/evaluate.py index 3f7d29a99..6c50601a5 100644 --- a/tasks/eval_harness/evaluate.py +++ b/tasks/eval_harness/evaluate.py @@ -100,7 +100,7 @@ def device(self): def _prepend_prefix_token_ids(self, tokens): if not self._prefix_token_ids: pass - elif tokens[0] == self.EOT_TOKEN_ID: + elif tokens and tokens[0] == self.EOT_TOKEN_ID: tokens = tokens[:1] + self._prefix_token_ids + tokens[1:] else: tokens = self._prefix_token_ids + tokens From f0171e0174b0b3395f979be17ff2f184ba6a2311 Mon Sep 17 00:00:00 2001 From: janEbert Date: Fri, 10 Mar 2023 15:53:01 +0100 Subject: [PATCH 099/122] Handle more possible model types --- tasks/eval_harness/evaluate.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tasks/eval_harness/evaluate.py b/tasks/eval_harness/evaluate.py index 6c50601a5..674e41afb 100644 --- a/tasks/eval_harness/evaluate.py +++ b/tasks/eval_harness/evaluate.py @@ -43,6 +43,7 @@ from megatron.model.gpt_model 
import GPTModelPipe from megatron.model.module import Float16Module from deepspeed.runtime.pipe import schedule +from deepspeed.runtime.pipe.engine import PipelineEngine class EvalHarnessAdaptor(GPT2LM): def __init__(self, model, tokenizer): @@ -63,6 +64,7 @@ def __init__(self, model, tokenizer): # TODO More general check for pipelined models would be desirable. self._is_encoder_decoder = not ( isinstance(self.model, GPTModelPipe) + or isinstance(self.model, PipelineEngine) or hasattr(self.model, 'language_model') and hasattr(self.model.language_model, 'add_decoder') and not self.model.language_model.add_decoder From 92158d86b7cffcac2503dfe58b83c59d8810beb1 Mon Sep 17 00:00:00 2001 From: janEbert Date: Fri, 10 Mar 2023 16:32:33 +0100 Subject: [PATCH 100/122] Fix fully truncated contexts with prefix tokens --- tasks/eval_harness/evaluate.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tasks/eval_harness/evaluate.py b/tasks/eval_harness/evaluate.py index 674e41afb..09dbf3aae 100644 --- a/tasks/eval_harness/evaluate.py +++ b/tasks/eval_harness/evaluate.py @@ -170,8 +170,10 @@ def _collate(x): context_num_truncated = max( total_len - self.max_length + 1, 0) + # Need actual truncated length of context here + # (without prefix tokens). continuation_num_truncated = max( - context_num_truncated - context_len, 0) + context_num_truncated - len(context_enc), 0) context_enc = context_enc[context_num_truncated:] continuation_enc = \ From 3b7692f9aab8bb4fd63b5febfdb6abbac36e3369 Mon Sep 17 00:00:00 2001 From: janEbert Date: Fri, 10 Mar 2023 16:56:07 +0100 Subject: [PATCH 101/122] Make T5 GLU checks safer --- megatron/model/glu_activations.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/megatron/model/glu_activations.py b/megatron/model/glu_activations.py index 95eb2d0c4..5f30ef846 100644 --- a/megatron/model/glu_activations.py +++ b/megatron/model/glu_activations.py @@ -151,7 +151,11 @@ def replaces_linear(wrapped_glu_act): """Return whether the GLU activation wrapped by `log_debug_usage` contains a type. 
""" - return isinstance(wrapped_glu_act.__closure__[0].cell_contents, type) + return ( + hasattr(wrapped_glu_act, '__closure__') + and wrapped_glu_act.__closure__ + and isinstance(wrapped_glu_act.__closure__[0].cell_contents, type) + ) liglu = log_debug_usage(logger, "Using GLU activation: LiGLU.")(torch.jit.script(LiGLU())) From b37d3ee17b02003b1beb32eeacf709d6b6949565 Mon Sep 17 00:00:00 2001 From: janEbert Date: Mon, 20 Mar 2023 12:18:17 +0100 Subject: [PATCH 102/122] Improve import code style --- megatron/data/gpt_dataset.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/megatron/data/gpt_dataset.py b/megatron/data/gpt_dataset.py index 61d182445..e43c0d178 100644 --- a/megatron/data/gpt_dataset.py +++ b/megatron/data/gpt_dataset.py @@ -23,8 +23,11 @@ from megatron import mpu, print_rank_0 from megatron.data.blendable_dataset import BlendableDataset -from megatron.data.dataset_utils import get_datasets_weights_and_num_samples -from megatron.data.dataset_utils import get_train_valid_test_split_, get_split_by_range_ +from megatron.data.dataset_utils import ( + get_datasets_weights_and_num_samples, + get_split_by_range_, + get_train_valid_test_split_, +) from megatron.data.indexed_dataset import make_dataset as make_indexed_dataset From 5959e89ebeaae1672e496675c56e042fd16e1272 Mon Sep 17 00:00:00 2001 From: janEbert Date: Mon, 20 Mar 2023 12:22:32 +0100 Subject: [PATCH 103/122] Refactor dummy barriers --- megatron/data/dataset_utils.py | 25 +++++++++++++-------- megatron/data/decoder_packed_mtf_dataset.py | 12 ++-------- megatron/data/gpt_dataset.py | 13 +++-------- 3 files changed, 21 insertions(+), 29 deletions(-) diff --git a/megatron/data/dataset_utils.py b/megatron/data/dataset_utils.py index 18b7c03a7..383f676cf 100644 --- a/megatron/data/dataset_utils.py +++ b/megatron/data/dataset_utils.py @@ -842,15 +842,7 @@ def get_samples_mapping(indexed_dataset, print_rank_0(' > elasped time to build and save samples mapping ' '(seconds): {:4f}'.format( time.time() - start_time)) - # This should be a barrier but nccl barrier assumes - # device_index=rank which is not the case for model - # parallel case - counts = torch.cuda.LongTensor([1]) - torch.distributed.all_reduce(counts, group=mpu.get_data_parallel_group()) - torch.distributed.all_reduce(counts, group=mpu.get_pipeline_model_parallel_group()) - assert counts[0].item() == ( - torch.distributed.get_world_size() // - torch.distributed.get_world_size(group=mpu.get_tensor_model_parallel_group())) + dp_pp_barrier() # Load indexed dataset. 
print_rank_0(' > loading indexed mapping from {}'.format( @@ -863,3 +855,18 @@ def get_samples_mapping(indexed_dataset, samples_mapping.shape[0])) return samples_mapping + + +def dp_pp_barrier(): + # This should be a barrier but nccl barrier assumes + # device_index=rank which is not the case for model + # parallel case + counts = torch.cuda.LongTensor([1]) + torch.distributed.all_reduce(counts, group=mpu.get_data_parallel_group()) + torch.distributed.all_reduce( + counts, group=mpu.get_pipeline_model_parallel_group()) + assert counts[0].item() == ( + torch.distributed.get_world_size() + // torch.distributed.get_world_size( + group=mpu.get_tensor_model_parallel_group()) + ) diff --git a/megatron/data/decoder_packed_mtf_dataset.py b/megatron/data/decoder_packed_mtf_dataset.py index 4edf14207..944affb70 100644 --- a/megatron/data/decoder_packed_mtf_dataset.py +++ b/megatron/data/decoder_packed_mtf_dataset.py @@ -7,7 +7,7 @@ from megatron import print_rank_0, mpu, logging from megatron.data.blendable_dataset import BlendableDataset from megatron.data.dataset_utils import get_datasets_weights_and_num_samples, get_split_by_range_, \ - get_train_valid_test_split_ + get_train_valid_test_split_, dp_pp_barrier from megatron.data.mtf_dataset import MTFDataset from megatron.data.indexed_dataset import make_dataset as make_indexed_dataset @@ -437,15 +437,7 @@ def _build_index_mappings( print_rank_0(' > elasped time to build and save shuffle-idx and sample-idx mapping' ' (seconds): {:4f}'.format(time.time() - start_time)) - # This should be a barrier but nccl barrier assumes - # device_index=rank which is not the case for model - # parallel case - counts = torch.cuda.LongTensor([1]) - torch.distributed.all_reduce(counts, group=mpu.get_data_parallel_group()) - torch.distributed.all_reduce(counts, group=mpu.get_pipeline_model_parallel_group()) - assert counts[0].item() == ( - torch.distributed.get_world_size() // - torch.distributed.get_world_size(group=mpu.get_tensor_model_parallel_group())) + dp_pp_barrier() # Load mappings. start_time = time.time() diff --git a/megatron/data/gpt_dataset.py b/megatron/data/gpt_dataset.py index e43c0d178..304598975 100644 --- a/megatron/data/gpt_dataset.py +++ b/megatron/data/gpt_dataset.py @@ -21,9 +21,10 @@ import numpy as np import torch -from megatron import mpu, print_rank_0 +from megatron import print_rank_0 from megatron.data.blendable_dataset import BlendableDataset from megatron.data.dataset_utils import ( + dp_pp_barrier, get_datasets_weights_and_num_samples, get_split_by_range_, get_train_valid_test_split_, @@ -388,15 +389,7 @@ def build_index_mappings(name, data_prefix, documents, sizes, print_rank_0(' > elasped time to build and save shuffle-idx mapping' ' (seconds): {:4f}'.format(time.time() - start_time)) - # This should be a barrier but nccl barrier assumes - # device_index=rank which is not the case for model - # parallel case - counts = torch.cuda.LongTensor([1]) - torch.distributed.all_reduce(counts, group=mpu.get_data_parallel_group()) - torch.distributed.all_reduce(counts, group=mpu.get_pipeline_model_parallel_group()) - assert counts[0].item() == ( - torch.distributed.get_world_size() // - torch.distributed.get_world_size(group=mpu.get_tensor_model_parallel_group())) + dp_pp_barrier() # Load mappings. 
start_time = time.time() From ce8c1a5a06fcfa1c06a3dc07bf0e4be5678be518 Mon Sep 17 00:00:00 2001 From: janEbert Date: Mon, 20 Mar 2023 12:29:38 +0100 Subject: [PATCH 104/122] Refactor file name creation --- megatron/data/gpt_dataset.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/megatron/data/gpt_dataset.py b/megatron/data/gpt_dataset.py index 304598975..3a049ed6e 100644 --- a/megatron/data/gpt_dataset.py +++ b/megatron/data/gpt_dataset.py @@ -285,6 +285,15 @@ def get_samples(indexed_dataset, doc_idx, sample_idx, shuffle_idx, idx): return sample_list +def _get_filename_prefix(data_prefix, name, num_samples, seq_length, seed): + _filename = data_prefix + _filename += '_{}_indexmap'.format(name) + _filename += '_{}ns'.format(num_samples) + _filename += '_{}sl'.format(seq_length) + _filename += '_{}s'.format(seed) + return _filename + + def build_index_mappings(name, data_prefix, documents, sizes, num_samples, seq_length, seed, cutoff_last_epoch=0.95): """Build doc-idx, sample-idx, and shuffle-idx. @@ -300,11 +309,8 @@ def build_index_mappings(name, data_prefix, documents, sizes, np_rng = np.random.RandomState(seed=seed) # Filename of the index mappings. - _filename = data_prefix - _filename += '_{}_indexmap'.format(name) - _filename += '_{}ns'.format(num_samples) - _filename += '_{}sl'.format(seq_length) - _filename += '_{}s'.format(seed) + _filename = _get_filename_prefix( + data_prefix, name, num_samples, seq_length, seed) doc_idx_filename = _filename + '_doc_idx.npy' sample_idx_filename = _filename + '_sample_idx.npy' shuffle_idx_filename = _filename + '_shuffle_idx.npy' From 3e529661c57ceeca3cd0e00a551c5e8ffb49e49a Mon Sep 17 00:00:00 2001 From: janEbert Date: Mon, 20 Mar 2023 12:30:55 +0100 Subject: [PATCH 105/122] Allow packing only full documents --- megatron/data/gpt_dataset.py | 131 +++++++++++++++++++++++++++++++++++ 1 file changed, 131 insertions(+) diff --git a/megatron/data/gpt_dataset.py b/megatron/data/gpt_dataset.py index 3a049ed6e..b99c37346 100644 --- a/megatron/data/gpt_dataset.py +++ b/megatron/data/gpt_dataset.py @@ -15,6 +15,7 @@ """GPT style dataset.""" +import math import os import time @@ -417,6 +418,85 @@ def build_index_mappings(name, data_prefix, documents, sizes, return doc_idx, sample_idx, shuffle_idx +def build_index_mappings_full_docs( + name, data_prefix, documents, sizes, + num_samples, seq_length, seed): + """Build doc-idx, sample-idx, and shuffle-idx. + doc-idx: is an array (ordered) of documents to be used in training. + sample-idx: is the start document index and document offset for each + training sample. + shuffle-idx: maps the sample index into a random index into sample-idx. + """ + # rng state + np_rng = np.random.RandomState(seed=seed) + + # Filename of the index mappings. + _filename = _get_filename_prefix( + data_prefix, name, num_samples, seq_length, seed) + _filename += '_fd' # Full docs + doc_idx_filename = _filename + '_doc_idx.npy' + sample_idx_filename = _filename + '_sample_idx.npy' + shuffle_idx_filename = _filename + '_shuffle_idx.npy' + + # Build the indexed mapping if not exist. + if torch.distributed.get_rank() == 0: + if (not os.path.isfile(doc_idx_filename)) or \ + (not os.path.isfile(sample_idx_filename)) or \ + (not os.path.isfile(shuffle_idx_filename)): + + print_rank_0( + ' > WARNING: could not find index map files, building ' + 'the indices on rank 0 ...') + + # sample-idx. + start_time = time.time() + # Use C++ implementation for speed. + # First compile and then import. 
+ from megatron.data import helpers + assert sizes.dtype == np.int32 + # sample_idx = helpers.build_sample_idx_full_docs( + # sizes, doc_idx, seq_length, num_samples) + doc_idx, sample_idx = _build_sample_idx_full_docs( + sizes, documents, seq_length, num_samples, np_rng) + np.save(doc_idx_filename, doc_idx, allow_pickle=True) + np.save(sample_idx_filename, sample_idx, allow_pickle=True) + print_rank_0(' > elasped time to build and save doc-idx and ' + 'sample-idx mapping (seconds): {:4f}'.format( + time.time() - start_time)) + # shuffle-idx. + start_time = time.time() + num_samples_ = sample_idx.shape[0] + shuffle_idx = _build_shuffle_idx( + num_samples_, sample_idx.shape[0], np_rng) + np.save(shuffle_idx_filename, shuffle_idx, allow_pickle=True) + print_rank_0( + ' > elasped time to build and save shuffle-idx mapping' + ' (seconds): {:4f}'.format(time.time() - start_time)) + + dp_pp_barrier() + + # Load mappings. + start_time = time.time() + print_rank_0(' > loading doc-idx mapping from {}'.format( + doc_idx_filename)) + doc_idx = np.load(doc_idx_filename, allow_pickle=True, mmap_mode='r') + print_rank_0(' > loading sample-idx mapping from {}'.format( + sample_idx_filename)) + sample_idx = np.load(sample_idx_filename, allow_pickle=True, mmap_mode='r') + print_rank_0(' > loading shuffle-idx mapping from {}'.format( + shuffle_idx_filename)) + shuffle_idx = np.load( + shuffle_idx_filename, allow_pickle=True, mmap_mode='r') + print_rank_0(' loaded indexed file in {:3.3f} seconds'.format( + time.time() - start_time)) + print_rank_0(' total number of samples: {}'.format( + sample_idx.shape[0])) + num_epochs = math.ceil(len(doc_idx) / len(documents)) + print_rank_0(' total number of epochs: {}'.format(num_epochs)) + + return doc_idx, sample_idx, shuffle_idx + + def _num_tokens(documents, sizes): """Total number of tokens in the dataset.""" return np.sum(sizes[documents]) @@ -502,6 +582,57 @@ def _build_sample_idx(sizes, doc_idx, seq_length, return sample_idx +def _build_sample_idx_full_docs( + sizes, documents, seq_length, num_samples, np_rng): + """Sample index mapping is a 1D array with sizes + [number-of-samples] where [..., 0] contains + the last index into `doc_idx`. + """ + sample_idx = np.zeros([num_samples], dtype=np.int32) + + doc_idx = np.empty((0,), dtype=np.int32) + # Index into sample_idx. + sample_index = 0 + # Index into doc_idx. + doc_idx_index = 0 + while sample_index < num_samples: + + # Start with a fresh sequence. + remaining_seq_length = seq_length + 1 + is_multiple = False + while remaining_seq_length != 0: + if doc_idx_index >= len(doc_idx): + # Extend doc-idx. + doc_idx = np.concatenate([doc_idx, _build_doc_idx( + documents, 1, np_rng, False)]) + + # Get the document length. + doc_id = doc_idx[doc_idx_index] + doc_length = sizes[doc_id] + # And add it to the current sequence. + remaining_seq_length -= doc_length + if is_multiple: + remaining_seq_length -= 1 + # If we have more than a full sequence, set remaining length + # to zero so we return from the while loop. + if remaining_seq_length <= 0: + remaining_seq_length = 0 + else: + # Otherwise, start from the begining of the next document. + doc_idx_index += 1 + is_multiple = True + # Record the sequence. + sample_idx[sample_index] = doc_idx_index + + # Reset to next document. + doc_idx_index += 1 + sample_index += 1 + + # `doc_idx_index` is already incremented by one. We want to include + # it because the last document index is inclusive. 
+ return doc_idx[:doc_idx_index], sample_idx + + def _build_shuffle_idx(num_samples, total_size, np_rng): """Build the range [0, size) and shuffle.""" print(' > building shuffle index with split [0, {}) and [{}, {}) ' From 23efa88bb508ce0625d932667b6187554465799f Mon Sep 17 00:00:00 2001 From: janEbert Date: Mon, 20 Mar 2023 12:33:22 +0100 Subject: [PATCH 106/122] Use full-doc packing for T5-style datasets --- megatron/data/t5_dataset.py | 35 ++++++++++++++++++++++++++++++++--- megatron/data/ul2_dataset.py | 7 ++++--- 2 files changed, 36 insertions(+), 6 deletions(-) diff --git a/megatron/data/t5_dataset.py b/megatron/data/t5_dataset.py index c75803311..647fe8a4f 100644 --- a/megatron/data/t5_dataset.py +++ b/megatron/data/t5_dataset.py @@ -25,7 +25,7 @@ create_masked_lm_predictions, get_samples_mapping ) -from megatron.data.gpt_dataset import build_index_mappings, get_samples +from megatron.data.gpt_dataset import build_index_mappings_full_docs class LengthExceededError(ValueError): @@ -69,7 +69,11 @@ def __init__(self, name, indexed_dataset, data_prefix, # Minimum number of tokens added: BOS and EOS. min_added_tokens = 2 if self.pack_samples: - self.doc_idx, self.sample_idx, self.shuffle_idx = build_index_mappings( + ( + self.doc_idx, + self.sample_idx, + self.shuffle_idx, + ) = build_index_mappings_full_docs( self.name, data_prefix, self.indexed_dataset.get_doc_idx()[:-1], self.indexed_dataset.sizes, max_num_samples, self.max_seq_length - min_added_tokens, self.seed) @@ -106,7 +110,7 @@ def __init__(self, name, indexed_dataset, data_prefix, def __len__(self): if self.pack_samples: - return self.sample_idx.shape[0] - 1 + return self.sample_idx.shape[0] else: return self.samples_mapping.shape[0] @@ -380,6 +384,31 @@ def make_history_mask_3d(block): return history_mask +def get_samples(indexed_dataset, doc_idx, sample_idx, shuffle_idx, idx): + # Get the shuffled index. + idx = shuffle_idx[idx] + # Start and end documents. + if idx == 0: + doc_index_f = 0 + else: + doc_index_f = sample_idx[idx - 1] + 1 + doc_index_l = sample_idx[idx] + # If we are within the same document, just extract the chunk. + if doc_index_f == doc_index_l: + sample = indexed_dataset.get(doc_idx[doc_index_f]) + sample_list = [sample] + else: + # Otherwise, get the rest of the initial document. + sample_list = [indexed_dataset.get(doc_idx[doc_index_f])] + # Loop over all in between documents and add the entire document. + for i in range(doc_index_f + 1, doc_index_l): + sample_list.append(indexed_dataset.get(doc_idx[i])) + # And finally add the relevant portion of last document. 
+ sample_list.append(indexed_dataset.get( + doc_idx[doc_index_l])) + return sample_list + + def create_samples_dict(max_seq_length, max_seq_length_dec): samples_dict = { 'text_enc': np.empty((max_seq_length,), dtype=np.int64), diff --git a/megatron/data/ul2_dataset.py b/megatron/data/ul2_dataset.py index aaf965991..ca7438a63 100644 --- a/megatron/data/ul2_dataset.py +++ b/megatron/data/ul2_dataset.py @@ -27,10 +27,11 @@ get_samples_mapping, SamplingStyle ) -from megatron.data.gpt_dataset import build_index_mappings, get_samples +from megatron.data.gpt_dataset import build_index_mappings_full_docs from megatron.data.t5_dataset import ( add_final_padding, create_samples_dict as t5_create_samples_dict, + get_samples, LengthExceededError, make_history_mask, merge_subsequent_masks, @@ -110,7 +111,7 @@ def __init__(self, name, indexed_dataset, data_prefix, self.doc_idx, self.sample_idx, self.shuffle_idx, - ) = build_index_mappings( + ) = build_index_mappings_full_docs( self.name, data_prefix, self.indexed_dataset.get_doc_idx()[:-1], self.indexed_dataset.sizes, max_num_samples, @@ -177,7 +178,7 @@ def __init__(self, name, indexed_dataset, data_prefix, def __len__(self): if self.pack_samples: - return self.sample_idx.shape[0] - 1 + return self.sample_idx.shape[0] else: return self.samples_mapping.shape[0] From 88eb98ad398ea2fec626fa9ac7c1071029cd732c Mon Sep 17 00:00:00 2001 From: janEbert Date: Mon, 20 Mar 2023 14:22:09 +0100 Subject: [PATCH 107/122] Fix trying to all-reduce non-existent bias --- megatron/mpu/layers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/mpu/layers.py b/megatron/mpu/layers.py index c419d9bf6..e6d3c95bb 100644 --- a/megatron/mpu/layers.py +++ b/megatron/mpu/layers.py @@ -423,7 +423,7 @@ def __init__(self, input_size, output_size, bias=True, else: self.register_parameter('bias', None) - self.bias_tp_auto_sync = args.sync_tp_duplicated_parameters + self.bias_tp_auto_sync = bias and args.sync_tp_duplicated_parameters def forward(self, input_): From 59e845163f281454efbecc3a9057f0f7a3db6e9f Mon Sep 17 00:00:00 2001 From: janEbert Date: Tue, 21 Mar 2023 10:01:51 +0100 Subject: [PATCH 108/122] Fix truncating packed sequences without padding --- megatron/data/t5_dataset.py | 4 ++++ megatron/data/ul2_dataset.py | 2 ++ 2 files changed, 6 insertions(+) diff --git a/megatron/data/t5_dataset.py b/megatron/data/t5_dataset.py index 647fe8a4f..b20eabcf4 100644 --- a/megatron/data/t5_dataset.py +++ b/megatron/data/t5_dataset.py @@ -438,7 +438,11 @@ def create_samples_dict(max_seq_length, max_seq_length_dec): def _remove_padding(result_sample, pad_id): # Remove padding padding_start = np.argmax(result_sample['text_enc'] == pad_id) + if padding_start == 0: + padding_start = None padding_start_dec = np.argmax(result_sample['text_dec'] == pad_id) + if padding_start_dec == 0: + padding_start_dec = None result_sample['text_enc'] = result_sample['text_enc'][:padding_start] for key in ['text_dec', 'labels', 'loss_mask']: diff --git a/megatron/data/ul2_dataset.py b/megatron/data/ul2_dataset.py index ca7438a63..45237ce4b 100644 --- a/megatron/data/ul2_dataset.py +++ b/megatron/data/ul2_dataset.py @@ -494,6 +494,8 @@ def create_samples_dict(max_seq_length, max_seq_length_dec, model_type): def _remove_padding(result_sample, pad_id): # Remove padding padding_start = np.argmax(result_sample['text'] == pad_id) + if padding_start == 0: + padding_start = None result_sample['text'] = result_sample['text'][:padding_start] for key in ['labels', 'loss_mask']: result_sample[key] = 
result_sample[key][:padding_start] From 24d46ff017615dce12df5e8d4ae579f66fb89878 Mon Sep 17 00:00:00 2001 From: janEbert Date: Fri, 24 Mar 2023 12:36:40 +0100 Subject: [PATCH 109/122] Speed up packed dataset indexing By pre-allocating more data. --- megatron/data/gpt_dataset.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/megatron/data/gpt_dataset.py b/megatron/data/gpt_dataset.py index b99c37346..05944503d 100644 --- a/megatron/data/gpt_dataset.py +++ b/megatron/data/gpt_dataset.py @@ -589,8 +589,11 @@ def _build_sample_idx_full_docs( the last index into `doc_idx`. """ sample_idx = np.zeros([num_samples], dtype=np.int32) + # If we only manage to pack one sample each time, we need this many + # epochs. + min_epochs = math.ceil(num_samples / len(documents)) - doc_idx = np.empty((0,), dtype=np.int32) + doc_idx = _build_doc_idx(documents, min_epochs, np_rng, False) # Index into sample_idx. sample_index = 0 # Index into doc_idx. From 600542da7f53b07fa8daaf9f625d16fba90164bd Mon Sep 17 00:00:00 2001 From: janEbert Date: Mon, 3 Apr 2023 14:55:26 +0200 Subject: [PATCH 110/122] Try to exit padding removal early --- megatron/data/t5_dataset.py | 8 +++++--- megatron/data/ul2_dataset.py | 2 +- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/megatron/data/t5_dataset.py b/megatron/data/t5_dataset.py index b20eabcf4..c1b10bb70 100644 --- a/megatron/data/t5_dataset.py +++ b/megatron/data/t5_dataset.py @@ -438,10 +438,12 @@ def create_samples_dict(max_seq_length, max_seq_length_dec): def _remove_padding(result_sample, pad_id): # Remove padding padding_start = np.argmax(result_sample['text_enc'] == pad_id) - if padding_start == 0: - padding_start = None padding_start_dec = np.argmax(result_sample['text_dec'] == pad_id) - if padding_start_dec == 0: + if padding_start == 0 and padding_start_dec == 0: + return + elif padding_start == 0: + padding_start = None + elif padding_start_dec == 0: padding_start_dec = None result_sample['text_enc'] = result_sample['text_enc'][:padding_start] diff --git a/megatron/data/ul2_dataset.py b/megatron/data/ul2_dataset.py index 45237ce4b..068487d23 100644 --- a/megatron/data/ul2_dataset.py +++ b/megatron/data/ul2_dataset.py @@ -495,7 +495,7 @@ def _remove_padding(result_sample, pad_id): # Remove padding padding_start = np.argmax(result_sample['text'] == pad_id) if padding_start == 0: - padding_start = None + return result_sample['text'] = result_sample['text'][:padding_start] for key in ['labels', 'loss_mask']: result_sample[key] = result_sample[key][:padding_start] From 58831d2b91d33bb0e6b390b1d648ac17113032f1 Mon Sep 17 00:00:00 2001 From: janEbert Date: Tue, 4 Apr 2023 18:51:58 +0200 Subject: [PATCH 111/122] Fix xPos embedding --- megatron/model/positional_embeddings.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/megatron/model/positional_embeddings.py b/megatron/model/positional_embeddings.py index 8c3609c72..22d1f9fa2 100644 --- a/megatron/model/positional_embeddings.py +++ b/megatron/model/positional_embeddings.py @@ -143,7 +143,7 @@ def apply_xpos_emb(q, k, cos, sin, scale, offset: int = 0): scale = scale[offset:q.shape[0] + offset] return ( _apply_xpos_emb(q, cos, sin, scale), - _apply_xpos_emb(q, cos, sin, 1.0 / scale), + _apply_xpos_emb(k, cos, sin, 1.0 / scale), ) @@ -154,5 +154,5 @@ def apply_xpos_emb_torch(q, k, cos, sin, scale, offset: int = 0): scale = scale[offset:q.shape[0] + offset] return ( _apply_xpos_emb(q, cos, sin, scale), - _apply_xpos_emb(q, cos, sin, 1.0 / scale), + _apply_xpos_emb(k, 
cos, sin, 1.0 / scale), ) From fe45cea4123fc5246d3d337f7c12e97804ca1050 Mon Sep 17 00:00:00 2001 From: janEbert Date: Thu, 13 Apr 2023 13:17:57 +0200 Subject: [PATCH 112/122] Fix padding loss mask --- tasks/eval_harness/evaluate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tasks/eval_harness/evaluate.py b/tasks/eval_harness/evaluate.py index 09dbf3aae..f6d324753 100644 --- a/tasks/eval_harness/evaluate.py +++ b/tasks/eval_harness/evaluate.py @@ -300,7 +300,7 @@ def create_model_inputs(self, tokens): ) for (i, ctxlen) in enumerate(ctxlens): if ctxlen != 0: - loss_mask[i, -ctxlen] = 0 + loss_mask[i, -ctxlen:] = 0 return ( (enc_tokens, dec_tokens, enc_attn_mask, From 15e7b988cb21b19db97aa729429ae5fd7f2e5997 Mon Sep 17 00:00:00 2001 From: janEbert Date: Thu, 13 Apr 2023 16:56:13 +0200 Subject: [PATCH 113/122] Handle failure mode regarding non-DS checkpoints DS = DeepSpeed No idea why this happens, I couldn't explain it after briefly looking into the DeepSpeed source. --- tasks/eval_harness/evaluate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tasks/eval_harness/evaluate.py b/tasks/eval_harness/evaluate.py index f6d324753..2549c1fac 100644 --- a/tasks/eval_harness/evaluate.py +++ b/tasks/eval_harness/evaluate.py @@ -426,7 +426,7 @@ def load_ds_checkpoint_and_setup_megatron(args): pp_degree=args.pipeline_model_parallel_size) cp_args = ds_checkpoint.get_args() - except AssertionError: + except (AssertionError, ZeroDivisionError): is_ds_cp = False cp_path = os.path.join(args.load, 'mp_rank_00', 'model_optim_rng.pt') state_dict = torch.load(cp_path, map_location='cpu') From ae45a9ecff6de37f02cb641a0ff5fd72f5d24ed4 Mon Sep 17 00:00:00 2001 From: janEbert Date: Wed, 7 Jun 2023 17:37:31 +0200 Subject: [PATCH 114/122] Fix decoder-only and no-mask-tokens seq lengths --- megatron/data/ul2_dataset.py | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/megatron/data/ul2_dataset.py b/megatron/data/ul2_dataset.py index 068487d23..a89b020b0 100644 --- a/megatron/data/ul2_dataset.py +++ b/megatron/data/ul2_dataset.py @@ -324,6 +324,7 @@ def build_training_sample(sample, target_seq_length, eos_id: end of generation id sentinel_tokens: unique value to be substituted for every replaced span """ + add_mask_tokens = sentinel_tokens is not None # Denoiser selection denoiser = denoisers[denoiser_index] @@ -344,7 +345,11 @@ def build_training_sample(sample, target_seq_length, tokens = [cls_id] + tokens max_num_tokens = target_seq_length - if is_decoder_only(model_type) and denoiser != 'S': + if ( + is_decoder_only(model_type) + and denoiser != 'S' + and add_mask_tokens + ): # Keep space for repeated `extra_id` tokens; not the most data # efficient since we calculate this based on the maximum number # of possible `extra_id` tokens. @@ -352,10 +357,25 @@ def build_training_sample(sample, target_seq_length, truncated = len(tokens) > safe_max_seq_len tokens = tokens[:safe_max_seq_len] else: - # If we are S-denoising, we know only one `extra_id` token is - # going to be added. - if is_decoder_only(model_type) and denoiser == 'S': - max_num_tokens -= 1 + # If we are S-denoising, we know three tokens are going to be + # added: `bos`, `sep`, and `eos`. Same when not adding mask + # tokens. + if ( + is_decoder_only(model_type) and denoiser == 'S' + or not add_mask_tokens + ): + max_num_tokens -= 3 + + # If we have a decoder-only model and do not add mask tokens, we + # basically duplicate the sequence. 
So cut the maximum length in + # half. + if ( + is_decoder_only(model_type) + and denoiser != 'S' + and not add_mask_tokens + ): + max_num_tokens = max_num_tokens // 2 + # Truncate to `target_sequence_length`. truncated = len(tokens) > max_num_tokens tokens = tokens[:max_num_tokens] From 0c91b960f3a46b486b61739b6647d41434b76497 Mon Sep 17 00:00:00 2001 From: janEbert Date: Wed, 7 Jun 2023 17:37:41 +0200 Subject: [PATCH 115/122] Omit second objective token if without mask tokens That is, the reproduced objective token. --- megatron/data/ul2_dataset.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/megatron/data/ul2_dataset.py b/megatron/data/ul2_dataset.py index a89b020b0..ce3e91dc9 100644 --- a/megatron/data/ul2_dataset.py +++ b/megatron/data/ul2_dataset.py @@ -432,6 +432,11 @@ def build_training_sample(sample, target_seq_length, # Move BOS token to start of sequence. tokens_dec_in = tokens_dec_in[1:] + if not add_mask_tokens: + # Do not reproduce objective token when not using masking + # tokens. + tokens_dec_in = tokens_dec_in[1:] + labels = labels[1:] tokens = ( [bos_id] + tokens_enc From 0c246c46740af9f6f30203682d9857f6197063ee Mon Sep 17 00:00:00 2001 From: janEbert Date: Wed, 7 Jun 2023 17:38:31 +0200 Subject: [PATCH 116/122] Fix NumPy deprecations --- megatron/data/indexed_dataset.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/megatron/data/indexed_dataset.py b/megatron/data/indexed_dataset.py index d0d312544..0676e697f 100644 --- a/megatron/data/indexed_dataset.py +++ b/megatron/data/indexed_dataset.py @@ -99,8 +99,8 @@ def write_longs(f, a): 3: np.int16, 4: np.int32, 5: np.int64, - 6: np.float, - 7: np.double, + 7: np.single, + 6: np.double, 8: np.uint16 } @@ -273,7 +273,7 @@ class IndexedDatasetBuilder(object): np.int16: 2, np.int32: 4, np.int64: 8, - np.float: 4, + np.single: 4, np.double: 8 } From 7ce8635052a3937200a9e59bf700add7208ab73c Mon Sep 17 00:00:00 2001 From: janEbert Date: Mon, 26 Jun 2023 09:49:05 +0200 Subject: [PATCH 117/122] Fix supplied arguments Was missing `max_seq_length_dec`. --- megatron/data/dataset_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/megatron/data/dataset_utils.py b/megatron/data/dataset_utils.py index 383f676cf..e3e528e74 100644 --- a/megatron/data/dataset_utils.py +++ b/megatron/data/dataset_utils.py @@ -550,7 +550,8 @@ def build_train_valid_test_datasets(data_prefix, data_impl, splits_string, prefixes[i], data_impl, splits_string, datasets_train_valid_test_num_samples[i], max_seq_length, masked_lm_prob, short_seq_prob, - seed, skip_warmup, binary_head, dataset_type=dataset_type) + seed, skip_warmup, binary_head, max_seq_length_dec, + dataset_type=dataset_type) if train_ds: train_datasets.append(train_ds) if valid_ds: From 7290181c97f89dfcd2ea8e54d85f21b6f88477c6 Mon Sep 17 00:00:00 2001 From: janEbert Date: Mon, 26 Jun 2023 09:55:43 +0200 Subject: [PATCH 118/122] Do not add separator if S-denoising This was already the case for encoder-decoders, but is now also the case for decoder-only models. --- megatron/data/ul2_dataset.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/megatron/data/ul2_dataset.py b/megatron/data/ul2_dataset.py index ce3e91dc9..eda4130d2 100644 --- a/megatron/data/ul2_dataset.py +++ b/megatron/data/ul2_dataset.py @@ -437,10 +437,13 @@ def build_training_sample(sample, target_seq_length, # tokens. tokens_dec_in = tokens_dec_in[1:] labels = labels[1:] + + # Do not add separator token if S-denoising. 
+ separator = [sep_id] if denoiser != 'S' else [] tokens = ( [bos_id] + tokens_enc - + [sep_id] + + separator + tokens_dec_in ) @@ -453,7 +456,7 @@ def build_training_sample(sample, target_seq_length, tokens = np.array(tokens + filler, dtype=np.int64) labels = np.array(( tokens_enc - + [sep_id] + + separator + labels + filler ), dtype=np.int64) From 628d847beadb489a26e180aee479a1c207f1337a Mon Sep 17 00:00:00 2001 From: janEbert Date: Fri, 12 May 2023 16:25:08 +0200 Subject: [PATCH 119/122] Fix caching error --- megatron/model/positional_embeddings.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/megatron/model/positional_embeddings.py b/megatron/model/positional_embeddings.py index 22d1f9fa2..3ca68ef9e 100644 --- a/megatron/model/positional_embeddings.py +++ b/megatron/model/positional_embeddings.py @@ -84,28 +84,28 @@ def __init__(self, head_dim, freq_base=10000, scale_base=512, gamma=0.4, precisi def forward(self, x, seq_dim=1, seq_len=None): if seq_len is None: seq_len = x.shape[seq_dim] + scale = ( + self.scale + ** ( + torch.arange(0, seq_len, 1) - seq_len // 2 + ).to(self.scale).div(self.scale_base)[:, None] + ) + if ( self.max_seq_len_cached is None or (seq_len > self.max_seq_len_cached) ): self.max_seq_len_cached = seq_len - scale = ( - self.scale - ** ( - torch.arange(0, seq_len, 1) - seq_len // 2 - ).to(self.scale).div(self.scale_base)[:, None] - ) cos, sin = fixed_pos_embedding(scale, self.freq_base) self.cos_cached = cos self.sin_cached = sin - self.scale_cached = scale if self.precision == torch.bfloat16: self.cos_cached = self.cos_cached.bfloat16() self.sin_cached = self.sin_cached.bfloat16() return ( self.cos_cached[:seq_len], self.sin_cached[:seq_len], - self.scale_cached[:seq_len], + scale, ) From 9c727e7b63b7523b9cdcf9ab52d23cde3aba7722 Mon Sep 17 00:00:00 2001 From: janEbert Date: Thu, 29 Jun 2023 15:09:58 +0200 Subject: [PATCH 120/122] Fix number of labels calculation for decoder-only --- megatron/data/ul2_dataset.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/megatron/data/ul2_dataset.py b/megatron/data/ul2_dataset.py index eda4130d2..a438bbbf5 100644 --- a/megatron/data/ul2_dataset.py +++ b/megatron/data/ul2_dataset.py @@ -428,8 +428,6 @@ def build_training_sample(sample, target_seq_length, tokens_dec_in.append(eos_id) labels.append(eos_id) - num_labels = len(labels) - # Move BOS token to start of sequence. tokens_dec_in = tokens_dec_in[1:] if not add_mask_tokens: @@ -438,6 +436,8 @@ def build_training_sample(sample, target_seq_length, tokens_dec_in = tokens_dec_in[1:] labels = labels[1:] + num_labels = len(labels) + # Do not add separator token if S-denoising. separator = [sep_id] if denoiser != 'S' else [] tokens = ( From 4ffa95191e64c9aa47d2aa1e948a3c56dd9e3ec6 Mon Sep 17 00:00:00 2001 From: janEbert Date: Thu, 29 Jun 2023 15:10:55 +0200 Subject: [PATCH 121/122] Do not automatically add token when packing This also fixes problems with decoder-only attention masks. --- megatron/data/t5_dataset.py | 81 ++---------------------------------- megatron/data/ul2_dataset.py | 36 +--------------- 2 files changed, 4 insertions(+), 113 deletions(-) diff --git a/megatron/data/t5_dataset.py b/megatron/data/t5_dataset.py index c1b10bb70..84d43fb7c 100644 --- a/megatron/data/t5_dataset.py +++ b/megatron/data/t5_dataset.py @@ -161,7 +161,6 @@ def _pack_samples(self, np_rng, idx): prev_len, prev_len_dec, self.pad_id, - self.eos_id, ) if maybe_lens is None: # We are exceeding our sequence length already. 
@@ -476,7 +475,6 @@ def update_samples_dict( prev_len, prev_len_dec, pad_id, - eos_id, ): _remove_padding(result_sample, pad_id) @@ -484,55 +482,17 @@ def update_samples_dict( len_dec = len(result_sample['text_dec']) if ( - ( - prev_len - + len_enc - + int(result_sample['text_enc'][-1] != eos_id) - ) > max_seq_len - or ( - prev_len_dec - + len_dec - + int(result_sample['text_dec'][-1] != eos_id) - ) > max_seq_len_dec + prev_len + len_enc > max_seq_len + or prev_len_dec + len_dec > max_seq_len_dec ): return None - eos_added = { - 'text_enc': False, - 'text_dec': False, - 'labels': False, - } - for (key, is_enc) in zip( - ['text_enc', 'text_dec', 'labels'], - [True, False, False], - ): + for key in ['text_enc', 'text_dec', 'labels']: curr_sample = result_sample[key] offset, length = get_lens( key, prev_len, prev_len_dec, len_enc, len_dec) samples_dict[key][offset:offset + length] = curr_sample - # Add EOS token if not present. - if ( - curr_sample[-1] != eos_id - or key == 'labels' and eos_added['text_dec'] - ): - samples_dict[key][offset + length] = eos_id - eos_added[key] = True - - need_extras = { - 'loss_mask': False, - 'enc_mask': False, - 'dec_mask': False, - 'enc_dec_mask': [False, False], - } - if eos_added['text_enc']: - need_extras['enc_mask'] = True - need_extras['enc_dec_mask'][1] = True - if eos_added['text_dec']: - need_extras['loss_mask'] = True - need_extras['dec_mask'] = True - need_extras['enc_dec_mask'][0] = True - samples_dict['loss_mask'][ prev_len_dec:prev_len_dec + len_dec, ] += result_sample['loss_mask'] @@ -549,42 +509,7 @@ def update_samples_dict( prev_len:prev_len + len_enc, ] += result_sample['enc_dec_mask'] - if need_extras['loss_mask']: - samples_dict['loss_mask'][prev_len_dec + len_dec] = 1 - - for key in ['enc_mask', 'dec_mask']: - if need_extras[key]: - all_samples = samples_dict[key] - offset, length = get_lens( - key, prev_len, prev_len_dec, len_enc, len_dec) - all_samples[ - offset + length, - offset:offset + length, - ] = 1 - all_samples[ - offset:offset + length, - offset + length, - ] = 1 - - if need_extras['enc_dec_mask'][0] or need_extras['enc_dec_mask'][1]: - all_samples = samples_dict['enc_dec_mask'] - if need_extras['enc_dec_mask'][0]: - all_samples[ - prev_len_dec + len_dec, - prev_len:prev_len + len_enc, - ] = 1 - elif need_extras['enc_dec_mask'][1]: - all_samples[ - prev_len_dec:prev_len_dec + len_dec, - prev_len + len_enc, - ] = 1 samples_dict['truncated'] += result_sample['truncated'] - - if eos_added['text_enc']: - len_enc += 1 - if eos_added['text_dec']: - len_dec += 1 - return len_enc, len_dec diff --git a/megatron/data/ul2_dataset.py b/megatron/data/ul2_dataset.py index a438bbbf5..e8d3862ed 100644 --- a/megatron/data/ul2_dataset.py +++ b/megatron/data/ul2_dataset.py @@ -239,7 +239,6 @@ def _pack_samples(self, np_rng, idx, denoiser_index): self.max_seq_length, prev_len, self.pad_id, - self.eos_id, ) else: maybe_lens = update_samples_dict( @@ -250,7 +249,6 @@ def _pack_samples(self, np_rng, idx, denoiser_index): prev_len, prev_len_dec, self.pad_id, - self.eos_id, ) if maybe_lens is None: # We are exceeding our sequence length already. 
@@ -537,33 +535,17 @@ def update_samples_dict_decoder_only( max_seq_len, prev_len, pad_id, - eos_id, ): _remove_padding(result_sample, pad_id) len_enc = len(result_sample['text']) - if ( - ( - prev_len - + len_enc - + int(result_sample['text'][-1] != eos_id) - ) > max_seq_len - ): + if prev_len + len_enc > max_seq_len: return None - eos_added = False for key in ['text', 'labels']: curr_sample = result_sample[key] samples_dict[key][prev_len:prev_len + len_enc] = curr_sample - # Add EOS token if not present. - if ( - curr_sample[-1] != eos_id - or key == 'labels' and eos_added - ): - samples_dict[key][prev_len + len_enc] = eos_id - eos_added = True - samples_dict['loss_mask'][ prev_len:prev_len + len_enc, ] += result_sample['loss_mask'] @@ -572,21 +554,5 @@ def update_samples_dict_decoder_only( prev_len:prev_len + len_enc, ] += result_sample['dec_mask'] - if eos_added: - samples_dict['loss_mask'][prev_len + len_enc] = 1 - - all_samples = samples_dict['dec_mask'] - all_samples[ - prev_len + len_enc, - prev_len:prev_len + len_enc, - ] = 1 - all_samples[ - prev_len:prev_len + len_enc, - prev_len + len_enc, - ] = 1 - - len_enc += 1 - samples_dict['truncated'] += result_sample['truncated'] - return len_enc From ff5787ee101b8aff78f95fdd9e0be75a1da3671f Mon Sep 17 00:00:00 2001 From: janEbert Date: Thu, 29 Jun 2023 15:22:22 +0200 Subject: [PATCH 122/122] Allow silently ignoring causal attention mask When using the custom fused softmax kernel. --- megatron/model/fused_softmax.py | 1 - 1 file changed, 1 deletion(-) diff --git a/megatron/model/fused_softmax.py b/megatron/model/fused_softmax.py index 973c2a384..5fd2ceab4 100644 --- a/megatron/model/fused_softmax.py +++ b/megatron/model/fused_softmax.py @@ -188,7 +188,6 @@ def forward_fused_softmax(self, input, mask): if self.attn_mask_type == AttnMaskType.causal: assert sq == sk, "causal mask is only for self attention" - assert mask is None, "Mask is silently ignored due to the use of a custom kernel" # input is 3D tensor (attn_batches, sq, sk) input = input.view(-1, sq, sk)
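
The last patch above removes the assertion that no mask may be passed alongside causal attention, so a mask handed to the fused-kernel path is now silently ignored. As a quick sanity check of why that is harmless, here is a small plain-PyTorch sketch (illustrative only, not part of the patch series; the shapes and the helper name are made up): an externally supplied causal mask selects exactly the same positions as a causal mask built internally from the sequence length, so dropping the external one does not change the softmax result.

    import torch

    def masked_softmax(scores, mask):
        # True entries in `mask` mark positions that must not be attended to.
        return torch.softmax(scores.masked_fill(mask, float("-inf")), dim=-1)

    batch, heads, sq = 2, 4, 5
    scores = torch.randn(batch, heads, sq, sq)  # [b, np, sq, sk]

    # Causal mask as a data loader might ship it with the batch.
    external_mask = torch.triu(torch.ones(sq, sq), diagonal=1).bool()

    # Causal mask a softmax module could build itself from the sequence length.
    internal_mask = ~torch.tril(torch.ones(sq, sq)).bool()

    # Both encode the same causal structure, so ignoring the externally
    # passed mask and masking internally gives identical attention weights.
    assert torch.equal(external_mask, internal_mask)
    assert torch.allclose(masked_softmax(scores, external_mask),
                          masked_softmax(scores, internal_mask))

The equivalence only holds when the supplied mask is exactly the plain causal mask; a mask that additionally encodes padding or sample packing would select different positions and give a different result.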