@@ -103,10 +103,13 @@ def convert_esm_te_to_hf(model_te: nn.Module, **config_kwargs) -> nn.Module:
         "micro_batch_size",
         "max_seq_length",
         "model_type",
+        "auto_map",
     ]
     for key in te_specific_keys:
         hf_config_dict.pop(key, None)
 
+    hf_config_dict["model_type"] = "esm"
+
     hf_config = EsmConfig(**hf_config_dict, **config_kwargs)
 
     with init_empty_weights():
@@ -149,11 +152,11 @@ def _pack_qkv_weight(ctx: io.TransformCTX, query, key, value):
     """Pad the embedding layer to the new input dimension."""
     concat_weights = torch.cat((query, key, value), dim=0)
     input_shape = concat_weights.size()
-    np = ctx.target.config.num_attention_heads
+    num_heads = ctx.target.config.num_attention_heads
     # transpose weights
     # [sequence length, batch size, num_splits_model_parallel * attention head size * #attention heads]
     # --> [sequence length, batch size, attention head size * num_splits_model_parallel * #attention heads]
-    concat_weights = concat_weights.view(3, np, -1, query.size()[-1])
+    concat_weights = concat_weights.view(3, num_heads, -1, query.size()[-1])
     concat_weights = concat_weights.transpose(0, 1).contiguous()
     concat_weights = concat_weights.view(*input_shape)
     return concat_weights
@@ -171,11 +174,11 @@ def _pack_qkv_bias(ctx: io.TransformCTX, query, key, value):
     """Pad the embedding layer to the new input dimension."""
    concat_biases = torch.cat((query, key, value), dim=0)
     input_shape = concat_biases.size()
-    np = ctx.target.config.num_attention_heads
+    num_heads = ctx.target.config.num_attention_heads
     # transpose biases
     # [num_splits_model_parallel * attention head size * #attention heads]
     # --> [attention head size * num_splits_model_parallel * #attention heads]
-    concat_biases = concat_biases.view(3, np, -1)
+    concat_biases = concat_biases.view(3, num_heads, -1)
     concat_biases = concat_biases.transpose(0, 1).contiguous()
     concat_biases = concat_biases.view(*input_shape)
     return concat_biases
@@ -190,26 +193,20 @@ def _pack_qkv_bias(ctx: io.TransformCTX, query, key, value):
     ),
 )
 def _unpack_qkv_weight(ctx: io.TransformCTX, qkv_weight):
-    """Unpack the fused QKV weight into separate query, key, and value weights."""
-    np = ctx.source.config.num_attention_heads
-
-    # Reverse the packing transformation
-    # First, reshape to separate the interleaved Q, K, V
-    # [attention head size * num_splits_model_parallel * #attention heads]
-    # --> [num_splits_model_parallel * attention head size * #attention heads]
-    qkv_weight = qkv_weight.view(np, 3, -1, qkv_weight.size()[-1])  # Output: [num_heads, 3, head_dim, vocab_size]
-    qkv_weight = qkv_weight.transpose(0, 1).contiguous()  # Output: [3, num_heads, head_dim, vocab_size]
-
-    # Split into Q, K, V directly from the transposed tensor
-    # qkv_weight shape: [3, num_heads, head_dim, input_dim]
-    query = qkv_weight[0]  # [num_heads, head_dim, input_dim]
-    key = qkv_weight[1]  # [num_heads, head_dim, input_dim]
-    value = qkv_weight[2]  # [num_heads, head_dim, input_dim]
-
-    # Reshape to match HF format: [total_head_dim, input_dim]
-    query = query.view(-1, query.size()[-1])  # [num_heads * head_dim, input_dim]
-    key = key.view(-1, key.size()[-1])  # [num_heads * head_dim, input_dim]
-    value = value.view(-1, value.size()[-1])  # [num_heads * head_dim, input_dim]
+    """Unpack fused QKV weights into separate [hidden_size, input_dim] tensors for query/key/value."""
+    num_heads = ctx.source.config.num_attention_heads
+    total_rows, input_dim = qkv_weight.size()  # size: [num_heads * 3 * head_dim, input_dim]
+    assert total_rows % (3 * num_heads) == 0, (
+        f"QKV weight rows {total_rows} not divisible by 3*num_heads {3 * num_heads}"
+    )
+    head_dim = total_rows // (3 * num_heads)
+
+    qkv_weight = qkv_weight.view(num_heads, 3, head_dim, input_dim).transpose(0, 1).contiguous()  # size: [3, num_heads, head_dim, input_dim]
+    query, key, value = qkv_weight[0], qkv_weight[1], qkv_weight[2]  # size: [num_heads, head_dim, input_dim]
+
+    query = query.reshape(-1, input_dim)  # size: [num_heads * head_dim, input_dim]
+    key = key.reshape(-1, input_dim)  # size: [num_heads * head_dim, input_dim]
+    value = value.reshape(-1, input_dim)  # size: [num_heads * head_dim, input_dim]
 
     return query, key, value
 
@@ -223,25 +220,19 @@ def _unpack_qkv_weight(ctx: io.TransformCTX, qkv_weight):
     ),
 )
 def _unpack_qkv_bias(ctx: io.TransformCTX, qkv_bias):
-    """Unpack the fused QKV bias into separate query, key, and value biases."""
-    np = ctx.source.config.num_attention_heads
+    """Unpack fused QKV biases into separate [hidden_size] tensors for query/key/value."""
+    num_heads = ctx.source.config.num_attention_heads
+    total_size = qkv_bias.size(0)  # size: [num_heads * 3 * head_dim]
+    assert total_size % (3 * num_heads) == 0, (
+        f"QKV bias size {total_size} not divisible by 3*num_heads {3 * num_heads}"
+    )
+    head_dim = total_size // (3 * num_heads)
 
-    # Reverse the packing transformation
-    # First, reshape to separate the interleaved Q, K, V
-    # [num_splits_model_parallel * attention head size * #attention heads]
-    # --> [attention head size * num_splits_model_parallel * #attention heads]
-    qkv_bias = qkv_bias.view(np, 3, -1)
-    qkv_bias = qkv_bias.transpose(0, 1).contiguous()
-
-    # Split into Q, K, V directly from the transposed tensor
-    # qkv_bias shape: [3, num_heads, head_dim]
-    query = qkv_bias[0]  # [num_heads, head_dim]
-    key = qkv_bias[1]  # [num_heads, head_dim]
-    value = qkv_bias[2]  # [num_heads, head_dim]
-
-    # Reshape to match HF format: [total_head_dim]
-    query = query.view(-1)  # [num_heads * head_dim]
-    key = key.view(-1)  # [num_heads * head_dim]
-    value = value.view(-1)  # [num_heads * head_dim]
+    qkv_bias = qkv_bias.view(num_heads, 3, head_dim).transpose(0, 1).contiguous()  # size: [3, num_heads, head_dim]
+    query, key, value = qkv_bias[0], qkv_bias[1], qkv_bias[2]  # size: [num_heads, head_dim]
+
+    query = query.reshape(-1)  # size: [num_heads * head_dim]
+    key = key.reshape(-1)  # size: [num_heads * head_dim]
+    value = value.reshape(-1)  # size: [num_heads * head_dim]
 
     return query, key, value
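A minimal round-trip sketch (not part of the change), assuming hypothetical values for num_heads, head_dim, and input_dim, showing that the view/transpose used in _unpack_qkv_weight reverses the per-head interleaving performed by _pack_qkv_weight (the real transforms additionally read these sizes from ctx.*.config):

import torch

# Illustrative dimensions only; any consistent values work.
num_heads, head_dim, input_dim = 4, 8, 16
query = torch.randn(num_heads * head_dim, input_dim)
key = torch.randn(num_heads * head_dim, input_dim)
value = torch.randn(num_heads * head_dim, input_dim)

# Pack: concatenate Q/K/V, then interleave rows per attention head,
# mirroring the view/transpose sequence in _pack_qkv_weight.
packed = torch.cat((query, key, value), dim=0)
packed = packed.view(3, num_heads, head_dim, input_dim).transpose(0, 1).contiguous()
packed = packed.view(3 * num_heads * head_dim, input_dim)

# Unpack: undo the interleaving, mirroring _unpack_qkv_weight.
unpacked = packed.view(num_heads, 3, head_dim, input_dim).transpose(0, 1).contiguous()
q2, k2, v2 = (t.reshape(-1, input_dim) for t in unpacked)

assert torch.equal(q2, query) and torch.equal(k2, key) and torch.equal(v2, value)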