Draft

Commits (42)
fd7267f
drgrpo
Oct 14, 2025
cb6f7a9
get vllm logps
Oct 14, 2025
d6acc63
Update _wandb.py
jacklanchantin Oct 14, 2025
7b72df9
remove beta check
Oct 14, 2025
7fc3b2f
Merge branch 'jacklanchantin/drgrpo' of github.com:facebookresearch/f…
Oct 14, 2025
502fa69
format
Oct 14, 2025
79382d3
revert
Oct 14, 2025
97e8dca
add importance sampling correction
Oct 16, 2025
54c9d98
dont run ref model forward if beta==0
Oct 20, 2025
acb0840
add tis ratio clamp = 2
Oct 20, 2025
50d21dd
clean up
Oct 20, 2025
ccfa63b
configs
Oct 21, 2025
bb49312
clean up
Oct 22, 2025
bd4b073
default
Oct 22, 2025
6919a4c
var name
Oct 22, 2025
d910891
var name
Oct 22, 2025
b762625
only use tis_imp_ratio_cap
Oct 22, 2025
5dff68a
batched inputs
Oct 23, 2025
cce97ce
use tis_drgrpo files
Oct 23, 2025
178fb69
size
Oct 24, 2025
536ce2b
match tis_grpo
Oct 24, 2025
a036e92
fix batching/microbatching bugs
Oct 24, 2025
ca043a5
black/isort
Oct 24, 2025
55dc39a
Merge branch 'online_training' of github.com:facebookresearch/fairseq…
Oct 29, 2025
bdf6e4b
revert qwen card
Oct 29, 2025
cdbec3c
bypass reference_model if None
Oct 29, 2025
2645498
add SelfAugmentingExtractor for llm judge
Oct 31, 2025
7011bf9
sa judge
Nov 1, 2025
bfede6f
new metrics
Nov 3, 2025
9539d29
.
Nov 5, 2025
1f8bc99
ppl
Nov 6, 2025
b59f10b
clip outputs
jacklanchantin Nov 12, 2025
d500503
.
jacklanchantin Nov 13, 2025
b152d8b
fix rank=0 bug
jacklanchantin Nov 13, 2025
634a039
grpo
jacklanchantin Nov 13, 2025
071ff0f
remove ppl
jacklanchantin Nov 13, 2025
cd5ee91
remove ppl
jacklanchantin Nov 13, 2025
5cc0610
tokenizer
jacklanchantin Nov 13, 2025
7f7b50d
set VLLM_ALLOW_INSECURE_SERIALIZATION=1 for newer vllm versions and a…
lydiadli Oct 29, 2025
14d2571
comment out unused
jacklanchantin Nov 13, 2025
fcb24b2
logging
jacklanchantin Nov 14, 2025
8d40e31
logging
jacklanchantin Nov 14, 2025
8 changes: 5 additions & 3 deletions src/fairseq2/assets/cards/models/llama.yaml
@@ -66,15 +66,17 @@ num_shards: 8

name: llama3
model_family: llama
checkpoint: "https://ai.meta.com/llama/;gated=true"
tokenizer: "https://ai.meta.com/llama/;gated=true"
checkpoint: "/datasets/pretrained-llms/Llama-3.1-8B/"
tokenizer: "/datasets/pretrained-llms/Llama-3.1-8B/"
tokenizer_family: llama
use_v2_tokenizer: true

---

name: llama3_instruct
base: llama3
checkpoint: "/datasets/pretrained-llms/Llama-3.1-8B-Instruct/"
tokenizer: "/datasets/pretrained-llms/Llama-3.1-8B-Instruct/"
use_eot: true # instruct tokenizer to use EOT instead of EOS

---
@@ -168,4 +170,4 @@ model_arch: llama3_1_8b
checkpoint: "hg://deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
tokenizer: "hg://deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
tokenizer_family: llama
use_v2_tokenizer: true
use_v2_tokenizer: true
4 changes: 2 additions & 2 deletions src/fairseq2/assets/cards/models/qwen.yaml
@@ -139,8 +139,8 @@ use_im_end: true
name: qwen3_8b_base
model_family: qwen
model_arch: qwen3_8b
checkpoint: "hg://qwen/qwen3-8b-base"
tokenizer: "hg://qwen/qwen3-8b-base"
checkpoint: "/checkpoint/data/jacklanchantin/pretrained-llms/Qwen3-8B-Base/"
tokenizer: "/checkpoint/data/jacklanchantin/pretrained-llms/Qwen3-8B-Base/"
tokenizer_family: qwen

---
2 changes: 1 addition & 1 deletion src/fairseq2/recipes/_trainer.py
@@ -189,7 +189,7 @@ def __init__(
device_stat_tracker: DeviceStatTracker,
wall_watch: Stopwatch,
progress_reporter: ProgressReporter,
fp16_loss_scale: tuple[float, float] = (128.0, 0.0001),
fp16_loss_scale: tuple[float, float] = (65536, 0.0001),
no_sync_grad_accumulation: bool = False,
max_grad_norm: float | None = None,
grad_check: bool = False,
2 changes: 1 addition & 1 deletion src/fairseq2/recipes/config.py
@@ -165,7 +165,7 @@ class TrainerSection:
max_grad_norm: float | None = None
"""The maximum gradient norm. If ``None``, no clipping will be applied."""

fp16_loss_scale: tuple[float, float] = (128.0, 0.0001)
fp16_loss_scale: tuple[float, float] = (65536, 0.0001)
"""The initial and minimum loss scale for fp16 training."""

gc_every_n_steps: int | None = None
6 changes: 6 additions & 0 deletions src/fairseq2/recipes/lm/__init__.py
@@ -1,3 +1,3 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
@@ -54,6 +54,12 @@
from fairseq2.recipes.lm._online_finetune._generative_judge import (
J1PointwiseExtractorHandler as J1PointwiseExtractorHandler,
)
from fairseq2.recipes.lm._online_finetune._generative_judge import (
SelfAugmentingExtractor as SelfAugmentingExtractor,
)
from fairseq2.recipes.lm._online_finetune._generative_judge import (
SelfAugmentingExtractorHandler as SelfAugmentingExtractorHandler,
)
from fairseq2.recipes.lm._online_finetune._generative_judge import (
JudgmentExtractorHandler as JudgmentExtractorHandler,
)
59 changes: 57 additions & 2 deletions src/fairseq2/recipes/lm/_online_finetune/_common.py
@@ -1,3 +1,3 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
@@ -361,6 +361,7 @@

def get_vllm_logprobs(
vllm_outputs: List[RequestOutput],
model_logps: Tensor,
gangs,
rollout_start_end: tuple[int, int] | None = None,
):
@@ -404,7 +405,10 @@
padded = torch.zeros(len(sequences), max_len)
for i, t in enumerate(sequences):
padded[i, : t.size(0)] = t


# clip outputs to be same size as model_logps
if padded.size() != model_logps.size():
padded = padded[:, : model_logps.size(1)]
return padded
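
A small shape example of the padding-and-clip logic above, using hypothetical tensors (the variable names, lengths, and values are illustrative only, not taken from the recipe):

import torch

# Two rollouts whose vLLM logprob vectors hold 6 and 4 entries, while the
# trainer's forward pass scored only 5 positions per row.
vllm_logps = [torch.randn(6), torch.randn(4)]
model_logps = torch.zeros(2, 5)

padded = torch.zeros(len(vllm_logps), max(t.size(0) for t in vllm_logps))
for i, t in enumerate(vllm_logps):
    padded[i, : t.size(0)] = t  # right-pad with zeros, as in get_vllm_logprobs

if padded.size() != model_logps.size():
    padded = padded[:, : model_logps.size(1)]  # drop the extra trailing column

assert padded.shape == model_logps.shape  # both are now (2, 5)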


@@ -459,6 +463,46 @@
return rollout_lengths


def get_think_rollout_lengths(rollouts: List[SequenceData]):
"""Get the lengths of tokens before the </think> tag in rollouts.

This function calculates the approximate number of tokens generated before
the </think> closing tag in each rollout. It uses a proportional approximation
based on character positions to estimate token counts.

Args:
rollouts: List of SequenceData containing rollout outputs

Returns:
List of token lengths before </think> tag for rollouts that contain the tag
"""
think_rollout_lengths = []
think_tag = "</think>"

for rollout in rollouts:
for sample in rollout.outputs:
rollout_text = sample.text
if think_tag in rollout_text:
# Find the position of </think> in the text
think_end_pos = rollout_text.find(think_tag) + len(think_tag)
# Count tokens up to and including </think>
# We need to find how many tokens correspond to the text before </think>
# Since we have token_ids, we'll approximate by finding the proportion
text_before_think = rollout_text[:think_end_pos]
total_text = rollout_text
total_tokens = len(sample.token_ids)
# Approximate token count proportionally (rough estimate)
# A better approach would be to tokenize text_before_think, but we use approximation
think_token_length = (
int((len(text_before_think) / len(total_text)) * total_tokens)
if len(total_text) > 0
else 0
)
think_rollout_lengths.append(think_token_length)

return think_rollout_lengths
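
A quick numeric sanity check of the proportional approximation above, with made-up values:

# Suppose a rollout is 200 characters long, the text up to and including
# "</think>" covers the first 120 characters, and vLLM emitted 50 tokens.
total_chars, think_chars, total_tokens = 200, 120, 50
think_token_length = int((think_chars / total_chars) * total_tokens)  # 30
# This is only a rough character-proportional estimate; tokenizing
# rollout_text[:think_end_pos] directly would be exact but costs an extra
# tokenizer call per rollout.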


class StatefulRolloutBag:
"""A stateful container for managing and reusing model rollouts across multiple micro-batches.

@@ -559,6 +603,13 @@
metric_bag.get(Mean, "avg_rollout_length").update(avg_rollout_length, weight=1)


@torch.inference_mode()
def update_avg_think_rollout_length(metric_bag: MetricBag, avg_think_rollout_length):
metric_bag.get(Mean, "avg_think_rollout_length").update(
avg_think_rollout_length, weight=1
)


@torch.inference_mode()
def update_avg_reward_len_norm(metric_bag: MetricBag, avg_reward_len_norm):
metric_bag.get(Mean, "avg_reward_len_norm").update(avg_reward_len_norm, weight=1)
@@ -599,7 +650,7 @@


@torch.inference_mode()
def update_grpo_loss(metric_bag: MetricBag, batch: PromptBatch, loss: Tensor) -> None:
def update_grpo_loss(metric_bag: MetricBag, batch: PromptBatch, loss: Tensor, tis_imp_ratio: Tensor) -> None:
"""Update the GRPO loss metric.

:param batch:
Expand All @@ -611,6 +662,10 @@
loss / batch.batch_size, weight=batch.batch_size
)

metric_bag.get(Mean, "tis_imp_ratio").update(tis_imp_ratio)




def compute_reference_logps(
gangs: Gangs,
108 changes: 103 additions & 5 deletions src/fairseq2/recipes/lm/_online_finetune/_generative_judge.py
@@ -1,12 +1,12 @@
POINTWISE_J1_PROMPT = """
You are given a user question and a response from an AI assistant. Your task is to act as an impartial judge and evaluate how well the response fulfills the user's instructions. You will be shown multiple responses to the same prompt, but only one at a time. Evaluate each response independently.
You are given a user question and a response from an AI assistant. Your task is to act as an impartial judge and evaluate how well the response fulfills the user's instructions. You will be shown multiple responses to the same prompt, but only one at a time. Evaluate each response independently.

Think carefully about how to assess the quality of the response, and enclose your reasoning within <think> and </think> tags. Your reasoning should include your evaluation criteria, a clear understanding of what an ideal response would look like for this particular question, and a concrete example of such an ideal or reference answer if possible. Then compare the assistant's response to your ideal or reference answer, explaining how it aligns with or deviates from your expectations. Be specific and avoid vague or overly general judgments. Remain as objective as possible.
Think carefully about how to assess the quality of the response, and enclose your reasoning within <think> and </think> tags. Your reasoning should include your evaluation criteria, a clear understanding of what an ideal response would look like for this particular question, and a concrete example of such an ideal or reference answer if possible. Then compare the assistant's response to your ideal or reference answer, explaining how it aligns with or deviates from your expectations. Be specific and avoid vague or overly general judgments. Remain as objective as possible.

Finally, assign the assistant's response a score from 0 to 10, using either an integer or a decimal with up to 0.1 precision. A higher score should indicate a higher-quality response. Enclose the score within <score> and </score> tags.

Format your output like this:
<think> your_thinking_process </think>
Format your output like this:
<think> your_thinking_process </think>
<score> your_score </score>

Below are the user's question and the assistant's response:
@@ -71,6 +71,26 @@
[The End of Assistant B's Answer]
"""

SELF_AUGMENTING_PROMPT = """
You are given a ground truth text, and a generated text from an AI assistant. Your task is to act as an impartial judge and evaluate how well the response matches the ground truth text. It doesn't have to match word for word, but it should be very similar.

Think carefully about how to assess how well the generated text matches the ground truth. Your reasoning should include your evaluation criteria.

Finally, assign the assistant's generation a binary score, either 0 or 1. A 0 indicates that the generated text does not match the ground truth text, and a 1 indicates that it matches well.

Format your score as \\boxed{{SCORE}} where SCORE is either 0 or 1.

Below are the ground truth text and the assistant's Generation:

[Start of Ground Truth Text]
{ground_truth}
[End of Ground Truth Text]

[Start of Assistant's Generation]
{generation}
[End of Assistant's Generation]
"""


import re
from abc import ABC, abstractmethod
@@ -83,7 +103,7 @@

class JudgmentExtractorHandler(ABC):
@abstractmethod
def create(self): ...

Check failure on line 106 (GitHub Actions / Lint Python / Lint): Function is missing a return type annotation

@property
@abstractmethod
@@ -113,7 +133,7 @@
def prompt(self) -> str: ...

@abstractmethod
def format_prompt(self, prompt_text, **kwargs: Any) -> str: ...

Check failure on line 136 (GitHub Actions / Lint Python / Lint): Function is missing a type annotation for one or more arguments

"""
Format the prompt text and additional arguments into a string suitable for input to the reward model.
@@ -126,7 +146,7 @@
"""

@abstractmethod
def extract(self, generation) -> float | str: ...

Check failure on line 149 (GitHub Actions / Lint Python / Lint): Function is missing a type annotation for one or more arguments

"""
Extract the final scalar reward score from the model's response.
@@ -145,7 +165,7 @@
"""

@abstractmethod
def aggregate(self, judgments) -> float | str: ...

Check failure on line 168 (GitHub Actions / Lint Python / Lint): Function is missing a type annotation for one or more arguments

"""
Aggregate multiple responses (judgments) from the reward model into a single value.
@@ -161,28 +181,28 @@


class GeneralVerifierExtractorHandler(JudgmentExtractorHandler):
def __init__(self):

Check failure on line 184 (GitHub Actions / Lint Python / Lint): Function is missing a return type annotation
pass

@override
def create(self):

Check failure on line 188 (GitHub Actions / Lint Python / Lint): Function is missing a return type annotation
return GeneralVerifierExtractor()

@property
@override
def name(self):

Check failure on line 193 (GitHub Actions / Lint Python / Lint): Function is missing a return type annotation
return "general_verifier_extractor"

@property
@override
def config_kls(self):

Check failure on line 198 (GitHub Actions / Lint Python / Lint): Function is missing a return type annotation
return None


class GeneralVerifierExtractor(JudgmentExtractor):
def __init__(self):

Check failure on line 203 (GitHub Actions / Lint Python / Lint): Function is missing a return type annotation
try:
from math_verify import parse

Check failure on line 205 (GitHub Actions / Lint Python / Lint): Cannot find implementation or library stub for module named "math_verify"
from math_verify.parser import (
ExprExtractionConfig,
LatexExtractionConfig,
@@ -248,6 +268,82 @@
return round(avg_score / len(judgments), 4)


class SelfAugmentingExtractorHandler(JudgmentExtractorHandler):
def __init__(self):
pass

@override
def create(self):
return SelfAugmentingExtractor()

@property
@override
def name(self):
return "self_augmenting_extractor"

@property
@override
def config_kls(self):
return None


class SelfAugmentingExtractor(JudgmentExtractor):
def __init__(
self,
):
pass

@override
def prompt(self):
return SELF_AUGMENTING_PROMPT


def remove_think_tags(self, rollout_text):
tag = "</think>"
count = rollout_text.count(tag)
if count == 1:
# Find the position after the tag and return everything after it
index = rollout_text.find(tag) + len(tag)
return rollout_text[index:]
else:
return "" # set rollout to empty string if it doesn't contain thought or has multiple

@override
def format_prompt(self, tokenizer, prompt_text, rollout_text, reference_answer, dp_gangs):
# if dp_gangs.rank == 0
# breakpoint()
# dp_gangs.root.barrier()

rollout_text = self.remove_think_tags(rollout_text)

content = self.prompt().format(ground_truth=reference_answer, generation=rollout_text)

# log.info(f"Judge prompt = {content}")
wrapped_text = [{"role": "user", "content": content}]
chat_str = tokenizer.apply_chat_template(
wrapped_text, tokenize=False, add_generation_prompt=True
)
return chat_str

@override
def extract(self, generation):
# pattern = r'\\boxed\{(-?\d+)\}'
pattern = r'\\boxed\{([01])\}'
match = re.search(pattern, generation)
if match:
score = float(match.group(1))
else:
score = 0.0
return score

@override
def aggregate(self, judgments):
avg_score = 0.0
for score in judgments:
avg_score += score

return round(avg_score / len(judgments), 4)
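
A short usage sketch of the new extractor; the judge outputs below are made up for illustration:

extractor = SelfAugmentingExtractor()

# extract() looks for a \boxed{0} or \boxed{1} verdict in the judge's output
# and falls back to 0.0 when no boxed score is found.
scores = [
    extractor.extract(r"The generation matches the ground truth. \boxed{1}"),
    extractor.extract("no boxed verdict here"),
]
print(extractor.aggregate(scores))  # 0.5

# remove_think_tags() strips everything up to and including </think>, so only
# the final answer is inserted into SELF_AUGMENTING_PROMPT and judged.
print(extractor.remove_think_tags("<think>scratch work</think>final answer"))
# -> "final answer"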

class J1PointwiseExtractorHandler(JudgmentExtractorHandler):
def __init__(self):
pass
@@ -268,7 +364,9 @@


class J1PointwiseExtractor(JudgmentExtractor):
def __init__(self):
def __init__(
self,
):
pass

@override