@@ -461,6 +461,187 @@ def prepare_preference_batch(
         )
 
         return batch, is_bad_batch, reward_output
+
+class AceMathVerifierHandler(VLLMOutputRewardHandler):
+    def __init__(self):
+        pass
+
+    @override
+    def create(self, reward_model, reward_name, reward_config, gangs, context):
+        if reward_config.tokenizer is not None:
+            tokenizer = reward_config.tokenizer
+        else:
+            tokenizer = "nvidia/AceMath-7B-RM"
+
+        return AceMathVerifier(
+            gangs,
+            context,
+            reward_model,
+            reward_name=reward_name,
+            answer_key=reward_config.answer_key,
+            prompt_key=reward_config.prompt_key,
+            tokenizer=tokenizer,
+        )
+
+    @property
+    @override
+    def name(self):
+        return "acemath_verifier"
+
+    @property
+    @override
+    def config_kls(self):
+        return None
+
+class AceMathVerifier(VLLMOutputReward):
+    def __init__(
+        self,
+        gangs,
+        context,
+        reward_model,
+        reward_name,
+        answer_key,
+        prompt_key,
+        tokenizer,
+    ):
+        self.answer_key = answer_key
+        self.prompt_key = prompt_key
+        self._gangs = gangs
+        self._context = context
+        self.reward_model = reward_model
+        self.reward_name = reward_name
+        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer)
+
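+    # Renders a (prompt, rollout) pair as a system/user/assistant conversation using
+    # the reward model tokenizer's chat template; a leading BOS token, if present,
+    # is stripped so it is not duplicated when the reward model tokenizes the string
+    # again.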
+    def wrap_text(self, prompt_text, rollout_text):
+        wrapped_text = [
+            {"role": "system", "content": "Please reason step by step, and check your final answer within \\boxed{}."},
+            {"role": "user", "content": prompt_text},
+            {"role": "assistant", "content": rollout_text},
+        ]
+        chat_str = self.tokenizer.apply_chat_template(
+            wrapped_text, tokenize=False, add_generation_prompt=False
+        )
+        if self.tokenizer.bos_token is not None and chat_str.startswith(
+            self.tokenizer.bos_token
+        ):
+            chat_str = chat_str[len(self.tokenizer.bos_token) :]
+
+        return chat_str
+
+    @override
+    def process_rollouts(
+        self, vllm_outputs: list[RequestOutput], prompt_batch: PromptBatch
+    ):
+        vllm_inputs = []
+        batch_text = []
+        batch_tokens = []
+
+        if vllm_outputs is None:
+            vllm_outputs = [None] * len(prompt_batch.prompts)
+
+        text_prompts = prompt_batch.meta_info.get(self.prompt_key)
+        for i, (i_batch_request_output, prompt_text) in enumerate(
+            zip(vllm_outputs, text_prompts)
+        ):
+            rollouts_text = []
+            rollouts_tokens = []
+            for rollout_output in i_batch_request_output.outputs:
+                rollout_text = rollout_output.text
+                vllm_input = self.wrap_text(prompt_text, rollout_text)
+                vllm_inputs.append(vllm_input)
+                rollouts_text.append(rollout_output.text)
+                rollouts_tokens.append(rollout_output.token_ids)
+
+            batch_text.append(rollouts_text)
+            batch_tokens.append(rollouts_tokens)
+
+        batch_rewards = generate_rewards(
+            vllm_inputs, dp_gang=self._gangs.dp, vllm_model=self.reward_model
+        )
+
+        log.info(f"Batch rewards = {batch_rewards}")
+
+        # reshape batch_rewards to [Batch, Rollouts]
+        B, R = len(batch_text), len(batch_text[0])  # batch size, rollouts
+        batch_rewards = [batch_rewards[i * R : (i + 1) * R] for i in range(B)]
+
+        return {"text": batch_text, "tokens": batch_tokens, "rewards": batch_rewards}
+
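+    # Builds a PreferenceBatch of (chosen, rejected) token sequences from the scored
+    # rollouts; prompts where chosen and rejected coincide are dropped, and if no
+    # prompt yields a valid pair the whole batch is flagged so its loss can be zeroed.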
+    def prepare_preference_batch(
+        self, prompt_batch: PromptBatch, rollouts
+    ) -> PreferenceBatch:
+        reward_output = self.process_rollouts(rollouts, prompt_batch)
+
+        chosen_batch = []
+        rejected_batch = []
+        prompt_lens = []
+        dummy_batch_ids = []  # keep positions of dummy pairs here
+
+        # pick the rollout with the highest reward as chosen and the lowest as
+        # rejected; index() returns the first occurrence on ties
+        for i_batch, (i_batch_rewards, i_batch_tokens) in enumerate(
+            zip(reward_output["rewards"], reward_output["tokens"])
+        ):
+            chosen_rollout_position = i_batch_rewards.index(max(i_batch_rewards))
+            rejected_rollout_position = i_batch_rewards.index(min(i_batch_rewards))
+
+            if chosen_rollout_position == rejected_rollout_position:
+                # can't form a preference pair without distinct rollouts;
+                # this becomes a dummy pair and its loss is zeroed out
+                dummy_batch_ids.append(i_batch)
+
+            chosen_rollout_tokens = list(i_batch_tokens[chosen_rollout_position])
+            rejected_rollout_tokens = list(i_batch_tokens[rejected_rollout_position])
+            prompt_tokens = prompt_batch.prompts[i_batch]
+
+            chosen_tokens = prompt_tokens + chosen_rollout_tokens
+            chosen_batch.append(chosen_tokens)
+
+            rejected_tokens = prompt_tokens + rejected_rollout_tokens
+            rejected_batch.append(rejected_tokens)
+
+            prompt_lens.append(len(prompt_tokens))
+
+        filter_batch = lambda batch: [
+            item for index, item in enumerate(batch) if index not in dummy_batch_ids
+        ]
+
+        if len(dummy_batch_ids) == len(reward_output["tokens"]):
+            # the entire batch has no valid preference pair;
+            # keep it as a dummy batch and zero the loss in the end
+            is_bad_batch = True
+        else:
+            # remove dummy pairs from the batch
+            chosen_batch = filter_batch(chosen_batch)
+            rejected_batch = filter_batch(rejected_batch)
+            prompt_lens = filter_batch(prompt_lens)
+            is_bad_batch = False
+
+        prompt_lens = torch.tensor(prompt_lens)
+
+        chosen_batch = [
+            torch.tensor(sequence, device=self._gangs.dp.device)
+            for sequence in chosen_batch
+        ]
+        chosen_batch = collate_with_target_mask(
+            chosen_batch, prompt_lens, device=self._gangs.dp.device
+        )
+
+        rejected_batch = [
+            torch.tensor(sequence, device=self._gangs.dp.device)
+            for sequence in rejected_batch
+        ]
+        rejected_batch = collate_with_target_mask(
+            rejected_batch, prompt_lens, device=self._gangs.dp.device
+        )
+
+        batch = PreferenceBatch(
+            chosen=chosen_batch,
+            rejected=rejected_batch,
+            reference_score_chosen=None,
+            reference_score_rejected=None,
+        )
+
+        return batch, is_bad_batch, reward_output
 
 
 class AtheneVerifierHandler(VLLMOutputRewardHandler):