Support reward models (#3192)

lvhan028 · grimoire · web-flow · commit c3ecd1034094 · 2025-03-03T17:16:21.000+08:00
* tmp

* remove update badwords

* update

* update

* update

---------

Co-authored-by: grimoire <yaoqian@pjlab.org.cn>
diff --git a/lmdeploy/pytorch/engine/engine.py b/lmdeploy/pytorch/engine/engine.py
@@ -723,7 +723,6 @@ def __update_inputs(next_token_ids):
                                                      return_logits=return_logits)
             logits = output['logits']
             logits = logits[0]  # [bs, seq, prob] -> [seq, prob]
-
             # sampling
             next_token_ids = await self.async_sampling_logits(logits, all_ids, guided_input_ids, sampling_inputs,
                                                               inputs, num_ignore_eos > 0)
diff --git a/lmdeploy/pytorch/models/internlm2_reward.py b/lmdeploy/pytorch/models/internlm2_reward.py
@@ -0,0 +1,147 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+
+from typing import Any, Iterable, List, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers.configuration_utils import PretrainedConfig
+
+from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager
+from lmdeploy.pytorch.nn.linear import build_rowwise_linear
+from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight
+
+from .internlm2 import InternLM2Model
+from .utils.cudagraph import CudaGraphMixin
+
+
+class InternLM2ForRewardModel(nn.Module, CudaGraphMixin):
+    """Rewritten InternLM2 reward model: an InternLM2Model backbone followed by a single-output `v_head`."""
+
+    packed_modules_mapping = {
+        'gate_up_proj': [
+            'w1',
+            'w3',
+        ],
+    }
+
+    def __init__(self,
+                 config: PretrainedConfig,
+                 ctx_mgr: StepContextManager,
+                 dtype: torch.dtype = None,
+                 device: torch.device = None):
+        super().__init__()
+        self.config = config
+        self.ctx_mgr = ctx_mgr
+        # build the InternLM2Model backbone
+        self.model = InternLM2Model(config, dtype=dtype, device=device)
+        # build v_head: bias-free, `config.hidden_size` input features, 1 output feature
+        self.v_head = build_rowwise_linear(config.hidden_size, 1, bias=False, dtype=dtype, device=device)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        position_ids: torch.Tensor,
+        past_key_values: List[List[torch.Tensor]],
+        attn_metadata: Any = None,
+        inputs_embeds: torch.Tensor = None,
+        **kwargs,
+    ):
+        """Run the backbone and return its hidden states; `v_head` is applied later by `get_logits`."""
+        hidden_states = self.model(
+            input_ids=input_ids,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            attn_metadata=attn_metadata,
+            inputs_embeds=inputs_embeds,
+        )
+        return hidden_states
+
+    def get_logits(self, hidden_states: torch.Tensor):
+        """Apply `v_head` to the hidden states and return the result."""
+        return self.v_head(hidden_states)
+
+    def get_input_embeddings(self):
+        """Return the input embeddings of the backbone model."""
+        return self.model.get_input_embeddings()
+
+    def prepare_inputs_for_generation(
+        self,
+        past_key_values: List[List[torch.Tensor]],
+        inputs_embeds: Optional[torch.Tensor] = None,
+        context: StepContext = None,
+    ):
+        """Build the keyword arguments of `forward` from the step context; vision embeddings are rejected."""
+        # get input_ids, position_ids and attention metadata from the step context
+        input_ids = context.input_ids
+        position_ids = context.position_ids
+        attn_metadata = context.attn_metadata
+
+        vision_embeddings = context.input_embeddings
+        if vision_embeddings is not None and len(vision_embeddings) > 0:
+            raise ValueError('InternLM2RewardModel does not support vision embedding')
+
+        # keyword arguments consumed by `forward`
+        return dict(
+            input_ids=input_ids,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            attn_metadata=attn_metadata,
+            inputs_embeds=inputs_embeds,
+        )
+
+    def load_lora_weights(self, weights: Iterable[Tuple[str, torch.Tensor]], adapter_id: int):
+        """Load LoRA weights, regrouping the rows of every `wqkv.lora_B` weight into [q, k, v] order first."""
+
+        from lmdeploy.pytorch.adapter.adapter import load_lora_weights
+
+        num_heads = self.config.num_attention_heads
+        num_key_value_heads = self.config.num_key_value_heads
+        hidden_size = self.config.hidden_size
+        head_dim = hidden_size // num_heads
+        group_size = num_heads // num_key_value_heads
+
+        def _rearange_wqkv(weights):
+            for name, loaded_weight in weights:
+                if 'wqkv.lora_B' in name:
+                    loaded_weight = loaded_weight.unflatten(0, (-1, 2 + group_size, head_dim))
+                    q = loaded_weight[:, :-2].flatten(0, 2)
+                    k = loaded_weight[:, -2].flatten(0, 1)
+                    v = loaded_weight[:, -1].flatten(0, 1)
+                    loaded_weight = torch.cat([q, k, v], dim=0)
+                yield name, loaded_weight
+
+        weights_iter = _rearange_wqkv(weights)
+        load_lora_weights(self, weights_iter, adapter_id)
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        """Load checkpoint weights: `.w1`/`.w3` go into `.gate_up_proj` shards and `.wqkv` is split into q, k, v."""
+        # modified from vllm
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ('.gate_up_proj', '.w1', 0),
+            ('.gate_up_proj', '.w3', 1),
+        ]
+
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in weights:
+            if 'rotary_emb.inv_freq' in name:
+                continue
+            if ('rotary_emb.cos_cached' in name or 'rotary_emb.sin_cached' in name):
+                continue
+            for (param_name, weight_name, shard_id) in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                param = params_dict[name]
+                load_weight(param, loaded_weight, shard_id=shard_id)
+                break
+            else:
+                if '.wqkv' in name:
+                    param = params_dict[name]
+                    q, k, v = param.weight_spliter(loaded_weight, layout='hgd')
+                    load_weight(param, q, shard_id='q')
+                    load_weight(param, k, shard_id='k')
+                    load_weight(param, v, shard_id='v')
+                else:
+                    param = params_dict[name]
+                    load_weight(param, loaded_weight)
diff --git a/lmdeploy/pytorch/models/module_map.py b/lmdeploy/pytorch/models/module_map.py
@@ -168,4 +168,11 @@
     'InternLM3ForCausalLM': f'{LMDEPLOY_PYTORCH_MODEL_PATH}.internlm3.InternLM3ForCausalLM',
 })
 
+# internlm2 reward model
+MODULE_MAP.update(
+    {'InternLM2ForRewardModel': f'{LMDEPLOY_PYTORCH_MODEL_PATH}.internlm2_reward.InternLM2ForRewardModel'})
+
+# qwen2 reward model
+MODULE_MAP.update({'Qwen2ForRewardModel': f'{LMDEPLOY_PYTORCH_MODEL_PATH}.qwen2_reward.Qwen2ForRewardModel'})
+
 CUSTOM_MODULE_MAP = dict()
diff --git a/lmdeploy/pytorch/models/qwen2_reward.py b/lmdeploy/pytorch/models/qwen2_reward.py
@@ -0,0 +1,135 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Any, Iterable, List, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers.configuration_utils import PretrainedConfig
+
+from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager
+from lmdeploy.pytorch.nn.linear import build_rowwise_linear
+from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight
+
+from .qwen2 import Qwen2Model
+from .utils.cudagraph import CudaGraphMixin
+
+
+class Qwen2ForRewardModel(nn.Module, CudaGraphMixin):
+    """Rewritten Qwen2 reward model: a Qwen2Model backbone followed by a linear-ReLU-linear `score` head."""
+
+    packed_modules_mapping = {
+        'qkv_proj': [
+            'q_proj',
+            'k_proj',
+            'v_proj',
+        ],
+        'gate_up_proj': [
+            'gate_proj',
+            'up_proj',
+        ],
+    }
+
+    def __init__(self,
+                 config: PretrainedConfig,
+                 ctx_mgr: StepContextManager,
+                 dtype: torch.dtype = None,
+                 device: torch.device = None):
+        super().__init__()
+        self.config = config
+        self.ctx_mgr = ctx_mgr
+        # build the Qwen2Model backbone
+        self.model = Qwen2Model(config, dtype=dtype, device=device)
+
+        self.lm_head = build_rowwise_linear(config.hidden_size,
+                                            config.vocab_size,
+                                            bias=False,
+                                            dtype=dtype,
+                                            device=device)
+
+        self.num_labels = 1
+        self.score = nn.Sequential(
+            build_rowwise_linear(config.hidden_size, config.hidden_size, bias=True, dtype=dtype, device=device),
+            nn.ReLU(), build_rowwise_linear(config.hidden_size, self.num_labels, bias=True, dtype=dtype, device=device))
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        position_ids: torch.Tensor,
+        past_key_values: List[List[torch.Tensor]],
+        attn_metadata: Any = None,
+        inputs_embeds: torch.Tensor = None,
+        **kwargs,
+    ):
+        """Run the backbone and return its hidden states; the `score` head is applied later by `get_logits`."""
+        hidden_states = self.model(
+            input_ids=input_ids,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            attn_metadata=attn_metadata,
+            inputs_embeds=inputs_embeds,
+        )
+        return hidden_states
+
+    def get_logits(self, hidden_states: torch.Tensor):
+        """Apply the `score` head (not `lm_head`) to the hidden states and return the result."""
+        logits = self.score(hidden_states)
+        return logits
+
+    def update_weights(self):
+        """Update weights; currently a no-op."""
+        pass
+
+    def get_input_embeddings(self):
+        """Return the input embeddings of the backbone model."""
+        return self.model.get_input_embeddings()
+
+    def prepare_inputs_for_generation(
+        self,
+        past_key_values: List[List[torch.Tensor]],
+        inputs_embeds: Optional[torch.Tensor] = None,
+        context: StepContext = None,
+    ):
+        """Build the keyword arguments of `forward` from the step context."""
+        # get input_ids, position_ids and attention metadata from the step context
+        input_ids = context.input_ids
+        position_ids = context.position_ids
+        attn_metadata = context.attn_metadata
+
+        # keyword arguments consumed by `forward`
+        return dict(
+            input_ids=input_ids,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            attn_metadata=attn_metadata,
+            # NOTE(review): `inputs_embeds` is accepted above but not passed on to `forward` - confirm intended
+        )
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        """Load checkpoint weights, packing q/k/v into `.qkv_proj` and gate/up into `.gate_up_proj` shards."""
+        # modified from vllm
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ('.qkv_proj', '.q_proj', 'q'),
+            ('.qkv_proj', '.k_proj', 'k'),
+            ('.qkv_proj', '.v_proj', 'v'),
+            ('.gate_up_proj', '.gate_proj', 0),
+            ('.gate_up_proj', '.up_proj', 1),
+        ]
+
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in weights:
+            if 'rotary_emb.inv_freq' in name:
+                continue
+            if ('rotary_emb.cos_cached' in name or 'rotary_emb.sin_cached' in name):
+                continue
+            if self.config.tie_word_embeddings and 'lm_head.weight' in name:
+                continue
+            for (param_name, weight_name, shard_id) in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                param = params_dict[name]
+                load_weight(param, loaded_weight, shard_id=shard_id)
+                break
+            else:
+                param = params_dict[name]
+                load_weight(param, loaded_weight)
diff --git a/lmdeploy/pytorch/nn/rotary_embedding.py b/lmdeploy/pytorch/nn/rotary_embedding.py
@@ -64,7 +64,9 @@ def _get_llama3_parameters(config: PretrainedConfig):
 def build_rotary_params(config: PretrainedConfig):
     """get scaling_factor rotary params, and emb_type."""
     params = dict(emb_type=RopeType.Default)
-    if config.rope_scaling is not None:
+    # cannot access config.rope_scaling when the model is "Qwen/Qwen2-Math-RM-72B"
+    rope_scaling = getattr(config, 'rope_scaling', None)
+    if rope_scaling is not None:
         rope_type_str = config.rope_scaling.get('rope_type', 'default')
         build_funcs = dict(default=_get_default_rope_parameters,
                            linear=_get_linear_scaling_rope_parameters,
diff --git a/lmdeploy/pytorch/supported_models.py b/lmdeploy/pytorch/supported_models.py
@@ -23,6 +23,7 @@
     InternLMForCausalLM=True,
     # internlm2
     InternLM2ForCausalLM=True,
+    InternLM2ForRewardModel=True,
     # internlm-xcomposer
     InternLMXComposerForCausalLM=False,
     # internlm2-xcomposer
@@ -107,7 +108,7 @@ def is_supported(model_path: str):
 
     triton_model_path = os.path.join(model_path, 'triton_models')
     if os.path.exists(triton_model_path):
-        logger.warning(f'{model_path} seems to be a turbomind workspace, '
+        logger.warning(f'{model_path} seems to be a turbomind model, '
                        'which can only be ran with turbomind engine.')
     else:
         try:
diff --git a/lmdeploy/serve/async_engine.py b/lmdeploy/serve/async_engine.py
@@ -20,6 +20,7 @@
 import tqdm
 
 from lmdeploy import Tokenizer
+from lmdeploy.archs import get_model_arch
 from lmdeploy.logger import RequestLogger
 from lmdeploy.messages import GenerationConfig, PytorchEngineConfig, Response, ResponseType, TurbomindEngineConfig
 from lmdeploy.model import MODELS, ChatTemplateConfig, best_match_model
@@ -271,6 +272,7 @@ def __init__(self,
 
         self.tokenizer = Tokenizer(model_path)
         self.hf_gen_cfg = get_hf_gen_cfg(model_path)
+        self.arch, _ = get_model_arch(model_path)
 
         # build backend engine
         if backend == 'turbomind':
diff --git a/lmdeploy/serve/utils.py b/lmdeploy/serve/utils.py
@@ -17,7 +17,28 @@
 
 
 class LogitsMixin:
-    """Helper class to calculate ppl."""
+    """Helper class to get logits, reward score and calculate ppl."""
+
+    def get_reward_score(self, input_ids: List) -> List[float]:
+        """Compute the reward score of each input sequence with a reward model.
+
+        Args:
+            input_ids(List): a non-empty list of token_id, or a list of token_id lists
+        Return:
+            reward score in a list. If the input_ids is a list of token_id, the return value
+            is still a list with length 1.
+        """
+        supported_reward_models = ['InternLM2ForRewardModel', 'Qwen2ForRewardModel']
+        if self.arch not in supported_reward_models:
+            raise ValueError(f'{self.arch} is not in reward model list: {supported_reward_models}')
+        assert isinstance(input_ids, List) and len(input_ids) > 0, 'input_ids must be a non-empty list'
+        assert all(isinstance(x, int) for x in input_ids) or all(isinstance(x, List) for x in input_ids)
+        # Make input_ids a list of token_id list
+        input_ids = [input_ids] if isinstance(input_ids[0], int) else input_ids
+        logits = self._run(coro=self._async_get_logits(input_ids=input_ids)).result()
+        # flatten() rather than squeeze(): squeeze() turns a one-element tensor into 0-dim, which [-1] cannot index
+        scores = [x.flatten()[-1].cpu().item() for x in logits]
+        return scores
 
     async def _async_get_logits(self,
                                 input_ids,
@@ -34,7 +55,9 @@ async def _proc(i):
                 input_len = len(input_ids[i])
                 # TODO(lvhan): Fix the ugly code later on
                 max_new_tokens = 1 if self.backend == 'turbomind' else 0
-                gen_config = GenerationConfig(max_new_tokens=max_new_tokens, output_logits='all')
+                # `top_k=1` is set because the pt engine crashes at the top_k sampling stage
+                # when performing inference on a reward model.
+                gen_config = GenerationConfig(max_new_tokens=max_new_tokens, output_logits='all', top_k=1)
                 async with self.safe_run(inst,
                                          session_id=i,
                                          input_ids=input_ids[i],