docs/source_en/Instruction/Command-line-parameters.md
RLHF arguments inherit from the [training arguments](#training-arguments).
- center_rewards_coefficient: A coefficient used in reward model (RM) training to incentivize the model to output rewards with zero mean. See this [paper](https://huggingface.co/papers/2312.09244) for details. Recommended value: 0.01.
- loss_scale: Overrides the template parameter. During RLHF training, the default is `'last_round'`.
- temperature: Default is 0.9; this parameter will be used in PPO, GRPO and GKD.
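
For illustration, here is a minimal sketch of a reward-model training invocation that applies the recommended `center_rewards_coefficient`; `swift rlhf`, `--rlhf_type rm`, `--model`, and `--dataset` are assumed placeholders for this example, not values defined in this section.

```shell
# Hypothetical sketch: reward-model training with the zero-mean reward incentive.
# Replace the model and dataset placeholders with your own values.
swift rlhf \
    --rlhf_type rm \
    --model <base-model> \
    --dataset <preference-dataset> \
    --center_rewards_coefficient 0.01
```
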
#### GKD Arguments
- lmbda: Default is 0.5. This parameter is used in GKD. It controls the lambda parameter, i.e. the proportion of student-generated (on-policy) outputs used during training. If lmbda is 0, student-generated data is not used.
- sft_alpha: The default value is 0. It controls the weight of sft_loss added in GKD. The final loss is `gkd_loss + sft_alpha * sft_loss`.
- seq_kd: Default is False. This parameter is used in GKD. It controls whether to perform Sequence-Level KD (which can be viewed as supervised fine-tuning on teacher-generated outputs).
- Note: You can run inference on the dataset with the teacher model in advance (accelerated by inference engines such as vLLM, SGLang, or lmdeploy) and set `seq_kd` to False during training. Alternatively, set `seq_kd` to True so that the teacher model generates sequences during training (this ensures different generated data across epochs, but is slower).
- offload_teacher_model: Whether to offload the teacher model to save GPU memory. If set to True, the teacher model will be loaded only during generate/logps computation. Default: False.
- log_completions: Whether to log the model-generated content during training, to be used in conjunction with `--report_to wandb/swanlab`, default is False.
- Note: If `--report_to wandb/swanlab` is not set, a `completions.jsonl` will be created in the checkpoint to store the generated content.
- Only vLLM rollout results are logged.
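
As a hedged illustration of how the GKD arguments above fit together, the sketch below assumes a `swift rlhf --rlhf_type gkd` entry point and placeholder `--model`, `--teacher_model`, and `--dataset` values; the remaining flags are the ones documented in this section.

```shell
# Hypothetical GKD sketch; student/teacher models and dataset are placeholders.
swift rlhf \
    --rlhf_type gkd \
    --model <student-model> \
    --teacher_model <teacher-model> \
    --dataset <sft-dataset> \
    --lmbda 0.5 \
    --sft_alpha 0.1 \
    --seq_kd false \
    --offload_teacher_model true \
    --temperature 0.9 \
    --log_completions true \
    --report_to swanlab
```

With `--seq_kd false`, teacher outputs are not generated on the fly, so pre-computing teacher responses with vLLM, SGLang, or lmdeploy (as noted above) keeps training fast; setting it to true regenerates sequences every epoch at the cost of speed.
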
#### Reward/Teacher Model Parameters
The meanings of the following parameters can be referenced [here](https://huggin
- dataset_shuffle: Whether to shuffle the dataset randomly. Default is True.
- truncation_strategy: The method to handle inputs exceeding `max_length`. Supported values are `delete` and `left`, representing deletion and left-side truncation respectively. The default is `left`. Note that for multi-modal models, left-side truncation may remove multi-modal tokens and cause a shape mismatch error during model forward. With the delete strategy, over-long or encoding-failed samples are discarded, and new samples are resampled from the original dataset to maintain the intended batch size.
- loss_type: The type of loss normalization. Options are ['grpo', 'bnpo', 'dr_grpo'], default is 'grpo'. For details, see this [pr](https://github.com/huggingface/trl/pull/3256#discussion_r2033213348).
- log_completions: Whether to log the model-generated content during training, to be used in conjunction with `--report_to wandb/swanlab`, default is False.
- Note: If `--report_to wandb/swanlab` is not set, a `completions.jsonl` will be created in the checkpoint to store the generated content.
- use_vllm: Whether to use vLLM as the infer_backend for GRPO generation, default is False.
- vllm_mode: Mode to use for vLLM integration when `use_vllm` is set to `True`. Must be one of `server` or `colocate`.
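
To make the vLLM-related switches concrete, here is a hedged sketch of a GRPO run with colocated vLLM generation; `--rlhf_type grpo`, `--model`, and `--dataset` are assumed placeholders, while the remaining flags are the parameters documented above.

```shell
# Hypothetical GRPO sketch; policy model and prompt dataset are placeholders.
swift rlhf \
    --rlhf_type grpo \
    --model <policy-model> \
    --dataset <prompt-dataset> \
    --use_vllm true \
    --vllm_mode colocate \
    --loss_type bnpo \
    --truncation_strategy delete \
    --temperature 0.9 \
    --log_completions true \
    --report_to wandb
```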