|
| 1 | +# SPDX-License-Identifier: Apache-2.0 |
| 2 | +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project |
| 3 | + |
| 4 | +# yapf: disable |
| 5 | +# ruff: noqa: E501 |
| 6 | +# Adapted from |
| 7 | +# https://huggingface.co/nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1/blob/main/configuration.py |
| 8 | +# -------------------------------------------------------- |
| 9 | +# Adapted from https://huggingface.co/OpenGVLab/InternVL2-Llama3-76B under MIT License |
| 10 | +# LICENSE is in incl_licenses directory. |
| 11 | +# -------------------------------------------------------- |
| 12 | + |
| 13 | +from transformers import LlamaConfig |
| 14 | +from transformers.configuration_utils import PretrainedConfig |
| 15 | +from transformers.dynamic_module_utils import get_class_from_dynamic_module |
| 16 | + |
| 17 | + |
class Nemotron_Nano_VL_Config(PretrainedConfig):
    """Composite configuration holding a vision sub-config and a Llama text sub-config.

    The vision sub-config class is resolved dynamically from the
    ``auto_map["AutoConfig"]`` entry of ``vision_config``; the text sub-config
    is always a ``LlamaConfig`` built from ``llm_config``.

    Args:
        vision_config: Mapping of vision-config values. When given, it must
            contain ``auto_map["AutoConfig"]``. When ``None``, a bare
            ``PretrainedConfig`` is stored instead.
        llm_config: Mapping of keyword arguments for ``LlamaConfig``. When
            ``None``, a default ``LlamaConfig`` is used. The result is stored
            as ``text_config`` (not ``llm_config``).
        force_image_size: Stored unchanged on the instance.
        downsample_ratio: Stored unchanged on the instance.
        template: Stored unchanged on the instance.
        ps_version: Pixel shuffle version.
        image_tag_type: Stored unchanged on the instance.
        projector_hidden_size: Stored unchanged on the instance.
        vit_hidden_size: Stored unchanged on the instance.
        **kwargs: Forwarded to ``PretrainedConfig.__init__``.

    Raises:
        AssertionError: If ``vision_config`` is given without an
            ``auto_map["AutoConfig"]`` entry.
    """

    model_type = 'Llama_Nemotron_Nano_VL'
    is_composition = True

    def __init__(
        self,
        vision_config=None,
        llm_config=None,
        force_image_size=None,
        downsample_ratio=0.5,
        template=None,
        ps_version='v1',
        image_tag_type="internvl",
        projector_hidden_size=4096,
        vit_hidden_size=1280,
        **kwargs
    ):
        super().__init__(**kwargs)

        # Vision sub-config: fall back to an empty config when none is given.
        if vision_config is None:
            self.vision_config = PretrainedConfig()
        else:
            assert "auto_map" in vision_config and "AutoConfig" in vision_config["auto_map"]
            auto_config_ref = vision_config["auto_map"]["AutoConfig"]
            # The reference is split on "--" and its pieces are passed
            # positionally in reverse order to the dynamic-module loader.
            ref_parts = auto_config_ref.split("--")
            vision_config_cls = get_class_from_dynamic_module(*reversed(ref_parts))
            self.vision_config = vision_config_cls(**vision_config)

        # Text sub-config: always a LlamaConfig, with defaults when llm_config is absent.
        self.text_config = LlamaConfig() if llm_config is None else LlamaConfig(**llm_config)

        # Plain scalar settings, stored as given.
        self.vit_hidden_size = vit_hidden_size
        self.projector_hidden_size = projector_hidden_size
        self.image_tag_type = image_tag_type  # TODO: into the tokenizer too?
        self.ps_version = ps_version  # Pixel shuffle version
        self.template = template  # TODO move out of here and into the tokenizer
        self.downsample_ratio = downsample_ratio
        self.force_image_size = force_image_size
0 commit comments