
from abc import ABC, abstractmethod
from dataclasses import dataclass
-from typing import Optional
+from typing import Any, Optional

from megatron.core.transformer.spec_utils import ModuleSpec


@dataclass
class EncoderTransformerConfig:
-    """Lightweight base config for encoder providers."""
+    """Lightweight base config for encoder providers.
+
+    Attributes:
+        num_layers: Number of transformer layers in the encoder.
+        hidden_size: Hidden dimension size of the encoder.
+        num_attention_heads: Number of attention heads.
+        seq_length: Sequence length for the encoder.
+        projector_type: Type of projector (e.g., "mlp", "linear", "qformer").
+            None means no projection is needed.
+        projector_input_size: Input size for the projector. Defaults to hidden_size.
+        projector_output_size: Output size for the projector (e.g., the LLM hidden size).
+            Required if projector_type is set.
+        projector_config: Optional TransformerConfig for the projector module.
+    """

    num_layers: int
    hidden_size: int
    num_attention_heads: int
    seq_length: int

+    # Projector support for VLM setups
+    projector_type: Optional[str] = None
+    projector_input_size: Optional[int] = None
+    projector_output_size: Optional[int] = None
+    projector_config: Optional[Any] = None
+
+    def __post_init__(self) -> None:
+        """Set default projector_input_size to hidden_size if not specified."""
+        if self.projector_input_size is None:
+            self.projector_input_size = self.hidden_size

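A quick usage sketch of the new config (the sizes below are illustrative, not taken from this change): when `projector_input_size` is left unset, `__post_init__` fills it in from `hidden_size`.

# Sketch with made-up sizes; only EncoderTransformerConfig comes from this change.
config = EncoderTransformerConfig(
    num_layers=24,
    hidden_size=1024,
    num_attention_heads=16,
    seq_length=576,
    projector_type="mlp",
    projector_output_size=4096,  # e.g., the LLM hidden size
)
assert config.projector_input_size == 1024  # defaulted to hidden_size by __post_init__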
class EncoderProvider(ABC):
-    """Interface for encoder providers used in MIMO setups."""
+    """Interface for encoder providers used in MIMO setups.
+
+    Subclasses must set the `config` attribute to an EncoderTransformerConfig.
+    """
+
+    config: EncoderTransformerConfig

    @abstractmethod
    def provide_model(self, pg_collection) -> object:
        """Create the encoder module (unwrapped)."""

    @abstractmethod
    def get_transformer_layer_spec(self) -> ModuleSpec:
-        """Return the ModuleSpec for the encoder stack."""
+        """Return the ModuleSpec for the encoder transformer layers."""

    @abstractmethod
    def get_projection_spec(self) -> Optional[ModuleSpec]:
-        """Optional projection ModuleSpec for encoder outputs."""
+        """Optional projection ModuleSpec for encoder outputs.
+
+        Returns None if no projection is needed.
+        """
+
+    def has_projector(self) -> bool:
+        """Check if this encoder requires a projector."""
+        return self.config.projector_type is not None
+
+    def validate_projector_config(self) -> None:
+        """Validate projector configuration consistency.
+
+        Raises:
+            ValueError: If projector_type is set but required fields are missing,
+                or if projector_type is set but get_projection_spec() returns None.
+        """
+        if self.config.projector_type is not None:
+            if self.config.projector_output_size is None:
+                raise ValueError(
+                    f"projector_output_size must be set when "
+                    f"projector_type='{self.config.projector_type}'"
+                )
+            if self.get_projection_spec() is None:
+                raise ValueError(
+                    f"get_projection_spec() must return a ModuleSpec when "
+                    f"projector_type='{self.config.projector_type}'"
+                )

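To see the contract end to end, here is a minimal concrete subclass. This is a sketch only: `DummyEncoderProvider`, `_Layer`, and `_Projector` are hypothetical stand-ins, and the bare `ModuleSpec(module=...)` construction assumes a spec that needs no params or submodules.

class _Layer:  # hypothetical placeholder module class
    pass

class _Projector:  # hypothetical placeholder projector class
    pass

class DummyEncoderProvider(EncoderProvider):
    def __init__(self, config: EncoderTransformerConfig) -> None:
        self.config = config

    def provide_model(self, pg_collection) -> object:
        raise NotImplementedError  # out of scope for this sketch

    def get_transformer_layer_spec(self) -> ModuleSpec:
        return ModuleSpec(module=_Layer)

    def get_projection_spec(self) -> Optional[ModuleSpec]:
        # Mirror has_projector(): no spec when no projector is configured.
        if self.config.projector_type is None:
            return None
        return ModuleSpec(module=_Projector)

provider = DummyEncoderProvider(config)  # config from the sketch above
provider.validate_projector_config()  # would raise ValueError if projector_output_size were None
assert provider.has_projector()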

class GenericVisionEncoderProvider(EncoderProvider):
-    """Minimal stub encoder provider for Phase 1 wiring."""
+    """Minimal stub encoder provider for Phase 1/2 wiring."""

    def __init__(self, config: EncoderTransformerConfig) -> None:
        self.config = config

    def provide_model(self, pg_collection) -> object:
-        # Stub: actual encoder creation will be implemented in Phase 2.
        raise NotImplementedError("GenericVisionEncoderProvider.provide_model not implemented.")

    def get_transformer_layer_spec(self) -> ModuleSpec: