 
 class ModelWeightManager:
     @staticmethod
-    def get_model_weights(model) -> list[np.ndarray]:
-        """Extract weights from any model type."""
-        weights = []
+    def get_model_weights(model) -> list[tuple[np.ndarray, np.ndarray | None]]:
+        """Extract weights and biases from any model type."""
+        params = []
+
+        def get_params_from_layer(layer):
+            if hasattr(layer, 'weights'):
+                weights = layer.weights.copy()
+                bias = layer.bias.copy() if hasattr(layer, 'bias') else None
+                return (weights, bias)
+            return None
+
+        def get_params_from_dense_layers(layers):
+            layer_params = []
+            for layer in layers:
+                p = get_params_from_layer(layer)
+                if p:
+                    layer_params.append(p)
+            return layer_params
 
         if hasattr(model, 'layers'):  # Sequential model
-            weights.extend(
-                [layer.weights for layer in model.layers if hasattr(layer, 'weights')])
+            for layer in model.layers:
+                p = get_params_from_layer(layer)
+                if p:
+                    params.append(p)
 
         elif hasattr(model, 'encoder_layers') and hasattr(model, 'decoder_layers'):  # Autoencoder
-            weights.extend(
-                [layer.weights for layer in model.encoder_layers if hasattr(layer, 'weights')])
-            weights.extend(
-                [layer.weights for layer in model.decoder_layers if hasattr(layer, 'weights')])
+            for layer in model.encoder_layers:
+                p = get_params_from_layer(layer)
+                if p:
+                    params.append(p)
+            for layer in model.decoder_layers:
+                p = get_params_from_layer(layer)
+                if p:
+                    params.append(p)
 
-        elif hasattr(model, 'embedding'):  # Transformer
-            if hasattr(model.embedding, 'weights'):
-                weights.append(model.embedding.weights)
+        elif hasattr(model, 'src_embedding'):  # Transformer
+            params.append(get_params_from_layer(model.src_embedding))
+            params.append(get_params_from_layer(model.tgt_embedding))
 
             for encoder_layer in model.encoder_layers:
-                if hasattr(encoder_layer, 'attention'):
-                    weights.extend([
-                        encoder_layer.attention.query_dense.weights,
-                        encoder_layer.attention.key_dense.weights,
-                        encoder_layer.attention.value_dense.weights,
-                        encoder_layer.attention.output_dense.weights
-                    ])
-                if hasattr(encoder_layer, 'ffn'):
-                    weights.extend([
-                        encoder_layer.ffn.dense1.weights,
-                        encoder_layer.ffn.dense2.weights
-                    ])
+                params.extend(get_params_from_dense_layers([
+                    encoder_layer.attention.query_dense,
+                    encoder_layer.attention.key_dense,
+                    encoder_layer.attention.value_dense,
+                    encoder_layer.attention.output_dense,
+                    encoder_layer.ffn.dense1,
+                    encoder_layer.ffn.dense2
+                ]))
 
             for decoder_layer in model.decoder_layers:
-                if hasattr(decoder_layer, 'self_attention'):
-                    weights.extend([
-                        decoder_layer.self_attention.query_dense.weights,
-                        decoder_layer.self_attention.key_dense.weights,
-                        decoder_layer.self_attention.value_dense.weights,
-                        decoder_layer.self_attention.output_dense.weights
-                    ])
-                if hasattr(decoder_layer, 'cross_attention'):
-                    weights.extend([
-                        decoder_layer.cross_attention.query_dense.weights,
-                        decoder_layer.cross_attention.key_dense.weights,
-                        decoder_layer.cross_attention.value_dense.weights,
-                        decoder_layer.cross_attention.output_dense.weights
-                    ])
-                if hasattr(decoder_layer, 'ffn'):
-                    weights.extend([
-                        decoder_layer.ffn.dense1.weights,
-                        decoder_layer.ffn.dense2.weights
-                    ])
-
-            if hasattr(model.output_layer, 'weights'):
-                weights.append(model.output_layer.weights)
-
-        return weights
+                params.extend(get_params_from_dense_layers([
+                    decoder_layer.self_attention.query_dense,
+                    decoder_layer.self_attention.key_dense,
+                    decoder_layer.self_attention.value_dense,
+                    decoder_layer.self_attention.output_dense,
+                    decoder_layer.cross_attention.query_dense,
+                    decoder_layer.cross_attention.key_dense,
+                    decoder_layer.cross_attention.value_dense,
+                    decoder_layer.cross_attention.output_dense,
+                    decoder_layer.ffn.dense1,
+                    decoder_layer.ffn.dense2
+                ]))
+
+            params.append(get_params_from_layer(model.output_layer))
+
+        return [p for p in params if p is not None]
 
     @staticmethod
-    def set_model_weights(model, weights: list[np.ndarray]) -> None:
-        """Restore weights to any model type."""
-        weight_idx = 0
+    def set_model_weights(model, params: list[tuple[np.ndarray, np.ndarray | None]]) -> None:
+        """Restore weights and biases to any model type."""
+        param_idx = 0
+
+        def set_params_for_layer(layer):
+            nonlocal param_idx
+            if hasattr(layer, 'weights'):
+                if param_idx < len(params):
+                    weights, bias = params[param_idx]
+                    layer.weights = weights.copy()
+                    if hasattr(layer, 'bias') and bias is not None:
+                        layer.bias = bias.copy()
+                param_idx += 1
+
+        def set_params_for_dense_layers(layers):
+            for layer in layers:
+                set_params_for_layer(layer)
 
         if hasattr(model, 'layers'):  # Sequential model
             for layer in model.layers:
-                if hasattr(layer, 'weights'):
-                    layer.weights = weights[weight_idx]
-                    weight_idx += 1
+                set_params_for_layer(layer)
 
         elif hasattr(model, 'encoder_layers') and hasattr(model, 'decoder_layers'):  # Autoencoder
             for layer in model.encoder_layers:
-                if hasattr(layer, 'weights'):
-                    layer.weights = weights[weight_idx]
-                    weight_idx += 1
-
+                set_params_for_layer(layer)
             for layer in model.decoder_layers:
-                if hasattr(layer, 'weights'):
-                    layer.weights = weights[weight_idx]
-                    weight_idx += 1
+                set_params_for_layer(layer)
 
-        elif hasattr(model, 'embedding'):
-            if hasattr(model.embedding, 'weights'):
-                model.embedding.weights = weights[weight_idx]
-                weight_idx += 1
+        elif hasattr(model, 'src_embedding'):  # Transformer
+            set_params_for_layer(model.src_embedding)
+            set_params_for_layer(model.tgt_embedding)
 
             for encoder_layer in model.encoder_layers:
-                if hasattr(encoder_layer, 'attention'):
-                    encoder_layer.attention.query_dense.weights = weights[weight_idx]
-                    encoder_layer.attention.key_dense.weights = weights[weight_idx + 1]
-                    encoder_layer.attention.value_dense.weights = weights[weight_idx + 2]
-                    encoder_layer.attention.output_dense.weights = weights[weight_idx + 3]
-                    weight_idx += 4
-                if hasattr(encoder_layer, 'ffn'):
-                    encoder_layer.ffn.dense1.weights = weights[weight_idx]
-                    encoder_layer.ffn.dense2.weights = weights[weight_idx + 1]
-                    weight_idx += 2
+                set_params_for_dense_layers([
+                    encoder_layer.attention.query_dense,
+                    encoder_layer.attention.key_dense,
+                    encoder_layer.attention.value_dense,
+                    encoder_layer.attention.output_dense,
+                    encoder_layer.ffn.dense1,
+                    encoder_layer.ffn.dense2
+                ])
 
             for decoder_layer in model.decoder_layers:
-                if hasattr(decoder_layer, 'self_attention'):
-                    decoder_layer.self_attention.query_dense.weights = weights[weight_idx]
-                    decoder_layer.self_attention.key_dense.weights = weights[weight_idx + 1]
-                    decoder_layer.self_attention.value_dense.weights = weights[weight_idx + 2]
-                    decoder_layer.self_attention.output_dense.weights = weights[weight_idx + 3]
-                    weight_idx += 4
-                if hasattr(decoder_layer, 'cross_attention'):
-                    decoder_layer.cross_attention.query_dense.weights = weights[weight_idx]
-                    decoder_layer.cross_attention.key_dense.weights = weights[weight_idx + 1]
-                    decoder_layer.cross_attention.value_dense.weights = weights[weight_idx + 2]
-                    decoder_layer.cross_attention.output_dense.weights = weights[weight_idx + 3]
-                    weight_idx += 4
-                if hasattr(decoder_layer, 'ffn'):
-                    decoder_layer.ffn.dense1.weights = weights[weight_idx]
-                    decoder_layer.ffn.dense2.weights = weights[weight_idx + 1]
-                    weight_idx += 2
-
-            # Restore output layer weights
-            if hasattr(model.output_layer, 'weights'):
-                model.output_layer.weights = weights[weight_idx]
+                set_params_for_dense_layers([
+                    decoder_layer.self_attention.query_dense,
+                    decoder_layer.self_attention.key_dense,
+                    decoder_layer.self_attention.value_dense,
+                    decoder_layer.self_attention.output_dense,
+                    decoder_layer.cross_attention.query_dense,
+                    decoder_layer.cross_attention.key_dense,
+                    decoder_layer.cross_attention.value_dense,
+                    decoder_layer.cross_attention.output_dense,
+                    decoder_layer.ffn.dense1,
+                    decoder_layer.ffn.dense2
+                ])
+
+            set_params_for_layer(model.output_layer)
 
 
 class Callback:
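For context, here is a minimal round-trip sketch of the API after this change. It is a hypothetical usage example, not code from this repo: the toy Dense and Sequential classes below exist only to expose the weights/bias ndarray attributes and layers list that ModelWeightManager duck-types on.

import numpy as np

class Dense:
    """Toy layer exposing the attributes ModelWeightManager duck-types on."""
    def __init__(self, n_in, n_out):
        self.weights = np.random.randn(n_in, n_out)
        self.bias = np.zeros(n_out)

class Sequential:
    """Toy container matching the hasattr(model, 'layers') branch."""
    def __init__(self, layers):
        self.layers = layers

model = Sequential([Dense(4, 8), Dense(8, 2)])

# get_model_weights returns copied (weights, bias) tuples, one per weighted
# layer, so later training cannot mutate the saved snapshot.
snapshot = ModelWeightManager.get_model_weights(model)
model.layers[0].weights += 1.0   # simulate further training
ModelWeightManager.set_model_weights(model, snapshot)
assert np.array_equal(model.layers[0].weights, snapshot[0][0])

Note the positional contract: set_model_weights consumes entries in exactly the order get_model_weights emitted them, so both calls must traverse the same model architecture.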