
Commit 51e8784

kgopalsw authored and aws-yishanm committed

TP 128 draft fix

GitOrigin-RevId: b68155341b188362a4cb7137d2724fb461a9a50a
1 parent e24fd96 · commit 51e8784

File tree

2 files changed: +14 -10 lines changed

src/transformers_neuronx/layers/flash_decoding.py

Lines changed: 8 additions & 6 deletions
@@ -25,7 +25,7 @@ def gather_query_group(query, cores_per_kv_head, n_heads, tp_degree):
     # Communication 1: all-gather query from cores
     # Notice that this is not necessary for context encoding because we don't read from the KV cache
     cores_per_q_head = tp_degree // n_heads
-    group_size = cores_per_kv_head // cores_per_q_head if cores_per_q_head else cores_per_kv_head
+    group_size = cores_per_kv_head  # note: this cores_per_kv_head is already divided by cores_per_q_head
     num_groups = tp_degree // group_size
     interleave=False
     n_kv_heads = tp_degree // cores_per_kv_head
@@ -61,9 +61,11 @@ def context(past_scores, active_score, past_values, active_values,
     # How many cores should compute each head collectively
     # All cores that hold the KV cache for the same head should communicate here
     cores_per_kv_head = tp_degree // n_kv_heads
+    cores_per_q_head = tp_degree // n_heads
+    cores_per_kv_head = cores_per_kv_head // cores_per_q_head if cores_per_q_head else cores_per_kv_head
     if cores_per_kv_head > 1:
         group_size = cores_per_kv_head
-        num_groups = n_kv_heads
+        num_groups = tp_degree // group_size
     else:
         # MHA case, assume all cores will have all heads in cache and kv sharded by seq
         num_groups = 1
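
For reference, the two hunks above change how the flash-decoding replica groups are formed once cores_per_kv_head arrives already divided by the Q replication factor. A minimal sketch of the arithmetic in plain Python; the values tp_degree=128, n_heads=64, n_kv_heads=8 are illustrative assumptions, not taken from the commit:

tp_degree = 128   # illustrative TP-128 setup
n_heads = 64      # assumed number of attention (Q) heads
n_kv_heads = 8    # assumed number of KV heads

cores_per_kv_head = tp_degree // n_kv_heads   # 16 cores hold the cache for each KV head
cores_per_q_head = tp_degree // n_heads       # 2 cores per Q head when tp_degree > n_heads

# New behaviour: the Q replication factor is folded into cores_per_kv_head up front
cores_per_kv_head = cores_per_kv_head // cores_per_q_head if cores_per_q_head else cores_per_kv_head  # 8

group_size = cores_per_kv_head            # 8
num_groups_old = n_kv_heads               # 8  (previous code)
num_groups_new = tp_degree // group_size  # 16 (this commit)
print(group_size, num_groups_old, num_groups_new)  # 8 8 16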
@@ -164,7 +166,7 @@ def context(past_scores, active_score, past_values, active_values,
 
     # Communication 3: send the results of other Q heads back to their corresponding cores
     # Also gather the results of the current Q head from other cores
-    assert output.sizes[1] == group_size*size, f"n_heads {n_heads} after gather not matching kv_replication x n_heads_tp {group_size}x {size}"
+    assert output.sizes[1] == group_size*size, f"n_heads {output.sizes[1]} after gather not matching kv_replication x n_heads_tp {group_size}x {size}"
     apply_fn = hlo.gen_add_func(output.dtype)
     output = hlo.reduce_scatter(output, dim=1, replica_groups=replica_groups, to_apply=apply_fn)
     assert output.sizes[1] == size, f"n_heads post scatter size mismatch, check replica_groups {replica_groups}"
@@ -174,8 +176,8 @@ def context(past_scores, active_score, past_values, active_values,
     # multiplied with its corresponding weights, and then an all-reduce is used to sum
     # results for all heads together.
     # We need a scaling here because multiple cores hold the same result
-    if cores_per_q_head:
-        output = hlo.divide(output, cores_per_q_head)
+    #if cores_per_q_head: # we do zero padding now, so enable once replication is done
+    #    output = hlo.divide(output, cores_per_q_head)
     return output
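
The hunk above comments out the per-head rescaling. A minimal NumPy sketch of the reasoning, with made-up shapes, assuming Q heads are now zero-padded across cores instead of replicated:

import numpy as np

cores_per_q_head = 2                      # illustrative replication factor
head_out = np.array([1.0, 2.0, 3.0])      # per-head output as computed on one core

# Replication: every core holds the same full result, so an all-reduce
# over-counts by cores_per_q_head and the old divide was needed.
replicated = [head_out.copy() for _ in range(cores_per_q_head)]
assert np.allclose(sum(replicated) / cores_per_q_head, head_out)

# Zero padding: only one core contributes real values, the rest contribute zeros,
# so the plain sum is already correct and no divide is required.
padded = [head_out.copy()] + [np.zeros_like(head_out) for _ in range(cores_per_q_head - 1)]
assert np.allclose(sum(padded), head_out)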
@@ -291,4 +293,4 @@ def select_values_within_bound(cache_ids, values, keys, cores_per_kv_head, core_
     keys = hlo.slice_along(keys, dim=dim,limit=slice_size,stride=stride)
     cache_ids = hlo.slice_along(cache_ids, dim=cache_dim,limit=cache_slice_size, stride=stride)
 
-    return cache_ids, values, keys
+    return cache_ids, values, keys

src/transformers_neuronx/llama/hlo.py

Lines changed: 6 additions & 4 deletions
@@ -124,6 +124,8 @@ def pre_layer(self, hidden, cache_ids, start_ids, last_token_id, block_tables, c
         n_kv_heads = self.config.num_key_value_heads if hasattr(self.config, "num_key_value_heads") else self.config.num_attention_heads
         cores_per_kv_head = self.config.tp_degree // n_kv_heads
         self.cores_per_kv_head = cores_per_kv_head if cores_per_kv_head > 1 else self.config.tp_degree
+        cores_per_q_head = self.config.tp_degree // self.config.num_attention_heads
+        self.cores_per_kv_head = self.cores_per_kv_head // cores_per_q_head if cores_per_q_head else self.cores_per_kv_head
         if self.neuron_config.optimized_paged_attention and len(last_token_id.sizes) == 2:
             # For decoding with multiple KV cache blocks:
             # - cache_ids are used as context_lens
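
The pre_layer change mirrors the flash_decoding.py adjustment above. A small sketch with a stand-in config (SimpleNamespace and the TP-128 values are assumptions, not the real config object):

from types import SimpleNamespace

config = SimpleNamespace(tp_degree=128, num_attention_heads=64, num_key_value_heads=8)

n_kv_heads = config.num_key_value_heads
cores_per_kv_head = config.tp_degree // n_kv_heads                    # 16
cores_per_kv_head = cores_per_kv_head if cores_per_kv_head > 1 else config.tp_degree

cores_per_q_head = config.tp_degree // config.num_attention_heads     # 2
# With this commit, the stored value is also divided by the Q replication factor
cores_per_kv_head = cores_per_kv_head // cores_per_q_head if cores_per_q_head else cores_per_kv_head
print(cores_per_kv_head)  # 8 instead of 16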
@@ -532,7 +534,7 @@ def attention(
         if self.config.num_key_value_heads is not None:
             n_head = self.config.num_attention_heads
             n_kv_head = self.config.num_key_value_heads
-            n_head, n_kv_head_padded = utils.get_qkv_padding(n_head, n_kv_head, tp_degree, self.neuron_config)
+            n_head_padded, n_kv_head_padded = utils.get_qkv_padding(n_head, n_kv_head, tp_degree, self.neuron_config)
             n_kv_heads_tp = n_kv_head_padded // tp_degree
 
         # Q = (hidden @ wQ) + bQ
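
The rename above presumably keeps the padded head count from clobbering the original n_head, which stays available for later computations. A generic illustration of that pattern; pad_up is a hypothetical helper, not the real utils.get_qkv_padding:

def pad_up(n, multiple):
    # hypothetical helper: round n up to the nearest multiple
    return -(-n // multiple) * multiple

tp_degree = 128
n_head = 96                                 # illustrative unpadded Q-head count
n_head_padded = pad_up(n_head, tp_degree)   # 128, used only for per-core sharding math
heads_per_core = n_head_padded // tp_degree # 1
assert n_head == 96                         # the real head count is not overwritten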
@@ -748,9 +750,9 @@ def attention(
 
         # O = (C @ wO) + bO
         output = attention.output(context, out_weight, out_scales, out_bias, tp_degree, self.neuron_config)
-        cores_per_attn_head = tp_degree // self.config.num_attention_heads
-        if cores_per_attn_head and not self.neuron_config.shard_over_sequence:
-            output = hlo.divide(output, cores_per_attn_head)
+        # we do zero padding so disable now
+        # if cores_per_attn_head and not self.neuron_config.shard_over_sequence:
+        #     output = hlo.divide(output, cores_per_attn_head)
         return output, updated_keys, updated_values
