
Commit ad2c046

zq/fix token_attention (DeepLink-org#873)
fix token_attention
1 parent d14fe8a

2 files changed, +4 -4 lines changed

impl/ascend_npu/diopi_impl/functions_ext/token_attention_inference.cpp

Lines changed: 2 additions & 2 deletions
@@ -25,9 +25,9 @@ diopiError_t diopiTokenAttentionInference(diopiContextHandle_t ctx, diopiTensorH
     for (int i = 0; i < batch; ++i) {
         int curSeqLen = bSeqLenAt[i].item<int>();
         int curSeqStartLoc = bStartLocAt[i].item<int>();
-        at::Tensor kLoc = at::index_select(bLocAt[i], 0, acl_op::arange(maxInputLen - curSeqLen, maxInputLen, at::kLong, layout, device));
+        at::Tensor kLoc = at::index_select(bLocAt[i], 0, acl_op::arange(maxInputLen - curSeqLen, maxInputLen, at::kInt, layout, device));
         at::Tensor key = at::index(kAt, {kLoc}).view({1, curSeqLen, head, dim}).transpose(1, 2);
-        at::Tensor outLoc = acl_op::arange(curSeqStartLoc, curSeqStartLoc + curSeqLen, at::kLong, layout, device);
+        at::Tensor outLoc = acl_op::arange(curSeqStartLoc, curSeqStartLoc + curSeqLen, at::kInt, layout, device);
         at::Tensor values =
             (at::matmul(at::index(qAt, {torch::scalar_to_tensor(i)}).toType(at::kFloat), key.transpose(2, 3).toType(at::kFloat)) / std::sqrt(dim))
                 .view({head, curSeqLen})
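Note: the only functional change in this file is the dtype passed to acl_op::arange when building the index tensors: at::kLong becomes at::kInt. As a rough illustration of the pattern only (not the NPU code path), here is a minimal standalone sketch assuming plain LibTorch on CPU, with at::arange standing in for the Ascend-specific acl_op::arange and made-up values for the slot table and lengths:

// Minimal standalone sketch, assuming plain LibTorch on CPU; at::arange stands
// in for acl_op::arange, and bLoc / maxInputLen / curSeqLen are made-up values.
#include <torch/torch.h>
#include <iostream>

int main() {
    const int64_t maxInputLen = 8;
    const int64_t curSeqLen = 3;

    // Stand-in for bLocAt[i]: one request's slot table into the KV cache.
    at::Tensor bLoc = at::arange(100, 100 + maxInputLen, at::kInt);

    // The patched line: build the index range with an int32 arange
    // (previously at::kLong). index_select accepts IntTensor or LongTensor.
    at::Tensor idx = at::arange(maxInputLen - curSeqLen, maxInputLen, at::kInt);
    at::Tensor kLoc = at::index_select(bLoc, 0, idx);

    std::cout << kLoc << std::endl;  // prints the last curSeqLen slots of bLoc
    return 0;
}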

impl/ascend_npu/diopi_impl/functions_ext/token_softmax_reducev.cpp

Lines changed: 2 additions & 2 deletions
@@ -24,11 +24,11 @@ diopiError_t diopiTokenSoftmaxReduceVInference(diopiContextHandle_t ctx, diopiTe
     for (int i = 0; i < batch; ++i) {
         int curSeqLen = bSeqLenAt[i].item<int>();
         int curSeqStartLoc = bStartLocAt[i].item<int>();
-        at::Tensor p = at::index(logicsAt, {at::Tensor(), acl_op::arange(curSeqStartLoc, curSeqStartLoc + curSeqLen, at::kLong, layout, device)})
+        at::Tensor p = at::index(logicsAt, {at::Tensor(), acl_op::arange(curSeqStartLoc, curSeqStartLoc + curSeqLen, at::kInt, layout, device)})
                            .softmax(-1)
                            .reshape({head, 1, 1, curSeqLen})
                            .transpose(0, 1);
-        at::Tensor vLoc = bLocAt[i].index_select(0, acl_op::arange(maxInputLen - curSeqLen, maxInputLen, at::kLong, layout, device));
+        at::Tensor vLoc = bLocAt[i].index_select(0, acl_op::arange(maxInputLen - curSeqLen, maxInputLen, at::kInt, layout, device));
         at::Tensor v = at::index(vAt, {vLoc}).view({1, curSeqLen, head, dim}).transpose(1, 2);
         at::Tensor values = at::matmul(p.toType(at::kFloat), v.toType(at::kFloat)).view({head, dim}).toType(dtype);
         at::index_put_(outAt, {torch::scalar_to_tensor(i)}, values);
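The reduce-V kernel gets the same one-line dtype switch at both of its index-construction sites. For orientation only, a rough CPU sketch of the surrounding computation follows, with hypothetical shapes and a toy identity slot table; at::arange again stands in for acl_op::arange, and the int32 ranges are converted to int64 before Tensor::index purely to keep the sketch portable across LibTorch versions (the actual kernel passes the int32 tensors straight into at::index on the Ascend stack):

// Rough CPU sketch of the softmax-reduceV step, with made-up shapes.
#include <torch/torch.h>
#include <iostream>

int main() {
    const int64_t head = 2, dim = 4;
    const int64_t totalTokens = 10, maxInputLen = 10;
    const int64_t curSeqStartLoc = 2, curSeqLen = 3;

    at::Tensor logics = at::rand({head, totalTokens});    // per-head attention scores
    at::Tensor v = at::rand({totalTokens, head, dim});    // value cache
    at::Tensor bLoc = at::arange(totalTokens, at::kInt);  // toy slot table (identity mapping)

    // Patched lines: both index ranges are now built with int32 aranges.
    at::Tensor cols = at::arange(curSeqStartLoc, curSeqStartLoc + curSeqLen, at::kInt);
    at::Tensor vLoc = bLoc.index_select(0, at::arange(maxInputLen - curSeqLen, maxInputLen, at::kInt));

    using namespace torch::indexing;
    at::Tensor p = logics.index({Slice(), cols.to(at::kLong)})
                       .softmax(-1)
                       .reshape({head, 1, 1, curSeqLen})
                       .transpose(0, 1);                  // [1, head, 1, curSeqLen]
    at::Tensor vSel = v.index({vLoc.to(at::kLong)})
                          .view({1, curSeqLen, head, dim})
                          .transpose(1, 2);               // [1, head, curSeqLen, dim]

    at::Tensor values = at::matmul(p, vSel).view({head, dim});
    std::cout << values.sizes() << std::endl;             // [head, dim]
    return 0;
}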
