From 73439beb1b3511b9b32bc436b9ac9d7426501ef9 Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Sat, 19 Jul 2025 12:57:57 -0400 Subject: [PATCH 1/3] imatrix : use a single count for dense 3d tensors --- tools/imatrix/imatrix.cpp | 58 ++++++++++++++++++++++++--------------- 1 file changed, 36 insertions(+), 22 deletions(-) diff --git a/tools/imatrix/imatrix.cpp b/tools/imatrix/imatrix.cpp index a1f21d7ee56d1..5b0a423722abc 100644 --- a/tools/imatrix/imatrix.cpp +++ b/tools/imatrix/imatrix.cpp @@ -112,13 +112,6 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * const char * data = is_host ? (const char *) src1->data : m_src1_data.data(); GGML_ASSERT(src1->nb[0] == ggml_element_size(src1)); - // TODO: 4d? (is that even used in practice?) - // the extra dimension would need to be stored somewhere to be reflected in the imatrix file - if (ggml_nrows(src1) != src1->ne[1] * src1->ne[2]) { - LOG_ERR("%s: tensor has more than 3 dimensions: %s", __func__, wname.c_str()); - GGML_ASSERT(false); - } - // this has been adapted to the new format of storing merged experts in a single 3d tensor // ref: https://github.com/ggml-org/llama.cpp/pull/6387 if (t->op == GGML_OP_MUL_MAT_ID) { @@ -134,6 +127,13 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * GGML_ASSERT(ids->ne[1] == src1->ne[2]); + // TODO: 4d? (is that even used in practice?) + // the extra dimension would need to be stored somewhere to be reflected in the imatrix file + if (ggml_nrows(src1) != src1->ne[1] * src1->ne[2]) { + LOG_ERR("%s: tensor has more than 3 dimensions: %s", __func__, wname.c_str()); + GGML_ASSERT(false); + } + m_ids.resize(ggml_nbytes(ids)); ggml_backend_tensor_get(ids, m_ids.data(), 0, ggml_nbytes(ids)); @@ -199,19 +199,33 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * auto & e = m_stats[wname]; const int64_t n_mat = src1->ne[2] * src1->ne[3]; + // use a single count per dense tensor + if ((int64_t) e.counts.size() == n_mat) { + bool all_equal = true; + for (size_t i = 1; i < e.counts.size(); ++i) { + if (e.counts[0] != e.counts[i]) { + all_equal = false; + break; + } + } + if (all_equal) { + e.counts.resize(1); + } + } if (e.values.empty()) { e.values.resize(src1->ne[0] * n_mat, 0); - e.counts.resize(n_mat, 0); + e.counts.resize(1, 0); } else if (e.values.size() != (size_t)(src1->ne[0] * n_mat)) { LOG_ERR("%s: inconsistent size for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.values.size(), (int)(src1->ne[0] * n_mat)); exit(1); //GGML_ABORT("fatal error"); } - else if (e.counts.size() != (size_t)n_mat) { - LOG_ERR("%s: inconsistent expert count for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.counts.size(), (int)n_mat); + else if (e.counts.size() != 1) { + LOG_ERR("%s: inconsistent matrix count for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.counts.size(), 1); exit(1); //GGML_ABORT("fatal error"); } LOG_DBGV(2, "%s[%d]: %32s, %s, %5d x %5d x %5d, %d\n", __func__, m_last_chunk, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->ne[2], (int)src1->type); + for (int64_t i3 = 0; i3 < src1->ne[3]; ++i3) { for (int64_t i2 = 0; i2 < src1->ne[2]; ++i2) { const int64_t mat_id = i3 * src1->ne[2] + i2; @@ -219,7 +233,6 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * for (int64_t row = 0; row < src1->ne[1]; ++row) { const float * x = (const float *) (data + row * src1->nb[1] + i2 * src1->nb[2] + i3 * src1->ne[3]); - e.counts[mat_id]++; for (int64_t j = 0; j < src1->ne[0]; ++j) { e.values[mat_start + j] += x[j] * x[j]; if (!std::isfinite((float)e.values[j])) { @@ -228,17 +241,18 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * } } } - const int32_t n_chunk = e.counts[mat_id] / chunk_size; - if (n_chunk > m_last_chunk) { - const int32_t chunk_step = n_chunk - m_last_chunk; - m_last_chunk = n_chunk; - if ((m_last_chunk % m_params.n_out_freq) / chunk_step == 0) { - save_imatrix(); - } - if (m_params.n_save_freq > 0 && (m_last_chunk % m_params.n_save_freq) / chunk_step == 0) { - save_imatrix(m_last_chunk); - } - } + } + } + e.counts[0] += src1->ne[1]; + const int32_t n_chunk = e.counts[0] / chunk_size; + if (n_chunk > m_last_chunk) { + const int32_t chunk_step = n_chunk - m_last_chunk; + m_last_chunk = n_chunk; + if ((m_last_chunk % m_params.n_out_freq) / chunk_step == 0) { + save_imatrix(); + } + if (m_params.n_save_freq > 0 && (m_last_chunk % m_params.n_save_freq) / chunk_step == 0) { + save_imatrix(m_last_chunk); } } } From d4f36e5e2bb3759721300241d99c8e63843b50c9 Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Thu, 31 Jul 2025 11:20:58 -0400 Subject: [PATCH 2/3] imatrix : fix 3d activations when model tensor is 2d --- tools/imatrix/imatrix.cpp | 35 +++++++++++++++++------------------ 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/tools/imatrix/imatrix.cpp b/tools/imatrix/imatrix.cpp index 5b0a423722abc..bc4aacd04d4f2 100644 --- a/tools/imatrix/imatrix.cpp +++ b/tools/imatrix/imatrix.cpp @@ -127,7 +127,6 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * GGML_ASSERT(ids->ne[1] == src1->ne[2]); - // TODO: 4d? (is that even used in practice?) // the extra dimension would need to be stored somewhere to be reflected in the imatrix file if (ggml_nrows(src1) != src1->ne[1] * src1->ne[2]) { LOG_ERR("%s: tensor has more than 3 dimensions: %s", __func__, wname.c_str()); @@ -197,7 +196,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * } } else { auto & e = m_stats[wname]; - const int64_t n_mat = src1->ne[2] * src1->ne[3]; + const int64_t n_mat = src0->ne[2] * src0->ne[3]; // use a single count per dense tensor if ((int64_t) e.counts.size() == n_mat) { @@ -220,19 +219,16 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * LOG_ERR("%s: inconsistent size for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.values.size(), (int)(src1->ne[0] * n_mat)); exit(1); //GGML_ABORT("fatal error"); } - else if (e.counts.size() != 1) { - LOG_ERR("%s: inconsistent matrix count for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.counts.size(), 1); - exit(1); //GGML_ABORT("fatal error"); - } LOG_DBGV(2, "%s[%d]: %32s, %s, %5d x %5d x %5d, %d\n", __func__, m_last_chunk, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->ne[2], (int)src1->type); for (int64_t i3 = 0; i3 < src1->ne[3]; ++i3) { for (int64_t i2 = 0; i2 < src1->ne[2]; ++i2) { - const int64_t mat_id = i3 * src1->ne[2] + i2; + // handle 3D+ tensors, but flatten 3D+ activations when model tensor is 2D + const int64_t mat_id = (i3 % src0->ne[3]) * src0->ne[2] + (i2 % src0->ne[2]); const int64_t mat_start = mat_id * src1->ne[0]; for (int64_t row = 0; row < src1->ne[1]; ++row) { - const float * x = (const float *) (data + row * src1->nb[1] + i2 * src1->nb[2] + i3 * src1->ne[3]); + const float * x = (const float *) (data + row * src1->nb[1] + i2 * src1->nb[2] + i3 * src1->nb[3]); for (int64_t j = 0; j < src1->ne[0]; ++j) { e.values[mat_start + j] += x[j] * x[j]; if (!std::isfinite((float)e.values[j])) { @@ -243,16 +239,19 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * } } } - e.counts[0] += src1->ne[1]; - const int32_t n_chunk = e.counts[0] / chunk_size; - if (n_chunk > m_last_chunk) { - const int32_t chunk_step = n_chunk - m_last_chunk; - m_last_chunk = n_chunk; - if ((m_last_chunk % m_params.n_out_freq) / chunk_step == 0) { - save_imatrix(); - } - if (m_params.n_save_freq > 0 && (m_last_chunk % m_params.n_save_freq) / chunk_step == 0) { - save_imatrix(m_last_chunk); + // only 1 count in practice, except when a tensor is used for both MUL_MAT_ID and MUL_MAT + for (size_t i = 0; i < e.counts.size(); ++i) { + e.counts[i] += ggml_nrows(src1); + const int32_t n_chunk = e.counts[i] / chunk_size; + if (n_chunk > m_last_chunk) { + const int32_t chunk_step = n_chunk - m_last_chunk; + m_last_chunk = n_chunk; + if ((m_last_chunk % m_params.n_out_freq) / chunk_step == 0) { + save_imatrix(); + } + if (m_params.n_save_freq > 0 && (m_last_chunk % m_params.n_save_freq) / chunk_step == 0) { + save_imatrix(m_last_chunk); + } } } } From 91e67b85838df2f6f86b3b92f6f86af4e219023d Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Thu, 31 Jul 2025 11:56:13 -0400 Subject: [PATCH 3/3] imatrix : fix 3d tensor counts --- tools/imatrix/imatrix.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/imatrix/imatrix.cpp b/tools/imatrix/imatrix.cpp index a0f4d7c2ff231..f5262e5e83da9 100644 --- a/tools/imatrix/imatrix.cpp +++ b/tools/imatrix/imatrix.cpp @@ -337,7 +337,8 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * const int64_t n_mat = src0->ne[2] * src0->ne[3]; // use a single count per dense tensor - if ((int64_t) e.counts.size() == n_mat) { + // (necessary when merging older GGUF-imatrix files with 3d tensors) + if (e.counts.size() > 1) { bool all_equal = true; for (size_t i = 1; i < e.counts.size(); ++i) { if (e.counts[0] != e.counts[i]) { @@ -379,7 +380,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * } // only 1 count in practice, except when a tensor is used for both MUL_MAT_ID and MUL_MAT for (size_t i = 0; i < e.counts.size(); ++i) { - e.counts[i] += ggml_nrows(src1); + e.counts[i] += ggml_nrows(src1) / n_mat; const int32_t n_chunk = e.counts[i] / chunk_size; if (n_chunk > m_last_chunk) { const int32_t chunk_step = n_chunk - m_last_chunk;