From 5aacb9c74af6558e8af85403c3cff2cdd57a38f7 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Wed, 29 Oct 2025 18:13:42 +0000 Subject: [PATCH 1/9] Improve: Draft more register-optimal variant --- include/stringzillas/similarities.cuh | 197 +++++++++++++++++++++++++- scripts/test_similarities.cuh | 120 ++++++++++++++++ 2 files changed, 312 insertions(+), 5 deletions(-) diff --git a/include/stringzillas/similarities.cuh b/include/stringzillas/similarities.cuh index f20ae59f..9c0a4882 100644 --- a/include/stringzillas/similarities.cuh +++ b/include/stringzillas/similarities.cuh @@ -1629,7 +1629,6 @@ __global__ void linear_score_on_each_cuda_warp_( // substituter_type_ const substituter, linear_gap_costs_t const gap_costs, // unsigned const shared_memory_size) { - // Simplify usage in higher-level libraries, where wrapping custom allocators may be troublesome. using task_t = task_type_; using char_t = char_type_; using index_t = index_type_; @@ -1674,7 +1673,7 @@ __global__ void linear_score_on_each_cuda_warp_( // size_t const longer_length = task.longer_length; auto &result_ref = task.result; - // We are going to store 3 diagonals of the matrix, assuming each would fit into a single ZMM register. + // We are going to store 3 diagonals of the matrix. // The length of the longest (main) diagonal would be `shorter_dim = (shorter_length + 1)`. unsigned const shorter_dim = static_cast(shorter_length + 1); unsigned const longer_dim = static_cast(longer_length + 1); @@ -1822,7 +1821,6 @@ __global__ void affine_score_on_each_cuda_warp_( // substituter_type_ const substituter, affine_gap_costs_t const gap_costs, // unsigned const shared_memory_size) { - // Simplify usage in higher-level libraries, where wrapping custom allocators may be troublesome. using task_t = task_type_; using char_t = char_type_; using index_t = index_type_; @@ -1867,7 +1865,7 @@ __global__ void affine_score_on_each_cuda_warp_( // size_t const longer_length = task.longer_length; auto &result_ref = task.result; - // We are going to store 3 diagonals of the matrix, assuming each would fit into a single ZMM register. + // We are going to store 3 diagonals of the matrix. // The length of the longest (main) diagonal would be `shorter_dim = (shorter_length + 1)`. unsigned const shorter_dim = static_cast(shorter_length + 1); unsigned const longer_dim = static_cast(longer_length + 1); @@ -2017,6 +2015,177 @@ __global__ void affine_score_on_each_cuda_warp_( // } } +/** + * @brief Levenshtein edit distances algorithm evaluating the Dynamic Programming matrix + * @b three skewed (reverse) diagonals at a time on a GPU, leveraging CUDA for parallelization. + * Each pair of strings gets its own @b "thread" and uses only @b register memory! + * + * @param[in] tasks Tasks containing the strings and output locations. + * @param[in] tasks_count The number of tasks to process. + * @param[in] substituter The substitution costs. + * @param[in] gap_costs The @b linear gap costs. + */ +template < // + typename task_type_, + typename char_type_ = char, // + typename index_type_ = unsigned, // + typename score_type_ = size_t, // + typename substituter_type_ = uniform_substitution_costs_t, // + sz_similarity_objective_t objective_ = sz_maximize_score_k, // + sz_similarity_locality_t locality_ = sz_similarity_global_k, // + sz_capability_t capability_ = sz_cap_cuda_k, // + unsigned max_length_ = 128 // + > +__global__ void linear_score_on_each_cuda_thread_( // + task_type_ *tasks, size_t tasks_count, // + substituter_type_ const substituter, linear_gap_costs_t const gap_costs) { + + using task_t = task_type_; + using char_t = char_type_; + using index_t = index_type_; + using score_t = score_type_; + + static constexpr sz_capability_t capability_k = capability_; + static constexpr sz_similarity_locality_t locality_k = locality_; + static constexpr sz_similarity_objective_t objective_k = objective_; + static_assert(locality_k == sz_similarity_global_k, + "Only global alignments are supported in the register-based implementation."); + + // Unlike other algorithms, where the difference between strings may be significant, + // on smaller strings the scheduling and control-flow overhead takes a larger toll. + // So instead of 3 individual loops processing (1) the top-left triangle, (2) the anti-diagonal band, + // and (3) the bottom-right triangle of the DP matrix, we will just have a square matrix, dropping the + // middle band. + // + // Assuming we don't have enough register memory to store the entire DP matrix, we need a cheap way + // to extract the (shorter_dim, longer_dim) cell on the fly, as soon as we pass the containing diagonal. + // That solves the issue for global alignments, but not the local ones. + static constexpr unsigned max_length_k = max_length_; + static constexpr unsigned padded_diagonals_count_k = max_length_k + max_length_k - 1; + + // Pre-load the substituter and gap costs. + using substituter_t = substituter_type_; + using gap_costs_t = linear_gap_costs_t; + static_assert(std::is_trivially_copyable::value, "Substituter must be trivially copyable."); + static_assert(std::is_trivially_copyable::value, "Gap costs must be trivially copyable."); + + using cuda_warp_scorer_t = tile_scorer; + + // We may have multiple warps operating in the same block. + unsigned const warp_size = warpSize; + size_t const global_thread_index = static_cast(blockIdx.x * blockDim.x + threadIdx.x); + size_t const global_warp_index = static_cast(global_thread_index / warp_size); + size_t const warps_per_block = static_cast(blockDim.x / warp_size); + size_t const warps_per_device = static_cast(gridDim.x * warps_per_block); + size_t const threads_per_device = warps_per_device * warp_size; + unsigned const thread_in_warp_index = static_cast(global_thread_index % warp_size); + + // No shared memory, only registers! + char_t diagonals_registers[3][max_length_k + 1]; + char_t padded_strings_registers[2][max_length_k]; + + // We are computing N edit distances for N pairs of strings. Not a cartesian product! + for (size_t task_idx = global_warp_index; task_idx < tasks_count; task_idx += threads_per_device) { + task_t &task = tasks[task_idx]; + char_t const *shorter_global = task.shorter_ptr; + char_t const *longer_global = task.longer_ptr; + size_t const shorter_length = task.shorter_length; + size_t const longer_length = task.longer_length; + auto &result_ref = task.result; + + // We are going to store 3 diagonals of the matrix. + // The length of the longest (main) diagonal would be `shorter_dim = (shorter_length + 1)`. + unsigned const shorter_dim = static_cast(shorter_length + 1); + unsigned const longer_dim = static_cast(longer_length + 1); + + // Knowing the lengths of both strings, we can pre-compute the relevant diagonal + // and the offset of a relevant entry within that diagonal. + unsigned const relevant_diagonal_index = shorter_dim + longer_dim - 1; + unsigned const relevant_diagonal_offset = max_length_k - shorter_dim; + + // The next few pointers will be swapped around. + score_t *previous_scores = &diagonals_registers[0][0]; + score_t *current_scores = &diagonals_registers[1][0]; + score_t *next_scores = &diagonals_registers[2][0]; + char_t *const longer = &padded_strings_registers[0][0]; + char_t *const shorter = &padded_strings_registers[1][0]; + + // Copy the string into registers once. + for (unsigned i = thread_in_warp_index; i < longer_length; ++i) longer[i] = longer_global[i]; + for (unsigned i = thread_in_warp_index; i < shorter_length; ++i) shorter[i] = shorter_global[i]; + + // Initialize the first two diagonals: + cuda_warp_scorer_t diagonal_aligner {substituter, gap_costs}; + diagonal_aligner.init_score(previous_scores[0], 0); + diagonal_aligner.init_score(current_scores[0], 1); + diagonal_aligner.init_score(current_scores[1], 1); + + // We skip diagonals 0 and 1, as they are trivial. + // We will start with diagonal 2, which has length 3, with the first and last elements being preset, + // so we are effectively computing just one value, as will be marked by a single set bit in + // the `next_diagonal_mask` on the very first iteration. + unsigned next_diagonal_index = 2; + score_t relevant_score = 0; + + // Progress through the upper-left triangle of the Levenshtein matrix. + for (; next_diagonal_index < max_length_k; ++next_diagonal_index) { + + unsigned const next_diagonal_length = next_diagonal_index + 1; + diagonal_aligner( // + shorter, // first sequence of characters + longer, // second sequence of characters + thread_in_warp_index, warp_size, // + next_diagonal_length - 2, // number of elements to compute with the `diagonal_aligner` + previous_scores, // costs pre substitution + current_scores, current_scores + 1, // costs pre insertion/deletion + next_scores + 1); // ! notice unaligned write destination + + // Don't forget to populate the first row and the first column of the Levenshtein matrix. + diagonal_aligner.init_score(next_scores[0], next_diagonal_index); + diagonal_aligner.init_score(next_scores[next_diagonal_length - 1], next_diagonal_index); + + // Extract the relevant score, if we are on the relevant diagonal. + relevant_score = next_diagonal_index == relevant_diagonal_index // + ? next_scores[relevant_diagonal_offset] + : relevant_score; + + // Perform a circular rotation of those buffers, to reuse the memory. + rotate_three(previous_scores, current_scores, next_scores); + } + + // Now let's handle the bottom-right triangle of the matrix. + for (; next_diagonal_index < padded_diagonals_count_k; ++next_diagonal_index) { + + unsigned const next_diagonal_length = padded_diagonals_count_k - next_diagonal_index; + diagonal_aligner( // + shorter + next_diagonal_index - longer_dim, // first sequence of characters + longer + next_diagonal_index - shorter_dim, // second sequence of characters + thread_in_warp_index, warp_size, // + next_diagonal_length, // number of elements to compute with the `diagonal_aligner` + previous_scores, // costs pre substitution + current_scores, current_scores + 1, // costs pre insertion/deletion + next_scores); + + // Perform a circular rotation of those buffers, to reuse the memory. + rotate_three(previous_scores, current_scores, next_scores); + + // Extract the relevant score, if we are on the relevant diagonal. + relevant_score = next_diagonal_index == relevant_diagonal_index // + ? next_scores[relevant_diagonal_offset] + : relevant_score; + + // ! Drop the first entry among the current scores. + // ! Assuming every next diagonal is shorter by one element, + // ! we don't need a full-blown `sz_move` to shift the array by one element. + previous_scores++; + } + + // Export one result per each block. + result_ref = relevant_score; + } +} + #pragma endregion #pragma region - Levenshtein Distance in CUDA @@ -2052,6 +2221,12 @@ struct cuda_similarity_task_ { } constexpr size_t max_diagonal_length() const noexcept { return sz_max_of_two(shorter_length, longer_length) + 1; } + + constexpr bool fits_in_registers() const noexcept { + return // + shorter_length <= 128 && longer_length <= 128 && // ? H100 has 256x 32-bit registers per thread + bytes_per_cell == one_byte_per_cell_k; // ? 2-byte entries require too many registers + } }; /** @@ -2106,7 +2281,7 @@ struct levenshtein_distances; for (size_t i = 0; i < first_strings.size(); ++i) { // Ensure inputs are device-accessible (Unified/Device memory) @@ -2132,6 +2307,18 @@ struct levenshtein_distances; } auto [device_level_tasks, warp_level_tasks, empty_tasks] = diff --git a/scripts/test_similarities.cuh b/scripts/test_similarities.cuh index 528488a5..271063d4 100644 --- a/scripts/test_similarities.cuh +++ b/scripts/test_similarities.cuh @@ -50,6 +50,126 @@ inline std::size_t levenshtein_baseline( // return matrix_buffer.back(); } +/** + * The purpose of this class is to emulate SIMD processing in CUDA with 4-byte-wide words, + * entirely unrolling the entire Levenshtein matrix calculation in diagonal order, only + * using properly aligned loads! + */ +template +struct levenshtein_recursive_algorithm { + + using text_t = char[max_length_]; + using diagonal_scores_t = std::uint8_t[max_length_ + 1]; + + static constexpr unsigned diagonals_count = max_length_ * 2 + 1; + static constexpr unsigned max_diagonal_length = max_length_ + 1; + constexpr unsigned next_diagonal_length = next_diagonal_index_ + 1; + + std::size_t operator()( // + text_t s1, std::size_t len1, text_t s2, std::size_t len2, // + error_cost_t match_cost = 0, error_cost_t mismatch_cost = 1, error_cost_t gap_cost = 1, // + diagonal_scores_t previous, diagonal_scores_t current, diagonal_scores_t next) { + + // Top-left cell of the matrix + if constexpr (next_diagonal_index_ == 0) { + next[0] = 0; + // Recurse to the next diagonal + levenshtein_recursive_algorithm next_run; + return next_run(s1, s2, // same texts as before + match_cost, mismatch_cost, gap_cost, // same costs + current, next, previous); // 3-way rotation of the diagonals + } + // The cells below and next to it + else if constexpr (next_diagonal_index_ == 1) { + next[0] = gap_cost, next[1] = gap_cost; + + // Recurse to the next diagonal + levenshtein_recursive_algorithm next_run; + return next_run(s1, s2, // same texts as before + match_cost, mismatch_cost, gap_cost, // same costs + current, next, previous); // 3-way rotation of the diagonals + } + // We are still in the top-left triangle + else if constexpr (next_diagonal_index_ <= max_length_) { + + constexpr unsigned next_diagonal_length = next_diagonal_index_ + 1; +#pragma unroll + for (unsigned k = 1; k < next_diagonal_length - 1; ++k) { + char first_char = s1[next_diagonal_length - k]; + char second_char = s2[k - 1]; + + error_cost_t substitution_cost = first_char == second_char ? match_cost : mismatch_cost; + error_cost_t if_substitution = previous[k - 1] + substitution_cost; + error_cost_t if_deletion_or_insertion = std::min(current[k], current[k - 1]) + gap_cost; + next[k] = std::min(if_deletion_or_insertion, if_substitution); + } + + // Single-byte stores at the edges of the diagonal - to overwrite the noisy values: + next[0] = gap_cost * next_diagonal_index_; + next[next_diagonal_length - 1] = gap_cost * next_diagonal_index_; + + // Recurse to the next diagonal + levenshtein_recursive_algorithm next_run; + return next_run(s1, s2, // same texts as before + match_cost, mismatch_cost, gap_cost, // same costs + current, next, previous); // 3-way rotation of the diagonals + } + } + // We are in the last diagonal and need to return the bottom-right cell + else if constexpr (next_diagonal_index_ + 1 == diagonals_count) { + char first_char = s1[max_length_ - 1]; + char second_char = s2[max_length_ - 1]; + + error_cost_t substitution_cost = first_char == second_char ? match_cost : mismatch_cost; + error_cost_t if_substitution = previous[0] + substitution_cost; + error_cost_t if_deletion_or_insertion = std::min(current[0], current[1]) + gap_cost; + next[0] = std::min(if_deletion_or_insertion, if_substitution); + + return next[0]; + } + // We are in the bottom-right triangle + else { + + constexpr unsigned next_diagonal_length = diagonals_count - next_diagonal_index_; +#pragma unroll + for (unsigned k = 0; k < next_diagonal_length; ++k) { + char first_char = s1[...]; + char second_char = s2[...]; + + error_cost_t substitution_cost = first_char == second_char ? match_cost : mismatch_cost; + error_cost_t if_substitution = previous[k - 1] + substitution_cost; + error_cost_t if_deletion_or_insertion = std::min(current[k], current[k - 1]) + gap_cost; + next[k] = std::min(if_deletion_or_insertion, if_substitution); + } + + // Recurse to the next diagonal + levenshtein_recursive_algorithm next_run; + return next_run(s1, s2, // same texts as before + match_cost, mismatch_cost, gap_cost, // same costs + current, next, previous); // 3-way rotation of the diagonals + } +}; + +inline std::size_t levenshtein_recursive_baseline( // + char const *s1, std::size_t len1, char const *s2, std::size_t len2, // + error_cost_t match_cost = 0, error_cost_t mismatch_cost = 1, error_cost_t gap_cost = 1) noexcept(false) { + + static constexpr std::size_t max_length = 8; + levenshtein_recursive_algorithm algorithm; + + char s1_padded[max_length] = {0}; + char s2_padded[max_length] = {0}; + std::memcpy(s1_padded, s1, len1); + std::memcpy(s2_padded, s2, len2); + + // Need 3 diagonals for the rotation + std::uint8_t previous[max_length + 1] = {0}; + std::uint8_t current[max_length + 1] = {0}; + std::uint8_t next[max_length + 1] = {0}; + + return algorithm(s1_padded, s2_padded, match_cost, mismatch_cost, gap_cost, previous, current, next); +} + /** * @brief Inefficient baseline Needleman-Wunsch alignment score computation, as implemented in most codebases. * @warning Allocates a new matrix on every call, with rows potentially scattered around memory. From f75405bde3ed29bba128af9cccc8a5cff376720f Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Wed, 29 Oct 2025 20:46:44 +0000 Subject: [PATCH 2/9] Improve: Simplify the template --- CMakeLists.txt | 1 - include/stringzillas/similarities.cuh | 246 +++++++++++--------------- scripts/test_similarities.cuh | 25 +-- 3 files changed, 122 insertions(+), 150 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 79a3cc7a..421aaac7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -702,7 +702,6 @@ function (define_stringzillas_shared target source_file backend_flags) target_include_directories(${target} PRIVATE fork_union/include) target_compile_definitions(${target} PRIVATE "SZ_DYNAMIC_DISPATCH=1") target_compile_definitions(${target} PRIVATE "SZ_AVOID_LIBC=0") - target_compile_definitions(${target} PRIVATE "SZ_DEBUG=0") # Set backend-specific compilation flags foreach (flag ${backend_flags}) diff --git a/include/stringzillas/similarities.cuh b/include/stringzillas/similarities.cuh index 9c0a4882..2064f6fc 100644 --- a/include/stringzillas/similarities.cuh +++ b/include/stringzillas/similarities.cuh @@ -2017,9 +2017,13 @@ __global__ void affine_score_on_each_cuda_warp_( // /** * @brief Levenshtein edit distances algorithm evaluating the Dynamic Programming matrix - * @b three skewed (reverse) diagonals at a time on a GPU, leveraging CUDA for parallelization. + * @b two-rows at a time on a GPU, leveraging CUDA for parallelization. * Each pair of strings gets its own @b "thread" and uses only @b register memory! * + * As each thread receives a different input, the DP matrix size would also be different and we would end up + * with divergent branches. To address this issue, we pad all the inputs to a fixed maximum length, and keep + * a separate variable tracking the passed entry. + * * @param[in] tasks Tasks containing the strings and output locations. * @param[in] tasks_count The number of tasks to process. * @param[in] substituter The substitution costs. @@ -2027,161 +2031,93 @@ __global__ void affine_score_on_each_cuda_warp_( // */ template < // typename task_type_, - typename char_type_ = char, // - typename index_type_ = unsigned, // - typename score_type_ = size_t, // - typename substituter_type_ = uniform_substitution_costs_t, // - sz_similarity_objective_t objective_ = sz_maximize_score_k, // - sz_similarity_locality_t locality_ = sz_similarity_global_k, // - sz_capability_t capability_ = sz_cap_cuda_k, // - unsigned max_length_ = 128 // + typename char_type_ = char, // + typename score_type_ = size_t, // + sz_capability_t capability_ = sz_cap_cuda_k, // + unsigned max_text_length_ = 127 // > -__global__ void linear_score_on_each_cuda_thread_( // - task_type_ *tasks, size_t tasks_count, // - substituter_type_ const substituter, linear_gap_costs_t const gap_costs) { +__global__ void levenshtein_on_each_cuda_thread_( // + task_type_ *tasks, size_t tasks_count, // + uniform_substitution_costs_t const substituter, linear_gap_costs_t const gap_costs) { using task_t = task_type_; using char_t = char_type_; - using index_t = index_type_; using score_t = score_type_; - static constexpr sz_capability_t capability_k = capability_; - static constexpr sz_similarity_locality_t locality_k = locality_; - static constexpr sz_similarity_objective_t objective_k = objective_; - static_assert(locality_k == sz_similarity_global_k, - "Only global alignments are supported in the register-based implementation."); - - // Unlike other algorithms, where the difference between strings may be significant, - // on smaller strings the scheduling and control-flow overhead takes a larger toll. - // So instead of 3 individual loops processing (1) the top-left triangle, (2) the anti-diagonal band, - // and (3) the bottom-right triangle of the DP matrix, we will just have a square matrix, dropping the - // middle band. - // - // Assuming we don't have enough register memory to store the entire DP matrix, we need a cheap way - // to extract the (shorter_dim, longer_dim) cell on the fly, as soon as we pass the containing diagonal. - // That solves the issue for global alignments, but not the local ones. - static constexpr unsigned max_length_k = max_length_; - static constexpr unsigned padded_diagonals_count_k = max_length_k + max_length_k - 1; - - // Pre-load the substituter and gap costs. - using substituter_t = substituter_type_; - using gap_costs_t = linear_gap_costs_t; - static_assert(std::is_trivially_copyable::value, "Substituter must be trivially copyable."); - static_assert(std::is_trivially_copyable::value, "Gap costs must be trivially copyable."); - - using cuda_warp_scorer_t = tile_scorer; - // We may have multiple warps operating in the same block. unsigned const warp_size = warpSize; size_t const global_thread_index = static_cast(blockIdx.x * blockDim.x + threadIdx.x); - size_t const global_warp_index = static_cast(global_thread_index / warp_size); size_t const warps_per_block = static_cast(blockDim.x / warp_size); size_t const warps_per_device = static_cast(gridDim.x * warps_per_block); size_t const threads_per_device = warps_per_device * warp_size; - unsigned const thread_in_warp_index = static_cast(global_thread_index % warp_size); // No shared memory, only registers! - char_t diagonals_registers[3][max_length_k + 1]; - char_t padded_strings_registers[2][max_length_k]; + constexpr unsigned max_text_length_k = max_text_length_; + constexpr unsigned matrix_side_k = max_text_length_k + 1; + + score_t row_registers[2][matrix_side_k]; + char_t first_string_registers[max_text_length_k]; + char_t second_string_registers[max_text_length_k]; // We are computing N edit distances for N pairs of strings. Not a cartesian product! - for (size_t task_idx = global_warp_index; task_idx < tasks_count; task_idx += threads_per_device) { + for (size_t task_idx = global_thread_index; task_idx < tasks_count; task_idx += threads_per_device) { task_t &task = tasks[task_idx]; - char_t const *shorter_global = task.shorter_ptr; - char_t const *longer_global = task.longer_ptr; - size_t const shorter_length = task.shorter_length; - size_t const longer_length = task.longer_length; + char_t const *first_global = task.shorter_ptr; + char_t const *second_global = task.longer_ptr; + size_t const first_length = task.shorter_length; + size_t const second_length = task.longer_length; auto &result_ref = task.result; - // We are going to store 3 diagonals of the matrix. - // The length of the longest (main) diagonal would be `shorter_dim = (shorter_length + 1)`. - unsigned const shorter_dim = static_cast(shorter_length + 1); - unsigned const longer_dim = static_cast(longer_length + 1); - - // Knowing the lengths of both strings, we can pre-compute the relevant diagonal - // and the offset of a relevant entry within that diagonal. - unsigned const relevant_diagonal_index = shorter_dim + longer_dim - 1; - unsigned const relevant_diagonal_offset = max_length_k - shorter_dim; + // We are going to store 2 rows of the matrix. + // The length of the longest (main) diagonal would be `first_dim = (first_length + 1)`. + unsigned const first_dim = static_cast(first_length + 1); + unsigned const second_dim = static_cast(second_length + 1); // The next few pointers will be swapped around. - score_t *previous_scores = &diagonals_registers[0][0]; - score_t *current_scores = &diagonals_registers[1][0]; - score_t *next_scores = &diagonals_registers[2][0]; - char_t *const longer = &padded_strings_registers[0][0]; - char_t *const shorter = &padded_strings_registers[1][0]; + score_t *previous_scores = &row_registers[0][0]; + score_t *current_scores = &row_registers[1][0]; // Copy the string into registers once. - for (unsigned i = thread_in_warp_index; i < longer_length; ++i) longer[i] = longer_global[i]; - for (unsigned i = thread_in_warp_index; i < shorter_length; ++i) shorter[i] = shorter_global[i]; - - // Initialize the first two diagonals: - cuda_warp_scorer_t diagonal_aligner {substituter, gap_costs}; - diagonal_aligner.init_score(previous_scores[0], 0); - diagonal_aligner.init_score(current_scores[0], 1); - diagonal_aligner.init_score(current_scores[1], 1); - - // We skip diagonals 0 and 1, as they are trivial. - // We will start with diagonal 2, which has length 3, with the first and last elements being preset, - // so we are effectively computing just one value, as will be marked by a single set bit in - // the `next_diagonal_mask` on the very first iteration. - unsigned next_diagonal_index = 2; - score_t relevant_score = 0; - - // Progress through the upper-left triangle of the Levenshtein matrix. - for (; next_diagonal_index < max_length_k; ++next_diagonal_index) { - - unsigned const next_diagonal_length = next_diagonal_index + 1; - diagonal_aligner( // - shorter, // first sequence of characters - longer, // second sequence of characters - thread_in_warp_index, warp_size, // - next_diagonal_length - 2, // number of elements to compute with the `diagonal_aligner` - previous_scores, // costs pre substitution - current_scores, current_scores + 1, // costs pre insertion/deletion - next_scores + 1); // ! notice unaligned write destination - - // Don't forget to populate the first row and the first column of the Levenshtein matrix. - diagonal_aligner.init_score(next_scores[0], next_diagonal_index); - diagonal_aligner.init_score(next_scores[next_diagonal_length - 1], next_diagonal_index); - - // Extract the relevant score, if we are on the relevant diagonal. - relevant_score = next_diagonal_index == relevant_diagonal_index // - ? next_scores[relevant_diagonal_offset] - : relevant_score; - - // Perform a circular rotation of those buffers, to reuse the memory. - rotate_three(previous_scores, current_scores, next_scores); + { + unsigned i = 0, j = 0; + for (; i < first_length; ++i) first_string_registers[i] = first_global[i]; + for (; i < matrix_side_k; ++i) first_string_registers[i] = 0xFF; + for (; j < second_length; ++j) second_string_registers[j] = second_global[j]; + for (; j < matrix_side_k; ++j) second_string_registers[j] = 0xFF; } - // Now let's handle the bottom-right triangle of the matrix. - for (; next_diagonal_index < padded_diagonals_count_k; ++next_diagonal_index) { + // Initialize the first row (all elements, including padding): + for (unsigned col_idx = 0; col_idx <= matrix_side_k; ++col_idx) + previous_scores[col_idx] = col_idx * gap_costs.open_or_extend; - unsigned const next_diagonal_length = padded_diagonals_count_k - next_diagonal_index; - diagonal_aligner( // - shorter + next_diagonal_index - longer_dim, // first sequence of characters - longer + next_diagonal_index - shorter_dim, // second sequence of characters - thread_in_warp_index, warp_size, // - next_diagonal_length, // number of elements to compute with the `diagonal_aligner` - previous_scores, // costs pre substitution - current_scores, current_scores + 1, // costs pre insertion/deletion - next_scores); + // We are computing a larger matrix - let's use this scalar for pulling the relevant score out, + // before it's wiped out by subsequent loop iterations. + score_t relevant_score = 0; - // Perform a circular rotation of those buffers, to reuse the memory. - rotate_three(previous_scores, current_scores, next_scores); + // Progress through the matrix row-by-row (compute full padded matrix to avoid divergence): + for (unsigned row_idx = 1; row_idx <= /* second_dim */ matrix_side_k; ++row_idx) { + + // Don't forget to populate the first column of each row: + current_scores[0] = row_idx * gap_costs.open_or_extend; + for (unsigned col_idx = 1; col_idx <= /* first_dim */ matrix_side_k; ++col_idx) { + score_t substitution_cost = substituter( // + second_string_registers[row_idx - 1], // + first_string_registers[col_idx - 1]); + current_scores[col_idx] = sz_min_of_three( // + previous_scores[col_idx - 1] + substitution_cost, // substitution + current_scores[col_idx - 1] + gap_costs.open_or_extend, // insertion + previous_scores[col_idx] + gap_costs.open_or_extend); // deletion + } - // Extract the relevant score, if we are on the relevant diagonal. - relevant_score = next_diagonal_index == relevant_diagonal_index // - ? next_scores[relevant_diagonal_offset] - : relevant_score; + // Extract the actual result at the exact row when we finish computing it + // The following code is akin to a branch: + bool is_relevant_row = row_idx == second_length; + relevant_score = is_relevant_row ? current_scores[first_length] : relevant_score; - // ! Drop the first entry among the current scores. - // ! Assuming every next diagonal is shorter by one element, - // ! we don't need a full-blown `sz_move` to shift the array by one element. - previous_scores++; + // Reuse the memory. + trivial_swap(previous_scores, current_scores); } - // Export one result per each block. result_ref = relevant_score; } } @@ -2312,17 +2248,43 @@ struct levenshtein_distances 0) { + std::partition( // + tasks.begin(), tasks.end(), [](task_t const &task) { return task.fits_in_registers(); }); + auto thread_level_kernel = &levenshtein_on_each_cuda_thread_; + + // Store values in variables since we need their addresses + task_t *tasks_ptr = tasks.data(); + size_t tasks_count = count_register_level_tasks; // Only register-level tasks! + + void *thread_level_kernel_args[4]; + thread_level_kernel_args[0] = (void *)(&tasks_ptr); + thread_level_kernel_args[1] = (void *)(&tasks_count); + thread_level_kernel_args[2] = (void *)(&substituter_); + thread_level_kernel_args[3] = (void *)(&gap_costs_); - linear_score_on_each_cuda_thread_; + // TODO: We can be wiser about the dimensions of this grid. + unsigned const random_block_size = 128; + unsigned const random_blocks_per_multiprocessor = 8; + cudaError_t launch_error = cudaLaunchKernel( // + reinterpret_cast(thread_level_kernel), // Kernel function pointer + dim3(random_blocks_per_multiprocessor * specs.streaming_multiprocessors), // Grid dimensions + dim3(random_block_size), // Block dimensions + thread_level_kernel_args, // Array of kernel argument pointers + 0, // Shared memory per block (in bytes) + executor.stream()); // CUDA stream + if (launch_error != cudaSuccess) { + if (launch_error == cudaErrorMemoryAllocation) return {status_t::bad_alloc_k, launch_error}; + else + return {status_t::unknown_k, launch_error}; + } + } } - auto [device_level_tasks, warp_level_tasks, empty_tasks] = - warp_tasks_grouping({tasks.data(), tasks.size()}, specs); + // Group remaining non-register tasks into device-level and warp-level. + auto [device_level_tasks, warp_level_tasks, empty_tasks] = warp_tasks_grouping( + {tasks.data() + count_register_level_tasks, tasks.size() - count_register_level_tasks}, specs); if (device_level_tasks.size()) { auto device_level_u16_kernel = @@ -2445,6 +2407,13 @@ struct levenshtein_distances(indicative_task.memory_requirement * optimal_density); @@ -2478,18 +2447,17 @@ struct levenshtein_distances matrix_buffer(rows * cols); // Initialize the borders of the matrix. - for (std::size_t i = 0; i < rows; ++i) matrix_buffer[i * cols + 0] /* [i][0] in 2D */ = i * gap_cost; - for (std::size_t j = 0; j < cols; ++j) matrix_buffer[0 * cols + j] /* [0][j] in 2D */ = j * gap_cost; - - for (std::size_t i = 1; i < rows; ++i) { - std::size_t const *last_row = &matrix_buffer[(i - 1) * cols]; - std::size_t *row = &matrix_buffer[i * cols]; - for (std::size_t j = 1; j < cols; ++j) { - std::size_t substitution_cost = (s1[i - 1] == s2[j - 1]) ? match_cost : mismatch_cost; - std::size_t if_deletion_or_insertion = std::min(last_row[j], row[j - 1]) + gap_cost; - row[j] = std::min(if_deletion_or_insertion, last_row[j - 1] + substitution_cost); + for (std::size_t row = 0; row < rows; ++row) matrix_buffer[row * cols + 0] /* [row][0] in 2D */ = row * gap_cost; + for (std::size_t col = 0; col < cols; ++col) matrix_buffer[0 * cols + col] /* [0][col] in 2D */ = col * gap_cost; + + for (std::size_t row = 1; row < rows; ++row) { + std::size_t const *last_row_buffer = &matrix_buffer[(row - 1) * cols]; + std::size_t *current_row_buffer = &matrix_buffer[row * cols]; + for (std::size_t col = 1; col < cols; ++col) { + std::size_t substitution_cost = (s1[row - 1] == s2[col - 1]) ? match_cost : mismatch_cost; + std::size_t if_deletion_or_insertion = + std::min(last_row_buffer[col], current_row_buffer[col - 1]) + gap_cost; + current_row_buffer[col] = std::min(if_deletion_or_insertion, last_row_buffer[col - 1] + substitution_cost); } } return matrix_buffer.back(); } +#if 0 // ! Coming later :) + /** * The purpose of this class is to emulate SIMD processing in CUDA with 4-byte-wide words, * entirely unrolling the entire Levenshtein matrix calculation in diagonal order, only @@ -170,6 +173,8 @@ inline std::size_t levenshtein_recursive_baseline( // return algorithm(s1_padded, s2_padded, match_cost, mismatch_cost, gap_cost, previous, current, next); } +#endif // ! Coming later :) + /** * @brief Inefficient baseline Needleman-Wunsch alignment score computation, as implemented in most codebases. * @warning Allocates a new matrix on every call, with rows potentially scattered around memory. From 8bd04bca43eb2e7446eb222046936b4eba37f51d Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Wed, 29 Oct 2025 20:47:12 +0000 Subject: [PATCH 3/9] Improve: Cover single-byte slots by benchmarks --- .vscode/launch.json | 10 +++++++++- scripts/bench_similarities.cuh | 4 ++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/.vscode/launch.json b/.vscode/launch.json index 985fa924..bd31386c 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -176,9 +176,17 @@ "program": "${workspaceFolder}/build_debug/stringzillas_${fileBasenameNoExtension}_cu20", "cwd": "${workspaceFolder}", "environment": [ + { + "name": "STRINGWARS_STRESS", + "value": "0" + }, + { + "name": "STRINGWARS_FILTER", + "value": "levenshtein_cuda:batch64" + }, { "name": "STRINGWARS_DATASET", - "value": "leipzig1M.txt" + "value": "acgt_100.txt" } ], "stopAtEntry": false, diff --git a/scripts/bench_similarities.cuh b/scripts/bench_similarities.cuh index 82d805ac..b3b97702 100644 --- a/scripts/bench_similarities.cuh +++ b/scripts/bench_similarities.cuh @@ -112,9 +112,9 @@ void bench_levenshtein(environment_t const &env) { sz_unused_(scramble_accelerated_results); // Let's define some weird scoring schemes for Levenshtein-like distance, that are not unary: - constexpr linear_gap_costs_t weird_linear {3}; + constexpr linear_gap_costs_t weird_linear {2}; constexpr affine_gap_costs_t weird_affine {4, 2}; - constexpr uniform_substitution_costs_t weird_uniform {1, 3}; + constexpr uniform_substitution_costs_t weird_uniform {1, 2}; for (std::size_t batch_size : batch_sizes) { results_linear_baseline.resize(batch_size), results_linear_accelerated.resize(batch_size); From b42ce43a820637c76ec0a76ae15cd7733e91fb9e Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Thu, 30 Oct 2025 16:10:04 +0000 Subject: [PATCH 4/9] Fix: Matrix out-of-bound access Release builds still crash --- include/stringzillas/similarities.cuh | 45 ++++++++++++++------------- 1 file changed, 24 insertions(+), 21 deletions(-) diff --git a/include/stringzillas/similarities.cuh b/include/stringzillas/similarities.cuh index 2064f6fc..5b89952b 100644 --- a/include/stringzillas/similarities.cuh +++ b/include/stringzillas/similarities.cuh @@ -2015,6 +2015,8 @@ __global__ void affine_score_on_each_cuda_warp_( // } } +inline static constexpr unsigned levenshtein_on_each_cuda_thread_default_text_limit_k = 127; + /** * @brief Levenshtein edit distances algorithm evaluating the Dynamic Programming matrix * @b two-rows at a time on a GPU, leveraging CUDA for parallelization. @@ -2031,10 +2033,10 @@ __global__ void affine_score_on_each_cuda_warp_( // */ template < // typename task_type_, - typename char_type_ = char, // - typename score_type_ = size_t, // - sz_capability_t capability_ = sz_cap_cuda_k, // - unsigned max_text_length_ = 127 // + typename char_type_ = char, // + typename score_type_ = size_t, // + sz_capability_t capability_ = sz_cap_cuda_k, // + unsigned max_text_length_ = levenshtein_on_each_cuda_thread_default_text_limit_k // > __global__ void levenshtein_on_each_cuda_thread_( // task_type_ *tasks, size_t tasks_count, // @@ -2069,15 +2071,10 @@ __global__ void levenshtein_on_each_cuda_thread_( // auto &result_ref = task.result; // We are going to store 2 rows of the matrix. - // The length of the longest (main) diagonal would be `first_dim = (first_length + 1)`. - unsigned const first_dim = static_cast(first_length + 1); - unsigned const second_dim = static_cast(second_length + 1); - - // The next few pointers will be swapped around. score_t *previous_scores = &row_registers[0][0]; score_t *current_scores = &row_registers[1][0]; - // Copy the string into registers once. + // Copy the strings into registers once { unsigned i = 0, j = 0; for (; i < first_length; ++i) first_string_registers[i] = first_global[i]; @@ -2087,19 +2084,20 @@ __global__ void levenshtein_on_each_cuda_thread_( // } // Initialize the first row (all elements, including padding): - for (unsigned col_idx = 0; col_idx <= matrix_side_k; ++col_idx) + for (unsigned col_idx = 0; col_idx < matrix_side_k; ++col_idx) previous_scores[col_idx] = col_idx * gap_costs.open_or_extend; - // We are computing a larger matrix - let's use this scalar for pulling the relevant score out, - // before it's wiped out by subsequent loop iterations. + // We are computing a larger matrix - use this scalar for pulling the relevant score out score_t relevant_score = 0; // Progress through the matrix row-by-row (compute full padded matrix to avoid divergence): - for (unsigned row_idx = 1; row_idx <= /* second_dim */ matrix_side_k; ++row_idx) { - +#pragma unroll + for (unsigned row_idx = 1; row_idx < matrix_side_k; ++row_idx) { // Don't forget to populate the first column of each row: current_scores[0] = row_idx * gap_costs.open_or_extend; - for (unsigned col_idx = 1; col_idx <= /* first_dim */ matrix_side_k; ++col_idx) { + +#pragma unroll + for (unsigned col_idx = 1; col_idx < matrix_side_k; ++col_idx) { score_t substitution_cost = substituter( // second_string_registers[row_idx - 1], // first_string_registers[col_idx - 1]); @@ -2114,7 +2112,7 @@ __global__ void levenshtein_on_each_cuda_thread_( // bool is_relevant_row = row_idx == second_length; relevant_score = is_relevant_row ? current_scores[first_length] : relevant_score; - // Reuse the memory. + // Reuse the memory trivial_swap(previous_scores, current_scores); } @@ -2159,9 +2157,12 @@ struct cuda_similarity_task_ { constexpr size_t max_diagonal_length() const noexcept { return sz_max_of_two(shorter_length, longer_length) + 1; } constexpr bool fits_in_registers() const noexcept { - return // - shorter_length <= 128 && longer_length <= 128 && // ? H100 has 256x 32-bit registers per thread - bytes_per_cell == one_byte_per_cell_k; // ? 2-byte entries require too many registers + return + // ? 2-byte entries require too many registers + bytes_per_cell == one_byte_per_cell_k && + // ? H100 has 256x 32-bit registers per thread + shorter_length <= levenshtein_on_each_cuda_thread_default_text_limit_k && + longer_length <= levenshtein_on_each_cuda_thread_default_text_limit_k; } }; @@ -2252,7 +2253,9 @@ struct levenshtein_distances 0) { std::partition( // tasks.begin(), tasks.end(), [](task_t const &task) { return task.fits_in_registers(); }); - auto thread_level_kernel = &levenshtein_on_each_cuda_thread_; + auto thread_level_kernel = + &levenshtein_on_each_cuda_thread_; // Store values in variables since we need their addresses task_t *tasks_ptr = tasks.data(); From d9aa7fdce1ec77f71643ef795f4f1d2b562d4dc0 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Thu, 30 Oct 2025 17:00:25 +0000 Subject: [PATCH 5/9] Improve: 1-row Wagner-Fisher for minimum register pressure Still crashes in release builds --- include/stringzillas/similarities.cuh | 62 +++++++++++---------------- 1 file changed, 25 insertions(+), 37 deletions(-) diff --git a/include/stringzillas/similarities.cuh b/include/stringzillas/similarities.cuh index 5b89952b..d74c8a74 100644 --- a/include/stringzillas/similarities.cuh +++ b/include/stringzillas/similarities.cuh @@ -2015,7 +2015,7 @@ __global__ void affine_score_on_each_cuda_warp_( // } } -inline static constexpr unsigned levenshtein_on_each_cuda_thread_default_text_limit_k = 127; +inline static constexpr unsigned levenshtein_on_each_cuda_thread_default_text_limit_k = 31; /** * @brief Levenshtein edit distances algorithm evaluating the Dynamic Programming matrix @@ -2057,9 +2057,8 @@ __global__ void levenshtein_on_each_cuda_thread_( // constexpr unsigned max_text_length_k = max_text_length_; constexpr unsigned matrix_side_k = max_text_length_k + 1; - score_t row_registers[2][matrix_side_k]; + score_t row_registers[matrix_side_k]; char_t first_string_registers[max_text_length_k]; - char_t second_string_registers[max_text_length_k]; // We are computing N edit distances for N pairs of strings. Not a cartesian product! for (size_t task_idx = global_thread_index; task_idx < tasks_count; task_idx += threads_per_device) { @@ -2070,50 +2069,39 @@ __global__ void levenshtein_on_each_cuda_thread_( // size_t const second_length = task.longer_length; auto &result_ref = task.result; - // We are going to store 2 rows of the matrix. - score_t *previous_scores = &row_registers[0][0]; - score_t *current_scores = &row_registers[1][0]; + // We are going to store 1 row of the matrix (one-row Wagner-Fischer algorithm) + for (unsigned i = 0; i < first_length; ++i) first_string_registers[i] = first_global[i]; - // Copy the strings into registers once - { - unsigned i = 0, j = 0; - for (; i < first_length; ++i) first_string_registers[i] = first_global[i]; - for (; i < matrix_side_k; ++i) first_string_registers[i] = 0xFF; - for (; j < second_length; ++j) second_string_registers[j] = second_global[j]; - for (; j < matrix_side_k; ++j) second_string_registers[j] = 0xFF; - } - - // Initialize the first row (all elements, including padding): + // Initialize the first row for (unsigned col_idx = 0; col_idx < matrix_side_k; ++col_idx) - previous_scores[col_idx] = col_idx * gap_costs.open_or_extend; + row_registers[col_idx] = col_idx * gap_costs.open_or_extend; - // We are computing a larger matrix - use this scalar for pulling the relevant score out score_t relevant_score = 0; - // Progress through the matrix row-by-row (compute full padded matrix to avoid divergence): -#pragma unroll - for (unsigned row_idx = 1; row_idx < matrix_side_k; ++row_idx) { - // Don't forget to populate the first column of each row: - current_scores[0] = row_idx * gap_costs.open_or_extend; + // Outer loop: runtime bound (no unroll to avoid code explosion) + for (unsigned row_idx = 1; row_idx <= second_length; ++row_idx) { + score_t diagonal_value = row_registers[0]; // Save top-left corner (previous row, previous col) + row_registers[0] = row_idx * gap_costs.open_or_extend; // First column -#pragma unroll - for (unsigned col_idx = 1; col_idx < matrix_side_k; ++col_idx) { - score_t substitution_cost = substituter( // - second_string_registers[row_idx - 1], // + // Inner loop: no unrolling to minimize register pressure + for (unsigned col_idx = 1; col_idx <= first_length; ++col_idx) { + score_t top_left = diagonal_value; // This is previous_row[col-1] + diagonal_value = row_registers[col_idx]; // Save current value before overwriting + + // Read second string from global memory (only once per row) + score_t substitution_cost = substituter( // + second_global[row_idx - 1], // first_string_registers[col_idx - 1]); - current_scores[col_idx] = sz_min_of_three( // - previous_scores[col_idx - 1] + substitution_cost, // substitution - current_scores[col_idx - 1] + gap_costs.open_or_extend, // insertion - previous_scores[col_idx] + gap_costs.open_or_extend); // deletion + + row_registers[col_idx] = sz_min_of_three( // + top_left + substitution_cost, // diagonal (substitution) + row_registers[col_idx - 1] + gap_costs.open_or_extend, // left (insertion) + diagonal_value + gap_costs.open_or_extend); // top (deletion) } - // Extract the actual result at the exact row when we finish computing it - // The following code is akin to a branch: + // Extract result at the exact position bool is_relevant_row = row_idx == second_length; - relevant_score = is_relevant_row ? current_scores[first_length] : relevant_score; - - // Reuse the memory - trivial_swap(previous_scores, current_scores); + relevant_score = is_relevant_row ? row_registers[first_length] : relevant_score; } result_ref = relevant_score; From 13628a41f0adf12544f46b6ca4bea6468a926e29 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Thu, 30 Oct 2025 19:06:41 +0000 Subject: [PATCH 6/9] Fix: Cache second string char Yields 4x improvement --- include/stringzillas/similarities.cuh | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/include/stringzillas/similarities.cuh b/include/stringzillas/similarities.cuh index d74c8a74..880d7f99 100644 --- a/include/stringzillas/similarities.cuh +++ b/include/stringzillas/similarities.cuh @@ -2015,7 +2015,7 @@ __global__ void affine_score_on_each_cuda_warp_( // } } -inline static constexpr unsigned levenshtein_on_each_cuda_thread_default_text_limit_k = 31; +inline static constexpr unsigned levenshtein_on_each_cuda_thread_default_text_limit_k = 127; /** * @brief Levenshtein edit distances algorithm evaluating the Dynamic Programming matrix @@ -2080,6 +2080,9 @@ __global__ void levenshtein_on_each_cuda_thread_( // // Outer loop: runtime bound (no unroll to avoid code explosion) for (unsigned row_idx = 1; row_idx <= second_length; ++row_idx) { + // Cache second_char in scalar (read once per row, used for all columns) + char_t second_char = second_global[row_idx - 1]; + score_t diagonal_value = row_registers[0]; // Save top-left corner (previous row, previous col) row_registers[0] = row_idx * gap_costs.open_or_extend; // First column @@ -2088,9 +2091,9 @@ __global__ void levenshtein_on_each_cuda_thread_( // score_t top_left = diagonal_value; // This is previous_row[col-1] diagonal_value = row_registers[col_idx]; // Save current value before overwriting - // Read second string from global memory (only once per row) + // Use cached second_char (read once, used first_length times) score_t substitution_cost = substituter( // - second_global[row_idx - 1], // + second_char, // Cached in scalar first_string_registers[col_idx - 1]); row_registers[col_idx] = sz_min_of_three( // From 4efb2c44aec4b582abb47630a252f04bfc5bcab9 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Thu, 30 Oct 2025 23:57:57 +0000 Subject: [PATCH 7/9] Improve: Register-optimal Levenshtein algorithm --- include/stringzillas/similarities.cuh | 282 +++++++++++++++++++++----- 1 file changed, 233 insertions(+), 49 deletions(-) diff --git a/include/stringzillas/similarities.cuh b/include/stringzillas/similarities.cuh index 880d7f99..effc3851 100644 --- a/include/stringzillas/similarities.cuh +++ b/include/stringzillas/similarities.cuh @@ -2015,7 +2015,229 @@ __global__ void affine_score_on_each_cuda_warp_( // } } -inline static constexpr unsigned levenshtein_on_each_cuda_thread_default_text_limit_k = 127; +inline static constexpr unsigned levenshtein_on_each_cuda_thread_default_text_limit_k = 128; + +/** + * @brief Byte-wise equality comparison: returns 0xFF per byte where equal, 0x00 where different. + * Example: vcmpeq4(0x12345678, 0x12FF5678) = 0xFF00FFFF + */ +__forceinline__ __device__ __host__ sz_u32_t sz_u32_vcmpeq4_(sz_u32_t a, sz_u32_t b) { +#ifdef __CUDA_ARCH__ + return __vcmpeq4(a, b); +#else + sz_u32_t result = 0; + for (int i = 0; i < 4; ++i) { + sz_u8_t byte_a = (a >> (i * 8)) & 0xFF; + sz_u8_t byte_b = (b >> (i * 8)) & 0xFF; + if (byte_a == byte_b) result |= 0xFFu << (i * 8); + } + return result; +#endif +} + +/** + * @brief Byte-wise unsigned minimum. + * Example: vminu4(0x12345678, 0x34127856) = 0x12125656 + */ +__forceinline__ __device__ __host__ sz_u32_t sz_u32_vminu4_(sz_u32_t a, sz_u32_t b) { +#ifdef __CUDA_ARCH__ + return __vminu4(a, b); +#else + sz_u32_t result = 0; + for (int i = 0; i < 4; ++i) { + sz_u8_t byte_a = (a >> (i * 8)) & 0xFF; + sz_u8_t byte_b = (b >> (i * 8)) & 0xFF; + sz_u8_t min_byte = (byte_a < byte_b) ? byte_a : byte_b; + result |= (sz_u32_t)min_byte << (i * 8); + } + return result; +#endif +} + +/** + * @brief Byte-wise saturating addition (clamps at 0xFF). + * Example: vaddus4(0xFE020304, 0x03020100) = 0xFF040404 + */ +__forceinline__ __device__ __host__ sz_u32_t sz_u32_vaddus4_(sz_u32_t a, sz_u32_t b) { +#ifdef __CUDA_ARCH__ + return __vaddus4(a, b); +#else + sz_u32_t result = 0; + for (int i = 0; i < 4; ++i) { + sz_u8_t byte_a = (a >> (i * 8)) & 0xFF; + sz_u8_t byte_b = (b >> (i * 8)) & 0xFF; + sz_u32_t sum = byte_a + byte_b; + sz_u8_t sat_byte = (sum > 0xFF) ? 0xFF : (sz_u8_t)sum; + result |= (sz_u32_t)sat_byte << (i * 8); + } + return result; +#endif +} + +/** + * @brief Byte permutation: select 4 bytes from 8-byte source {x[0..3], y[0..3]}. + * Selector format (nibbles): each 4-bit value selects one of 8 source bytes. + * + * Selector nibbles: [3] [2] [1] [0] (hex digits in selector) + * Result bytes: byte3 byte2 byte1 byte0 + * + * Example: byte_perm(0x44332211, 0x88776655, 0x6543) + * Source: [11 22 33 44 | 55 66 77 88] (indices 0-7) + * Selector 0x6543 = nibbles [6,5,4,3] + * Result: [77 66 55 44] = 0x77665544 + */ +__forceinline__ __device__ __host__ sz_u32_t sz_u32_byte_perm_(sz_u32_t x, sz_u32_t y, sz_u32_t selector) { +#ifdef __CUDA_ARCH__ + return __byte_perm(x, y, selector); +#else + sz_u8_t source[8]; + for (int i = 0; i < 4; ++i) { + source[i] = (x >> (i * 8)) & 0xFF; + source[i + 4] = (y >> (i * 8)) & 0xFF; + } + + sz_u32_t result = 0; + for (int i = 0; i < 4; ++i) { + sz_u8_t sel = (selector >> (i * 4)) & 0x7; + result |= (sz_u32_t)source[sel] << (i * 8); + } + return result; +#endif +} + +/** + * @brief Register-only Levenshtein distance for strings up to 128 bytes using SIMD operations. + * + * Implements the Wagner-Fischer algorithm using a single-row optimization where only the current + * row of the DP matrix is stored in registers. The longer input string is also cached in registers + * to avoid repeated memory access. This design targets GPU architectures (H100) where register + * memory is abundant but shared/global memory access is expensive. The algorithm processes 4 columns + * at a time by packing them into uint32 vectors and using video instructions (vcmpeq4, vminu4, + * vaddus4, byte_perm) for parallel byte-wise operations. + * + * Each iteration computes one row by: (1) comparing the current character against 4 cached characters + * in parallel, (2) computing vertical and diagonal dependencies using SIMD min/add, and (3) propagating + * horizontal dependencies through 4 sequential iterations within each uint32. The result is extracted + * from the final row vector at the position corresponding to the longer string's length. Total register + * usage is approximately 70 registers (256 bytes for arrays plus temporaries). + * + * Register layout (max_text_length_ = 128): + * + * row_vec_[32] : current DP row, 4 cells per uint32 (128 bytes) + * longer_string_vec_[32] : cached string, 4 chars per uint32 (128 bytes) + * + * Processing order per uint32: + * + * 1. Compare: match[0..3] = (char == longer[0..3]) [parallel] + * 2. Vertical: r[0..3] = min(top[0..3]+gap, diag[0..3]+cost) [parallel] + * 3. Horizontal: r[0]=min(r[0],left+gap), r[1]=min(r[1],r[0]+gap), ... [sequential] + */ +template +struct register_optimal_levenshtein { + static constexpr unsigned max_text_length_k = max_text_length_; + static constexpr unsigned vec_count_k = max_text_length_k / sizeof(sz_u32_vec_t); + + sz_u32_vec_t row_vec_[vec_count_k]; + sz_u32_vec_t longer_string_vec_[vec_count_k]; + + __forceinline__ __device__ __host__ sz_u8_t operator()( // + sz_u8_t const *longer_string, unsigned longer_length, // + sz_u8_t const *shorter_string, unsigned shorter_length, // + uniform_substitution_costs_t const substituter, linear_gap_costs_t const gap_costs) { + + // Initialize the first row with the vectorized variant of: + // for (unsigned col_idx = 0; col_idx < matrix_side_k; ++col_idx) + // row_registers[col_idx] = col_idx * gap_costs.open_or_extend; + for (unsigned i = 0, running_gap = gap_costs.open_or_extend; i < vec_count_k; ++i) { + row_vec_[i].u32 = running_gap; + running_gap += gap_costs.open_or_extend; + row_vec_[i].u32 |= running_gap << 8; + running_gap += gap_costs.open_or_extend; + row_vec_[i].u32 |= running_gap << 16; + running_gap += gap_costs.open_or_extend; + row_vec_[i].u32 |= running_gap << 24; + running_gap += gap_costs.open_or_extend; + } + + // Load longer string into vector array (stays in registers, accessed in inner loop) + for (unsigned i = 0; i < longer_length; ++i) longer_string_vec_[0].u8s[i] = longer_string[i]; + + // Broadcast costs to all 4 bytes per uint32 + error_cost_t const gap_cost = gap_costs.open_or_extend; + error_cost_t const match_cost = substituter.match; + error_cost_t const mismatch_cost = substituter.mismatch; + sz_u32_vec_t gap_vec, match_vec, mismatch_vec; + gap_vec.u32 = gap_cost | (gap_cost << 8) | (gap_cost << 16) | (gap_cost << 24); + match_vec.u32 = match_cost | (match_cost << 8) | (match_cost << 16) | (match_cost << 24); + mismatch_vec.u32 = mismatch_cost | (mismatch_cost << 8) | (mismatch_cost << 16) | (mismatch_cost << 24); + + // Outer loop: iterate over shorter string (fewer iterations) + sz_u32_vec_t shorter_char_vec; + for (unsigned row_idx = 1; row_idx <= shorter_length; ++row_idx) { + // Load one character from shorter string and broadcast to all 4 bytes + sz_u8_t const shorter_char = shorter_string[row_idx - 1]; + shorter_char_vec.u32 = shorter_char | (shorter_char << 8) | (shorter_char << 16) | (shorter_char << 24); + + // Column 0 values (not stored in row_vec_) + sz_u8_t col0_prev = (row_idx - 1) * gap_cost; + sz_u8_t col0_curr = row_idx * gap_cost; + + // Broadcast col0_prev for diagonal construction + sz_u32_t prev_u32vec = col0_prev | (col0_prev << 8) | (col0_prev << 16) | (col0_prev << 24); + +#pragma unroll(vec_count_k) + // Inner loop: process longer string 4 bytes at a time + for (unsigned vec_idx = 0; vec_idx < vec_count_k; ++vec_idx) { + // Load DP values from previous row ("top" in DP matrix) + sz_u32_t top_u32vec = row_vec_[vec_idx].u32; + + // Construct diagonal: {prev[byte3], top[byte0], top[byte1], top[byte2]} + sz_u32_t diag_u32vec = sz_u32_byte_perm_(prev_u32vec, top_u32vec, 0x6543); + + // Compare shorter_char with 4 chars from longer string + sz_u32_t longer_u32vec = longer_string_vec_[vec_idx].u32; + sz_u32_t match_u32vec = sz_u32_vcmpeq4_(shorter_char_vec.u32, longer_u32vec); + + // Blend match_cost/mismatch_cost using match mask + // match_u32vec = 0xFF (match) or 0x00 (mismatch) + sz_u32_t substitutions_u32vec = (match_vec.u32 & match_u32vec) | (mismatch_vec.u32 & ~match_u32vec); + + // DP recurrence: min(diagonal + subst, top + gap, left + gap) + sz_u32_t from_diag_u32vec = sz_u32_vaddus4_(diag_u32vec, substitutions_u32vec); + sz_u32_t from_top_u32vec = sz_u32_vaddus4_(top_u32vec, gap_vec.u32); + sz_u32_t result_u32vec = sz_u32_vminu4_(from_diag_u32vec, from_top_u32vec); + + // Propagate left dependency across 4 bytes (sequential scan) + sz_u32_t left_source = (vec_idx == 0) ? col0_curr : (row_vec_[vec_idx - 1].u32 >> 24); + sz_u32_t left_u32vec = left_source | (left_source << 8) | (left_source << 16) | (left_source << 24); + + // 4 iterations to propagate left-to-right within uint32: + // After iter N, bytes [0..N] are finalized + sz_u32_t shifted_u32vec = sz_u32_byte_perm_(left_u32vec, result_u32vec, 0x6540); + result_u32vec = sz_u32_vminu4_(result_u32vec, sz_u32_vaddus4_(shifted_u32vec, gap_vec.u32)); + + shifted_u32vec = sz_u32_byte_perm_(result_u32vec, result_u32vec, 0x2100); + result_u32vec = sz_u32_vminu4_(result_u32vec, sz_u32_vaddus4_(shifted_u32vec, gap_vec.u32)); + + shifted_u32vec = sz_u32_byte_perm_(result_u32vec, result_u32vec, 0x2110); + result_u32vec = sz_u32_vminu4_(result_u32vec, sz_u32_vaddus4_(shifted_u32vec, gap_vec.u32)); + + shifted_u32vec = sz_u32_byte_perm_(result_u32vec, result_u32vec, 0x2221); + result_u32vec = sz_u32_vminu4_(result_u32vec, sz_u32_vaddus4_(shifted_u32vec, gap_vec.u32)); + + prev_u32vec = top_u32vec; + row_vec_[vec_idx].u32 = result_u32vec; + } + } + + // Extract result once after loop completes + unsigned result_vec_idx = (longer_length - 1) / 4; + unsigned result_byte_idx = (longer_length - 1) % 4; + sz_u32_t result_vec = row_vec_[result_vec_idx].u32; + sz_u8_t relevant_score = (result_vec >> (result_byte_idx * 8)) & 0xFF; + return relevant_score; + } +}; /** * @brief Levenshtein edit distances algorithm evaluating the Dynamic Programming matrix @@ -2053,61 +2275,23 @@ __global__ void levenshtein_on_each_cuda_thread_( // size_t const warps_per_device = static_cast(gridDim.x * warps_per_block); size_t const threads_per_device = warps_per_device * warp_size; - // No shared memory, only registers! - constexpr unsigned max_text_length_k = max_text_length_; - constexpr unsigned matrix_side_k = max_text_length_k + 1; - - score_t row_registers[matrix_side_k]; - char_t first_string_registers[max_text_length_k]; + // Instantiate register-optimal Levenshtein struct (processes 4 bytes at a time) + register_optimal_levenshtein levenshtein_computer; // We are computing N edit distances for N pairs of strings. Not a cartesian product! for (size_t task_idx = global_thread_index; task_idx < tasks_count; task_idx += threads_per_device) { task_t &task = tasks[task_idx]; - char_t const *first_global = task.shorter_ptr; - char_t const *second_global = task.longer_ptr; - size_t const first_length = task.shorter_length; - size_t const second_length = task.longer_length; + sz_u8_t const *shorter_ptr = reinterpret_cast(task.shorter_ptr); + sz_u8_t const *longer_ptr = reinterpret_cast(task.longer_ptr); + unsigned const shorter_length = task.shorter_length; + unsigned const longer_length = task.longer_length; auto &result_ref = task.result; - // We are going to store 1 row of the matrix (one-row Wagner-Fischer algorithm) - for (unsigned i = 0; i < first_length; ++i) first_string_registers[i] = first_global[i]; - - // Initialize the first row - for (unsigned col_idx = 0; col_idx < matrix_side_k; ++col_idx) - row_registers[col_idx] = col_idx * gap_costs.open_or_extend; - - score_t relevant_score = 0; - - // Outer loop: runtime bound (no unroll to avoid code explosion) - for (unsigned row_idx = 1; row_idx <= second_length; ++row_idx) { - // Cache second_char in scalar (read once per row, used for all columns) - char_t second_char = second_global[row_idx - 1]; - - score_t diagonal_value = row_registers[0]; // Save top-left corner (previous row, previous col) - row_registers[0] = row_idx * gap_costs.open_or_extend; // First column - - // Inner loop: no unrolling to minimize register pressure - for (unsigned col_idx = 1; col_idx <= first_length; ++col_idx) { - score_t top_left = diagonal_value; // This is previous_row[col-1] - diagonal_value = row_registers[col_idx]; // Save current value before overwriting - - // Use cached second_char (read once, used first_length times) - score_t substitution_cost = substituter( // - second_char, // Cached in scalar - first_string_registers[col_idx - 1]); - - row_registers[col_idx] = sz_min_of_three( // - top_left + substitution_cost, // diagonal (substitution) - row_registers[col_idx - 1] + gap_costs.open_or_extend, // left (insertion) - diagonal_value + gap_costs.open_or_extend); // top (deletion) - } - - // Extract result at the exact position - bool is_relevant_row = row_idx == second_length; - relevant_score = is_relevant_row ? row_registers[first_length] : relevant_score; - } + // Call SIMD/SWAR optimized Levenshtein distance computation + sz_u8_t distance = + levenshtein_computer(longer_ptr, longer_length, shorter_ptr, shorter_length, substituter, gap_costs); - result_ref = relevant_score; + result_ref = distance; } } From 376dc13e6fae1e7fc9642989d06e6c9a76d380d5 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Fri, 31 Oct 2025 00:04:28 +0000 Subject: [PATCH 8/9] Add: Desperate `__launch_bounds__` attempt --- include/stringzillas/similarities.cuh | 29 ++++++++++++++------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/include/stringzillas/similarities.cuh b/include/stringzillas/similarities.cuh index effc3851..5c363d7d 100644 --- a/include/stringzillas/similarities.cuh +++ b/include/stringzillas/similarities.cuh @@ -2185,7 +2185,6 @@ struct register_optimal_levenshtein { // Broadcast col0_prev for diagonal construction sz_u32_t prev_u32vec = col0_prev | (col0_prev << 8) | (col0_prev << 16) | (col0_prev << 24); -#pragma unroll(vec_count_k) // Inner loop: process longer string 4 bytes at a time for (unsigned vec_idx = 0; vec_idx < vec_count_k; ++vec_idx) { // Load DP values from previous row ("top" in DP matrix) @@ -2260,9 +2259,10 @@ template < // sz_capability_t capability_ = sz_cap_cuda_k, // unsigned max_text_length_ = levenshtein_on_each_cuda_thread_default_text_limit_k // > -__global__ void levenshtein_on_each_cuda_thread_( // - task_type_ *tasks, size_t tasks_count, // - uniform_substitution_costs_t const substituter, linear_gap_costs_t const gap_costs) { +__global__ __launch_bounds__(128, 4) // Target: 128 threads/block, 4 blocks/SM for occupancy + void levenshtein_on_each_cuda_thread_( // + task_type_ *tasks, size_t tasks_count, // + uniform_substitution_costs_t const substituter, linear_gap_costs_t const gap_costs) { using task_t = task_type_; using char_t = char_type_; @@ -2442,16 +2442,17 @@ struct levenshtein_distances(thread_level_kernel), // Kernel function pointer - dim3(random_blocks_per_multiprocessor * specs.streaming_multiprocessors), // Grid dimensions - dim3(random_block_size), // Block dimensions - thread_level_kernel_args, // Array of kernel argument pointers - 0, // Shared memory per block (in bytes) - executor.stream()); // CUDA stream + // Launch configuration tuned for register-heavy kernel with __launch_bounds__(128, 4) + unsigned const threads_per_block = 128; + unsigned const blocks_per_multiprocessor = 16; // Aggressive: H100 can handle high block count + unsigned const total_blocks = blocks_per_multiprocessor * specs.streaming_multiprocessors; + cudaError_t launch_error = cudaLaunchKernel( // + reinterpret_cast(thread_level_kernel), // Kernel function pointer + dim3(total_blocks), // Grid dimensions + dim3(threads_per_block), // Block dimensions + thread_level_kernel_args, // Array of kernel argument pointers + 0, // Shared memory per block (in bytes) + executor.stream()); // CUDA stream if (launch_error != cudaSuccess) { if (launch_error == cudaErrorMemoryAllocation) return {status_t::bad_alloc_k, launch_error}; else From b7bbd63f8509d0a535eb9af20ff2b81b0ccb27da Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Fri, 31 Oct 2025 01:02:28 +0000 Subject: [PATCH 9/9] Improve: Tune block size --- include/stringzillas/similarities.cuh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/stringzillas/similarities.cuh b/include/stringzillas/similarities.cuh index 5c363d7d..5a03c214 100644 --- a/include/stringzillas/similarities.cuh +++ b/include/stringzillas/similarities.cuh @@ -2259,7 +2259,7 @@ template < // sz_capability_t capability_ = sz_cap_cuda_k, // unsigned max_text_length_ = levenshtein_on_each_cuda_thread_default_text_limit_k // > -__global__ __launch_bounds__(128, 4) // Target: 128 threads/block, 4 blocks/SM for occupancy +__global__ __launch_bounds__(256, 4) // Target: 256 threads/block, 4 blocks/SM minimum void levenshtein_on_each_cuda_thread_( // task_type_ *tasks, size_t tasks_count, // uniform_substitution_costs_t const substituter, linear_gap_costs_t const gap_costs) { @@ -2442,9 +2442,9 @@ struct levenshtein_distances(thread_level_kernel), // Kernel function pointer