diff --git a/c/parallel/include/cccl/c/unique_by_key.h b/c/parallel/include/cccl/c/unique_by_key.h index 0983b5d07cb..13fd5618e6c 100644 --- a/c/parallel/include/cccl/c/unique_by_key.h +++ b/c/parallel/include/cccl/c/unique_by_key.h @@ -32,6 +32,7 @@ typedef struct cccl_device_unique_by_key_build_result_t CUkernel sweep_kernel; size_t description_bytes_per_tile; size_t payload_bytes_per_tile; + void* runtime_policy; } cccl_device_unique_by_key_build_result_t; CCCL_C_API CUresult cccl_device_unique_by_key_build( diff --git a/c/parallel/src/scan.cu b/c/parallel/src/scan.cu index 32c79235630..33819890ecf 100644 --- a/c/parallel/src/scan.cu +++ b/c/parallel/src/scan.cu @@ -485,6 +485,7 @@ CUresult cccl_device_scan_cleanup(cccl_device_scan_build_result_t* build_ptr) return CUDA_ERROR_INVALID_VALUE; } std::unique_ptr cubin(reinterpret_cast(build_ptr->cubin)); + std::unique_ptr policy(reinterpret_cast(build_ptr->runtime_policy)); check(cuLibraryUnload(build_ptr->library)); } catch (const std::exception& exc) diff --git a/c/parallel/src/unique_by_key.cu b/c/parallel/src/unique_by_key.cu index 4a3b5527192..2fafd39d4ae 100644 --- a/c/parallel/src/unique_by_key.cu +++ b/c/parallel/src/unique_by_key.cu @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -39,35 +40,23 @@ namespace unique_by_key { struct unique_by_key_runtime_tuning_policy { - int block_size; - int items_per_thread; - cub::BlockLoadAlgorithm load_algorithm; - cub::CacheLoadModifier load_modifier; - cub::BlockScanAlgorithm scan_algorithm; + cub::detail::RuntimeUniqueByKeyAgentPolicy unique_by_key; - unique_by_key_runtime_tuning_policy UniqueByKey() const + auto UniqueByKey() const { - return *this; + return unique_by_key; } - using UniqueByKeyPolicyT = unique_by_key_runtime_tuning_policy; -}; + using UniqueByKeyPolicyT = cub::detail::RuntimeUniqueByKeyAgentPolicy; + using MaxPolicy = unique_by_key_runtime_tuning_policy; -struct unique_by_key_tuning_t -{ - int cc; - int block_size; - int items_per_thread; + template + cudaError_t Invoke(int, F& op) + { + return op.template Invoke(*this); + } }; -unique_by_key_runtime_tuning_policy get_policy(int /*cc*/, int /*key_size*/) -{ - // TODO: we should update this once we figure out a way to reuse - // tuning logic from C++. Alternately, we should implement - // something better than a hardcoded default: - return {128, 4, cub::BLOCK_LOAD_DIRECT, cub::LOAD_DEFAULT, cub::BLOCK_SCAN_WARP_SCANS}; -} - enum class unique_by_key_iterator_t { input_keys = 0, @@ -163,21 +152,6 @@ std::string get_sweep_kernel_name( offset_t); } -template -struct dynamic_unique_by_key_policy_t -{ - using MaxPolicy = dynamic_unique_by_key_policy_t; - - template - cudaError_t Invoke(int device_ptx_version, F& op) - { - return op.template Invoke( - GetPolicy(device_ptx_version, static_cast(key_size))); - } - - uint64_t key_size; -}; - struct unique_by_key_kernel_source { cccl_device_unique_by_key_build_result_t& build; @@ -203,13 +177,13 @@ struct dynamic_vsmem_helper_t template static int BlockThreads(PolicyT policy) { - return policy.block_size; + return policy.BlockThreads(); } template static int ItemsPerThread(PolicyT policy) { - return policy.items_per_thread; + return policy.ItemsPerThread(); } template @@ -243,9 +217,7 @@ CUresult cccl_device_unique_by_key_build_ex( { const char* name = "test"; - const int cc = cc_major * 10 + cc_minor; - const auto policy = unique_by_key::get_policy(cc, static_cast(input_keys_it.value_type.size)); - + const int cc = cc_major * 10 + cc_minor; const auto input_keys_it_value_t = cccl_type_enum_to_name(input_keys_it.value_type.type); const auto input_values_it_value_t = cccl_type_enum_to_name(input_values_it.value_type.type); const auto output_keys_it_value_t = cccl_type_enum_to_name(output_keys_it.value_type.type); @@ -295,30 +267,71 @@ struct __align__({3}) items_storage_t {{ struct __align__({5}) num_out_storage_t {{ char data[{4}]; }}; +{6} +{7} {8} {9} {10} {11} -{12} -struct agent_policy_t {{ - static constexpr int ITEMS_PER_THREAD = {7}; - static constexpr int BLOCK_THREADS = {6}; - static constexpr cub::BlockLoadAlgorithm LOAD_ALGORITHM = cub::BLOCK_LOAD_DIRECT; - static constexpr cub::CacheLoadModifier LOAD_MODIFIER = cub::LOAD_DEFAULT; - static constexpr cub::BlockScanAlgorithm SCAN_ALGORITHM = cub::BLOCK_SCAN_WARP_SCANS; - struct detail {{ - using delay_constructor_t = cub::detail::default_delay_constructor_t; - }}; -}}; +)XXX"; + + const std::string src = std::format( + src_template, + input_keys_it.value_type.size, // 0 + input_keys_it.value_type.alignment, // 1 + input_values_it.value_type.size, // 2 + input_values_it.value_type.alignment, // 3 + output_values_it.value_type.size, // 4 + output_values_it.value_type.alignment, // 5 + input_keys_iterator_src, // 6 + input_values_iterator_src, // 7 + output_keys_iterator_src, // 8 + output_values_iterator_src, // 9 + output_num_selected_iterator_src, // 10 + op_src); // 11 + + const std::string ptx_arch = std::format("-arch=compute_{}{}", cc_major, cc_minor); + + std::vector ptx_args = { + ptx_arch.c_str(), cub_path, thrust_path, libcudacxx_path, ctk_path, "-rdc=true"}; + + cccl::detail::extend_args_with_build_config(ptx_args, config); + + std::string policy_hub_expr = + std::format("cub::detail::unique_by_key::policy_hub<{}, {}>", input_keys_it_value_t, input_values_it_value_t); + + nlohmann::json runtime_policy = get_policy( + std::format("cub::detail::unique_by_key::MakeUniqueByKeyPolicyWrapper({}::MaxPolicy::ActivePolicy{{}})", + policy_hub_expr), + "#include \n" + src, + ptx_args); + + auto delay_ctor_info = runtime_policy["DelayConstructor"]; + std::string delay_ctor_params; + for (auto&& param : delay_ctor_info["params"]) + { + delay_ctor_params.append(to_string(param) + ", "); + } + delay_ctor_params.erase(delay_ctor_params.size() - 2); // remove last ", " + auto delay_ctor_t = + std::format("cub::detail::{}<{}>", delay_ctor_info["name"].get(), delay_ctor_params); + + using cub::detail::RuntimeUniqueByKeyAgentPolicy; + auto [ubk_policy, ubk_policy_str] = + RuntimeUniqueByKeyAgentPolicy::from_json(runtime_policy, "UniqueByKeyPolicyT", delay_ctor_t); + + std::string final_src = std::format( + R"XXX( +{0} struct device_unique_by_key_policy {{ struct ActivePolicy {{ - using UniqueByKeyPolicyT = agent_policy_t; + {1} }}; }}; struct device_unique_by_key_vsmem_helper {{ template struct VSMemHelperDefaultFallbackPolicyT {{ - using agent_policy_t = agent_policy_t; + using agent_policy_t = device_unique_by_key_policy::ActivePolicy::UniqueByKeyPolicyT; using agent_t = cub::detail::unique_by_key::AgentUniqueByKey; using static_temp_storage_t = typename cub::detail::unique_by_key::AgentUniqueByKey::TempStorage; static _CCCL_DEVICE _CCCL_FORCEINLINE static_temp_storage_t& get_temp_storage( @@ -333,29 +346,13 @@ struct device_unique_by_key_vsmem_helper {{ }} }}; }}; -{13} -)XXX"; - - const std::string src = std::format( - src_template, - input_keys_it.value_type.size, // 0 - input_keys_it.value_type.alignment, // 1 - input_values_it.value_type.size, // 2 - input_values_it.value_type.alignment, // 3 - output_values_it.value_type.size, // 4 - output_values_it.value_type.alignment, // 5 - policy.block_size, // 6 - policy.items_per_thread, // 7 - input_keys_iterator_src, // 8 - input_values_iterator_src, // 9 - output_keys_iterator_src, // 10 - output_values_iterator_src, // 11 - output_num_selected_iterator_src, // 12 - op_src); // 13 +)XXX", + src, + ubk_policy_str); #if false // CCCL_DEBUGGING_SWITCH fflush(stderr); - printf("\nCODE4NVRTC BEGIN\n%sCODE4NVRTC END\n", src.c_str()); + printf("\nCODE4NVRTC BEGIN\n%sCODE4NVRTC END\n", final_src.c_str()); fflush(stdout); #endif @@ -388,7 +385,7 @@ struct device_unique_by_key_vsmem_helper {{ nvrtc_link_result result = begin_linking_nvrtc_program(num_lto_args, lopts) - ->add_program(nvrtc_translation_unit{src.c_str(), name}) + ->add_program(nvrtc_translation_unit{final_src.c_str(), name}) ->add_expression({compact_init_kernel_name}) ->add_expression({sweep_kernel_name}) ->compile_program({args.data(), args.size()}) @@ -411,6 +408,7 @@ struct device_unique_by_key_vsmem_helper {{ build_ptr->cubin_size = result.size; build_ptr->description_bytes_per_tile = description_bytes_per_tile; build_ptr->payload_bytes_per_tile = payload_bytes_per_tile; + build_ptr->runtime_policy = new unique_by_key::unique_by_key_runtime_tuning_policy{ubk_policy}; } catch (const std::exception& exc) { @@ -453,24 +451,26 @@ CUresult cccl_device_unique_by_key( indirect_arg_t, indirect_arg_t, OffsetT, - unique_by_key::dynamic_unique_by_key_policy_t<&unique_by_key::get_policy>, + unique_by_key::unique_by_key_runtime_tuning_policy, unique_by_key::unique_by_key_kernel_source, cub::detail::CudaDriverLauncherFactory, unique_by_key::dynamic_vsmem_helper_t, indirect_arg_t, - indirect_arg_t>::Dispatch(d_temp_storage, - *temp_storage_bytes, - d_keys_in, - d_values_in, - d_keys_out, - d_values_out, - d_num_selected_out, - op, - num_items, - stream, - {build}, - cub::detail::CudaDriverLauncherFactory{cu_device, build.cc}, - {d_keys_in.value_type.size}); + indirect_arg_t>:: + Dispatch( + d_temp_storage, + *temp_storage_bytes, + d_keys_in, + d_values_in, + d_keys_out, + d_values_out, + d_num_selected_out, + op, + num_items, + stream, + {build}, + cub::detail::CudaDriverLauncherFactory{cu_device, build.cc}, + *reinterpret_cast(build.runtime_policy)); error = static_cast(exec_status); } @@ -533,6 +533,7 @@ CUresult cccl_device_unique_by_key_cleanup(cccl_device_unique_by_key_build_resul } std::unique_ptr cubin(reinterpret_cast(build_ptr->cubin)); + std::unique_ptr policy(reinterpret_cast(build_ptr->runtime_policy)); check(cuLibraryUnload(build_ptr->library)); } catch (const std::exception& exc) diff --git a/c/parallel/test/test_unique_by_key.cpp b/c/parallel/test/test_unique_by_key.cpp index b9fab0f73a7..499315df843 100644 --- a/c/parallel/test/test_unique_by_key.cpp +++ b/c/parallel/test/test_unique_by_key.cpp @@ -641,59 +641,6 @@ struct large_key_pair } }; -C2H_TEST("DeviceSelect::UniqueByKey fails to build for large types due to no vsmem", "[device][select_unique_by_key]") -{ - const int num_items = 1; - - operation_t op = make_operation( - "op", - "struct large_key_pair { int a; char c[100]; };\n" - "extern \"C\" __device__ bool op(large_key_pair lhs, large_key_pair rhs) {\n" - " return lhs.a == rhs.a;\n" - "}"); - const std::vector a = generate(num_items); - std::vector input_keys(num_items); - for (int i = 0; i < num_items; ++i) - { - input_keys[i] = large_key_pair{a[i], {}}; - } - - pointer_t input_keys_it(input_keys); - pointer_t input_values_it; - pointer_t output_keys_it(num_items); - pointer_t output_values_it; - pointer_t output_num_selected_it(1); - - cudaDeviceProp deviceProp; - cudaGetDeviceProperties(&deviceProp, 0); - - const int cc_major = deviceProp.major; - const int cc_minor = deviceProp.minor; - - const char* cub_path = TEST_CUB_PATH; - const char* thrust_path = TEST_THRUST_PATH; - const char* libcudacxx_path = TEST_LIBCUDACXX_PATH; - const char* ctk_path = TEST_CTK_PATH; - - cccl_device_unique_by_key_build_result_t build; - REQUIRE( - CUDA_ERROR_UNKNOWN - == cccl_device_unique_by_key_build( - &build, - input_keys_it, - input_values_it, - output_keys_it, - output_values_it, - output_num_selected_it, - op, - cc_major, - cc_minor, - cub_path, - thrust_path, - libcudacxx_path, - ctk_path)); -} - C2H_TEST("UniqueByKey works with C++ source operations", "[unique_by_key]") { using key_t = int32_t; diff --git a/cub/cub/agent/agent_unique_by_key.cuh b/cub/cub/agent/agent_unique_by_key.cuh index 7c9ca7ec4b7..e2c0bcf65e8 100644 --- a/cub/cub/agent/agent_unique_by_key.cuh +++ b/cub/cub/agent/agent_unique_by_key.cuh @@ -85,6 +85,26 @@ struct AgentUniqueByKeyPolicy }; }; +#if defined(CUB_DEFINE_RUNTIME_POLICIES) || defined(CUB_ENABLE_POLICY_PTX_JSON) +namespace detail +{ +// Only define this when needed. +// Because of overload woes, this depends on C++20 concepts. util_device.h checks that concepts are available when +// either runtime policies or PTX JSON information are enabled, so if they are, this is always valid. The generic +// version is always defined, and that's the only one needed for regular CUB operations. +// +// TODO: enable this unconditionally once concepts are always available +CUB_DETAIL_POLICY_WRAPPER_DEFINE( + UniqueByKeyAgentPolicy, + (GenericAgentPolicy), + (BLOCK_THREADS, BlockThreads, int), + (ITEMS_PER_THREAD, ItemsPerThread, int), + (LOAD_ALGORITHM, LoadAlgorithm, cub::BlockLoadAlgorithm), + (LOAD_MODIFIER, LoadModifier, cub::CacheLoadModifier), + (SCAN_ALGORITHM, ScanAlgorithm, cub::BlockScanAlgorithm)) +} // namespace detail +#endif // defined(CUB_DEFINE_RUNTIME_POLICIES) || defined(CUB_ENABLE_POLICY_PTX_JSON) + /****************************************************************************** * Thread block abstractions ******************************************************************************/ diff --git a/cub/cub/device/dispatch/tuning/tuning_unique_by_key.cuh b/cub/cub/device/dispatch/tuning/tuning_unique_by_key.cuh index 9090a679973..4a8b67b9c79 100644 --- a/cub/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_unique_by_key.cuh @@ -788,6 +788,16 @@ struct UniqueByKeyPolicyWrapper() = UniqueByKey().EncodedPolicy(), + key<"DelayConstructor">() = + StaticPolicyT::UniqueByKeyPolicyT::detail::delay_constructor_t::EncodedConstructor()>(); + } +#endif }; template