Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions c/parallel/include/cccl/c/unique_by_key.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ typedef struct cccl_device_unique_by_key_build_result_t
CUkernel sweep_kernel;
size_t description_bytes_per_tile;
size_t payload_bytes_per_tile;
void* runtime_policy;
} cccl_device_unique_by_key_build_result_t;

CCCL_C_API CUresult cccl_device_unique_by_key_build(
Expand Down
1 change: 1 addition & 0 deletions c/parallel/src/scan.cu
Original file line number Diff line number Diff line change
Expand Up @@ -485,6 +485,7 @@ CUresult cccl_device_scan_cleanup(cccl_device_scan_build_result_t* build_ptr)
return CUDA_ERROR_INVALID_VALUE;
}
std::unique_ptr<char[]> cubin(reinterpret_cast<char*>(build_ptr->cubin));
std::unique_ptr<char[]> policy(reinterpret_cast<char*>(build_ptr->runtime_policy));
check(cuLibraryUnload(build_ptr->library));
}
catch (const std::exception& exc)
Expand Down
181 changes: 91 additions & 90 deletions c/parallel/src/unique_by_key.cu
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#include <util/build_utils.h>
#include <util/context.h>
#include <util/indirect_arg.h>
#include <util/runtime_policy.h>
#include <util/scan_tile_state.h>
#include <util/tuning.h>
#include <util/types.h>
Expand All @@ -39,35 +40,23 @@ namespace unique_by_key
{
struct unique_by_key_runtime_tuning_policy
{
int block_size;
int items_per_thread;
cub::BlockLoadAlgorithm load_algorithm;
cub::CacheLoadModifier load_modifier;
cub::BlockScanAlgorithm scan_algorithm;
cub::detail::RuntimeUniqueByKeyAgentPolicy unique_by_key;

unique_by_key_runtime_tuning_policy UniqueByKey() const
auto UniqueByKey() const
{
return *this;
return unique_by_key;
}

using UniqueByKeyPolicyT = unique_by_key_runtime_tuning_policy;
};
using UniqueByKeyPolicyT = cub::detail::RuntimeUniqueByKeyAgentPolicy;
using MaxPolicy = unique_by_key_runtime_tuning_policy;

struct unique_by_key_tuning_t
{
int cc;
int block_size;
int items_per_thread;
template <typename F>
cudaError_t Invoke(int, F& op)
{
return op.template Invoke<unique_by_key_runtime_tuning_policy>(*this);
}
};

unique_by_key_runtime_tuning_policy get_policy(int /*cc*/, int /*key_size*/)
{
// TODO: we should update this once we figure out a way to reuse
// tuning logic from C++. Alternately, we should implement
// something better than a hardcoded default:
return {128, 4, cub::BLOCK_LOAD_DIRECT, cub::LOAD_DEFAULT, cub::BLOCK_SCAN_WARP_SCANS};
}

enum class unique_by_key_iterator_t
{
input_keys = 0,
Expand Down Expand Up @@ -163,21 +152,6 @@ std::string get_sweep_kernel_name(
offset_t);
}

template <auto* GetPolicy>
struct dynamic_unique_by_key_policy_t
{
using MaxPolicy = dynamic_unique_by_key_policy_t;

template <typename F>
cudaError_t Invoke(int device_ptx_version, F& op)
{
return op.template Invoke<unique_by_key_runtime_tuning_policy>(
GetPolicy(device_ptx_version, static_cast<int>(key_size)));
}

uint64_t key_size;
};

struct unique_by_key_kernel_source
{
cccl_device_unique_by_key_build_result_t& build;
Expand All @@ -203,13 +177,13 @@ struct dynamic_vsmem_helper_t
template <typename PolicyT, typename... Ts>
static int BlockThreads(PolicyT policy)
{
return policy.block_size;
return policy.BlockThreads();
}

template <typename PolicyT, typename... Ts>
static int ItemsPerThread(PolicyT policy)
{
return policy.items_per_thread;
return policy.ItemsPerThread();
}

template <typename PolicyT, typename... Ts>
Expand Down Expand Up @@ -243,9 +217,7 @@ CUresult cccl_device_unique_by_key_build_ex(
{
const char* name = "test";

const int cc = cc_major * 10 + cc_minor;
const auto policy = unique_by_key::get_policy(cc, static_cast<int>(input_keys_it.value_type.size));

const int cc = cc_major * 10 + cc_minor;
const auto input_keys_it_value_t = cccl_type_enum_to_name(input_keys_it.value_type.type);
const auto input_values_it_value_t = cccl_type_enum_to_name(input_values_it.value_type.type);
const auto output_keys_it_value_t = cccl_type_enum_to_name(output_keys_it.value_type.type);
Expand Down Expand Up @@ -295,30 +267,71 @@ struct __align__({3}) items_storage_t {{
struct __align__({5}) num_out_storage_t {{
char data[{4}];
}};
{6}
{7}
{8}
{9}
{10}
{11}
{12}
struct agent_policy_t {{
static constexpr int ITEMS_PER_THREAD = {7};
static constexpr int BLOCK_THREADS = {6};
static constexpr cub::BlockLoadAlgorithm LOAD_ALGORITHM = cub::BLOCK_LOAD_DIRECT;
static constexpr cub::CacheLoadModifier LOAD_MODIFIER = cub::LOAD_DEFAULT;
static constexpr cub::BlockScanAlgorithm SCAN_ALGORITHM = cub::BLOCK_SCAN_WARP_SCANS;
struct detail {{
using delay_constructor_t = cub::detail::default_delay_constructor_t<unsigned long long>;
}};
}};
)XXX";

const std::string src = std::format(
src_template,
input_keys_it.value_type.size, // 0
input_keys_it.value_type.alignment, // 1
input_values_it.value_type.size, // 2
input_values_it.value_type.alignment, // 3
output_values_it.value_type.size, // 4
output_values_it.value_type.alignment, // 5
input_keys_iterator_src, // 6
input_values_iterator_src, // 7
output_keys_iterator_src, // 8
output_values_iterator_src, // 9
output_num_selected_iterator_src, // 10
op_src); // 11

const std::string ptx_arch = std::format("-arch=compute_{}{}", cc_major, cc_minor);

std::vector<const char*> ptx_args = {
ptx_arch.c_str(), cub_path, thrust_path, libcudacxx_path, ctk_path, "-rdc=true"};

cccl::detail::extend_args_with_build_config(ptx_args, config);

std::string policy_hub_expr =
std::format("cub::detail::unique_by_key::policy_hub<{}, {}>", input_keys_it_value_t, input_values_it_value_t);

nlohmann::json runtime_policy = get_policy(
std::format("cub::detail::unique_by_key::MakeUniqueByKeyPolicyWrapper({}::MaxPolicy::ActivePolicy{{}})",
policy_hub_expr),
"#include <cub/device/dispatch/tuning/tuning_unique_by_key.cuh>\n" + src,
ptx_args);

auto delay_ctor_info = runtime_policy["DelayConstructor"];
std::string delay_ctor_params;
for (auto&& param : delay_ctor_info["params"])
{
delay_ctor_params.append(to_string(param) + ", ");
}
delay_ctor_params.erase(delay_ctor_params.size() - 2); // remove last ", "
auto delay_ctor_t =
std::format("cub::detail::{}<{}>", delay_ctor_info["name"].get<std::string>(), delay_ctor_params);

using cub::detail::RuntimeUniqueByKeyAgentPolicy;
auto [ubk_policy, ubk_policy_str] =
RuntimeUniqueByKeyAgentPolicy::from_json(runtime_policy, "UniqueByKeyPolicyT", delay_ctor_t);

std::string final_src = std::format(
R"XXX(
{0}
struct device_unique_by_key_policy {{
struct ActivePolicy {{
using UniqueByKeyPolicyT = agent_policy_t;
{1}
}};
}};
struct device_unique_by_key_vsmem_helper {{
template<typename ActivePolicyT, typename... Ts>
struct VSMemHelperDefaultFallbackPolicyT {{
using agent_policy_t = agent_policy_t;
using agent_policy_t = device_unique_by_key_policy::ActivePolicy::UniqueByKeyPolicyT;
using agent_t = cub::detail::unique_by_key::AgentUniqueByKey<agent_policy_t, Ts...>;
using static_temp_storage_t = typename cub::detail::unique_by_key::AgentUniqueByKey<agent_policy_t, Ts...>::TempStorage;
static _CCCL_DEVICE _CCCL_FORCEINLINE static_temp_storage_t& get_temp_storage(
Expand All @@ -333,29 +346,13 @@ struct device_unique_by_key_vsmem_helper {{
}}
}};
}};
{13}
)XXX";

const std::string src = std::format(
src_template,
input_keys_it.value_type.size, // 0
input_keys_it.value_type.alignment, // 1
input_values_it.value_type.size, // 2
input_values_it.value_type.alignment, // 3
output_values_it.value_type.size, // 4
output_values_it.value_type.alignment, // 5
policy.block_size, // 6
policy.items_per_thread, // 7
input_keys_iterator_src, // 8
input_values_iterator_src, // 9
output_keys_iterator_src, // 10
output_values_iterator_src, // 11
output_num_selected_iterator_src, // 12
op_src); // 13
)XXX",
src,
ubk_policy_str);

#if false // CCCL_DEBUGGING_SWITCH
fflush(stderr);
printf("\nCODE4NVRTC BEGIN\n%sCODE4NVRTC END\n", src.c_str());
printf("\nCODE4NVRTC BEGIN\n%sCODE4NVRTC END\n", final_src.c_str());
fflush(stdout);
#endif

Expand Down Expand Up @@ -388,7 +385,7 @@ struct device_unique_by_key_vsmem_helper {{

nvrtc_link_result result =
begin_linking_nvrtc_program(num_lto_args, lopts)
->add_program(nvrtc_translation_unit{src.c_str(), name})
->add_program(nvrtc_translation_unit{final_src.c_str(), name})
->add_expression({compact_init_kernel_name})
->add_expression({sweep_kernel_name})
->compile_program({args.data(), args.size()})
Expand All @@ -411,6 +408,7 @@ struct device_unique_by_key_vsmem_helper {{
build_ptr->cubin_size = result.size;
build_ptr->description_bytes_per_tile = description_bytes_per_tile;
build_ptr->payload_bytes_per_tile = payload_bytes_per_tile;
build_ptr->runtime_policy = new unique_by_key::unique_by_key_runtime_tuning_policy{ubk_policy};
}
catch (const std::exception& exc)
{
Expand Down Expand Up @@ -453,24 +451,26 @@ CUresult cccl_device_unique_by_key(
indirect_arg_t,
indirect_arg_t,
OffsetT,
unique_by_key::dynamic_unique_by_key_policy_t<&unique_by_key::get_policy>,
unique_by_key::unique_by_key_runtime_tuning_policy,
unique_by_key::unique_by_key_kernel_source,
cub::detail::CudaDriverLauncherFactory,
unique_by_key::dynamic_vsmem_helper_t,
indirect_arg_t,
indirect_arg_t>::Dispatch(d_temp_storage,
*temp_storage_bytes,
d_keys_in,
d_values_in,
d_keys_out,
d_values_out,
d_num_selected_out,
op,
num_items,
stream,
{build},
cub::detail::CudaDriverLauncherFactory{cu_device, build.cc},
{d_keys_in.value_type.size});
indirect_arg_t>::
Dispatch(
d_temp_storage,
*temp_storage_bytes,
d_keys_in,
d_values_in,
d_keys_out,
d_values_out,
d_num_selected_out,
op,
num_items,
stream,
{build},
cub::detail::CudaDriverLauncherFactory{cu_device, build.cc},
*reinterpret_cast<unique_by_key::unique_by_key_runtime_tuning_policy*>(build.runtime_policy));

error = static_cast<CUresult>(exec_status);
}
Expand Down Expand Up @@ -533,6 +533,7 @@ CUresult cccl_device_unique_by_key_cleanup(cccl_device_unique_by_key_build_resul
}

std::unique_ptr<char[]> cubin(reinterpret_cast<char*>(build_ptr->cubin));
std::unique_ptr<char[]> policy(reinterpret_cast<char*>(build_ptr->runtime_policy));
check(cuLibraryUnload(build_ptr->library));
}
catch (const std::exception& exc)
Expand Down
53 changes: 0 additions & 53 deletions c/parallel/test/test_unique_by_key.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -641,59 +641,6 @@ struct large_key_pair
}
};

C2H_TEST("DeviceSelect::UniqueByKey fails to build for large types due to no vsmem", "[device][select_unique_by_key]")
{
const int num_items = 1;

operation_t op = make_operation(
"op",
"struct large_key_pair { int a; char c[100]; };\n"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Important: instead of deleting this, let us keep it but increase the size of the dtype. Use char c[500] here instead. The reason the build succeeds after your changes is that the selected policy uses a smaller block size with less items per thread, which allows us to use a type of this size without running out of shared memory.

VSmem is still not supported, if you search for VSMemPerBlock in unique_by_key.cu you can see we return 0.

This issue is tracked here #3790

"extern \"C\" __device__ bool op(large_key_pair lhs, large_key_pair rhs) {\n"
" return lhs.a == rhs.a;\n"
"}");
const std::vector<int> a = generate<int>(num_items);
std::vector<large_key_pair> input_keys(num_items);
for (int i = 0; i < num_items; ++i)
{
input_keys[i] = large_key_pair{a[i], {}};
}

pointer_t<large_key_pair> input_keys_it(input_keys);
pointer_t<item_t> input_values_it;
pointer_t<large_key_pair> output_keys_it(num_items);
pointer_t<item_t> output_values_it;
pointer_t<int> output_num_selected_it(1);

cudaDeviceProp deviceProp;
cudaGetDeviceProperties(&deviceProp, 0);

const int cc_major = deviceProp.major;
const int cc_minor = deviceProp.minor;

const char* cub_path = TEST_CUB_PATH;
const char* thrust_path = TEST_THRUST_PATH;
const char* libcudacxx_path = TEST_LIBCUDACXX_PATH;
const char* ctk_path = TEST_CTK_PATH;

cccl_device_unique_by_key_build_result_t build;
REQUIRE(
CUDA_ERROR_UNKNOWN
== cccl_device_unique_by_key_build(
&build,
input_keys_it,
input_values_it,
output_keys_it,
output_values_it,
output_num_selected_it,
op,
cc_major,
cc_minor,
cub_path,
thrust_path,
libcudacxx_path,
ctk_path));
}

C2H_TEST("UniqueByKey works with C++ source operations", "[unique_by_key]")
{
using key_t = int32_t;
Expand Down
20 changes: 20 additions & 0 deletions cub/cub/agent/agent_unique_by_key.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,26 @@ struct AgentUniqueByKeyPolicy
};
};

#if defined(CUB_DEFINE_RUNTIME_POLICIES) || defined(CUB_ENABLE_POLICY_PTX_JSON)
namespace detail
{
// Only define this when needed.
// Because of overload woes, this depends on C++20 concepts. util_device.h checks that concepts are available when
// either runtime policies or PTX JSON information are enabled, so if they are, this is always valid. The generic
// version is always defined, and that's the only one needed for regular CUB operations.
//
// TODO: enable this unconditionally once concepts are always available
CUB_DETAIL_POLICY_WRAPPER_DEFINE(
UniqueByKeyAgentPolicy,
(GenericAgentPolicy),
(BLOCK_THREADS, BlockThreads, int),
(ITEMS_PER_THREAD, ItemsPerThread, int),
(LOAD_ALGORITHM, LoadAlgorithm, cub::BlockLoadAlgorithm),
(LOAD_MODIFIER, LoadModifier, cub::CacheLoadModifier),
(SCAN_ALGORITHM, ScanAlgorithm, cub::BlockScanAlgorithm))
} // namespace detail
#endif // defined(CUB_DEFINE_RUNTIME_POLICIES) || defined(CUB_ENABLE_POLICY_PTX_JSON)

/******************************************************************************
* Thread block abstractions
******************************************************************************/
Expand Down
Loading
Loading