rapidsai · RAMitchell · Aug 14, 2025 · Aug 13, 2025 · Aug 13, 2025 · Aug 14, 2025
diff --git a/cpp/include/legate_dataframe/csv.hpp b/cpp/include/legate_dataframe/csv.hpp
@@ -38,6 +38,8 @@ class CSVWrite : public Task<CSVWrite, OpCode::CSVWrite> {
                                                 .with_has_allocations(true)
                                                 .with_elide_device_ctx_sync(true)
                                                 .with_has_side_effect(true);
+  static constexpr auto CPU_VARIANT_OPTIONS =
+    legate::VariantOptions{}.with_has_allocations(true).with_has_side_effect(true);
   static void cpu_variant(legate::TaskContext context);
   static void gpu_variant(legate::TaskContext context);
 };

diff --git a/cpp/include/legate_dataframe/filling.hpp b/cpp/include/legate_dataframe/filling.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024, NVIDIA CORPORATION.
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,9 +17,20 @@
 #pragma once
 
 #include <legate_dataframe/core/column.hpp>
+#include <legate_dataframe/core/library.hpp>
 
 namespace legate::dataframe {
 
+namespace task {
+
+class SequenceTask : public Task<SequenceTask, OpCode::Sequence> {  // task type tied to OpCode::Sequence
+ public:
+  static void cpu_variant(legate::TaskContext context);  // defined in src/filling.cpp
+  static void gpu_variant(legate::TaskContext context);  // defined in src/filling.cu
+};
+
+}  // namespace task
+
 /**
  * @brief Fills a column with a sequence of int64 values
  *
@@ -36,9 +47,6 @@ namespace legate::dataframe {
  * Notice, this is primarily for C++ testing and examples for now. TODO: implement
  * all of the cudf features <https://github.com/rapidsai/legate-dataframe/issues/74>
  *
- * @throws cudf::logic_error if @p init is not numeric.
- * @throws cudf::logic_error if @p size is < 0.
- *
  * @param size Size of the output column
  * @param init First value in the sequence
  * @return The result column (int64) containing the generated sequence

diff --git a/cpp/include/legate_dataframe/groupby_aggregation.hpp b/cpp/include/legate_dataframe/groupby_aggregation.hpp
@@ -19,11 +19,23 @@
 #include <string>
 #include <vector>
 
-#include <cudf/aggregation.hpp>
-
+#include <legate_dataframe/core/library.hpp>
 #include <legate_dataframe/core/table.hpp>
 
 namespace legate::dataframe {
+namespace task {
+class GroupByAggregationTask : public Task<GroupByAggregationTask, OpCode::GroupByAggregation> {
+ public:
+  static constexpr auto GPU_VARIANT_OPTIONS = legate::VariantOptions{}  // options for the GPU variant
+                                                .with_has_allocations(true)
+                                                .with_concurrent(true)
+                                                .with_elide_device_ctx_sync(true);
+  static constexpr auto CPU_VARIANT_OPTIONS =  // same flags as above, minus elide_device_ctx_sync
+    legate::VariantOptions{}.with_has_allocations(true).with_concurrent(true);
+  static void cpu_variant(legate::TaskContext context);  // declaration only; body lives elsewhere
+  static void gpu_variant(legate::TaskContext context);  // declaration only; body lives elsewhere
+};
+}  // namespace task
 
 /**
  * @brief Perform a groupby and aggregation in a single operation.

diff --git a/cpp/include/legate_dataframe/join.hpp b/cpp/include/legate_dataframe/join.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,13 +20,39 @@
 
 #include <cudf/types.hpp>  // cudf::null_equality
 
+#include <legate_dataframe/core/library.hpp>
 #include <legate_dataframe/core/table.hpp>
 
 namespace legate::dataframe {
-
 enum class JoinType : int32_t { INNER = 0, LEFT, FULL };
 enum class BroadcastInput : int32_t { AUTO = 0, LEFT, RIGHT };
 
+namespace task {
+template <bool needs_communication>  // selects the opcode and the concurrent flag below
+class JoinTask : public Task<JoinTask<needs_communication>,
+                             needs_communication ? OpCode::JoinConcurrent : OpCode::Join> {  // JoinConcurrent iff true
+ public:
+  static constexpr auto GPU_VARIANT_OPTIONS = legate::VariantOptions{}  // options for the GPU variant
+                                                .with_has_allocations(true)
+                                                .with_concurrent(needs_communication)  // concurrent only if true
+                                                .with_elide_device_ctx_sync(true);
+  static constexpr auto CPU_VARIANT_OPTIONS =  // same flags as above, minus elide_device_ctx_sync
+    legate::VariantOptions{}.with_has_allocations(true).with_concurrent(needs_communication);
+
+  static void cpu_variant(legate::TaskContext context);  // declaration only; body lives elsewhere
+  static void gpu_variant(legate::TaskContext context);  // declaration only; body lives elsewhere
+};
+/**
+ * @brief Helper function to determine if we need to repartition the tables
+ *
+ * If legate broadcasts the left- or right-hand side table, we might not need to
+ * repartition them. This depends on the join type and which table is broadcast.
+ */
+bool is_repartition_not_needed(const TaskContext& ctx,
+                               JoinType join_type,    // INNER, LEFT or FULL
+                               bool lhs_broadcasted,  // whether the left-hand side table was broadcast
+                               bool rhs_broadcasted); // whether the right-hand side table was broadcast
+}  // namespace task
 /**
  * @brief Perform a join between the specified tables.
  *

diff --git a/cpp/src/filling.cpp b/cpp/src/filling.cpp
@@ -16,9 +16,6 @@
 
 #include <legate.h>
 
-#include <cudf/filling.hpp>
-#include <cudf/scalar/scalar.hpp>
-
 #include <legate_dataframe/core/library.hpp>
 #include <legate_dataframe/core/task_argument.hpp>
 #include <legate_dataframe/core/task_context.hpp>
@@ -27,55 +24,29 @@
 namespace legate::dataframe {
 namespace task {
 
-class SequenceTask : public Task<SequenceTask, OpCode::Sequence> {
- public:
-  static void cpu_variant(legate::TaskContext context)
-  {
-    TaskContext ctx{context};
-    auto global_size = argument::get_next_scalar<size_t>(ctx);
-    auto global_init = argument::get_next_scalar<int64_t>(ctx);
-    auto output      = argument::get_next_output<PhysicalColumn>(ctx);
-    argument::get_parallel_launch_task(ctx);
-    auto [local_start, local_size] = evenly_partition_work(global_size, ctx.rank, ctx.nranks);
-    auto local_init                = global_init + local_start;
-
-    if (local_size == 0) {
-      output.bind_empty_data();
-      return;
-    }
-
-    arrow::Int64Builder long_builder = arrow::Int64Builder();
-    auto status                      = long_builder.Reserve(local_size);
-    for (size_t i = 0; i < local_size; i++) {
-      long_builder.UnsafeAppend(local_init + i);
-    }
-    auto local_array = ARROW_RESULT(long_builder.Finish());
-    output.move_into(std::move(local_array));
+/*static*/ void SequenceTask::cpu_variant(legate::TaskContext context)  // CPU: builds this rank's slice
+{
+  TaskContext ctx{context};
+  auto global_size = argument::get_next_scalar<size_t>(ctx);   // size that is split across ranks below
+  auto global_init = argument::get_next_scalar<int64_t>(ctx);  // value the local offset is added to
+  auto output      = argument::get_next_output<PhysicalColumn>(ctx);
+  argument::get_parallel_launch_task(ctx);  // NOTE(review): presumably consumes the launch argument - confirm
+  auto [local_start, local_size] = evenly_partition_work(global_size, ctx.rank, ctx.nranks);
+  auto local_init                = global_init + local_start;  // first value appended by the loop below
+
+  if (local_size == 0) {  // no elements for this rank: bind an empty output and stop
+    output.bind_empty_data();
+    return;
   }
 
-  static void gpu_variant(legate::TaskContext context)
-  {
-    TaskContext ctx{context};
-    auto global_size = argument::get_next_scalar<size_t>(ctx);
-    auto global_init = argument::get_next_scalar<int64_t>(ctx);
-    auto output      = argument::get_next_output<PhysicalColumn>(ctx);
-    argument::get_parallel_launch_task(ctx);
-
-    auto [local_start, local_size] = evenly_partition_work(global_size, ctx.rank, ctx.nranks);
-    auto local_init                = global_init + local_start;
-
-    if (local_size == 0) {
-      output.bind_empty_data();
-      return;
-    }
-
-    cudf::numeric_scalar<int64_t> cudf_init(local_init, true, ctx.stream(), ctx.mr());
-    auto res = cudf::sequence(local_size, cudf_init, ctx.stream(), ctx.mr());
-
-    output.move_into(std::move(res));
+  arrow::Int64Builder long_builder = arrow::Int64Builder();
+  auto status                      = long_builder.Reserve(local_size);  // NOTE(review): status is never checked
+  for (size_t i = 0; i < local_size; i++) {  // appends local_init, local_init + 1, ...
+    long_builder.UnsafeAppend(local_init + i);  // relies on the Reserve above having succeeded
   }
-};
-
+  auto local_array = ARROW_RESULT(long_builder.Finish());  // ARROW_RESULT presumably unwraps the Result - confirm
+  output.move_into(std::move(local_array));  // hand the finished array to the output column
+}
 }  // namespace task
 
 LogicalColumn sequence(size_t size, int64_t init)

diff --git a/cpp/src/filling.cu b/cpp/src/filling.cu
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <legate.h>
+
+#include <cudf/filling.hpp>
+#include <cudf/scalar/scalar.hpp>
+
+#include <legate_dataframe/core/task_argument.hpp>
+#include <legate_dataframe/core/task_context.hpp>
+#include <legate_dataframe/filling.hpp>
+
+namespace legate::dataframe::task {
+/*static*/ void SequenceTask::gpu_variant(legate::TaskContext context)  // GPU: builds this rank's slice via cudf
+{
+  TaskContext ctx{context};
+  auto global_size = argument::get_next_scalar<size_t>(ctx);   // size that is split across ranks below
+  auto global_init = argument::get_next_scalar<int64_t>(ctx);  // value the local offset is added to
+  auto output      = argument::get_next_output<PhysicalColumn>(ctx);
+  argument::get_parallel_launch_task(ctx);  // NOTE(review): presumably consumes the launch argument - confirm
+
+  auto [local_start, local_size] = evenly_partition_work(global_size, ctx.rank, ctx.nranks);
+  auto local_init                = global_init + local_start;  // initial value passed to cudf::sequence
+
+  if (local_size == 0) {  // no elements for this rank: bind an empty output and stop
+    output.bind_empty_data();
+    return;
+  }
+
+  cudf::numeric_scalar<int64_t> cudf_init(local_init, true, ctx.stream(), ctx.mr());  // true: scalar is valid
+  auto res = cudf::sequence(local_size, cudf_init, ctx.stream(), ctx.mr());  // init-only overload: step of 1
+
+  output.move_into(std::move(res));  // hand the cudf column to the output column
+}
+}  // namespace legate::dataframe::task