Skip to content

Commit 6f808df

Browse files
authored
Refactor cuda code into .cu files (#97)
1 parent ee18da6 commit 6f808df

File tree

14 files changed

+553
-368
lines changed

14 files changed

+553
-368
lines changed

cpp/include/legate_dataframe/binaryop.hpp

Lines changed: 35 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -17,13 +17,43 @@
1717
#pragma once
1818

1919
#include <legate.h>
20-
21-
#include <cudf/binaryop.hpp>
22-
2320
#include <legate_dataframe/core/column.hpp>
21+
#include <legate_dataframe/core/library.hpp>
2422

2523
namespace legate::dataframe {
2624

25+
namespace task {
26+
const std::set<std::string> cudf_supported_binary_ops = {"add",
27+
"divide",
28+
"multiply",
29+
"power",
30+
"subtract",
31+
"bit_wise_and",
32+
"bit_wise_or",
33+
"bit_wise_xor",
34+
"shift_left",
35+
"shift_right",
36+
"logb",
37+
"atan2",
38+
"equal",
39+
"greater",
40+
"greater_equal",
41+
"less",
42+
"less_equal",
43+
"not_equal",
44+
// logical operators:
45+
"and",
46+
"or",
47+
"and_kleene",
48+
"or_kleene"};
49+
50+
class BinaryOpColColTask : public Task<BinaryOpColColTask, OpCode::BinaryOpColCol> {
51+
public:
52+
static void cpu_variant(legate::TaskContext context);
53+
static void gpu_variant(legate::TaskContext context);
54+
};
55+
56+
} // namespace task
2757
/**
2858
* @brief Performs a binary operation between two columns.
2959
*
@@ -43,15 +73,11 @@ namespace legate::dataframe {
4373
* @param output_type The desired data type of the output column
4474
* @return Output column of `output_type` type containing the result of
4575
* the binary operation
46-
* @throw cudf::logic_error if @p lhs and @p rhs are different sizes
47-
* @throw cudf::logic_error if @p output_type dtype isn't boolean for comparison and logical
48-
* operations.
49-
* @throw cudf::logic_error if @p output_type dtype isn't fixed-width
50-
* @throw cudf::data_type_error if the operation is not supported for the types of @p lhs and @p rhs
76+
* @throw std::invalid_argument if operator not supported
5177
*/
5278
LogicalColumn binary_operation(const LogicalColumn& lhs,
5379
const LogicalColumn& rhs,
5480
std::string op,
55-
cudf::data_type output_type);
81+
std::shared_ptr<arrow::DataType> output_type);
5682

5783
} // namespace legate::dataframe

cpp/include/legate_dataframe/core/column.hpp

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,31 @@ class LogicalColumn {
8282
}
8383
}
8484

85+
/**
86+
* @brief Create a column with a legate array as the data
87+
*
88+
* @param array The logical array (zero copy)
89+
* @param data_type The arrow data type of the column. If nullptr, the arrow data type is
90+
* derived from the data type of `array`.
91+
* @param scalar Whether to consider this column scalar. WARNING: currently
92+
* it is the callers responsibility to ensure the array is length 1 as this
93+
* check could be blocking.
94+
*/
95+
LogicalColumn(legate::LogicalArray array,
96+
std::shared_ptr<arrow::DataType> data_type,
97+
bool scalar = false)
98+
: array_{std::move(array)}, scalar_{scalar}
99+
{
100+
if (array_->dim() != 1) { throw std::invalid_argument("array must be 1-D"); }
101+
// Note: Checking the volume could be blocking, so assume that this is fine.
102+
assert(!scalar || array_->unbound() || array_->volume() == 1);
103+
104+
if (!data_type) {
105+
cudf_type_ = cudf::data_type{to_cudf_type_id(array_->type().code())};
106+
} else {
107+
cudf_type_ = to_cudf_type(*data_type);
108+
}
109+
}
85110
/*
86111
* Convenience constructor for tests
87112
*/
@@ -259,6 +284,32 @@ class LogicalColumn {
259284
}
260285
}
261286

287+
/**
288+
* @brief Create a new unbounded column from dtype and nullable
289+
*
290+
* @param dtype The data type of the new column
291+
* @param nullable The nullable of the new column
292+
* @param scalar Whether the result is a scalar column.
293+
* @return The new unbounded column
294+
*/
295+
static LogicalColumn empty_like(std::shared_ptr<arrow::DataType> dtype,
296+
bool nullable,
297+
bool scalar = false,
298+
std::optional<size_t> size = std::nullopt)
299+
{
300+
if (!size.has_value()) {
301+
return LogicalColumn(
302+
legate::Runtime::get_runtime()->create_array(to_legate_type(*dtype), 1, nullable),
303+
dtype,
304+
scalar);
305+
} else {
306+
return LogicalColumn(legate::Runtime::get_runtime()->create_array(
307+
Shape{size.value()}, to_legate_type(*dtype), nullable),
308+
dtype,
309+
scalar);
310+
}
311+
}
312+
262313
public:
263314
LogicalColumn(const LogicalColumn& other) = default;
264315
LogicalColumn& operator=(const LogicalColumn& other) = default;

cpp/include/legate_dataframe/csv.hpp

Lines changed: 21 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,13 +18,30 @@
1818

1919
#include <string>
2020

21-
#include <cudf/types.hpp>
22-
23-
#include <legate_dataframe/core/column.hpp>
21+
#include <legate.h>
22+
#include <legate_dataframe/core/library.hpp>
2423
#include <legate_dataframe/core/table.hpp>
2524

2625
namespace legate::dataframe {
2726

27+
namespace task {
28+
29+
class CSVRead : public Task<CSVRead, OpCode::CSVRead> {
30+
public:
31+
static void cpu_variant(legate::TaskContext context);
32+
static void gpu_variant(legate::TaskContext context);
33+
};
34+
35+
class CSVWrite : public Task<CSVWrite, OpCode::CSVWrite> {
36+
public:
37+
static constexpr auto GPU_VARIANT_OPTIONS = legate::VariantOptions{}
38+
.with_has_allocations(true)
39+
.with_elide_device_ctx_sync(true)
40+
.with_has_side_effect(true);
41+
static void cpu_variant(legate::TaskContext context);
42+
static void gpu_variant(legate::TaskContext context);
43+
};
44+
} // namespace task
2845
/**
2946
* @brief Write table to csv files.
3047
*
@@ -69,7 +86,7 @@ void csv_write(LogicalTable& tbl, const std::string& dirpath, char delimiter = '
6986
* @return The read LogicalTable.
7087
*/
7188
LogicalTable csv_read(const std::vector<std::string>& files,
72-
const std::vector<cudf::data_type>& dtypes,
89+
const std::vector<std::shared_ptr<arrow::DataType>>& dtypes,
7390
bool na_filter = true,
7491
char delimiter = ',',
7592
const std::optional<std::vector<std::string>>& names = std::nullopt,

cpp/src/binaryop.cpp

Lines changed: 47 additions & 131 deletions
Original file line numberDiff line numberDiff line change
@@ -14,12 +14,6 @@
1414
* limitations under the License.
1515
*/
1616

17-
#include <legate.h>
18-
19-
#include <cudf/binaryop.hpp>
20-
#include <cudf/types.hpp>
21-
#include <cudf/unary.hpp>
22-
2317
#include <arrow/compute/api.h>
2418
#include <legate_dataframe/binaryop.hpp>
2519
#include <legate_dataframe/core/column.hpp>
@@ -30,140 +24,61 @@
3024

3125
namespace legate::dataframe::task {
3226

33-
cudf::binary_operator arrow_to_cudf_binary_op(std::string op, legate::Type output_type)
27+
/*static*/ void BinaryOpColColTask::cpu_variant(legate::TaskContext context)
3428
{
35-
// Arrow binary operators taken from the below list,
36-
// where an equivalent cudf binary operator exists.
37-
// https://arrow.apache.org/docs/cpp/compute.html#element-wise-scalar-functions
38-
// https://docs.rapids.ai/api/libcudf/stable/group__transformation__binaryops
39-
std::unordered_map<std::string, cudf::binary_operator> arrow_to_cudf_ops = {
40-
{"add", cudf::binary_operator::ADD},
41-
{"divide", cudf::binary_operator::DIV},
42-
{"multiply", cudf::binary_operator::MUL},
43-
{"power", cudf::binary_operator::POW},
44-
{"subtract", cudf::binary_operator::SUB},
45-
{"bit_wise_and", cudf::binary_operator::BITWISE_AND},
46-
{"bit_wise_or", cudf::binary_operator::BITWISE_OR},
47-
{"bit_wise_xor", cudf::binary_operator::BITWISE_XOR},
48-
{"shift_left", cudf::binary_operator::SHIFT_LEFT},
49-
{"shift_right", cudf::binary_operator::SHIFT_RIGHT},
50-
{"logb", cudf::binary_operator::LOG_BASE},
51-
{"atan2", cudf::binary_operator::ATAN2},
52-
{"equal", cudf::binary_operator::EQUAL},
53-
{"greater", cudf::binary_operator::GREATER},
54-
{"greater_equal", cudf::binary_operator::GREATER_EQUAL},
55-
{"less", cudf::binary_operator::LESS},
56-
{"less_equal", cudf::binary_operator::LESS_EQUAL},
57-
{"not_equal", cudf::binary_operator::NOT_EQUAL},
58-
// logical operators:
59-
{"and", cudf::binary_operator::LOGICAL_AND},
60-
{"or", cudf::binary_operator::LOGICAL_OR},
61-
{"and_kleene", cudf::binary_operator::NULL_LOGICAL_AND},
62-
{"or_kleene", cudf::binary_operator::NULL_LOGICAL_OR},
63-
};
64-
65-
// Cudf has a special case for powers with integers
66-
// https://github.com/rapidsai/cudf/issues/10178#issuecomment-3004143727
67-
if (op == "power" && output_type.to_string().find("int") != std::string::npos) {
68-
return cudf::binary_operator::INT_POW;
29+
TaskContext ctx{context};
30+
auto op = argument::get_next_scalar<std::string>(ctx);
31+
const auto lhs = argument::get_next_input<PhysicalColumn>(ctx);
32+
const auto rhs = argument::get_next_input<PhysicalColumn>(ctx);
33+
auto output = argument::get_next_output<PhysicalColumn>(ctx);
34+
35+
std::vector<arrow::Datum> args(2);
36+
if (lhs.num_rows() == 1) {
37+
auto scalar = ARROW_RESULT(lhs.arrow_array_view()->GetScalar(0));
38+
args[0] = scalar;
39+
} else {
40+
args[0] = lhs.arrow_array_view();
41+
}
42+
if (rhs.num_rows() == 1) {
43+
auto scalar = ARROW_RESULT(rhs.arrow_array_view()->GetScalar(0));
44+
args[1] = scalar;
45+
} else {
46+
args[1] = rhs.arrow_array_view();
6947
}
7048

71-
if (arrow_to_cudf_ops.find(op) != arrow_to_cudf_ops.end()) { return arrow_to_cudf_ops[op]; }
72-
throw std::invalid_argument("Could not find cudf binary operator matching: " + op);
73-
return cudf::binary_operator::INVALID_BINARY;
74-
}
75-
76-
class BinaryOpColColTask : public Task<BinaryOpColColTask, OpCode::BinaryOpColCol> {
77-
public:
78-
static void cpu_variant(legate::TaskContext context)
79-
{
80-
TaskContext ctx{context};
81-
auto op = argument::get_next_scalar<std::string>(ctx);
82-
const auto lhs = argument::get_next_input<PhysicalColumn>(ctx);
83-
const auto rhs = argument::get_next_input<PhysicalColumn>(ctx);
84-
auto output = argument::get_next_output<PhysicalColumn>(ctx);
85-
86-
std::vector<arrow::Datum> args(2);
87-
if (lhs.num_rows() == 1) {
88-
auto scalar = ARROW_RESULT(lhs.arrow_array_view()->GetScalar(0));
89-
args[0] = scalar;
90-
} else {
91-
args[0] = lhs.arrow_array_view();
92-
}
93-
if (rhs.num_rows() == 1) {
94-
auto scalar = ARROW_RESULT(rhs.arrow_array_view()->GetScalar(0));
95-
args[1] = scalar;
96-
} else {
97-
args[1] = rhs.arrow_array_view();
98-
}
99-
100-
if (output.cudf_type().id() == cudf::type_id::BOOL8 &&
101-
(op == "and" || op == "or" || op == "and_kleene" || op == "or_kleene")) {
102-
// arrow doesn't seem to cast for the user for logical ops.
103-
args[0] = ARROW_RESULT(arrow::compute::Cast(args[0], arrow::boolean()));
104-
args[1] = ARROW_RESULT(arrow::compute::Cast(args[1], arrow::boolean()));
105-
}
106-
107-
// Result may be scalar or array
108-
auto datum_result = ARROW_RESULT(arrow::compute::CallFunction(op, args));
49+
if (output.cudf_type().id() == cudf::type_id::BOOL8 &&
50+
(op == "and" || op == "or" || op == "and_kleene" || op == "or_kleene")) {
51+
// arrow doesn't seem to cast for the user for logical ops.
52+
args[0] = ARROW_RESULT(arrow::compute::Cast(args[0], arrow::boolean()));
53+
args[1] = ARROW_RESULT(arrow::compute::Cast(args[1], arrow::boolean()));
54+
}
10955

110-
// Coerce the output type if necessary
111-
auto arrow_result_type = to_arrow_type(output.cudf_type().id());
112-
if (datum_result.type() != arrow_result_type) {
113-
auto coerced_result = ARROW_RESULT(arrow::compute::Cast(
114-
datum_result, arrow_result_type, arrow::compute::CastOptions::Unsafe()));
115-
datum_result = std::move(coerced_result);
116-
}
56+
// Result may be scalar or array
57+
auto datum_result = ARROW_RESULT(arrow::compute::CallFunction(op, args));
11758

118-
if (datum_result.is_scalar()) {
119-
auto as_array = ARROW_RESULT(arrow::MakeArrayFromScalar(*datum_result.scalar(), 1));
120-
if (get_prefer_eager_allocations()) {
121-
output.copy_into(std::move(as_array));
122-
} else {
123-
output.move_into(std::move(as_array));
124-
}
125-
} else {
126-
if (get_prefer_eager_allocations()) {
127-
output.copy_into(std::move(datum_result.make_array()));
128-
} else {
129-
output.move_into(std::move(datum_result.make_array()));
130-
}
131-
}
59+
// Coerce the output type if necessary
60+
auto arrow_result_type = to_arrow_type(output.cudf_type().id());
61+
if (datum_result.type() != arrow_result_type) {
62+
auto coerced_result = ARROW_RESULT(
63+
arrow::compute::Cast(datum_result, arrow_result_type, arrow::compute::CastOptions::Unsafe()));
64+
datum_result = std::move(coerced_result);
13265
}
13366

134-
static void gpu_variant(legate::TaskContext context)
135-
{
136-
TaskContext ctx{context};
137-
auto arrow_op = argument::get_next_scalar<std::string>(ctx);
138-
const auto lhs = argument::get_next_input<PhysicalColumn>(ctx);
139-
const auto rhs = argument::get_next_input<PhysicalColumn>(ctx);
140-
auto output = argument::get_next_output<PhysicalColumn>(ctx);
141-
auto op = arrow_to_cudf_binary_op(arrow_op, output.type());
142-
143-
std::unique_ptr<cudf::column> ret;
144-
/*
145-
* If one (not both) are length 1, use scalars as cudf doesn't allow
146-
* broadcast binary operations.
147-
*/
148-
if (lhs.num_rows() == 1 && rhs.num_rows() != 1) {
149-
auto lhs_scalar = lhs.cudf_scalar();
150-
ret = cudf::binary_operation(
151-
*lhs_scalar, rhs.column_view(), op, output.cudf_type(), ctx.stream(), ctx.mr());
152-
} else if (rhs.num_rows() == 1 && lhs.num_rows() != 1) {
153-
auto rhs_scalar = rhs.cudf_scalar();
154-
ret = cudf::binary_operation(
155-
lhs.column_view(), *rhs_scalar, op, output.cudf_type(), ctx.stream(), ctx.mr());
67+
if (datum_result.is_scalar()) {
68+
auto as_array = ARROW_RESULT(arrow::MakeArrayFromScalar(*datum_result.scalar(), 1));
69+
if (get_prefer_eager_allocations()) {
70+
output.copy_into(std::move(as_array));
15671
} else {
157-
ret = cudf::binary_operation(
158-
lhs.column_view(), rhs.column_view(), op, output.cudf_type(), ctx.stream(), ctx.mr());
72+
output.move_into(std::move(as_array));
15973
}
74+
} else {
16075
if (get_prefer_eager_allocations()) {
161-
output.copy_into(std::move(ret));
76+
output.copy_into(std::move(datum_result.make_array()));
16277
} else {
163-
output.move_into(std::move(ret));
78+
output.move_into(std::move(datum_result.make_array()));
16479
}
16580
}
166-
};
81+
}
16782

16883
} // namespace legate::dataframe::task
16984

@@ -181,15 +96,16 @@ namespace legate::dataframe {
18196
LogicalColumn binary_operation(const LogicalColumn& lhs,
18297
const LogicalColumn& rhs,
18398
std::string op,
184-
cudf::data_type output_type)
99+
std::shared_ptr<arrow::DataType> output_type)
185100
{
186101
auto runtime = legate::Runtime::get_runtime();
187102

188103
// Check if the op is valid before we enter the task
189104
// This allows us to to throw nicely
190105
if (runtime->get_machine().count(legate::mapping::TaskTarget::GPU) > 0) {
191-
// Throws if op doesn't exist
192-
task::arrow_to_cudf_binary_op(op, to_legate_type(output_type.id()));
106+
if (task::cudf_supported_binary_ops.count(op) == 0) {
107+
throw std::invalid_argument("Unsupported binary operator: " + op);
108+
}
193109
} else {
194110
auto result = arrow::compute::GetFunctionRegistry()->GetFunction(op);
195111
if (!result.ok()) {
@@ -201,7 +117,7 @@ LogicalColumn binary_operation(const LogicalColumn& lhs,
201117
auto scalar_result = lhs.is_scalar() && rhs.is_scalar();
202118
std::optional<size_t> size{};
203119
if (get_prefer_eager_allocations()) { size = lhs.is_scalar() ? rhs.num_rows() : lhs.num_rows(); }
204-
auto ret = LogicalColumn::empty_like(std::move(output_type), nullable, scalar_result, size);
120+
auto ret = LogicalColumn::empty_like(output_type, nullable, scalar_result, size);
205121
legate::AutoTask task =
206122
runtime->create_task(get_library(), task::BinaryOpColColTask::TASK_CONFIG.task_id());
207123
argument::add_next_scalar(task, op);

0 commit comments

Comments
 (0)