Skip to content

Commit 21405f2

Browse files
authored
Merge pull request #10 from hannes/main
Update to DuckDB 0.9.1
2 parents c6fc2fd + 1750ef0 commit 21405f2

File tree

85 files changed

+763
-471
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

85 files changed

+763
-471
lines changed

src/duckdb/extension/icu/icu-makedate.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ struct ICUMakeDate : public ICUDateFunc {
2323
}
2424

2525
// Extract the time zone parts
26-
auto micros = SetTime(calendar, instant);
26+
SetTime(calendar, instant);
2727
const auto era = ExtractField(calendar, UCAL_ERA);
2828
const auto year = ExtractField(calendar, UCAL_YEAR);
2929
const auto mm = ExtractField(calendar, UCAL_MONTH) + 1;

src/duckdb/extension/icu/icu-strptime.cpp

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,6 @@ struct ICUStrptime : public ICUDateFunc {
9696
auto &info = func_expr.bind_info->Cast<ICUStrptimeBindData>();
9797
CalendarPtr calendar_ptr(info.calendar->clone());
9898
auto calendar = calendar_ptr.get();
99-
auto &formats = info.formats;
10099

101100
D_ASSERT(fmt_arg.GetVectorType() == VectorType::CONSTANT_VECTOR);
102101

@@ -126,7 +125,6 @@ struct ICUStrptime : public ICUDateFunc {
126125
auto &info = func_expr.bind_info->Cast<ICUStrptimeBindData>();
127126
CalendarPtr calendar_ptr(info.calendar->clone());
128127
auto calendar = calendar_ptr.get();
129-
auto &formats = info.formats;
130128

131129
D_ASSERT(fmt_arg.GetVectorType() == VectorType::CONSTANT_VECTOR);
132130

src/duckdb/extension/icu/icu_extension.cpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -223,7 +223,6 @@ static void SetICUCalendar(ClientContext &context, SetScope scope, Value &parame
223223

224224
void IcuExtension::Load(DuckDB &ddb) {
225225
auto &db = *ddb.instance;
226-
auto &catalog = Catalog::GetSystemCatalog(db);
227226

228227
// iterate over all the collations
229228
int32_t count;

src/duckdb/extension/json/json_functions.cpp

Lines changed: 1 addition & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -189,16 +189,7 @@ vector<TableFunctionSet> JSONFunctions::GetTableFunctions() {
189189

190190
unique_ptr<TableRef> JSONFunctions::ReadJSONReplacement(ClientContext &context, const string &table_name,
191191
ReplacementScanData *data) {
192-
auto lower_name = StringUtil::Lower(table_name);
193-
// remove any compression
194-
if (StringUtil::EndsWith(lower_name, ".gz")) {
195-
lower_name = lower_name.substr(0, lower_name.size() - 3);
196-
} else if (StringUtil::EndsWith(lower_name, ".zst")) {
197-
lower_name = lower_name.substr(0, lower_name.size() - 4);
198-
}
199-
if (!StringUtil::EndsWith(lower_name, ".json") && !StringUtil::Contains(lower_name, ".json?") &&
200-
!StringUtil::EndsWith(lower_name, ".jsonl") && !StringUtil::Contains(lower_name, ".jsonl?") &&
201-
!StringUtil::EndsWith(lower_name, ".ndjson") && !StringUtil::Contains(lower_name, ".ndjson?")) {
192+
if (!ReplacementScan::CanReplace(table_name, {"json", "jsonl", "ndjson"})) {
202193
return nullptr;
203194
}
204195
auto table_function = make_uniq<TableFunctionRef>();

src/duckdb/extension/json/json_functions/json_create.cpp

Lines changed: 27 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -682,20 +682,33 @@ BoundCastInfo AnyToJSONCastBind(BindCastInput &input, const LogicalType &source,
682682
}
683683

684684
void JSONFunctions::RegisterJSONCreateCastFunctions(CastFunctionSet &casts) {
685-
auto json_to_any_cost = casts.ImplicitCastCost(LogicalType::ANY, JSONCommon::JSONType());
686-
casts.RegisterCastFunction(LogicalType::ANY, JSONCommon::JSONType(), AnyToJSONCastBind, json_to_any_cost);
687-
688-
const auto struct_type = LogicalType::STRUCT({{"any", LogicalType::ANY}});
689-
auto struct_to_json_cost = casts.ImplicitCastCost(struct_type, LogicalType::VARCHAR) - 2;
690-
casts.RegisterCastFunction(struct_type, JSONCommon::JSONType(), AnyToJSONCastBind, struct_to_json_cost);
691-
692-
const auto list_type = LogicalType::LIST(LogicalType::ANY);
693-
auto list_to_json_cost = casts.ImplicitCastCost(list_type, LogicalType::VARCHAR) - 2;
694-
casts.RegisterCastFunction(list_type, JSONCommon::JSONType(), AnyToJSONCastBind, list_to_json_cost);
695-
696-
const auto map_type = LogicalType::MAP(LogicalType::ANY, LogicalType::ANY);
697-
auto map_to_json_cost = casts.ImplicitCastCost(map_type, LogicalType::VARCHAR) - 2;
698-
casts.RegisterCastFunction(map_type, JSONCommon::JSONType(), AnyToJSONCastBind, map_to_json_cost);
685+
// Anything can be cast to JSON
686+
for (const auto &type : LogicalType::AllTypes()) {
687+
LogicalType source_type;
688+
switch (type.id()) {
689+
case LogicalTypeId::STRUCT:
690+
source_type = LogicalType::STRUCT({{"any", LogicalType::ANY}});
691+
break;
692+
case LogicalTypeId::LIST:
693+
source_type = LogicalType::LIST(LogicalType::ANY);
694+
break;
695+
case LogicalTypeId::MAP:
696+
source_type = LogicalType::MAP(LogicalType::ANY, LogicalType::ANY);
697+
break;
698+
case LogicalTypeId::UNION:
699+
source_type = LogicalType::UNION({{"any", LogicalType::ANY}});
700+
break;
701+
case LogicalTypeId::VARCHAR:
702+
// We skip this one here as it's handled in json_functions.cpp
703+
continue;
704+
default:
705+
source_type = type;
706+
}
707+
// We prefer going to JSON over going to VARCHAR if a function can do either
708+
const auto source_to_json_cost =
709+
MaxValue<int64_t>(casts.ImplicitCastCost(source_type, LogicalType::VARCHAR) - 1, 0);
710+
casts.RegisterCastFunction(source_type, JSONCommon::JSONType(), AnyToJSONCastBind, source_to_json_cost);
711+
}
699712
}
700713

701714
} // namespace duckdb

src/duckdb/extension/json/json_functions/json_transform.cpp

Lines changed: 26 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -898,20 +898,32 @@ BoundCastInfo JSONToAnyCastBind(BindCastInput &input, const LogicalType &source,
898898
}
899899

900900
void JSONFunctions::RegisterJSONTransformCastFunctions(CastFunctionSet &casts) {
901-
auto json_to_any_cost = casts.ImplicitCastCost(JSONCommon::JSONType(), LogicalType::ANY);
902-
casts.RegisterCastFunction(JSONCommon::JSONType(), LogicalType::ANY, JSONToAnyCastBind, json_to_any_cost);
903-
904-
const auto struct_type = LogicalType::STRUCT({{"any", LogicalType::ANY}});
905-
auto json_to_struct_cost = casts.ImplicitCastCost(LogicalType::VARCHAR, struct_type) - 2;
906-
casts.RegisterCastFunction(JSONCommon::JSONType(), struct_type, JSONToAnyCastBind, json_to_struct_cost);
907-
908-
const auto list_type = LogicalType::LIST(LogicalType::ANY);
909-
auto json_to_list_cost = casts.ImplicitCastCost(LogicalType::VARCHAR, list_type) - 2;
910-
casts.RegisterCastFunction(JSONCommon::JSONType(), list_type, JSONToAnyCastBind, json_to_list_cost);
911-
912-
const auto map_type = LogicalType::MAP(LogicalType::ANY, LogicalType::ANY);
913-
auto json_to_map_cost = casts.ImplicitCastCost(LogicalType::VARCHAR, map_type) - 2;
914-
casts.RegisterCastFunction(JSONCommon::JSONType(), map_type, JSONToAnyCastBind, json_to_map_cost);
901+
// JSON can be cast to anything
902+
for (const auto &type : LogicalType::AllTypes()) {
903+
LogicalType target_type;
904+
switch (type.id()) {
905+
case LogicalTypeId::STRUCT:
906+
target_type = LogicalType::STRUCT({{"any", LogicalType::ANY}});
907+
break;
908+
case LogicalTypeId::LIST:
909+
target_type = LogicalType::LIST(LogicalType::ANY);
910+
break;
911+
case LogicalTypeId::MAP:
912+
target_type = LogicalType::MAP(LogicalType::ANY, LogicalType::ANY);
913+
break;
914+
case LogicalTypeId::UNION:
915+
target_type = LogicalType::UNION({{"any", LogicalType::ANY}});
916+
break;
917+
case LogicalTypeId::VARCHAR:
918+
// We skip this one here as it's handled in json_functions.cpp
919+
continue;
920+
default:
921+
target_type = type;
922+
}
923+
// Going from JSON to another type has the same cost as going from VARCHAR to that type
924+
const auto json_to_target_cost = casts.ImplicitCastCost(LogicalType::VARCHAR, target_type);
925+
casts.RegisterCastFunction(JSONCommon::JSONType(), target_type, JSONToAnyCastBind, json_to_target_cost);
926+
}
915927
}
916928

917929
} // namespace duckdb

src/duckdb/extension/parquet/column_reader.cpp

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -243,6 +243,7 @@ void ColumnReader::InitializeRead(idx_t row_group_idx_p, const vector<ColumnChun
243243
void ColumnReader::PrepareRead(parquet_filter_t &filter) {
244244
dict_decoder.reset();
245245
defined_decoder.reset();
246+
bss_decoder.reset();
246247
block.reset();
247248
PageHeader page_hdr;
248249
page_hdr.read(protocol);
@@ -443,6 +444,13 @@ void ColumnReader::PrepareDataPage(PageHeader &page_hdr) {
443444
PrepareDeltaByteArray(*block);
444445
break;
445446
}
447+
case Encoding::BYTE_STREAM_SPLIT: {
448+
// Subtract 1 from length as the block is allocated with 1 extra byte,
449+
// but the byte stream split encoder needs to know the correct data size.
450+
bss_decoder = make_uniq<BssDecoder>(block->ptr, block->len - 1);
451+
block->inc(block->len);
452+
break;
453+
}
446454
case Encoding::PLAIN:
447455
// nothing to do here, will be read directly below
448456
break;
@@ -488,7 +496,7 @@ idx_t ColumnReader::Read(uint64_t num_values, parquet_filter_t &filter, data_ptr
488496

489497
idx_t null_count = 0;
490498

491-
if ((dict_decoder || dbp_decoder || rle_decoder) && HasDefines()) {
499+
if ((dict_decoder || dbp_decoder || rle_decoder || bss_decoder) && HasDefines()) {
492500
// we need the null count because the dictionary offsets have no entries for nulls
493501
for (idx_t i = 0; i < read_now; i++) {
494502
if (define_out[i + result_offset] != max_define) {
@@ -534,6 +542,23 @@ idx_t ColumnReader::Read(uint64_t num_values, parquet_filter_t &filter, data_ptr
534542
} else if (byte_array_data) {
535543
// DELTA_BYTE_ARRAY or DELTA_LENGTH_BYTE_ARRAY
536544
DeltaByteArray(define_out, read_now, filter, result_offset, result);
545+
} else if (bss_decoder) {
546+
auto read_buf = make_shared<ResizeableBuffer>();
547+
548+
switch (schema.type) {
549+
case duckdb_parquet::format::Type::FLOAT:
550+
read_buf->resize(reader.allocator, sizeof(float) * (read_now - null_count));
551+
bss_decoder->GetBatch<float>(read_buf->ptr, read_now - null_count);
552+
break;
553+
case duckdb_parquet::format::Type::DOUBLE:
554+
read_buf->resize(reader.allocator, sizeof(double) * (read_now - null_count));
555+
bss_decoder->GetBatch<double>(read_buf->ptr, read_now - null_count);
556+
break;
557+
default:
558+
throw std::runtime_error("BYTE_STREAM_SPLIT encoding is only supported for FLOAT or DOUBLE data");
559+
}
560+
561+
Plain(read_buf, define_out, read_now, filter, result_offset, result);
537562
} else {
538563
PlainReference(block, result);
539564
Plain(block, define_out, read_now, filter, result_offset, result);

src/duckdb/extension/parquet/column_writer.cpp

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -796,6 +796,13 @@ struct ParquetTimestampSOperator : public BaseParquetOperator {
796796
}
797797
};
798798

799+
struct ParquetTimeTZOperator : public BaseParquetOperator {
800+
template <class SRC, class TGT>
801+
static TGT Operation(SRC input) {
802+
return input.time().micros;
803+
}
804+
};
805+
799806
struct ParquetHugeintOperator {
800807
template <class SRC, class TGT>
801808
static TGT Operation(SRC input) {
@@ -1975,12 +1982,14 @@ unique_ptr<ColumnWriter> ColumnWriter::CreateWriterRecursive(vector<duckdb_parqu
19751982
max_define, can_have_nulls);
19761983
case LogicalTypeId::BIGINT:
19771984
case LogicalTypeId::TIME:
1978-
case LogicalTypeId::TIME_TZ:
19791985
case LogicalTypeId::TIMESTAMP:
19801986
case LogicalTypeId::TIMESTAMP_TZ:
19811987
case LogicalTypeId::TIMESTAMP_MS:
19821988
return make_uniq<StandardColumnWriter<int64_t, int64_t>>(writer, schema_idx, std::move(schema_path), max_repeat,
19831989
max_define, can_have_nulls);
1990+
case LogicalTypeId::TIME_TZ:
1991+
return make_uniq<StandardColumnWriter<dtime_tz_t, int64_t, ParquetTimeTZOperator>>(
1992+
writer, schema_idx, std::move(schema_path), max_repeat, max_define, can_have_nulls);
19841993
case LogicalTypeId::HUGEINT:
19851994
return make_uniq<StandardColumnWriter<hugeint_t, double, ParquetHugeintOperator>>(
19861995
writer, schema_idx, std::move(schema_path), max_repeat, max_define, can_have_nulls);

src/duckdb/extension/parquet/include/column_reader.hpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
#pragma once
1010

1111
#include "duckdb.hpp"
12+
#include "parquet_bss_decoder.hpp"
1213
#include "parquet_dbp_decoder.hpp"
1314
#include "parquet_rle_bp_decoder.hpp"
1415
#include "parquet_statistics.hpp"
@@ -161,6 +162,7 @@ class ColumnReader {
161162
unique_ptr<RleBpDecoder> repeated_decoder;
162163
unique_ptr<DbpDecoder> dbp_decoder;
163164
unique_ptr<RleBpDecoder> rle_decoder;
165+
unique_ptr<BssDecoder> bss_decoder;
164166

165167
// dummies for Skip()
166168
parquet_filter_t none_filter;
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
//===----------------------------------------------------------------------===//
//                         DuckDB
//
// parquet_bss_decoder.hpp
//
//
//===----------------------------------------------------------------------===//

#pragma once
#include "parquet_types.h"
#include "resizable_buffer.hpp"

// Include what we use: std::stringstream and std::runtime_error were
// previously pulled in only transitively.
#include <sstream>
#include <stdexcept>

namespace duckdb {

/// Decoder for the Parquet BYTE_STREAM_SPLIT encoding.
/// The encoding stores the i-th byte of every value contiguously (first all
/// byte 0s, then all byte 1s, ...); decoding transposes the byte streams back
/// into interleaved values.
class BssDecoder {
public:
	/// Create a decoder object. buffer/buffer_len is the encoded data.
	BssDecoder(data_ptr_t buffer, uint32_t buffer_len) : buffer_(buffer, buffer_len), value_offset_(0) {
	}

public:
	/// Decode the next batch_size values of type T into values_target_ptr.
	/// Successive calls continue where the previous batch left off.
	/// Throws std::runtime_error if the buffer size is not a multiple of sizeof(T).
	template <typename T>
	void GetBatch(data_ptr_t values_target_ptr, uint32_t batch_size) {
		if (buffer_.len % sizeof(T) != 0) {
			std::stringstream error;
			error << "Data buffer size for the BYTE_STREAM_SPLIT encoding (" << buffer_.len
			      << ") should be a multiple of the type size (" << sizeof(T) << ")";
			throw std::runtime_error(error.str());
		}
		uint32_t num_buffer_values = buffer_.len / sizeof(T);

		// Bounds check before reading; ByteBuffer::available presumably raises
		// when fewer bytes remain (defined in resizable_buffer.hpp — confirm).
		buffer_.available((value_offset_ + batch_size) * sizeof(T));

		// Transpose: for each byte position of T, copy that byte stream into
		// the corresponding byte of every output value.
		for (uint32_t byte_offset = 0; byte_offset < sizeof(T); ++byte_offset) {
			data_ptr_t input_bytes = buffer_.ptr + byte_offset * num_buffer_values + value_offset_;
			for (uint32_t i = 0; i < batch_size; ++i) {
				values_target_ptr[byte_offset + i * sizeof(T)] = *(input_bytes + i);
			}
		}
		value_offset_ += batch_size;
	}

private:
	ByteBuffer buffer_;     // encoded input data
	uint32_t value_offset_; // number of values already decoded by prior batches
};

} // namespace duckdb

0 commit comments

Comments
 (0)