diff --git a/velox/connectors/hive/SplitReader.cpp b/velox/connectors/hive/SplitReader.cpp index 673bd75449f0..6aa05b40f62a 100644 --- a/velox/connectors/hive/SplitReader.cpp +++ b/velox/connectors/hive/SplitReader.cpp @@ -421,11 +421,17 @@ std::vector SplitReader::adaptColumns( auto fileTypeIdx = fileType->getChildIdxIfExists(fieldName); if (!fileTypeIdx.has_value()) { // Column is missing. Most likely due to schema evolution. - VELOX_CHECK(tableSchema, "Unable to resolve column '{}'", fieldName); + auto outputTypeIdx = readerOutputType_->getChildIdxIfExists(fieldName); + TypePtr fieldType; + if (outputTypeIdx.has_value()) { + // Field name exists in the user-specified output type. + fieldType = readerOutputType_->childAt(outputTypeIdx.value()); + } else { + VELOX_CHECK(tableSchema, "Unable to resolve column '{}'", fieldName); + fieldType = tableSchema->findChild(fieldName); + } childSpec->setConstantValue(BaseVector::createNullConstant( - tableSchema->findChild(fieldName), - 1, - connectorQueryCtx_->memoryPool())); + fieldType, 1, connectorQueryCtx_->memoryPool())); } else { // Column no longer missing, reset constant value set on the spec. childSpec->setConstantValue(nullptr); diff --git a/velox/dwio/common/ScanSpec.cpp b/velox/dwio/common/ScanSpec.cpp index 696691222152..1d118bea3fed 100644 --- a/velox/dwio/common/ScanSpec.cpp +++ b/velox/dwio/common/ScanSpec.cpp @@ -157,7 +157,7 @@ bool ScanSpec::hasFilter() const { if (hasFilter_.has_value()) { return hasFilter_.value(); } - if (!isConstant() && filter()) { + if (filter()) { hasFilter_ = true; return true; } diff --git a/velox/dwio/common/SelectiveFlatMapColumnReader.h b/velox/dwio/common/SelectiveFlatMapColumnReader.h index 8a939ffa7bbe..507331123298 100644 --- a/velox/dwio/common/SelectiveFlatMapColumnReader.h +++ b/velox/dwio/common/SelectiveFlatMapColumnReader.h @@ -24,11 +24,13 @@ namespace facebook::velox::dwio::common { class SelectiveFlatMapColumnReader : public SelectiveStructColumnReaderBase { protected: SelectiveFlatMapColumnReader( + const dwio::common::ColumnReaderOptions& columnReaderOptions, const TypePtr& requestedType, const std::shared_ptr& fileType, FormatParams& params, velox::common::ScanSpec& scanSpec) : SelectiveStructColumnReaderBase( + columnReaderOptions, requestedType, fileType, params, diff --git a/velox/dwio/common/SelectiveStructColumnReader.cpp b/velox/dwio/common/SelectiveStructColumnReader.cpp index e9776575e11e..d0693649a1d1 100644 --- a/velox/dwio/common/SelectiveStructColumnReader.cpp +++ b/velox/dwio/common/SelectiveStructColumnReader.cpp @@ -426,7 +426,6 @@ void SelectiveStructColumnReaderBase::read( } const auto& childSpecs = scanSpec_->children(); - VELOX_CHECK(!childSpecs.empty()); for (size_t i = 0; i < childSpecs.size(); ++i) { const auto& childSpec = childSpecs[i]; VELOX_TRACE_HISTORY_PUSH("read %s", childSpec->fieldName().c_str()); @@ -526,9 +525,12 @@ bool SelectiveStructColumnReaderBase::isChildMissing( // row type that doesn't exist // in the output. fileType_->type()->kind() != - TypeKind::MAP && // If this is the case it means this is a flat map, - // so it can't have "missing" fields. - childSpec.channel() >= fileType_->size()); + TypeKind::MAP // If this is the case it means this is a flat map, + // so it can't have "missing" fields. + ) && + (columnReaderOptions_.useColumnNamesForColumnMapping_ + ? !asRowType(fileType_->type())->containsChild(childSpec.fieldName()) + : childSpec.channel() >= fileType_->size()); } std::unique_ptr @@ -540,7 +542,6 @@ SelectiveStructColumnReaderBase::makeColumnLoader(vector_size_t index) { void SelectiveStructColumnReaderBase::getValues( const RowSet& rows, VectorPtr* result) { - VELOX_CHECK(!scanSpec_->children().empty()); VELOX_CHECK_NOT_NULL( *result, "SelectiveStructColumnReaderBase expects a non-null result"); VELOX_CHECK( diff --git a/velox/dwio/common/SelectiveStructColumnReader.h b/velox/dwio/common/SelectiveStructColumnReader.h index 4cbc4189a781..19f442eda816 100644 --- a/velox/dwio/common/SelectiveStructColumnReader.h +++ b/velox/dwio/common/SelectiveStructColumnReader.h @@ -16,6 +16,7 @@ #pragma once +#include "velox/dwio/common/Options.h" #include "velox/dwio/common/SelectiveColumnReaderInternal.h" namespace facebook::velox::dwio::common { @@ -113,6 +114,7 @@ class SelectiveStructColumnReaderBase : public SelectiveColumnReader { static constexpr int32_t kConstantChildSpecSubscript = -1; SelectiveStructColumnReaderBase( + const dwio::common::ColumnReaderOptions& columnReaderOptions, const TypePtr& requestedType, const std::shared_ptr& fileType, FormatParams& params, @@ -120,6 +122,7 @@ class SelectiveStructColumnReaderBase : public SelectiveColumnReader { bool isRoot = false, bool generateLazyChildren = true) : SelectiveColumnReader(requestedType, fileType, params, scanSpec), + columnReaderOptions_(columnReaderOptions), debugString_( getExceptionContext().message(VeloxException::Type::kSystem)), isRoot_(isRoot), @@ -180,6 +183,8 @@ class SelectiveStructColumnReaderBase : public SelectiveColumnReader { } } + const dwio::common::ColumnReaderOptions& columnReaderOptions_; + // Context information obtained from ExceptionContext. Stored here // so that LazyVector readers under this can add this to their // ExceptionContext. Allows contextualizing reader errors to split diff --git a/velox/dwio/dwrf/reader/SelectiveFlatMapColumnReader.cpp b/velox/dwio/dwrf/reader/SelectiveFlatMapColumnReader.cpp index ec3dd9d90b1b..cc3f400bc42e 100644 --- a/velox/dwio/dwrf/reader/SelectiveFlatMapColumnReader.cpp +++ b/velox/dwio/dwrf/reader/SelectiveFlatMapColumnReader.cpp @@ -203,6 +203,7 @@ class SelectiveFlatMapAsStructReader : public SelectiveStructColumnReaderBase { DwrfParams& params, common::ScanSpec& scanSpec) : SelectiveStructColumnReaderBase( + columnReaderOptions, requestedType, fileType, params, @@ -241,6 +242,7 @@ class SelectiveFlatMapAsMapReader : public SelectiveStructColumnReaderBase { DwrfParams& params, common::ScanSpec& scanSpec) : SelectiveStructColumnReaderBase( + columnReaderOptions, requestedType, fileType, params, @@ -285,6 +287,7 @@ class SelectiveFlatMapReader DwrfParams& params, common::ScanSpec& scanSpec) : dwio::common::SelectiveFlatMapColumnReader( + columnReaderOptions, requestedType, fileType, params, diff --git a/velox/dwio/dwrf/reader/SelectiveStructColumnReader.cpp b/velox/dwio/dwrf/reader/SelectiveStructColumnReader.cpp index b1f47289505e..aa55cab7f04e 100644 --- a/velox/dwio/dwrf/reader/SelectiveStructColumnReader.cpp +++ b/velox/dwio/dwrf/reader/SelectiveStructColumnReader.cpp @@ -31,6 +31,7 @@ SelectiveStructColumnReader::SelectiveStructColumnReader( common::ScanSpec& scanSpec, bool isRoot) : SelectiveStructColumnReaderBase( + columnReaderOptions, requestedType, fileType, params, diff --git a/velox/dwio/dwrf/reader/SelectiveStructColumnReader.h b/velox/dwio/dwrf/reader/SelectiveStructColumnReader.h index 6e323b67fe39..f8559edfca0f 100644 --- a/velox/dwio/dwrf/reader/SelectiveStructColumnReader.h +++ b/velox/dwio/dwrf/reader/SelectiveStructColumnReader.h @@ -25,6 +25,7 @@ class SelectiveStructColumnReaderBase : public dwio::common::SelectiveStructColumnReaderBase { public: SelectiveStructColumnReaderBase( + const dwio::common::ColumnReaderOptions& columnReaderOptions, const TypePtr& requestedType, const std::shared_ptr& fileType, DwrfParams& params, @@ -32,6 +33,7 @@ class SelectiveStructColumnReaderBase bool isRoot = false, bool generateLazyChildren = true) : dwio::common::SelectiveStructColumnReaderBase( + columnReaderOptions, requestedType, fileType, params, diff --git a/velox/dwio/parquet/reader/ParquetColumnReader.cpp b/velox/dwio/parquet/reader/ParquetColumnReader.cpp index e98159b7a46e..8deac03573d3 100644 --- a/velox/dwio/parquet/reader/ParquetColumnReader.cpp +++ b/velox/dwio/parquet/reader/ParquetColumnReader.cpp @@ -38,7 +38,8 @@ std::unique_ptr ParquetColumnReader::build( const TypePtr& requestedType, const std::shared_ptr& fileType, ParquetParams& params, - common::ScanSpec& scanSpec) { + common::ScanSpec& scanSpec, + memory::MemoryPool& pool) { auto colName = scanSpec.fieldName(); switch (fileType->type()->kind()) { @@ -59,7 +60,7 @@ std::unique_ptr ParquetColumnReader::build( case TypeKind::ROW: return std::make_unique( - columnReaderOptions, requestedType, fileType, params, scanSpec); + columnReaderOptions, requestedType, fileType, params, scanSpec, pool); case TypeKind::VARBINARY: case TypeKind::VARCHAR: @@ -68,12 +69,12 @@ std::unique_ptr ParquetColumnReader::build( case TypeKind::ARRAY: { VELOX_CHECK(requestedType->isArray(), "Requested type must be array"); return std::make_unique( - columnReaderOptions, requestedType, fileType, params, scanSpec); + columnReaderOptions, requestedType, fileType, params, scanSpec, pool); } case TypeKind::MAP: return std::make_unique( - columnReaderOptions, requestedType, fileType, params, scanSpec); + columnReaderOptions, requestedType, fileType, params, scanSpec, pool); case TypeKind::BOOLEAN: return std::make_unique( diff --git a/velox/dwio/parquet/reader/ParquetColumnReader.h b/velox/dwio/parquet/reader/ParquetColumnReader.h index dd3a4a450b9e..4d9725d08391 100644 --- a/velox/dwio/parquet/reader/ParquetColumnReader.h +++ b/velox/dwio/parquet/reader/ParquetColumnReader.h @@ -47,6 +47,7 @@ class ParquetColumnReader { const TypePtr& requestedType, const std::shared_ptr& fileType, ParquetParams& params, - common::ScanSpec& scanSpec); + common::ScanSpec& scanSpec, + memory::MemoryPool& pool); }; } // namespace facebook::velox::parquet diff --git a/velox/dwio/parquet/reader/ParquetReader.cpp b/velox/dwio/parquet/reader/ParquetReader.cpp index 954b2548dfc1..7942440e67b3 100644 --- a/velox/dwio/parquet/reader/ParquetReader.cpp +++ b/velox/dwio/parquet/reader/ParquetReader.cpp @@ -1207,12 +1207,15 @@ class ParquetRowReader::Impl { options_.timestampPrecision()); requestedType_ = options_.requestedType() ? options_.requestedType() : readerBase_->schema(); + columnReaderOptions_ = + dwio::common::makeColumnReaderOptions(readerBase_->options()); columnReader_ = ParquetColumnReader::build( columnReaderOptions_, requestedType_, readerBase_->schemaWithId(), // Id is schema id params, - *options_.scanSpec()); + *options_.scanSpec(), + pool_); columnReader_->setIsTopLevel(); filterRowGroups(); @@ -1222,9 +1225,6 @@ class ParquetRowReader::Impl { // table scan. advanceToNextRowGroup(); } - - columnReaderOptions_ = - dwio::common::makeColumnReaderOptions(readerBase_->options()); } void filterRowGroups() { diff --git a/velox/dwio/parquet/reader/RepeatedColumnReader.cpp b/velox/dwio/parquet/reader/RepeatedColumnReader.cpp index 8cd75156747a..7c2ed6326eda 100644 --- a/velox/dwio/parquet/reader/RepeatedColumnReader.cpp +++ b/velox/dwio/parquet/reader/RepeatedColumnReader.cpp @@ -33,6 +33,9 @@ PageReader* readLeafRepDefs( return nullptr; } auto pageReader = reader->formatData().as().reader(); + if (pageReader == nullptr) { + return nullptr; + } pageReader->decodeRepDefs(numTop); return pageReader; } @@ -114,7 +117,8 @@ MapColumnReader::MapColumnReader( const TypePtr& requestedType, const std::shared_ptr& fileType, ParquetParams& params, - common::ScanSpec& scanSpec) + common::ScanSpec& scanSpec, + memory::MemoryPool& pool) : dwio::common::SelectiveMapColumnReader( requestedType, fileType, @@ -128,13 +132,15 @@ MapColumnReader::MapColumnReader( keyChildType, fileType_->childAt(0), params, - *scanSpec.children()[0]); + *scanSpec.children()[0], + pool); elementReader_ = ParquetColumnReader::build( columnReaderOptions, elementChildType, fileType_->childAt(1), params, - *scanSpec.children()[1]); + *scanSpec.children()[1], + pool); reinterpret_cast(fileType.get()) ->makeLevelInfo(levelInfo_); children_ = {keyReader_.get(), elementReader_.get()}; @@ -232,7 +238,8 @@ ListColumnReader::ListColumnReader( const TypePtr& requestedType, const std::shared_ptr& fileType, ParquetParams& params, - common::ScanSpec& scanSpec) + common::ScanSpec& scanSpec, + memory::MemoryPool& pool) : dwio::common::SelectiveListColumnReader( requestedType, fileType, @@ -244,7 +251,8 @@ ListColumnReader::ListColumnReader( childType, fileType_->childAt(0), params, - *scanSpec.children()[0]); + *scanSpec.children()[0], + pool); reinterpret_cast(fileType.get()) ->makeLevelInfo(levelInfo_); children_ = {child_.get()}; diff --git a/velox/dwio/parquet/reader/RepeatedColumnReader.h b/velox/dwio/parquet/reader/RepeatedColumnReader.h index 823e874b2f06..066cee7f7cd0 100644 --- a/velox/dwio/parquet/reader/RepeatedColumnReader.h +++ b/velox/dwio/parquet/reader/RepeatedColumnReader.h @@ -61,7 +61,8 @@ class MapColumnReader : public dwio::common::SelectiveMapColumnReader { const TypePtr& requestedType, const std::shared_ptr& fileType, ParquetParams& params, - common::ScanSpec& scanSpec); + common::ScanSpec& scanSpec, + memory::MemoryPool& pool); void prepareRead( vector_size_t offset, @@ -118,7 +119,8 @@ class ListColumnReader : public dwio::common::SelectiveListColumnReader { const TypePtr& requestedType, const std::shared_ptr& fileType, ParquetParams& params, - common::ScanSpec& scanSpec); + common::ScanSpec& scanSpec, + memory::MemoryPool& pool); void prepareRead( vector_size_t offset, diff --git a/velox/dwio/parquet/reader/StructColumnReader.cpp b/velox/dwio/parquet/reader/StructColumnReader.cpp index 694f334c51a2..6fb84c0cdd69 100644 --- a/velox/dwio/parquet/reader/StructColumnReader.cpp +++ b/velox/dwio/parquet/reader/StructColumnReader.cpp @@ -31,18 +31,33 @@ StructColumnReader::StructColumnReader( const TypePtr& requestedType, const std::shared_ptr& fileType, ParquetParams& params, - common::ScanSpec& scanSpec) - : SelectiveStructColumnReader(requestedType, fileType, params, scanSpec) { + common::ScanSpec& scanSpec, + memory::MemoryPool& pool) + : SelectiveStructColumnReader( + columnReaderOptions, + requestedType, + fileType, + params, + scanSpec, + /*isRoot=*/false) { auto& childSpecs = scanSpec_->stableChildren(); + const bool useColumnNames = + columnReaderOptions.useColumnNamesForColumnMapping_; + std::vector missingFields; for (auto i = 0; i < childSpecs.size(); ++i) { auto childSpec = childSpecs[i]; - if (childSpec->isConstant() || isChildMissing(*childSpec)) { + if (childSpec->isConstant() && + (!useColumnNames && isChildMissing(*childSpec))) { childSpec->setSubscript(kConstantChildSpecSubscript); continue; } if (!childSpecs[i]->readFromFile()) { continue; } + if (useColumnNames && isChildMissing(*childSpec)) { + missingFields.emplace_back(i); + continue; + } auto childFileType = fileType_->childByName(childSpec->fieldName()); auto childRequestedType = requestedType_->asRow().findChild(childSpec->fieldName()); @@ -51,10 +66,30 @@ StructColumnReader::StructColumnReader( childRequestedType, childFileType, params, - *childSpec)); + *childSpec, + pool)); childSpecs[i]->setSubscript(children_.size() - 1); } + + // 'missingFields' is not empty only when using column names for column + // mapping. + if (missingFields.size() > 0) { + // Set the struct as null if all the subfields in the requested type are + // missing and the number of subfields is more than one. + if (childSpecs.size() > 1 && missingFields.size() == childSpecs.size()) { + scanSpec_->setConstantValue( + BaseVector::createNullConstant(requestedType_, 1, &pool)); + } else { + // Set null constant for the missing subfield of requested type. + auto rowTypePtr = asRowType(requestedType_); + for (int channel : missingFields) { + childSpecs[channel]->setConstantValue(BaseVector::createNullConstant( + rowTypePtr->findChild(childSpecs[channel]->fieldName()), 1, &pool)); + } + } + } + auto type = reinterpret_cast(fileType_.get()); if (type->parent()) { levelMode_ = reinterpret_cast(fileType_.get()) @@ -64,7 +99,10 @@ StructColumnReader::StructColumnReader( // this and the child. auto child = childForRepDefs_; for (;;) { - assert(child); + if (child == nullptr) { + levelMode_ = LevelMode::kNulls; + break; + } if (child->fileType().type()->kind() == TypeKind::ARRAY || child->fileType().type()->kind() == TypeKind::MAP) { levelMode_ = LevelMode::kStructOverLists; @@ -101,7 +139,6 @@ StructColumnReader::findBestLeaf() { best = child; } } - assert(best); return best; } diff --git a/velox/dwio/parquet/reader/StructColumnReader.h b/velox/dwio/parquet/reader/StructColumnReader.h index 37acbdc2e5ea..015dcbc1babc 100644 --- a/velox/dwio/parquet/reader/StructColumnReader.h +++ b/velox/dwio/parquet/reader/StructColumnReader.h @@ -37,7 +37,8 @@ class StructColumnReader : public dwio::common::SelectiveStructColumnReader { const TypePtr& requestedType, const std::shared_ptr& fileType, ParquetParams& params, - common::ScanSpec& scanSpec); + common::ScanSpec& scanSpec, + memory::MemoryPool& pool); void read(int64_t offset, const RowSet& rows, const uint64_t* incomingNulls) override; diff --git a/velox/dwio/parquet/tests/reader/ParquetTableScanTest.cpp b/velox/dwio/parquet/tests/reader/ParquetTableScanTest.cpp index 5cef59993eaf..c99663f2e6d3 100644 --- a/velox/dwio/parquet/tests/reader/ParquetTableScanTest.cpp +++ b/velox/dwio/parquet/tests/reader/ParquetTableScanTest.cpp @@ -297,6 +297,11 @@ class ParquetTableScanTest : public HiveConnectorTestBase { "SELECT t from tmp where t != TIMESTAMP '2000-09-12 22:36:29'"); } + const std::vector>& splits() + const { + return splits_; + } + private: RowTypePtr getRowType(std::vector&& outputColumnNames) const { std::vector types; @@ -1246,6 +1251,133 @@ TEST_F(ParquetTableScanTest, schemaMatch) { assertEqualVectors(rows->childAt(2), nullVector); } +TEST_F(ParquetTableScanTest, structMatchByName) { + const auto assertSelectUseColumnNames = + [this]( + const RowTypePtr& outputType, + const std::string& sql, + const std::string& remainingFilter = "") { + const auto plan = + PlanBuilder().tableScan(outputType, {}, remainingFilter).planNode(); + AssertQueryBuilder(plan, duckDbQueryRunner_) + .connectorSessionProperty( + kHiveConnectorId, + HiveConfig::kParquetUseColumnNamesSession, + "true") + .splits(splits()) + .assertResults(sql); + }; + + std::vector values = {2}; + const auto id = makeFlatVector(values); + const auto name = makeRowVector( + {"first", "last"}, + { + makeFlatVector({"Janet"}), + makeFlatVector({"Jones"}), + }); + const auto address = makeFlatVector({"567 Maple Drive"}); + auto vector = makeRowVector({"id", "name", "address"}, {id, name, address}); + + WriterOptions options; + auto file = TempFilePath::create(); + writeToParquetFile(file->getPath(), {vector}, options); + + loadData(file->getPath(), asRowType(vector->type()), vector); + assertSelect({"id", "name", "address"}, "SELECT id, name, address from tmp"); + + // Add one non-existing subfield 'middle' to the 'name' field and rename filed + // 'address'. + auto rowType = + ROW({"id", "name", "email"}, + {BIGINT(), + ROW({"first", "middle", "last"}, {VARCHAR(), VARCHAR(), VARCHAR()}), + VARCHAR()}); + loadData(file->getPath(), rowType, vector); + assertSelectUseColumnNames( + rowType, "SELECT 2, ('Janet', null, 'Jones'), null"); + + // Filter pushdown on the non-existing field. + assertSelectUseColumnNames( + rowType, "SELECT * from tmp where false", "not(is_null(name.middle))"); + + // Rename subfields of the 'name' field. + rowType = + ROW({"id", "name", "address"}, + {BIGINT(), ROW({"a", "b"}, {VARCHAR(), VARCHAR()}), VARCHAR()}); + loadData(file->getPath(), rowType, vector); + assertSelectUseColumnNames(rowType, "SELECT 2, null, '567 Maple Drive'"); + + // Filter pushdown on the NULL subfield. + assertSelectUseColumnNames( + rowType, "SELECT * from tmp where false", "not(is_null(name))"); + + // Deletion of one subfield from the 'name' field. + rowType = + ROW({"id", "name", "address"}, + {BIGINT(), ROW({"full"}, {VARCHAR()}), VARCHAR()}); + loadData(file->getPath(), rowType, vector); + assertSelectUseColumnNames(rowType, "SELECT 2, row(null), '567 Maple Drive'"); + + // Filter pushdown on the non-existing subfield. + assertSelectUseColumnNames( + rowType, "SELECT * from tmp where false", "not(is_null(name.full))"); + + // No subfield in the 'name' field. + rowType = ROW({"id", "name", "address"}, {BIGINT(), ROW({}, {}), VARCHAR()}); + const auto op = PlanBuilder() + .startTableScan() + .outputType(rowType) + .dataColumns(rowType) + .endTableScan() + .planNode(); + const auto split = makeSplit(file->getPath()); + const auto result = AssertQueryBuilder(op) + .connectorSessionProperty( + kHiveConnectorId, + HiveConfig::kParquetUseColumnNamesSession, + "true") + .split(split) + .copyResults(pool()); + const auto rows = result->as(); + const auto expected = makeRowVector(ROW({}, {}), 1); + assertEqualVectors(expected, rows->childAt(1)); + + // Case sensitivity when matching by name. + vector = makeRowVector( + {"id", "name", "address"}, + {id, + makeRowVector( + {"FIRST", "LAST"}, + { + makeFlatVector({"Janet"}), + makeFlatVector({"Jones"}), + }), + address}); + file = TempFilePath::create(); + writeToParquetFile(file->getPath(), {vector}, options); + + rowType = + ROW({"id", "name", "address"}, + {BIGINT(), + ROW({"first", "middle", "last"}, {VARCHAR(), VARCHAR(), VARCHAR()}), + VARCHAR()}); + loadData(file->getPath(), rowType, vector); + assertSelectUseColumnNames(rowType, "SELECT 2, null, '567 Maple Drive'"); + + // Case insensitivity when matching by name and reading as lower case. + auto plan = PlanBuilder().tableScan(rowType, {}, "", rowType).planNode(); + AssertQueryBuilder(plan, duckDbQueryRunner_) + .connectorSessionProperty( + kHiveConnectorId, HiveConfig::kParquetUseColumnNamesSession, "true") + .connectorSessionProperty( + kHiveConnectorId, + HiveConfig::kFileColumnNamesReadAsLowerCaseSession, + "true") + .splits(splits()) + .assertResults("SELECT 2, ('Janet', null, 'Jones'), '567 Maple Drive'"); +} + TEST_F(ParquetTableScanTest, deltaByteArray) { auto a = makeFlatVector({"axis", "axle", "babble", "babyhood"}); auto expected = makeRowVector({"a"}, {a}); diff --git a/velox/exec/tests/TableScanTest.cpp b/velox/exec/tests/TableScanTest.cpp index 2bc53f8fcf66..f8eabc85cb83 100644 --- a/velox/exec/tests/TableScanTest.cpp +++ b/velox/exec/tests/TableScanTest.cpp @@ -1190,6 +1190,92 @@ TEST_F(TableScanTest, missingColumnsInRepeatedColumns) { .assertResults(expected); } +TEST_F(TableScanTest, structMatchByName) { + const auto assertSelectUseColumnNames = + [this]( + const RowTypePtr& outputType, + const std::string& sql, + const std::string& filePath, + const std::string& remainingFilter = "") { + const auto plan = + PlanBuilder().tableScan(outputType, {}, remainingFilter).planNode(); + AssertQueryBuilder(plan, duckDbQueryRunner_) + .connectorSessionProperty( + kHiveConnectorId, + connector::hive::HiveConfig::kOrcUseColumnNamesSession, + "true") + .split(makeHiveConnectorSplit(filePath)) + .assertResults(sql); + }; + + std::vector values = {2}; + const auto id = makeFlatVector(values); + const auto name = makeRowVector( + {"first", "last"}, + { + makeFlatVector({"Janet"}), + makeFlatVector({"Jones"}), + }); + const auto address = makeFlatVector({"567 Maple Drive"}); + auto vector = makeRowVector({"id", "name", "address"}, {id, name, address}); + + auto file = TempFilePath::create(); + writeToFile(file->getPath(), {vector}); + + // Add one non-existing subfield 'middle' to the 'name' field and rename filed + // 'address'. + auto rowType = + ROW({"id", "name", "email"}, + {BIGINT(), + ROW({"first", "middle", "last"}, {VARCHAR(), VARCHAR(), VARCHAR()}), + VARCHAR()}); + assertSelectUseColumnNames( + rowType, "SELECT 2, ('Janet', null, 'Jones'), null", file->getPath()); + + // Filter pushdown on the non-existing field. + createDuckDbTable({vector}); + assertSelectUseColumnNames( + rowType, + "SELECT * from tmp where false", + file->getPath(), + "not(is_null(name.middle))"); + + // Deletion of one subfield from the 'name' field. + rowType = + ROW({"id", "name", "address"}, + {BIGINT(), ROW({"full"}, {VARCHAR()}), VARCHAR()}); + assertSelectUseColumnNames( + rowType, "SELECT 2, row(null), '567 Maple Drive'", file->getPath()); + + // Filter pushdown on the non-existing subfield. + assertSelectUseColumnNames( + rowType, + "SELECT * from tmp where false", + file->getPath(), + "not(is_null(name.full))"); + + // No subfield in the 'name' field. + rowType = ROW({"id", "name", "address"}, {BIGINT(), ROW({}, {}), VARCHAR()}); + const auto op = PlanBuilder() + .startTableScan() + .outputType(rowType) + .dataColumns(rowType) + .endTableScan() + .planNode(); + const auto split = makeHiveConnectorSplit(file->getPath()); + const auto result = + AssertQueryBuilder(op) + .connectorSessionProperty( + kHiveConnectorId, + connector::hive::HiveConfig::kOrcUseColumnNamesSession, + "true") + .split(split) + .copyResults(pool()); + const auto rows = result->as(); + const auto expected = makeRowVector(ROW({}, {}), 1); + facebook::velox::test::assertEqualVectors(expected, rows->childAt(1)); +} + // Tests queries that use Lazy vectors with multiple layers of wrapping. TEST_F(TableScanTest, constDictLazy) { vector_size_t size = 1'000;