|
20 | 20 | #include <folly/experimental/EventCount.h>
|
21 | 21 | #include <folly/synchronization/Baton.h>
|
22 | 22 | #include <folly/synchronization/Latch.h>
|
| 23 | +#include <filesystem> |
23 | 24 |
|
24 | 25 | #include "velox/common/base/Fs.h"
|
25 | 26 | #include "velox/common/base/tests/GTestUtils.h"
|
26 | 27 | #include "velox/common/caching/AsyncDataCache.h"
|
27 | 28 | #include "velox/common/caching/tests/CacheTestUtil.h"
|
| 29 | +#include "velox/common/file/File.h" |
| 30 | +#include "velox/common/file/tests/FaultyFile.h" |
28 | 31 | #include "velox/common/file/tests/FaultyFileSystem.h"
|
29 | 32 | #include "velox/common/memory/MemoryArbitrator.h"
|
30 | 33 | #include "velox/common/testutil/TestValue.h"
|
@@ -5789,5 +5792,208 @@ TEST_F(TableScanTest, prevBatchEmptyAdaptivity) {
|
5789 | 5792 | EXPECT_GT(numBatchesReadWithoutAdaptivity, numBatchesRead);
|
5790 | 5793 | }
|
5791 | 5794 | }
|
| 5795 | + |
| 5796 | +TEST_F(TableScanTest, textfileEscape) { |
| 5797 | + auto expected = makeRowVector( |
| 5798 | + {"c0", "c1"}, |
| 5799 | + { |
| 5800 | + makeFlatVector<std::string>({"a,bc", "d"}), |
| 5801 | + makeFlatVector<std::string>({"e", "e"}), |
| 5802 | + }); |
| 5803 | + |
| 5804 | + const auto tempFile = TempFilePath::create(); |
| 5805 | + const auto tempPath = tempFile->getPath(); |
| 5806 | + remove(tempPath.c_str()); |
| 5807 | + LocalWriteFile localWriteFile(tempPath); |
| 5808 | + localWriteFile.append("a\\,bc,e\nd,e"); |
| 5809 | + localWriteFile.close(); |
| 5810 | + |
| 5811 | + std::unordered_map<std::string, std::string> customSplitInfo; |
| 5812 | + std::unordered_map<std::string, std::optional<std::string>> partitionKeys; |
| 5813 | + std::unordered_map<std::string, std::string> serdeParameters{ |
| 5814 | + {dwio::common::SerDeOptions::kFieldDelim, ","}, |
| 5815 | + {dwio::common::SerDeOptions::kEscapeChar, "\\"}}; |
| 5816 | + |
| 5817 | + auto split = std::make_shared<connector::hive::HiveConnectorSplit>( |
| 5818 | + kHiveConnectorId, |
| 5819 | + tempPath, |
| 5820 | + dwio::common::FileFormat(dwio::common::FileFormat::TEXT), |
| 5821 | + 0, |
| 5822 | + std::numeric_limits<uint64_t>::max(), |
| 5823 | + partitionKeys, |
| 5824 | + std::nullopt, |
| 5825 | + customSplitInfo, |
| 5826 | + nullptr, |
| 5827 | + serdeParameters); |
| 5828 | + |
| 5829 | + auto inputType = asRowType(expected->type()); |
| 5830 | + auto plan = |
| 5831 | + PlanBuilder(pool()).tableScan(inputType, {}, "", inputType).planNode(); |
| 5832 | + |
| 5833 | + auto task = facebook::velox::exec::test::AssertQueryBuilder(plan) |
| 5834 | + .split(split) |
| 5835 | + .assertResults(expected); |
| 5836 | + auto planStats = facebook::velox::exec::toPlanStats(task->taskStats()); |
| 5837 | + auto scanNodeId = plan->id(); |
| 5838 | + auto it = planStats.find(scanNodeId); |
| 5839 | + ASSERT_TRUE(it != planStats.end()); |
| 5840 | + auto rawInputBytes = it->second.rawInputBytes; |
| 5841 | + auto overreadBytes = getTableScanRuntimeStats(task).at("overreadBytes").sum; |
| 5842 | + |
| 5843 | + ASSERT_EQ(rawInputBytes, 11); |
| 5844 | + ASSERT_EQ(overreadBytes, 0); |
| 5845 | +} |
| 5846 | + |
| 5847 | +TEST_F(TableScanTest, textfileChunkReadEntireFile) { |
| 5848 | + auto expected = makeRowVector( |
| 5849 | + {"c0", "c1"}, |
| 5850 | + { |
| 5851 | + makeFlatVector<std::string>({"row1_col1", "row2_col1", "row3_col1"}), |
| 5852 | + makeFlatVector<std::string>({"row1_col2", "row2_col2", "row3_col2"}), |
| 5853 | + }); |
| 5854 | + |
| 5855 | + const auto tempFile = TempFilePath::create(); |
| 5856 | + const auto tempPath = tempFile->getPath(); |
| 5857 | + remove(tempPath.c_str()); |
| 5858 | + LocalWriteFile localWriteFile(tempPath); |
| 5859 | + |
| 5860 | + localWriteFile.append("row1_col1,row1_col2\n"); |
| 5861 | + localWriteFile.append("row2_col1,row2_col2\n"); |
| 5862 | + localWriteFile.append("row3_col1,row3_col2\n"); |
| 5863 | + |
| 5864 | + // Add extra padding data that might be read but not used |
| 5865 | + localWriteFile.append("extra_row1,extra_data1\n"); |
| 5866 | + localWriteFile.append("extra_row2,extra_data2\n"); |
| 5867 | + localWriteFile.close(); |
| 5868 | + |
| 5869 | + std::unordered_map<std::string, std::string> customSplitInfo; |
| 5870 | + std::unordered_map<std::string, std::optional<std::string>> partitionKeys; |
| 5871 | + std::unordered_map<std::string, std::string> serdeParameters{ |
| 5872 | + {dwio::common::SerDeOptions::kFieldDelim, ","}}; |
| 5873 | + |
| 5874 | + // Create a split that only reads part of the file (first 60 bytes) |
| 5875 | + // This should cause the reader to potentially overread beyond the split |
| 5876 | + // boundary |
| 5877 | + auto split = std::make_shared<connector::hive::HiveConnectorSplit>( |
| 5878 | + kHiveConnectorId, |
| 5879 | + tempPath, |
| 5880 | + dwio::common::FileFormat(dwio::common::FileFormat::TEXT), |
| 5881 | + 0, |
| 5882 | + 59, // Limit to first 60 bytes instead of reading entire file |
| 5883 | + partitionKeys, |
| 5884 | + std::nullopt, |
| 5885 | + customSplitInfo, |
| 5886 | + nullptr, |
| 5887 | + serdeParameters); |
| 5888 | + |
| 5889 | + auto inputType = asRowType(expected->type()); |
| 5890 | + auto plan = |
| 5891 | + PlanBuilder(pool()).tableScan(inputType, {}, "", inputType).planNode(); |
| 5892 | + |
| 5893 | + auto task = facebook::velox::exec::test::AssertQueryBuilder(plan) |
| 5894 | + .split(split) |
| 5895 | + .assertResults(expected); |
| 5896 | + |
| 5897 | + auto planStats = facebook::velox::exec::toPlanStats(task->taskStats()); |
| 5898 | + auto scanNodeId = plan->id(); |
| 5899 | + auto it = planStats.find(scanNodeId); |
| 5900 | + ASSERT_TRUE(it != planStats.end()); |
| 5901 | + auto rawInputBytes = it->second.rawInputBytes; |
| 5902 | + |
| 5903 | + // Entire file was read in a single chunk even though range is [0,59] |
| 5904 | + ASSERT_EQ(rawInputBytes, 106); |
| 5905 | +} |
| 5906 | + |
| 5907 | +TEST_F(TableScanTest, textfileLarge) { |
| 5908 | + constexpr int kNumRows = |
| 5909 | + 100000; // This will generate well over 8388608 bytes (per chunk read) |
| 5910 | + constexpr int kNumCols = 10; |
| 5911 | + |
| 5912 | + constexpr int loadQuantum = 8 << 20; // loadQuantum_ as of June 2025 |
| 5913 | + |
| 5914 | + // Helper function to generate column data |
| 5915 | + auto generateColumnData = [](int row, int col) { |
| 5916 | + return fmt::format("row{}_col{}_padding_data_to_increase_size", row, col); |
| 5917 | + }; |
| 5918 | + |
| 5919 | + // Helper function to generate CSV row |
| 5920 | + auto generateCsvRow = [&](int row) { |
| 5921 | + std::vector<std::string> cols; |
| 5922 | + cols.reserve(kNumCols); |
| 5923 | + for (int col = 0; col < kNumCols; ++col) { |
| 5924 | + cols.push_back(generateColumnData(row, col)); |
| 5925 | + } |
| 5926 | + return fmt::format("{}\n", fmt::join(cols, ",")); |
| 5927 | + }; |
| 5928 | + |
| 5929 | + // Create expected result (only first row since split limit is 10 bytes) |
| 5930 | + std::vector<std::string> expectedRow; |
| 5931 | + expectedRow.reserve(kNumCols); |
| 5932 | + for (int col = 0; col < kNumCols; ++col) { |
| 5933 | + expectedRow.push_back(generateColumnData(0, col)); |
| 5934 | + } |
| 5935 | + |
| 5936 | + std::vector<std::string> columnNames; |
| 5937 | + std::vector<VectorPtr> columnVectors; |
| 5938 | + columnNames.reserve(kNumCols); |
| 5939 | + columnVectors.reserve(kNumCols); |
| 5940 | + |
| 5941 | + for (int col = 0; col < kNumCols; ++col) { |
| 5942 | + columnNames.push_back(fmt::format("c{}", col)); |
| 5943 | + columnVectors.push_back(makeFlatVector<std::string>({expectedRow[col]})); |
| 5944 | + } |
| 5945 | + |
| 5946 | + auto expected = makeRowVector(columnNames, columnVectors); |
| 5947 | + |
| 5948 | + // Create large file |
| 5949 | + const auto tempFile = TempFilePath::create(); |
| 5950 | + const auto tempPath = tempFile->getPath(); |
| 5951 | + remove(tempPath.c_str()); |
| 5952 | + LocalWriteFile localWriteFile(tempPath); |
| 5953 | + |
| 5954 | + for (int row = 0; row < kNumRows; ++row) { |
| 5955 | + localWriteFile.append(generateCsvRow(row)); |
| 5956 | + } |
| 5957 | + localWriteFile.close(); |
| 5958 | + |
| 5959 | + ASSERT_GE(std::filesystem::file_size(tempPath), loadQuantum); |
| 5960 | + |
| 5961 | + std::unordered_map<std::string, std::string> customSplitInfo; |
| 5962 | + std::unordered_map<std::string, std::optional<std::string>> partitionKeys; |
| 5963 | + std::unordered_map<std::string, std::string> serdeParameters{ |
| 5964 | + {dwio::common::SerDeOptions::kFieldDelim, ","}}; |
| 5965 | + |
| 5966 | + auto split = std::make_shared<connector::hive::HiveConnectorSplit>( |
| 5967 | + kHiveConnectorId, |
| 5968 | + tempPath, |
| 5969 | + dwio::common::FileFormat(dwio::common::FileFormat::TEXT), |
| 5970 | + 0, |
| 5971 | + 10, // Limit to only first row |
| 5972 | + partitionKeys, |
| 5973 | + std::nullopt, |
| 5974 | + customSplitInfo, |
| 5975 | + nullptr, |
| 5976 | + serdeParameters); |
| 5977 | + |
| 5978 | + auto inputType = asRowType(expected->type()); |
| 5979 | + auto plan = |
| 5980 | + PlanBuilder(pool()).tableScan(inputType, {}, "", inputType).planNode(); |
| 5981 | + |
| 5982 | + auto task = facebook::velox::exec::test::AssertQueryBuilder(plan) |
| 5983 | + .split(split) |
| 5984 | + .assertResults(expected); |
| 5985 | + |
| 5986 | + auto planStats = facebook::velox::exec::toPlanStats(task->taskStats()); |
| 5987 | + auto scanNodeId = plan->id(); |
| 5988 | + auto it = planStats.find(scanNodeId); |
| 5989 | + ASSERT_TRUE(it != planStats.end()); |
| 5990 | + auto rawInputBytes = it->second.rawInputBytes; |
| 5991 | + |
| 5992 | + // Verify we did not read the entire file but only a chunk |
| 5993 | + ASSERT_EQ(rawInputBytes, loadQuantum); |
| 5994 | + ASSERT_GT(getTableScanRuntimeStats(task)["totalScanTime"].sum, 0); |
| 5995 | + ASSERT_GT(getTableScanRuntimeStats(task)["ioWaitWallNanos"].sum, 0); |
| 5996 | +} |
| 5997 | + |
5792 | 5998 | } // namespace
|
5793 | 5999 | } // namespace facebook::velox::exec
|
0 commit comments