-
Notifications
You must be signed in to change notification settings - Fork 3.3k
Allow using temporary staging path in Iceberg for writing sorted files #26172
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -87,6 +87,8 @@ public class IcebergConfig | |
private boolean hideMaterializedViewStorageTable = true; | ||
private Optional<String> materializedViewsStorageSchema = Optional.empty(); | ||
private boolean sortedWritingEnabled = true; | ||
private boolean temporaryStagingDirectoryEnabled; | ||
private String temporaryStagingDirectoryPath = "/tmp/presto-${USER}"; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. presto -> trino |
||
private boolean queryPartitionFilterRequired; | ||
private Set<String> queryPartitionFilterRequiredSchemas = ImmutableSet.of(); | ||
private int splitManagerThreads = Math.min(Runtime.getRuntime().availableProcessors() * 2, 32); | ||
|
@@ -454,6 +456,33 @@ public IcebergConfig setSortedWritingEnabled(boolean sortedWritingEnabled) | |
return this; | ||
} | ||
|
||
public boolean isTemporaryStagingDirectoryEnabled() | ||
{ | ||
return temporaryStagingDirectoryEnabled; | ||
} | ||
|
||
@Config("iceberg.temporary-staging-directory-enabled") | ||
@ConfigDescription("Should use temporary staging directory for write operations") | ||
public IcebergConfig setTemporaryStagingDirectoryEnabled(boolean temporaryStagingDirectoryEnabled) | ||
{ | ||
this.temporaryStagingDirectoryEnabled = temporaryStagingDirectoryEnabled; | ||
return this; | ||
} | ||
|
||
@NotNull | ||
public String getTemporaryStagingDirectoryPath() | ||
{ | ||
return temporaryStagingDirectoryPath; | ||
} | ||
|
||
@Config("iceberg.temporary-staging-directory-path") | ||
@ConfigDescription("Location of temporary staging directory for write operations. Use ${USER} placeholder to use different location for each user") | ||
public IcebergConfig setTemporaryStagingDirectoryPath(String temporaryStagingDirectoryPath) | ||
{ | ||
this.temporaryStagingDirectoryPath = temporaryStagingDirectoryPath; | ||
return this; | ||
} | ||
|
||
@Config("iceberg.query-partition-filter-required") | ||
@ConfigDescription("Require a filter on at least one partition column") | ||
public IcebergConfig setQueryPartitionFilterRequired(boolean queryPartitionFilterRequired) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -20,6 +20,7 @@ | |
import io.trino.orc.OrcWriteValidation.OrcWriteValidationMode; | ||
import io.trino.plugin.base.session.SessionPropertiesProvider; | ||
import io.trino.plugin.hive.HiveCompressionOption; | ||
import io.trino.plugin.hive.SortingFileWriterConfig; | ||
import io.trino.plugin.hive.orc.OrcReaderConfig; | ||
import io.trino.plugin.hive.orc.OrcWriterConfig; | ||
import io.trino.plugin.hive.parquet.ParquetReaderConfig; | ||
|
@@ -106,6 +107,10 @@ public final class IcebergSessionProperties | |
public static final String REMOVE_ORPHAN_FILES_MIN_RETENTION = "remove_orphan_files_min_retention"; | ||
private static final String MERGE_MANIFESTS_ON_WRITE = "merge_manifests_on_write"; | ||
private static final String SORTED_WRITING_ENABLED = "sorted_writing_enabled"; | ||
private static final String SORTED_WRITING_WRITER_BUFFER_SIZE = "sorted_writing_write_buffer_size"; | ||
private static final String SORTED_WRITING_WRITER_MAX_OPEN_FILES = "sorted_writing_writer_max_open_files"; | ||
private static final String SORTED_WRITING_TEMP_STAGING_DIR_ENABLED = "sorted_writing_temporary_staging_directory_enabled"; | ||
private static final String SORTED_WRITING_TEMP_STAGING_DIR_PATH = "sorted_writing_temporary_staging_directory_path"; | ||
Comment on lines
+110
to
+113
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We don't want to expose any of these as session properties, some of these were removed from hive connector in https://github.com/trinodb/trino/pull/17390/files#diff-7dad18a4dbe9d9a536d45655875f47ffd0d8e3b5d5c6d2f3447246763841fb44L7716 |
||
private static final String QUERY_PARTITION_FILTER_REQUIRED = "query_partition_filter_required"; | ||
private static final String QUERY_PARTITION_FILTER_REQUIRED_SCHEMAS = "query_partition_filter_required_schemas"; | ||
private static final String INCREMENTAL_REFRESH_ENABLED = "incremental_refresh_enabled"; | ||
|
@@ -121,7 +126,8 @@ public IcebergSessionProperties( | |
OrcReaderConfig orcReaderConfig, | ||
OrcWriterConfig orcWriterConfig, | ||
ParquetReaderConfig parquetReaderConfig, | ||
ParquetWriterConfig parquetWriterConfig) | ||
ParquetWriterConfig parquetWriterConfig, | ||
SortingFileWriterConfig sortingFileWriterConfig) | ||
{ | ||
sessionProperties = ImmutableList.<PropertyMetadata<?>>builder() | ||
.add(dataSizeProperty( | ||
|
@@ -368,6 +374,26 @@ public IcebergSessionProperties( | |
"Enable sorted writing to tables with a specified sort order", | ||
icebergConfig.isSortedWritingEnabled(), | ||
false)) | ||
.add(dataSizeProperty( | ||
SORTED_WRITING_WRITER_BUFFER_SIZE, | ||
"Target size of buffer files used during sorting", | ||
sortingFileWriterConfig.getWriterSortBufferSize(), | ||
false)) | ||
.add(integerProperty( | ||
SORTED_WRITING_WRITER_MAX_OPEN_FILES, | ||
"Max number of concurrently open buffer files during sorting", | ||
sortingFileWriterConfig.getMaxOpenSortFiles(), | ||
false)) | ||
.add(booleanProperty( | ||
SORTED_WRITING_TEMP_STAGING_DIR_ENABLED, | ||
"Should use (if possible) temporary staging directory for write operations", | ||
icebergConfig.isTemporaryStagingDirectoryEnabled(), | ||
false)) | ||
.add(stringProperty( | ||
SORTED_WRITING_TEMP_STAGING_DIR_PATH, | ||
"Location of temporary staging directory for write operations. Use ${USER} placeholder to use different location for each user", | ||
icebergConfig.getTemporaryStagingDirectoryPath(), | ||
false)) | ||
.add(booleanProperty( | ||
QUERY_PARTITION_FILTER_REQUIRED, | ||
"Require filter on partition column", | ||
|
@@ -643,6 +669,26 @@ public static boolean isSortedWritingEnabled(ConnectorSession session) | |
return session.getProperty(SORTED_WRITING_ENABLED, Boolean.class); | ||
} | ||
|
||
public static DataSize getSortedWritingWriterBufferSize(ConnectorSession session) | ||
{ | ||
return session.getProperty(SORTED_WRITING_WRITER_BUFFER_SIZE, DataSize.class); | ||
} | ||
|
||
public static Integer getSortedWritingWriterMaxOpenFiles(ConnectorSession session) | ||
{ | ||
return session.getProperty(SORTED_WRITING_WRITER_MAX_OPEN_FILES, Integer.class); | ||
} | ||
|
||
public static boolean isSortedWritingTempStagingDirEnabled(ConnectorSession session) | ||
{ | ||
return session.getProperty(SORTED_WRITING_TEMP_STAGING_DIR_ENABLED, Boolean.class); | ||
} | ||
|
||
public static String getSortedWritingTempStagingDirPath(ConnectorSession session) | ||
{ | ||
return session.getProperty(SORTED_WRITING_TEMP_STAGING_DIR_PATH, String.class); | ||
} | ||
|
||
public static boolean isQueryPartitionFilterRequired(ConnectorSession session) | ||
{ | ||
return session.getProperty(QUERY_PARTITION_FILTER_REQUIRED, Boolean.class); | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -566,6 +566,33 @@ public void testFileSortingWithLargerTable() | |
} | ||
} | ||
|
||
@Test | ||
public void testFileSortingWithLargerTableAndTempDirForBufferFiles() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. testSortedWritingTempStaging |
||
{ | ||
// Using a larger table forces buffered data to be written to disk | ||
Session withSmallRowGroups = Session.builder(getSession()) | ||
.setCatalogSessionProperty("iceberg", "orc_writer_max_stripe_rows", "200") | ||
.setCatalogSessionProperty("iceberg", "parquet_writer_block_size", "20kB") | ||
.setCatalogSessionProperty("iceberg", "parquet_writer_batch_size", "200") | ||
Comment on lines
+574
to
+576
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. These are not needed, sorting is per file, not per row-group There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You can use small value of |
||
.setCatalogSessionProperty("iceberg", "sorted_writing_temporary_staging_directory_enabled", "true") | ||
.setCatalogSessionProperty("iceberg", "sorted_writing_write_buffer_size", "2kB") | ||
.setCatalogSessionProperty("iceberg", "sorted_writing_writer_max_open_files", "5") | ||
Comment on lines
+577
to
+579
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. These should be setup in config properties rather than session |
||
.build(); | ||
try (TestTable table = new TestTable( | ||
getQueryRunner()::execute, | ||
"test_sorted_lineitem_table", | ||
"WITH (sorted_by = ARRAY['comment'], format = '" + format.name() + "') AS TABLE tpch.tiny.lineitem WITH NO DATA")) { | ||
assertUpdate( | ||
withSmallRowGroups, | ||
"INSERT INTO " + table.getName() + " TABLE tpch.tiny.lineitem", | ||
"VALUES 60175"); | ||
for (Object filePath : computeActual("SELECT file_path from \"" + table.getName() + "$files\"").getOnlyColumnAsSet()) { | ||
assertThat(isFileSorted(Location.of((String) filePath), "comment")).isTrue(); | ||
} | ||
assertQuery("SELECT * FROM " + table.getName(), "SELECT * FROM lineitem"); | ||
} | ||
} | ||
|
||
@Test | ||
public void testDropTableWithMissingMetadataFile() | ||
throws Exception | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
presto -> trino
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@raunaqmorarka I defaulted to the same one as the Hive temp directory. Happy to change too if you prefer
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
we don't need retain the old naming, please change it here