diff --git a/hetu-docs/en/connector/hive.md b/hetu-docs/en/connector/hive.md index cde81ba58..1412652af 100644 --- a/hetu-docs/en/connector/hive.md +++ b/hetu-docs/en/connector/hive.md @@ -567,6 +567,24 @@ VACUUM TABLE hive_acid_table_partitioned +## Table Properties + +Bloom Index is supported for both transactional and non-transactional tables. Support is added for real, date, timestamp and char datatypes. + +| Property Name | Data type | Description | Default | +|:---------------------------------|:---------------|:-------------------------------------------------------------------|:---------| +| `orc_bloom_filter_columns` | array(varchar) | Bloom Filter Index columns for ORC files as a comma seperated list | | +| `orc_bloom_filter_fpp` | double | False positive probability of bloom filter | `0.05` | + +Example: Creating a table with bloom columns specified: + +```sql +CREATE TABLE testbloom + (a bigint, b row(c bigint, d bigint), e row(f double, g date)) + WITH (format='orc',transactional=true, + orc_bloom_filter_columns=ARRAY['a','b.c','e'], orc_bloom_filter_fpp=0.001); +``` + ## Schema Evolution diff --git a/presto-hive/src/main/java/io/prestosql/plugin/hive/HiveMetadata.java b/presto-hive/src/main/java/io/prestosql/plugin/hive/HiveMetadata.java index 85c2595e7..9ed3fce46 100755 --- a/presto-hive/src/main/java/io/prestosql/plugin/hive/HiveMetadata.java +++ b/presto-hive/src/main/java/io/prestosql/plugin/hive/HiveMetadata.java @@ -207,8 +207,9 @@ public class HiveMetadata public static final String STORAGE_FORMAT = "storage_format"; - private static final String ORC_BLOOM_FILTER_COLUMNS_KEY = "orc.bloom.filter.columns"; - private static final String ORC_BLOOM_FILTER_FPP_KEY = "orc.bloom.filter.fpp"; + public static final String ORC_BLOOM_FILTER_COLUMNS_KEY = "orc.bloom.filter.columns"; + public static final String ORC_BLOOM_FILTER_FPP_KEY = "orc.bloom.filter.fpp"; + public static final Splitter COLUMN_NAMES_SPLITTER = Splitter.on(',').trimResults().omitEmptyStrings(); private static final String TEXT_SKIP_HEADER_COUNT_KEY = "skip.header.line.count"; private static final String TEXT_SKIP_FOOTER_COUNT_KEY = "skip.footer.line.count"; @@ -876,8 +877,10 @@ protected Map getEmptyTableProperties(ConnectorTableMetadata tab List columns = HiveTableProperties.getOrcBloomFilterColumns(tableMetadata.getProperties()); if (columns != null && !columns.isEmpty()) { checkFormatForProperty(hiveStorageFormat, HiveStorageFormat.ORC, HiveTableProperties.ORC_BLOOM_FILTER_COLUMNS); + Double bloomFilterFpp = HiveTableProperties.getOrcBloomFilterFpp(tableMetadata.getProperties()); + checkValueForBloomFilterFpp(bloomFilterFpp); tableProperties.put(ORC_BLOOM_FILTER_COLUMNS_KEY, Joiner.on(",").join(columns)); - tableProperties.put(ORC_BLOOM_FILTER_FPP_KEY, String.valueOf(HiveTableProperties.getOrcBloomFilterFpp(tableMetadata.getProperties()))); + tableProperties.put(ORC_BLOOM_FILTER_FPP_KEY, String.valueOf(bloomFilterFpp)); } // Avro specific properties @@ -948,6 +951,13 @@ private static void checkFormatForProperty(HiveStorageFormat actualStorageFormat } } + private static void checkValueForBloomFilterFpp(Double bloomFilterFpp) + { + if (bloomFilterFpp < 0.0 || bloomFilterFpp > 1.0) { + throw new PrestoException(INVALID_TABLE_PROPERTY, String.format("Invalid value for %s property: %s", HiveTableProperties.ORC_BLOOM_FILTER_FPP, bloomFilterFpp)); + } + } + private String validateAndNormalizeAvroSchemaUrl(String url, HdfsContext context) { try { diff --git a/presto-hive/src/main/java/io/prestosql/plugin/hive/OrcFileWriterFactory.java b/presto-hive/src/main/java/io/prestosql/plugin/hive/OrcFileWriterFactory.java index 670b09386..cc7f312e7 100644 --- a/presto-hive/src/main/java/io/prestosql/plugin/hive/OrcFileWriterFactory.java +++ b/presto-hive/src/main/java/io/prestosql/plugin/hive/OrcFileWriterFactory.java @@ -15,6 +15,7 @@ import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableSet; import io.airlift.log.Logger; import io.prestosql.orc.OrcDataSink; import io.prestosql.orc.OrcDataSource; @@ -51,10 +52,17 @@ import java.util.concurrent.Callable; import java.util.function.Supplier; +import static io.prestosql.plugin.hive.HiveMetadata.COLUMN_NAMES_SPLITTER; +import static io.prestosql.plugin.hive.HiveMetadata.ORC_BLOOM_FILTER_COLUMNS_KEY; +import static io.prestosql.plugin.hive.HiveMetadata.ORC_BLOOM_FILTER_FPP_KEY; +import static io.prestosql.plugin.hive.HiveTableProperties.ORC_BLOOM_FILTER_FPP; +import static io.prestosql.plugin.hive.HiveTableProperties.TRANSACTIONAL; import static io.prestosql.plugin.hive.HiveUtil.getColumnNames; import static io.prestosql.plugin.hive.HiveUtil.getColumnTypes; import static io.prestosql.spi.type.BigintType.BIGINT; import static io.prestosql.spi.type.IntegerType.INTEGER; +import static java.lang.Double.parseDouble; +import static java.lang.String.format; import static java.util.Locale.ENGLISH; import static java.util.Objects.requireNonNull; import static java.util.stream.Collectors.toList; @@ -63,7 +71,8 @@ public class OrcFileWriterFactory implements HiveFileWriterFactory { private static final Logger log = Logger.get(OrcFileWriterFactory.class); - + private static final Integer BASE_OFFSET_FOR_TRANSACTIONAL_TABLE = 6; + private static final Integer BASE_OFFSET_FOR_NON_TRANSACTIONAL_TABLE = 0; private final HdfsEnvironment hdfsEnvironment; private final TypeManager typeManager; private final NodeVersion nodeVersion; @@ -210,12 +219,7 @@ public Optional createFileWriter( fileColumnTypes, dataFileColumnTypes, compression, - orcWriterOptions - .withStripeMinSize(HiveSessionProperties.getOrcOptimizedWriterMinStripeSize(session)) - .withStripeMaxSize(HiveSessionProperties.getOrcOptimizedWriterMaxStripeSize(session)) - .withStripeMaxRowCount(HiveSessionProperties.getOrcOptimizedWriterMaxStripeRows(session)) - .withDictionaryMaxMemory(HiveSessionProperties.getOrcOptimizedWriterMaxDictionaryMemory(session)) - .withMaxStringStatisticsLimit(HiveSessionProperties.getOrcStringStatisticsLimit(session)), + getOrcWriterBloomOptions(schema, getOrcWriterOptions(schema, session)), writeLegacyVersion, fileInputColumnIndexes, ImmutableMap.builder() @@ -236,6 +240,35 @@ public Optional createFileWriter( } } + public static OrcWriterOptions getOrcWriterBloomOptions(Properties schema, OrcWriterOptions orcWriterOptions) + { + if (schema.containsKey(ORC_BLOOM_FILTER_COLUMNS_KEY)) { + if (!schema.containsKey(ORC_BLOOM_FILTER_FPP_KEY)) { + throw new PrestoException(HiveErrorCode.HIVE_INVALID_METADATA, "FPP for bloom filter is missing"); + } + try { + return orcWriterOptions + .withBloomFilterFpp(parseDouble(schema.getProperty(ORC_BLOOM_FILTER_FPP_KEY))) + .withBloomFilterColumns(ImmutableSet.copyOf(COLUMN_NAMES_SPLITTER.splitToList(schema.getProperty(ORC_BLOOM_FILTER_COLUMNS_KEY)))); + } + catch (NumberFormatException e) { + throw new PrestoException(HiveErrorCode.HIVE_UNSUPPORTED_FORMAT, format("Invalid value for %s property: %s", ORC_BLOOM_FILTER_FPP, schema.getProperty(ORC_BLOOM_FILTER_FPP_KEY))); + } + } + return orcWriterOptions; + } + + private OrcWriterOptions getOrcWriterOptions(Properties schema, ConnectorSession session) + { + return orcWriterOptions + .withStripeMinSize(HiveSessionProperties.getOrcOptimizedWriterMinStripeSize(session)) + .withStripeMaxSize(HiveSessionProperties.getOrcOptimizedWriterMaxStripeSize(session)) + .withStripeMaxRowCount(HiveSessionProperties.getOrcOptimizedWriterMaxStripeRows(session)) + .withDictionaryMaxMemory(HiveSessionProperties.getOrcOptimizedWriterMaxDictionaryMemory(session)) + .withMaxStringStatisticsLimit(HiveSessionProperties.getOrcStringStatisticsLimit(session)) + .withBaseIndex(Boolean.valueOf(schema.getProperty(TRANSACTIONAL)) ? BASE_OFFSET_FOR_TRANSACTIONAL_TABLE : BASE_OFFSET_FOR_NON_TRANSACTIONAL_TABLE); + } + /** * Allow subclass to replace data sink implementation. */ diff --git a/presto-hive/src/main/java/io/prestosql/plugin/hive/metastore/thrift/BridgingHiveMetastore.java b/presto-hive/src/main/java/io/prestosql/plugin/hive/metastore/thrift/BridgingHiveMetastore.java index b8bd673a6..5e1a0cd17 100644 --- a/presto-hive/src/main/java/io/prestosql/plugin/hive/metastore/thrift/BridgingHiveMetastore.java +++ b/presto-hive/src/main/java/io/prestosql/plugin/hive/metastore/thrift/BridgingHiveMetastore.java @@ -13,6 +13,7 @@ */ package io.prestosql.plugin.hive.metastore.thrift; +import com.google.common.base.Joiner; import com.google.common.collect.ImmutableMap; import io.prestosql.plugin.hive.HivePartition; import io.prestosql.plugin.hive.HiveType; @@ -47,9 +48,13 @@ import java.util.Optional; import java.util.Set; import java.util.function.Function; +import java.util.function.Predicate; import java.util.stream.Collectors; +import java.util.stream.Stream; import static com.google.common.collect.ImmutableList.toImmutableList; +import static io.prestosql.plugin.hive.HiveMetadata.ORC_BLOOM_FILTER_COLUMNS_KEY; +import static io.prestosql.plugin.hive.HiveMetadata.ORC_BLOOM_FILTER_FPP_KEY; import static io.prestosql.plugin.hive.HiveMetadata.TABLE_COMMENT; import static io.prestosql.plugin.hive.metastore.thrift.ThriftMetastoreUtil.toMetastoreApiTable; import static io.prestosql.spi.StandardErrorCode.NOT_SUPPORTED; @@ -249,6 +254,7 @@ public void renameColumn(HiveIdentity identity, String databaseName, String tabl fieldSchema.setName(newColumnName); } } + alterBloomColumns(oldColumnName, table, newColumnName); alterTable(identity, databaseName, tableName, table); } @@ -258,10 +264,46 @@ public void dropColumn(HiveIdentity identity, String databaseName, String tableN MetastoreUtil.verifyCanDropColumn(this, identity, databaseName, tableName, columnName); org.apache.hadoop.hive.metastore.api.Table table = delegate.getTable(identity, databaseName, tableName) .orElseThrow(() -> new TableNotFoundException(new SchemaTableName(databaseName, tableName))); + + removeBloomColumns(columnName, table); table.getSd().getCols().removeIf(fieldSchema -> fieldSchema.getName().equals(columnName)); alterTable(identity, databaseName, tableName, table); } + private void alterBloomColumns(String oldColumnName, org.apache.hadoop.hive.metastore.api.Table table, String newColumnName) + { + if (table.getParameters().get(ORC_BLOOM_FILTER_COLUMNS_KEY) != null) { + List bloomColumnNames = Stream.of(table.getParameters().get(ORC_BLOOM_FILTER_COLUMNS_KEY).split(",")) + .map(s -> s.startsWith(oldColumnName + ".") ? s.replaceFirst(oldColumnName, newColumnName) : s) + .collect(Collectors.toList()); + bloomColumnNames = bloomColumnNames.stream() + .map(s -> s.equals(oldColumnName) ? newColumnName : s) + .collect(Collectors.toList()); + + table.getParameters().put(ORC_BLOOM_FILTER_COLUMNS_KEY, Joiner.on(",").join(bloomColumnNames)); + } + } + + private void removeBloomColumns(String oldColumnName, org.apache.hadoop.hive.metastore.api.Table table) + { + if (table.getParameters().get(ORC_BLOOM_FILTER_COLUMNS_KEY) != null) { + Predicate subColumnMatch = s -> !s.startsWith(oldColumnName + "."); + Predicate columnMatch = r -> !r.equals(oldColumnName); + + List bloomColumnNames = Stream.of(table.getParameters().get(ORC_BLOOM_FILTER_COLUMNS_KEY).split(",")) + .map(String::trim) + .filter(subColumnMatch.and(columnMatch)) + .collect(Collectors.toList()); + if (bloomColumnNames.size() > 0) { + table.getParameters().put(ORC_BLOOM_FILTER_COLUMNS_KEY, Joiner.on(",").join(bloomColumnNames)); + } + else { + table.getParameters().remove(ORC_BLOOM_FILTER_COLUMNS_KEY); + table.getParameters().remove(ORC_BLOOM_FILTER_FPP_KEY); + } + } + } + private void alterTable(HiveIdentity identity, String databaseName, String tableName, org.apache.hadoop.hive.metastore.api.Table table) { delegate.alterTable(identity, databaseName, tableName, table); diff --git a/presto-hive/src/test/java/io/prestosql/plugin/hive/TestHiveIntegrationSmokeTest.java b/presto-hive/src/test/java/io/prestosql/plugin/hive/TestHiveIntegrationSmokeTest.java index 95f67cc69..c6c9bbf65 100644 --- a/presto-hive/src/test/java/io/prestosql/plugin/hive/TestHiveIntegrationSmokeTest.java +++ b/presto-hive/src/test/java/io/prestosql/plugin/hive/TestHiveIntegrationSmokeTest.java @@ -11,6 +11,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package io.prestosql.plugin.hive; import com.google.common.base.Joiner; @@ -185,6 +186,7 @@ import static io.prestosql.testing.TestingSnapshotUtils.NOOP_SNAPSHOT_UTILS; import static io.prestosql.testing.assertions.Assert.assertEquals; import static io.prestosql.tests.QueryAssertions.assertEqualsIgnoreOrder; +import static io.prestosql.tests.sql.TestTable.randomTableSuffix; import static io.prestosql.transaction.TransactionBuilder.transaction; import static java.lang.String.format; import static java.nio.charset.StandardCharsets.UTF_8; @@ -6940,4 +6942,109 @@ public void testReadFromTablesWithPrimitiveAndStructDataTypeColumns() assertUpdate("DROP TABLE testReadSchema4.testReadStruct9"); assertUpdate("DROP SCHEMA testReadSchema4"); } + + @Test + public void testInvalidOrcBloomFilterProperty() + { + assertThatThrownBy(() -> assertUpdate("CREATE TABLE invalid_bloom_fpp (col1 bigint) WITH (format = 'ORC', orc_bloom_filter_columns = ARRAY['col1'], orc_bloom_filter_fpp=2)")) + .hasMessageMatching("Invalid value for orc_bloom_filter_fpp property: 2.0"); + assertThatThrownBy(() -> assertUpdate("CREATE TABLE invalid_bloom_fpp (col1 bigint) WITH (format = 'ORC', orc_bloom_filter_columns = ARRAY['col1'], orc_bloom_filter_fpp=a)")) + .hasMessageStartingWith("Invalid value for table property 'orc_bloom_filter_fpp'"); + } + + @Test + public void testOrcBloomFilterWrittenForTransactionalTables() + { + String tableName = "bloom_for_txn_table_" + randomTableSuffix(); + test_bloom_written_during_create(tableName, true); + + tableName = "bloom_for_txn_table_" + randomTableSuffix(); + test_bloom_written_during_insert(tableName, true); + } + + @Test + public void testOrcBloomFilterWrittenForNonTransactionalTables() + { + String tableName = "bloom_for_non_txn_table_" + randomTableSuffix(); + test_bloom_written_during_create(tableName, false); + + tableName = "bloom_for_non_txn_table_" + randomTableSuffix(); + test_bloom_written_during_insert(tableName, false); + } + + private void test_bloom_written_during_create(String tableName, boolean transactional) + { + // test for bloom is getting written for create table query + assertUpdate(format("drop table if exists %s", tableName)); + assertUpdate( + format( + "create table %s WITH (%s) as select * from tpch.tiny.lineitem", + tableName, + addTableProperties("ORC", transactional, "orderkey", 0.001)), + 60175); + assertBloomFilterBasedRowGroupPruning(format("select * from %s where orderkey = 29989", tableName)); + + // test to check bloom still takes effect on rename column + assertUpdate( + format( + "alter table %s rename column orderkey to alt_orderkey", tableName)); + assertBloomFilterBasedRowGroupPruning(format("select * from %s where alt_orderkey = 29989", tableName)); + assertUpdate(format("DROP TABLE %s", tableName)); + } + + private void test_bloom_written_during_insert(String tableName, boolean transactional) + { + // test for bloom is getting written for insert query + assertUpdate(format("drop table if exists %s", tableName)); + assertUpdate( + format( + "create table %s (orderkey bigint, partkey bigint, shipmode VARCHAR(10)) WITH (%s)", + tableName, + addTableProperties("ORC", transactional, "orderkey", 0.001))); + assertUpdate( + format( + "INSERT INTO %s SELECT orderkey, partkey, shipmode from tpch.tiny.lineitem", + tableName), + 60175); + assertBloomFilterBasedRowGroupPruning(format("select * from %s where orderkey = 29989", tableName)); + assertUpdate(format("DROP TABLE %s", tableName)); + } + + private String addTableProperties(String fileFormat, boolean transactional, String bloomFilterColumnName, double fpp) + { + return format( + "format = '%s', transactional = %s, orc_bloom_filter_columns = ARRAY['%s'], orc_bloom_filter_fpp = %s", + fileFormat, + transactional, + bloomFilterColumnName, + fpp); + } + + private void assertBloomFilterBasedRowGroupPruning(@Language("SQL") String sql) + { + assertQueryStats( + enableBloomFilters(getSession(), false), + sql, + queryStats -> { + assertThat(queryStats.getPhysicalInputPositions()).isGreaterThan(0); + assertThat(queryStats.getProcessedInputPositions()).isEqualTo(queryStats.getPhysicalInputPositions()); + }, + results -> assertThat(results.getRowCount()).isEqualTo(2)); + + assertQueryStats( + enableBloomFilters(getSession(), true), + sql, + queryStats -> { + assertThat(queryStats.getPhysicalInputPositions()).isGreaterThan(0); + assertThat(queryStats.getProcessedInputPositions()).isGreaterThan(0); + }, + results -> assertThat(results.getRowCount()).isEqualTo(2)); + } + + private Session enableBloomFilters(Session session, boolean value) + { + return Session.builder(session) + .setCatalogSessionProperty(session.getCatalog().get(), "orc_bloom_filters_enabled", String.valueOf(value)) + .build(); + } } diff --git a/presto-hive/src/test/java/io/prestosql/plugin/hive/util/TestOrcWriterOptions.java b/presto-hive/src/test/java/io/prestosql/plugin/hive/util/TestOrcWriterOptions.java new file mode 100644 index 000000000..428f3b5c4 --- /dev/null +++ b/presto-hive/src/test/java/io/prestosql/plugin/hive/util/TestOrcWriterOptions.java @@ -0,0 +1,108 @@ +/* + * Copyright (C) 2018-2022. Huawei Technologies Co., Ltd. All rights reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.prestosql.plugin.hive.util; + +import io.airlift.units.DataSize; +import io.prestosql.orc.OrcWriterOptions; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.Properties; + +import static io.airlift.units.DataSize.Unit.BYTE; +import static io.airlift.units.DataSize.Unit.KILOBYTE; +import static io.airlift.units.DataSize.Unit.MEGABYTE; +import static io.prestosql.plugin.hive.HiveMetadata.ORC_BLOOM_FILTER_COLUMNS_KEY; +import static io.prestosql.plugin.hive.HiveMetadata.ORC_BLOOM_FILTER_FPP_KEY; +import static io.prestosql.plugin.hive.OrcFileWriterFactory.getOrcWriterBloomOptions; +import static org.assertj.core.api.AssertionsForClassTypes.assertThat; +import static org.assertj.core.api.AssertionsForClassTypes.assertThatThrownBy; + +public class TestOrcWriterOptions +{ + @Test + public void testDefaults() + { + OrcWriterOptions orcWriterOptions = new OrcWriterOptions(); + assertThat(orcWriterOptions.getStripeMinSize()).isEqualTo(new DataSize(32, MEGABYTE)); + assertThat(orcWriterOptions.getStripeMaxSize()).isEqualTo(new DataSize(64, MEGABYTE)); + assertThat(orcWriterOptions.getStripeMaxRowCount()).isEqualTo(10_000_000); + assertThat(orcWriterOptions.getRowGroupMaxRowCount()).isEqualTo(10_000); + assertThat(orcWriterOptions.getDictionaryMaxMemory()).isEqualTo(new DataSize(16, MEGABYTE)); + assertThat(orcWriterOptions.getMaxStringStatisticsLimit()).isEqualTo(new DataSize(64, BYTE)); + assertThat(orcWriterOptions.getMaxCompressionBufferSize()).isEqualTo(new DataSize(256, KILOBYTE)); + assertThat(orcWriterOptions.getBloomFilterFpp()).isEqualTo(0.05); + assertThat(orcWriterOptions.isBloomFilterColumn("unknown_bloom_column")).isFalse(); + } + + @Test + public void testOrcWriterOptionsFromTableProperties() + { + Properties tableProperties = new Properties(); + tableProperties.setProperty(ORC_BLOOM_FILTER_COLUMNS_KEY, "test_column_a, test_column_b"); + tableProperties.setProperty(ORC_BLOOM_FILTER_FPP_KEY, "0.1"); + OrcWriterOptions orcWriterOptions = getOrcWriterBloomOptions(tableProperties, new OrcWriterOptions()); + + assertThat(orcWriterOptions.isBloomFilterColumn("test_column_a")).isTrue(); + assertThat(orcWriterOptions.isBloomFilterColumn("test_column_b")).isTrue(); + assertThat(orcWriterOptions.isBloomFilterColumn("unknown_bloom_column")).isFalse(); + + assertThat(orcWriterOptions.getBloomFilterFpp()).isEqualTo(0.1); + assertThat(orcWriterOptions.getBloomFilterFpp()).isNotEqualTo(0.5); + } + + @Test(dataProvider = "invalidBloomFilterFpp") + public void testOrcWriterOptionsWithInvalidFPPValue(String fpp) + { + Properties tableProperties = new Properties(); + tableProperties.setProperty(ORC_BLOOM_FILTER_COLUMNS_KEY, "column_with_bloom_filter"); + tableProperties.setProperty(ORC_BLOOM_FILTER_FPP_KEY, fpp); + assertThatThrownBy(() -> getOrcWriterBloomOptions(tableProperties, new OrcWriterOptions())) + .hasMessage("Invalid value for orc_bloom_filter_fpp property: " + fpp); + } + + @Test(dataProvider = "invalidRangeBloomFilterFpp") + public void testOrcBloomFilterWithInvalidRange(String fpp) + { + Properties tableProperties = new Properties(); + tableProperties.setProperty(ORC_BLOOM_FILTER_COLUMNS_KEY, "column_with_bloom_filter"); + tableProperties.setProperty(ORC_BLOOM_FILTER_FPP_KEY, fpp); + assertThatThrownBy(() -> getOrcWriterBloomOptions(tableProperties, new OrcWriterOptions())) + .hasMessage("bloomFilterFpp should be > 0.0 & < 1.0"); + } + + @DataProvider + public Object[][] invalidBloomFilterFpp() + { + return new Object[][] { + {"abc"}, + {"12c"}, + {"$"}, + {"*"} + }; + } + + @DataProvider + public Object[][] invalidRangeBloomFilterFpp() + { + return new Object[][] { + {"10"}, + {"-10"}, + {"0"}, + {"1"} + }; + } +} diff --git a/presto-orc/src/main/java/io/prestosql/orc/OrcWriteValidation.java b/presto-orc/src/main/java/io/prestosql/orc/OrcWriteValidation.java index d7d61d4cd..431fb711d 100644 --- a/presto-orc/src/main/java/io/prestosql/orc/OrcWriteValidation.java +++ b/presto-orc/src/main/java/io/prestosql/orc/OrcWriteValidation.java @@ -34,12 +34,14 @@ import io.prestosql.orc.metadata.statistics.IntegerStatistics; import io.prestosql.orc.metadata.statistics.IntegerStatisticsBuilder; import io.prestosql.orc.metadata.statistics.LongDecimalStatisticsBuilder; +import io.prestosql.orc.metadata.statistics.NoOpBloomFilterBuilder; import io.prestosql.orc.metadata.statistics.ShortDecimalStatisticsBuilder; import io.prestosql.orc.metadata.statistics.StatisticsBuilder; import io.prestosql.orc.metadata.statistics.StatisticsHasher; import io.prestosql.orc.metadata.statistics.StringStatistics; import io.prestosql.orc.metadata.statistics.StringStatisticsBuilder; import io.prestosql.orc.metadata.statistics.StripeStatistics; +import io.prestosql.orc.metadata.statistics.TimestampStatisticsBuilder; import io.prestosql.spi.Page; import io.prestosql.spi.PrestoException; import io.prestosql.spi.block.Block; @@ -604,7 +606,7 @@ public Optional> build() return Optional.empty(); } ImmutableList.Builder statisticsBuilders = ImmutableList.builder(); - statisticsBuilders.add(new ColumnStatistics(rowCount, 0, null, null, null, null, null, null, null, null)); + statisticsBuilders.add(new ColumnStatistics(rowCount, 0, null, null, null, null, null, null, null, null, null)); columnStatisticsValidations.forEach(validation -> validation.build(statisticsBuilders)); return Optional.of(new ColumnMetadata<>(statisticsBuilders.build())); } @@ -632,37 +634,37 @@ else if (TINYINT.equals(type)) { fieldBuilders = ImmutableList.of(); } else if (SMALLINT.equals(type)) { - statisticsBuilder = new IntegerStatisticsBuilder(); + statisticsBuilder = new IntegerStatisticsBuilder(new NoOpBloomFilterBuilder()); fieldExtractor = ignored -> ImmutableList.of(); fieldBuilders = ImmutableList.of(); } else if (INTEGER.equals(type)) { - statisticsBuilder = new IntegerStatisticsBuilder(); + statisticsBuilder = new IntegerStatisticsBuilder(new NoOpBloomFilterBuilder()); fieldExtractor = ignored -> ImmutableList.of(); fieldBuilders = ImmutableList.of(); } else if (BIGINT.equals(type)) { - statisticsBuilder = new IntegerStatisticsBuilder(); + statisticsBuilder = new IntegerStatisticsBuilder(new NoOpBloomFilterBuilder()); fieldExtractor = ignored -> ImmutableList.of(); fieldBuilders = ImmutableList.of(); } else if (DOUBLE.equals(type)) { - statisticsBuilder = new DoubleStatisticsBuilder(); + statisticsBuilder = new DoubleStatisticsBuilder(new NoOpBloomFilterBuilder()); fieldExtractor = ignored -> ImmutableList.of(); fieldBuilders = ImmutableList.of(); } else if (REAL.equals(type)) { - statisticsBuilder = new DoubleStatisticsBuilder(); + statisticsBuilder = new DoubleStatisticsBuilder(new NoOpBloomFilterBuilder()); fieldExtractor = ignored -> ImmutableList.of(); fieldBuilders = ImmutableList.of(); } else if (type instanceof VarcharType) { - statisticsBuilder = new StringStatisticsBuilder(stringStatisticsLimitInBytes); + statisticsBuilder = new StringStatisticsBuilder(stringStatisticsLimitInBytes, new NoOpBloomFilterBuilder()); fieldExtractor = ignored -> ImmutableList.of(); fieldBuilders = ImmutableList.of(); } else if (type instanceof CharType) { - statisticsBuilder = new StringStatisticsBuilder(stringStatisticsLimitInBytes); + statisticsBuilder = new StringStatisticsBuilder(stringStatisticsLimitInBytes, new NoOpBloomFilterBuilder()); fieldExtractor = ignored -> ImmutableList.of(); fieldBuilders = ImmutableList.of(); } @@ -672,12 +674,12 @@ else if (VARBINARY.equals(type)) { fieldBuilders = ImmutableList.of(); } else if (DATE.equals(type)) { - statisticsBuilder = new DateStatisticsBuilder(); + statisticsBuilder = new DateStatisticsBuilder(new NoOpBloomFilterBuilder()); fieldExtractor = ignored -> ImmutableList.of(); fieldBuilders = ImmutableList.of(); } else if (TIMESTAMP.equals(type)) { - statisticsBuilder = new CountStatisticsBuilder(); + statisticsBuilder = new TimestampStatisticsBuilder(new NoOpBloomFilterBuilder()); fieldExtractor = ignored -> ImmutableList.of(); fieldBuilders = ImmutableList.of(); } @@ -761,7 +763,7 @@ public void addBlock(Type type, Block block) @Override public ColumnStatistics buildColumnStatistics() { - return new ColumnStatistics(rowCount, 0, null, null, null, null, null, null, null, null); + return new ColumnStatistics(rowCount, 0, null, null, null, null, null, null, null, null, null); } } diff --git a/presto-orc/src/main/java/io/prestosql/orc/OrcWriter.java b/presto-orc/src/main/java/io/prestosql/orc/OrcWriter.java index f5c0e42b8..cb4a42b45 100644 --- a/presto-orc/src/main/java/io/prestosql/orc/OrcWriter.java +++ b/presto-orc/src/main/java/io/prestosql/orc/OrcWriter.java @@ -75,6 +75,7 @@ import static io.prestosql.orc.metadata.PostScript.MAGIC; import static io.prestosql.orc.stream.OrcDataOutput.createDataOutput; import static io.prestosql.orc.writer.ColumnWriters.createColumnWriter; +import static io.prestosql.orc.writer.ColumnWriters.getBloomFilterBuilder; import static java.lang.Integer.min; import static java.lang.Math.toIntExact; import static java.util.Objects.requireNonNull; @@ -173,7 +174,14 @@ public OrcWriter( for (int fieldId = 0; fieldId < types.size(); fieldId++) { OrcColumnId fieldColumnIndex = localRootType.getFieldTypeIndex(fieldId); Type fieldType = types.get(fieldId); - ColumnWriter columnWriter = createColumnWriter(fieldColumnIndex, orcTypes, fieldType, compression, maxCompressionBufferSize, options.getMaxStringStatisticsLimit()); + ColumnWriter columnWriter = createColumnWriter(fieldColumnIndex, + orcTypes, + fieldType, + compression, + maxCompressionBufferSize, + options, + columnNames.get(fieldId), + getBloomFilterBuilder(options, columnNames.get(fieldId), fieldColumnIndex.getId())); localColumnWriters.add(columnWriter); if (columnWriter instanceof SliceDictionaryColumnWriter) { @@ -401,6 +409,11 @@ private List bufferStripeData(long stripeStartOffset, FlushReason allStreams.add(indexStream.getStream()); indexLength += indexStream.size(); } + for (StreamDataOutput bloomFilter : columnWriter.getBloomFilters(metadataWriter)) { + outputData.add(bloomFilter); + allStreams.add(bloomFilter.getStream()); + indexLength += bloomFilter.size(); + } } // data streams (sorted by size) @@ -430,7 +443,7 @@ private List bufferStripeData(long stripeStartOffset, FlushReason // the 0th column is a struct column for the whole row columnEncodings.put(ROOT_COLUMN, new ColumnEncoding(DIRECT, 0)); - columnStatistics.put(ROOT_COLUMN, new ColumnStatistics((long) stripeRowCount, 0, null, null, null, null, null, null, null, null)); + columnStatistics.put(ROOT_COLUMN, new ColumnStatistics((long) stripeRowCount, 0, null, null, null, null, null, null, null, null, null)); // add footer StripeFooter stripeFooter = new StripeFooter(allStreams, toColumnMetadata(columnEncodings, orcTypes.size()), ZoneId.of("UTC")); diff --git a/presto-orc/src/main/java/io/prestosql/orc/OrcWriterOptions.java b/presto-orc/src/main/java/io/prestosql/orc/OrcWriterOptions.java index d4d17454f..bc6394f5b 100644 --- a/presto-orc/src/main/java/io/prestosql/orc/OrcWriterOptions.java +++ b/presto-orc/src/main/java/io/prestosql/orc/OrcWriterOptions.java @@ -11,11 +11,15 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package io.prestosql.orc; import com.google.common.annotations.VisibleForTesting; +import com.google.common.collect.ImmutableSet; import io.airlift.units.DataSize; +import java.util.Set; + import static com.google.common.base.MoreObjects.toStringHelper; import static com.google.common.base.Preconditions.checkArgument; import static io.airlift.units.DataSize.Unit.BYTE; @@ -30,6 +34,8 @@ public class OrcWriterOptions private static final int DEFAULT_STRIPE_MAX_ROW_COUNT = 10_000_000; private static final int DEFAULT_ROW_GROUP_MAX_ROW_COUNT = 10_000; private static final DataSize DEFAULT_DICTIONARY_MAX_MEMORY = new DataSize(16, MEGABYTE); + private static final double DEFAULT_BLOOM_FILTER_FPP = 0.05; + private static final int BASE_INDEX = 0; @VisibleForTesting static final DataSize DEFAULT_MAX_STRING_STATISTICS_LIMIT = new DataSize(64, BYTE); @@ -44,6 +50,9 @@ public class OrcWriterOptions private final DataSize dictionaryMaxMemory; private final DataSize maxStringStatisticsLimit; private final DataSize maxCompressionBufferSize; + private Set bloomFilterColumns; + private final double bloomFilterFpp; + private final int baseIndex; public OrcWriterOptions() { @@ -54,7 +63,10 @@ public OrcWriterOptions() DEFAULT_ROW_GROUP_MAX_ROW_COUNT, DEFAULT_DICTIONARY_MAX_MEMORY, DEFAULT_MAX_STRING_STATISTICS_LIMIT, - DEFAULT_MAX_COMPRESSION_BUFFER_SIZE); + DEFAULT_MAX_COMPRESSION_BUFFER_SIZE, + ImmutableSet.of(), + DEFAULT_BLOOM_FILTER_FPP, + BASE_INDEX); } private OrcWriterOptions( @@ -64,7 +76,10 @@ private OrcWriterOptions( int rowGroupMaxRowCount, DataSize dictionaryMaxMemory, DataSize maxStringStatisticsLimit, - DataSize maxCompressionBufferSize) + DataSize maxCompressionBufferSize, + Set bloomFilterColumns, + double bloomFilterFpp, + int baseIndex) { requireNonNull(stripeMinSize, "stripeMinSize is null"); requireNonNull(stripeMaxSize, "stripeMaxSize is null"); @@ -73,6 +88,8 @@ private OrcWriterOptions( requireNonNull(dictionaryMaxMemory, "dictionaryMaxMemory is null"); requireNonNull(maxStringStatisticsLimit, "maxStringStatisticsLimit is null"); requireNonNull(maxCompressionBufferSize, "maxCompressionBufferSize is null"); + requireNonNull(bloomFilterColumns, "bloomFilterColumns is null"); + checkArgument(bloomFilterFpp > 0.0 && bloomFilterFpp < 1.0, "bloomFilterFpp should be > 0.0 & < 1.0"); this.stripeMinSize = stripeMinSize; this.stripeMaxSize = stripeMaxSize; @@ -81,6 +98,9 @@ private OrcWriterOptions( this.dictionaryMaxMemory = dictionaryMaxMemory; this.maxStringStatisticsLimit = maxStringStatisticsLimit; this.maxCompressionBufferSize = maxCompressionBufferSize; + this.bloomFilterColumns = ImmutableSet.copyOf(bloomFilterColumns); + this.bloomFilterFpp = bloomFilterFpp; + this.baseIndex = baseIndex; } public DataSize getStripeMinSize() @@ -118,39 +138,74 @@ public DataSize getMaxCompressionBufferSize() return maxCompressionBufferSize; } + public boolean isBloomFilterColumn(String columnName) + { + for (String bloomFilterColumn : bloomFilterColumns) { + if (bloomFilterColumn.equals(columnName) || columnName.startsWith(bloomFilterColumn + ".")) { + return true; + } + } + return false; + } + public OrcWriterOptions withStripeMinSize(DataSize stripeMinSize) { - return new OrcWriterOptions(stripeMinSize, stripeMaxSize, stripeMaxRowCount, rowGroupMaxRowCount, dictionaryMaxMemory, maxStringStatisticsLimit, maxCompressionBufferSize); + return new OrcWriterOptions(stripeMinSize, stripeMaxSize, stripeMaxRowCount, rowGroupMaxRowCount, dictionaryMaxMemory, maxStringStatisticsLimit, maxCompressionBufferSize, bloomFilterColumns, bloomFilterFpp, baseIndex); } public OrcWriterOptions withStripeMaxSize(DataSize stripeMaxSize) { - return new OrcWriterOptions(stripeMinSize, stripeMaxSize, stripeMaxRowCount, rowGroupMaxRowCount, dictionaryMaxMemory, maxStringStatisticsLimit, maxCompressionBufferSize); + return new OrcWriterOptions(stripeMinSize, stripeMaxSize, stripeMaxRowCount, rowGroupMaxRowCount, dictionaryMaxMemory, maxStringStatisticsLimit, maxCompressionBufferSize, bloomFilterColumns, bloomFilterFpp, baseIndex); } public OrcWriterOptions withStripeMaxRowCount(int stripeMaxRowCount) { - return new OrcWriterOptions(stripeMinSize, stripeMaxSize, stripeMaxRowCount, rowGroupMaxRowCount, dictionaryMaxMemory, maxStringStatisticsLimit, maxCompressionBufferSize); + return new OrcWriterOptions(stripeMinSize, stripeMaxSize, stripeMaxRowCount, rowGroupMaxRowCount, dictionaryMaxMemory, maxStringStatisticsLimit, maxCompressionBufferSize, bloomFilterColumns, bloomFilterFpp, baseIndex); } public OrcWriterOptions withRowGroupMaxRowCount(int rowGroupMaxRowCount) { - return new OrcWriterOptions(stripeMinSize, stripeMaxSize, stripeMaxRowCount, rowGroupMaxRowCount, dictionaryMaxMemory, maxStringStatisticsLimit, maxCompressionBufferSize); + return new OrcWriterOptions(stripeMinSize, stripeMaxSize, stripeMaxRowCount, rowGroupMaxRowCount, dictionaryMaxMemory, maxStringStatisticsLimit, maxCompressionBufferSize, bloomFilterColumns, bloomFilterFpp, baseIndex); } public OrcWriterOptions withDictionaryMaxMemory(DataSize dictionaryMaxMemory) { - return new OrcWriterOptions(stripeMinSize, stripeMaxSize, stripeMaxRowCount, rowGroupMaxRowCount, dictionaryMaxMemory, maxStringStatisticsLimit, maxCompressionBufferSize); + return new OrcWriterOptions(stripeMinSize, stripeMaxSize, stripeMaxRowCount, rowGroupMaxRowCount, dictionaryMaxMemory, maxStringStatisticsLimit, maxCompressionBufferSize, bloomFilterColumns, bloomFilterFpp, baseIndex); } public OrcWriterOptions withMaxStringStatisticsLimit(DataSize maxStringStatisticsLimit) { - return new OrcWriterOptions(stripeMinSize, stripeMaxSize, stripeMaxRowCount, rowGroupMaxRowCount, dictionaryMaxMemory, maxStringStatisticsLimit, maxCompressionBufferSize); + return new OrcWriterOptions(stripeMinSize, stripeMaxSize, stripeMaxRowCount, rowGroupMaxRowCount, dictionaryMaxMemory, maxStringStatisticsLimit, maxCompressionBufferSize, bloomFilterColumns, bloomFilterFpp, baseIndex); } public OrcWriterOptions withMaxCompressionBufferSize(DataSize maxCompressionBufferSize) { - return new OrcWriterOptions(stripeMinSize, stripeMaxSize, stripeMaxRowCount, rowGroupMaxRowCount, dictionaryMaxMemory, maxStringStatisticsLimit, maxCompressionBufferSize); + return new OrcWriterOptions(stripeMinSize, stripeMaxSize, stripeMaxRowCount, rowGroupMaxRowCount, dictionaryMaxMemory, maxStringStatisticsLimit, maxCompressionBufferSize, bloomFilterColumns, bloomFilterFpp, baseIndex); + } + + public OrcWriterOptions withBloomFilterColumns(Set bloomFilterColumns) + { + return new OrcWriterOptions(stripeMinSize, stripeMaxSize, stripeMaxRowCount, rowGroupMaxRowCount, dictionaryMaxMemory, maxStringStatisticsLimit, maxCompressionBufferSize, bloomFilterColumns, bloomFilterFpp, baseIndex); + } + + public OrcWriterOptions withBloomFilterFpp(double bloomFilterFpp) + { + return new OrcWriterOptions(stripeMinSize, stripeMaxSize, stripeMaxRowCount, rowGroupMaxRowCount, dictionaryMaxMemory, maxStringStatisticsLimit, maxCompressionBufferSize, bloomFilterColumns, bloomFilterFpp, baseIndex); + } + + public OrcWriterOptions withBaseIndex(int baseIndex) + { + return new OrcWriterOptions(stripeMinSize, stripeMaxSize, stripeMaxRowCount, rowGroupMaxRowCount, dictionaryMaxMemory, maxStringStatisticsLimit, maxCompressionBufferSize, bloomFilterColumns, bloomFilterFpp, baseIndex); + } + + public double getBloomFilterFpp() + { + return bloomFilterFpp; + } + + public int getBaseIndex() + { + return baseIndex; } @Override @@ -164,6 +219,9 @@ public String toString() .add("dictionaryMaxMemory", dictionaryMaxMemory) .add("maxStringStatisticsLimit", maxStringStatisticsLimit) .add("maxCompressionBufferSize", maxCompressionBufferSize) + .add("bloomFilterColumns", bloomFilterColumns) + .add("bloomFilterFpp", bloomFilterFpp) + .add("baseIndex", baseIndex) .toString(); } } diff --git a/presto-orc/src/main/java/io/prestosql/orc/TupleDomainOrcPredicate.java b/presto-orc/src/main/java/io/prestosql/orc/TupleDomainOrcPredicate.java index 54712eca0..e3f89ccfe 100644 --- a/presto-orc/src/main/java/io/prestosql/orc/TupleDomainOrcPredicate.java +++ b/presto-orc/src/main/java/io/prestosql/orc/TupleDomainOrcPredicate.java @@ -147,10 +147,7 @@ private boolean columnOverlaps(Domain predicateDomain, long numberOfRows, Column } // if none of the discrete predicate values are found in the bloom filter, there is no overlap and the section should be skipped - if (discreteValues.get().stream().noneMatch(value -> checkInBloomFilter(bloomFilter, value, stripeDomain.getType()))) { - return false; - } - return true; + return discreteValues.get().stream().anyMatch(value -> checkInBloomFilter(bloomFilter, value, stripeDomain.getType())); } @VisibleForTesting @@ -242,6 +239,9 @@ else if (isVarcharType(type) && columnStatistics.getStringStatistics() != null) else if (type.getTypeSignature().getBase().equals(StandardTypes.DATE) && columnStatistics.getDateStatistics() != null) { return createDomain(type, hasNullValue, columnStatistics.getDateStatistics(), value -> (long) value); } + else if (type.getTypeSignature().getBase().equals(StandardTypes.TIMESTAMP) && columnStatistics.getTimestampStatistics() != null) { + return createDomain(type, hasNullValue, columnStatistics.getTimestampStatistics(), value -> (long) value); + } else if (type.getJavaType() == long.class && columnStatistics.getIntegerStatistics() != null) { return createDomain(type, hasNullValue, columnStatistics.getIntegerStatistics()); } diff --git a/presto-orc/src/main/java/io/prestosql/orc/metadata/CompressedMetadataWriter.java b/presto-orc/src/main/java/io/prestosql/orc/metadata/CompressedMetadataWriter.java index 669adf745..0fce4762a 100644 --- a/presto-orc/src/main/java/io/prestosql/orc/metadata/CompressedMetadataWriter.java +++ b/presto-orc/src/main/java/io/prestosql/orc/metadata/CompressedMetadataWriter.java @@ -16,6 +16,7 @@ import io.airlift.slice.DynamicSliceOutput; import io.airlift.slice.Slice; import io.prestosql.orc.OrcOutputBuffer; +import io.prestosql.spi.util.BloomFilter; import java.io.IOException; import java.util.List; @@ -54,6 +55,13 @@ public Slice writeMetadata(Metadata metadata) return getSliceOutput(); } + public Slice writeBloomFilters(List bloomFilters) + throws IOException + { + metadataWriter.writeBloomFilters(buffer, bloomFilters); + return getSliceOutput(); + } + public Slice writeFooter(Footer footer) throws IOException { diff --git a/presto-orc/src/main/java/io/prestosql/orc/metadata/MetadataWriter.java b/presto-orc/src/main/java/io/prestosql/orc/metadata/MetadataWriter.java index e7b067848..91021f510 100644 --- a/presto-orc/src/main/java/io/prestosql/orc/metadata/MetadataWriter.java +++ b/presto-orc/src/main/java/io/prestosql/orc/metadata/MetadataWriter.java @@ -14,6 +14,7 @@ package io.prestosql.orc.metadata; import io.airlift.slice.SliceOutput; +import io.prestosql.spi.util.BloomFilter; import java.io.IOException; import java.util.List; @@ -36,4 +37,7 @@ int writeStripeFooter(SliceOutput output, StripeFooter footer) int writeRowIndexes(SliceOutput output, List rowGroupIndexes) throws IOException; + + int writeBloomFilters(SliceOutput output, List bloomFilters) + throws IOException; } diff --git a/presto-orc/src/main/java/io/prestosql/orc/metadata/OrcMetadataReader.java b/presto-orc/src/main/java/io/prestosql/orc/metadata/OrcMetadataReader.java index 10f468330..f74d63ada 100644 --- a/presto-orc/src/main/java/io/prestosql/orc/metadata/OrcMetadataReader.java +++ b/presto-orc/src/main/java/io/prestosql/orc/metadata/OrcMetadataReader.java @@ -34,6 +34,7 @@ import io.prestosql.orc.metadata.statistics.IntegerStatistics; import io.prestosql.orc.metadata.statistics.StringStatistics; import io.prestosql.orc.metadata.statistics.StripeStatistics; +import io.prestosql.orc.metadata.statistics.TimestampStatistics; import io.prestosql.orc.proto.OrcProto; import io.prestosql.orc.proto.OrcProto.RowIndexEntry; import io.prestosql.orc.protobuf.ByteString; @@ -70,6 +71,7 @@ import static io.prestosql.orc.metadata.statistics.IntegerStatistics.INTEGER_VALUE_BYTES; import static io.prestosql.orc.metadata.statistics.ShortDecimalStatisticsBuilder.SHORT_DECIMAL_VALUE_BYTES; import static io.prestosql.orc.metadata.statistics.StringStatistics.STRING_VALUE_BYTES_OVERHEAD; +import static io.prestosql.orc.metadata.statistics.TimestampStatistics.TIMESTAMP_VALUE_BYTES; import static java.lang.Character.MIN_SUPPLEMENTARY_CODE_POINT; import static java.lang.Math.toIntExact; @@ -268,6 +270,9 @@ else if (statistics.hasStringStatistics()) { else if (statistics.hasDateStatistics()) { minAverageValueBytes = DATE_VALUE_BYTES; } + else if (statistics.hasTimestampStatistics()) { + minAverageValueBytes = TIMESTAMP_VALUE_BYTES; + } else if (statistics.hasDecimalStatistics()) { // could be 8 or 16; return the smaller one given it is a min average minAverageValueBytes = DECIMAL_VALUE_BYTES_OVERHEAD + SHORT_DECIMAL_VALUE_BYTES; @@ -291,6 +296,7 @@ else if (statistics.hasBinaryStatistics()) { statistics.hasDoubleStatistics() ? toDoubleStatistics(statistics.getDoubleStatistics()) : null, statistics.hasStringStatistics() ? toStringStatistics(hiveWriterVersion, statistics.getStringStatistics(), isRowGroup) : null, statistics.hasDateStatistics() ? toDateStatistics(hiveWriterVersion, statistics.getDateStatistics(), isRowGroup) : null, + statistics.hasTimestampStatistics() ? toTimestampStatistics(hiveWriterVersion, statistics.getTimestampStatistics(), isRowGroup) : null, statistics.hasDecimalStatistics() ? toDecimalStatistics(statistics.getDecimalStatistics()) : null, statistics.hasBinaryStatistics() ? toBinaryStatistics(statistics.getBinaryStatistics()) : null, null); @@ -482,6 +488,17 @@ private static DateStatistics toDateStatistics(HiveWriterVersion hiveWriterVersi dateStatistics.hasMaximum() ? dateStatistics.getMaximum() : null); } + private static TimestampStatistics toTimestampStatistics(HiveWriterVersion hiveWriterVersion, OrcProto.TimestampStatistics timestampStatistics, boolean isRowGroup) + { + if (hiveWriterVersion == ORIGINAL && !isRowGroup) { + return null; + } + + return new TimestampStatistics( + timestampStatistics.hasMinimumUtc() ? timestampStatistics.getMinimumUtc() : null, + timestampStatistics.hasMaximumUtc() ? timestampStatistics.getMaximumUtc() : null); + } + private static OrcType toType(OrcProto.Type type) { Optional length = Optional.empty(); diff --git a/presto-orc/src/main/java/io/prestosql/orc/metadata/OrcMetadataWriter.java b/presto-orc/src/main/java/io/prestosql/orc/metadata/OrcMetadataWriter.java index 4c9b697dc..49c651f88 100644 --- a/presto-orc/src/main/java/io/prestosql/orc/metadata/OrcMetadataWriter.java +++ b/presto-orc/src/main/java/io/prestosql/orc/metadata/OrcMetadataWriter.java @@ -15,6 +15,7 @@ import com.google.common.collect.ImmutableList; import com.google.common.io.CountingOutputStream; +import com.google.common.primitives.Longs; import io.airlift.slice.Slice; import io.airlift.slice.SliceOutput; import io.prestosql.orc.metadata.ColumnEncoding.ColumnEncodingKind; @@ -29,6 +30,7 @@ import io.prestosql.orc.proto.OrcProto.UserMetadataItem; import io.prestosql.orc.protobuf.ByteString; import io.prestosql.orc.protobuf.MessageLite; +import io.prestosql.spi.util.BloomFilter; import java.io.IOException; import java.io.OutputStream; @@ -321,6 +323,8 @@ private static OrcProto.Stream.Kind toStreamKind(StreamKind streamKind) return OrcProto.Stream.Kind.SECONDARY; case ROW_INDEX: return OrcProto.Stream.Kind.ROW_INDEX; + case BLOOM_FILTER_UTF8: + return OrcProto.Stream.Kind.BLOOM_FILTER_UTF8; } throw new IllegalArgumentException("Unsupported stream kind: " + streamKind); } @@ -370,6 +374,26 @@ private static RowIndexEntry toRowGroupIndex(RowGroupIndex rowGroupIndex) .build(); } + @Override + public int writeBloomFilters(SliceOutput output, List bloomFilters) + throws IOException + { + OrcProto.BloomFilterIndex bloomFilterIndex = OrcProto.BloomFilterIndex.newBuilder() + .addAllBloomFilter(bloomFilters.stream() + .map(OrcMetadataWriter::toBloomFilter) + .collect(toList())) + .build(); + return writeProtobufObject(output, bloomFilterIndex); + } + + private static OrcProto.BloomFilter toBloomFilter(BloomFilter bloomFilter) + { + return OrcProto.BloomFilter.newBuilder() + .addAllBitset(Longs.asList(bloomFilter.getBitSet())) + .setNumHashFunctions(bloomFilter.getNumHashFunctions()) + .build(); + } + private static OrcProto.CompressionKind toCompression(CompressionKind compressionKind) { switch (compressionKind) { diff --git a/presto-orc/src/main/java/io/prestosql/orc/metadata/statistics/BinaryStatisticsBuilder.java b/presto-orc/src/main/java/io/prestosql/orc/metadata/statistics/BinaryStatisticsBuilder.java index f0d825d7e..718e40ec7 100644 --- a/presto-orc/src/main/java/io/prestosql/orc/metadata/statistics/BinaryStatisticsBuilder.java +++ b/presto-orc/src/main/java/io/prestosql/orc/metadata/statistics/BinaryStatisticsBuilder.java @@ -67,6 +67,7 @@ public ColumnStatistics buildColumnStatistics() null, null, null, + null, binaryStatistics.orElse(null), null); } diff --git a/presto-orc/src/main/java/io/prestosql/orc/metadata/statistics/BloomFilterBuilder.java b/presto-orc/src/main/java/io/prestosql/orc/metadata/statistics/BloomFilterBuilder.java new file mode 100644 index 000000000..1a833763e --- /dev/null +++ b/presto-orc/src/main/java/io/prestosql/orc/metadata/statistics/BloomFilterBuilder.java @@ -0,0 +1,31 @@ +/* + * Copyright (C) 2018-2022. Huawei Technologies Co., Ltd. All rights reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.prestosql.orc.metadata.statistics; + +import io.airlift.slice.Slice; + +public interface BloomFilterBuilder +{ + BloomFilterBuilder addString(Slice val); + + BloomFilterBuilder addLong(long val); + + BloomFilterBuilder addDouble(double val); + + BloomFilterBuilder addFloat(float val); + + HashableBloomFilter buildBloomFilter(); +} diff --git a/presto-orc/src/main/java/io/prestosql/orc/metadata/statistics/BooleanStatisticsBuilder.java b/presto-orc/src/main/java/io/prestosql/orc/metadata/statistics/BooleanStatisticsBuilder.java index 8b67bbc43..a6c6580d8 100644 --- a/presto-orc/src/main/java/io/prestosql/orc/metadata/statistics/BooleanStatisticsBuilder.java +++ b/presto-orc/src/main/java/io/prestosql/orc/metadata/statistics/BooleanStatisticsBuilder.java @@ -76,6 +76,7 @@ public ColumnStatistics buildColumnStatistics() null, null, null, + null, null); } diff --git a/presto-orc/src/main/java/io/prestosql/orc/metadata/statistics/ColumnStatistics.java b/presto-orc/src/main/java/io/prestosql/orc/metadata/statistics/ColumnStatistics.java index 9e8595d82..0068741e9 100644 --- a/presto-orc/src/main/java/io/prestosql/orc/metadata/statistics/ColumnStatistics.java +++ b/presto-orc/src/main/java/io/prestosql/orc/metadata/statistics/ColumnStatistics.java @@ -11,6 +11,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package io.prestosql.orc.metadata.statistics; import io.prestosql.orc.metadata.statistics.StatisticsHasher.Hashable; @@ -27,6 +28,7 @@ import static io.prestosql.orc.metadata.statistics.IntegerStatisticsBuilder.mergeIntegerStatistics; import static io.prestosql.orc.metadata.statistics.LongDecimalStatisticsBuilder.mergeDecimalStatistics; import static io.prestosql.orc.metadata.statistics.StringStatisticsBuilder.mergeStringStatistics; +import static io.prestosql.orc.metadata.statistics.TimestampStatisticsBuilder.mergeTimestampStatistics; public class ColumnStatistics implements Hashable @@ -41,6 +43,7 @@ public class ColumnStatistics private final DoubleStatistics doubleStatistics; private final StringStatistics stringStatistics; private final DateStatistics dateStatistics; + private final TimestampStatistics timestampStatistics; private final DecimalStatistics decimalStatistics; private final BinaryStatistics binaryStatistics; private final HashableBloomFilter bloomFilter; @@ -53,6 +56,7 @@ public ColumnStatistics( DoubleStatistics doubleStatistics, StringStatistics stringStatistics, DateStatistics dateStatistics, + TimestampStatistics timestampStatistics, DecimalStatistics decimalStatistics, BinaryStatistics binaryStatistics, HashableBloomFilter bloomFilter) @@ -65,6 +69,7 @@ public ColumnStatistics( this.doubleStatistics = doubleStatistics; this.stringStatistics = stringStatistics; this.dateStatistics = dateStatistics; + this.timestampStatistics = timestampStatistics; this.decimalStatistics = decimalStatistics; this.binaryStatistics = binaryStatistics; this.bloomFilter = bloomFilter; @@ -126,6 +131,11 @@ public DecimalStatistics getDecimalStatistics() return decimalStatistics; } + public TimestampStatistics getTimestampStatistics() + { + return timestampStatistics; + } + public BinaryStatistics getBinaryStatistics() { return binaryStatistics; @@ -146,6 +156,7 @@ public ColumnStatistics withBloomFilter(HashableBloomFilter bloomFilter) doubleStatistics, stringStatistics, dateStatistics, + timestampStatistics, decimalStatistics, binaryStatistics, bloomFilter); @@ -169,6 +180,9 @@ public long getRetainedSizeInBytes() if (dateStatistics != null) { retainedSizeInBytes += dateStatistics.getRetainedSizeInBytes(); } + if (timestampStatistics != null) { + retainedSizeInBytes += timestampStatistics.getRetainedSizeInBytes(); + } if (decimalStatistics != null) { retainedSizeInBytes += decimalStatistics.getRetainedSizeInBytes(); } @@ -198,6 +212,7 @@ public boolean equals(Object o) Objects.equals(doubleStatistics, that.doubleStatistics) && Objects.equals(stringStatistics, that.stringStatistics) && Objects.equals(dateStatistics, that.dateStatistics) && + Objects.equals(timestampStatistics, that.timestampStatistics) && Objects.equals(decimalStatistics, that.decimalStatistics) && Objects.equals(binaryStatistics, that.binaryStatistics) && Objects.equals(bloomFilter, that.bloomFilter); @@ -214,6 +229,7 @@ public int hashCode() doubleStatistics, stringStatistics, dateStatistics, + timestampStatistics, decimalStatistics, binaryStatistics, bloomFilter); @@ -230,6 +246,7 @@ public String toString() .add("doubleStatistics", doubleStatistics) .add("stringStatistics", stringStatistics) .add("dateStatistics", dateStatistics) + .add("timestampStatistics", timestampStatistics) .add("decimalStatistics", decimalStatistics) .add("binaryStatistics", binaryStatistics) .add("bloomFilter", bloomFilter) @@ -245,6 +262,7 @@ public void addHash(StatisticsHasher hasher) .putOptionalHashable(doubleStatistics) .putOptionalHashable(stringStatistics) .putOptionalHashable(dateStatistics) + .putOptionalHashable(timestampStatistics) .putOptionalHashable(decimalStatistics) .putOptionalHashable(binaryStatistics) .putOptionalHashable(bloomFilter); @@ -271,6 +289,7 @@ public static ColumnStatistics mergeColumnStatistics(List stat mergeDoubleStatistics(stats).orElse(null), mergeStringStatistics(stats).orElse(null), mergeDateStatistics(stats).orElse(null), + mergeTimestampStatistics(stats).orElse(null), mergeDecimalStatistics(stats).orElse(null), mergeBinaryStatistics(stats).orElse(null), null); diff --git a/presto-orc/src/main/java/io/prestosql/orc/metadata/statistics/DateStatisticsBuilder.java b/presto-orc/src/main/java/io/prestosql/orc/metadata/statistics/DateStatisticsBuilder.java index a44ebd09b..d275b6203 100644 --- a/presto-orc/src/main/java/io/prestosql/orc/metadata/statistics/DateStatisticsBuilder.java +++ b/presto-orc/src/main/java/io/prestosql/orc/metadata/statistics/DateStatisticsBuilder.java @@ -26,6 +26,12 @@ public class DateStatisticsBuilder private long nonNullValueCount; private int minimum = Integer.MAX_VALUE; private int maximum = Integer.MIN_VALUE; + private final BloomFilterBuilder bloomFilterBuilder; + + public DateStatisticsBuilder(BloomFilterBuilder bloomFilterBuilder) + { + this.bloomFilterBuilder = requireNonNull(bloomFilterBuilder, "bloomFilterBuilder is null"); + } @Override public void addValue(long value) @@ -35,6 +41,7 @@ public void addValue(long value) int intValue = toIntExact(value); minimum = Math.min(intValue, minimum); maximum = Math.max(intValue, maximum); + bloomFilterBuilder.addLong(value); } private void addDateStatistics(long valueCount, DateStatistics value) @@ -70,17 +77,18 @@ public ColumnStatistics buildColumnStatistics() dateStatistics.orElse(null), null, null, - null); + null, + bloomFilterBuilder.buildBloomFilter()); } public static Optional mergeDateStatistics(List stats) { - DateStatisticsBuilder dateStatisticsBuilder = new DateStatisticsBuilder(); + DateStatisticsBuilder dateStatisticsBuilder = new DateStatisticsBuilder(new NoOpBloomFilterBuilder()); for (ColumnStatistics columnStatistics : stats) { DateStatistics partialStatistics = columnStatistics.getDateStatistics(); if (columnStatistics.getNumberOfValues() > 0) { if (partialStatistics == null) { - // there are non null values but no statistics, so we can not say anything about the data + // there are non-null values but no statistics, so we can not say anything about the data return Optional.empty(); } dateStatisticsBuilder.addDateStatistics(columnStatistics.getNumberOfValues(), partialStatistics); diff --git a/presto-orc/src/main/java/io/prestosql/orc/metadata/statistics/DoubleStatisticsBuilder.java b/presto-orc/src/main/java/io/prestosql/orc/metadata/statistics/DoubleStatisticsBuilder.java index dc0cb03c1..947b00eda 100644 --- a/presto-orc/src/main/java/io/prestosql/orc/metadata/statistics/DoubleStatisticsBuilder.java +++ b/presto-orc/src/main/java/io/prestosql/orc/metadata/statistics/DoubleStatisticsBuilder.java @@ -31,6 +31,13 @@ public class DoubleStatisticsBuilder private double minimum = Double.POSITIVE_INFINITY; private double maximum = Double.NEGATIVE_INFINITY; + private final BloomFilterBuilder bloomFilterBuilder; + + public DoubleStatisticsBuilder(BloomFilterBuilder bloomFilterBuilder) + { + this.bloomFilterBuilder = requireNonNull(bloomFilterBuilder, "bloomFilterBuilder is null"); + } + @Override public void addBlock(Type type, Block block) { @@ -58,6 +65,7 @@ public void addValue(double value) minimum = Math.min(value, minimum); maximum = Math.max(value, maximum); } + bloomFilterBuilder.addDouble(value); } private void addDoubleStatistics(long valueCount, DoubleStatistics value) @@ -94,12 +102,13 @@ public ColumnStatistics buildColumnStatistics() null, null, null, - null); + null, + bloomFilterBuilder.buildBloomFilter()); } public static Optional mergeDoubleStatistics(List stats) { - DoubleStatisticsBuilder doubleStatisticsBuilder = new DoubleStatisticsBuilder(); + DoubleStatisticsBuilder doubleStatisticsBuilder = new DoubleStatisticsBuilder(new NoOpBloomFilterBuilder()); for (ColumnStatistics columnStatistics : stats) { DoubleStatistics partialStatistics = columnStatistics.getDoubleStatistics(); if (columnStatistics.getNumberOfValues() > 0) { diff --git a/presto-orc/src/main/java/io/prestosql/orc/metadata/statistics/IntegerStatisticsBuilder.java b/presto-orc/src/main/java/io/prestosql/orc/metadata/statistics/IntegerStatisticsBuilder.java index 02da2fe21..e52fb3804 100644 --- a/presto-orc/src/main/java/io/prestosql/orc/metadata/statistics/IntegerStatisticsBuilder.java +++ b/presto-orc/src/main/java/io/prestosql/orc/metadata/statistics/IntegerStatisticsBuilder.java @@ -28,6 +28,12 @@ public class IntegerStatisticsBuilder private long maximum = Long.MIN_VALUE; private long sum; private boolean overflow; + private final BloomFilterBuilder bloomFilterBuilder; + + public IntegerStatisticsBuilder(BloomFilterBuilder bloomFilterBuilder) + { + this.bloomFilterBuilder = requireNonNull(bloomFilterBuilder, "bloomFilterBuilder is null"); + } @Override public void addValue(long value) @@ -45,6 +51,7 @@ public void addValue(long value) overflow = true; } } + bloomFilterBuilder.addLong(value); } private void addIntegerStatistics(long valueCount, IntegerStatistics value) @@ -95,17 +102,18 @@ public ColumnStatistics buildColumnStatistics() null, null, null, - null); + null, + bloomFilterBuilder.buildBloomFilter()); } public static Optional mergeIntegerStatistics(List stats) { - IntegerStatisticsBuilder integerStatisticsBuilder = new IntegerStatisticsBuilder(); + IntegerStatisticsBuilder integerStatisticsBuilder = new IntegerStatisticsBuilder(new NoOpBloomFilterBuilder()); for (ColumnStatistics columnStatistics : stats) { IntegerStatistics partialStatistics = columnStatistics.getIntegerStatistics(); if (columnStatistics.getNumberOfValues() > 0) { if (partialStatistics == null) { - // there are non null values but no statistics, so we can not say anything about the data + // there are non-null values but no statistics, so we can not say anything about the data return Optional.empty(); } integerStatisticsBuilder.addIntegerStatistics(columnStatistics.getNumberOfValues(), partialStatistics); diff --git a/presto-orc/src/main/java/io/prestosql/orc/metadata/statistics/LongDecimalStatisticsBuilder.java b/presto-orc/src/main/java/io/prestosql/orc/metadata/statistics/LongDecimalStatisticsBuilder.java index 4de98a75a..a9e795006 100644 --- a/presto-orc/src/main/java/io/prestosql/orc/metadata/statistics/LongDecimalStatisticsBuilder.java +++ b/presto-orc/src/main/java/io/prestosql/orc/metadata/statistics/LongDecimalStatisticsBuilder.java @@ -102,6 +102,7 @@ public ColumnStatistics buildColumnStatistics() null, null, null, + null, decimalStatistics.orElse(null), null, null); diff --git a/presto-orc/src/main/java/io/prestosql/orc/metadata/statistics/NoOpBloomFilterBuilder.java b/presto-orc/src/main/java/io/prestosql/orc/metadata/statistics/NoOpBloomFilterBuilder.java new file mode 100644 index 000000000..4dd5031f3 --- /dev/null +++ b/presto-orc/src/main/java/io/prestosql/orc/metadata/statistics/NoOpBloomFilterBuilder.java @@ -0,0 +1,52 @@ +/* + * Copyright (C) 2018-2022. Huawei Technologies Co., Ltd. All rights reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.prestosql.orc.metadata.statistics; + +import io.airlift.slice.Slice; + +public class NoOpBloomFilterBuilder + implements BloomFilterBuilder +{ + @Override + public BloomFilterBuilder addString(Slice val) + { + return this; + } + + @Override + public BloomFilterBuilder addLong(long val) + { + return this; + } + + @Override + public BloomFilterBuilder addDouble(double val) + { + return this; + } + + @Override + public BloomFilterBuilder addFloat(float val) + { + return this; + } + + @Override + public HashableBloomFilter buildBloomFilter() + { + return null; + } +} diff --git a/presto-orc/src/main/java/io/prestosql/orc/metadata/statistics/ShortDecimalStatisticsBuilder.java b/presto-orc/src/main/java/io/prestosql/orc/metadata/statistics/ShortDecimalStatisticsBuilder.java index c8f21ea4e..93f9c4071 100644 --- a/presto-orc/src/main/java/io/prestosql/orc/metadata/statistics/ShortDecimalStatisticsBuilder.java +++ b/presto-orc/src/main/java/io/prestosql/orc/metadata/statistics/ShortDecimalStatisticsBuilder.java @@ -67,6 +67,7 @@ public ColumnStatistics buildColumnStatistics() null, null, null, + null, decimalStatistics.orElse(null), null, null); diff --git a/presto-orc/src/main/java/io/prestosql/orc/metadata/statistics/StringStatisticsBuilder.java b/presto-orc/src/main/java/io/prestosql/orc/metadata/statistics/StringStatisticsBuilder.java index 09bd33db1..84c26a410 100644 --- a/presto-orc/src/main/java/io/prestosql/orc/metadata/statistics/StringStatisticsBuilder.java +++ b/presto-orc/src/main/java/io/prestosql/orc/metadata/statistics/StringStatisticsBuilder.java @@ -35,25 +35,27 @@ public class StringStatisticsBuilder private Slice minimum; private Slice maximum; private long sum; + private final BloomFilterBuilder bloomFilterBuilder; - public StringStatisticsBuilder(int stringStatisticsLimitInBytes) + public StringStatisticsBuilder(int stringStatisticsLimitInBytes, BloomFilterBuilder bloomFilterBuilder) { - this(stringStatisticsLimitInBytes, 0, null, null, 0); + this(stringStatisticsLimitInBytes, 0, null, null, 0, bloomFilterBuilder); } - private StringStatisticsBuilder(int stringStatisticsLimitInBytes, long nonNullValueCount, Slice minimum, Slice maximum, long sum) + private StringStatisticsBuilder(int stringStatisticsLimitInBytes, long nonNullValueCount, Slice minimum, Slice maximum, long sum, BloomFilterBuilder bloomFilterBuilder) { this.stringStatisticsLimitInBytes = stringStatisticsLimitInBytes; this.nonNullValueCount = nonNullValueCount; this.minimum = minimum; this.maximum = maximum; this.sum = sum; + this.bloomFilterBuilder = requireNonNull(bloomFilterBuilder, "bloomFilterBuilder is null"); } public StringStatisticsBuilder withStringStatisticsLimit(int limitInBytes) { checkArgument(limitInBytes >= 0, "limitInBytes is less than 0"); - return new StringStatisticsBuilder(limitInBytes, nonNullValueCount, minimum, maximum, sum); + return new StringStatisticsBuilder(limitInBytes, nonNullValueCount, minimum, maximum, sum, bloomFilterBuilder); } public long getNonNullValueCount() @@ -77,7 +79,7 @@ else if (minimum != null && value.compareTo(minimum) <= 0) { else if (maximum != null && value.compareTo(maximum) >= 0) { maximum = value; } - + bloomFilterBuilder.addString(value); nonNullValueCount++; sum = addExact(sum, value.length()); } @@ -140,18 +142,19 @@ public ColumnStatistics buildColumnStatistics() null, null, null, - null); + null, + bloomFilterBuilder.buildBloomFilter()); } public static Optional mergeStringStatistics(List stats) { // no need to set the stats limit for the builder given we assume the given stats are within the same limit - StringStatisticsBuilder stringStatisticsBuilder = new StringStatisticsBuilder(Integer.MAX_VALUE); + StringStatisticsBuilder stringStatisticsBuilder = new StringStatisticsBuilder(Integer.MAX_VALUE, new NoOpBloomFilterBuilder()); for (ColumnStatistics columnStatistics : stats) { StringStatistics partialStatistics = columnStatistics.getStringStatistics(); if (columnStatistics.getNumberOfValues() > 0) { if (partialStatistics == null || (partialStatistics.getMin() == null && partialStatistics.getMax() == null)) { - // there are non null values but no statistics, so we can not say anything about the data + // there are non-null values but no statistics, so we can not say anything about the data return Optional.empty(); } stringStatisticsBuilder.addStringStatistics(columnStatistics.getNumberOfValues(), partialStatistics); diff --git a/presto-orc/src/main/java/io/prestosql/orc/metadata/statistics/TimestampStatistics.java b/presto-orc/src/main/java/io/prestosql/orc/metadata/statistics/TimestampStatistics.java new file mode 100644 index 000000000..9bd9c38a1 --- /dev/null +++ b/presto-orc/src/main/java/io/prestosql/orc/metadata/statistics/TimestampStatistics.java @@ -0,0 +1,104 @@ +/* + * Copyright (C) 2018-2022. Huawei Technologies Co., Ltd. All rights reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.prestosql.orc.metadata.statistics; + +import io.prestosql.orc.metadata.statistics.StatisticsHasher.Hashable; +import org.openjdk.jol.info.ClassLayout; + +import java.util.Objects; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static com.google.common.base.Preconditions.checkArgument; + +public class TimestampStatistics + implements RangeStatistics, Hashable +{ + // 1 byte to denote if null + 8 bytes for the value (timestamp is of long type) + public static final long TIMESTAMP_VALUE_BYTES = Byte.BYTES + Long.BYTES; + + private static final int INSTANCE_SIZE = ClassLayout.parseClass(TimestampStatistics.class).instanceSize(); + + private final boolean hasMinimum; + private final boolean hasMaximum; + + private final long minimum; + private final long maximum; + + public TimestampStatistics(Long minimum, Long maximum) + { + checkArgument(minimum == null || maximum == null || minimum <= maximum, "minimum is not less than maximum"); + + this.hasMinimum = minimum != null; + this.minimum = hasMinimum ? minimum : 0; + + this.hasMaximum = maximum != null; + this.maximum = hasMaximum ? maximum : 0; + } + + @Override + public Long getMin() + { + return hasMinimum ? minimum : null; + } + + @Override + public Long getMax() + { + return hasMaximum ? maximum : null; + } + + @Override + public long getRetainedSizeInBytes() + { + return INSTANCE_SIZE; + } + + @Override + public boolean equals(Object o) + { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + TimestampStatistics that = (TimestampStatistics) o; + return Objects.equals(getMin(), that.getMin()) && + Objects.equals(getMax(), that.getMax()); + } + + @Override + public int hashCode() + { + return Objects.hash(getMin(), getMax()); + } + + @Override + public String toString() + { + return toStringHelper(this) + .add("min", getMin()) + .add("max", getMax()) + .toString(); + } + + @Override + public void addHash(StatisticsHasher hasher) + { + hasher.putOptionalLong(hasMinimum, minimum) + .putOptionalLong(hasMaximum, maximum); + } +} diff --git a/presto-orc/src/main/java/io/prestosql/orc/metadata/statistics/TimestampStatisticsBuilder.java b/presto-orc/src/main/java/io/prestosql/orc/metadata/statistics/TimestampStatisticsBuilder.java new file mode 100644 index 000000000..065fa543f --- /dev/null +++ b/presto-orc/src/main/java/io/prestosql/orc/metadata/statistics/TimestampStatisticsBuilder.java @@ -0,0 +1,99 @@ +/* + * Copyright (C) 2018-2022. Huawei Technologies Co., Ltd. All rights reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.prestosql.orc.metadata.statistics; + +import java.util.List; +import java.util.Optional; + +import static io.prestosql.orc.metadata.statistics.TimestampStatistics.TIMESTAMP_VALUE_BYTES; +import static java.util.Objects.requireNonNull; + +public class TimestampStatisticsBuilder + implements LongValueStatisticsBuilder +{ + private long nonNullValueCount; + private long minimum = Integer.MAX_VALUE; + private long maximum = Integer.MIN_VALUE; + private final BloomFilterBuilder bloomFilterBuilder; + + public TimestampStatisticsBuilder(BloomFilterBuilder bloomFilterBuilder) + { + this.bloomFilterBuilder = requireNonNull(bloomFilterBuilder, "bloomFilterBuilder is null"); + } + + @Override + public void addValue(long value) + { + nonNullValueCount++; + + minimum = Math.min(value, minimum); + maximum = Math.max(value, maximum); + bloomFilterBuilder.addLong(value); + } + + private void addTimestampStatistics(long valueCount, TimestampStatistics value) + { + requireNonNull(value, "value is null"); + requireNonNull(value.getMin(), "value.getMin() is null"); + requireNonNull(value.getMax(), "value.getMax() is null"); + + nonNullValueCount += valueCount; + minimum = Math.min(value.getMin(), minimum); + maximum = Math.max(value.getMax(), maximum); + } + + private Optional buildTimestampStatistics() + { + if (nonNullValueCount == 0) { + return Optional.empty(); + } + return Optional.of(new TimestampStatistics(minimum, maximum)); + } + + @Override + public ColumnStatistics buildColumnStatistics() + { + Optional timestampStatistics = buildTimestampStatistics(); + return new ColumnStatistics( + nonNullValueCount, + timestampStatistics.map(s -> TIMESTAMP_VALUE_BYTES).orElse(0L), + null, + null, + null, + null, + null, + timestampStatistics.orElse(null), + null, + null, + bloomFilterBuilder.buildBloomFilter()); + } + + public static Optional mergeTimestampStatistics(List stats) + { + TimestampStatisticsBuilder timestampStatisticsBuilder = new TimestampStatisticsBuilder(new NoOpBloomFilterBuilder()); + for (ColumnStatistics columnStatistics : stats) { + TimestampStatistics partialStatistics = columnStatistics.getTimestampStatistics(); + if (columnStatistics.getNumberOfValues() > 0) { + if (partialStatistics == null) { + // there are non-null values but no statistics, so we can not say anything about the data + return Optional.empty(); + } + timestampStatisticsBuilder.addTimestampStatistics(columnStatistics.getNumberOfValues(), partialStatistics); + } + } + return timestampStatisticsBuilder.buildTimestampStatistics(); + } +} diff --git a/presto-orc/src/main/java/io/prestosql/orc/metadata/statistics/Utf8BloomFilterBuilder.java b/presto-orc/src/main/java/io/prestosql/orc/metadata/statistics/Utf8BloomFilterBuilder.java new file mode 100644 index 000000000..d59f2b932 --- /dev/null +++ b/presto-orc/src/main/java/io/prestosql/orc/metadata/statistics/Utf8BloomFilterBuilder.java @@ -0,0 +1,63 @@ +/* + * Copyright (C) 2018-2022. Huawei Technologies Co., Ltd. All rights reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.prestosql.orc.metadata.statistics; + +import io.airlift.slice.Slice; + +public class Utf8BloomFilterBuilder + implements BloomFilterBuilder +{ + private final HashableBloomFilter bloomFilter; + + public Utf8BloomFilterBuilder(int expectedSize, double fpp) + { + bloomFilter = new HashableBloomFilter(expectedSize, fpp); + } + + @Override + public BloomFilterBuilder addString(Slice val) + { + bloomFilter.add(val); + return this; + } + + @Override + public BloomFilterBuilder addLong(long val) + { + bloomFilter.addLong(val); + return this; + } + + @Override + public BloomFilterBuilder addDouble(double val) + { + bloomFilter.addDouble(val); + return this; + } + + @Override + public BloomFilterBuilder addFloat(float val) + { + bloomFilter.addFloat(val); + return this; + } + + @Override + public HashableBloomFilter buildBloomFilter() + { + return bloomFilter; + } +} diff --git a/presto-orc/src/main/java/io/prestosql/orc/writer/BooleanColumnWriter.java b/presto-orc/src/main/java/io/prestosql/orc/writer/BooleanColumnWriter.java index 49f22d380..aa3e65f9d 100644 --- a/presto-orc/src/main/java/io/prestosql/orc/writer/BooleanColumnWriter.java +++ b/presto-orc/src/main/java/io/prestosql/orc/writer/BooleanColumnWriter.java @@ -166,6 +166,12 @@ private static List createBooleanColumnPositionList( return positionList.build(); } + @Override + public List getBloomFilters(CompressedMetadataWriter metadataWriter) + { + return ImmutableList.of(); + } + @Override public List getDataStreams() { diff --git a/presto-orc/src/main/java/io/prestosql/orc/writer/ByteColumnWriter.java b/presto-orc/src/main/java/io/prestosql/orc/writer/ByteColumnWriter.java index b5127cd5e..6cd1c7a5f 100644 --- a/presto-orc/src/main/java/io/prestosql/orc/writer/ByteColumnWriter.java +++ b/presto-orc/src/main/java/io/prestosql/orc/writer/ByteColumnWriter.java @@ -32,16 +32,19 @@ import io.prestosql.orc.stream.StreamDataOutput; import io.prestosql.spi.block.Block; import io.prestosql.spi.type.Type; +import io.prestosql.spi.util.BloomFilter; import org.openjdk.jol.info.ClassLayout; import java.io.IOException; import java.util.ArrayList; import java.util.List; import java.util.Map; +import java.util.Objects; import java.util.Optional; import static com.google.common.base.Preconditions.checkArgument; import static com.google.common.base.Preconditions.checkState; +import static com.google.common.collect.ImmutableList.toImmutableList; import static io.prestosql.orc.metadata.ColumnEncoding.ColumnEncodingKind.DIRECT; import static io.prestosql.orc.metadata.CompressionKind.NONE; import static java.util.Objects.requireNonNull; @@ -110,7 +113,7 @@ public void writeBlock(Block block) public Map finishRowGroup() { checkState(!closed); - ColumnStatistics statistics = new ColumnStatistics((long) nonNullValueCount, 0, null, null, null, null, null, null, null, null); + ColumnStatistics statistics = new ColumnStatistics((long) nonNullValueCount, 0, null, null, null, null, null, null, null, null, null); rowGroupColumnStatistics.add(statistics); nonNullValueCount = 0; return ImmutableMap.of(columnId, statistics); @@ -166,6 +169,24 @@ private static List createByteColumnPositionList( return positionList.build(); } + @Override + public List getBloomFilters(CompressedMetadataWriter metadataWriter) + throws IOException + { + List bloomFilters = rowGroupColumnStatistics.stream() + .map(ColumnStatistics::getBloomFilter) + .filter(Objects::nonNull) + .collect(toImmutableList()); + + if (!bloomFilters.isEmpty()) { + Slice slice = metadataWriter.writeBloomFilters(bloomFilters); + Stream stream = new Stream(columnId, StreamKind.BLOOM_FILTER_UTF8, slice.length(), false); + return ImmutableList.of(new StreamDataOutput(slice, stream)); + } + + return ImmutableList.of(); + } + @Override public List getDataStreams() { diff --git a/presto-orc/src/main/java/io/prestosql/orc/writer/ColumnWriter.java b/presto-orc/src/main/java/io/prestosql/orc/writer/ColumnWriter.java index 62a08704b..7234c9b13 100644 --- a/presto-orc/src/main/java/io/prestosql/orc/writer/ColumnWriter.java +++ b/presto-orc/src/main/java/io/prestosql/orc/writer/ColumnWriter.java @@ -66,4 +66,7 @@ List getIndexStreams(CompressedMetadataWriter metadataWriter) long getRetainedBytes(); void reset(); + + List getBloomFilters(CompressedMetadataWriter metadataWriter) + throws IOException; } diff --git a/presto-orc/src/main/java/io/prestosql/orc/writer/ColumnWriters.java b/presto-orc/src/main/java/io/prestosql/orc/writer/ColumnWriters.java index 19bff61da..366fe62de 100644 --- a/presto-orc/src/main/java/io/prestosql/orc/writer/ColumnWriters.java +++ b/presto-orc/src/main/java/io/prestosql/orc/writer/ColumnWriters.java @@ -11,23 +11,36 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package io.prestosql.orc.writer; import com.google.common.collect.ImmutableList; import io.airlift.units.DataSize; +import io.prestosql.orc.OrcWriterOptions; import io.prestosql.orc.metadata.ColumnMetadata; import io.prestosql.orc.metadata.CompressionKind; import io.prestosql.orc.metadata.OrcColumnId; import io.prestosql.orc.metadata.OrcType; import io.prestosql.orc.metadata.statistics.BinaryStatisticsBuilder; +import io.prestosql.orc.metadata.statistics.BloomFilterBuilder; import io.prestosql.orc.metadata.statistics.DateStatisticsBuilder; +import io.prestosql.orc.metadata.statistics.DoubleStatisticsBuilder; import io.prestosql.orc.metadata.statistics.IntegerStatisticsBuilder; +import io.prestosql.orc.metadata.statistics.NoOpBloomFilterBuilder; +import io.prestosql.orc.metadata.statistics.StringStatisticsBuilder; +import io.prestosql.orc.metadata.statistics.TimestampStatisticsBuilder; +import io.prestosql.orc.metadata.statistics.Utf8BloomFilterBuilder; import io.prestosql.spi.type.Type; +import java.util.function.Supplier; + +import static java.lang.Math.toIntExact; import static java.util.Objects.requireNonNull; public final class ColumnWriters { + private static final int BASE_OFFSET_FOR_TRANSACTIONAL_TABLE = 6; + private ColumnWriters() {} public static ColumnWriter createColumnWriter( @@ -36,36 +49,40 @@ public static ColumnWriter createColumnWriter( Type type, CompressionKind compression, int bufferSize, - DataSize stringStatisticsLimit) + OrcWriterOptions options, + String columnName, + Supplier bloomFilterBuilder) { requireNonNull(type, "type is null"); OrcType orcType = orcTypes.get(columnId); + DataSize stringStatisticsLimit = options.getMaxStringStatisticsLimit(); + switch (orcType.getOrcTypeKind()) { case BOOLEAN: return new BooleanColumnWriter(columnId, type, compression, bufferSize); case FLOAT: - return new FloatColumnWriter(columnId, type, compression, bufferSize); + return new FloatColumnWriter(columnId, type, compression, bufferSize, () -> new DoubleStatisticsBuilder(bloomFilterBuilder.get())); case DOUBLE: - return new DoubleColumnWriter(columnId, type, compression, bufferSize); + return new DoubleColumnWriter(columnId, type, compression, bufferSize, () -> new DoubleStatisticsBuilder(bloomFilterBuilder.get())); case BYTE: return new ByteColumnWriter(columnId, type, compression, bufferSize); case DATE: - return new LongColumnWriter(columnId, type, compression, bufferSize, DateStatisticsBuilder::new); + return new LongColumnWriter(columnId, type, compression, bufferSize, () -> new DateStatisticsBuilder(bloomFilterBuilder.get())); case SHORT: case INT: case LONG: - return new LongColumnWriter(columnId, type, compression, bufferSize, IntegerStatisticsBuilder::new); + return new LongColumnWriter(columnId, type, compression, bufferSize, () -> new IntegerStatisticsBuilder(bloomFilterBuilder.get())); case DECIMAL: return new DecimalColumnWriter(columnId, type, compression, bufferSize); case TIMESTAMP: - return new TimestampColumnWriter(columnId, type, compression, bufferSize); + return new TimestampColumnWriter(columnId, type, compression, bufferSize, () -> new TimestampStatisticsBuilder(bloomFilterBuilder.get())); case BINARY: return new SliceDirectColumnWriter(columnId, type, compression, bufferSize, BinaryStatisticsBuilder::new); @@ -73,12 +90,12 @@ public static ColumnWriter createColumnWriter( case CHAR: case VARCHAR: case STRING: - return new SliceDictionaryColumnWriter(columnId, type, compression, bufferSize, stringStatisticsLimit); + return new SliceDictionaryColumnWriter(columnId, type, compression, bufferSize, () -> new StringStatisticsBuilder(toIntExact(stringStatisticsLimit.toBytes()), bloomFilterBuilder.get())); case LIST: { OrcColumnId fieldColumnIndex = orcType.getFieldTypeIndex(0); Type fieldType = type.getTypeParameters().get(0); - ColumnWriter elementWriter = createColumnWriter(fieldColumnIndex, orcTypes, fieldType, compression, bufferSize, stringStatisticsLimit); + ColumnWriter elementWriter = createColumnWriter(fieldColumnIndex, orcTypes, fieldType, compression, bufferSize, options, columnName, bloomFilterBuilder); return new ListColumnWriter(columnId, compression, bufferSize, elementWriter); } @@ -89,14 +106,18 @@ public static ColumnWriter createColumnWriter( type.getTypeParameters().get(0), compression, bufferSize, - stringStatisticsLimit); + options, + columnName, + bloomFilterBuilder); ColumnWriter valueWriter = createColumnWriter( orcType.getFieldTypeIndex(1), orcTypes, type.getTypeParameters().get(1), compression, bufferSize, - stringStatisticsLimit); + options, + columnName, + bloomFilterBuilder); return new MapColumnWriter(columnId, compression, bufferSize, keyWriter, valueWriter); } @@ -105,7 +126,15 @@ public static ColumnWriter createColumnWriter( for (int fieldId = 0; fieldId < orcType.getFieldCount(); fieldId++) { OrcColumnId fieldColumnIndex = orcType.getFieldTypeIndex(fieldId); Type fieldType = type.getTypeParameters().get(fieldId); - fieldWriters.add(createColumnWriter(fieldColumnIndex, orcTypes, fieldType, compression, bufferSize, stringStatisticsLimit)); + String colName = isTransactional(options) && columnId.getId() == BASE_OFFSET_FOR_TRANSACTIONAL_TABLE ? orcType.getFieldName(fieldId) : columnName + "." + orcType.getFieldName(fieldId); + fieldWriters.add(createColumnWriter(fieldColumnIndex, + orcTypes, + fieldType, + compression, + bufferSize, + options, + colName, + getBloomFilterBuilder(options, colName, fieldColumnIndex.getId()))); } return new StructColumnWriter(columnId, compression, bufferSize, fieldWriters.build()); } @@ -113,4 +142,17 @@ public static ColumnWriter createColumnWriter( throw new IllegalArgumentException("Unsupported type: " + type); } + + public static Supplier getBloomFilterBuilder(OrcWriterOptions options, String columnName, Integer columnId) + { + if (columnId > options.getBaseIndex() && options.isBloomFilterColumn(columnName)) { + return () -> new Utf8BloomFilterBuilder(options.getRowGroupMaxRowCount(), options.getBloomFilterFpp()); + } + return NoOpBloomFilterBuilder::new; + } + + public static boolean isTransactional(OrcWriterOptions options) + { + return options.getBaseIndex() == 0 ? false : true; + } } diff --git a/presto-orc/src/main/java/io/prestosql/orc/writer/DecimalColumnWriter.java b/presto-orc/src/main/java/io/prestosql/orc/writer/DecimalColumnWriter.java index bd3ac3792..4cd42ee65 100644 --- a/presto-orc/src/main/java/io/prestosql/orc/writer/DecimalColumnWriter.java +++ b/presto-orc/src/main/java/io/prestosql/orc/writer/DecimalColumnWriter.java @@ -254,4 +254,11 @@ public void reset() shortDecimalStatisticsBuilder = new ShortDecimalStatisticsBuilder(this.type.getScale()); longDecimalStatisticsBuilder = new LongDecimalStatisticsBuilder(); } + + @Override + public List getBloomFilters(CompressedMetadataWriter metadataWriter) + throws IOException + { + return ImmutableList.of(); + } } diff --git a/presto-orc/src/main/java/io/prestosql/orc/writer/DoubleColumnWriter.java b/presto-orc/src/main/java/io/prestosql/orc/writer/DoubleColumnWriter.java index 2c71c9fd2..cfec77a4e 100644 --- a/presto-orc/src/main/java/io/prestosql/orc/writer/DoubleColumnWriter.java +++ b/presto-orc/src/main/java/io/prestosql/orc/writer/DoubleColumnWriter.java @@ -32,16 +32,20 @@ import io.prestosql.orc.stream.StreamDataOutput; import io.prestosql.spi.block.Block; import io.prestosql.spi.type.Type; +import io.prestosql.spi.util.BloomFilter; import org.openjdk.jol.info.ClassLayout; import java.io.IOException; import java.util.ArrayList; import java.util.List; import java.util.Map; +import java.util.Objects; import java.util.Optional; +import java.util.function.Supplier; import static com.google.common.base.Preconditions.checkArgument; import static com.google.common.base.Preconditions.checkState; +import static com.google.common.collect.ImmutableList.toImmutableList; import static io.prestosql.orc.metadata.ColumnEncoding.ColumnEncodingKind.DIRECT; import static io.prestosql.orc.metadata.CompressionKind.NONE; import static java.util.Objects.requireNonNull; @@ -59,18 +63,20 @@ public class DoubleColumnWriter private final PresentOutputStream presentStream; private final List rowGroupColumnStatistics = new ArrayList<>(); - - private DoubleStatisticsBuilder statisticsBuilder = new DoubleStatisticsBuilder(); + private final Supplier statisticsBuilderSupplier; + private DoubleStatisticsBuilder statisticsBuilder; private boolean closed; - public DoubleColumnWriter(OrcColumnId columnId, Type type, CompressionKind compression, int bufferSize) + public DoubleColumnWriter(OrcColumnId columnId, Type type, CompressionKind compression, int bufferSize, Supplier statisticsBuilderSupplier) { this.columnId = requireNonNull(columnId, "columnId is null"); this.type = requireNonNull(type, "type is null"); this.compressed = requireNonNull(compression, "compression is null") != NONE; this.dataStream = new DoubleOutputStream(compression, bufferSize); this.presentStream = new PresentOutputStream(compression, bufferSize); + this.statisticsBuilderSupplier = requireNonNull(statisticsBuilderSupplier, "statisticsBuilderSupplier is null"); + this.statisticsBuilder = statisticsBuilderSupplier.get(); } @Override @@ -113,7 +119,7 @@ public Map finishRowGroup() checkState(!closed); ColumnStatistics statistics = statisticsBuilder.buildColumnStatistics(); rowGroupColumnStatistics.add(statistics); - statisticsBuilder = new DoubleStatisticsBuilder(); + statisticsBuilder = statisticsBuilderSupplier.get(); return ImmutableMap.of(columnId, statistics); } @@ -156,6 +162,24 @@ public List getIndexStreams(CompressedMetadataWriter metadataW return ImmutableList.of(new StreamDataOutput(slice, stream)); } + @Override + public List getBloomFilters(CompressedMetadataWriter metadataWriter) + throws IOException + { + List bloomFilters = rowGroupColumnStatistics.stream() + .map(ColumnStatistics::getBloomFilter) + .filter(Objects::nonNull) + .collect(toImmutableList()); + + if (!bloomFilters.isEmpty()) { + Slice slice = metadataWriter.writeBloomFilters(bloomFilters); + Stream stream = new Stream(columnId, StreamKind.BLOOM_FILTER_UTF8, slice.length(), false); + return ImmutableList.of(new StreamDataOutput(slice, stream)); + } + + return ImmutableList.of(); + } + private static List createDoubleColumnPositionList( boolean compressed, DoubleStreamCheckpoint dataCheckpoint, @@ -201,6 +225,6 @@ public void reset() dataStream.reset(); presentStream.reset(); rowGroupColumnStatistics.clear(); - statisticsBuilder = new DoubleStatisticsBuilder(); + statisticsBuilder = statisticsBuilderSupplier.get(); } } diff --git a/presto-orc/src/main/java/io/prestosql/orc/writer/FloatColumnWriter.java b/presto-orc/src/main/java/io/prestosql/orc/writer/FloatColumnWriter.java index 52912d3ae..a01f274ea 100644 --- a/presto-orc/src/main/java/io/prestosql/orc/writer/FloatColumnWriter.java +++ b/presto-orc/src/main/java/io/prestosql/orc/writer/FloatColumnWriter.java @@ -32,16 +32,20 @@ import io.prestosql.orc.stream.StreamDataOutput; import io.prestosql.spi.block.Block; import io.prestosql.spi.type.Type; +import io.prestosql.spi.util.BloomFilter; import org.openjdk.jol.info.ClassLayout; import java.io.IOException; import java.util.ArrayList; import java.util.List; import java.util.Map; +import java.util.Objects; import java.util.Optional; +import java.util.function.Supplier; import static com.google.common.base.Preconditions.checkArgument; import static com.google.common.base.Preconditions.checkState; +import static com.google.common.collect.ImmutableList.toImmutableList; import static io.prestosql.orc.metadata.ColumnEncoding.ColumnEncodingKind.DIRECT; import static io.prestosql.orc.metadata.CompressionKind.NONE; import static java.lang.Float.intBitsToFloat; @@ -61,17 +65,20 @@ public class FloatColumnWriter private final List rowGroupColumnStatistics = new ArrayList<>(); - private DoubleStatisticsBuilder statisticsBuilder = new DoubleStatisticsBuilder(); + private final Supplier statisticsBuilderSupplier; + private DoubleStatisticsBuilder statisticsBuilder; private boolean closed; - public FloatColumnWriter(OrcColumnId columnId, Type type, CompressionKind compression, int bufferSize) + public FloatColumnWriter(OrcColumnId columnId, Type type, CompressionKind compression, int bufferSize, Supplier statisticsBuilderSupplier) { this.columnId = requireNonNull(columnId, "columnId is null"); this.type = requireNonNull(type, "type is null"); this.compressed = requireNonNull(compression, "compression is null") != NONE; this.dataStream = new FloatOutputStream(compression, bufferSize); this.presentStream = new PresentOutputStream(compression, bufferSize); + this.statisticsBuilderSupplier = requireNonNull(statisticsBuilderSupplier, "statisticsBuilderSupplier is null"); + this.statisticsBuilder = statisticsBuilderSupplier.get(); } @Override @@ -115,7 +122,7 @@ public Map finishRowGroup() checkState(!closed); ColumnStatistics statistics = statisticsBuilder.buildColumnStatistics(); rowGroupColumnStatistics.add(statistics); - statisticsBuilder = new DoubleStatisticsBuilder(); + statisticsBuilder = statisticsBuilderSupplier.get(); return ImmutableMap.of(columnId, statistics); } @@ -203,6 +210,24 @@ public void reset() dataStream.reset(); presentStream.reset(); rowGroupColumnStatistics.clear(); - statisticsBuilder = new DoubleStatisticsBuilder(); + statisticsBuilder = statisticsBuilderSupplier.get(); + } + + @Override + public List getBloomFilters(CompressedMetadataWriter metadataWriter) + throws IOException + { + List bloomFilters = rowGroupColumnStatistics.stream() + .map(ColumnStatistics::getBloomFilter) + .filter(Objects::nonNull) + .collect(toImmutableList()); + + if (!bloomFilters.isEmpty()) { + Slice slice = metadataWriter.writeBloomFilters(bloomFilters); + Stream stream = new Stream(columnId, StreamKind.BLOOM_FILTER_UTF8, slice.length(), false); + return ImmutableList.of(new StreamDataOutput(slice, stream)); + } + + return ImmutableList.of(); } } diff --git a/presto-orc/src/main/java/io/prestosql/orc/writer/ListColumnWriter.java b/presto-orc/src/main/java/io/prestosql/orc/writer/ListColumnWriter.java index eba74d63d..a3bfea75d 100644 --- a/presto-orc/src/main/java/io/prestosql/orc/writer/ListColumnWriter.java +++ b/presto-orc/src/main/java/io/prestosql/orc/writer/ListColumnWriter.java @@ -135,7 +135,7 @@ public Map finishRowGroup() { checkState(!closed); - ColumnStatistics statistics = new ColumnStatistics((long) nonNullValueCount, 0, null, null, null, null, null, null, null, null); + ColumnStatistics statistics = new ColumnStatistics((long) nonNullValueCount, 0, null, null, null, null, null, null, null, null, null); rowGroupColumnStatistics.add(statistics); nonNullValueCount = 0; @@ -189,6 +189,7 @@ public List getIndexStreams(CompressedMetadataWriter metadataW ImmutableList.Builder indexStreams = ImmutableList.builder(); indexStreams.add(new StreamDataOutput(slice, stream)); indexStreams.addAll(elementWriter.getIndexStreams(metadataWriter)); + indexStreams.addAll(elementWriter.getBloomFilters(metadataWriter)); return indexStreams.build(); } @@ -241,4 +242,11 @@ public void reset() rowGroupColumnStatistics.clear(); nonNullValueCount = 0; } + + @Override + public List getBloomFilters(CompressedMetadataWriter metadataWriter) + throws IOException + { + return ImmutableList.of(); + } } diff --git a/presto-orc/src/main/java/io/prestosql/orc/writer/LongColumnWriter.java b/presto-orc/src/main/java/io/prestosql/orc/writer/LongColumnWriter.java index 4fd7fc4fd..fd091f70f 100644 --- a/presto-orc/src/main/java/io/prestosql/orc/writer/LongColumnWriter.java +++ b/presto-orc/src/main/java/io/prestosql/orc/writer/LongColumnWriter.java @@ -33,17 +33,20 @@ import io.prestosql.orc.stream.StreamDataOutput; import io.prestosql.spi.block.Block; import io.prestosql.spi.type.Type; +import io.prestosql.spi.util.BloomFilter; import org.openjdk.jol.info.ClassLayout; import java.io.IOException; import java.util.ArrayList; import java.util.List; import java.util.Map; +import java.util.Objects; import java.util.Optional; import java.util.function.Supplier; import static com.google.common.base.Preconditions.checkArgument; import static com.google.common.base.Preconditions.checkState; +import static com.google.common.collect.ImmutableList.toImmutableList; import static io.prestosql.orc.metadata.ColumnEncoding.ColumnEncodingKind.DIRECT_V2; import static io.prestosql.orc.metadata.CompressionKind.NONE; import static io.prestosql.orc.metadata.Stream.StreamKind.DATA; @@ -209,4 +212,22 @@ public void reset() rowGroupColumnStatistics.clear(); statisticsBuilder = statisticsBuilderSupplier.get(); } + + @Override + public List getBloomFilters(CompressedMetadataWriter metadataWriter) + throws IOException + { + List bloomFilters = rowGroupColumnStatistics.stream() + .map(ColumnStatistics::getBloomFilter) + .filter(Objects::nonNull) + .collect(toImmutableList()); + + if (!bloomFilters.isEmpty()) { + Slice slice = metadataWriter.writeBloomFilters(bloomFilters); + Stream stream = new Stream(columnId, StreamKind.BLOOM_FILTER_UTF8, slice.length(), false); + return ImmutableList.of(new StreamDataOutput(slice, stream)); + } + + return ImmutableList.of(); + } } diff --git a/presto-orc/src/main/java/io/prestosql/orc/writer/MapColumnWriter.java b/presto-orc/src/main/java/io/prestosql/orc/writer/MapColumnWriter.java index 847c79d4c..03ff22274 100644 --- a/presto-orc/src/main/java/io/prestosql/orc/writer/MapColumnWriter.java +++ b/presto-orc/src/main/java/io/prestosql/orc/writer/MapColumnWriter.java @@ -142,7 +142,7 @@ public Map finishRowGroup() { checkState(!closed); - ColumnStatistics statistics = new ColumnStatistics((long) nonNullValueCount, 0, null, null, null, null, null, null, null, null); + ColumnStatistics statistics = new ColumnStatistics((long) nonNullValueCount, 0, null, null, null, null, null, null, null, null, null); rowGroupColumnStatistics.add(statistics); nonNullValueCount = 0; @@ -199,7 +199,9 @@ public List getIndexStreams(CompressedMetadataWriter metadataW ImmutableList.Builder indexStreams = ImmutableList.builder(); indexStreams.add(new StreamDataOutput(slice, stream)); indexStreams.addAll(keyWriter.getIndexStreams(metadataWriter)); + indexStreams.addAll(keyWriter.getBloomFilters(metadataWriter)); indexStreams.addAll(valueWriter.getIndexStreams(metadataWriter)); + indexStreams.addAll(valueWriter.getBloomFilters(metadataWriter)); return indexStreams.build(); } @@ -254,4 +256,11 @@ public void reset() rowGroupColumnStatistics.clear(); nonNullValueCount = 0; } + + @Override + public List getBloomFilters(CompressedMetadataWriter metadataWriter) + throws IOException + { + return ImmutableList.of(); + } } diff --git a/presto-orc/src/main/java/io/prestosql/orc/writer/SliceDictionaryColumnWriter.java b/presto-orc/src/main/java/io/prestosql/orc/writer/SliceDictionaryColumnWriter.java index 43b5033bd..dd4c66028 100644 --- a/presto-orc/src/main/java/io/prestosql/orc/writer/SliceDictionaryColumnWriter.java +++ b/presto-orc/src/main/java/io/prestosql/orc/writer/SliceDictionaryColumnWriter.java @@ -29,7 +29,7 @@ import io.prestosql.orc.metadata.Stream; import io.prestosql.orc.metadata.Stream.StreamKind; import io.prestosql.orc.metadata.statistics.ColumnStatistics; -import io.prestosql.orc.metadata.statistics.StringStatisticsBuilder; +import io.prestosql.orc.metadata.statistics.SliceColumnStatisticsBuilder; import io.prestosql.orc.stream.ByteArrayOutputStream; import io.prestosql.orc.stream.LongOutputStream; import io.prestosql.orc.stream.LongOutputStreamV2; @@ -38,6 +38,7 @@ import io.prestosql.spi.block.Block; import io.prestosql.spi.block.DictionaryBlock; import io.prestosql.spi.type.Type; +import io.prestosql.spi.util.BloomFilter; import it.unimi.dsi.fastutil.ints.AbstractIntComparator; import it.unimi.dsi.fastutil.ints.IntArrays; import org.openjdk.jol.info.ClassLayout; @@ -46,11 +47,14 @@ import java.util.ArrayList; import java.util.List; import java.util.Map; +import java.util.Objects; import java.util.Optional; import java.util.OptionalInt; +import java.util.function.Supplier; import static com.google.common.base.Preconditions.checkArgument; import static com.google.common.base.Preconditions.checkState; +import static com.google.common.collect.ImmutableList.toImmutableList; import static io.airlift.units.DataSize.Unit.MEGABYTE; import static io.prestosql.orc.DictionaryCompressionOptimizer.estimateIndexBytesPerValue; import static io.prestosql.orc.metadata.ColumnEncoding.ColumnEncodingKind.DICTIONARY_V2; @@ -71,7 +75,6 @@ public class SliceDictionaryColumnWriter private final Type type; private final CompressionKind compression; private final int bufferSize; - private final int stringStatisticsLimitInBytes; private final LongOutputStream dataStream; private final PresentOutputStream presentStream; @@ -81,10 +84,11 @@ public class SliceDictionaryColumnWriter private final DictionaryBuilder dictionary = new DictionaryBuilder(10000); private final List rowGroups = new ArrayList<>(); + private final Supplier statisticsBuilderSupplier; private IntBigArray values; private int rowGroupValueCount; - private StringStatisticsBuilder statisticsBuilder; + private SliceColumnStatisticsBuilder statisticsBuilder; private long rawBytes; private long totalValueCount; @@ -97,19 +101,19 @@ public class SliceDictionaryColumnWriter private boolean directEncoded; private SliceDirectColumnWriter directColumnWriter; - public SliceDictionaryColumnWriter(OrcColumnId columnId, Type type, CompressionKind compression, int bufferSize, DataSize stringStatisticsLimit) + public SliceDictionaryColumnWriter(OrcColumnId columnId, Type type, CompressionKind compression, int bufferSize, Supplier statisticsBuilderSupplier) { this.columnId = requireNonNull(columnId, "columnId is null"); this.type = requireNonNull(type, "type is null"); this.compression = requireNonNull(compression, "compression is null"); this.bufferSize = bufferSize; - this.stringStatisticsLimitInBytes = toIntExact(requireNonNull(stringStatisticsLimit, "stringStatisticsLimit is null").toBytes()); this.dataStream = new LongOutputStreamV2(compression, bufferSize, false, DATA); this.presentStream = new PresentOutputStream(compression, bufferSize); this.dictionaryDataStream = new ByteArrayOutputStream(compression, bufferSize, StreamKind.DICTIONARY_DATA); this.dictionaryLengthStream = createLengthOutputStream(compression, bufferSize); values = new IntBigArray(); - this.statisticsBuilder = newStringStatisticsBuilder(); + this.statisticsBuilderSupplier = requireNonNull(statisticsBuilderSupplier, "statisticsBuilderSupplier is null"); + this.statisticsBuilder = statisticsBuilderSupplier.get(); } @Override @@ -160,7 +164,7 @@ public OptionalInt tryConvertToDirect(int maxDirectBytes) checkState(!closed); checkState(!directEncoded); if (directColumnWriter == null) { - directColumnWriter = new SliceDirectColumnWriter(columnId, type, compression, bufferSize, this::newStringStatisticsBuilder); + directColumnWriter = new SliceDirectColumnWriter(columnId, type, compression, bufferSize, statisticsBuilderSupplier); } checkState(directColumnWriter.getBufferedBytes() == 0); @@ -190,8 +194,6 @@ public OptionalInt tryConvertToDirect(int maxDirectBytes) checkState(rowGroupValueCount == 0); } - rowGroups.clear(); - // free the dictionary dictionary.clear(); @@ -200,7 +202,7 @@ public OptionalInt tryConvertToDirect(int maxDirectBytes) totalNonNullValueCount = 0; rowGroupValueCount = 0; - statisticsBuilder = newStringStatisticsBuilder(); + statisticsBuilder = statisticsBuilderSupplier.get(); directEncoded = true; @@ -310,7 +312,7 @@ public Map finishRowGroup() ColumnStatistics statistics = statisticsBuilder.buildColumnStatistics(); rowGroups.add(new DictionaryRowGroup(values, rowGroupValueCount, statistics)); rowGroupValueCount = 0; - statisticsBuilder = newStringStatisticsBuilder(); + statisticsBuilder = statisticsBuilderSupplier.get(); values = new IntBigArray(); return ImmutableMap.of(columnId, statistics); } @@ -537,7 +539,7 @@ public void reset() dictionaryLengthStream.reset(); rowGroups.clear(); rowGroupValueCount = 0; - statisticsBuilder = newStringStatisticsBuilder(); + statisticsBuilder = statisticsBuilderSupplier.get(); columnEncoding = null; dictionary.clear(); @@ -551,9 +553,22 @@ public void reset() } } - private StringStatisticsBuilder newStringStatisticsBuilder() + @Override + public List getBloomFilters(CompressedMetadataWriter metadataWriter) + throws IOException { - return new StringStatisticsBuilder(stringStatisticsLimitInBytes); + List bloomFilters = rowGroups.stream() + .map(rowGroup -> rowGroup.getColumnStatistics().getBloomFilter()) + .filter(Objects::nonNull) + .collect(toImmutableList()); + + if (!bloomFilters.isEmpty()) { + Slice slice = metadataWriter.writeBloomFilters(bloomFilters); + Stream stream = new Stream(columnId, StreamKind.BLOOM_FILTER_UTF8, slice.length(), false); + return ImmutableList.of(new StreamDataOutput(slice, stream)); + } + + return ImmutableList.of(); } private static class DictionaryRowGroup diff --git a/presto-orc/src/main/java/io/prestosql/orc/writer/SliceDirectColumnWriter.java b/presto-orc/src/main/java/io/prestosql/orc/writer/SliceDirectColumnWriter.java index 0ae2ff3b0..d612c8407 100644 --- a/presto-orc/src/main/java/io/prestosql/orc/writer/SliceDirectColumnWriter.java +++ b/presto-orc/src/main/java/io/prestosql/orc/writer/SliceDirectColumnWriter.java @@ -34,17 +34,20 @@ import io.prestosql.orc.stream.StreamDataOutput; import io.prestosql.spi.block.Block; import io.prestosql.spi.type.Type; +import io.prestosql.spi.util.BloomFilter; import org.openjdk.jol.info.ClassLayout; import java.io.IOException; import java.util.ArrayList; import java.util.List; import java.util.Map; +import java.util.Objects; import java.util.Optional; import java.util.function.Supplier; import static com.google.common.base.Preconditions.checkArgument; import static com.google.common.base.Preconditions.checkState; +import static com.google.common.collect.ImmutableList.toImmutableList; import static io.prestosql.orc.metadata.ColumnEncoding.ColumnEncodingKind.DIRECT_V2; import static io.prestosql.orc.metadata.CompressionKind.NONE; import static io.prestosql.orc.stream.LongOutputStream.createLengthOutputStream; @@ -226,4 +229,22 @@ public void reset() rowGroupColumnStatistics.clear(); statisticsBuilder = statisticsBuilderSupplier.get(); } + + @Override + public List getBloomFilters(CompressedMetadataWriter metadataWriter) + throws IOException + { + List bloomFilters = rowGroupColumnStatistics.stream() + .map(ColumnStatistics::getBloomFilter) + .filter(Objects::nonNull) + .collect(toImmutableList()); + + if (!bloomFilters.isEmpty()) { + Slice slice = metadataWriter.writeBloomFilters(bloomFilters); + Stream stream = new Stream(columnId, StreamKind.BLOOM_FILTER_UTF8, slice.length(), false); + return ImmutableList.of(new StreamDataOutput(slice, stream)); + } + + return ImmutableList.of(); + } } diff --git a/presto-orc/src/main/java/io/prestosql/orc/writer/StructColumnWriter.java b/presto-orc/src/main/java/io/prestosql/orc/writer/StructColumnWriter.java index fbc7641ca..9f6cc2765 100644 --- a/presto-orc/src/main/java/io/prestosql/orc/writer/StructColumnWriter.java +++ b/presto-orc/src/main/java/io/prestosql/orc/writer/StructColumnWriter.java @@ -135,7 +135,7 @@ private void writeColumnarRow(ColumnarRow columnarRow) public Map finishRowGroup() { checkState(!closed); - ColumnStatistics statistics = new ColumnStatistics((long) nonNullValueCount, 0, null, null, null, null, null, null, null, null); + ColumnStatistics statistics = new ColumnStatistics((long) nonNullValueCount, 0, null, null, null, null, null, null, null, null, null); rowGroupColumnStatistics.add(statistics); nonNullValueCount = 0; @@ -191,6 +191,7 @@ public List getIndexStreams(CompressedMetadataWriter metadataW indexStreams.add(new StreamDataOutput(slice, stream)); for (ColumnWriter structField : structFields) { indexStreams.addAll(structField.getIndexStreams(metadataWriter)); + indexStreams.addAll(structField.getBloomFilters(metadataWriter)); } return indexStreams.build(); } @@ -249,4 +250,11 @@ public void reset() rowGroupColumnStatistics.clear(); nonNullValueCount = 0; } + + @Override + public List getBloomFilters(CompressedMetadataWriter metadataWriter) + throws IOException + { + return ImmutableList.of(); + } } diff --git a/presto-orc/src/main/java/io/prestosql/orc/writer/TimestampColumnWriter.java b/presto-orc/src/main/java/io/prestosql/orc/writer/TimestampColumnWriter.java index 401ebe2e9..11c842c8d 100644 --- a/presto-orc/src/main/java/io/prestosql/orc/writer/TimestampColumnWriter.java +++ b/presto-orc/src/main/java/io/prestosql/orc/writer/TimestampColumnWriter.java @@ -26,12 +26,14 @@ import io.prestosql.orc.metadata.Stream; import io.prestosql.orc.metadata.Stream.StreamKind; import io.prestosql.orc.metadata.statistics.ColumnStatistics; +import io.prestosql.orc.metadata.statistics.TimestampStatisticsBuilder; import io.prestosql.orc.stream.LongOutputStream; import io.prestosql.orc.stream.LongOutputStreamV2; import io.prestosql.orc.stream.PresentOutputStream; import io.prestosql.orc.stream.StreamDataOutput; import io.prestosql.spi.block.Block; import io.prestosql.spi.type.Type; +import io.prestosql.spi.util.BloomFilter; import org.openjdk.jol.info.ClassLayout; import java.io.IOException; @@ -39,10 +41,13 @@ import java.util.ArrayList; import java.util.List; import java.util.Map; +import java.util.Objects; import java.util.Optional; +import java.util.function.Supplier; import static com.google.common.base.Preconditions.checkArgument; import static com.google.common.base.Preconditions.checkState; +import static com.google.common.collect.ImmutableList.toImmutableList; import static io.prestosql.orc.metadata.ColumnEncoding.ColumnEncodingKind.DIRECT_V2; import static io.prestosql.orc.metadata.CompressionKind.NONE; import static io.prestosql.orc.metadata.Stream.StreamKind.DATA; @@ -68,11 +73,12 @@ public class TimestampColumnWriter private final List rowGroupColumnStatistics = new ArrayList<>(); private final long baseTimestampInSeconds; - private int nonNullValueCount; + private final Supplier statisticsBuilderSupplier; + private TimestampStatisticsBuilder statisticsBuilder; private boolean closed; - public TimestampColumnWriter(OrcColumnId columnId, Type type, CompressionKind compression, int bufferSize) + public TimestampColumnWriter(OrcColumnId columnId, Type type, CompressionKind compression, int bufferSize, Supplier statisticsBuilderSupplier) { this.columnId = requireNonNull(columnId, "columnId is null"); this.type = requireNonNull(type, "type is null"); @@ -82,6 +88,8 @@ public TimestampColumnWriter(OrcColumnId columnId, Type type, CompressionKind co this.nanosStream = new LongOutputStreamV2(compression, bufferSize, false, SECONDARY); this.presentStream = new PresentOutputStream(compression, bufferSize); this.baseTimestampInSeconds = OffsetDateTime.of(2015, 1, 1, 0, 0, 0, 0, UTC).toEpochSecond(); + this.statisticsBuilderSupplier = requireNonNull(statisticsBuilderSupplier, "statisticsBuilderSupplier is null"); + this.statisticsBuilder = statisticsBuilderSupplier.get(); } @Override @@ -141,7 +149,7 @@ public void writeBlock(Block block) secondsStream.writeLong(seconds); nanosStream.writeLong(encodedNanos); - nonNullValueCount++; + statisticsBuilder.addValue(value); } } } @@ -150,9 +158,9 @@ public void writeBlock(Block block) public Map finishRowGroup() { checkState(!closed); - ColumnStatistics statistics = new ColumnStatistics((long) nonNullValueCount, 0, null, null, null, null, null, null, null, null); + ColumnStatistics statistics = statisticsBuilder.buildColumnStatistics(); rowGroupColumnStatistics.add(statistics); - nonNullValueCount = 0; + statisticsBuilder = statisticsBuilderSupplier.get(); return ImmutableMap.of(columnId, statistics); } @@ -232,7 +240,7 @@ public long getBufferedBytes() @Override public long getRetainedBytes() { - long retainedBytes = secondsStream.getRetainedBytes() + nanosStream.getRetainedBytes() + presentStream.getRetainedBytes(); + long retainedBytes = INSTANCE_SIZE + secondsStream.getRetainedBytes() + nanosStream.getRetainedBytes() + presentStream.getRetainedBytes(); for (ColumnStatistics statistics : rowGroupColumnStatistics) { retainedBytes += statistics.getRetainedSizeInBytes(); } @@ -246,7 +254,25 @@ public void reset() secondsStream.reset(); nanosStream.reset(); presentStream.reset(); + statisticsBuilder = statisticsBuilderSupplier.get(); rowGroupColumnStatistics.clear(); - nonNullValueCount = 0; + } + + @Override + public List getBloomFilters(CompressedMetadataWriter metadataWriter) + throws IOException + { + List bloomFilters = rowGroupColumnStatistics.stream() + .map(ColumnStatistics::getBloomFilter) + .filter(Objects::nonNull) + .collect(toImmutableList()); + + if (!bloomFilters.isEmpty()) { + Slice slice = metadataWriter.writeBloomFilters(bloomFilters); + Stream stream = new Stream(columnId, StreamKind.BLOOM_FILTER_UTF8, slice.length(), false); + return ImmutableList.of(new StreamDataOutput(slice, stream)); + } + + return ImmutableList.of(); } } diff --git a/presto-orc/src/test/java/io/prestosql/orc/TestOrcBloomFilters.java b/presto-orc/src/test/java/io/prestosql/orc/TestOrcBloomFilters.java index 81198ae7a..4ba1fe5be 100644 --- a/presto-orc/src/test/java/io/prestosql/orc/TestOrcBloomFilters.java +++ b/presto-orc/src/test/java/io/prestosql/orc/TestOrcBloomFilters.java @@ -317,6 +317,7 @@ public void testMatches() null, null, null, + null, toBloomFilter(orcBloomFilter)))); ColumnMetadata nonMatchingStatisticsByColumnIndex = new ColumnMetadata<>(ImmutableList.of(new ColumnStatistics( @@ -329,6 +330,7 @@ public void testMatches() null, null, null, + null, toBloomFilter(emptyOrcBloomFilter)))); ColumnMetadata withoutBloomFilterStatisticsByColumnIndex = new ColumnMetadata<>(ImmutableList.of(new ColumnStatistics( @@ -341,6 +343,7 @@ public void testMatches() null, null, null, + null, null))); assertTrue(predicate.matches(1L, matchingStatisticsByColumnIndex)); diff --git a/presto-orc/src/test/java/io/prestosql/orc/TestSliceDictionaryColumnWriter.java b/presto-orc/src/test/java/io/prestosql/orc/TestSliceDictionaryColumnWriter.java index 7965c5a1f..69b28b5d0 100644 --- a/presto-orc/src/test/java/io/prestosql/orc/TestSliceDictionaryColumnWriter.java +++ b/presto-orc/src/test/java/io/prestosql/orc/TestSliceDictionaryColumnWriter.java @@ -16,6 +16,8 @@ import io.airlift.slice.Slices; import io.airlift.units.DataSize; import io.prestosql.orc.metadata.CompressionKind; +import io.prestosql.orc.metadata.statistics.NoOpBloomFilterBuilder; +import io.prestosql.orc.metadata.statistics.StringStatisticsBuilder; import io.prestosql.orc.writer.SliceDictionaryColumnWriter; import io.prestosql.spi.block.Block; import io.prestosql.spi.block.RunLengthEncodedBlock; @@ -41,7 +43,7 @@ public void testDirectConversion() VARCHAR, CompressionKind.NONE, toIntExact(DEFAULT_MAX_COMPRESSION_BUFFER_SIZE.toBytes()), - DEFAULT_MAX_STRING_STATISTICS_LIMIT); + () -> new StringStatisticsBuilder(toIntExact(DEFAULT_MAX_STRING_STATISTICS_LIMIT.toBytes()), new NoOpBloomFilterBuilder())); // a single row group exceeds 2G after direct conversion byte[] value = new byte[megabytes(1)]; diff --git a/presto-orc/src/test/java/io/prestosql/orc/TestTupleDomainOrcPredicate.java b/presto-orc/src/test/java/io/prestosql/orc/TestTupleDomainOrcPredicate.java index 09d68a02e..9cc5b3977 100644 --- a/presto-orc/src/test/java/io/prestosql/orc/TestTupleDomainOrcPredicate.java +++ b/presto-orc/src/test/java/io/prestosql/orc/TestTupleDomainOrcPredicate.java @@ -88,7 +88,7 @@ private static ColumnStatistics booleanColumnStats(Long numberOfValues, Long tru if (trueValueCount != null) { booleanStatistics = new BooleanStatistics(trueValueCount); } - return new ColumnStatistics(numberOfValues, 2L, booleanStatistics, null, null, null, null, null, null, null); + return new ColumnStatistics(numberOfValues, 2L, booleanStatistics, null, null, null, null, null, null, null, null); } @Test @@ -117,7 +117,7 @@ public void testBigint() private static ColumnStatistics integerColumnStats(Long numberOfValues, Long minimum, Long maximum) { - return new ColumnStatistics(numberOfValues, 9L, null, new IntegerStatistics(minimum, maximum, null), null, null, null, null, null, null); + return new ColumnStatistics(numberOfValues, 9L, null, new IntegerStatistics(minimum, maximum, null), null, null, null, null, null, null, null); } @Test @@ -146,7 +146,7 @@ public void testDouble() private static ColumnStatistics doubleColumnStats(Long numberOfValues, Double minimum, Double maximum) { - return new ColumnStatistics(numberOfValues, 9L, null, null, new DoubleStatistics(minimum, maximum), null, null, null, null, null); + return new ColumnStatistics(numberOfValues, 9L, null, null, new DoubleStatistics(minimum, maximum), null, null, null, null, null, null); } @Test @@ -236,7 +236,7 @@ private static ColumnStatistics stringColumnStats(Long numberOfValues, String mi Slice minimumSlice = minimum == null ? null : utf8Slice(minimum); Slice maximumSlice = maximum == null ? null : utf8Slice(maximum); // sum and minAverageValueSizeInBytes are not used in this test; they could be arbitrary numbers - return new ColumnStatistics(numberOfValues, 10L, null, null, null, new StringStatistics(minimumSlice, maximumSlice, 100L), null, null, null, null); + return new ColumnStatistics(numberOfValues, 10L, null, null, null, new StringStatistics(minimumSlice, maximumSlice, 100L), null, null, null, null, null); } @Test @@ -265,7 +265,7 @@ public void testDate() private static ColumnStatistics dateColumnStats(Long numberOfValues, Integer minimum, Integer maximum) { - return new ColumnStatistics(numberOfValues, 5L, null, null, null, null, new DateStatistics(minimum, maximum), null, null, null); + return new ColumnStatistics(numberOfValues, 5L, null, null, null, null, new DateStatistics(minimum, maximum), null, null, null, null); } @Test @@ -321,7 +321,7 @@ private static ColumnStatistics decimalColumnStats(Long numberOfValues, String m { BigDecimal minimumDecimal = minimum == null ? null : new BigDecimal(minimum); BigDecimal maximumDecimal = maximum == null ? null : new BigDecimal(maximum); - return new ColumnStatistics(numberOfValues, 9L, null, null, null, null, null, new DecimalStatistics(minimumDecimal, maximumDecimal, SHORT_DECIMAL_VALUE_BYTES), null, null); + return new ColumnStatistics(numberOfValues, 9L, null, null, null, null, null, null, new DecimalStatistics(minimumDecimal, maximumDecimal, SHORT_DECIMAL_VALUE_BYTES), null, null); } @Test @@ -343,7 +343,7 @@ public void testBinary() private static ColumnStatistics binaryColumnStats(Long numberOfValues) { // sum and minAverageValueSizeInBytes are not used in this test; they could be arbitrary numbers - return new ColumnStatistics(numberOfValues, 10L, null, null, null, null, null, null, new BinaryStatistics(100L), null); + return new ColumnStatistics(numberOfValues, 10L, null, null, null, null, null, null, null, new BinaryStatistics(100L), null); } private static Long shortDecimal(String value) diff --git a/presto-orc/src/test/java/io/prestosql/orc/metadata/statistics/AbstractStatisticsBuilderTest.java b/presto-orc/src/test/java/io/prestosql/orc/metadata/statistics/AbstractStatisticsBuilderTest.java index 5cd77707b..b3685b937 100644 --- a/presto-orc/src/test/java/io/prestosql/orc/metadata/statistics/AbstractStatisticsBuilderTest.java +++ b/presto-orc/src/test/java/io/prestosql/orc/metadata/statistics/AbstractStatisticsBuilderTest.java @@ -171,7 +171,7 @@ private void assertColumnStatistics( static List insertEmptyColumnStatisticsAt(List statisticsList, int index, long numberOfValues) { List newStatisticsList = new ArrayList<>(statisticsList); - newStatisticsList.add(index, new ColumnStatistics(numberOfValues, 0, null, null, null, null, null, null, null, null)); + newStatisticsList.add(index, new ColumnStatistics(numberOfValues, 0, null, null, null, null, null, null, null, null, null)); return newStatisticsList; } diff --git a/presto-orc/src/test/java/io/prestosql/orc/metadata/statistics/TestDateStatisticsBuilder.java b/presto-orc/src/test/java/io/prestosql/orc/metadata/statistics/TestDateStatisticsBuilder.java index 440088c39..d36bbec4e 100644 --- a/presto-orc/src/test/java/io/prestosql/orc/metadata/statistics/TestDateStatisticsBuilder.java +++ b/presto-orc/src/test/java/io/prestosql/orc/metadata/statistics/TestDateStatisticsBuilder.java @@ -24,6 +24,9 @@ import static io.prestosql.orc.metadata.statistics.DateStatistics.DATE_VALUE_BYTES; import static java.lang.Integer.MAX_VALUE; import static java.lang.Integer.MIN_VALUE; +import static org.testng.Assert.assertFalse; +import static org.testng.Assert.assertNotNull; +import static org.testng.Assert.assertTrue; import static org.testng.Assert.fail; public class TestDateStatisticsBuilder @@ -33,7 +36,7 @@ public class TestDateStatisticsBuilder public TestDateStatisticsBuilder() { - super(DATE, DateStatisticsBuilder::new, DateStatisticsBuilder::addValue); + super(DATE, () -> new DateStatisticsBuilder(new NoOpBloomFilterBuilder()), DateStatisticsBuilder::addValue); } @Test @@ -61,7 +64,7 @@ public void testMinMaxValues() public void testValueOutOfRange() { try { - new DateStatisticsBuilder().addValue(MAX_VALUE + 1L); + new DateStatisticsBuilder(new NoOpBloomFilterBuilder()).addValue(MAX_VALUE + 1L); fail("Expected ArithmeticException"); } catch (ArithmeticException expected) { @@ -69,7 +72,7 @@ public void testValueOutOfRange() } try { - new DateStatisticsBuilder().addValue(MIN_VALUE - 1L); + new DateStatisticsBuilder(new NoOpBloomFilterBuilder()).addValue(MIN_VALUE - 1L); fail("Expected ArithmeticException"); } catch (ArithmeticException expected) { @@ -85,4 +88,19 @@ public void testMinAverageValueBytes() assertMinAverageValueBytes(DATE_VALUE_BYTES, ImmutableList.of(0)); assertMinAverageValueBytes(DATE_VALUE_BYTES, ImmutableList.of(0, 42, 42, 43)); } + + @Test + public void testUtf8BloomFilter() + { + DateStatisticsBuilder statisticsBuilder = new DateStatisticsBuilder(new Utf8BloomFilterBuilder(3, 0.001)); + statisticsBuilder.addValue(1234); + statisticsBuilder.addValue(5678); + statisticsBuilder.addValue(9012); + HashableBloomFilter hashableBloomFilter = statisticsBuilder.buildColumnStatistics().getBloomFilter(); + assertNotNull(hashableBloomFilter); + assertTrue(hashableBloomFilter.test(1234)); + assertTrue(hashableBloomFilter.test(5678)); + assertTrue(hashableBloomFilter.test(9012)); + assertFalse(hashableBloomFilter.test(3456)); + } } diff --git a/presto-orc/src/test/java/io/prestosql/orc/metadata/statistics/TestDoubleStatisticsBuilder.java b/presto-orc/src/test/java/io/prestosql/orc/metadata/statistics/TestDoubleStatisticsBuilder.java index 5241dd838..b9788730b 100644 --- a/presto-orc/src/test/java/io/prestosql/orc/metadata/statistics/TestDoubleStatisticsBuilder.java +++ b/presto-orc/src/test/java/io/prestosql/orc/metadata/statistics/TestDoubleStatisticsBuilder.java @@ -26,6 +26,9 @@ import static java.lang.Double.NEGATIVE_INFINITY; import static java.lang.Double.NaN; import static java.lang.Double.POSITIVE_INFINITY; +import static org.testng.Assert.assertFalse; +import static org.testng.Assert.assertNotNull; +import static org.testng.Assert.assertTrue; public class TestDoubleStatisticsBuilder extends AbstractStatisticsBuilderTest @@ -34,7 +37,7 @@ public class TestDoubleStatisticsBuilder public TestDoubleStatisticsBuilder() { - super(DOUBLE, DoubleStatisticsBuilder::new, DoubleStatisticsBuilder::addValue); + super(DOUBLE, () -> new DoubleStatisticsBuilder(new NoOpBloomFilterBuilder()), DoubleStatisticsBuilder::addValue); } @Test @@ -66,7 +69,7 @@ private static List toDoubleList(Double minValue, Double maxValue, List< @Test public void testNanValue() { - DoubleStatisticsBuilder statisticsBuilder = new DoubleStatisticsBuilder(); + DoubleStatisticsBuilder statisticsBuilder = new DoubleStatisticsBuilder(new NoOpBloomFilterBuilder()); statisticsBuilder.addValue(NaN); assertNoColumnStatistics(statisticsBuilder.buildColumnStatistics(), 1); statisticsBuilder.addValue(NaN); @@ -74,7 +77,7 @@ public void testNanValue() statisticsBuilder.addValue(42.42); assertNoColumnStatistics(statisticsBuilder.buildColumnStatistics(), 3); - statisticsBuilder = new DoubleStatisticsBuilder(); + statisticsBuilder = new DoubleStatisticsBuilder(new NoOpBloomFilterBuilder()); statisticsBuilder.addValue(42.42); assertColumnStatistics(statisticsBuilder.buildColumnStatistics(), 1, 42.42, 42.42); statisticsBuilder.addValue(NaN); @@ -91,4 +94,19 @@ public void testMinAverageValueBytes() assertMinAverageValueBytes(DOUBLE_VALUE_BYTES, ImmutableList.of(0D)); assertMinAverageValueBytes(DOUBLE_VALUE_BYTES, ImmutableList.of(0D, 42D, 42D, 43D)); } + + @Test + public void testUtf8BloomFilter() + { + DoubleStatisticsBuilder statisticsBuilder = new DoubleStatisticsBuilder(new Utf8BloomFilterBuilder(3, 0.001)); + statisticsBuilder.addValue(1.23); + statisticsBuilder.addValue(45.67); + statisticsBuilder.addValue(89.01); + HashableBloomFilter hashableBloomFilter = statisticsBuilder.buildColumnStatistics().getBloomFilter(); + assertNotNull(hashableBloomFilter); + assertTrue(hashableBloomFilter.test(1.23)); + assertTrue(hashableBloomFilter.test(45.67)); + assertTrue(hashableBloomFilter.test(89.01)); + assertFalse(hashableBloomFilter.test(100)); + } } diff --git a/presto-orc/src/test/java/io/prestosql/orc/metadata/statistics/TestIntegerStatisticsBuilder.java b/presto-orc/src/test/java/io/prestosql/orc/metadata/statistics/TestIntegerStatisticsBuilder.java index 9706f1ff4..3fbea65e5 100644 --- a/presto-orc/src/test/java/io/prestosql/orc/metadata/statistics/TestIntegerStatisticsBuilder.java +++ b/presto-orc/src/test/java/io/prestosql/orc/metadata/statistics/TestIntegerStatisticsBuilder.java @@ -28,14 +28,17 @@ import static java.lang.Long.MAX_VALUE; import static java.lang.Long.MIN_VALUE; import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertFalse; +import static org.testng.Assert.assertNotNull; import static org.testng.Assert.assertNull; +import static org.testng.Assert.assertTrue; public class TestIntegerStatisticsBuilder extends AbstractStatisticsBuilderTest { public TestIntegerStatisticsBuilder() { - super(INTEGER, IntegerStatisticsBuilder::new, IntegerStatisticsBuilder::addValue); + super(INTEGER, () -> new IntegerStatisticsBuilder(new NoOpBloomFilterBuilder()), IntegerStatisticsBuilder::addValue); } @Test @@ -73,7 +76,7 @@ public void testSum() { int values = 0; long expectedSum = 0; - IntegerStatisticsBuilder integerStatisticsBuilder = new IntegerStatisticsBuilder(); + IntegerStatisticsBuilder integerStatisticsBuilder = new IntegerStatisticsBuilder(new NoOpBloomFilterBuilder()); for (int value = -100_000; value < 500_000; value++) { values++; expectedSum += value; @@ -85,7 +88,7 @@ public void testSum() @Test public void testSumOverflow() { - IntegerStatisticsBuilder integerStatisticsBuilder = new IntegerStatisticsBuilder(); + IntegerStatisticsBuilder integerStatisticsBuilder = new IntegerStatisticsBuilder(new NoOpBloomFilterBuilder()); integerStatisticsBuilder.addValue(MAX_VALUE); assertIntegerStatistics(integerStatisticsBuilder.buildColumnStatistics(), 1, MAX_VALUE); @@ -97,7 +100,7 @@ public void testSumOverflow() @Test public void testSumUnderflow() { - IntegerStatisticsBuilder integerStatisticsBuilder = new IntegerStatisticsBuilder(); + IntegerStatisticsBuilder integerStatisticsBuilder = new IntegerStatisticsBuilder(new NoOpBloomFilterBuilder()); integerStatisticsBuilder.addValue(MIN_VALUE); assertIntegerStatistics(integerStatisticsBuilder.buildColumnStatistics(), 1, MIN_VALUE); @@ -111,7 +114,7 @@ public void testMerge() { List statisticsList = new ArrayList<>(); - IntegerStatisticsBuilder statisticsBuilder = new IntegerStatisticsBuilder(); + IntegerStatisticsBuilder statisticsBuilder = new IntegerStatisticsBuilder(new NoOpBloomFilterBuilder()); statisticsList.add(statisticsBuilder.buildColumnStatistics()); assertMergedIntegerStatistics(statisticsList, 0, 0L); @@ -137,7 +140,7 @@ public void testMergeOverflow() { List statisticsList = new ArrayList<>(); - statisticsList.add(new IntegerStatisticsBuilder().buildColumnStatistics()); + statisticsList.add(new IntegerStatisticsBuilder(new NoOpBloomFilterBuilder()).buildColumnStatistics()); assertMergedIntegerStatistics(statisticsList, 0, 0L); statisticsList.add(singleValueIntegerStatistics(MAX_VALUE)); @@ -149,7 +152,7 @@ public void testMergeOverflow() private static ColumnStatistics singleValueIntegerStatistics(long value) { - IntegerStatisticsBuilder statisticsBuilder = new IntegerStatisticsBuilder(); + IntegerStatisticsBuilder statisticsBuilder = new IntegerStatisticsBuilder(new NoOpBloomFilterBuilder()); statisticsBuilder.addValue(value); return statisticsBuilder.buildColumnStatistics(); } @@ -174,4 +177,19 @@ private static void assertIntegerStatistics(ColumnStatistics columnStatistics, i assertEquals(columnStatistics.getNumberOfValues(), 0); } } + + @Test + public void testUtf8BloomFilter() + { + IntegerStatisticsBuilder statisticsBuilder = new IntegerStatisticsBuilder(new Utf8BloomFilterBuilder(3, 0.001)); + statisticsBuilder.addValue(12); + statisticsBuilder.addValue(34); + statisticsBuilder.addValue(56); + HashableBloomFilter hashableBloomFilter = statisticsBuilder.buildColumnStatistics().getBloomFilter(); + assertNotNull(hashableBloomFilter); + assertTrue(hashableBloomFilter.test(12)); + assertTrue(hashableBloomFilter.test(34)); + assertTrue(hashableBloomFilter.test(56)); + assertFalse(hashableBloomFilter.test(78)); + } } diff --git a/presto-orc/src/test/java/io/prestosql/orc/metadata/statistics/TestStringStatisticsBuilder.java b/presto-orc/src/test/java/io/prestosql/orc/metadata/statistics/TestStringStatisticsBuilder.java index 60ac726ba..7f2ee7e11 100644 --- a/presto-orc/src/test/java/io/prestosql/orc/metadata/statistics/TestStringStatisticsBuilder.java +++ b/presto-orc/src/test/java/io/prestosql/orc/metadata/statistics/TestStringStatisticsBuilder.java @@ -16,6 +16,7 @@ import com.google.common.collect.ImmutableList; import io.airlift.slice.Slice; import io.airlift.slice.Slices; +import org.testng.AssertJUnit; import org.testng.annotations.Test; import java.util.ArrayList; @@ -28,8 +29,10 @@ import static io.prestosql.orc.metadata.statistics.ColumnStatistics.mergeColumnStatistics; import static io.prestosql.orc.metadata.statistics.StringStatistics.STRING_VALUE_BYTES_OVERHEAD; import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertFalse; import static org.testng.Assert.assertNotNull; import static org.testng.Assert.assertNull; +import static org.testng.Assert.assertTrue; public class TestStringStatisticsBuilder extends AbstractStatisticsBuilderTest @@ -48,7 +51,7 @@ public class TestStringStatisticsBuilder public TestStringStatisticsBuilder() { - super(STRING, () -> new StringStatisticsBuilder(Integer.MAX_VALUE), StringStatisticsBuilder::addValue); + super(STRING, () -> new StringStatisticsBuilder(Integer.MAX_VALUE, new NoOpBloomFilterBuilder()), StringStatisticsBuilder::addValue); } @Test @@ -93,7 +96,7 @@ public void testMinMaxValues() @Test public void testSum() { - StringStatisticsBuilder stringStatisticsBuilder = new StringStatisticsBuilder(Integer.MAX_VALUE); + StringStatisticsBuilder stringStatisticsBuilder = new StringStatisticsBuilder(Integer.MAX_VALUE, new NoOpBloomFilterBuilder()); for (Slice value : ImmutableList.of(EMPTY_SLICE, LOW_BOTTOM_VALUE, LOW_TOP_VALUE)) { stringStatisticsBuilder.addValue(value); } @@ -105,7 +108,7 @@ public void testMerge() { List statisticsList = new ArrayList<>(); - StringStatisticsBuilder statisticsBuilder = new StringStatisticsBuilder(Integer.MAX_VALUE); + StringStatisticsBuilder statisticsBuilder = new StringStatisticsBuilder(Integer.MAX_VALUE, new NoOpBloomFilterBuilder()); statisticsList.add(statisticsBuilder.buildColumnStatistics()); assertMergedStringStatistics(statisticsList, 0, 0); @@ -162,7 +165,7 @@ public void testMixingAddValueAndMergeWithLimit() // max merged to null List statisticsList = new ArrayList<>(); - StringStatisticsBuilder statisticsBuilder = new StringStatisticsBuilder(7); + StringStatisticsBuilder statisticsBuilder = new StringStatisticsBuilder(7, new NoOpBloomFilterBuilder()); statisticsList.add(statisticsBuilder.buildColumnStatistics()); assertMergedStringStatistics(statisticsList, 0, 0); @@ -189,7 +192,7 @@ public void testMixingAddValueAndMergeWithLimit() // min merged to null statisticsList = new ArrayList<>(); - statisticsBuilder = new StringStatisticsBuilder(7); + statisticsBuilder = new StringStatisticsBuilder(7, new NoOpBloomFilterBuilder()); statisticsList.add(statisticsBuilder.buildColumnStatistics()); assertMergedStringStatistics(statisticsList, 0, 0); @@ -216,7 +219,7 @@ public void testMixingAddValueAndMergeWithLimit() // min and max both merged to null statisticsList = new ArrayList<>(); - statisticsBuilder = new StringStatisticsBuilder(7); + statisticsBuilder = new StringStatisticsBuilder(7, new NoOpBloomFilterBuilder()); statisticsBuilder.addValue(MEDIUM_BOTTOM_VALUE); statisticsList.add(statisticsBuilder.buildColumnStatistics()); assertMinMax(mergeColumnStatistics(statisticsList).getStringStatistics(), MEDIUM_BOTTOM_VALUE, MEDIUM_BOTTOM_VALUE); @@ -237,7 +240,7 @@ public void testMixingAddValueAndMergeWithLimit() @Test public void testCopyStatsToSaveMemory() { - StringStatisticsBuilder statisticsBuilder = new StringStatisticsBuilder(Integer.MAX_VALUE); + StringStatisticsBuilder statisticsBuilder = new StringStatisticsBuilder(Integer.MAX_VALUE, new NoOpBloomFilterBuilder()); Slice shortSlice = Slices.wrappedBuffer(LONG_BOTTOM_VALUE.getBytes(), 0, 1); statisticsBuilder.addValue(shortSlice); Slice stats = statisticsBuilder.buildColumnStatistics().getStringStatistics().getMax(); @@ -268,7 +271,7 @@ private void assertMergedStringStatistics(List statisticsList, private static void assertMinMaxValuesWithLimit(Slice expectedMin, Slice expectedMax, List values, int limit) { checkArgument(values != null && !values.isEmpty()); - StringStatisticsBuilder builder = new StringStatisticsBuilder(limit); + StringStatisticsBuilder builder = new StringStatisticsBuilder(limit, new NoOpBloomFilterBuilder()); for (Slice value : values) { builder.addValue(value); } @@ -299,6 +302,7 @@ private static ColumnStatistics stringColumnStatistics(Slice minimum, Slice maxi null, null, null, + null, null); } @@ -313,4 +317,19 @@ private void assertStringStatistics(ColumnStatistics columnStatistics, int expec assertEquals(columnStatistics.getNumberOfValues(), 0); } } + + @Test + public void testUtf8BloomFilter() + { + StringStatisticsBuilder statisticsBuilder = new StringStatisticsBuilder(Integer.MAX_VALUE, new Utf8BloomFilterBuilder(3, 0.001)); + statisticsBuilder.addValue(LOW_BOTTOM_VALUE); + statisticsBuilder.addValue(MEDIUM_BOTTOM_VALUE); + statisticsBuilder.addValue(HIGH_BOTTOM_VALUE); + HashableBloomFilter hashableBloomFilter = statisticsBuilder.buildColumnStatistics().getBloomFilter(); + AssertJUnit.assertNotNull(hashableBloomFilter); + assertTrue(hashableBloomFilter.test(LOW_BOTTOM_VALUE)); + assertTrue(hashableBloomFilter.test(MEDIUM_BOTTOM_VALUE)); + assertTrue(hashableBloomFilter.test(HIGH_BOTTOM_VALUE)); + assertFalse(hashableBloomFilter.test(HIGH_TOP_VALUE)); + } } diff --git a/presto-orc/src/test/java/io/prestosql/orc/metadata/statistics/TestTimestampStatistics.java b/presto-orc/src/test/java/io/prestosql/orc/metadata/statistics/TestTimestampStatistics.java new file mode 100644 index 000000000..37b4afdf9 --- /dev/null +++ b/presto-orc/src/test/java/io/prestosql/orc/metadata/statistics/TestTimestampStatistics.java @@ -0,0 +1,54 @@ +/* + * Copyright (C) 2018-2022. Huawei Technologies Co., Ltd. All rights reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.prestosql.orc.metadata.statistics; + +import org.openjdk.jol.info.ClassLayout; +import org.testng.annotations.Test; + +import static java.lang.Long.MAX_VALUE; +import static java.lang.Long.MIN_VALUE; + +public class TestTimestampStatistics + extends AbstractRangeStatisticsTest +{ + private static final int INSTANCE_SIZE = ClassLayout.parseClass(TimestampStatistics.class).instanceSize(); + + @Override + protected TimestampStatistics getCreateStatistics(Long min, Long max) + { + return new TimestampStatistics(min, max); + } + + @Test + public void test() + { + assertMinMax(0L, 100L); + assertMinMax(100L, 100L); + assertMinMax(MIN_VALUE, 100L); + assertMinMax(100L, MAX_VALUE); + assertMinMax(MIN_VALUE, MAX_VALUE); + } + + @Test + public void testRetainedSize() + { + assertRetainedSize(0L, 100L, INSTANCE_SIZE); + assertRetainedSize(100L, 100L, INSTANCE_SIZE); + assertRetainedSize(MIN_VALUE, 100L, INSTANCE_SIZE); + assertRetainedSize(100L, MAX_VALUE, INSTANCE_SIZE); + assertRetainedSize(MIN_VALUE, MAX_VALUE, INSTANCE_SIZE); + } +} diff --git a/presto-orc/src/test/java/io/prestosql/orc/metadata/statistics/TestTimestampStatisticsBuilder.java b/presto-orc/src/test/java/io/prestosql/orc/metadata/statistics/TestTimestampStatisticsBuilder.java new file mode 100644 index 000000000..72ce65ea9 --- /dev/null +++ b/presto-orc/src/test/java/io/prestosql/orc/metadata/statistics/TestTimestampStatisticsBuilder.java @@ -0,0 +1,110 @@ +/* + * Copyright (C) 2018-2022. Huawei Technologies Co., Ltd. All rights reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.prestosql.orc.metadata.statistics; + +import com.google.common.collect.ContiguousSet; +import com.google.common.collect.DiscreteDomain; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.Range; +import io.airlift.log.Logger; +import org.testng.annotations.Test; + +import java.util.function.BiConsumer; +import java.util.function.Supplier; + +import static io.prestosql.orc.metadata.statistics.TimestampStatistics.TIMESTAMP_VALUE_BYTES; +import static java.lang.Long.MAX_VALUE; +import static java.lang.Long.MIN_VALUE; +import static org.testng.Assert.assertFalse; +import static org.testng.Assert.assertNotNull; +import static org.testng.Assert.assertTrue; +import static org.testng.Assert.fail; + +public class TestTimestampStatisticsBuilder + extends AbstractStatisticsBuilderTest +{ + private static final Logger LOG = Logger.get(TestTimestampStatisticsBuilder.class); + + public TestTimestampStatisticsBuilder(StatisticsType statisticsType, Supplier statisticsBuilderSupplier, BiConsumer adder) + { + super(statisticsType, statisticsBuilderSupplier, adder); + } + + @Test + public void testMinMaxValues() + { + assertMinMaxValues(0L, 0L); + assertMinMaxValues(100L, 100L); + assertMinMaxValues(MIN_VALUE, MIN_VALUE); + assertMinMaxValues(MAX_VALUE, MAX_VALUE); + + assertMinMaxValues(0L, 100L); + assertMinMaxValues(100L, 100L); + assertMinMaxValues(MIN_VALUE, 100L); + assertMinMaxValues(100L, MAX_VALUE); + assertMinMaxValues(MIN_VALUE, MAX_VALUE); + + assertValues(-100L, 0L, ContiguousSet.create(Range.closed(-100L, 0L), DiscreteDomain.longs()).asList()); + assertValues(-100L, 100L, ContiguousSet.create(Range.closed(-100L, 100L), DiscreteDomain.longs()).asList()); + assertValues(0L, 100L, ContiguousSet.create(Range.closed(0L, 100L), DiscreteDomain.longs()).asList()); + assertValues(MIN_VALUE, MIN_VALUE + 100L, ContiguousSet.create(Range.closed(MIN_VALUE, MIN_VALUE + 100L), DiscreteDomain.longs()).asList()); + assertValues(MAX_VALUE - 100L, MAX_VALUE, ContiguousSet.create(Range.closed(MAX_VALUE - 100L, MAX_VALUE), DiscreteDomain.longs()).asList()); + } + + @Test + public void testValueOutOfRange() + { + try { + new TimestampStatisticsBuilder(new NoOpBloomFilterBuilder()).addValue(MAX_VALUE + 1L); + fail("Expected ArithmeticException"); + } + catch (ArithmeticException expected) { + LOG.info("Error message: " + expected.getMessage()); + } + + try { + new TimestampStatisticsBuilder(new NoOpBloomFilterBuilder()).addValue(MIN_VALUE - 1L); + fail("Expected ArithmeticException"); + } + catch (ArithmeticException expected) { + LOG.info("Error message: " + expected.getMessage()); + } + } + + @Test + public void testMinAverageValueBytes() + { + assertMinAverageValueBytes(0L, ImmutableList.of()); + assertMinAverageValueBytes(TIMESTAMP_VALUE_BYTES, ImmutableList.of(100L)); + assertMinAverageValueBytes(TIMESTAMP_VALUE_BYTES, ImmutableList.of(0L)); + assertMinAverageValueBytes(TIMESTAMP_VALUE_BYTES, ImmutableList.of(0L, 100L, 100L, 101L)); + } + + @Test + public void testUtf8BloomFilter() + { + TimestampStatisticsBuilder statisticsBuilder = new TimestampStatisticsBuilder(new Utf8BloomFilterBuilder(3, 0.001)); + statisticsBuilder.addValue(1234); + statisticsBuilder.addValue(5678); + statisticsBuilder.addValue(9012); + HashableBloomFilter hashableBloomFilter = statisticsBuilder.buildColumnStatistics().getBloomFilter(); + assertNotNull(hashableBloomFilter); + assertTrue(hashableBloomFilter.test(1234)); + assertTrue(hashableBloomFilter.test(5678)); + assertTrue(hashableBloomFilter.test(9012)); + assertFalse(hashableBloomFilter.test(3456)); + } +} diff --git a/presto-spi/src/main/java/io/prestosql/spi/util/BloomFilter.java b/presto-spi/src/main/java/io/prestosql/spi/util/BloomFilter.java index 90870bf82..071a6ff8f 100644 --- a/presto-spi/src/main/java/io/prestosql/spi/util/BloomFilter.java +++ b/presto-spi/src/main/java/io/prestosql/spi/util/BloomFilter.java @@ -237,6 +237,16 @@ public void addLong(long val) addHash(getLongHash(val)); } + public void addDouble(double val) + { + addLong(doubleToLongBits(val)); + } + + public void addFloat(float val) + { + addDouble(val); + } + public void add(double val) { addLong(doubleToLongBits(val)); diff --git a/presto-tests/src/main/java/io/prestosql/tests/AbstractTestQueryFramework.java b/presto-tests/src/main/java/io/prestosql/tests/AbstractTestQueryFramework.java index ec02333fa..358980fca 100644 --- a/presto-tests/src/main/java/io/prestosql/tests/AbstractTestQueryFramework.java +++ b/presto-tests/src/main/java/io/prestosql/tests/AbstractTestQueryFramework.java @@ -24,6 +24,7 @@ import io.prestosql.cost.TaskCountEstimator; import io.prestosql.cube.CubeManager; import io.prestosql.execution.QueryManagerConfig; +import io.prestosql.execution.QueryStats; import io.prestosql.execution.TaskManagerConfig; import io.prestosql.execution.warnings.WarningCollector; import io.prestosql.heuristicindex.HeuristicIndexerManager; @@ -398,6 +399,29 @@ private QueryExplainer getQueryExplainer() new CubeManager(featuresConfig, hetuMetaStoreManager)); } + protected void assertQueryStats( + Session session, + @Language("SQL") String query, + Consumer queryStatsAssertion, + Consumer resultAssertion) + { + DistributedQueryRunner queryRunner = getDistributedQueryRunner(); + ResultWithQueryId resultWithQueryId = queryRunner.executeWithQueryId(session, query); + QueryStats queryStats = queryRunner.getCoordinator() + .getQueryManager() + .getFullQueryInfo(resultWithQueryId.getQueryId()) + .getQueryStats(); + queryStatsAssertion.accept(queryStats); + resultAssertion.accept(resultWithQueryId.getResult()); + } + + protected final DistributedQueryRunner getDistributedQueryRunner() + { + checkState(queryRunner != null, "queryRunner not set"); + checkState(queryRunner instanceof DistributedQueryRunner, "queryRunner is not a DistributedQueryRunner"); + return (DistributedQueryRunner) queryRunner; + } + protected static void skipTestUnless(boolean requirement) { if (!requirement) { diff --git a/presto-tests/src/main/java/io/prestosql/tests/sql/TestTable.java b/presto-tests/src/main/java/io/prestosql/tests/sql/TestTable.java index 98e0efa01..a345fd4f7 100644 --- a/presto-tests/src/main/java/io/prestosql/tests/sql/TestTable.java +++ b/presto-tests/src/main/java/io/prestosql/tests/sql/TestTable.java @@ -13,11 +13,18 @@ */ package io.prestosql.tests.sql; +import java.security.SecureRandom; import java.util.concurrent.atomic.AtomicInteger; +import static java.lang.Character.MAX_RADIX; +import static java.lang.Math.abs; +import static java.lang.Math.min; + public class TestTable implements AutoCloseable { + private static final SecureRandom RANDOM = new SecureRandom(); + private static final int RANDOM_SUFFIX_LENGTH = 5; private final SqlExecutor sqlExecutor; private final String name; @@ -40,4 +47,10 @@ public void close() { sqlExecutor.execute("DROP TABLE " + name); } + + public static String randomTableSuffix() + { + String randomSuffix = Long.toString(abs(RANDOM.nextLong()), MAX_RADIX); + return randomSuffix.substring(0, min(RANDOM_SUFFIX_LENGTH, randomSuffix.length())); + } }