Skip to content

[SPARK-52828][SQL] Make hashing for collated strings collation agnostic #51521

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@
import java.util.concurrent.ConcurrentHashMap;
import java.util.function.Function;
import java.util.function.BiFunction;
import java.util.function.ToLongFunction;
import java.util.stream.Stream;

import com.ibm.icu.text.CollationKey;
Expand Down Expand Up @@ -125,10 +124,19 @@ public static class Collation {
public final String version;

/**
* Collation sensitive hash function. Output for two UTF8Strings will be the same if they are
* equal according to the collation.
* Returns the sort key of the input UTF8String. Two UTF8String values are equal under this
* collation iff their sort keys are equal (compared as byte arrays).
* The sort key is defined as follows for collations without the RTRIM modifier:
* - UTF8_BINARY: It is the bytes of the string.
* - UTF8_LCASE: It is the byte array we get by replacing all invalid UTF8 sequences with the
* Unicode replacement character and then converting all characters of the resulting string
* to their lowercase equivalents (the Greek capital sigma and Greek small sigma both map to
* the Greek final sigma).
* - ICU collations: It is the byte array returned by the ICU library for the collated string.
* For collations with the RTRIM modifier, we right-trim the string and return the sort key
* of the resulting right-trimmed string.
*/
public final ToLongFunction<UTF8String> hashFunction;
public final Function<UTF8String, byte[]> sortKeyFunction;

/**
* Potentially faster way than using comparator to compare two UTF8Strings for equality.
Expand Down Expand Up @@ -182,7 +190,7 @@ public Collation(
Collator collator,
Comparator<UTF8String> comparator,
String version,
ToLongFunction<UTF8String> hashFunction,
Function<UTF8String, byte[]> sortKeyFunction,
BiFunction<UTF8String, UTF8String, Boolean> equalsFunction,
boolean isUtf8BinaryType,
boolean isUtf8LcaseType,
Expand All @@ -192,7 +200,7 @@ public Collation(
this.collator = collator;
this.comparator = comparator;
this.version = version;
this.hashFunction = hashFunction;
this.sortKeyFunction = sortKeyFunction;
this.isUtf8BinaryType = isUtf8BinaryType;
this.isUtf8LcaseType = isUtf8LcaseType;
this.equalsFunction = equalsFunction;
Expand Down Expand Up @@ -581,18 +589,18 @@ private static boolean isValidCollationId(int collationId) {
protected Collation buildCollation() {
if (caseSensitivity == CaseSensitivity.UNSPECIFIED) {
Comparator<UTF8String> comparator;
ToLongFunction<UTF8String> hashFunction;
Function<UTF8String, byte[]> sortKeyFunction;
BiFunction<UTF8String, UTF8String, Boolean> equalsFunction;
boolean supportsSpaceTrimming = spaceTrimming != SpaceTrimming.NONE;

if (spaceTrimming == SpaceTrimming.NONE) {
comparator = UTF8String::binaryCompare;
hashFunction = s -> (long) s.hashCode();
sortKeyFunction = s -> s.getBytes();
equalsFunction = UTF8String::equals;
} else {
comparator = (s1, s2) -> applyTrimmingPolicy(s1, spaceTrimming).binaryCompare(
applyTrimmingPolicy(s2, spaceTrimming));
hashFunction = s -> (long) applyTrimmingPolicy(s, spaceTrimming).hashCode();
sortKeyFunction = s -> applyTrimmingPolicy(s, spaceTrimming).getBytes();
equalsFunction = (s1, s2) -> applyTrimmingPolicy(s1, spaceTrimming).equals(
applyTrimmingPolicy(s2, spaceTrimming));
}
Expand All @@ -603,25 +611,25 @@ protected Collation buildCollation() {
null,
comparator,
CollationSpecICU.ICU_VERSION,
hashFunction,
sortKeyFunction,
equalsFunction,
/* isUtf8BinaryType = */ true,
/* isUtf8LcaseType = */ false,
spaceTrimming != SpaceTrimming.NONE);
} else {
Comparator<UTF8String> comparator;
ToLongFunction<UTF8String> hashFunction;
Function<UTF8String, byte[]> sortKeyFunction;

if (spaceTrimming == SpaceTrimming.NONE) {
comparator = CollationAwareUTF8String::compareLowerCase;
hashFunction = s ->
(long) CollationAwareUTF8String.lowerCaseCodePoints(s).hashCode();
sortKeyFunction = s ->
CollationAwareUTF8String.lowerCaseCodePoints(s).getBytes();
} else {
comparator = (s1, s2) -> CollationAwareUTF8String.compareLowerCase(
applyTrimmingPolicy(s1, spaceTrimming),
applyTrimmingPolicy(s2, spaceTrimming));
hashFunction = s -> (long) CollationAwareUTF8String.lowerCaseCodePoints(
applyTrimmingPolicy(s, spaceTrimming)).hashCode();
sortKeyFunction = s -> CollationAwareUTF8String.lowerCaseCodePoints(
applyTrimmingPolicy(s, spaceTrimming)).getBytes();
}

return new Collation(
Expand All @@ -630,7 +638,7 @@ protected Collation buildCollation() {
null,
comparator,
CollationSpecICU.ICU_VERSION,
hashFunction,
sortKeyFunction,
(s1, s2) -> comparator.compare(s1, s2) == 0,
/* isUtf8BinaryType = */ false,
/* isUtf8LcaseType = */ true,
Expand Down Expand Up @@ -1013,19 +1021,18 @@ protected Collation buildCollation() {
collator.freeze();

Comparator<UTF8String> comparator;
ToLongFunction<UTF8String> hashFunction;
Function<UTF8String, byte[]> sortKeyFunction;

if (spaceTrimming == SpaceTrimming.NONE) {
hashFunction = s -> (long) collator.getCollationKey(
s.toValidString()).hashCode();
comparator = (s1, s2) ->
collator.compare(s1.toValidString(), s2.toValidString());
sortKeyFunction = s -> collator.getCollationKey(s.toValidString()).toByteArray();
} else {
comparator = (s1, s2) -> collator.compare(
applyTrimmingPolicy(s1, spaceTrimming).toValidString(),
applyTrimmingPolicy(s2, spaceTrimming).toValidString());
hashFunction = s -> (long) collator.getCollationKey(
applyTrimmingPolicy(s, spaceTrimming).toValidString()).hashCode();
sortKeyFunction = s -> collator.getCollationKey(
applyTrimmingPolicy(s, spaceTrimming).toValidString()).toByteArray();
}

return new Collation(
Expand All @@ -1034,7 +1041,7 @@ protected Collation buildCollation() {
collator,
comparator,
ICU_VERSION,
hashFunction,
sortKeyFunction,
(s1, s2) -> comparator.compare(s1, s2) == 0,
/* isUtf8BinaryType = */ false,
/* isUtf8LcaseType = */ false,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ class CollationFactorySuite extends AnyFunSuite with Matchers { // scalastyle:ig

case class CollationTestCase[R](collationName: String, s1: String, s2: String, expectedResult: R)

test("collation aware equality and hash") {
test("collation aware equality and sort key") {
val checks = Seq(
CollationTestCase("UTF8_BINARY", "aaa", "aaa", true),
CollationTestCase("UTF8_BINARY", "aaa", "AAA", false),
Expand Down Expand Up @@ -194,9 +194,9 @@ class CollationFactorySuite extends AnyFunSuite with Matchers { // scalastyle:ig
assert(collation.equalsFunction(toUTF8(testCase.s1), toUTF8(testCase.s2)) ==
testCase.expectedResult)

val hash1 = collation.hashFunction.applyAsLong(toUTF8(testCase.s1))
val hash2 = collation.hashFunction.applyAsLong(toUTF8(testCase.s2))
assert((hash1 == hash2) == testCase.expectedResult)
val sortKey1 = collation.sortKeyFunction.apply(toUTF8(testCase.s1)).asInstanceOf[Array[Byte]]
val sortKey2 = collation.sortKeyFunction.apply(toUTF8(testCase.s2)).asInstanceOf[Array[Byte]]
assert(sortKey1.sameElements(sortKey2) == testCase.expectedResult)
})
}

Expand Down
Loading