Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
91 changes: 64 additions & 27 deletions make/jdk/src/classes/build/tools/generatecharacter/CaseFolding.java
Original file line number Diff line number Diff line change
Expand Up @@ -29,45 +29,82 @@
import java.nio.file.Files;
import java.nio.file.Paths;
import java.nio.file.StandardOpenOption;
import java.util.Arrays;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import java.util.stream.Stream;

/**
 * Build-time generator that expands a source template with case folding
 * entries read from the Unicode {@code CaseFolding.txt} data file.
 *
 * <p>Usage: {@code java CaseFolding TemplateFile CaseFolding.txt CaseFolding.java lang}
 *
 * <p>The fourth argument selects the kind of table that is generated:
 * <ul>
 * <li>{@code lang_string}: one {@code new CaseFoldingEntry(cp, new char[] {...})}
 *     line per {@code C} (common) or {@code F} (full) mapping, with the folded
 *     code points written as UTF-16 code units;</li>
 * <li>any other value: one {@code entry(cp, folded)} line per {@code C},
 *     {@code T} or {@code S} mapping whose folded form does not map back to the
 *     original through {@code Character.toUpperCase}/{@code toLowerCase}.</li>
 * </ul>
 * The template line containing {@code %%%Entries} is replaced by the
 * generated entries; every other template line is copied unchanged.
 */
public class CaseFolding {

    /** A template line containing this marker is replaced by the generated entries. */
    private static final String ENTRIES_MARKER = "%%%Entries";

    public static void main(String[] args) throws Throwable {
        if (args.length != 4) {
            System.err.println("Usage: java CaseFolding TemplateFile CaseFolding.txt CaseFolding.java lang");
            System.exit(1);
        }
        var templateFile = Paths.get(args[0]);
        var caseFoldingTxt = Paths.get(args[1]);
        var genSrcFile = Paths.get(args[2]);
        var pkg = args[3];

        var entries = "lang_string".equals(pkg)
                ? fullFoldingEntries(caseFoldingTxt)
                : simpleFoldingEntries(caseFoldingTxt);

        // Generate .java file. Files.lines holds the file open until the
        // stream is closed, so it is scoped with try-with-resources.
        try (var template = Files.lines(templateFile)) {
            Files.write(
                    genSrcFile,
                    template.map(line -> line.contains(ENTRIES_MARKER) ? entries : line)
                            .collect(Collectors.toList()),
                    StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING);
        }
    }

    /** Builds the comma-separated {@code CaseFoldingEntry} list for the C and F mappings. */
    private static String fullFoldingEntries(java.nio.file.Path caseFoldingTxt)
            throws java.io.IOException {
        var supportedTypes = "^.*; [CF]; .*$"; // full/1:M case folding
        try (var lines = Files.lines(caseFoldingTxt)) {
            return lines
                    .filter(line -> !line.startsWith("#") && line.matches(supportedTypes))
                    .map(CaseFolding::toFullFoldingEntry)
                    .collect(Collectors.joining(",\n", "", ""));
        }
    }

    /** Formats one {@code "cp; status; cp1 cp2 ...; # name"} data line as a CaseFoldingEntry. */
    private static String toFullFoldingEntry(String line) {
        var fields = line.split("; ");
        var cp = Integer.parseInt(fields[0], 16);
        var folding = Arrays.stream(fields[2].trim().split(" "))
                .mapToInt(hex -> Integer.parseInt(hex, 16))
                .toArray();
        // Emit UTF-16 code units: a supplementary code point becomes two chars.
        var foldingChars = new String(folding, 0, folding.length).chars()
                .mapToObj(c -> String.format("0x%04x", c))
                .collect(Collectors.joining(", ", "new char[] {", "}"));
        return String.format("\t\tnew CaseFoldingEntry(0x%04x, %s)", cp, foldingChars);
    }

    /** Builds the comma-separated {@code entry(...)} list for the C, T and S mappings. */
    private static String simpleFoldingEntries(java.nio.file.Path caseFoldingTxt)
            throws java.io.IOException {
        var supportedTypes = "^.*; [CTS]; .*$";

        // hack, hack, hack! the logic does not pick 0131. just add manually to support 'I's.
        // 0049; T; 0131; # LATIN CAPITAL LETTER I
        final String T_0x0131_0x49 = String.format(" entry(0x%04x, 0x%04x),\n", 0x0131, 0x49);

        try (var lines = Files.lines(caseFoldingTxt)) {
            var entries = lines
                    .filter(line -> !line.startsWith("#") && line.matches(supportedTypes))
                    .map(line -> line.split("; "))
                    .filter(cols -> {
                        // the folding case doesn't map back to the original char.
                        var cp1 = Integer.parseInt(cols[0], 16);
                        var cp2 = Integer.parseInt(cols[2], 16);
                        return Character.toUpperCase(cp2) != cp1 && Character.toLowerCase(cp2) != cp1;
                    })
                    .map(cols -> String.format(" entry(0x%s, 0x%s)", cols[0], cols[2]))
                    .collect(Collectors.joining(",\n", "", ""));
            return T_0x0131_0x49 + entries;
        }
    }
}
18 changes: 18 additions & 0 deletions make/modules/java.base/gensrc/GensrcCharacterData.gmk
Original file line number Diff line number Diff line change
Expand Up @@ -72,5 +72,23 @@ TARGETS += $(GENSRC_CHARACTERDATA)

################################################################################


# Generated source file holding the case folding table; produced by the rule below.
GENSRC_STRINGCASEFOLDING := $(SUPPORT_OUTPUTDIR)/gensrc/java.base/jdk/internal/java/lang/CaseFolding.java

# Inputs: the source template and the Unicode case folding data file.
# NOTE(review): the template lives under jdk/internal/lang while the generated
# file goes to jdk/internal/java/lang -- confirm the two paths are intended to differ.
STRINGCASEFOLDING_TEMPLATE := $(MODULE_SRC)/share/classes/jdk/internal/lang/CaseFolding.java.template
CASEFOLDINGTXT := $(MODULE_SRC)/share/data/unicodedata/CaseFolding.txt

# Runs the case folding generator tool with four arguments: template, data file,
# output file and the mode selector "lang_string". Regenerated whenever the
# build tools, the template or the data file change.
$(GENSRC_STRINGCASEFOLDING): $(BUILD_TOOLS_JDK) $(STRINGCASEFOLDING_TEMPLATE) $(CASEFOLDINGTXT)
$(call LogInfo, Generating $@)
$(call MakeTargetDir)
$(TOOL_GENERATECASEFOLDING) \
$(STRINGCASEFOLDING_TEMPLATE) \
$(CASEFOLDINGTXT) \
$(GENSRC_STRINGCASEFOLDING) \
lang_string

TARGETS += $(GENSRC_STRINGCASEFOLDING)


endif # include guard
include MakeIncludeEnd.gmk
3 changes: 2 additions & 1 deletion make/modules/java.base/gensrc/GensrcRegex.gmk
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,8 @@ $(GENSRC_CASEFOLDING): $(BUILD_TOOLS_JDK) $(CASEFOLDINGTEMP) $(CASEFOLDINGTXT)
$(TOOL_GENERATECASEFOLDING) \
$(CASEFOLDINGTEMP) \
$(CASEFOLDINGTXT) \
$(GENSRC_CASEFOLDING)
$(GENSRC_CASEFOLDING) \
util_regex

TARGETS += $(GENSRC_CASEFOLDING)

Expand Down
100 changes: 100 additions & 0 deletions src/java.base/share/classes/java/lang/String.java
Original file line number Diff line number Diff line change
Expand Up @@ -2160,6 +2160,64 @@ public int compareToIgnoreCase(String str) {
return CASE_INSENSITIVE_ORDER.compare(this, str);
}

/**
* A Comparator that orders {@code String} objects as by
* {@link #compareToCaseFold(String) compareToCaseFold()}.
* <p>
* Note that this Comparator does <em>not</em> take locale into account.
* For locale-aware ordering, use {@link java.text.Collator}.
*
* @see #compareToCaseFold(String)
* @see java.text.Collator
* @since 26
*/
public static final Comparator<String> CASE_FOLD_ORDER
= new CaseFoldComparator();

/**
 * Comparator backing {@code CASE_FOLD_ORDER}: walks both strings through a
 * {@code StringCaseFoldedCharIterator} and compares the folded chars pairwise.
 */
private static class CaseFoldComparator implements Comparator<String> {

    @Override
    public int compare(String s1, String s2) {
        StringCaseFoldedCharIterator left = foldedChars(s1);
        StringCaseFoldedCharIterator right = foldedChars(s2);

        // Compare folded chars in lock step until one side runs out
        // or a difference is found.
        for (;;) {
            if (!left.hasNext() || !right.hasNext()) {
                break;
            }
            int a = left.nextChar();
            int b = right.nextChar();
            if (a != b) {
                return a - b;
            }
        }

        // All common positions are equal: the sequence with folded chars
        // left over sorts after the other one.
        if (left.hasNext()) {
            return 1;
        }
        return right.hasNext() ? -1 : 0;
    }

    /** Picks the iterator flavor that matches the string's internal coder. */
    private static StringCaseFoldedCharIterator foldedChars(String s) {
        byte[] bytes = s.value;
        if (s.coder == LATIN1) {
            return StringCaseFoldedCharIterator.ofLatin1(bytes);
        }
        return StringCaseFoldedCharIterator.ofUTF16(bytes);
    }
}

/**
* Compares two strings lexicographically using Unicode case folding.
* <p>
* This method returns an integer whose sign is that of calling {@code compareTo}
* on the case folded versions of the strings. Unicode case folding eliminates
* differences in case according to the Unicode Standard, using the mappings
* defined in
* <a href="https://www.unicode.org/Public/UCD/latest/ucd/CaseFolding.txt">CaseFolding.txt</a>,
* including one-to-many mappings, such as {@code "ß"} → {@code "ss"}.
* <p>
* Note that this method does <em>not</em> take locale into account, and may
* produce results that differ from locale-sensitive ordering. For locale-aware
* comparisons, use {@link java.text.Collator}.
* @param str the {@code String} to be compared.
* @return a negative integer, zero, or a positive integer as the specified
* String is greater than, equal to, or less than this String,
* ignoring case considerations by case folding.
* @see java.text.Collator
* @see #toCaseFold()
* @since 26
*/
public int compareToCaseFold(String str) {
return CASE_FOLD_ORDER.compare(this, str);
}

/**
* Tests if two string regions are equal.
* <p>
Expand Down Expand Up @@ -3791,6 +3849,48 @@ public String toUpperCase() {
return toUpperCase(Locale.getDefault());
}

/**
 * Returns a case-folded copy of this {@code String}, using the Unicode
 * case folding mappings defined in
 * <a href="https://www.unicode.org/Public/UCD/latest/ucd/CaseFolding.txt">
 * Unicode Case Folding Properties</a>.
 *
 * <p>Case folding is a locale-independent, language-neutral form of
 * case mapping, primarily intended for caseless matching.
 * Unlike {@link #toLowerCase()} or {@link #toUpperCase()}, which are
 * designed for locale-sensitive or display-oriented transformations,
 * case folding provides a stable and consistent mapping across all
 * environments. It may include one-to-many mappings; for example,
 * the German sharp s ({@code U+00DF}) folds to the sequence
 * {@code "ss"}.
 *
 * <p>This method performs the <em>"Full"</em> case folding as defined in
 * the Unicode CaseFolding data file. The result is suitable for use in
 * case-insensitive string comparison, searching, or indexing.
 *
 * @apiNote
 * Case folding is intended for caseless matching, not for locale-sensitive
 * presentation. For example:
 *
 * <pre>{@code
 * String a = "Maße";
 * String b = "MASSE";
 * if (a.toCaseFold().equals(b.toCaseFold())) {
 *     // true, matches according to Unicode caseless rules
 * }
 * }</pre>
 *
 * @return a {@code String} containing the case-folded form of this string
 * @see #toLowerCase()
 * @see #toUpperCase()
 * @since 26
 */
public String toCaseFold() {
    // Dispatch on the internal coder; each implementation receives this
    // string together with its backing byte array.
    if (isLatin1()) {
        return StringLatin1.toCaseFold(this, value);
    }
    return StringUTF16.toCaseFold(this, value);
}

/**
* Returns a string whose value is this string, with all leading
* and trailing space removed, where space is defined
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
package java.lang;

import jdk.internal.java.lang.CaseFolding;

/**
 * Iterates over the chars of a string's backing byte array after applying
 * {@code CaseFolding.fold} to each code point.
 *
 * <p>One source code point may fold to several chars; those are buffered in
 * {@link #folded} and handed out one at a time before the next code point is
 * read. Subclasses supply the coder-specific way to read a code point.
 */
abstract class StringCaseFoldedCharIterator {

    protected final byte[] value;   // underlying byte array
    protected final int length;     // number of char units stored in the byte array
    protected int index;            // current position, in char units
    protected char[] folded;        // buffer holding the folding of the last code point read
    protected int foldedIndex;      // next position to hand out from folded[]

    /**
     * @param value  the string's backing byte array
     * @param length the number of char units encoded in {@code value}
     */
    StringCaseFoldedCharIterator(byte[] value, int length) {
        this.value = value;
        this.length = length;
        this.index = 0;
        this.folded = null;
        this.foldedIndex = 0;
    }

    /**
     * Returns {@code true} if buffered folded chars remain or there are
     * source chars that have not been read yet.
     */
    public boolean hasNext() {
        return (folded != null && foldedIndex < folded.length) || index < length;
    }

    /**
     * Returns the next folded char, or {@code -1} when the iterator is exhausted.
     */
    public int nextChar() {
        // Drain the expansion of the previously read code point first.
        if (folded != null && foldedIndex < folded.length) {
            return folded[foldedIndex++];
        }
        if (index >= length) {
            return -1;
        }
        int cp = codePointAt(value, index);
        index += Character.charCount(cp);
        // NOTE(review): assumes CaseFolding.fold never returns an empty array -- confirm.
        folded = CaseFolding.fold(cp);
        foldedIndex = 0;
        return folded[foldedIndex++];
    }

    /**
     * Reads the code point that starts at char position {@code index} of {@code value}.
     */
    protected abstract int codePointAt(byte[] value, int index);

    // Factory for Latin1: one byte per char, so the char count is the byte count
    static StringCaseFoldedCharIterator ofLatin1(byte[] value) {
        return new StringCaseFoldedCharIterator(value, value.length) {
            @Override
            protected int codePointAt(byte[] value, int index) {
                return StringLatin1.codePointAt(value, index, value.length);
            }
        };
    }

    // Factory for UTF16: two bytes per char
    static StringCaseFoldedCharIterator ofUTF16(byte[] value) {
        final int charCount = value.length >> 1;
        return new StringCaseFoldedCharIterator(value, charCount) {
            @Override
            protected int codePointAt(byte[] value, int index) {
                // The end bound is a char count, not a byte count. Passing the
                // byte length would let a trailing unpaired high surrogate
                // trigger a read of the (non-existent) following char.
                return StringUTF16.codePointAt(value, index, charCount);
            }
        };
    }
}
49 changes: 49 additions & 0 deletions src/java.base/share/classes/java/lang/StringLatin1.java
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
import java.util.function.IntConsumer;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;
import jdk.internal.java.lang.CaseFolding;
import jdk.internal.util.ArraysSupport;
import jdk.internal.vm.annotation.IntrinsicCandidate;

Expand Down Expand Up @@ -560,6 +561,54 @@ private static String toUpperCaseEx(String str, byte[] value,
return StringUTF16.newString(result, 0, resultOffset);
}

/**
 * Slow path of {@code toCaseFold}: builds the folded result as UTF16, for
 * inputs whose folding expands to several chars or cannot be stored as Latin1.
 *
 * @param str   the original string (not used by this method)
 * @param value the Latin1 bytes of the string
 * @param first index of the first byte that needs folding; earlier bytes
 *              are copied through unchanged
 * @return a new string holding the folded chars
 */
private static String toCaseFoldEx(String str, byte[] value, int first) {
    byte[] buf = StringUTF16.newBytesFor(value.length);
    int pos = 0;

    // Widen the already-folded prefix as is.
    while (pos < first) {
        StringUTF16.putChar(buf, pos, value[pos] & 0xff);
        pos++;
    }

    for (int src = first; src < value.length; src++) {
        char[] mapping = CaseFolding.fold(value[src] & 0xff);
        if (mapping.length != 1) {
            // Not a 1:1 mapping: resize the buffer by the difference in
            // length and carry over what has been written so far.
            byte[] resized = StringUTF16.newBytesFor((buf.length >> 1) + mapping.length - 1);
            System.arraycopy(buf, 0, resized, 0, pos << 1);
            buf = resized;
        }
        for (char c : mapping) {
            StringUTF16.putChar(buf, pos++, c);
        }
    }
    return StringUTF16.newString(buf, 0, pos);
}

/**
 * Returns the case-folded form of a Latin1 string.
 *
 * <p>If {@code CaseFolding.isFolded} holds for every char, {@code str} itself
 * is returned. Otherwise a Latin1 result is built as long as every char folds
 * to a single Latin1-encodable char; on the first char that does not, the
 * work is handed over to {@code toCaseFoldEx}.
 *
 * @param str   the string being folded; returned as is when nothing changes
 * @param value the Latin1 bytes of {@code str}
 * @return the case-folded string
 */
public static String toCaseFold(String str, byte[] value) {
    final int len = value.length;
    // Now check if there are any characters that need to be changed
    int first = 0;
    while (first < len && CaseFolding.isFolded(value[first] & 0xff)) {
        first++;
    }
    if (first == len) {
        return str;
    }
    byte[] result = new byte[len];
    System.arraycopy(value, 0, result, 0, first); // Just copy the first few
    // fold characters
    for (int i = first; i < len; i++) {
        var folded = CaseFolding.fold(value[i] & 0xff);
        if (folded.length > 1 || !canEncode(folded[0])) {
            // The result no longer fits a same-length Latin1 array.
            return toCaseFoldEx(str, value, first);
        }
        result[i] = (byte)(folded[0] & 0xff);
    }
    return new String(result, LATIN1);
}

public static String trim(byte[] value) {
int len = value.length;
int st = 0;
Expand Down
Loading