openjdk · xuemingshen-oracle · Aug 18, 2025
diff --git a/make/jdk/src/classes/build/tools/generatecharacter/CaseFolding.java b/make/jdk/src/classes/build/tools/generatecharacter/CaseFolding.java
@@ -29,45 +29,82 @@
 import java.nio.file.Files;
 import java.nio.file.Paths;
 import java.nio.file.StandardOpenOption;
+import java.util.Arrays;
 import java.util.stream.Collectors;
+import java.util.stream.IntStream;
 import java.util.stream.Stream;
 
 public class CaseFolding {
 
     public static void main(String[] args) throws Throwable {
-        if (args.length != 3) {
-            System.err.println("Usage: java CaseFolding TemplateFile CaseFolding.txt CaseFolding.java");
+        if (args.length != 4) {
+            System.err.println("Usage: java CaseFolding TemplateFile CaseFolding.txt CaseFolding.java lang");
             System.exit(1);
         }
         var templateFile = Paths.get(args[0]);
         var caseFoldingTxt = Paths.get(args[1]);
         var genSrcFile = Paths.get(args[2]);
-        var supportedTypes = "^.*; [CTS]; .*$";
-        var caseFoldingEntries = Files.lines(caseFoldingTxt)
-            .filter(line -> !line.startsWith("#") && line.matches(supportedTypes))
-            .map(line -> {
-                String[] cols = line.split("; ");
-                return new String[] {cols[0], cols[1], cols[2]};
-            })
-            .filter(cols -> {
-                //  the folding case doesn't map back to the original char.
-                var cp1 = Integer.parseInt(cols[0], 16);
-                var cp2 = Integer.parseInt(cols[2], 16);
-                return Character.toUpperCase(cp2) != cp1 && Character.toLowerCase(cp2) != cp1;
-            })
-            .map(cols -> String.format("        entry(0x%s, 0x%s)", cols[0], cols[2]))
-            .collect(Collectors.joining(",\n", "", ""));
+        var pkg = args[3];  // output flavor: "lang_string" picks the first branch, any other value the second
 
-        // hack, hack, hack! the logic does not pick 0131. just add manually to support 'I's.
-        // 0049; T; 0131; # LATIN CAPITAL LETTER I
-        final String T_0x0131_0x49 = String.format("        entry(0x%04x, 0x%04x),\n", 0x0131, 0x49);
+        if ("lang_string".equals(pkg)) {
+            var supportedTypes = "^.*; [CF]; .*$";  // full/1:M case folding
+            var caseFoldingEntries = Files.lines(caseFoldingTxt)
+                    .filter(line -> !line.startsWith("#") && line.matches(supportedTypes))
+                    .map(line -> {
+                        var fields = line.split("; ");  // <code>; <status>; <mapping>; # <name>
+                        var cp = Integer.parseInt(fields[0], 16);
+                        fields = fields[2].trim().split(" ");  // the mapping may list several hex code points
+                        var folding = new int[fields.length];
+                        for (int i = 0; i < folding.length; i++) {
+                            folding[i] = Integer.parseInt(fields[i], 16);
+                        }
+                        var foldingChars = Arrays.stream(folding)
+                                .mapToObj(Character::toChars)
+                                .flatMapToInt(chars -> IntStream.range(0, chars.length).map(i -> (int)chars[i]))  // code points -> UTF-16 code units
+                                .toArray();
+                        return String.format("\t\tnew CaseFoldingEntry(0x%04x, %s)",
+                                cp,
+                                Arrays.stream(foldingChars)
+                                        .mapToObj(c -> String.format("0x%04x", c))
+                                        .collect(Collectors.joining(", ", "new char[] {", "}"))
+                        );
+                    })
+                    .collect(Collectors.joining(",\n", "", ""));
 
-        // Generate .java file
-        Files.write(
-            genSrcFile,
-            Files.lines(templateFile)
-                .map(line -> line.contains("%%%Entries") ? T_0x0131_0x49 + caseFoldingEntries : line)
-                .collect(Collectors.toList()),
-            StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING);
+            Files.write(
+                    genSrcFile,
+                    Files.lines(templateFile)
+                            .map(line -> line.contains("%%%Entries") ? caseFoldingEntries : line)
+                            .collect(Collectors.toList()),
+                    StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING);
+        } else {  // every other pkg value: one-to-one entry() pairs from [CTS] lines
+            var supportedTypes = "^.*; [CTS]; .*$";
+            var caseFoldingEntries = Files.lines(caseFoldingTxt)
+                    .filter(line -> !line.startsWith("#") && line.matches(supportedTypes))
+                    .map(line -> {
+                        String[] cols = line.split("; ");
+                        return new String[]{cols[0], cols[1], cols[2]};
+                    })
+                    .filter(cols -> {
+                        //  the folding case doesn't map back to the original char.
+                        var cp1 = Integer.parseInt(cols[0], 16);
+                        var cp2 = Integer.parseInt(cols[2], 16);
+                        return Character.toUpperCase(cp2) != cp1 && Character.toLowerCase(cp2) != cp1;
+                    })
+                    .map(cols -> String.format("        entry(0x%s, 0x%s)", cols[0], cols[2]))
+                    .collect(Collectors.joining(",\n", "", ""));
+
+            // hack, hack, hack! the logic does not pick 0131. just add manually to support 'I's.
+            // 0049; T; 0131; # LATIN CAPITAL LETTER I
+            final String T_0x0131_0x49 = String.format("        entry(0x%04x, 0x%04x),\n", 0x0131, 0x49);
+
+            // Generate .java file
+            Files.write(
+                    genSrcFile,
+                    Files.lines(templateFile)
+                            .map(line -> line.contains("%%%Entries") ? T_0x0131_0x49 + caseFoldingEntries : line)
+                            .collect(Collectors.toList()),
+                    StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING);
+        }
     }
 }
diff --git a/make/modules/java.base/gensrc/GensrcCharacterData.gmk b/make/modules/java.base/gensrc/GensrcCharacterData.gmk
@@ -72,5 +72,23 @@ TARGETS += $(GENSRC_CHARACTERDATA)
 
 ################################################################################
 
+# Generate CaseFolding.java from its template and CaseFolding.txt (tool flavor "lang_string")
+GENSRC_STRINGCASEFOLDING := $(SUPPORT_OUTPUTDIR)/gensrc/java.base/jdk/internal/java/lang/CaseFolding.java
+# NOTE(review): template is under jdk/internal/lang, output under jdk/internal/java/lang - confirm intended
+STRINGCASEFOLDING_TEMPLATE := $(MODULE_SRC)/share/classes/jdk/internal/lang/CaseFolding.java.template
+CASEFOLDINGTXT := $(MODULE_SRC)/share/data/unicodedata/CaseFolding.txt
+
+$(GENSRC_STRINGCASEFOLDING): $(BUILD_TOOLS_JDK) $(STRINGCASEFOLDING_TEMPLATE) $(CASEFOLDINGTXT)
+	$(call LogInfo, Generating $@)
+	$(call MakeTargetDir)
+	$(TOOL_GENERATECASEFOLDING) \
+	    $(STRINGCASEFOLDING_TEMPLATE) \
+	    $(CASEFOLDINGTXT) \
+	    $(GENSRC_STRINGCASEFOLDING) \
+	    lang_string
+
+TARGETS += $(GENSRC_STRINGCASEFOLDING)
+
+
 endif # include guard
 include MakeIncludeEnd.gmk
diff --git a/make/modules/java.base/gensrc/GensrcRegex.gmk b/make/modules/java.base/gensrc/GensrcRegex.gmk
@@ -61,7 +61,8 @@ $(GENSRC_CASEFOLDING): $(BUILD_TOOLS_JDK) $(CASEFOLDINGTEMP) $(CASEFOLDINGTXT)
 	$(TOOL_GENERATECASEFOLDING) \
 	    $(CASEFOLDINGTEMP) \
 	    $(CASEFOLDINGTXT) \
-	    $(GENSRC_CASEFOLDING)
+	    $(GENSRC_CASEFOLDING) \
+	    util_regex
 
 TARGETS += $(GENSRC_CASEFOLDING)
 

diff --git a/src/java.base/share/classes/java/lang/String.java b/src/java.base/share/classes/java/lang/String.java
@@ -2160,6 +2160,64 @@ public int compareToIgnoreCase(String str) {
         return CASE_INSENSITIVE_ORDER.compare(this, str);
     }
 
+    /**
+     * A {@code Comparator} that orders {@code String} objects in the same way
+     * as {@link #compareToCaseFold(String) compareToCaseFold()}.
+     *
+     * @see     #compareToCaseFold(String)
+     * @since   26
+     */
+    public static final Comparator<String> CASE_FOLD_ORDER
+            = new CaseFoldComparator();
+
+    private static class CaseFoldComparator implements Comparator<String> {
+        // Compares the case-folded chars of both strings without building the folded strings.
+        @Override
+        public int compare(String s1, String s2) {
+            var it1 = (s1.coder == LATIN1) ? StringCaseFoldedCharIterator.ofLatin1(s1.value)
+                                           : StringCaseFoldedCharIterator.ofUTF16(s1.value);
+            var it2 = (s2.coder == LATIN1) ? StringCaseFoldedCharIterator.ofLatin1(s2.value)
+                                           : StringCaseFoldedCharIterator.ofUTF16(s2.value);
+            for (;;) {
+                boolean more1 = it1.hasNext();
+                boolean more2 = it2.hasNext();
+                if (!(more1 && more2)) {
+                    // at least one side ran out: the side with chars left is the greater one
+                    return more1 ? 1 : (more2 ? -1 : 0);
+                }
+                int diff = it1.nextChar() - it2.nextChar();
+                if (diff != 0) {
+                    return diff;
+                }
+            }
+        }
+    }
+
+    /**
+     * Compares two strings lexicographically using Unicode case folding.
+     * <p>
+     * This method returns an integer whose sign is that of calling {@code compareTo}
+     * on the case folded versions of the strings.  Unicode case folding eliminates
+     * differences in case according to the Unicode Standard, using the mappings
+     * defined in
+     * <a href="https://www.unicode.org/Public/UCD/latest/ucd/CaseFolding.txt">CaseFolding.txt</a>,
+     * including one-to-many mappings, such as {@code "ß"} → {@code "ss"}.
+     * <p>
+     * Note that this method does <em>not</em> take locale into account, and may
+     * produce results that differ from locale-sensitive ordering. For locale-aware
+     * comparisons, use {@link java.text.Collator}.
+     * @param   str   the {@code String} to be compared.
+     * @return  a negative integer, zero, or a positive integer as the specified
+     *          String is greater than, equal to, or less than this String,
+     *          ignoring case considerations by case folding.
+     * @see     java.text.Collator
+     * @see     #toCaseFold()
+     * @since   26
+     */
+    public int compareToCaseFold(String str) {
+        return CASE_FOLD_ORDER.compare(this, str);
+    }
+
     /**
      * Tests if two string regions are equal.
      * <p>
@@ -3791,6 +3849,48 @@ public String toUpperCase() {
         return toUpperCase(Locale.getDefault());
     }
 
+    /**
+     * Returns the case-folded form of this {@code String}, computed with the
+     * Unicode case folding mappings defined in
+     * <a href="https://www.unicode.org/Public/UCD/latest/ucd/CaseFolding.txt">
+     * Unicode Case Folding Properties</a>.
+     *
+     * <p>Case folding is a locale-independent, language-neutral kind of case
+     * mapping whose primary purpose is caseless matching. In contrast,
+     * {@link #toLowerCase()} and {@link #toUpperCase()} are designed for
+     * locale-sensitive or display-oriented transformations. Case folding
+     * gives a stable and consistent mapping across all environments, and
+     * may map one character to several; for example, the German sharp s
+     * ({@code U+00DF}) folds to the sequence {@code "ss"}.
+     *
+     * <p>This method performs the <em>"Full"</em> case folding as defined in
+     * the Unicode CaseFolding data file. The result is suitable for use in
+     * case-insensitive string comparison, searching, or indexing.
+     *
+     * @apiNote
+     * Use case folding for caseless matching rather than for locale-sensitive
+     * presentation. For example:
+     *
+     * <pre>{@code
+     * String a = "Maße";
+     * String b = "MASSE";
+     * if (a.toCaseFold().equals(b.toCaseFold())) {
+     *     // true, matches according to Unicode caseless rules
+     * }
+     * }</pre>
+     *
+     * @return a {@code String} containing the case-folded form of this string
+     * @see #toLowerCase()
+     * @see #toUpperCase()
+     * @since 26
+     */
+    public String toCaseFold() {
+        if (isLatin1()) {
+            return StringLatin1.toCaseFold(this, value);
+        }
+        return StringUTF16.toCaseFold(this, value);
+    }
+
     /**
      * Returns a string whose value is this string, with all leading
      * and trailing space removed, where space is defined

diff --git a/src/java.base/share/classes/java/lang/StringCaseFoldedCharIterator.java b/src/java.base/share/classes/java/lang/StringCaseFoldedCharIterator.java
@@ -0,0 +1,60 @@
+package java.lang;
+
+import jdk.internal.java.lang.CaseFolding;
+
+// Walks the chars produced by case folding a String's content, one code point at a time.
+abstract class StringCaseFoldedCharIterator {
+    protected final byte[] value;  // underlying byte array
+    protected final int length;    // number of char units (not bytes) in value
+    protected int index;           // next char unit to read from value; starts at 0
+    protected char[] folded;       // folding of the last code point read; null until then
+    protected int foldedIndex;     // position in folded[]
+
+    StringCaseFoldedCharIterator(byte[] value, int length) {
+        this.value = value;
+        this.length = length;
+    }
+
+    // Returns true while buffered folded chars or unread input remain.
+    public boolean hasNext() {
+        return (folded != null && foldedIndex < folded.length) || index < length;
+    }
+
+    // Returns the next folded char, or -1 once the input is exhausted.
+    public int nextChar() {
+        if (folded != null && foldedIndex < folded.length) {
+            return folded[foldedIndex++];
+        }
+        if (index >= length) {
+            return -1;
+        }
+        int cp = codePointAt(value, index);
+        index += Character.charCount(cp);
+        folded = CaseFolding.fold(cp);
+        foldedIndex = 1;  // folded[0] is returned now, the rest stays buffered
+        return folded[0];
+    }
+
+    // Reads the code point that starts at char unit 'index' of 'value'.
+    protected abstract int codePointAt(byte[] value, int index);
+
+    // Factory for Latin1: the char count passed to the constructor is value.length
+    static StringCaseFoldedCharIterator ofLatin1(byte[] value) {
+        return new StringCaseFoldedCharIterator(value, value.length) {
+            @Override
+            protected int codePointAt(byte[] value, int index) {
+                return StringLatin1.codePointAt(value, index, length);
+            }
+        };
+    }
+
+    // Factory for UTF16: the char count passed to the constructor is value.length >> 1
+    static StringCaseFoldedCharIterator ofUTF16(byte[] value) {
+        return new StringCaseFoldedCharIterator(value, value.length >> 1) {
+            @Override
+            protected int codePointAt(byte[] value, int index) {
+                // end must be the char count; the byte count overruns on a trailing high surrogate
+                return StringUTF16.codePointAt(value, index, length);
+            }
+        };
+    }
+}
diff --git a/src/java.base/share/classes/java/lang/StringLatin1.java b/src/java.base/share/classes/java/lang/StringLatin1.java
@@ -32,6 +32,7 @@
 import java.util.function.IntConsumer;
 import java.util.stream.Stream;
 import java.util.stream.StreamSupport;
+import jdk.internal.java.lang.CaseFolding;
 import jdk.internal.util.ArraysSupport;
 import jdk.internal.vm.annotation.IntrinsicCandidate;
 
@@ -560,6 +561,54 @@ private static String toUpperCaseEx(String str, byte[] value,
         return StringUTF16.newString(result, 0, resultOffset);
     }
 
+    // Slow path of toCaseFold: folds into a UTF16 buffer that can grow for 1:M mappings.
+    private static String toCaseFoldEx(String str, byte[] value, int first) {
+        byte[] out = StringUTF16.newBytesFor(value.length);
+        int pos = 0;
+        // chars before 'first' are copied through unchanged
+        for (; pos < first; pos++) {
+            StringUTF16.putChar(out, pos, value[pos] & 0xff);
+        }
+        for (int i = first; i < value.length; i++) {
+            char[] folded = CaseFolding.fold(value[i] & 0xff);
+            if (folded.length != 1) {
+                // resize the buffer for the extra chars of a one-to-many folding
+                byte[] grown = StringUTF16.newBytesFor((out.length >> 1) + folded.length - 1);
+                System.arraycopy(out, 0, grown, 0, pos << 1);
+                out = grown;
+            }
+            for (char c : folded) {
+                StringUTF16.putChar(out, pos++, c);
+            }
+        }
+        return StringUTF16.newString(out, 0, pos);
+    }
+
+    // Case-folds a Latin1 string; returns str itself when no character needs to change.
+    public static String toCaseFold(String str, byte[] value) {
+        final int len = value.length;
+        int first = 0;
+        // Skip the leading run that CaseFolding.isFolded accepts as is
+        while (first < len && CaseFolding.isFolded(value[first] & 0xff)) {
+            first++;
+        }
+        if (first == len) {
+            return str;
+        }
+        byte[] result = new byte[len];
+        System.arraycopy(value, 0, result, 0, first);  // Just copy the first few
+        // fold characters
+        for (int i = first; i < len; i++) {
+            var folded = CaseFolding.fold(value[i] & 0xff);
+            if (folded.length > 1 || !canEncode(folded[0])) {
+                // expands or fails canEncode: redo from 'first' on the UTF16-building path
+                return toCaseFoldEx(str, value, first);
+            }
+            result[i] = (byte)(folded[0] & 0xff);
+        }
+        return new String(result, LATIN1);
+    }
+
     public static String trim(byte[] value) {
         int len = value.length;
         int st = 0;