ICU-22851 A named-element is an element, not a set

eggrobin · eggrobin · commit 17f175d38500 · 2025-12-29T23:22:23.000+01:00
diff --git a/icu4c/source/common/uniset_props.cpp b/icu4c/source/common/uniset_props.cpp
@@ -252,6 +252,10 @@ class UnicodeSet::Lexer {
             return U_SUCCESS(errorCode_) && category_ == STRING_LITERAL;
         }
 
+        bool isNamedElement() const {
+            return U_SUCCESS(errorCode_) && category_ == NAMED_ELEMENT;
+        }
+
         bool isBracketedElement() const {
             return U_SUCCESS(errorCode_) && category_ == BRACKETED_ELEMENT;
         }
@@ -267,20 +271,19 @@ class UnicodeSet::Lexer {
 
         std::optional<UChar32> codePoint() const {
             if (U_SUCCESS(errorCode_) && (category_ == LITERAL_ELEMENT || category_ == ESCAPED_ELEMENT ||
-                                          category_ == BRACKETED_ELEMENT)) {
+                                          category_ == BRACKETED_ELEMENT || category_ == NAMED_ELEMENT)) {
                 return string_.char32At(0);
             }
             return std::nullopt;
         }
 
-        // If `*this` is a valid property-query, named-element, or set-valued-variable, returns the
-        // set represented by this lexical element, which lives at least as long as `*this`.  Null
-        // otherwise.
+        // If `*this` is a valid property-query or set-valued-variable, returns the set represented
+        // by this lexical element, which lives at least as long as `*this`.  Null otherwise.
         const UnicodeSet *set() const {
             if (U_FAILURE(errorCode_)) {
                 return nullptr;
             }
-            if (category_ == PROPERTY_QUERY || category_ == NAMED_ELEMENT || category_ == VARIABLE) {
+            if (category_ == PROPERTY_QUERY || category_ == VARIABLE) {
                 if (precomputedSet_ != nullptr) {
                     return precomputedSet_;
                 } else {
@@ -453,11 +456,11 @@ class UnicodeSet::Lexer {
             if ((first == u'[' && second == u':') ||
                 (first == u'\\' && (second == u'p' || second == u'P' || second == u'N'))) {
                 if (second == u'N') {
-                    UnicodeSet const queryResult = scanNamedElementBrackets(errorCode);
+                    UChar32 const queryResult = scanNamedElementBrackets(errorCode);
                     return LexicalElement(
-                        LexicalElement::NAMED_ELEMENT, {}, getPos(), errorCode,
+                        LexicalElement::NAMED_ELEMENT, UnicodeString(queryResult), getPos(), errorCode,
                         /*precomputedSet=*/nullptr,
-                        /*set=*/std::move(queryResult),
+                        /*set=*/{},
                         std::u16string_view(pattern_).substr(start, parsePosition_.getIndex() - start));
                 } else {
                     UnicodeSet queryResult = scanPropertyQueryAfterStart(first, second, start, errorCode);
@@ -523,8 +526,24 @@ class UnicodeSet::Lexer {
             UChar32 next;
             int32_t codePointCount = 0;
             while (!chars_.atEnd() && U_SUCCESS(errorCode)) {
-                // TODO(egg): Propose making this space-sensitive and recognizing named-element.
-                next = chars_.next(charsOptions_, escaped, errorCode);
+                // TODO(egg): Propose making this space-sensitive.
+                const RuleCharacterIterator::Pos beforeNext = getPos();
+                next = chars_.next(charsOptions_ & ~RuleCharacterIterator::PARSE_ESCAPES,
+                                   unusedEscaped, errorCode);
+                if (next == u'\\') {
+                    if (chars_.next(charsOptions_ & ~(RuleCharacterIterator::PARSE_ESCAPES |
+                                                      RuleCharacterIterator::SKIP_WHITESPACE),
+                                    unusedEscaped, errorCode) == u'N') {
+                        next = scanNamedElementBrackets(errorCode);
+                        escaped = true;
+                    } else {
+                        chars_.setPos(beforeNext);
+                        // Parse the escape.
+                        next = chars_.next(charsOptions_, escaped, errorCode);
+                    }
+                } else {
+                  escaped = false;
+                }
                 if (!escaped && next == u'}') {
                     return LexicalElement(
                         codePointCount == 1 ? LexicalElement::BRACKETED_ELEMENT
@@ -551,8 +570,7 @@ class UnicodeSet::Lexer {
         }
     }
 
-    // TODO(egg): Return UChar32 per ICU-TC 2025-12-11.
-    UnicodeSet scanNamedElementBrackets(UErrorCode &errorCode) {
+    UChar32 scanNamedElementBrackets(UErrorCode &errorCode) {
         UBool unusedEscaped;
         const UChar32 open = chars_.next(charsOptions_ & ~(RuleCharacterIterator::PARSE_ESCAPES |
                                                            RuleCharacterIterator::SKIP_WHITESPACE),
@@ -571,7 +589,7 @@ class UnicodeSet::Lexer {
                         errorCode);
                     result.setPattern(
                         pattern_.tempSubStringBetween(nameStart - 3, parsePosition_.getIndex()));
-                    return result;
+                    return result.charAt(0);
                 }
             }
         }
@@ -1058,6 +1076,11 @@ void UnicodeSet::parseElements(Lexer &lexer,
     // Element      ::= RangeElement
     //                | string-literal
     //                | bracketed-element
+    // RangeElement ::= literal-element
+    //                | escaped-element
+    //                | named-element
+    // codePoint().has_value() is true for a lexical element if it is a RangeElement or a
+    // bracketed-element (which should become a RangeElement, but this will take another ICU proposal).
     if (lexer.lookahead().isBracketedElement() || lexer.lookahead().isStringLiteral()) {
         add(*lexer.lookahead().element());
         rebuiltPat.append(u'{');
@@ -1107,7 +1130,7 @@ void UnicodeSet::parseElements(Lexer &lexer,
                                                      lexer.lookahead2().debugString(),
                                                  lexer, ec);
         }
-    } else if (lexer.lookahead().codePoint().has_value()) {
+    } else if (lexer.lookahead().codePoint().has_value() && !lexer.lookahead().isBracketedElement()) {
         last = *lexer.lookahead().codePoint();
     } else {
         U_UNICODESET_RETURN_WITH_PARSE_ERROR("RangeElement", lexer.lookahead().debugString(), lexer, ec);
diff --git a/icu4c/source/test/intltest/usettest.cpp b/icu4c/source/test/intltest/usettest.cpp
@@ -1886,8 +1886,8 @@ void UnicodeSetTest::TestSymbolTable() {
              U_ZERO_ERROR,
              u"[{}]"},
             // Check that we don’t recursively expand variables.
-            // In ICU79 and earlier, this would have been U_ZERO_ERROR with [[\$y][\$x]]; but \$y is
-            // a sequence of elements, so it is not a valid variable value.
+            // In ICU 78 and earlier, this would have been U_ZERO_ERROR with [[\$y][\$x]]; but \$y
+            // is a sequence of elements, so it is not a valid variable value.
             {{{u"x", u"$y"}, {u"y", u"$x"}},
              u"[[$x][$y]]",
              U_MALFORMED_VARIABLE_DEFINITION,
@@ -4706,7 +4706,7 @@ void UnicodeSetTest::TestToPatternOutput() {
             // TODO(egg): PDUTS #61 disallows the space before ^.
             {uR"([: ^general category = punctuation :])", uR"([: ^general category = punctuation :])"},
             {uR"(\P{ gc = punctuation })", uR"(\P{ gc = punctuation })"},
-            {uR"(\N{ latin small letter a })", uR"(\N{ latin small letter a })"},
+            {uR"([\N{ latin small letter a }])", uR"([a])"},
             // If there is any Restriction among the terms, its syntax is mostly as-is (spaces are
             // still eliminated), with the exception that an initial UnescapedHyphenMinus gets escaped.
             // This is applied recursively, so innermost ranges-only UnicodeSets get normalized.
@@ -4722,9 +4722,15 @@ void UnicodeSetTest::TestToPatternOutput() {
             // Escapes are removed even when the syntax is preserved.
             {uR"([ { \x5A e i c h e n k e t t e } [] \x5Aeichenmenge ])",
             u"[{Zeichenkette}[]Zeichenmenge]"},
-            // A named-element is currently a nested set, so it is preserved and causes the syntax to be
-            // preserved.
-            {uR"([ \N{LATIN CAPITAL LETTER Z}eichenmenge ])", uR"([\N{LATIN CAPITAL LETTER Z}eichenmenge])"},
+            // In ICU 78 and earlier, a named-element was a nested set, so it was preserved and
+            // caused the syntax to be preserved.  Now it is treated like an escape.
+            {uR"([ \N{LATIN CAPITAL LETTER Z}eichenmenge ])", uR"([Zceg-imn])"},
+            // This was ill-formed in ICU 78 and earlier (in a convoluted way:
+            // {\N{LATIN CAPITAL LETTER Z} was a well-formed string literal, but then the second }
+            // was unpaired).
+            {uR"([ {\N{LATIN CAPITAL LETTER Z}eichenkette} ])", uR"([{Zeichenkette}])"},
+            // In ICU 78 and earlier, this was equal to [A].
+            {uR"([ \N{LATIN CAPITAL LETTER A} - \N{LATIN CAPITAL LETTER Z} ])", uR"([A-Z])"},
             // An anchor also causes the syntax to be preserved.
             {u"[ d-z a-c $ ]", u"[d-za-c$]"},
             {u"[ - a-c d-z $ ]", uR"([\-a-cd-z$])"},
@@ -4788,6 +4794,15 @@ void UnicodeSetTest::TestParseErrors() {
             u"[[a]&z]",
             // "Missing ']'".
             u"[a-z",
+            // This was a well-formed string in ICU 78 and earlier, with the value
+            // "N{LATINCAPITALLETTERZ".
+            uR"([{\N{LATIN CAPITAL LETTER Z}])",
+            // This was a well-formed set in ICU 78 and earlier; now it must be enclosed in square
+            // brackets.
+            uR"(\N{ latin small letter a })",
+            // TODO(egg): Well-formed in Java, ill-formed in ICU4C in ICU 78 and earlier.
+            u"[a-{z}]",
+            u"[{a}-z]",
         }) {
         UErrorCode errorCode = U_ZERO_ERROR;
         const UnicodeSet set(expression, errorCode);