Skip to content

Commit 17f175d

Browse files
committed
ICU-22851 A named-element is an element, not a set
1 parent 5b36180 commit 17f175d

File tree

2 files changed

+58
-20
lines changed

2 files changed

+58
-20
lines changed

icu4c/source/common/uniset_props.cpp

Lines changed: 37 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -252,6 +252,10 @@ class UnicodeSet::Lexer {
252252
return U_SUCCESS(errorCode_) && category_ == STRING_LITERAL;
253253
}
254254

255+
bool isNamedElement() const {
256+
return U_SUCCESS(errorCode_) && category_ == NAMED_ELEMENT;
257+
}
258+
255259
bool isBracketedElement() const {
256260
return U_SUCCESS(errorCode_) && category_ == BRACKETED_ELEMENT;
257261
}
@@ -267,20 +271,19 @@ class UnicodeSet::Lexer {
267271

268272
std::optional<UChar32> codePoint() const {
269273
if (U_SUCCESS(errorCode_) && (category_ == LITERAL_ELEMENT || category_ == ESCAPED_ELEMENT ||
270-
category_ == BRACKETED_ELEMENT)) {
274+
category_ == BRACKETED_ELEMENT || category_ == NAMED_ELEMENT)) {
271275
return string_.char32At(0);
272276
}
273277
return std::nullopt;
274278
}
275279

276-
// If `*this` is a valid property-query, named-element, or set-valued-variable, returns the
277-
// set represented by this lexical element, which lives at least as long as `*this`. Null
278-
// otherwise.
280+
// If `*this` is a valid property-query or set-valued-variable, returns the set represented
281+
// by this lexical element, which lives at least as long as `*this`. Null otherwise.
279282
const UnicodeSet *set() const {
280283
if (U_FAILURE(errorCode_)) {
281284
return nullptr;
282285
}
283-
if (category_ == PROPERTY_QUERY || category_ == NAMED_ELEMENT || category_ == VARIABLE) {
286+
if (category_ == PROPERTY_QUERY || category_ == VARIABLE) {
284287
if (precomputedSet_ != nullptr) {
285288
return precomputedSet_;
286289
} else {
@@ -453,11 +456,11 @@ class UnicodeSet::Lexer {
453456
if ((first == u'[' && second == u':') ||
454457
(first == u'\\' && (second == u'p' || second == u'P' || second == u'N'))) {
455458
if (second == u'N') {
456-
UnicodeSet const queryResult = scanNamedElementBrackets(errorCode);
459+
UChar32 const queryResult = scanNamedElementBrackets(errorCode);
457460
return LexicalElement(
458-
LexicalElement::NAMED_ELEMENT, {}, getPos(), errorCode,
461+
LexicalElement::NAMED_ELEMENT, UnicodeString(queryResult), getPos(), errorCode,
459462
/*precomputedSet=*/nullptr,
460-
/*set=*/std::move(queryResult),
463+
/*set=*/{},
461464
std::u16string_view(pattern_).substr(start, parsePosition_.getIndex() - start));
462465
} else {
463466
UnicodeSet queryResult = scanPropertyQueryAfterStart(first, second, start, errorCode);
@@ -523,8 +526,24 @@ class UnicodeSet::Lexer {
523526
UChar32 next;
524527
int32_t codePointCount = 0;
525528
while (!chars_.atEnd() && U_SUCCESS(errorCode)) {
526-
// TODO(egg): Propose making this space-sensitive and recognizing named-element.
527-
next = chars_.next(charsOptions_, escaped, errorCode);
529+
// TODO(egg): Propose making this space-sensitive.
530+
const RuleCharacterIterator::Pos beforeNext = getPos();
531+
next = chars_.next(charsOptions_ & ~RuleCharacterIterator::PARSE_ESCAPES,
532+
unusedEscaped, errorCode);
533+
if (next == u'\\') {
534+
if (chars_.next(charsOptions_ & ~(RuleCharacterIterator::PARSE_ESCAPES |
535+
RuleCharacterIterator::SKIP_WHITESPACE),
536+
unusedEscaped, errorCode) == u'N') {
537+
next = scanNamedElementBrackets(errorCode);
538+
escaped = true;
539+
} else {
540+
chars_.setPos(beforeNext);
541+
// Parse the escape.
542+
next = chars_.next(charsOptions_, escaped, errorCode);
543+
}
544+
} else {
545+
escaped = false;
546+
}
528547
if (!escaped && next == u'}') {
529548
return LexicalElement(
530549
codePointCount == 1 ? LexicalElement::BRACKETED_ELEMENT
@@ -551,8 +570,7 @@ class UnicodeSet::Lexer {
551570
}
552571
}
553572

554-
// TODO(egg): Return UChar32 per ICU-TC 2025-12-11.
555-
UnicodeSet scanNamedElementBrackets(UErrorCode &errorCode) {
573+
UChar32 scanNamedElementBrackets(UErrorCode &errorCode) {
556574
UBool unusedEscaped;
557575
const UChar32 open = chars_.next(charsOptions_ & ~(RuleCharacterIterator::PARSE_ESCAPES |
558576
RuleCharacterIterator::SKIP_WHITESPACE),
@@ -571,7 +589,7 @@ class UnicodeSet::Lexer {
571589
errorCode);
572590
result.setPattern(
573591
pattern_.tempSubStringBetween(nameStart - 3, parsePosition_.getIndex()));
574-
return result;
592+
return result.charAt(0);
575593
}
576594
}
577595
}
@@ -1058,6 +1076,11 @@ void UnicodeSet::parseElements(Lexer &lexer,
10581076
// Element ::= RangeElement
10591077
// | string-literal
10601078
// | bracketed-element
1079+
// RangeElement ::= literal-element
1080+
// | escaped-element
1081+
// | named-element
1082+
// codePoint().has_value() holds on a lexical element if it is a RangeElement or a bracketed-element (which
1083+
// should become a RangeElement, but this will take another ICU proposal).
10611084
if (lexer.lookahead().isBracketedElement() || lexer.lookahead().isStringLiteral()) {
10621085
add(*lexer.lookahead().element());
10631086
rebuiltPat.append(u'{');
@@ -1107,7 +1130,7 @@ void UnicodeSet::parseElements(Lexer &lexer,
11071130
lexer.lookahead2().debugString(),
11081131
lexer, ec);
11091132
}
1110-
} else if (lexer.lookahead().codePoint().has_value()) {
1133+
} else if (lexer.lookahead().codePoint().has_value() && !lexer.lookahead().isBracketedElement()) {
11111134
last = *lexer.lookahead().codePoint();
11121135
} else {
11131136
U_UNICODESET_RETURN_WITH_PARSE_ERROR("RangeElement", lexer.lookahead().debugString(), lexer, ec);

icu4c/source/test/intltest/usettest.cpp

Lines changed: 21 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1886,8 +1886,8 @@ void UnicodeSetTest::TestSymbolTable() {
18861886
U_ZERO_ERROR,
18871887
u"[{}]"},
18881888
// Check that we don’t recursively expand variables.
1889-
// In ICU79 and earlier, this would have been U_ZERO_ERROR with [[\$y][\$x]]; but \$y is
1890-
// a sequence of elements, so it is not a valid variable value.
1889+
// In ICU 78 and earlier, this would have been U_ZERO_ERROR with [[\$y][\$x]]; but \$y
1890+
// is a sequence of elements, so it is not a valid variable value.
18911891
{{{u"x", u"$y"}, {u"y", u"$x"}},
18921892
u"[[$x][$y]]",
18931893
U_MALFORMED_VARIABLE_DEFINITION,
@@ -4706,7 +4706,7 @@ void UnicodeSetTest::TestToPatternOutput() {
47064706
// TODO(egg): PDUTS #61 disallows the space before ^.
47074707
{uR"([: ^general category = punctuation :])", uR"([: ^general category = punctuation :])"},
47084708
{uR"(\P{ gc = punctuation })", uR"(\P{ gc = punctuation })"},
4709-
{uR"(\N{ latin small letter a })", uR"(\N{ latin small letter a })"},
4709+
{uR"([\N{ latin small letter a }])", uR"([a])"},
47104710
// If there is any Restriction among the terms, its syntax is mostly as-is (spaces are
47114711
// still eliminated), with the exception that an initial UnescapedHyphenMinus gets escaped.
47124712
// This is applied recursively, so innermost ranges-only UnicodeSets get normalized.
@@ -4722,9 +4722,15 @@ void UnicodeSetTest::TestToPatternOutput() {
47224722
// Escapes are removed even when the syntax is preserved.
47234723
{uR"([ { \x5A e i c h e n k e t t e } [] \x5Aeichenmenge ])",
47244724
u"[{Zeichenkette}[]Zeichenmenge]"},
4725-
// A named-element is currently a nested set, so it is preserved and causes the syntax to be
4726-
// preserved.
4727-
{uR"([ \N{LATIN CAPITAL LETTER Z}eichenmenge ])", uR"([\N{LATIN CAPITAL LETTER Z}eichenmenge])"},
4725+
// In ICU 78 and earlier, a named-element was a nested set, so it was preserved and
4726+
// caused the syntax to be preserved. Now it is treated like an escape.
4727+
{uR"([ \N{LATIN CAPITAL LETTER Z}eichenmenge ])", uR"([Zceg-imn])"},
4728+
// This was ill-formed in ICU 78 and earlier (in a convoluted way:
4729+
// {\N{LATIN CAPITAL LETTER Z} was a well-formed string literal, but then the second }
4730+
// was unpaired).
4731+
{uR"([ {\N{LATIN CAPITAL LETTER Z}eichenkette} ])", uR"([{Zeichenkette}])"},
4732+
// This used to be equal to [A] in ICU 78 and earlier.
4733+
{uR"([ \N{LATIN CAPITAL LETTER A} - \N{LATIN CAPITAL LETTER Z} ])", uR"([A-Z])"},
47284734
// An anchor also causes the syntax to be preserved.
47294735
{u"[ d-z a-c $ ]", u"[d-za-c$]"},
47304736
{u"[ - a-c d-z $ ]", uR"([\-a-cd-z$])"},
@@ -4788,6 +4794,15 @@ void UnicodeSetTest::TestParseErrors() {
47884794
u"[[a]&z]",
47894795
// "Missing ']'".
47904796
u"[a-z",
4797+
// This was a well-formed string in ICU 78 and earlier, with the value
4798+
// "N{LATINCAPITALLETTERZ".
4799+
uR"([{\N{LATIN CAPITAL LETTER Z}])",
4800+
// This was a well-formed set in ICU 78 and earlier; now it must be enclosed in square
4801+
// brackets.
4802+
uR"(\N{ latin small letter a })",
4803+
// TODO(egg): Well-formed in Java, ill-formed in ICU4C in ICU 78 and earlier.
4804+
u"[a-{z}]",
4805+
u"[{a}-z]",
47914806
}) {
47924807
UErrorCode errorCode = U_ZERO_ERROR;
47934808
const UnicodeSet set(expression, errorCode);

0 commit comments

Comments
 (0)