Skip to content

Commit 61fd54f

Browse files
committed
Some logic to IcuHelpers file.
1 parent d24ac9d commit 61fd54f

File tree

2 files changed

+325
-315
lines changed

2 files changed

+325
-315
lines changed

Jint/Native/Intl/IcuHelpers.cs

Lines changed: 324 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,324 @@
1+
using System.Text;
2+
using Jint.Runtime;
3+
4+
namespace Jint.Native.Intl
5+
{
6+
/// <summary>
7+
/// ICU interop + ECMA-402 canonicalization helpers shared by Intl built-ins.
8+
/// </summary>
9+
internal static class IcuHelpers
10+
{
11+
12+
/// <summary>
/// Converts an ICU locale ID to a BCP-47 language tag; intended as the equivalent of
/// WebKit's languageTagForLocaleID(localeID, isImmortal=false).
/// Calls ICU uloc_toLanguageTag(localeId, strict=false) and then runs the result through
/// <see cref="CanonicalizeUnicodeExtensionsAfterIcu"/>.
/// </summary>
/// <param name="localeId">ICU locale ID. Null or empty input yields <see cref="string.Empty"/>.</param>
/// <returns>The language tag produced by ICU after Unicode-extension cleanup.</returns>
/// <remarks>
/// Reports failure through Throw.ArgumentException when ICU leaves a non-zero status
/// or produces no output.
/// </remarks>
public static string LanguageTagForLocaleId(string localeId)
{
    if (string.IsNullOrEmpty(localeId))
    {
        return string.Empty;
    }

    var status = ICU.UErrorCode.U_ZERO_ERROR;

    // First pass with a reasonable buffer.
    byte[] buf = new byte[256];
    int len = ICU.uloc_toLanguageTag(localeId, buf, buf.Length, strict: false, ref status);

    // Retry whenever the output did not fit together with its NUL terminator.
    // ICU reports a non-zero status (U_STRING_NOT_TERMINATED_WARNING) when the output
    // exactly fills the buffer, which the strict status check below would treat as a
    // failure, so the retry buffer reserves one extra byte.
    if (len >= buf.Length)
    {
        buf = new byte[len + 1];
        status = ICU.UErrorCode.U_ZERO_ERROR;
        len = ICU.uloc_toLanguageTag(localeId, buf, buf.Length, strict: false, ref status);
    }

    if (status != ICU.UErrorCode.U_ZERO_ERROR || len <= 0)
    {
        Throw.ArgumentException($"ICU uloc_toLanguageTag failed for '{localeId}' (status={status}).");
    }

    // ICU writes UTF-8 bytes; decode exactly the returned length.
    string tag = Encoding.UTF8.GetString(buf, 0, len);

    // Apply the same Unicode-extension cleanup used for canonicalized tags.
    return CanonicalizeUnicodeExtensionsAfterIcu(tag);
}
45+
46+
// Unicode extension keys whose boolean-true value is left out of the output:
// CanonicalizeUnicodeExtensionsAfterIcu drops a "yes" or "true" value that follows one of
// these keys and never synthesizes "-yes" for them when they appear bare.
// "ca" is listed so that a bare "-u-ca" stays bare. Lookups ignore case.
private static readonly HashSet<string> s_trueDroppableKeys =
    new HashSet<string>(StringComparer.OrdinalIgnoreCase)
    {
        "kb",
        "kc",
        "kh",
        "kk",
        "kn",
        "ca",
    };
53+
54+
55+
// Replaces a known subdivision alias (as used for rg/sd values) with its preferred code.
// Matching ignores case; input that is not a known alias is returned exactly as given.
private static string CanonicalizeSubdivision(string value)
{
    return value.ToLowerInvariant() switch
    {
        "no23" => "no50",
        "cn11" => "cnbj",
        "cz10a" => "cz110",
        "fra" => "frges",
        "frg" => "frges",
        "lud" => "lucl", // test262 prefers the first entry in the replacement list
        _ => value,
    };
}
69+
70+
// Replaces a known time zone type alias (as used for tz values) with its preferred identifier.
// Matching ignores case; input that is not a known alias is returned exactly as given.
private static string CanonicalizeTimeZoneType(string value)
{
    return value.ToLowerInvariant() switch
    {
        "cnckg" => "cnsha",  // deprecated -> preferred
        "eire" => "iedub",   // alias -> canonical
        "est" => "papty",    // alias -> canonical
        "gmt0" => "gmt",     // alias -> canonical
        "uct" => "utc",      // alias -> canonical
        "zulu" => "utc",     // alias -> canonical
        "utcw05" => "papty", // short offset alias seen in test262
        _ => value,
    };
}
85+
86+
/// <summary>
/// Post-processes the Unicode ("-u-") extension of a language tag, modelled on WebKit's
/// canonicalizeUnicodeExtensionsAfterICULocaleCanonicalization():
/// - Locates the first "-u-" and the end of that extension (the next single-character
///   subtag, or the end of the tag).
/// - Re-emits the extension with per-key normalization:
///   * Keys kb/kc/kh/kk/kn/ca: a "yes" or "true" value is dropped; a bare key stays bare.
///   * All other keys: "yes" is kept, "true" is rewritten to "yes", and a key without any
///     value gets "-yes" appended.
///   * "rg"/"sd": subdivision aliases are replaced (no23→no50, ...).
///   * "tz": time zone aliases are replaced (eire→iedub, est→papty, ...).
/// - Everything outside the extension is copied verbatim.
/// A "-u-" that only occurs after the private-use singleton "-x-" is private-use data and
/// the tag is returned unchanged in that case.
/// </summary>
/// <param name="tag">Language tag to clean up. Null or empty input is returned as is.</param>
/// <returns>The tag with its Unicode extension normalized.</returns>
public static string CanonicalizeUnicodeExtensionsAfterIcu(string tag)
{
    if (string.IsNullOrEmpty(tag))
    {
        return tag;
    }

    int extensionIndex = tag.IndexOf("-u-", StringComparison.OrdinalIgnoreCase);
    if (extensionIndex < 0)
    {
        return tag;
    }

    // Private-use subtags run from the "x" singleton to the end of the tag and are opaque.
    // If "x" comes before the "-u-" match, the match is not a Unicode extension and must
    // not be rewritten.
    int privateUseIndex = tag.StartsWith("x-", StringComparison.OrdinalIgnoreCase)
        ? 0
        : tag.IndexOf("-x-", StringComparison.OrdinalIgnoreCase);
    if (privateUseIndex >= 0 && privateUseIndex < extensionIndex)
    {
        return tag;
    }

    // Determine the end of the -u- block (before the next singleton like -x-).
    int extensionLength = tag.Length - extensionIndex;
    int end = extensionIndex + 3;
    while (end < tag.Length)
    {
        int dash = tag.IndexOf('-', end);
        if (dash < 0)
        {
            break;
        }

        // "-?-": a one-character subtag starts the next extension.
        if (dash + 2 < tag.Length && tag[dash + 2] == '-')
        {
            extensionLength = dash - extensionIndex;
            break;
        }

        end = dash + 1;
    }

    var result = new StringBuilder(tag.Length + 8);

    // Copy everything up to and including "-u".
    result.Append(tag, 0, extensionIndex + 2);

    // Process the "-u-..." segment subtag by subtag.
    string extension = tag.Substring(extensionIndex, extensionLength);
    string[] parts = extension.Split('-'); // parts[0] == "", parts[1] == "u"
    int i = 2;

    while (i < parts.Length)
    {
        string subtag = parts[i];
        if (subtag.Length == 0)
        {
            i++;
            continue;
        }

        // Emit the key or attribute itself.
        result.Append('-');
        result.Append(subtag);

        if (subtag.Length != 2)
        {
            // Attribute (or malformed subtag): passed through unchanged.
            i++;
            continue;
        }

        // Two-character subtag: a key. Its value is every following subtag up to the
        // next key (length 2) or empty part.
        string key = subtag;
        bool keyIsDroppableTrue = s_trueDroppableKeys.Contains(key);

        int valueStart = i + 1;
        int valueEnd = valueStart;
        while (valueEnd < parts.Length && parts[valueEnd].Length != 2 && parts[valueEnd].Length != 0)
        {
            valueEnd++;
        }

        bool emittedAnyValue = false;

        for (int v = valueStart; v < valueEnd; v++)
        {
            string value = parts[v];

            bool isYes = value.Equals("yes", StringComparison.OrdinalIgnoreCase);
            bool isTrue = value.Equals("true", StringComparison.OrdinalIgnoreCase);

            if ((isYes || isTrue) && keyIsDroppableTrue)
            {
                // Boolean true is dropped for droppable keys.
                continue;
            }

            if (isTrue)
            {
                // Non-droppable key: "true" is written as "yes" ("yes" itself is kept as given).
                value = "yes";
            }

            // Per-key aliasing.
            if (key.Equals("rg", StringComparison.OrdinalIgnoreCase) ||
                key.Equals("sd", StringComparison.OrdinalIgnoreCase))
            {
                value = CanonicalizeSubdivision(value);
            }
            else if (key.Equals("tz", StringComparison.OrdinalIgnoreCase))
            {
                value = CanonicalizeTimeZoneType(value);
            }

            result.Append('-');
            result.Append(value);
            emittedAnyValue = true;
        }

        // If no value was emitted for a non-droppable key, synthesize "-yes".
        if (!emittedAnyValue && !keyIsDroppableTrue)
        {
            result.Append("-yes");
        }

        i = valueEnd;
    }

    // Append whatever follows the -u- block (further extensions / private use).
    int tailStart = extensionIndex + extensionLength;
    result.Append(tag, tailStart, tag.Length - tailStart);
    return result.ToString();
}
215+
216+
/// <summary>
/// Validates <paramref name="tag"/> as a BCP-47 language tag via ICU and returns a canonical tag.
/// The tag is parsed with uloc_forLanguageTag, canonicalized with uloc_canonicalize,
/// converted back with uloc_toLanguageTag, and finally passed through
/// <see cref="CanonicalizeUnicodeExtensionsAfterIcu"/> and <see cref="FixKnownLanguageAliases"/>.
/// </summary>
/// <param name="realm">Realm passed to Throw.RangeError when reporting an error.</param>
/// <param name="tag">The language tag to validate and canonicalize.</param>
/// <returns>The canonicalized language tag.</returns>
/// <remarks>
/// Reports a RangeError through Throw.RangeError when ICU rejects the tag, does not
/// consume it completely, or fails to convert the canonical locale ID back to a tag.
/// </remarks>
public static string CanonicalizeUnicodeLocaleIdOrThrow(Realm realm, string tag)
{
    // Each ICU call below is retried with a buffer of (required length + 1) whenever the
    // output did not fit together with its NUL terminator. ICU leaves a non-zero status
    // (U_STRING_NOT_TERMINATED_WARNING) when the output exactly fills the buffer, which the
    // strict status checks in this method would otherwise treat as a failure.

    // 1) Validate & parse BCP-47 -> ICU locale ID
    var status = ICU.UErrorCode.U_ZERO_ERROR;
    byte[] locBuf = new byte[128];
    int parsed;
    int need = ICU.uloc_forLanguageTag(tag, locBuf, locBuf.Length, out parsed, ref status);

    if (need >= locBuf.Length)
    {
        locBuf = new byte[need + 1];
        status = ICU.UErrorCode.U_ZERO_ERROR;
        need = ICU.uloc_forLanguageTag(tag, locBuf, locBuf.Length, out parsed, ref status);
    }

    // ICU must have consumed the entire tag; a partial parse means trailing garbage.
    if (status != ICU.UErrorCode.U_ZERO_ERROR || parsed != tag.Length || need <= 0)
    {
        // RangeError per spec
        Throw.RangeError(realm, $"invalid language tag: {tag}");
    }

    string icuLocaleId = Encoding.UTF8.GetString(locBuf, 0, need);

    // 2) Canonicalize the ICU locale ID (this applies CLDR language/region/script aliases, e.g. cmn->zh)
    status = ICU.UErrorCode.U_ZERO_ERROR;
    byte[] canonLoc = new byte[Math.Max(need + 16, 256)];
    int canonLen = ICU.uloc_canonicalize(icuLocaleId, canonLoc, canonLoc.Length, ref status);

    if (canonLen >= canonLoc.Length)
    {
        canonLoc = new byte[canonLen + 1];
        status = ICU.UErrorCode.U_ZERO_ERROR;
        canonLen = ICU.uloc_canonicalize(icuLocaleId, canonLoc, canonLoc.Length, ref status);
    }

    // Fall back to the uncanonicalized locale ID if ICU reported a non-zero status
    // or produced no output.
    string icuCanonical = (status == ICU.UErrorCode.U_ZERO_ERROR && canonLen > 0)
        ? Encoding.UTF8.GetString(canonLoc, 0, canonLen)
        : icuLocaleId;

    // 3) Convert canonical ICU locale ID -> canonical BCP-47 tag
    status = ICU.UErrorCode.U_ZERO_ERROR;
    byte[] outBuf = new byte[256];
    int len = ICU.uloc_toLanguageTag(icuCanonical, outBuf, outBuf.Length, strict: false, ref status);

    if (len >= outBuf.Length)
    {
        outBuf = new byte[len + 1];
        status = ICU.UErrorCode.U_ZERO_ERROR;
        len = ICU.uloc_toLanguageTag(icuCanonical, outBuf, outBuf.Length, strict: false, ref status);
    }

    if (status != ICU.UErrorCode.U_ZERO_ERROR || len <= 0)
    {
        Throw.RangeError(realm, $"failed to canonicalize language tag: {tag}");
    }

    var canonical = Encoding.UTF8.GetString(outBuf, 0, len);

    // WebKit-style cleanup for "-u-…-true"
    canonical = CanonicalizeUnicodeExtensionsAfterIcu(canonical);

    // Fallback for ICU builds that don't alias cmn->zh
    canonical = FixKnownLanguageAliases(canonical);

    return canonical;
}
284+
285+
// Rewrites a deprecated primary language subtag to its preferred form, for ICU builds
// that do not apply these aliases themselves:
//   cmn -> zh (Mandarin -> Chinese), ji -> yi, in -> id
// Only the text before the first '-' is examined (case-insensitively); the remainder of
// the tag is carried over untouched. Null/empty input and non-aliased tags come back as is.
private static string FixKnownLanguageAliases(string canonicalTag)
{
    if (string.IsNullOrEmpty(canonicalTag))
    {
        return canonicalTag;
    }

    int separator = canonicalTag.IndexOf('-');
    string primary = separator < 0 ? canonicalTag : canonicalTag.Substring(0, separator);

    string replacement;
    if (string.Equals(primary, "cmn", StringComparison.OrdinalIgnoreCase))
    {
        replacement = "zh";
    }
    else if (string.Equals(primary, "ji", StringComparison.OrdinalIgnoreCase))
    {
        replacement = "yi";
    }
    else if (string.Equals(primary, "in", StringComparison.OrdinalIgnoreCase))
    {
        replacement = "id";
    }
    else
    {
        // Not a known alias: leave the tag as-is.
        return canonicalTag;
    }

    // Re-attach the remainder, which still carries its leading '-'.
    return separator < 0 ? replacement : replacement + canonicalTag.Substring(separator);
}
323+
}
324+
}

0 commit comments

Comments
 (0)