|
| 1 | +using System.Text; |
| 2 | +using Jint.Runtime; |
| 3 | + |
| 4 | +namespace Jint.Native.Intl |
| 5 | +{ |
| 6 | + /// <summary> |
| 7 | + /// ICU interop + ECMA-402 canonicalization helpers shared by Intl built-ins. |
| 8 | + /// </summary> |
| 9 | + internal static class IcuHelpers |
| 10 | + { |
| 11 | + |
/// <summary>
/// Converts an ICU locale ID into a BCP 47 language tag.
/// Equivalent to WebKit's languageTagForLocaleID(localeID, isImmortal=false):
/// calls ICU uloc_toLanguageTag(localeId, strict=false) and then passes the result through
/// <see cref="CanonicalizeUnicodeExtensionsAfterIcu"/> for Unicode extension cleanup.
/// </summary>
/// <param name="localeId">ICU locale ID to convert; null or empty yields an empty string.</param>
/// <returns>The cleaned-up language tag, or <see cref="string.Empty"/> for null/empty input.</returns>
/// <remarks>
/// Reports failure through Throw.ArgumentException when ICU leaves a non-zero status
/// or returns a non-positive length.
/// </remarks>
public static string LanguageTagForLocaleId(string localeId)
{
    if (string.IsNullOrEmpty(localeId))
    {
        return string.Empty;
    }

    var status = ICU.UErrorCode.U_ZERO_ERROR;

    // First pass with a reasonable buffer.
    byte[] buf = new byte[256];
    int len = ICU.uloc_toLanguageTag(localeId, buf, buf.Length, strict: false, ref status);

    // Retry when the tag plus its NUL terminator does not fit. ICU reports a
    // "string not terminated" warning when the output exactly fills the buffer, and the
    // status check below rejects every non-zero status, so the retry buffer must be
    // one byte larger than the reported length (and len == capacity must also retry).
    if (len >= buf.Length)
    {
        buf = new byte[len + 1];
        status = ICU.UErrorCode.U_ZERO_ERROR;
        len = ICU.uloc_toLanguageTag(localeId, buf, buf.Length, strict: false, ref status);
    }

    if (status != ICU.UErrorCode.U_ZERO_ERROR || len <= 0)
    {
        Throw.ArgumentException($"ICU uloc_toLanguageTag failed for '{localeId}' (status={status}).");
    }

    // ICU writes UTF-8 bytes; decode exactly the returned length.
    string tag = Encoding.UTF8.GetString(buf, 0, len);

    // Apply the same extension cleanup WebKit applies.
    return CanonicalizeUnicodeExtensionsAfterIcu(tag);
}
| 45 | + |
// Unicode extension keys for which a boolean-true value is elided in canonical form:
// "-u-<key>-yes" and "-u-<key>-true" both collapse to a bare "-u-<key>".
// "ca" is listed too, so that a bare "-u-ca" never gets "-yes" synthesized for it.
// Membership checks ignore case.
private static readonly HashSet<string> s_trueDroppableKeys =
    new HashSet<string>(StringComparer.OrdinalIgnoreCase)
    {
        "kb",
        "kc",
        "kh",
        "kk",
        "kn",
        "ca",
    };
| 53 | + |
| 54 | + |
// Replaces a known subdivision alias (as found in rg/sd extension values) with its
// canonical code. Matching ignores case; an unrecognized value is returned untouched,
// in its original casing.
private static string CanonicalizeSubdivision(string value)
{
    string lowered = value.ToLowerInvariant();
    return lowered switch
    {
        "no23" => "no50",
        "cn11" => "cnbj",
        "cz10a" => "cz110",
        "fra" => "frges",
        "frg" => "frges",
        "lud" => "lucl", // test262 prefers the first entry of the replacement list
        _ => value,
    };
}
| 69 | + |
// Replaces a known time zone type alias (as found in tz extension values) with its
// canonical identifier. Matching ignores case; an unrecognized value is returned
// untouched, in its original casing.
private static string CanonicalizeTimeZoneType(string value)
{
    string lowered = value.ToLowerInvariant();
    return lowered switch
    {
        "cnckg" => "cnsha",  // deprecated -> preferred
        "eire" => "iedub",   // alias -> canonical
        "est" => "papty",    // alias -> canonical
        "gmt0" => "gmt",     // alias -> canonical
        "uct" => "utc",      // alias -> canonical
        "zulu" => "utc",     // alias -> canonical
        "utcw05" => "papty", // short offset alias seen in test262
        _ => value,
    };
}
| 85 | + |
/// <summary>
/// Rewrites the "-u-" (Unicode) extension of a language tag produced by ICU.
/// Mirrors WebKit's canonicalizeUnicodeExtensionsAfterICULocaleCanonicalization():
/// - Finds the "-u-" extension and its end (before the next singleton subtag).
/// - Re-emits the extension with per-key normalization:
///   * Keys in s_trueDroppableKeys (kb/kc/kh/kk/kn/ca): "true" and "yes" values are dropped.
///   * All other keys: "yes" is kept, "true" is rewritten to "yes", and a key left
///     without any value gets "-yes" appended.
///   * "rg"/"sd": subdivision aliases are canonicalized (no23→no50, ...).
///   * "tz": time zone aliases are canonicalized (eire→iedub, est→papty, ...).
/// Attributes and everything outside the extension are preserved as-is.
/// </summary>
/// <param name="tag">Language tag to post-process; null or empty is returned unchanged.</param>
/// <returns>
/// The tag with its Unicode extension normalized, or the original tag when it has no
/// Unicode extension outside a private-use sequence.
/// </returns>
public static string CanonicalizeUnicodeExtensionsAfterIcu(string tag)
{
    if (string.IsNullOrEmpty(tag))
    {
        return tag;
    }

    // Subtags following the private-use singleton "x" are opaque: a "-u-" in there
    // (e.g. "en-x-u-ab") is not a Unicode extension and must not be rewritten.
    if (tag.StartsWith("x-", StringComparison.OrdinalIgnoreCase))
    {
        return tag;
    }

    int privateUseIndex = tag.IndexOf("-x-", StringComparison.OrdinalIgnoreCase);
    int searchLength = privateUseIndex < 0 ? tag.Length : privateUseIndex;

    int extensionIndex = tag.IndexOf("-u-", 0, searchLength, StringComparison.OrdinalIgnoreCase);
    if (extensionIndex < 0)
    {
        return tag;
    }

    // Locate the end of the -u- block: the dash that introduces the next singleton
    // ("-<one char>-", such as "-x-"), or the end of the tag when there is none.
    int extensionEnd = tag.Length;
    int scan = extensionIndex + 3;
    while (scan < tag.Length)
    {
        int dash = tag.IndexOf('-', scan);
        if (dash < 0)
        {
            break;
        }

        if (dash + 2 < tag.Length && tag[dash + 2] == '-')
        {
            extensionEnd = dash;
            break;
        }

        scan = dash + 1;
    }

    var result = new StringBuilder(tag.Length + 8);

    // Copy everything up to and including "-u"; each subtag below adds its own dash.
    result.Append(tag, 0, extensionIndex + 2);

    // Subtags between "-u-" and the end of the block: attributes, keys and values.
    int bodyStart = extensionIndex + 3;
    string[] subtags = tag.Substring(bodyStart, extensionEnd - bodyStart).Split('-');

    int i = 0;
    while (i < subtags.Length)
    {
        string subtag = subtags[i];
        if (subtag.Length == 0)
        {
            // Empty piece produced by consecutive dashes; nothing to emit.
            i++;
            continue;
        }

        // Emit the key or attribute.
        result.Append('-').Append(subtag);

        if (subtag.Length != 2)
        {
            // Attribute (or malformed subtag): passed through unchanged.
            i++;
            continue;
        }

        // Two-character subtag: a key. Its values run until the next key or empty piece.
        string key = subtag;
        bool keyIsDroppableTrue = s_trueDroppableKeys.Contains(key);
        bool isSubdivisionKey = key.Equals("rg", StringComparison.OrdinalIgnoreCase)
            || key.Equals("sd", StringComparison.OrdinalIgnoreCase);
        bool isTimeZoneKey = key.Equals("tz", StringComparison.OrdinalIgnoreCase);

        int valueStart = i + 1;
        int valueEnd = valueStart;
        while (valueEnd < subtags.Length && subtags[valueEnd].Length != 2 && subtags[valueEnd].Length != 0)
        {
            valueEnd++;
        }

        bool emittedAnyValue = false;

        for (int v = valueStart; v < valueEnd; v++)
        {
            string value = subtags[v];

            bool isTrue = value.Equals("true", StringComparison.OrdinalIgnoreCase);
            if (isTrue || value.Equals("yes", StringComparison.OrdinalIgnoreCase))
            {
                if (keyIsDroppableTrue)
                {
                    // Boolean true is elided for droppable keys.
                    continue;
                }

                if (isTrue)
                {
                    // Non-droppable key: "true" is spelled "yes".
                    value = "yes";
                }
            }

            // Per-key aliasing.
            if (isSubdivisionKey)
            {
                value = CanonicalizeSubdivision(value);
            }
            else if (isTimeZoneKey)
            {
                value = CanonicalizeTimeZoneType(value);
            }

            result.Append('-').Append(value);
            emittedAnyValue = true;
        }

        // A non-droppable key that ended up without any value gets an explicit "-yes".
        if (!emittedAnyValue && !keyIsDroppableTrue)
        {
            result.Append("-yes");
        }

        i = valueEnd;
    }

    // Append whatever follows the -u- block (further extensions, private use).
    result.Append(tag, extensionEnd, tag.Length - extensionEnd);
    return result.ToString();
}
| 215 | + |
/// <summary>
/// Validates <paramref name="tag"/> as a BCP 47 language tag via ICU and returns a canonical tag.
/// </summary>
/// <param name="realm">Realm passed to Throw.RangeError when the tag is rejected.</param>
/// <param name="tag">Language tag to validate and canonicalize.</param>
/// <returns>The canonical language tag.</returns>
/// <remarks>
/// Steps: uloc_forLanguageTag (validate and parse), uloc_canonicalize, uloc_toLanguageTag,
/// then <see cref="CanonicalizeUnicodeExtensionsAfterIcu"/> and FixKnownLanguageAliases.
/// Reports a RangeError through Throw.RangeError when the tag is null or empty, is not
/// consumed completely by ICU, or cannot be converted back to a language tag.
/// </remarks>
public static string CanonicalizeUnicodeLocaleIdOrThrow(Realm realm, string tag)
{
    // Reject null/empty up front so a null tag never reaches ICU or tag.Length below.
    if (string.IsNullOrEmpty(tag))
    {
        Throw.RangeError(realm, $"invalid language tag: {tag}");
    }

    // Buffer sizing for all three ICU calls: ICU reports a "string not terminated"
    // warning when the output exactly fills the buffer, and the checks below treat any
    // non-zero status as failure. Each retry therefore triggers on length >= capacity
    // and allocates one extra byte for the NUL terminator.

    // 1) Validate & parse BCP-47 -> ICU locale ID
    var status = ICU.UErrorCode.U_ZERO_ERROR;
    byte[] locBuf = new byte[128];
    int parsed;
    int need = ICU.uloc_forLanguageTag(tag, locBuf, locBuf.Length, out parsed, ref status);

    if (need >= locBuf.Length)
    {
        locBuf = new byte[need + 1];
        status = ICU.UErrorCode.U_ZERO_ERROR;
        need = ICU.uloc_forLanguageTag(tag, locBuf, locBuf.Length, out parsed, ref status);
    }

    // The tag is valid only when ICU consumed all of it and produced a locale ID.
    if (status != ICU.UErrorCode.U_ZERO_ERROR || parsed != tag.Length || need <= 0)
    {
        // RangeError per spec
        Throw.RangeError(realm, $"invalid language tag: {tag}");
    }

    string icuLocaleId = Encoding.UTF8.GetString(locBuf, 0, need);

    // 2) Canonicalize the ICU locale ID (this applies CLDR language/region/script aliases, e.g. cmn->zh)
    status = ICU.UErrorCode.U_ZERO_ERROR;
    byte[] canonLoc = new byte[System.Math.Max(need + 16, 256)];
    int canonLen = ICU.uloc_canonicalize(icuLocaleId, canonLoc, canonLoc.Length, ref status);

    if (canonLen >= canonLoc.Length)
    {
        canonLoc = new byte[canonLen + 1];
        status = ICU.UErrorCode.U_ZERO_ERROR;
        canonLen = ICU.uloc_canonicalize(icuLocaleId, canonLoc, canonLoc.Length, ref status);
    }

    // Best effort: keep the un-canonicalized locale ID when ICU reports a non-zero
    // status or returns nothing.
    string icuCanonical = (status == ICU.UErrorCode.U_ZERO_ERROR && canonLen > 0)
        ? Encoding.UTF8.GetString(canonLoc, 0, canonLen)
        : icuLocaleId;

    // 3) Convert canonical ICU locale ID -> canonical BCP-47 tag
    status = ICU.UErrorCode.U_ZERO_ERROR;
    byte[] outBuf = new byte[256];
    int len = ICU.uloc_toLanguageTag(icuCanonical, outBuf, outBuf.Length, strict: false, ref status);

    if (len >= outBuf.Length)
    {
        outBuf = new byte[len + 1];
        status = ICU.UErrorCode.U_ZERO_ERROR;
        len = ICU.uloc_toLanguageTag(icuCanonical, outBuf, outBuf.Length, strict: false, ref status);
    }

    if (status != ICU.UErrorCode.U_ZERO_ERROR || len <= 0)
    {
        Throw.RangeError(realm, $"failed to canonicalize language tag: {tag}");
    }

    var canonical = Encoding.UTF8.GetString(outBuf, 0, len);

    // WebKit-style cleanup for "-u-…-true"
    canonical = CanonicalizeUnicodeExtensionsAfterIcu(canonical);

    // Fallback for ICU builds that don't alias cmn->zh
    canonical = FixKnownLanguageAliases(canonical);

    return canonical;
}
| 284 | + |
// Swaps a deprecated primary language subtag for its modern replacement, for ICU
// builds that do not apply these aliases themselves:
//   cmn -> zh (Mandarin -> Chinese), ji -> yi, in -> id
// Only the first subtag is inspected (case-insensitively); the rest of the tag,
// including its leading '-', is carried over unchanged. Null, empty and non-matching
// tags are returned as-is.
private static string FixKnownLanguageAliases(string canonicalTag)
{
    if (string.IsNullOrEmpty(canonicalTag))
    {
        return canonicalTag;
    }

    // Primary language = everything before the first '-' (or the whole tag).
    int separator = canonicalTag.IndexOf('-');
    string primary = separator < 0 ? canonicalTag : canonicalTag.Substring(0, separator);

    string replacement;
    if (string.Equals(primary, "cmn", StringComparison.OrdinalIgnoreCase))
    {
        replacement = "zh";
    }
    else if (string.Equals(primary, "ji", StringComparison.OrdinalIgnoreCase))
    {
        replacement = "yi";
    }
    else if (string.Equals(primary, "in", StringComparison.OrdinalIgnoreCase))
    {
        replacement = "id";
    }
    else
    {
        // Not a known alias: leave the tag alone.
        return canonicalTag;
    }

    // Re-attach the remainder ("-...") when there is one.
    return separator < 0 ? replacement : replacement + canonicalTag.Substring(separator);
}
| 323 | + } |
| 324 | +} |
0 commit comments