Skip to content

Commit 56c4f10

Browse files
gh-88091: Fix unicodedata.decomposition() for Hangul Syllables (GH-144993)
1 parent 97181bb commit 56c4f10

File tree

3 files changed

+44
-15
lines changed

3 files changed

+44
-15
lines changed

Lib/test/test_unicodedata.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -378,6 +378,12 @@ def test_decomposition(self):
378378
# New in 17.0.0
379379
self.assertEqual(self.db.decomposition('\uA7F1'), '' if self.old else '<super> 0053')
380380

381+
# Hangul syllables: first and last code points, with and without a trailing consonant
382+
self.assertEqual(self.db.decomposition('\uAC00'), '1100 1161')
383+
self.assertEqual(self.db.decomposition('\uD4DB'), '1111 1171 11B6')
384+
self.assertEqual(self.db.decomposition('\uC2F8'), '110A 1161')
385+
self.assertEqual(self.db.decomposition('\uD7A3'), '1112 1175 11C2')
386+
381387
self.assertRaises(TypeError, self.db.decomposition)
382388
self.assertRaises(TypeError, self.db.decomposition, 'xx')
383389

@@ -687,9 +693,9 @@ class UnicodeFunctionsTest(unittest.TestCase, BaseUnicodeFunctionsTest):
687693

688694
# Update this if the database changes. Make sure to do a full rebuild
689695
# (e.g. 'make distclean && make') to get the correct checksum.
690-
expectedchecksum = ('668dbbea1136e69d4f00677a5988b23bc78aefc6'
696+
expectedchecksum = ('00b13fa975a60b1d3f490f1fc8c126ab24990c75'
691697
if quicktest else
692-
'b869af769bd8fe352c04622ab90533dc54df5cf3')
698+
'ebfc9dd281c2226998fd435744dd2e9321899beb')
693699

694700
@requires_resource('network')
695701
def test_all_names(self):
@@ -1068,9 +1074,9 @@ def test_block_invalid_input(self):
10681074
class Unicode_3_2_0_FunctionsTest(unittest.TestCase, BaseUnicodeFunctionsTest):
10691075
db = unicodedata.ucd_3_2_0
10701076
old = True
1071-
expectedchecksum = ('2164a66700e03cba9c9f5ed9e9a8d594d2da136a'
1077+
expectedchecksum = ('cb5bbbd1f55b67371e18222b90a8e21c87f16b72'
10721078
if quicktest else
1073-
'a8276cec9b6991779c5bdaa46c1ae7cc50bc2403')
1079+
'74936dffe949d99203a47e6a66565b2fc337bae7')
10741080

10751081

10761082
class UnicodeMiscTest(unittest.TestCase):
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Fix :func:`unicodedata.decomposition` for Hangul syllables.

Modules/unicodedata.c

Lines changed: 33 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -429,6 +429,17 @@ unicodedata_UCD_east_asian_width_impl(PyObject *self, int chr)
429429
return PyUnicode_FromString(_PyUnicode_EastAsianWidthNames[index]);
430430
}
431431

432+
// For Hangul decomposition
433+
#define SBase 0xAC00
434+
#define LBase 0x1100
435+
#define VBase 0x1161
436+
#define TBase 0x11A7
437+
#define LCount 19
438+
#define VCount 21
439+
#define TCount 28
440+
#define NCount (VCount*TCount)
441+
#define SCount (LCount*NCount)
442+
432443
/*[clinic input]
433444
@permit_long_summary
434445
unicodedata.UCD.decomposition
@@ -460,6 +471,25 @@ unicodedata_UCD_decomposition_impl(PyObject *self, int chr)
460471
return Py_GetConstant(Py_CONSTANT_EMPTY_STR); /* unassigned */
461472
}
462473

474+
// Hangul Decomposition.
475+
// See section 3.12.2, "Hangul Syllable Decomposition"
476+
// https://www.unicode.org/versions/latest/core-spec/chapter-3/#G56669
477+
if (SBase <= code && code < (SBase + SCount)) {
478+
int SIndex = code - SBase;
479+
int L = LBase + SIndex / NCount;
480+
int V = VBase + (SIndex % NCount) / TCount;
481+
int T = TBase + SIndex % TCount;
482+
if (T != TBase) {
483+
PyOS_snprintf(decomp, sizeof(decomp),
484+
"%04X %04X %04X", L, V, T);
485+
}
486+
else {
487+
PyOS_snprintf(decomp, sizeof(decomp),
488+
"%04X %04X", L, V);
489+
}
490+
return PyUnicode_FromString(decomp);
491+
}
492+
463493
if (code < 0 || code >= 0x110000)
464494
index = 0;
465495
else {
@@ -522,16 +552,6 @@ get_decomp_record(PyObject *self, Py_UCS4 code,
522552
(*index)++;
523553
}
524554

525-
#define SBase 0xAC00
526-
#define LBase 0x1100
527-
#define VBase 0x1161
528-
#define TBase 0x11A7
529-
#define LCount 19
530-
#define VCount 21
531-
#define TCount 28
532-
#define NCount (VCount*TCount)
533-
#define SCount (LCount*NCount)
534-
535555
static PyObject*
536556
nfd_nfkd(PyObject *self, PyObject *input, int k)
537557
{
@@ -585,7 +605,9 @@ nfd_nfkd(PyObject *self, PyObject *input, int k)
585605
}
586606
output = new_output;
587607
}
588-
/* Hangul Decomposition. */
608+
// Hangul Decomposition.
609+
// See section 3.12.2, "Hangul Syllable Decomposition"
610+
// https://www.unicode.org/versions/latest/core-spec/chapter-3/#G56669
589611
if (SBase <= code && code < (SBase+SCount)) {
590612
int SIndex = code - SBase;
591613
int L = LBase + SIndex / NCount;

0 commit comments

Comments
 (0)