Skip to content

Commit aae7778

Browse files
author
Zoltan Herczeg
committed
Fix character list generator
1 parent 91606be commit aae7778

File tree

4 files changed

+44
-9
lines changed

4 files changed

+44
-9
lines changed

src/pcre2_compile_class.c

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -721,9 +721,13 @@ while (TRUE)
721721
else
722722
cranges->char_lists_types |= tmp1 << tmp2;
723723

724-
if (range_start < XCL_CHAR_LIST_LOW_16_START) break;
724+
if (range_end < XCL_CHAR_LIST_LOW_16_START || tmp2 == 0)
725+
{
726+
PCRE2_ASSERT(range_start < XCL_CHAR_LIST_LOW_16_START);
727+
break;
728+
}
725729

726-
PCRE2_ASSERT(tmp2 >= XCL_TYPE_BIT_LEN);
730+
PCRE2_ASSERT((tmp2 % XCL_TYPE_BIT_LEN) == 0);
727731
char_list_end = char_list_start - 1;
728732
char_list_start = *char_list_next++;
729733
tmp1 = 0;

src/pcre2_internal.h

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1518,7 +1518,7 @@ High16 and High32: the highest bit is always one
15181518
The items are ordered in increasing order, so binary search can be
15191519
used to find the lower bound of an input character. The lower bound
15201520
is the highest item whose value is less than or equal to the input
1521-
character. If the lower bit of the item is cleard, or the character
1521+
character. If the lower bit of the item is cleared, or the character
15221522
stored in the item equals the input character, the input
15231523
character is in the character list. */
15241524

@@ -1539,14 +1539,19 @@ character is in the character list. */
15391539
#define XCL_CHAR_LIST_HIGH_32_END 0xffffffff
15401540
#define XCL_CHAR_LIST_HIGH_32_ADD 0x80000000
15411541

1542-
/* Mask for getting the descriptors of character list ranges.
1543-
Each descriptor has XCL_TYPE_BIT_LEN bits, and can be processed
1544-
by XCL_BEGIN_WITH_RANGE and XCL_ITEM_COUNT_MASK macros. */
1542+
/* Mask and length values for getting the descriptors of
1543+
all character list ranges. The bit length of each descriptor
1544+
is XCL_TYPE_BIT_LEN so the total size is 4*XCL_TYPE_BIT_LEN
1545+
(currently 12 bits). This data is stored for all four character
1546+
lists, even if no characters are present in a list. */
15451547
#define XCL_TYPE_MASK 0xfff
15461548
#define XCL_TYPE_BIT_LEN 3
1547-
/* If this bit is set, the first item of the character list is the
1548-
end of a range, which started before the starting character of the
1549-
character list. */
1549+
/* If this bit is set for a character class, the first item of the
1550+
character list is the end of a range, which started before the
1551+
starting character of the character list. If this bit is set, and
1552+
no characters are present in the list, the whole character class
1553+
is part of a range. E.g., [\x{500}-\x{12000}] covers the entire
1554+
0x8000-0xffff range. */
15501555
#define XCL_BEGIN_WITH_RANGE 0x4
15511556
/* Number of items in the character list: 0, 1, or 2. The value 3
15521557
represents that the item count is stored at the beginning of the

testdata/testinput5

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2892,6 +2892,12 @@
28922892
/([\x{6535}\x{6536}\x{6538}\x{6539}\x{653b}\x{653c}\x{653e}\x{653f}\x{6541}\x{6542}\x{8000}-\x{ffff}]#)+/B,utf
28932893
\x{6534}#\x{6537}#\x{653a}#\x{653d}#\x{6540}#\x{6543}#\x{7fff}#\x{6535}#\x{6536}#\x{6538}#\x{6539}#\x{653b}#\x{653c}#\x{653e}#\x{653f}#\x{6541}#\x{6542}#\x{8000}#\x{c246}#\x{ffff}
28942894

2895+
/[\x{ff}\x{100}\x{8000}\x{8002}\x{8004}\x{8006}\x{8008}\x{800a}\x{800c}\x{800e}]+/B,utf
2896+
\x{ff}\x{100}\x{8000}\x{800a}\x{800e}\x{101}
2897+
2898+
/[\x{ff}-\x{104}\x{8000}\x{8002}\x{8004}\x{8006}\x{8008}\x{800a}\x{800c}\x{800e}]+/B,utf
2899+
\x{ff}\x{100}\x{101}\x{104}\x{8000}\x{800a}\x{800e}\x{105}
2900+
28952901
/[[:xdigit:]\x{400}-\x{600}]+/utf,ucp
28962902
!a0\x{400}\x{600}9\x{3ff}
28972903

testdata/testoutput5

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6339,6 +6339,26 @@ Failed: error 115 at offset 53: reference to non-existent subpattern
63396339
0: \x{6535}#\x{6536}#\x{6538}#\x{6539}#\x{653b}#\x{653c}#\x{653e}#\x{653f}#\x{6541}#\x{6542}#\x{8000}#\x{c246}#
63406340
1: \x{c246}#
63416341

6342+
/[\x{ff}\x{100}\x{8000}\x{8002}\x{8004}\x{8006}\x{8008}\x{800a}\x{800c}\x{800e}]+/B,utf
6343+
------------------------------------------------------------------
6344+
Bra
6345+
[\xff\x{100}\x{8000}\x{8002}\x{8004}\x{8006}\x{8008}\x{800a}\x{800c}\x{800e}]++
6346+
Ket
6347+
End
6348+
------------------------------------------------------------------
6349+
\x{ff}\x{100}\x{8000}\x{800a}\x{800e}\x{101}
6350+
0: \x{ff}\x{100}\x{8000}\x{800a}\x{800e}
6351+
6352+
/[\x{ff}-\x{104}\x{8000}\x{8002}\x{8004}\x{8006}\x{8008}\x{800a}\x{800c}\x{800e}]+/B,utf
6353+
------------------------------------------------------------------
6354+
Bra
6355+
[\xff\x{100}-\x{104}\x{8000}\x{8002}\x{8004}\x{8006}\x{8008}\x{800a}\x{800c}\x{800e}]++
6356+
Ket
6357+
End
6358+
------------------------------------------------------------------
6359+
\x{ff}\x{100}\x{101}\x{104}\x{8000}\x{800a}\x{800e}\x{105}
6360+
0: \x{ff}\x{100}\x{101}\x{104}\x{8000}\x{800a}\x{800e}
6361+
63426362
/[[:xdigit:]\x{400}-\x{600}]+/utf,ucp
63436363
!a0\x{400}\x{600}9\x{3ff}
63446364
0: a0\x{400}\x{600}9

0 commit comments

Comments
 (0)