diff --git a/src/pcre2_compile_class.c b/src/pcre2_compile_class.c index 55b641c19..c6f30d6fc 100644 --- a/src/pcre2_compile_class.c +++ b/src/pcre2_compile_class.c @@ -721,9 +721,13 @@ while (TRUE) else cranges->char_lists_types |= tmp1 << tmp2; - if (range_start < XCL_CHAR_LIST_LOW_16_START) break; + if (range_end < XCL_CHAR_LIST_LOW_16_START || tmp2 == 0) + { + PCRE2_ASSERT(range_start < XCL_CHAR_LIST_LOW_16_START); + break; + } - PCRE2_ASSERT(tmp2 >= XCL_TYPE_BIT_LEN); + PCRE2_ASSERT((tmp2 % XCL_TYPE_BIT_LEN) == 0); char_list_end = char_list_start - 1; char_list_start = *char_list_next++; tmp1 = 0; diff --git a/src/pcre2_internal.h b/src/pcre2_internal.h index aa7940957..2675e58e6 100644 --- a/src/pcre2_internal.h +++ b/src/pcre2_internal.h @@ -1518,7 +1518,7 @@ High16 and High32: the highest bit is always one The items are ordered in increasing order, so binary search can be used to find the lower bound of an input character. The lower bound is the highest item, which value is less or equal than the input -character. If the lower bit of the item is cleard, or the character +character. If the lower bit of the item is cleared, or the character stored in the item equals to the input character, the input character is in the character list. */ @@ -1539,14 +1539,19 @@ character is in the character list. */ #define XCL_CHAR_LIST_HIGH_32_END 0xffffffff #define XCL_CHAR_LIST_HIGH_32_ADD 0x80000000 -/* Mask for getting the descriptors of character list ranges. -Each descriptor has XCL_TYPE_BIT_LEN bits, and can be processed -by XCL_BEGIN_WITH_RANGE and XCL_ITEM_COUNT_MASK macros. */ +/* Mask and length values for getting the descriptors of +all character list ranges. The bit length of each descriptor +is XCL_TYPE_BIT_LEN so the total size is 4*XCL_TYPE_BIT_LEN +(currently 12 bits). This data is stored for all four character +lists, even if no characters are present in a list. 
*/ #define XCL_TYPE_MASK 0xfff #define XCL_TYPE_BIT_LEN 3 -/* If this bit is set, the first item of the character list is the -end of a range, which started before the starting character of the -character list. */ +/* If this bit is set for a character class, the first item of the +character list is the end of a range, which started before the +starting character of the character list. If this bit is set, and +no characters are present in the list, the whole character class +is part of a range. E.g., [\x{500}-\x{12000}] covers the entire +0x8000-0xffff range. */ #define XCL_BEGIN_WITH_RANGE 0x4 /* Number of items in the character list: 0, 1, or 2. The value 3 represents that the item count is stored at the begining of the diff --git a/testdata/testinput5 b/testdata/testinput5 index 7f9f6a2d0..d47e0c1ef 100644 --- a/testdata/testinput5 +++ b/testdata/testinput5 @@ -2892,6 +2892,12 @@ /([\x{6535}\x{6536}\x{6538}\x{6539}\x{653b}\x{653c}\x{653e}\x{653f}\x{6541}\x{6542}\x{8000}-\x{ffff}]#)+/B,utf \x{6534}#\x{6537}#\x{653a}#\x{653d}#\x{6540}#\x{6543}#\x{7fff}#\x{6535}#\x{6536}#\x{6538}#\x{6539}#\x{653b}#\x{653c}#\x{653e}#\x{653f}#\x{6541}#\x{6542}#\x{8000}#\x{c246}#\x{ffff} +/[\x{ff}\x{100}\x{8000}\x{8002}\x{8004}\x{8006}\x{8008}\x{800a}\x{800c}\x{800e}]+/B,utf + \x{ff}\x{100}\x{8000}\x{800a}\x{800e}\x{101} + +/[\x{ff}-\x{104}\x{8000}\x{8002}\x{8004}\x{8006}\x{8008}\x{800a}\x{800c}\x{800e}]+/B,utf + \x{ff}\x{100}\x{101}\x{104}\x{8000}\x{800a}\x{800e}\x{105} + /[[:xdigit:]\x{400}-\x{600}]+/utf,ucp !a0\x{400}\x{600}9\x{3ff} diff --git a/testdata/testoutput5 b/testdata/testoutput5 index fd36974dd..82b3abe9f 100644 --- a/testdata/testoutput5 +++ b/testdata/testoutput5 @@ -6339,6 +6339,26 @@ Failed: error 115 at offset 53: reference to non-existent subpattern 0: \x{6535}#\x{6536}#\x{6538}#\x{6539}#\x{653b}#\x{653c}#\x{653e}#\x{653f}#\x{6541}#\x{6542}#\x{8000}#\x{c246}# 1: \x{c246}# +/[\x{ff}\x{100}\x{8000}\x{8002}\x{8004}\x{8006}\x{8008}\x{800a}\x{800c}\x{800e}]+/B,utf 
+------------------------------------------------------------------ + Bra + [\xff\x{100}\x{8000}\x{8002}\x{8004}\x{8006}\x{8008}\x{800a}\x{800c}\x{800e}]++ + Ket + End +------------------------------------------------------------------ + \x{ff}\x{100}\x{8000}\x{800a}\x{800e}\x{101} + 0: \x{ff}\x{100}\x{8000}\x{800a}\x{800e} + +/[\x{ff}-\x{104}\x{8000}\x{8002}\x{8004}\x{8006}\x{8008}\x{800a}\x{800c}\x{800e}]+/B,utf +------------------------------------------------------------------ + Bra + [\xff\x{100}-\x{104}\x{8000}\x{8002}\x{8004}\x{8006}\x{8008}\x{800a}\x{800c}\x{800e}]++ + Ket + End +------------------------------------------------------------------ + \x{ff}\x{100}\x{101}\x{104}\x{8000}\x{800a}\x{800e}\x{105} + 0: \x{ff}\x{100}\x{101}\x{104}\x{8000}\x{800a}\x{800e} + /[[:xdigit:]\x{400}-\x{600}]+/utf,ucp !a0\x{400}\x{600}9\x{3ff} 0: a0\x{400}\x{600}9