Skip to content

Commit 484e564

Browse files
committed
pcre2test: refactor replication implementations
Make the pattern implementation aware of the underlying buffer size to avoid mostly harmless buffer overreads, and allow for a nicer fallback in case of syntax errors. Update the data implementation to behave similarly when the provided number is not valid, and clean up fixes that had accumulated piecemeal.
1 parent ead3652 commit 484e564

File tree

4 files changed

+160
-85
lines changed

4 files changed

+160
-85
lines changed

doc/pcre2test.1

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -563,7 +563,9 @@ part of the file. For example:
563563
\e[abc]{4}
564564
.sp
565565
is converted to "abcabcabcabc". This feature does not support nesting. To
566-
include a closing square bracket in the characters, code it as \ex5D.
566+
include a closing square bracket in the characters, code it with \ex followed
567+
by two hexadecimal digits that represent that character in the character set used
568+
(e.g. \ex5D for ASCII or UTF-8).
567569
.P
568570
A backslash followed by an equals sign marks the end of the subject string and
569571
the start of a modifier list. For example:

src/pcre2test.c

Lines changed: 71 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -5975,8 +5975,7 @@ static int
59755975
process_pattern(void)
59765976
{
59775977
BOOL utf;
5978-
uint32_t k;
5979-
uint8_t *p = buffer;
5978+
uint8_t *pend, *p = buffer;
59805979
unsigned int delimiter = *p++;
59815980
int errorcode;
59825981
void *use_pat_context;
@@ -6029,6 +6028,7 @@ if (p[1] == '\\') *p++ = '\\';
60296028

60306029
*p++ = 0;
60316030
patlen = p - buffer - 2;
6031+
pend = buffer + 1 + patlen;
60326032

60336033
/* Look for modifiers and options after the final delimiter. */
60346034

@@ -6068,7 +6068,7 @@ if (pat_patctl.convert_type != CONVERT_UNSET &&
60686068
/* Check for mutually exclusive control modifiers. At present, these are all in
60696069
the first control word. */
60706070

6071-
for (k = 0; k < sizeof(exclusive_pat_controls)/sizeof(uint32_t); k++)
6071+
for (uint32_t k = 0; k < sizeof(exclusive_pat_controls)/sizeof(uint32_t); k++)
60726072
{
60736073
uint32_t c = pat_patctl.control & exclusive_pat_controls[k];
60746074
if (c != 0 && c != (c & (~c+1)))
@@ -6166,54 +6166,63 @@ else if ((pat_patctl.control & CTL_EXPAND) != 0)
61666166
uint8_t *pc = pp;
61676167
uint32_t count = 1;
61686168
size_t length = 1;
6169+
size_t m = 1;
61696170

61706171
/* Check for replication syntax; if not found, the defaults just set will
6171-
prevail and one character will be copied. */
6172+
prevail and `length` characters will be copied once. */
61726173

6173-
if (pp[0] == '\\' && pp[1] == '[')
6174+
if (memcmp(pp, "\\[", 2) == 0)
61746175
{
61756176
uint8_t *pe;
6176-
for (pe = pp + 2; *pe != 0; pe++)
6177+
6178+
/* Start the capture after skipping the prefix. This pointer must be
6179+
moved back if a problem is found, so that the text instead goes through
6180+
the literal fallback below. */
6181+
pc += 2;
6182+
for (pe = pc; *pe != 0; pe++)
61776183
{
6178-
if (pe[0] == ']' && pe[1] == '{')
6184+
if (pend - pe > 3 && memcmp(pe, "]{", 2) == 0 && isdigit(pe[2]))
61796185
{
6180-
size_t clen = pe - pc - 2;
6181-
uint32_t i = 0;
61826186
unsigned long uli;
61836187
char *endptr;
61846188

6185-
pe += 2;
6186-
uli = strtoul((const char *)pe, &endptr, 10);
6187-
if (U32OVERFLOW(uli))
6189+
errno = 0;
6190+
uli = strtoul((const char *)pe + 2, &endptr, 10);
6191+
if (errno != 0 || uli == 0 || U32OVERFLOW(uli))
61886192
{
6189-
fprintf(outfile, "** Pattern repeat count too large\n");
6193+
fprintf(outfile, "** Invalid replication count (1..UINT_MAX)\n");
61906194
return PR_SKIP;
61916195
}
61926196

6193-
i = (uint32_t)uli;
6194-
pe = (uint8_t *)endptr;
6195-
if (*pe == '}')
6197+
if (*endptr == '}') count = (uint32_t)uli;
6198+
length = pe - pc;
6199+
if (length >= SIZE_MAX/count)
61966200
{
6197-
if (i == 0)
6198-
{
6199-
fprintf(outfile, "** Zero repeat not allowed\n");
6200-
return PR_SKIP;
6201-
}
6202-
pc += 2;
6203-
count = i;
6204-
length = clen;
6205-
pp = pe;
6206-
break;
6201+
fprintf(outfile, "** Expanded content too large\n");
6202+
return PR_SKIP;
62076203
}
6204+
pe = (uint8_t *)endptr;
6205+
break;
62086206
}
62096207
}
6208+
if (*pe != '}')
6209+
{
6210+
pc -= 2;
6211+
length = pe - pc;
6212+
}
6213+
m = length * count;
6214+
pp = pe;
6215+
6216+
/* The main loop increments pp, so if we are already at the end of
6217+
the pattern we need to backtrack to avoid jumping over the NUL. */
6218+
if (*pe == 0) pp--;
62106219
}
62116220

62126221
/* Add to output. If the buffer is too small expand it. The function for
62136222
expanding buffers always keeps buffer and pbuffer8 in step as far as their
62146223
size goes. */
62156224

6216-
while (pt + count * length > pbuffer8 + pbuffer8_size)
6225+
while (pt + m > pbuffer8 + pbuffer8_size)
62176226
{
62186227
size_t pc_offset = pc - buffer;
62196228
size_t pp_offset = pp - buffer;
@@ -7958,7 +7967,7 @@ process_data(void)
79587967
{
79597968
PCRE2_SIZE len, ulen, arg_ulen;
79607969
uint32_t gmatched;
7961-
uint32_t c, k;
7970+
uint32_t c;
79627971
uint32_t g_notempty = 0;
79637972
uint8_t *p, *pp, *start_rep;
79647973
size_t needlen;
@@ -8089,14 +8098,15 @@ while ((c = *p++) != 0)
80898098

80908099
if (*p++ != '{')
80918100
{
8092-
fprintf(outfile, "** Expected '{' after \\[....]\n");
8101+
fprintf(outfile, "** Expected '{' after \\[...]\n");
80938102
return PR_OK;
80948103
}
80958104

8105+
errno = 0;
80968106
li = strtol((const char *)p, &endptr, 10);
8097-
if (S32OVERFLOW(li))
8107+
if (!isdigit(*p) || errno != 0 || li < 1 || S32OVERFLOW(li))
80988108
{
8099-
fprintf(outfile, "** Repeat count too large\n");
8109+
fprintf(outfile, "** Replication count missing or invalid (1..INT_MAX)\n");
81008110
return PR_OK;
81018111
}
81028112
i = (int)li;
@@ -8108,44 +8118,41 @@ while ((c = *p++) != 0)
81088118
return PR_OK;
81098119
}
81108120

8111-
if (i-- <= 0)
8121+
if (i-- > 1)
81128122
{
8113-
fprintf(outfile, "** Zero or negative repeat not allowed\n");
8114-
return PR_OK;
8115-
}
8116-
8117-
replen = CAST8VAR(q) - start_rep;
8118-
if (i > 0 && replen > (SIZE_MAX - needlen) / i)
8119-
{
8120-
fprintf(outfile, "** Expanded content too large\n");
8121-
return PR_OK;
8122-
}
8123-
needlen += replen * i;
8124-
8125-
if (needlen >= dbuffer_size)
8126-
{
8127-
size_t qoffset = CAST8VAR(q) - dbuffer;
8128-
size_t rep_offset = start_rep - dbuffer;
8129-
while (needlen >= dbuffer_size)
8123+
replen = CAST8VAR(q) - start_rep;
8124+
if (replen >= (SIZE_MAX - needlen) / i)
81308125
{
8131-
if (dbuffer_size < SIZE_MAX/2) dbuffer_size *= 2;
8132-
else dbuffer_size = needlen + 1;
8126+
fprintf(outfile, "** Expanded content too large\n");
8127+
return PR_OK;
81338128
}
8134-
dbuffer = (uint8_t *)realloc(dbuffer, dbuffer_size);
8135-
if (dbuffer == NULL)
8129+
needlen += replen * i;
8130+
8131+
if (needlen >= dbuffer_size)
81368132
{
8137-
fprintf(stderr, "pcre2test: realloc(%" SIZ_FORM ") failed\n",
8138-
dbuffer_size);
8139-
exit(1);
8133+
size_t qoffset = CAST8VAR(q) - dbuffer;
8134+
size_t rep_offset = start_rep - dbuffer;
8135+
while (needlen >= dbuffer_size)
8136+
{
8137+
if (dbuffer_size < SIZE_MAX / 2) dbuffer_size *= 2;
8138+
else dbuffer_size = needlen + 1;
8139+
}
8140+
dbuffer = (uint8_t *)realloc(dbuffer, dbuffer_size);
8141+
if (dbuffer == NULL)
8142+
{
8143+
fprintf(stderr, "pcre2test: realloc(%" SIZ_FORM ") failed\n",
8144+
dbuffer_size);
8145+
exit(1);
8146+
}
8147+
SETCASTPTR(q, dbuffer + qoffset);
8148+
start_rep = dbuffer + rep_offset;
81408149
}
8141-
SETCASTPTR(q, dbuffer + qoffset);
8142-
start_rep = dbuffer + rep_offset;
8143-
}
81448150

8145-
while (i-- > 0)
8146-
{
8147-
memcpy(CAST8VAR(q), start_rep, replen);
8148-
SETPLUS(q, replen/code_unit_size);
8151+
while (i-- > 0)
8152+
{
8153+
memcpy(CAST8VAR(q), start_rep, replen);
8154+
SETPLUS(q, replen/code_unit_size);
8155+
}
81498156
}
81508157

81518158
start_rep = NULL;
@@ -8426,7 +8433,7 @@ if (dat_datctl.substitute_skip != 0 || dat_datctl.substitute_stop != 0)
84268433
/* Check for mutually exclusive modifiers. At present, these are all in the
84278434
first control word. */
84288435

8429-
for (k = 0; k < sizeof(exclusive_dat_controls)/sizeof(uint32_t); k++)
8436+
for (uint32_t k = 0; k < sizeof(exclusive_dat_controls)/sizeof(uint32_t); k++)
84308437
{
84318438
c = dat_datctl.control & exclusive_dat_controls[k];
84328439
if (c != 0 && c != (c & (~c+1)))

testdata/testinput2

Lines changed: 19 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -4589,9 +4589,6 @@
45894589
/(abc)*/
45904590
\[abc]{1}
45914591

4592-
/(abc)*/
4593-
\[abc]{0}
4594-
45954592
/^/gm
45964593
\n\n\n
45974594

@@ -5136,12 +5133,29 @@ a)"xI
51365133
\= Expect no match
51375134
abc
51385135

5136+
# Expand tests
5137+
5138+
/(abc)*/
5139+
\[abc]{0}
5140+
\[abc]{}
5141+
51395142
/aaa/
5140-
\[abc]{10000000000000000000000000000}
5141-
\[a]{3}
5143+
\[X]{-10}
5144+
\[abc]{10000000000000000000000000000}
5145+
\[a]{3}
51425146

51435147
/\[AB]{6000000000000000000000}/expand
51445148

5149+
/\[AB]{0000000000000000000000}/expand
5150+
5151+
/a\[b]{-1}/BI,expand
5152+
5153+
/a\[b]{/BI,expand
5154+
5155+
/a\[/BI,expand
5156+
5157+
//BI,expand
5158+
51455159
# Hex uses pattern length, not zero-terminated. This tests for overrunning
51465160
# the given length of a pattern.
51475161

@@ -6324,9 +6338,6 @@ a)"xI
63246338
/[Aa]{2,3}/BI
63256339
aabcd
63266340

6327-
--
6328-
\[X]{-10}
6329-
63306341
# Check imposition of maximum by match_data_create().
63316342

63326343
/abcd/

testdata/testoutput2

Lines changed: 67 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -15177,10 +15177,6 @@ No match
1517715177
0: abc
1517815178
1: abc
1517915179

15180-
/(abc)*/
15181-
\[abc]{0}
15182-
** Zero or negative repeat not allowed
15183-
1518415180
/^/gm
1518515181
\n\n\n
1518615182
0:
@@ -16358,14 +16354,77 @@ Failed: error 122 at offset 11: unmatched closing parenthesis
1635816354
0 ^ 0
1635916355
No match
1636016356

16357+
# Expand tests
16358+
16359+
/(abc)*/
16360+
\[abc]{0}
16361+
** Replication count missing or invalid (1..INT_MAX)
16362+
\[abc]{}
16363+
** Replication count missing or invalid (1..INT_MAX)
16364+
1636116365
/aaa/
16362-
\[abc]{10000000000000000000000000000}
16363-
** Repeat count too large
16364-
\[a]{3}
16366+
\[X]{-10}
16367+
** Replication count missing or invalid (1..INT_MAX)
16368+
\[abc]{10000000000000000000000000000}
16369+
** Replication count missing or invalid (1..INT_MAX)
16370+
\[a]{3}
1636516371
0: aaa
1636616372

1636716373
/\[AB]{6000000000000000000000}/expand
16368-
** Pattern repeat count too large
16374+
** Invalid replication count (1..UINT_MAX)
16375+
16376+
/\[AB]{0000000000000000000000}/expand
16377+
** Invalid replication count (1..UINT_MAX)
16378+
16379+
/a\[b]{-1}/BI,expand
16380+
Expanded: a\[b]{-1}
16381+
------------------------------------------------------------------
16382+
Bra
16383+
a[b]{-1}
16384+
Ket
16385+
End
16386+
------------------------------------------------------------------
16387+
Capture group count = 0
16388+
First code unit = 'a'
16389+
Last code unit = '}'
16390+
Subject length lower bound = 8
16391+
16392+
/a\[b]{/BI,expand
16393+
Expanded: a\[b]{
16394+
------------------------------------------------------------------
16395+
Bra
16396+
a[b]{
16397+
Ket
16398+
End
16399+
------------------------------------------------------------------
16400+
Capture group count = 0
16401+
First code unit = 'a'
16402+
Last code unit = '{'
16403+
Subject length lower bound = 5
16404+
16405+
/a\[/BI,expand
16406+
Expanded: a\[
16407+
------------------------------------------------------------------
16408+
Bra
16409+
a[
16410+
Ket
16411+
End
16412+
------------------------------------------------------------------
16413+
Capture group count = 0
16414+
First code unit = 'a'
16415+
Last code unit = '['
16416+
Subject length lower bound = 2
16417+
16418+
//BI,expand
16419+
Expanded:
16420+
------------------------------------------------------------------
16421+
Bra
16422+
Ket
16423+
End
16424+
------------------------------------------------------------------
16425+
Capture group count = 0
16426+
May match empty string
16427+
Subject length lower bound = 0
1636916428

1637016429
# Hex uses pattern length, not zero-terminated. This tests for overrunning
1637116430
# the given length of a pattern.
@@ -18979,10 +19038,6 @@ Subject length lower bound = 2
1897919038
aabcd
1898019039
0: aa
1898119040

18982-
--
18983-
\[X]{-10}
18984-
** Zero or negative repeat not allowed
18985-
1898619041
# Check imposition of maximum by match_data_create().
1898719042

1898819043
/abcd/

0 commit comments

Comments
 (0)