summaryrefslogtreecommitdiff
path: root/src/pcre2_compile.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/pcre2_compile.c')
-rw-r--r--src/pcre2_compile.c271
1 files changed, 172 insertions, 99 deletions
diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c
index d852837..bb9736c 100644
--- a/src/pcre2_compile.c
+++ b/src/pcre2_compile.c
@@ -81,7 +81,7 @@ by defining macros in order to minimize #if usage. */
/* Function definitions to allow mutual recursion */
-static int
+static unsigned int
add_list_to_class(uint8_t *, PCRE2_UCHAR **, uint32_t, compile_block *,
const uint32_t *, unsigned int);
@@ -149,9 +149,16 @@ have to check them every time. */
#define OFLOW_MAX (INT_MAX - 20)
-/* Macro for setting individual bits in class bitmaps. */
+/* Macro for setting individual bits in class bitmaps. It took some
+experimenting to figure out how to stop gcc 5.3.0 from warning with
+-Wconversion. This version gets a warning:
+
+ #define SETBIT(a,b) a[(b)/8] |= (uint8_t)(1 << ((b)&7))
+
+Let's hope the apparently less efficient version isn't actually so bad if the
+compiler is clever with identical subexpressions. */
-#define SETBIT(a,b) a[(b)/8] |= (1 << ((b)&7))
+#define SETBIT(a,b) a[(b)/8] = (uint8_t)(a[(b)/8] | (1 << ((b)&7)))
/* Private flags added to firstcu and reqcu. */
@@ -723,6 +730,39 @@ static const uint8_t opcode_possessify[] = {
/*************************************************
+* Copy compiled code *
+*************************************************/
+
+/* Compiled JIT code cannot be copied, so the new compiled block has no
+associated JIT data. */
+
+PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
+pcre2_code_copy(const pcre2_code *code)
+{
+PCRE2_SIZE* ref_count;
+pcre2_code *newcode;
+
+if (code == NULL) return NULL;
+newcode = code->memctl.malloc(code->blocksize, code->memctl.memory_data);
+if (newcode == NULL) return NULL;
+memcpy(newcode, code, code->blocksize);
+newcode->executable_jit = NULL;
+
+/* If the code is one that has been deserialized, increment the reference count
+in the decoded tables. */
+
+if ((code->flags & PCRE2_DEREF_TABLES) != 0)
+ {
+ ref_count = (PCRE2_SIZE *)(code->tables + tables_length);
+ (*ref_count)++;
+ }
+
+return newcode;
+}
+
+
+
+/*************************************************
* Free compiled code *
*************************************************/
@@ -804,7 +844,7 @@ static void
complete_callout(PCRE2_UCHAR *previous_callout, PCRE2_SPTR ptr,
compile_block *cb)
{
-size_t length = ptr - cb->start_pattern - GET(previous_callout, 1);
+size_t length = (size_t)(ptr - cb->start_pattern - GET(previous_callout, 1));
PUT(previous_callout, 1 + LINK_SIZE, length);
}
@@ -839,9 +879,10 @@ Arguments:
Returns: if non-negative, the fixed length,
or -1 if an OP_RECURSE item was encountered and atend is FALSE
or -2 if there is no fixed length,
- or -3 if \C was encountered (in UTF-8 mode only)
- or -4 length is too long
- or -5 if an unknown opcode was encountered (internal error)
+ or -3 if \C was encountered (in UTF mode only)
+ or -4 if length is too long
+ or -5 if regex is too complicated
+ or -6 if an unknown opcode was encountered (internal error)
*/
#define FFL_LATER (-1)
@@ -855,11 +896,11 @@ static int
find_fixedlength(PCRE2_UCHAR *code, BOOL utf, BOOL atend, compile_block *cb,
recurse_check *recurses, int *countptr)
{
-int length = -1;
+uint32_t length = 0xffffffffu; /* Unset */
uint32_t group = 0;
uint32_t groupinfo = 0;
recurse_check this_recurse;
-register int branchlength = 0;
+register uint32_t branchlength = 0;
register PCRE2_UCHAR *cc = code + 1 + LINK_SIZE;
/* If this is a capturing group, we may have the answer cached, but we can only
@@ -910,7 +951,7 @@ for (;;)
case OP_COND:
d = find_fixedlength(cc, utf, atend, cb, recurses, countptr);
if (d < 0) return d;
- branchlength += d;
+ branchlength += (uint32_t)d;
do cc += GET(cc, 1); while (*cc == OP_ALT);
cc += 1 + LINK_SIZE;
break;
@@ -926,16 +967,16 @@ for (;;)
case OP_END:
case OP_ACCEPT:
case OP_ASSERT_ACCEPT:
- if (length < 0) length = branchlength;
+ if (length == 0xffffffffu) length = branchlength;
else if (length != branchlength) goto ISNOTFIXED;
if (*cc != OP_ALT)
{
if (group > 0)
{
- groupinfo |= (GI_SET_FIXED_LENGTH | length);
+ groupinfo |= (uint32_t)(GI_SET_FIXED_LENGTH | length);
cb->groupinfo[group] = groupinfo;
}
- return length;
+ return (int)length;
}
cc += 1 + LINK_SIZE;
branchlength = 0;
@@ -960,7 +1001,7 @@ for (;;)
this_recurse.group = cs;
d = find_fixedlength(cs, utf, atend, cb, &this_recurse, countptr);
if (d < 0) return d;
- branchlength += d;
+ branchlength += (uint32_t)d;
cc += 1 + LINK_SIZE;
break;
@@ -1039,7 +1080,7 @@ for (;;)
case OP_EXACTI:
case OP_NOTEXACT:
case OP_NOTEXACTI:
- branchlength += (int)GET2(cc,1);
+ branchlength += GET2(cc,1);
cc += 2 + IMM2_SIZE;
#ifdef SUPPORT_UNICODE
if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
@@ -1076,8 +1117,8 @@ for (;;)
cc++;
break;
- /* The single-byte matcher isn't allowed. This only happens in UTF-8 mode;
- otherwise \C is coded as OP_ALLANY. */
+ /* The single-byte matcher isn't allowed. This only happens in UTF-8 or
+ UTF-16 mode; otherwise \C is coded as OP_ALLANY. */
case OP_ANYBYTE:
return FFL_BACKSLASHC;
@@ -1115,7 +1156,7 @@ for (;;)
case OP_CRMINRANGE:
case OP_CRPOSRANGE:
if (GET2(cc,1) != GET2(cc,1+IMM2_SIZE)) goto ISNOTFIXED;
- branchlength += (int)GET2(cc,1);
+ branchlength += GET2(cc,1);
cc += 1 + 2 * IMM2_SIZE;
break;
@@ -1941,7 +1982,7 @@ else
overflow = TRUE;
break;
}
- s = s * 10 + (int)(*(++ptr) - CHAR_0);
+ s = s * 10 + (unsigned int)(*(++ptr) - CHAR_0);
}
if (overflow) /* Integer overflow */
{
@@ -2005,7 +2046,7 @@ else
overflow = TRUE;
break;
}
- s = s * 10 + (int)(*(++ptr) - CHAR_0);
+ s = s * 10 + (unsigned int)(*(++ptr) - CHAR_0);
}
if (overflow) /* Integer overflow */
{
@@ -2285,7 +2326,7 @@ get_ucp(PCRE2_SPTR *ptrptr, BOOL *negptr, unsigned int *ptypeptr,
unsigned int *pdataptr, int *errorcodeptr, compile_block *cb)
{
register PCRE2_UCHAR c;
-int i, bot, top;
+size_t i, bot, top;
PCRE2_SPTR ptr = *ptrptr;
PCRE2_UCHAR name[32];
@@ -2753,13 +2794,13 @@ Returns: the number of < 256 characters added
the pointer to extra data is updated
*/
-static int
+static unsigned int
add_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr, uint32_t options,
compile_block *cb, uint32_t start, uint32_t end)
{
uint32_t c;
uint32_t classbits_end = (end <= 0xff ? end : 0xff);
-int n8 = 0;
+unsigned int n8 = 0;
/* If caseless matching is required, scan the range and process alternate
cases. In Unicode, there are 8-bit characters that have alternate cases that
@@ -2907,14 +2948,14 @@ Returns: the number of < 256 characters added
the pointer to extra data is updated
*/
-static int
+static unsigned int
add_list_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr, uint32_t options,
compile_block *cb, const uint32_t *p, unsigned int except)
{
-int n8 = 0;
+unsigned int n8 = 0;
while (p[0] < NOTACHAR)
{
- int n = 0;
+ unsigned int n = 0;
if (p[0] != except)
{
while(p[n+1] == p[0] + n + 1) n++;
@@ -2945,12 +2986,12 @@ Returns: the number of < 256 characters added
the pointer to extra data is updated
*/
-static int
+static unsigned int
add_not_list_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr,
uint32_t options, compile_block *cb, const uint32_t *p)
{
BOOL utf = (options & PCRE2_UTF) != 0;
-int n8 = 0;
+unsigned int n8 = 0;
if (p[0] > 0)
n8 += add_to_class(classbits, uchardptr, options, cb, 0, p[0] - 1);
while (p[0] < NOTACHAR)
@@ -3099,7 +3140,7 @@ for (; ptr < cb->end_pattern; ptr++)
/* Not UTF */
{
- if (code != NULL) *code++ = x;
+ if (code != NULL) *code++ = (PCRE2_UCHAR)x;
}
arglen++;
@@ -3173,20 +3214,20 @@ typedef struct nest_save {
#define NSF_EXTENDED 0x0002u
#define NSF_DUPNAMES 0x0004u
-static uint32_t scan_for_captures(PCRE2_SPTR *ptrptr, uint32_t options,
+static int scan_for_captures(PCRE2_SPTR *ptrptr, uint32_t options,
compile_block *cb)
{
uint32_t c;
uint32_t delimiter;
-uint32_t nest_depth = 0;
uint32_t set, unset, *optset;
+uint32_t skiptoket = 0;
+uint16_t nest_depth = 0;
int errorcode = 0;
int escape;
int namelen;
int i;
BOOL inescq = FALSE;
BOOL isdupname;
-BOOL skiptoket = FALSE;
BOOL utf = (options & PCRE2_UTF) != 0;
BOOL negate_class;
PCRE2_SPTR name;
@@ -3213,10 +3254,10 @@ for (; ptr < cb->end_pattern; ptr++)
next closing parenthesis must be ignored. The parenthesis itself must be
processed (to end the nested parenthesized item). */
- if (skiptoket)
+ if (skiptoket != 0)
{
if (c != CHAR_RIGHT_PARENTHESIS) continue;
- skiptoket = FALSE;
+ skiptoket = 0;
}
/* Skip over literals */
@@ -3231,17 +3272,16 @@ for (; ptr < cb->end_pattern; ptr++)
continue;
}
- /* Skip over comments and whitespace in extended mode. Need a loop to handle
- whitespace after a comment. */
+ /* Skip over # comments and whitespace in extended mode. */
if ((options & PCRE2_EXTENDED) != 0)
{
- for (;;)
+ PCRE2_SPTR wscptr = ptr;
+ while (MAX_255(c) && (cb->ctypes[c] & ctype_space) != 0) c = *(++ptr);
+ if (c == CHAR_NUMBER_SIGN)
{
- while (MAX_255(c) && (cb->ctypes[c] & ctype_space) != 0) c = *(++ptr);
- if (c != CHAR_NUMBER_SIGN) break;
ptr++;
- while (*ptr != CHAR_NULL)
+ while (ptr < cb->end_pattern)
{
if (IS_NEWLINE(ptr)) /* For non-fixed-length newline cases, */
{ /* IS_NEWLINE sets cb->nllen. */
@@ -3253,7 +3293,15 @@ for (; ptr < cb->end_pattern; ptr++)
if (utf) FORWARDCHAR(ptr);
#endif
}
- c = *ptr; /* Either NULL or the char after a newline */
+ }
+
+ /* If we skipped any characters, restart the loop. Otherwise, we didn't see
+ a comment. */
+
+ if (ptr > wscptr)
+ {
+ ptr--;
+ continue;
}
}
@@ -3377,27 +3425,24 @@ for (; ptr < cb->end_pattern; ptr++)
if ((options & PCRE2_NO_AUTO_CAPTURE) == 0) cb->bracount++;
}
- /* (*something) - just skip to closing ket unless PCRE2_ALT_VERBNAMES is
- set, in which case we have to process escapes in the string after the
- name. */
+ /* (*something) - skip over a name, and then just skip to closing ket
+ unless PCRE2_ALT_VERBNAMES is set, in which case we have to process
+ escapes in the string after a verb name terminated by a colon. */
else
{
ptr += 2;
while (MAX_255(*ptr) && (cb->ctypes[*ptr] & ctype_word) != 0) ptr++;
- if (*ptr == CHAR_COLON)
+ if (*ptr == CHAR_COLON && (options & PCRE2_ALT_VERBNAMES) != 0)
{
ptr++;
- if ((options & PCRE2_ALT_VERBNAMES) != 0)
- {
- if (process_verb_name(&ptr, NULL, &errorcode, options, utf, cb) < 0)
- goto FAILED;
- }
- else
- {
- while (ptr < cb->end_pattern && *ptr != CHAR_RIGHT_PARENTHESIS)
- ptr++;
- }
+ if (process_verb_name(&ptr, NULL, &errorcode, options, utf, cb) < 0)
+ goto FAILED;
+ }
+ else
+ {
+ while (ptr < cb->end_pattern && *ptr != CHAR_RIGHT_PARENTHESIS)
+ ptr++;
}
nest_depth--;
}
@@ -3414,7 +3459,7 @@ for (; ptr < cb->end_pattern; ptr++)
IS_DIGIT(ptr[0]) || /* (?n) */
(ptr[0] == CHAR_MINUS && IS_DIGIT(ptr[1]))) /* (?-n) */
{
- skiptoket = TRUE;
+ skiptoket = ptr[0];
break;
}
@@ -3434,8 +3479,8 @@ for (; ptr < cb->end_pattern; ptr++)
if (*ptr == CHAR_VERTICAL_LINE)
{
- top_nest->reset_group = cb->bracount;
- top_nest->max_group = cb->bracount;
+ top_nest->reset_group = (uint16_t)cb->bracount;
+ top_nest->max_group = (uint16_t)cb->bracount;
top_nest->flags |= NSF_RESET;
cb->external_flags |= PCRE2_DUPCAPUSED;
break;
@@ -3470,9 +3515,10 @@ for (; ptr < cb->end_pattern; ptr++)
case CHAR_U:
break;
- default: errorcode = ERR11;
- ptr--; /* Correct the offset */
- goto FAILED;
+ default:
+ errorcode = ERR11;
+ ptr--; /* Correct the offset */
+ goto FAILED;
}
}
@@ -3648,7 +3694,7 @@ for (; ptr < cb->end_pattern; ptr++)
}
if (namelen + IMM2_SIZE + 1 > cb->name_entry_size)
- cb->name_entry_size = namelen + IMM2_SIZE + 1;
+ cb->name_entry_size = (uint16_t)(namelen + IMM2_SIZE + 1);
/* We have a valid name for this capturing group. */
@@ -3666,7 +3712,7 @@ for (; ptr < cb->end_pattern; ptr++)
for (i = 0; i < cb->names_found; i++, ng++)
{
if (namelen == ng->length &&
- PRIV(strncmp)(name, ng->name, namelen) == 0)
+ PRIV(strncmp)(name, ng->name, (size_t)namelen) == 0)
{
if (ng->number == cb->bracount) break;
if ((options & PCRE2_DUPNAMES) == 0)
@@ -3690,7 +3736,7 @@ for (; ptr < cb->end_pattern; ptr++)
if (cb->names_found >= cb->named_group_list_size)
{
- int newsize = cb->named_group_list_size * 2;
+ uint32_t newsize = cb->named_group_list_size * 2;
named_group *newspace =
cb->cx->memctl.malloc(newsize * sizeof(named_group),
cb->cx->memctl.memory_data);
@@ -3712,9 +3758,9 @@ for (; ptr < cb->end_pattern; ptr++)
/* Add this name to the list */
cb->named_groups[cb->names_found].name = name;
- cb->named_groups[cb->names_found].length = namelen;
+ cb->named_groups[cb->names_found].length = (uint16_t)namelen;
cb->named_groups[cb->names_found].number = cb->bracount;
- cb->named_groups[cb->names_found].isdup = isdupname;
+ cb->named_groups[cb->names_found].isdup = (uint16_t)isdupname;
cb->names_found++;
break;
} /* End of (? switch */
@@ -3727,7 +3773,7 @@ for (; ptr < cb->end_pattern; ptr++)
(top_nest->flags & NSF_RESET) != 0)
{
if (cb->bracount > top_nest->max_group)
- top_nest->max_group = cb->bracount;
+ top_nest->max_group = (uint16_t)cb->bracount;
cb->bracount = top_nest->reset_group;
}
break;
@@ -3748,13 +3794,26 @@ for (; ptr < cb->end_pattern; ptr++)
if (top_nest == (nest_save *)(cb->start_workspace)) top_nest = NULL;
else top_nest--;
}
- if (nest_depth > 0) nest_depth--; /* Can be 0 for unmatched ) */
+ if (nest_depth == 0) /* Unmatched closing parenthesis */
+ {
+ errorcode = ERR22;
+ goto FAILED;
+ }
+ nest_depth--;
break;
}
}
-cb->final_bracount = cb->bracount;
-return 0;
+if (nest_depth == 0)
+ {
+ cb->final_bracount = cb->bracount;
+ return 0;
+ }
+
+/* We give a special error for a missing closing parentheses after (?# because
+it might otherwise be hard to see where the missing character is. */
+
+errorcode = (skiptoket == CHAR_NUMBER_SIGN)? ERR18 : ERR14;
FAILED:
*ptrptr = ptr;
@@ -3905,6 +3964,10 @@ for (;; ptr++)
int32_t subreqcuflags, subfirstcuflags; /* Must be signed */
PCRE2_UCHAR mcbuffer[8];
+ /* Come here to restart the loop. */
+
+ REDO_LOOP:
+
/* Get next character in the pattern */
c = *ptr;
@@ -3949,7 +4012,7 @@ for (;; ptr++)
*errorcodeptr = ERR20;
goto FAILED;
}
- *lengthptr += code - last_code;
+ *lengthptr += (size_t)(code - last_code);
/* If "previous" is set and it is not at the start of the work space, move
it back to there, in order to avoid filling up the work space. Otherwise,
@@ -3959,7 +4022,7 @@ for (;; ptr++)
{
if (previous > orig_code)
{
- memmove(orig_code, previous, CU2BYTES(code - previous));
+ memmove(orig_code, previous, (size_t)CU2BYTES(code - previous));
code -= previous - orig_code;
previous = orig_code;
}
@@ -4045,11 +4108,7 @@ for (;; ptr++)
/* If we skipped any characters, restart the loop. Otherwise, we didn't see
a comment. */
- if (ptr > wscptr)
- {
- ptr--;
- continue;
- }
+ if (ptr > wscptr) goto REDO_LOOP;
}
/* Skip over (?# comments. */
@@ -4120,7 +4179,7 @@ for (;; ptr++)
*errorcodeptr = ERR20;
goto FAILED;
}
- *lengthptr += code - last_code; /* To include callout length */
+ *lengthptr += (size_t)(code - last_code); /* To include callout length */
}
return TRUE;
@@ -4189,17 +4248,15 @@ for (;; ptr++)
if (PRIV(strncmp_c8)(ptr+1, STRING_WEIRD_STARTWORD, 6) == 0)
{
cb->nestptr[0] = ptr + 7;
- ptr = sub_start_of_word; /* Do not combine these statements; clang's */
- ptr--; /* sanitizer moans about a negative index. */
- continue;
+ ptr = sub_start_of_word;
+ goto REDO_LOOP;
}
if (PRIV(strncmp_c8)(ptr+1, STRING_WEIRD_ENDWORD, 6) == 0)
{
cb->nestptr[0] = ptr + 7;
- ptr = sub_end_of_word; /* Do not combine these statements; clang's */
- ptr--; /* sanitizer moans about a negative index. */
- continue;
+ ptr = sub_end_of_word;
+ goto REDO_LOOP;
}
/* Handle a real character class. */
@@ -4408,7 +4465,7 @@ for (;; ptr++)
case PC_PUNCT:
if (ptype == 0) ptype = PT_PXPUNCT;
*class_uchardata++ = local_negate? XCL_NOTPROP : XCL_PROP;
- *class_uchardata++ = ptype;
+ *class_uchardata++ = (PCRE2_UCHAR)ptype;
*class_uchardata++ = 0;
xclass_has_prop = TRUE;
ptr = tempptr + 1;
@@ -4456,9 +4513,9 @@ for (;; ptr++)
if (taboffset >= 0)
{
if (tabopt >= 0)
- for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
+ for (c = 0; c < 32; c++) pbits[c] |= cbits[(int)c + taboffset];
else
- for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
+ for (c = 0; c < 32; c++) pbits[c] &= ~cbits[(int)c + taboffset];
}
/* Now see if we need to remove any special characters. An option
@@ -5899,10 +5956,22 @@ for (;; ptr++)
goto FAILED;
}
cb->had_accept = TRUE;
+
+ /* In the first pass, just accumulate the length required;
+ otherwise hitting (*ACCEPT) inside many nested parentheses can
+ cause workspace overflow. */
+
for (oc = cb->open_caps; oc != NULL; oc = oc->next)
{
- *code++ = OP_CLOSE;
- PUT2INC(code, 0, oc->number);
+ if (lengthptr != NULL)
+ {
+ *lengthptr += CU2BYTES(1) + IMM2_SIZE;
+ }
+ else
+ {
+ *code++ = OP_CLOSE;
+ PUT2INC(code, 0, oc->number);
+ }
}
setverb = *code++ =
(cb->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;
@@ -7042,7 +7111,9 @@ for (;; ptr++)
}
}
- /* Error if hit end of pattern */
+ /* At the end of a group, it's an error if we hit end of pattern or
+ any non-closing parenthesis. This check also happens in the pre-scan,
+ so should not trigger here, but leave this code as an insurance. */
if (*ptr != CHAR_RIGHT_PARENTHESIS)
{
@@ -7349,12 +7420,17 @@ for (;; ptr++)
}
else
#endif
- /* In non-UTF mode, we turn \C into OP_ALLANY instead of OP_ANYBYTE
- so that it works in DFA mode and in lookbehinds. */
+ /* In non-UTF mode, and for both 32-bit modes, we turn \C into
+ OP_ALLANY instead of OP_ANYBYTE so that it works in DFA mode and in
+ lookbehinds. */
{
previous = (escape > ESC_b && escape < ESC_Z)? code : NULL;
+#if PCRE2_CODE_UNIT_WIDTH == 32
+ *code++ = (escape == ESC_C)? OP_ALLANY : escape;
+#else
*code++ = (!utf && escape == ESC_C)? OP_ALLANY : escape;
+#endif
}
}
continue;
@@ -8704,14 +8780,11 @@ if (cb.had_accept)
reqcuflags = REQ_NONE;
}
-/* If we have not reached end of pattern after a successful compile, there's an
-excess bracket. Fill in the final opcode and check for disastrous overflow.
-If no overflow, but the estimated length exceeds the really used length, adjust
-the value of re->blocksize, and if valgrind support is configured, mark the
-extra allocated memory as unaddressable, so that any out-of-bound reads can be
-detected. */
+/* Fill in the final opcode and check for disastrous overflow. If no overflow,
+but the estimated length exceeds the really used length, adjust the value of
+re->blocksize, and if valgrind support is configured, mark the extra allocated
+memory as unaddressable, so that any out-of-bound reads can be detected. */
-if (errorcode == 0 && ptr < cb.end_pattern) errorcode = ERR22;
*code++ = OP_END;
usedlength = code - codestart;
if (usedlength > length) errorcode = ERR23; else