diff options
Diffstat (limited to 'sed/regexp.c')
-rw-r--r-- | sed/regexp.c | 237 |
1 files changed, 199 insertions, 38 deletions
diff --git a/sed/regexp.c b/sed/regexp.c index bbeccb5..ff898a8 100644 --- a/sed/regexp.c +++ b/sed/regexp.c @@ -1,6 +1,5 @@ /* GNU SED, a batch stream editor. - Copyright (C) 1999, 2002, 2003, 2004, 2005, 2006 - Free Software Foundation, Inc. + Copyright (C) 1999-2016 Free Software Foundation, Inc. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -37,21 +36,33 @@ static const char errors[] = #define NO_REGEX (errors) #define BAD_MODIF (NO_REGEX + sizeof(N_("no previous regular expression"))) -#define END_ERRORS (BAD_MODIF + sizeof(N_("cannot specify modifiers on empty regexp"))) + + + +void +dfaerror (char const *mesg) +{ + panic ("%s", mesg); +} + +void +dfawarn (char const *mesg) +{ + if (!getenv ("POSIXLY_CORRECT")) + dfaerror (mesg); +} static void -compile_regex_1 (new_regex, needed_sub) - struct regex *new_regex; - int needed_sub; +compile_regex_1 (struct regex *new_regex, int needed_sub) { #ifdef REG_PERL int errcode; errcode = regncomp(&new_regex->pattern, new_regex->re, new_regex->sz, - (needed_sub ? 0 : REG_NOSUB) - | new_regex->flags - | extended_regexp_flags); + (needed_sub ? 0 : REG_NOSUB) + | new_regex->flags + | extended_regexp_flags); if (errcode) { @@ -62,7 +73,7 @@ compile_regex_1 (new_regex, needed_sub) #else const char *error; int syntax = ((extended_regexp_flags & REG_EXTENDED) - ? RE_SYNTAX_POSIX_EXTENDED + ? RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC); syntax &= ~RE_DOT_NOT_NULL; @@ -81,14 +92,11 @@ compile_regex_1 (new_regex, needed_sub) break; } -#ifdef RE_ICASE - syntax |= (new_regex->flags & REG_ICASE) ? RE_ICASE : 0; -#endif -#ifdef RE_NO_SUB + if (new_regex->flags & REG_ICASE) + syntax |= RE_ICASE; + else + new_regex->pattern.fastmap = malloc (1 << (sizeof (char) * 8)); syntax |= needed_sub ? 
0 : RE_NO_SUB; -#endif - - new_regex->pattern.fastmap = malloc (1 << (sizeof (char) * 8)); /* If REG_NEWLINE is set, newlines are treated differently. */ if (new_regex->flags & REG_NEWLINE) @@ -100,8 +108,9 @@ compile_regex_1 (new_regex, needed_sub) re_set_syntax (syntax); error = re_compile_pattern (new_regex->re, new_regex->sz, - &new_regex->pattern); - new_regex->pattern.newline_anchor = (new_regex->flags & REG_NEWLINE) != 0; + &new_regex->pattern); + new_regex->pattern.newline_anchor = + buffer_delimiter == '\n' && (new_regex->flags & REG_NEWLINE) != 0; new_regex->pattern.translate = NULL; #ifndef RE_ICASE @@ -110,7 +119,7 @@ compile_regex_1 (new_regex, needed_sub) static char translate[1 << (sizeof(char) * 8)]; int i; for (i = 0; i < sizeof(translate) / sizeof(char); i++) - translate[i] = tolower (i); + translate[i] = tolower (i); new_regex->pattern.translate = translate; } @@ -127,16 +136,30 @@ compile_regex_1 (new_regex, needed_sub) { char buf[200]; sprintf(buf, _("invalid reference \\%d on `s' command's RHS"), - needed_sub - 1); + needed_sub - 1); bad_prog(buf); } + + int dfaopts = buffer_delimiter == '\n' ? 0 : DFA_EOL_NUL; + new_regex->dfa = dfaalloc (); + dfasyntax (new_regex->dfa, &localeinfo, syntax, dfaopts); + dfacomp (new_regex->re, new_regex->sz, new_regex->dfa, 1); + + /* The patterns which consist of only ^ or $ often appear in + substitution, but regex and dfa are not good at them, as regex does + not build fastmap, and as all in buffer must be scanned for $. So + we mark them to handle manually. 
*/ + if (new_regex->sz == 1) + { + if (new_regex->re[0] == '^') + new_regex->begline = true; + if (new_regex->re[0] == '$') + new_regex->endline = true; + } } struct regex * -compile_regex(b, flags, needed_sub) - struct buffer *b; - int flags; - int needed_sub; +compile_regex(struct buffer *b, int flags, int needed_sub) { struct regex *new_regex; size_t re_len; @@ -145,7 +168,7 @@ compile_regex(b, flags, needed_sub) if (size_buffer(b) == 0) { if (flags > 0) - bad_prog(_(BAD_MODIF)); + bad_prog(_(BAD_MODIF)); return NULL; } @@ -204,20 +227,16 @@ copy_regs (regs, pmatch, nregs) #endif int -match_regex(regex, buf, buflen, buf_start_offset, regarray, regsize) - struct regex *regex; - char *buf; - size_t buflen; - size_t buf_start_offset; - struct re_registers *regarray; - int regsize; +match_regex(struct regex *regex, char *buf, size_t buflen, + size_t buf_start_offset, struct re_registers *regarray, + int regsize) { int ret; static struct regex *regex_last; #ifdef REG_PERL regmatch_t rm[10], *regmatch = rm; if (regsize > 10) - regmatch = (regmatch_t *) alloca (sizeof (regmatch_t) * regsize); + regmatch = alloca (sizeof (regmatch_t) * regsize); #endif /* printf ("Matching from %d/%d\n", buf_start_offset, buflen); */ @@ -227,7 +246,7 @@ match_regex(regex, buf, buflen, buf_start_offset, regarray, regsize) { regex = regex_last; if (!regex_last) - bad_prog(_(NO_REGEX)); + bad_prog(_(NO_REGEX)); } else regex_last = regex; @@ -247,9 +266,151 @@ match_regex(regex, buf, buflen, buf_start_offset, regarray, regsize) regex->pattern.regs_allocated = REGS_REALLOCATE; - ret = re_search (&regex->pattern, buf, buflen, buf_start_offset, - buflen - buf_start_offset, - regsize ? regarray : NULL); + /* Optimized handling for '^' and '$' patterns */ + if (regex->begline || regex->endline) + { + size_t offset; + + if (regex->endline) + { + const char *p = NULL; + + if (regex->flags & REG_NEWLINE) + p = memchr (buf + buf_start_offset, buffer_delimiter, buflen); + + offset = p ? 
p - buf : buflen; + } + else if (buf_start_offset == 0) + /* begline anchor, starting at beginning of the buffer. */ + offset = 0; + else if (!(regex->flags & REG_NEWLINE)) + /* begline anchor, starting in the middle of the text buffer, + and multiline regex is not specified - will never match. + Example: seq 2 | sed 'N;s/^/X/g' */ + return 0; + else if (buf[buf_start_offset - 1] == buffer_delimiter) + /* begline anchor, starting in the middle of the text buffer, + with multiline match, and the current character + is the line delimiter - start here. + Example: seq 2 | sed 'N;s/^/X/mg' */ + offset = buf_start_offset; + else + { + /* begline anchor, starting in the middle of the search buffer, + all previous optimizions didn't work: search + for the next line delimiter character in the buffer, + and start from there if found. */ + const char *p = memchr (buf + buf_start_offset, buffer_delimiter, + buflen - buf_start_offset); + + if (p == NULL) + return 0; + + offset = p - buf + 1; + } + + if (regsize) + { + size_t i; + + if (!regarray->start) + { + regarray->start = MALLOC (1, regoff_t); + regarray->end = MALLOC (1, regoff_t); + regarray->num_regs = 1; + } + + regarray->start[0] = offset; + regarray->end[0] = offset; + + for (i = 1 ; i < regarray->num_regs; ++i) + regarray->start[i] = regarray->end[i] = -1; + } + + return 1; + } + + if (buf_start_offset == 0) + { + struct dfa *superset = dfasuperset (regex->dfa); + + if (superset && !dfaexec (superset, buf, buf + buflen, true, NULL, NULL)) + return 0; + + if ((!regsize && (regex->flags & REG_NEWLINE)) + || (!superset && dfaisfast (regex->dfa))) + { + bool backref = false; + + if (!dfaexec (regex->dfa, buf, buf + buflen, true, NULL, &backref)) + return 0; + + if (!regsize && (regex->flags & REG_NEWLINE) && !backref) + return 1; + } + } + + /* If the buffer delimiter is not newline character, we cannot use + newline_anchor flag of regex. So do it line-by-line, and add offset + value to results. 
*/ + if ((regex->flags & REG_NEWLINE) && buffer_delimiter != '\n') + { + const char *beg, *end; + const char *start; + + beg = buf; + + if (buf_start_offset > 0) + { + const char *eol = memrchr (buf, buffer_delimiter, buf_start_offset); + + if (eol != NULL) + beg = eol + 1; + } + + start = buf + buf_start_offset; + + for (;;) + { + end = memchr (beg, buffer_delimiter, buf + buflen - beg); + + if (end == NULL) + end = buf + buflen; + + ret = re_search (&regex->pattern, beg, end - beg, + start - beg, end - start, + regsize ? regarray : NULL); + + if (ret > -1) + { + size_t i; + + ret += beg - buf; + + if (regsize) + { + for (i = 0; i < regarray->num_regs; ++i) + { + if (regarray->start[i] > -1) + regarray->start[i] += beg - buf; + if (regarray->end[i] > -1) + regarray->end[i] += beg - buf; + } + } + + break; + } + + if (end == buf + buflen) + break; + + beg = start = end + 1; + } + } + else + ret = re_search (&regex->pattern, buf, buflen, buf_start_offset, + buflen - buf_start_offset, + regsize ? regarray : NULL); return (ret > -1); #endif |