summary refs log tree commit diff
path: root/sed/regexp.c
diff options
context:
space:
mode:
Diffstat (limited to 'sed/regexp.c')
-rw-r--r-- sed/regexp.c 237
1 files changed, 199 insertions, 38 deletions
diff --git a/sed/regexp.c b/sed/regexp.c
index bbeccb5..ff898a8 100644
--- a/sed/regexp.c
+++ b/sed/regexp.c
@@ -1,6 +1,5 @@
/* GNU SED, a batch stream editor.
- Copyright (C) 1999, 2002, 2003, 2004, 2005, 2006
- Free Software Foundation, Inc.
+ Copyright (C) 1999-2016 Free Software Foundation, Inc.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -37,21 +36,33 @@ static const char errors[] =
#define NO_REGEX (errors)
#define BAD_MODIF (NO_REGEX + sizeof(N_("no previous regular expression")))
-#define END_ERRORS (BAD_MODIF + sizeof(N_("cannot specify modifiers on empty regexp")))
+
+
+
+void
+dfaerror (char const *mesg)
+{
+ panic ("%s", mesg);
+}
+
+void
+dfawarn (char const *mesg)
+{
+ if (!getenv ("POSIXLY_CORRECT"))
+ dfaerror (mesg);
+}
static void
-compile_regex_1 (new_regex, needed_sub)
- struct regex *new_regex;
- int needed_sub;
+compile_regex_1 (struct regex *new_regex, int needed_sub)
{
#ifdef REG_PERL
int errcode;
errcode = regncomp(&new_regex->pattern, new_regex->re, new_regex->sz,
- (needed_sub ? 0 : REG_NOSUB)
- | new_regex->flags
- | extended_regexp_flags);
+ (needed_sub ? 0 : REG_NOSUB)
+ | new_regex->flags
+ | extended_regexp_flags);
if (errcode)
{
@@ -62,7 +73,7 @@ compile_regex_1 (new_regex, needed_sub)
#else
const char *error;
int syntax = ((extended_regexp_flags & REG_EXTENDED)
- ? RE_SYNTAX_POSIX_EXTENDED
+ ? RE_SYNTAX_POSIX_EXTENDED
: RE_SYNTAX_POSIX_BASIC);
syntax &= ~RE_DOT_NOT_NULL;
@@ -81,14 +92,11 @@ compile_regex_1 (new_regex, needed_sub)
break;
}
-#ifdef RE_ICASE
- syntax |= (new_regex->flags & REG_ICASE) ? RE_ICASE : 0;
-#endif
-#ifdef RE_NO_SUB
+ if (new_regex->flags & REG_ICASE)
+ syntax |= RE_ICASE;
+ else
+ new_regex->pattern.fastmap = malloc (1 << (sizeof (char) * 8));
syntax |= needed_sub ? 0 : RE_NO_SUB;
-#endif
-
- new_regex->pattern.fastmap = malloc (1 << (sizeof (char) * 8));
/* If REG_NEWLINE is set, newlines are treated differently. */
if (new_regex->flags & REG_NEWLINE)
@@ -100,8 +108,9 @@ compile_regex_1 (new_regex, needed_sub)
re_set_syntax (syntax);
error = re_compile_pattern (new_regex->re, new_regex->sz,
- &new_regex->pattern);
- new_regex->pattern.newline_anchor = (new_regex->flags & REG_NEWLINE) != 0;
+ &new_regex->pattern);
+ new_regex->pattern.newline_anchor =
+ buffer_delimiter == '\n' && (new_regex->flags & REG_NEWLINE) != 0;
new_regex->pattern.translate = NULL;
#ifndef RE_ICASE
@@ -110,7 +119,7 @@ compile_regex_1 (new_regex, needed_sub)
static char translate[1 << (sizeof(char) * 8)];
int i;
for (i = 0; i < sizeof(translate) / sizeof(char); i++)
- translate[i] = tolower (i);
+ translate[i] = tolower (i);
new_regex->pattern.translate = translate;
}
@@ -127,16 +136,30 @@ compile_regex_1 (new_regex, needed_sub)
{
char buf[200];
sprintf(buf, _("invalid reference \\%d on `s' command's RHS"),
- needed_sub - 1);
+ needed_sub - 1);
bad_prog(buf);
}
+
+ int dfaopts = buffer_delimiter == '\n' ? 0 : DFA_EOL_NUL;
+ new_regex->dfa = dfaalloc ();
+ dfasyntax (new_regex->dfa, &localeinfo, syntax, dfaopts);
+ dfacomp (new_regex->re, new_regex->sz, new_regex->dfa, 1);
+
+ /* Patterns that consist of only ^ or $ often appear in
+ substitutions, but regex and dfa handle them poorly: regex does
+ not build a fastmap, and the whole buffer must be scanned for $.
+ So we mark them to be handled manually. */
+ if (new_regex->sz == 1)
+ {
+ if (new_regex->re[0] == '^')
+ new_regex->begline = true;
+ if (new_regex->re[0] == '$')
+ new_regex->endline = true;
+ }
}
struct regex *
-compile_regex(b, flags, needed_sub)
- struct buffer *b;
- int flags;
- int needed_sub;
+compile_regex(struct buffer *b, int flags, int needed_sub)
{
struct regex *new_regex;
size_t re_len;
@@ -145,7 +168,7 @@ compile_regex(b, flags, needed_sub)
if (size_buffer(b) == 0)
{
if (flags > 0)
- bad_prog(_(BAD_MODIF));
+ bad_prog(_(BAD_MODIF));
return NULL;
}
@@ -204,20 +227,16 @@ copy_regs (regs, pmatch, nregs)
#endif
int
-match_regex(regex, buf, buflen, buf_start_offset, regarray, regsize)
- struct regex *regex;
- char *buf;
- size_t buflen;
- size_t buf_start_offset;
- struct re_registers *regarray;
- int regsize;
+match_regex(struct regex *regex, char *buf, size_t buflen,
+ size_t buf_start_offset, struct re_registers *regarray,
+ int regsize)
{
int ret;
static struct regex *regex_last;
#ifdef REG_PERL
regmatch_t rm[10], *regmatch = rm;
if (regsize > 10)
- regmatch = (regmatch_t *) alloca (sizeof (regmatch_t) * regsize);
+ regmatch = alloca (sizeof (regmatch_t) * regsize);
#endif
/* printf ("Matching from %d/%d\n", buf_start_offset, buflen); */
@@ -227,7 +246,7 @@ match_regex(regex, buf, buflen, buf_start_offset, regarray, regsize)
{
regex = regex_last;
if (!regex_last)
- bad_prog(_(NO_REGEX));
+ bad_prog(_(NO_REGEX));
}
else
regex_last = regex;
@@ -247,9 +266,151 @@ match_regex(regex, buf, buflen, buf_start_offset, regarray, regsize)
regex->pattern.regs_allocated = REGS_REALLOCATE;
- ret = re_search (&regex->pattern, buf, buflen, buf_start_offset,
- buflen - buf_start_offset,
- regsize ? regarray : NULL);
+ /* Optimized handling for '^' and '$' patterns */
+ if (regex->begline || regex->endline)
+ {
+ size_t offset;
+
+ if (regex->endline)
+ {
+ const char *p = NULL;
+
+ if (regex->flags & REG_NEWLINE)
+ p = memchr (buf + buf_start_offset, buffer_delimiter, buflen);
+
+ offset = p ? p - buf : buflen;
+ }
+ else if (buf_start_offset == 0)
+ /* begline anchor, starting at beginning of the buffer. */
+ offset = 0;
+ else if (!(regex->flags & REG_NEWLINE))
+ /* begline anchor, starting in the middle of the text buffer,
+ and multiline regex is not specified - will never match.
+ Example: seq 2 | sed 'N;s/^/X/g' */
+ return 0;
+ else if (buf[buf_start_offset - 1] == buffer_delimiter)
+ /* begline anchor, starting in the middle of the text buffer,
+ with multiline match, and the preceding character
+ is the line delimiter - start here.
+ Example: seq 2 | sed 'N;s/^/X/mg' */
+ offset = buf_start_offset;
+ else
+ {
+ /* begline anchor, starting in the middle of the search buffer,
+ all previous optimizations didn't work: search
+ for the next line delimiter character in the buffer,
+ and start from there if found. */
+ const char *p = memchr (buf + buf_start_offset, buffer_delimiter,
+ buflen - buf_start_offset);
+
+ if (p == NULL)
+ return 0;
+
+ offset = p - buf + 1;
+ }
+
+ if (regsize)
+ {
+ size_t i;
+
+ if (!regarray->start)
+ {
+ regarray->start = MALLOC (1, regoff_t);
+ regarray->end = MALLOC (1, regoff_t);
+ regarray->num_regs = 1;
+ }
+
+ regarray->start[0] = offset;
+ regarray->end[0] = offset;
+
+ for (i = 1 ; i < regarray->num_regs; ++i)
+ regarray->start[i] = regarray->end[i] = -1;
+ }
+
+ return 1;
+ }
+
+ if (buf_start_offset == 0)
+ {
+ struct dfa *superset = dfasuperset (regex->dfa);
+
+ if (superset && !dfaexec (superset, buf, buf + buflen, true, NULL, NULL))
+ return 0;
+
+ if ((!regsize && (regex->flags & REG_NEWLINE))
+ || (!superset && dfaisfast (regex->dfa)))
+ {
+ bool backref = false;
+
+ if (!dfaexec (regex->dfa, buf, buf + buflen, true, NULL, &backref))
+ return 0;
+
+ if (!regsize && (regex->flags & REG_NEWLINE) && !backref)
+ return 1;
+ }
+ }
+
+ /* If the buffer delimiter is not newline character, we cannot use
+ newline_anchor flag of regex. So do it line-by-line, and add offset
+ value to results. */
+ if ((regex->flags & REG_NEWLINE) && buffer_delimiter != '\n')
+ {
+ const char *beg, *end;
+ const char *start;
+
+ beg = buf;
+
+ if (buf_start_offset > 0)
+ {
+ const char *eol = memrchr (buf, buffer_delimiter, buf_start_offset);
+
+ if (eol != NULL)
+ beg = eol + 1;
+ }
+
+ start = buf + buf_start_offset;
+
+ for (;;)
+ {
+ end = memchr (beg, buffer_delimiter, buf + buflen - beg);
+
+ if (end == NULL)
+ end = buf + buflen;
+
+ ret = re_search (&regex->pattern, beg, end - beg,
+ start - beg, end - start,
+ regsize ? regarray : NULL);
+
+ if (ret > -1)
+ {
+ size_t i;
+
+ ret += beg - buf;
+
+ if (regsize)
+ {
+ for (i = 0; i < regarray->num_regs; ++i)
+ {
+ if (regarray->start[i] > -1)
+ regarray->start[i] += beg - buf;
+ if (regarray->end[i] > -1)
+ regarray->end[i] += beg - buf;
+ }
+ }
+
+ break;
+ }
+
+ if (end == buf + buflen)
+ break;
+
+ beg = start = end + 1;
+ }
+ }
+ else
+ ret = re_search (&regex->pattern, buf, buflen, buf_start_offset,
+ buflen - buf_start_offset,
+ regsize ? regarray : NULL);
return (ret > -1);
#endif