diff options
-rw-r--r-- | dfa.c | 19 | ||||
-rw-r--r-- | doc/flex.texi | 28 | ||||
-rw-r--r-- | flexdef.h | 10 | ||||
-rw-r--r-- | gen.c | 6 | ||||
-rw-r--r-- | main.c | 10 | ||||
-rw-r--r-- | parse.y | 77 | ||||
-rw-r--r-- | scan.l | 20 | ||||
-rw-r--r-- | scanflags.c | 14 | ||||
-rw-r--r-- | tests/test-ccl/scanner.l | 13 | ||||
-rw-r--r-- | tests/test-ccl/test.input | 4 |
10 files changed, 117 insertions, 84 deletions
@@ -697,21 +697,6 @@ void ntod () } } - if (caseins && !useecs) { - register int j; - - for (i = 'A', j = 'a'; i <= 'Z'; ++i, ++j) { - if (state[i] == 0 && state[j] != 0) - /* We're adding a transition. */ - ++totaltrans; - - else if (state[i] != 0 && state[j] == 0) - /* We're taking away a transition. */ - --totaltrans; - - state[i] = state[j]; - } - } numsnpairs += totaltrans; @@ -1018,10 +1003,6 @@ int symfollowset (ds, dsize, transsym, nset) } } - else if (sym >= 'A' && sym <= 'Z' && caseins) - flexfatal (_ - ("consistency check failed in symfollowset")); - else if (sym == SYM_EPSILON) { /* do nothing */ } diff --git a/doc/flex.texi b/doc/flex.texi index f4ad42a..c8ec07f 100644 --- a/doc/flex.texi +++ b/doc/flex.texi @@ -746,6 +746,34 @@ the character with hexadecimal value 2a @item (r) match an @samp{r}; parentheses are used to override precedence (see below) +@item (?r-s:pattern) +apply option @samp{r} and omit option @samp{s} while interpreting pattern. +Options may be zero or more of the characters @samp{i}, @samp{s}, or @samp{x}. + +@samp{i} means case-insensitive. @samp{-i} means case-sensitive. + +@samp{s} alters the meaning of the @samp{.} syntax to match any single byte whatsoever. +@samp{-s} alters the meaning of @samp{.} to match any byte except @samp{\n}. + +@samp{x} ignores comments and whitespace in patterns. Whitespace is ignored unless +it is backslash-escaped, contained within @samp{""}s, or appears inside a +character class. TODO -- Do we ignore Perl comments, C comments, or both? + +The following are all valid: + +@verbatim +(?:foo) same as (foo) +(?i:ab7) same as ([aA][bB]7) +(?-i:ab) same as (ab) +(?s:.) same as [\x00-\xFF] +(?-s:.) same as [^\n] +(?ix-s: a . b) same as ([Aa][^\n][bB]) +(?x:a b) same as ("ab") +(?x:a\ b) same as ("a b") +(?x:a" "b) same as ("a b") +(?x:a[ ]b) same as ("a b") +@end verbatim + @item (?# comment ) omit everything within @samp{()}. The first @samp{)} character encountered ends the pattern. It is not possible to for the comment @@ -341,7 +341,6 @@ * nowarn - if true (-w), do not generate warnings * spprdflt - if true (-s), suppress the default rule * interactive - if true (-I), generate an interactive scanner - * caseins - if true (-i), generate a case-insensitive scanner * lex_compat - if true (-l), maximize compatibility with AT&T lex * posix_compat - if true (-X), maximize compatibility with POSIX lex * do_yylineno - if true, generate code to maintain yylineno @@ -383,7 +382,7 @@ extern int printstats, syntaxerror, eofseen, ddebug, trace, nowarn, spprdflt; -extern int interactive, caseins, lex_compat, posix_compat, do_yylineno; +extern int interactive, lex_compat, posix_compat, do_yylineno; extern int useecs, fulltbl, usemecs, fullspd; extern int gen_line_dirs, performance_report, backing_up_report; extern int reentrant, bison_bridge_lval, bison_bridge_lloc; @@ -1189,19 +1188,18 @@ bool regmatch_empty (regmatch_t * m); /* From "scanflags.h" */ typedef unsigned int scanflags_t; extern scanflags_t* _sf_stk; -extern size_t _sf_n, _sf_max; /**< stack of scanner flags. */ +extern size_t _sf_top_ix, _sf_max; /**< stack of scanner flags. */ #define _SF_CASE_INS 0x0001 #define _SF_DOT_ALL 0x0002 #define _SF_SKIP_WS 0x0004 - -#define sf_top() (_sf_stk[sf_n]) +#define sf_top() (_sf_stk[_sf_top_ix]) #define sf_case_ins() (sf_top() & _SF_CASE_INS) #define sf_dot_all() (sf_top() & _SF_DOT_ALL) #define sf_skip_ws() (sf_top() & _SF_SKIP_WS) #define sf_set_case_ins(X) ((X) ? (sf_top() |= _SF_CASE_INS) : (sf_top() &= ~_SF_CASE_INS)) #define sf_set_dot_all(X) ((X) ? (sf_top() |= _SF_DOT_ALL) : (sf_top() &= ~_SF_DOT_ALL)) #define sf_set_skip_ws(X) ((X) ? (sf_top() |= _SF_SKIP_WS) : (sf_top() &= ~_SF_SKIP_WS)) - +extern void sf_init(void); extern void sf_push(void); extern void sf_pop(void); @@ -447,9 +447,6 @@ struct yytbl_data *mkecstbl (void) (flex_int32_t *) calloc (tbl->td_lolen, sizeof (flex_int32_t)); for (i = 1; i < csize; ++i) { - if (caseins && isupper (i)) - ecgroup[i] = ecgroup[tolower (i)]; - ecgroup[i] = ABS (ecgroup[i]); tdata[i] = ecgroup[i]; } @@ -471,9 +468,6 @@ void genecs () out_str_dec (get_int32_decl (), "yy_ec", csize); for (i = 1; i < csize; ++i) { - if (caseins && (i >= 'A') && (i <= 'Z')) - ecgroup[i] = ecgroup[clower (i)]; - ecgroup[i] = ABS (ecgroup[i]); mkdata (ecgroup[i]); } @@ -49,7 +49,7 @@ static char *basename2 PROTO ((char *path, int should_strip_ext)); /* these globals are all defined and commented in flexdef.h */ int printstats, syntaxerror, eofseen, ddebug, trace, nowarn, spprdflt; -int interactive, caseins, lex_compat, posix_compat, do_yylineno, +int interactive, lex_compat, posix_compat, do_yylineno, useecs, fulltbl, usemecs; int fullspd, gen_line_dirs, performance_report, backing_up_report; int C_plus_plus, long_align, use_read, yytext_is_array, do_yywrap, @@ -740,7 +740,7 @@ void flexend (exit_status) putc ('b', stderr); if (ddebug) putc ('d', stderr); - if (caseins) + if (sf_case_ins()) putc ('i', stderr); if (lex_compat) putc ('l', stderr); @@ -929,7 +929,7 @@ void flexinit (argc, argv) char *arg; scanopt_t sopt; - printstats = syntaxerror = trace = spprdflt = caseins = false; + printstats = syntaxerror = trace = spprdflt = false; lex_compat = posix_compat = C_plus_plus = backing_up_report = ddebug = fulltbl = false; fullspd = long_align = nowarn = yymore_used = continued_action = @@ -971,6 +971,8 @@ void flexinit (argc, argv) buf_append (&m4defs_buf, &m4defs_init_str, 2); } + sf_init (); + /* initialize regex lib */ flex_init_regex(); @@ -1086,7 +1088,7 @@ void flexinit (argc, argv) break; case OPT_CASE_INSENSITIVE: - caseins = true; + sf_set_case_ins(true); break; case OPT_LEX_COMPAT: @@ -786,13 +786,14 @@ singleton : singleton '*' { ++rulelen; - if ( caseins && isupper($1)) - $1 = clower( $1 ); - if ($1 == nlch) rule_has_nl[num_rules] = true; - $$ = mkstate( $1 ); + if (sf_case_ins() && has_case($1)) + /* create an alternation, as in (a|A) */ + $$ = mkor (mkstate($1), mkstate(reverse_case($1))); + else + $$ = mkstate( $1 ); } ; fullccl: @@ -814,24 +815,17 @@ braceccl: ccl : ccl CHAR '-' CHAR { - if (caseins) + if (sf_case_ins()) { - /* Squish the character range to lowercase only if BOTH - * ends of the range are uppercase. - */ - if (isupper ($2) && isupper ($4)) - { - $2 = tolower ($2); - $4 = tolower ($4); - } /* If one end of the range has case and the other * does not, or the cases are different, then we're not * sure what range the user is trying to express. * Examples: [@-z] or [S-t] */ - else if (has_case ($2) != has_case ($4) - || (has_case ($2) && (b_islower ($2) != b_islower ($4)))) + if (has_case ($2) != has_case ($4) + || (has_case ($2) && (b_islower ($2) != b_islower ($4))) + || (has_case ($2) && (b_isupper ($2) != b_isupper ($4)))) format_warn3 ( _("the character range [%c-%c] is ambiguous in a case-insensitive scanner"), $2, $4); @@ -860,6 +854,19 @@ ccl : ccl CHAR '-' CHAR */ cclsorted = cclsorted && ($2 > lastchar); lastchar = $4; + + /* Do it again for upper/lowercase */ + if (sf_case_ins() && has_case($2) && has_case($4)){ + $2 = reverse_case ($2); + $4 = reverse_case ($4); + + for ( i = $2; i <= $4; ++i ) + ccladd( $1, i ); + + cclsorted = cclsorted && ($2 > lastchar); + lastchar = $4; + } + } $$ = $1; @@ -867,12 +874,19 @@ ccl : ccl CHAR '-' CHAR | ccl CHAR { - if ( caseins && isupper($2)) - $2 = clower( $2 ); - ccladd( $1, $2 ); cclsorted = cclsorted && ($2 > lastchar); lastchar = $2; + + /* Do it again for upper/lowercase */ + if (sf_case_ins() && has_case($2)){ + $1 = reverse_case ($2); + ccladd ($1, reverse_case ($2)); + + cclsorted = cclsorted && ($2 > lastchar); + lastchar = $2; + } + $$ = $1; } @@ -898,16 +912,19 @@ ccl_expr: | CCE_CNTRL { CCL_EXPR(iscntrl); } | CCE_DIGIT { CCL_EXPR(isdigit); } | CCE_GRAPH { CCL_EXPR(isgraph); } - | CCE_LOWER { CCL_EXPR(islower); } + | CCE_LOWER { + CCL_EXPR(islower); + if (sf_case_ins()) + CCL_EXPR(isupper); + } | CCE_PRINT { CCL_EXPR(isprint); } | CCE_PUNCT { CCL_EXPR(ispunct); } | CCE_SPACE { CCL_EXPR(isspace); } | CCE_XDIGIT { CCL_EXPR(isxdigit); } | CCE_UPPER { - if ( caseins ) - CCL_EXPR(islower); - else - CCL_EXPR(isupper); + CCL_EXPR(isupper); + if (sf_case_ins()) + CCL_EXPR(islower); } | CCE_NEG_ALNUM { CCL_NEG_EXPR(isalnum); } @@ -921,13 +938,13 @@ ccl_expr: | CCE_NEG_SPACE { CCL_NEG_EXPR(isspace); } | CCE_NEG_XDIGIT { CCL_NEG_EXPR(isxdigit); } | CCE_NEG_LOWER { - if ( caseins ) + if ( sf_case_ins() ) warn(_("[:^lower:] is ambiguous in case insensitive scanner")); else CCL_NEG_EXPR(islower); } | CCE_NEG_UPPER { - if ( caseins ) + if ( sf_case_ins() ) warn(_("[:^upper:] ambiguous in case insensitive scanner")); else CCL_NEG_EXPR(isupper); @@ -936,15 +953,17 @@ ccl_expr: string : string CHAR { - if ( caseins && isupper($2)) - $2 = clower( $2 ); - if ( $2 == nlch ) rule_has_nl[num_rules] = true; ++rulelen; - $$ = link_machines( $1, mkstate( $2 ) ); + if (sf_case_ins() && has_case($2)) + $$ = mkor (mkstate($2), mkstate(reverse_case($2))); + else + $$ = mkstate ($2); + + $$ = link_machines( $1, $$); } | @@ -338,8 +338,8 @@ M4QEND "]]" bison_bridge_lval = true; } "c++" C_plus_plus = option_sense; - caseful|case-sensitive caseins = ! option_sense; - caseless|case-insensitive caseins = option_sense; + caseful|case-sensitive sf_set_case_ins(!option_sense); + caseless|case-insensitive sf_set_case_ins(option_sense); debug ddebug = option_sense; default spprdflt = ! option_sense; ecs useecs = option_sense; @@ -657,9 +657,9 @@ nmstr[yyleng - 2 - end_is_ws] = '\0'; /* chop trailing brace */ } "(?#" BEGIN(EXTENDED_COMMENT); - "(?" BEGIN(GROUP_WITH_PARAMS); return '('; /* TODO: push parameterized rule state. */ - "(" return '('; /* TODO: push parameterized rule state. */ - ")" return ')'; /* TODO: pop parameterized rule state. */ + "(?" sf_push(); BEGIN(GROUP_WITH_PARAMS); return '('; + "(" sf_push(); return '('; + ")" sf_pop(); return ')'; [/|*+?.(){}] return (unsigned char) yytext[0]; . RETURNCHAR; @@ -696,13 +696,15 @@ nmstr[yyleng - 2 - end_is_ws] = '\0'; /* chop trailing brace */ <GROUP_WITH_PARAMS>{ ":" BEGIN(SECT2); "-" BEGIN(GROUP_MINUS_PARAMS); - i ; /* TODO: temporarily case-insensitive. */ - s ; /* TODO: temporary dot-all. */ + i sf_set_case_ins(1); + s sf_set_dot_all(1); + x sf_set_skip_ws(1); } <GROUP_MINUS_PARAMS>{ ":" BEGIN(SECT2); - i ; /* TODO: temporarily NOT case-insensitive. */ - s ; /* TODO: temporarily NOT dot-all. */ + i sf_set_case_ins(0); + s sf_set_dot_all(0); + x sf_set_skip_ws(0); } <FIRSTCCL>{ diff --git a/scanflags.c b/scanflags.c index 503ade0..20ff501 100644 --- a/scanflags.c +++ b/scanflags.c @@ -34,24 +34,24 @@ #include "flexdef.h" scanflags_t* _sf_stk = NULL; -size_t _sf_n=0, _sf_max=0; +size_t _sf_top_ix=0, _sf_max=0; void sf_push (void) { - if (_sf_n + 1 >= _sf_max) + if (_sf_top_ix + 1 >= _sf_max) _sf_stk = (scanflags_t*) flex_realloc ( (void*) _sf_stk, sizeof(scanflags_t) * (_sf_max += 32)); // copy the top element - _sf_stk[_sf_n + 1] = _sf_stk[_sf_n]; - ++_sf_n; + _sf_stk[_sf_top_ix + 1] = _sf_stk[_sf_top_ix]; + ++_sf_top_ix; } void sf_pop (void) { - assert(_sf_n > 0); - --_sf_n; + assert(_sf_top_ix > 0); + --_sf_top_ix; } /* one-time initialization. Should be called before any sf_ functions. */ @@ -60,7 +60,7 @@ sf_init (void) { assert(_sf_stk == NULL); _sf_stk = (scanflags_t*) flex_alloc ( sizeof(scanflags_t) * (_sf_max = 32)); - _sf_stk[_sf_n] = 0; + _sf_stk[_sf_top_ix] = 0; } /* vim:set expandtab cindent tabstop=4 softtabstop=4 shiftwidth=4 textwidth=0: */ diff --git a/tests/test-ccl/scanner.l b/tests/test-ccl/scanner.l index 383c6bc..1cc7917 100644 --- a/tests/test-ccl/scanner.l +++ b/tests/test-ccl/scanner.l @@ -28,6 +28,7 @@ #include "config.h" /*#include "parser.h" */ +#define err_abort() do{printf("ERROR: flex line %d. input line %d.\n", __LINE__, yylineno); abort();} while(0) %} %option 8bit outfile="scanner.c" prefix="test" @@ -53,11 +54,15 @@ ^"abcd-bc:"([abcd]{-}[bc])+@abcd-bc@\n printf("OK: %s", yytext); ++yylineno; return 1; ^"abcde-b-c:"([abcde]{-}[b]{-}[c])+@abcde-b-c@\n printf("OK: %s", yytext); ++yylineno; return 1; ^"^XY-^XYZ:"([^XY]{-}[^XYZ])+@^XY-^XYZ@\n printf("OK: %s", yytext); ++yylineno; return 1; +^"ia:"(?i:a)+@ia@\n printf("OK: %s", yytext); ++yylineno; return 1; +^"iabc:"(?i:abc)+@iabc@\n printf("OK: %s", yytext); ++yylineno; return 1; +^"ia-c:"(?i:[a-c]+)@ia-c@\n printf("OK: %s", yytext); ++yylineno; return 1; -.|\n { - printf("ERROR: at line %d\n", yylineno); - abort(); - } + /* We don't want this one to match. */ +^"check-a:"(?i:(?-i:A))@\n err_abort(); +^"check-a:"(?i:(?-i:(?i:A)))@\n printf("OK: %s", yytext); ++yylineno; return 1; + +.|\n { err_abort(); } %% int main(void); diff --git a/tests/test-ccl/test.input b/tests/test-ccl/test.input index 74c96a9..af3b840 100644 --- a/tests/test-ccl/test.input +++ b/tests/test-ccl/test.input @@ -13,3 +13,7 @@ l-xyz:abcdefghijklmnopqrstuvw@l-xyz@ abcd-bc:aaaaddddaaaa@abcd-bc@ abcde-b-c:aaaaddddeeee@abcde-b-c@ ^XY-^XYZ:ZZZZZZZZZZZ@^XY-^XYZ@ +ia:AaAa@ia@ +iabc:ABCabcAbCaBc@iabc@ +ia-c:ABCabcAbCaBc@ia-c@ +check-a:a@ |