Removed global variable caseins.

Added scanner stack flags for case-insensitivity. Moved case-folding code from DFA-generation to parse time read-macros. Added localized case-sensitivity syntax from Perl. Added test for new syntax in test suite. Documented new syntax.
author: John Millaway <john43@users.sourceforge.net> 2006-03-27 20:59:11 +0000
committer: John Millaway <john43@users.sourceforge.net> 2006-03-27 20:59:11 +0000
commit: a1037c0e1e7ffba540b80c38e4d95265312781b3 (patch)
tree: 1a2ee702913e41370908ff450d00c18d72570888
parent: 0b91d61cb2c8ea26ac273c7189a4481846ea969d (diff)
10 files changed, 117 insertions, 84 deletions
diff --git a/dfa.c b/dfa.c
index f3064c9..8613d75 100644
--- a/dfa.c
+++ b/dfa.c
@@ -697,21 +697,6 @@ void ntod ()
 			}
 		}
 
-		if (caseins && !useecs) {
-			register int j;
-
-			for (i = 'A', j = 'a'; i <= 'Z'; ++i, ++j) {
-				if (state[i] == 0 && state[j] != 0)
-					/* We're adding a transition. */
-					++totaltrans;
-
-				else if (state[i] != 0 && state[j] == 0)
-					/* We're taking away a transition. */
-					--totaltrans;
-
-				state[i] = state[j];
-			}
-		}
 
 		numsnpairs += totaltrans;
 
@@ -1018,10 +1003,6 @@ int symfollowset (ds, dsize, transsym, nset)
 				}
 		}
 
-		else if (sym >= 'A' && sym <= 'Z' && caseins)
-			flexfatal (_
-				   ("consistency check failed in symfollowset"));
-
 		else if (sym == SYM_EPSILON) {	/* do nothing */
 		}
 
diff --git a/doc/flex.texi b/doc/flex.texi
index f4ad42a..c8ec07f 100644
--- a/doc/flex.texi
+++ b/doc/flex.texi
@@ -746,6 +746,34 @@ the character with hexadecimal value 2a
 @item (r)
 match an @samp{r}; parentheses are used to override precedence (see below)
 
+@item (?r-s:pattern)
+apply option @samp{r} and omit option @samp{s} while interpreting pattern.
+Options may be zero or more of the characters @samp{i}, @samp{s}, or @samp{x}.
+
+@samp{i} means case-insensitive. @samp{-i} means case-sensitive.
+
+@samp{s} alters the meaning of the @samp{.} syntax to match any single byte whatsoever.
+@samp{-s} alters the meaning of @samp{.} to match any byte except @samp{\n}.
+
+@samp{x} ignores comments and whitespace in patterns. Whitespace is ignored unless
+it is backslash-escaped, contained within @samp{""}s, or appears inside a 
+character class. TODO -- Do we ignore Perl comments, C comments, or both?
+
+The following are all valid:
+
+@verbatim
+(?:foo)         same as  (foo)
+(?i:ab7)        same as  ([aA][bB]7)
+(?-i:ab)        same as  (ab)
+(?s:.)          same as  [\x00-\xFF]
+(?-s:.)         same as  [^\n]
+(?ix-s: a . b)  same as  ([Aa][^\n][bB])
+(?x:a  b)       same as  ("ab")
+(?x:a\ b)       same as  ("a b")
+(?x:a" "b)      same as  ("a b")
+(?x:a[ ]b)      same as  ("a b")
+@end verbatim
+
 @item (?# comment )
 omit everything within @samp{()}. The first @samp{)}
 character encountered ends the pattern. It is not possible for the comment
diff --git a/flexdef.h b/flexdef.h
index 3005dcf..4136578 100644
--- a/flexdef.h
+++ b/flexdef.h
@@ -341,7 +341,6 @@
  * nowarn - if true (-w), do not generate warnings
  * spprdflt - if true (-s), suppress the default rule
  * interactive - if true (-I), generate an interactive scanner
- * caseins - if true (-i), generate a case-insensitive scanner
  * lex_compat - if true (-l), maximize compatibility with AT&T lex
  * posix_compat - if true (-X), maximize compatibility with POSIX lex
  * do_yylineno - if true, generate code to maintain yylineno
@@ -383,7 +382,7 @@
 
 extern int printstats, syntaxerror, eofseen, ddebug, trace, nowarn,
 	spprdflt;
-extern int interactive, caseins, lex_compat, posix_compat, do_yylineno;
+extern int interactive, lex_compat, posix_compat, do_yylineno;
 extern int useecs, fulltbl, usemecs, fullspd;
 extern int gen_line_dirs, performance_report, backing_up_report;
 extern int reentrant, bison_bridge_lval, bison_bridge_lloc;
@@ -1189,19 +1188,18 @@ bool regmatch_empty (regmatch_t * m);
 /* From "scanflags.h" */
 typedef unsigned int scanflags_t;
 extern scanflags_t* _sf_stk;
-extern size_t _sf_n, _sf_max; /**< stack of scanner flags. */
+extern size_t _sf_top_ix, _sf_max; /**< stack of scanner flags. */
 #define _SF_CASE_INS   0x0001
 #define _SF_DOT_ALL    0x0002
 #define _SF_SKIP_WS    0x0004
-
-#define sf_top()           (_sf_stk[sf_n])
+#define sf_top()           (_sf_stk[_sf_top_ix])
 #define sf_case_ins()      (sf_top() & _SF_CASE_INS)
 #define sf_dot_all()       (sf_top() & _SF_DOT_ALL)
 #define sf_skip_ws()       (sf_top() & _SF_SKIP_WS)
 #define sf_set_case_ins(X)      ((X) ? (sf_top() |= _SF_CASE_INS) : (sf_top() &= ~_SF_CASE_INS))
 #define sf_set_dot_all(X)       ((X) ? (sf_top() |= _SF_DOT_ALL)  : (sf_top() &= ~_SF_DOT_ALL))
 #define sf_set_skip_ws(X)       ((X) ? (sf_top() |= _SF_SKIP_WS)  : (sf_top() &= ~_SF_SKIP_WS))
-
+extern void sf_init(void);
 extern void sf_push(void);
 extern void sf_pop(void);
 
diff --git a/gen.c b/gen.c
index 99e9c8b..aac39af 100644
--- a/gen.c
+++ b/gen.c
@@ -447,9 +447,6 @@ struct yytbl_data *mkecstbl (void)
 		(flex_int32_t *) calloc (tbl->td_lolen, sizeof (flex_int32_t));
 
 	for (i = 1; i < csize; ++i) {
-		if (caseins && isupper (i))
-			ecgroup[i] = ecgroup[tolower (i)];
-
 		ecgroup[i] = ABS (ecgroup[i]);
 		tdata[i] = ecgroup[i];
 	}
@@ -471,9 +468,6 @@ void genecs ()
 	out_str_dec (get_int32_decl (), "yy_ec", csize);
 
 	for (i = 1; i < csize; ++i) {
-		if (caseins && (i >= 'A') && (i <= 'Z'))
-			ecgroup[i] = ecgroup[clower (i)];
-
 		ecgroup[i] = ABS (ecgroup[i]);
 		mkdata (ecgroup[i]);
 	}
diff --git a/main.c b/main.c
index a1283cb..5d8a818 100644
--- a/main.c
+++ b/main.c
@@ -49,7 +49,7 @@ static char *basename2 PROTO ((char *path, int should_strip_ext));
 
 /* these globals are all defined and commented in flexdef.h */
 int     printstats, syntaxerror, eofseen, ddebug, trace, nowarn, spprdflt;
-int     interactive, caseins, lex_compat, posix_compat, do_yylineno,
+int     interactive, lex_compat, posix_compat, do_yylineno,
 	useecs, fulltbl, usemecs;
 int     fullspd, gen_line_dirs, performance_report, backing_up_report;
 int     C_plus_plus, long_align, use_read, yytext_is_array, do_yywrap,
@@ -740,7 +740,7 @@ void flexend (exit_status)
 			putc ('b', stderr);
 		if (ddebug)
 			putc ('d', stderr);
-		if (caseins)
+		if (sf_case_ins())
 			putc ('i', stderr);
 		if (lex_compat)
 			putc ('l', stderr);
@@ -929,7 +929,7 @@ void flexinit (argc, argv)
 	char   *arg;
 	scanopt_t sopt;
 
-	printstats = syntaxerror = trace = spprdflt = caseins = false;
+	printstats = syntaxerror = trace = spprdflt = false;
 	lex_compat = posix_compat = C_plus_plus = backing_up_report =
 		ddebug = fulltbl = false;
 	fullspd = long_align = nowarn = yymore_used = continued_action =
@@ -971,6 +971,8 @@ void flexinit (argc, argv)
         buf_append (&m4defs_buf, &m4defs_init_str, 2);
     }
 
+    sf_init ();
+
     /* initialize regex lib */
     flex_init_regex();
 
@@ -1086,7 +1088,7 @@ void flexinit (argc, argv)
 			break;
 
 		case OPT_CASE_INSENSITIVE:
-			caseins = true;
+			sf_set_case_ins(true);
 			break;
 
 		case OPT_LEX_COMPAT:
diff --git a/parse.y b/parse.y
index 2b80bb7..cecda23 100644
--- a/parse.y
+++ b/parse.y
@@ -786,13 +786,14 @@ singleton	:  singleton '*'
 			{
 			++rulelen;
 
-			if ( caseins && isupper($1))
-				$1 = clower( $1 );
-
 			if ($1 == nlch)
 				rule_has_nl[num_rules] = true;
 
-			$$ = mkstate( $1 );
+            if (sf_case_ins() && has_case($1))
+                /* create an alternation, as in (a|A) */
+                $$ = mkor (mkstate($1), mkstate(reverse_case($1)));
+            else
+                $$ = mkstate( $1 );
 			}
 		;
 fullccl:
@@ -814,24 +815,17 @@ braceccl:
 ccl		:  ccl CHAR '-' CHAR
 			{
 
-			if (caseins)
+			if (sf_case_ins())
 			  {
-			    /* Squish the character range to lowercase only if BOTH
-			     * ends of the range are uppercase.
-			     */
-			    if (isupper ($2) && isupper ($4))
-			      {
-				$2 = tolower ($2);
-				$4 = tolower ($4);
-			      }
 
 			    /* If one end of the range has case and the other
 			     * does not, or the cases are different, then we're not
 			     * sure what range the user is trying to express.
 			     * Examples: [@-z] or [S-t]
 			     */
-			    else if (has_case ($2) != has_case ($4)
-				     || (has_case ($2) && (b_islower ($2) != b_islower ($4))))
+			    if (has_case ($2) != has_case ($4)
+				     || (has_case ($2) && (b_islower ($2) != b_islower ($4)))
+				     || (has_case ($2) && (b_isupper ($2) != b_isupper ($4))))
 			      format_warn3 (
 			      _("the character range [%c-%c] is ambiguous in a case-insensitive scanner"),
 					    $2, $4);
@@ -860,6 +854,19 @@ ccl		:  ccl CHAR '-' CHAR
 				 */
 				cclsorted = cclsorted && ($2 > lastchar);
 				lastchar = $4;
+
+                /* Do it again for upper/lowercase */
+                if (sf_case_ins() && has_case($2) && has_case($4)){
+                    $2 = reverse_case ($2);
+                    $4 = reverse_case ($4);
+                    
+                    for ( i = $2; i <= $4; ++i )
+                        ccladd( $1, i );
+
+                    cclsorted = cclsorted && ($2 > lastchar);
+                    lastchar = $4;
+                }
+
 				}
 
 			$$ = $1;
@@ -867,12 +874,19 @@ ccl		:  ccl CHAR '-' CHAR
 
 		|  ccl CHAR
 			{
-			if ( caseins && isupper($2))
-				$2 = clower( $2 );
-
 			ccladd( $1, $2 );
 			cclsorted = cclsorted && ($2 > lastchar);
 			lastchar = $2;
+
+            /* Case-insensitive: also add the opposite-case twin of $2 to this same ccl ($1). */
+            if (sf_case_ins() && has_case($2)){
+                $2 = reverse_case ($2);
+                ccladd ($1, $2);
+
+                cclsorted = cclsorted && ($2 > lastchar);
+                lastchar = $2;
+            }
+
 			$$ = $1;
 			}
 
@@ -898,16 +912,19 @@ ccl_expr:
 		|  CCE_CNTRL	{ CCL_EXPR(iscntrl); }
 		|  CCE_DIGIT	{ CCL_EXPR(isdigit); }
 		|  CCE_GRAPH	{ CCL_EXPR(isgraph); }
-		|  CCE_LOWER	{ CCL_EXPR(islower); }
+		|  CCE_LOWER	{ 
+                          CCL_EXPR(islower);
+                          if (sf_case_ins())
+                              CCL_EXPR(isupper);
+                        }
 		|  CCE_PRINT	{ CCL_EXPR(isprint); }
 		|  CCE_PUNCT	{ CCL_EXPR(ispunct); }
 		|  CCE_SPACE	{ CCL_EXPR(isspace); }
 		|  CCE_XDIGIT	{ CCL_EXPR(isxdigit); }
 		|  CCE_UPPER	{
-				if ( caseins )
-					CCL_EXPR(islower);
-				else
-					CCL_EXPR(isupper);
+                    CCL_EXPR(isupper);
+                    if (sf_case_ins())
+                        CCL_EXPR(islower);
 				}
 
         |  CCE_NEG_ALNUM	{ CCL_NEG_EXPR(isalnum); }
@@ -921,13 +938,13 @@ ccl_expr:
 		|  CCE_NEG_SPACE	{ CCL_NEG_EXPR(isspace); }
 		|  CCE_NEG_XDIGIT	{ CCL_NEG_EXPR(isxdigit); }
 		|  CCE_NEG_LOWER	{ 
-				if ( caseins )
+				if ( sf_case_ins() )
 					warn(_("[:^lower:] is ambiguous in case insensitive scanner"));
 				else
 					CCL_NEG_EXPR(islower);
 				}
 		|  CCE_NEG_UPPER	{
-				if ( caseins )
+				if ( sf_case_ins() )
 					warn(_("[:^upper:] ambiguous in case insensitive scanner"));
 				else
 					CCL_NEG_EXPR(isupper);
@@ -936,15 +953,17 @@ ccl_expr:
 		
 string		:  string CHAR
 			{
-			if ( caseins && isupper($2))
-				$2 = clower( $2 );
-
 			if ( $2 == nlch )
 				rule_has_nl[num_rules] = true;
 
 			++rulelen;
 
-			$$ = link_machines( $1, mkstate( $2 ) );
+            if (sf_case_ins() && has_case($2))
+                $$ = mkor (mkstate($2), mkstate(reverse_case($2)));
+            else
+                $$ = mkstate ($2);
+
+			$$ = link_machines( $1, $$);
 			}
 
 		|
diff --git a/scan.l b/scan.l
index 367cf43..4ae7ea4 100644
--- a/scan.l
+++ b/scan.l
@@ -338,8 +338,8 @@ M4QEND      "]]"
                             bison_bridge_lval = true;
                      }
 	"c++"		C_plus_plus = option_sense;
-	caseful|case-sensitive		caseins = ! option_sense;
-	caseless|case-insensitive	caseins = option_sense;
+	caseful|case-sensitive		sf_set_case_ins(!option_sense);
+	caseless|case-insensitive	sf_set_case_ins(option_sense);
 	debug		ddebug = option_sense;
 	default		spprdflt = ! option_sense;
 	ecs		useecs = option_sense;
@@ -657,9 +657,9 @@ nmstr[yyleng - 2 - end_is_ws] = '\0';  /* chop trailing brace */
 			}
 
     "(?#"       BEGIN(EXTENDED_COMMENT);
-    "(?"        BEGIN(GROUP_WITH_PARAMS); return '('; /* TODO: push parameterized rule state. */
-    "("         return '('; /* TODO: push  parameterized rule state. */
-    ")"         return ')'; /* TODO: pop  parameterized rule state. */
+    "(?"        sf_push(); BEGIN(GROUP_WITH_PARAMS); return '(';
+    "("         sf_push(); return '(';
+    ")"         sf_pop(); return ')';
 
 	[/|*+?.(){}]	return (unsigned char) yytext[0];
 	.		RETURNCHAR;
@@ -696,13 +696,15 @@ nmstr[yyleng - 2 - end_is_ws] = '\0';  /* chop trailing brace */
 <GROUP_WITH_PARAMS>{
     ":"     BEGIN(SECT2);
     "-"     BEGIN(GROUP_MINUS_PARAMS);
-    i       ; /* TODO: temporarily case-insensitive. */
-    s       ; /* TODO: temporary dot-all. */
+    i       sf_set_case_ins(1);
+    s       sf_set_dot_all(1);
+    x       sf_set_skip_ws(1);
 }
 <GROUP_MINUS_PARAMS>{
     ":"     BEGIN(SECT2);
-    i       ; /* TODO: temporarily NOT case-insensitive. */
-    s       ; /* TODO: temporarily NOT dot-all. */
+    i       sf_set_case_ins(0);
+    s       sf_set_dot_all(0);
+    x       sf_set_skip_ws(0);
 }
 
 <FIRSTCCL>{
diff --git a/scanflags.c b/scanflags.c
index 503ade0..20ff501 100644
--- a/scanflags.c
+++ b/scanflags.c
@@ -34,24 +34,24 @@
 #include "flexdef.h"
 
 scanflags_t* _sf_stk = NULL;
-size_t _sf_n=0, _sf_max=0;
+size_t _sf_top_ix=0, _sf_max=0;
 
 void
 sf_push (void)
 {
-    if (_sf_n + 1 >= _sf_max)
+    if (_sf_top_ix + 1 >= _sf_max)
         _sf_stk = (scanflags_t*) flex_realloc ( (void*) _sf_stk, sizeof(scanflags_t) * (_sf_max += 32));
 
     // copy the top element
-    _sf_stk[_sf_n + 1] = _sf_stk[_sf_n];
-    ++_sf_n;
+    _sf_stk[_sf_top_ix + 1] = _sf_stk[_sf_top_ix];
+    ++_sf_top_ix;
 }
 
 void
 sf_pop (void)
 {
-    assert(_sf_n > 0);
-    --_sf_n;
+    assert(_sf_top_ix > 0);
+    --_sf_top_ix;
 }
 
 /* one-time initialization. Should be called before any sf_ functions. */
@@ -60,7 +60,7 @@ sf_init (void)
 {
     assert(_sf_stk == NULL);
     _sf_stk = (scanflags_t*) flex_alloc ( sizeof(scanflags_t) * (_sf_max = 32));
-    _sf_stk[_sf_n] = 0;
+    _sf_stk[_sf_top_ix] = 0;
 }
 
 /* vim:set expandtab cindent tabstop=4 softtabstop=4 shiftwidth=4 textwidth=0: */
diff --git a/tests/test-ccl/scanner.l b/tests/test-ccl/scanner.l
index 383c6bc..1cc7917 100644
--- a/tests/test-ccl/scanner.l
+++ b/tests/test-ccl/scanner.l
@@ -28,6 +28,7 @@
 #include "config.h"
 /*#include "parser.h" */
 
+#define err_abort() do{printf("ERROR: flex line %d. input line %d.\n", __LINE__, yylineno); abort();} while(0)
 %}
 
 %option 8bit outfile="scanner.c" prefix="test"
@@ -53,11 +54,15 @@
 ^"abcd-bc:"([abcd]{-}[bc])+@abcd-bc@\n          printf("OK: %s", yytext); ++yylineno; return 1;
 ^"abcde-b-c:"([abcde]{-}[b]{-}[c])+@abcde-b-c@\n    printf("OK: %s", yytext); ++yylineno; return 1;
 ^"^XY-^XYZ:"([^XY]{-}[^XYZ])+@^XY-^XYZ@\n    printf("OK: %s", yytext); ++yylineno; return 1;
+^"ia:"(?i:a)+@ia@\n                          printf("OK: %s", yytext); ++yylineno; return 1;
+^"iabc:"(?i:abc)+@iabc@\n                    printf("OK: %s", yytext); ++yylineno; return 1;
+^"ia-c:"(?i:[a-c]+)@ia-c@\n                             printf("OK: %s", yytext); ++yylineno; return 1;
 
-.|\n                       {
-    printf("ERROR: at line %d\n", yylineno);
-    abort();
-    }
+    /* We don't want this one to match. */
+^"check-a:"(?i:(?-i:A))@\n               err_abort();
+^"check-a:"(?i:(?-i:(?i:A)))@\n          printf("OK: %s", yytext); ++yylineno; return 1;
+
+.|\n                       { err_abort(); }
 %%
 
 int main(void);
diff --git a/tests/test-ccl/test.input b/tests/test-ccl/test.input
index 74c96a9..af3b840 100644
--- a/tests/test-ccl/test.input
+++ b/tests/test-ccl/test.input
@@ -13,3 +13,7 @@ l-xyz:abcdefghijklmnopqrstuvw@l-xyz@
 abcd-bc:aaaaddddaaaa@abcd-bc@
 abcde-b-c:aaaaddddeeee@abcde-b-c@
 ^XY-^XYZ:ZZZZZZZZZZZ@^XY-^XYZ@
+ia:AaAa@ia@
+iabc:ABCabcAbCaBc@iabc@
+ia-c:ABCabcAbCaBc@ia-c@
+check-a:a@
author	John Millaway <john43@users.sourceforge.net>	2006-03-27 20:59:11 +0000
committer	John Millaway <john43@users.sourceforge.net>	2006-03-27 20:59:11 +0000
commit	a1037c0e1e7ffba540b80c38e4d95265312781b3 (patch)
tree	1a2ee702913e41370908ff450d00c18d72570888
parent	0b91d61cb2c8ea26ac273c7189a4481846ea969d (diff)