diff options
author | Vern Paxson <vern@ee.lbl.gov> | 1987-11-08 22:24:44 +0000 |
---|---|---|
committer | Vern Paxson <vern@ee.lbl.gov> | 1987-11-08 22:24:44 +0000 |
commit | 2cc578462372baa1b85936749946608d7f36415f (patch) | |
tree | b103972512328042bbc4e7541616e88369619b2c /nfa.c |
Initial revision
Diffstat (limited to 'nfa.c')
-rw-r--r-- | nfa.c | 542 |
1 files changed, 542 insertions, 0 deletions
@@ -0,0 +1,542 @@ +/* lexnfa - NFA construction routines */ + +/* + * Copyright (c) University of California, 1987 + */ + +#include "flexdef.h" + +/* add_accept - add an accepting state to a machine + * + * synopsis + * + * add_accept( mach, headcnt, trailcnt ); + * + * the global ACCNUM is incremented and the new value becomes mach's + * accepting number. if headcnt or trailcnt is non-zero then the machine + * recognizes a pattern with trailing context. headcnt is the number of + * characters in the matched part of the pattern, or zero if the matched + * part has variable length. trailcnt is the number of trailing context + * characters in the pattern, or zero if the trailing context has variable + * length. + */ +add_accept( mach, headcnt, trailcnt ) +int mach, headcnt, trailcnt; + + { + int astate; + + printf( "case %d:\n", ++accnum ); + + if ( headcnt > 0 || trailcnt > 0 ) + { /* do trailing context magic to not match the trailing characters */ + printf( "YYDOBEFORESCAN; /* undo effects of setting up yytext */\n" ); + + if ( headcnt > 0 ) + { + if ( ! genftl || headcnt > 1 ) + printf( "yycbufp = yybbufp + %d;\n", + genftl ? headcnt - 1 : headcnt ); + else + printf( "yycbufp = yybbufp;\n" ); + } + + else + printf( "yycbufp -= %d;\n", trailcnt ); + + printf( "YYDOBEFOREACTION; /* set up yytext again */\n" ); + } + + line_directive_out(); + + /* hang the accepting number off an epsilon state. if it is associated + * with a state that has a non-epsilon out-transition, then the state + * will accept BEFORE it makes that transition, i.e. one character too soon + */ + + if ( transchar[finalst[mach]] == SYM_EPSILON ) + accptnum[finalst[mach]] = accnum; + + else + { + astate = mkstate( SYM_EPSILON ); + accptnum[astate] = accnum; + mach = link_machines( mach, astate ); + } + } + + +/* copysingl - make a given number of copies of a singleton machine + * + * synopsis + * + * newsng = copysingl( singl, num ); + * + * newsng - a new singleton composed of num copies of singl + * singl - a singleton machine + * num - the number of copies of singl to be present in newsng + */ +int copysingl( singl, num ) +int singl, num; + + { + int copy, i; + + copy = mkstate( SYM_EPSILON ); + + for ( i = 1; i <= num; ++i ) + copy = link_machines( copy, dupmachine( singl ) ); + + return ( copy ); + } + + +/* dumpnfa - debugging routine to write out an nfa + * + * synopsis + * int state1; + * dumpnfa( state1 ); + */ +dumpnfa( state1 ) +int state1; + + { + int sym, tsp1, tsp2, anum, ns; + + fprintf( stderr, "\n\n********** beginning dump of nfa with start state %d\n", + state1 ); + + /* we probably should loop starting at firstst[state1] and going to + * lastst[state1], but they're not maintained properly when we "or" + * all of the rules together. So we use our knowledge that the machine + * starts at state 1 and ends at lastnfa. + */ + + /* for ( ns = firstst[state1]; ns <= lastst[state1]; ++ns ) */ + for ( ns = 1; ns <= lastnfa; ++ns ) + { + fprintf( stderr, "state # %4d\t", ns ); + + sym = transchar[ns]; + tsp1 = trans1[ns]; + tsp2 = trans2[ns]; + anum = accptnum[ns]; + + fprintf( stderr, "%3d: %4d, %4d", sym, tsp1, tsp2 ); + + if ( anum != NIL ) + fprintf( stderr, " [%d]", anum ); + + fprintf( stderr, "\n" ); + } + + fprintf( stderr, "********** end of dump\n" ); + } + + +/* dupmachine - make a duplicate of a given machine + * + * synopsis + * + * copy = dupmachine( mach ); + * + * copy - holds duplicate of mach + * mach - machine to be duplicated + * + * note that the copy of mach is NOT an exact duplicate; rather, all the + * transition states values are adjusted so that the copy is self-contained, + * as the original should have been. + * + * also note that the original MUST be contiguous, with its low and high + * states accessible by the arrays firstst and lastst + */ +int dupmachine( mach ) +int mach; + + { + int i, state, init, last = lastst[mach], state_offset; + + for ( i = firstst[mach]; i <= last; ++i ) + { + state = mkstate( transchar[i] ); + + if ( trans1[i] != NO_TRANSITION ) + { + mkxtion( finalst[state], trans1[i] + state - i ); + + if ( transchar[i] == SYM_EPSILON && trans2[i] != NO_TRANSITION ) + mkxtion( finalst[state], trans2[i] + state - i ); + } + + accptnum[state] = accptnum[i]; + } + + state_offset = state - i + 1; + + init = mach + state_offset; + firstst[init] = firstst[mach] + state_offset; + finalst[init] = finalst[mach] + state_offset; + lastst[init] = lastst[mach] + state_offset; + + return ( init ); + } + + +/* link_machines - connect two machines together + * + * synopsis + * + * new = link_machines( first, last ); + * + * new - a machine constructed by connecting first to last + * first - the machine whose successor is to be last + * last - the machine whose predecessor is to be first + * + * note: this routine concatenates the machine first with the machine + * last to produce a machine new which will pattern-match first first + * and then last, and will fail if either of the sub-patterns fails. + * FIRST is set to new by the operation. last is unmolested. + */ +int link_machines( first, last ) +int first, last; + + { + if ( first == NIL ) + return ( last ); + + else if ( last == NIL ) + return ( first ); + + else + { + mkxtion( finalst[first], last ); + finalst[first] = finalst[last]; + lastst[first] = max( lastst[first], lastst[last] ); + firstst[first] = min( firstst[first], firstst[last] ); + + return ( first ); + } + } + + +/* mkbranch - make a machine that branches to two machines + * + * synopsis + * + * branch = mkbranch( first, second ); + * + * branch - a machine which matches either first's pattern or second's + * first, second - machines whose patterns are to be or'ed (the | operator) + * + * note that first and second are NEITHER destroyed by the operation. Also, + * the resulting machine CANNOT be used with any other "mk" operation except + * more mkbranch's. Compare with mkor() + */ +int mkbranch( first, second ) +int first, second; + + { + int eps; + + if ( first == NO_TRANSITION ) + return ( second ); + + else if ( second == NO_TRANSITION ) + return ( first ); + + eps = mkstate( SYM_EPSILON ); + + mkxtion( eps, first ); + mkxtion( eps, second ); + + return ( eps ); + } + + +/* mkclos - convert a machine into a closure + * + * synopsis + * new = mkclos( state ); + * + * new - a new state which matches the closure of "state" + */ +int mkclos( state ) +int state; + + { + return ( mkopt( mkposcl( state ) ) ); + } + + +/* mkopt - make a machine optional + * + * synopsis + * + * new = mkopt( mach ); + * + * new - a machine which optionally matches whatever mach matched + * mach - the machine to make optional + * + * notes: + * 1. mach must be the last machine created + * 2. mach is destroyed by the call + */ +int mkopt( mach ) +int mach; + + { + int eps; + + if ( ! SUPER_FREE_EPSILON(finalst[mach]) ) + { + eps = mkstate( SYM_EPSILON ); + mach = link_machines( mach, eps ); + } + + /* can't skimp on the following if FREE_EPSILON(mach) is true because + * some state interior to "mach" might point back to the beginning + * for a closure + */ + eps = mkstate( SYM_EPSILON ); + mach = link_machines( eps, mach ); + + mkxtion( mach, finalst[mach] ); + + return ( mach ); + } + + +/* mkor - make a machine that matches either one of two machines + * + * synopsis + * + * new = mkor( first, second ); + * + * new - a machine which matches either first's pattern or second's + * first, second - machines whose patterns are to be or'ed (the | operator) + * + * note that first and second are both destroyed by the operation + * the code is rather convoluted because an attempt is made to minimize + * the number of epsilon states needed + */ +int mkor( first, second ) +int first, second; + + { + int eps, orend; + + if ( first == NIL ) + return ( second ); + + else if ( second == NIL ) + return ( first ); + + else + { + /* see comment in mkopt() about why we can't use the first state + * of "first" or "second" if they satisfy "FREE_EPSILON" + */ + eps = mkstate( SYM_EPSILON ); + + first = link_machines( eps, first ); + + mkxtion( first, second ); + + if ( SUPER_FREE_EPSILON(finalst[first]) && + accptnum[finalst[first]] == NIL ) + { + orend = finalst[first]; + mkxtion( finalst[second], orend ); + } + + else if ( SUPER_FREE_EPSILON(finalst[second]) && + accptnum[finalst[second]] == NIL ) + { + orend = finalst[second]; + mkxtion( finalst[first], orend ); + } + + else + { + eps = mkstate( SYM_EPSILON ); + + first = link_machines( first, eps ); + orend = finalst[first]; + + mkxtion( finalst[second], orend ); + } + } + + finalst[first] = orend; + return ( first ); + } + + +/* mkposcl - convert a machine into a positive closure + * + * synopsis + * new = mkposcl( state ); + * + * new - a machine matching the positive closure of "state" + */ +int mkposcl( state ) +int state; + + { + int eps; + + if ( SUPER_FREE_EPSILON(finalst[state]) ) + { + mkxtion( finalst[state], state ); + return ( state ); + } + + else + { + eps = mkstate( SYM_EPSILON ); + mkxtion( eps, state ); + return ( link_machines( state, eps ) ); + } + } + + +/* mkrep - make a replicated machine + * + * synopsis + * new = mkrep( mach, lb, ub ); + * + * new - a machine that matches whatever "mach" matched from "lb" + * number of times to "ub" number of times + * + * note + * if "ub" is INFINITY then "new" matches "lb" or more occurances of "mach" + */ +int mkrep( mach, lb, ub ) +int mach, lb, ub; + + { + int base, tail, copy, i; + + base = copysingl( mach, lb - 1 ); + + if ( ub == INFINITY ) + { + copy = dupmachine( mach ); + mach = link_machines( mach, link_machines( base, mkclos( copy ) ) ); + } + + else + { + tail = mkstate( SYM_EPSILON ); + + for ( i = lb; i < ub; ++i ) + { + copy = dupmachine( mach ); + tail = mkopt( link_machines( copy, tail ) ); + } + + mach = link_machines( mach, link_machines( base, tail ) ); + } + + return ( mach ); + } + + +/* mkstate - create a state with a transition on a given symbol + * + * synopsis + * + * state = mkstate( sym ); + * + * state - a new state matching sym + * sym - the symbol the new state is to have an out-transition on + * + * note that this routine makes new states in ascending order through the + * state array (and increments LASTNFA accordingly). The routine DUPMACHINE + * relies on machines being made in ascending order and that they are + * CONTIGUOUS. Change it and you will have to rewrite DUPMACHINE (kludge + * that it admittedly is) + */ +int mkstate( sym ) +int sym; + + { + if ( ++lastnfa >= current_mns ) + { + if ( (current_mns += MNS_INCREMENT) >= MAXIMUM_MNS ) + lerrif( "input rules are too complicated (>= %d NFA states)", + current_mns ); + + ++num_reallocs; + + transchar = reallocate_integer_array( transchar, current_mns ); + trans1 = reallocate_integer_array( trans1, current_mns ); + trans2 = reallocate_integer_array( trans2, current_mns ); + accptnum = reallocate_integer_array( accptnum, current_mns ); + firstst = reallocate_integer_array( firstst, current_mns ); + finalst = reallocate_integer_array( finalst, current_mns ); + lastst = reallocate_integer_array( lastst, current_mns ); + } + + transchar[lastnfa] = sym; + trans1[lastnfa] = NO_TRANSITION; + trans2[lastnfa] = NO_TRANSITION; + accptnum[lastnfa] = NIL; + firstst[lastnfa] = lastnfa; + finalst[lastnfa] = lastnfa; + lastst[lastnfa] = lastnfa; + + /* fix up equivalence classes base on this transition. Note that any + * character which has its own transition gets its own equivalence class. + * Thus only characters which are only in character classes have a chance + * at being in the same equivalence class. E.g. "a|b" puts 'a' and 'b' + * into two different equivalence classes. "[ab]" puts them in the same + * equivalence class (barring other differences elsewhere in the input). + */ + + if ( sym < 0 ) + { + /* we don't have to update the equivalence classes since that was + * already done when the ccl was created for the first time + */ + } + + else if ( sym == SYM_EPSILON ) + ++numeps; + + else + { + if ( useecs ) + mkechar( sym, nextecm, ecgroup ); + } + + return ( lastnfa ); + } + + +/* mkxtion - make a transition from one state to another + * + * synopsis + * + * mkxtion( statefrom, stateto ); + * + * statefrom - the state from which the transition is to be made + * stateto - the state to which the transition is to be made + */ +mkxtion( statefrom, stateto ) +int statefrom, stateto; + + { + if ( trans1[statefrom] == NO_TRANSITION ) + trans1[statefrom] = stateto; + + else if ( (transchar[statefrom] != SYM_EPSILON) || + (trans2[statefrom] != NO_TRANSITION) ) + lexfatal( "found too many transitions in mkxtion()" ); + + else + { /* second out-transition for an epsilon state */ + ++eps2; + trans2[statefrom] = stateto; + } + } |