/* PURPOSE: * SCANNER.C: Functions that implement a Lexical Scanner. * scanner_init() must be called before using the scanner(). * When done, call scanner_term() to tidy up. * HISTORY: * Ian D. Allen idallen@freenet.carleton.ca */ //FIXME-- add system includes here //FIXME-- add your local includes here /* Initial character memory size and increment used to collect tokens. */ #define TOKEN_INIT 80 #define TOKEN_INCR 20 /* State names for the DFA. * * States less than ST__ACCEPTING_BEGIN are "going" states; not accepting. * States greater than ST__ACCEPTING_BEGIN cause the DFA to stop. * Dummy state ST__ACCEPTING_BEGIN is only used by the ACCEPTING() macro; * this dummy state divides the states into "not accepting" and "accepting". * * Accepting states are numbered above 128 so that the accepting state * numbers look quite different from the other states. The other states * (non-accepting) must be consecutively numbered from zero so that * they may be used as row indexes into a DFA Transition Table. * * The ST__END state isn't used; it's just there as a place holder * for silly compilers that don't want trailing commas inside enums. */ typedef enum { /* States < ST__ACCEPTING_BEGIN keep the DFA going */ ST_START=0, /* start */ ST_INID, /* inside identifier */ //FIXME-- add your other non-accepting DFA states here /* States > ST__ACCEPTING_BEGIN cause the DFA to accept and stop */ #define ACCEPTING(s) ((s) > ST__ACCEPTING_BEGIN) ST__ACCEPTING_BEGIN=128, ST_ERROR, /* a DFA error (unknown transition) */ ST_GOTEOF, /* got EOF */ ST_GOTID, /* got identifier */ //FIXME-- add your other accepting DFA states here ST__END /* compiler syntax: dummy entry without comma */ } State; /* Here is the static information private to the scanner. * All of this data is initialized by scanner_init() so that * it can be used by scanner() without having any arguments passed. */ //FIXME-- put the rest of the static data initialized by scanner_init here static long linecounter = -999; /* initialize counter with garbage */ /* The English names of the TokenType enum, and the function * that turns the enum into the English name. The validity check * helps detect cases where we add to the TokenType and forget to * update the tokenName list here as well. */ static char *tokenName[] = { "Unknown", /* each of these must match, in order, a TokenType */ "Error", "EOF", "Junk", "Identifier", "UnsignedInt" }; char * token_name( /* RET: English description of token */ TokenType token /* IN: token number from Scanner */ ){ /* validity check on names vs. TokenType enum */ if( NEL(tokenName) != TT_DIMENSION ) eprintf("Number of lines in tokenName %d; expecting %d", NEL(tokenName), TT_DIMENSION); if( token < 0 || token >= NEL(tokenName) ) eprintf("Unknown token name for token %d; max is %d", token, NEL(tokenName)); return tokenName[token]; } /* Put the character back in the input stream. * If it was a newline, decrement the line count, because we * will count the newline again next time we read it. */ static void scanner_ungetc( int ch, /* IN: character to put back */ FILE *fd /* IN: stream to receive ch */ ){ //FIXME-- your code for this function goes here } /* Initialize the scanner: * Do some checking to prevent duplicate or incorrect calls to this function. * Remember the input stream fd and input stream file name. * Allocate a lexeme buffer for the scanner. * Start counting lines at line one. * Do a dummy call of token_name() to make sure it works. */ void scanner_init( FILE *fd, /* IN: open file descriptor to read */ char *fname /* IN: name of open file */ ){ //FIXME-- your code for this function goes here linecounter = 1; printf( "INIT: The Scanner defines %d token types\n", TT_DIMENSION ); (void) token_name(0); /* test validity of token names */ } /* Terminate the scanner: Undo scanner_init() * Do some checking to prevent duplicate or incorrect calls to this function. * NULL out the file fd and file name. * Free the scanner lexeme buffer (the one allocated by scanner_init). * Check to see if the input stream has been read completely (is at EOF); * print a warning if not true. * Put garbage in the line counter. */ void scanner_term(void){ //FIXME-- your code for this function goes here printf( "TERM: The Scanner terminated at line %ld\n", linecounter ); linecounter = -999; /* set counter back to garbage */ } /* next_state: Given the current state and an input symbol * (which will be a character or EOF) return the next state. * The Transition Table of the DFA gives the design of this function. * * Make sure that EOF always causes a transition to an accepting * state, or the scanner will loop reading the same EOF over and over. */ static State next_state( /* RET: the next DFA state */ State current, /* IN: the current DFA state */ int ch /* IN: the character just read, or EOF */ ){ //FIXME-- your code for this function goes here //FIXME-- typical implementations use a two-dimensional array of State's } /* The Scanner: * Implement a Deterministic Finite Automaton. * Move from one state to another depending on the current state * and the input symbol (a character read from the stream, or EOF). * * The stream descriptor and file name from which we are reading * must be set by scanner_init() before we are called. A lexeme * buffer must also be initialized there. * * Note that since you can't push back EOF, any reading of EOF * will set the feof() flag on the input fd. If EOF ends a token, * the stream EOF flag will be set but the TT_EOF token itself won't be * returned until the *next* call to the scanner. We check to see * if the stream is already at EOF and don't call getc() again. */ Token scanner( /* RET: the next Token from the input stream */ void ){ State currentstate; /* the Current State */ State nextstate; /* the Next State */ Token token; /* token to be built and returned */ int ch; /* character read from stream, or EOF */ /* Check to make sure scanner_init() was called. * Initialize the state of the DFA. * Initialize our return token to garbage; it will be updated. * Clear the scanner lexeme buffer that will hold the lexeme. */ //FIXME-- your code for the above goes here /* Loop reading characters until we end up in an accepting state. * Since some tokens may end at EOF, the scanner may be called * when the input is techically in an EOF condition. Since we * are not allowed to read a stream after EOF, we simply set ch * to be EOF instead of calling getc(). */ while( ! ACCEPTING(currentstate) ){ if( feof(scanfd) ) ch = EOF; /* can't use getc() after EOF */ else { if( (ch = getc(scanfd)) == '\n' ) ++linecounter; } nextstate = next_state(currentstate,ch); /* Save characters into the lexeme buffer. */ //FIXME-- your code to save characters in the lexeme buffer goes here currentstate = nextstate; } /* NUL terminate the lexeme in the lexeme buffer. * Set the token structure to point to the lexeme in the buffer. */ //FIXME-- your code for the above goes here /* Find out in which state we accepted. * If that state requires a push-back, do the push-back. * Set the token type according to the lexeme we recognized. */ //FIXME-- your code for the above goes here /* Make sure we record the line counter *after* doing any * ungetc() push-back that might adjust the counter. */ token.lineno = linecounter; return token; /* returns entire structure */ }