/* PURPOSE: * SCANNER.C: Functions that implement a Lexical Scanner. * scanner_init() must be called before using the scanner(). * When done, call scanner_term() to tidy up. * HISTORY: * Ian D. Allen idallen@freenet.carleton.ca */ //FIXME-- add system includes here //FIXME-- add your local includes here /* Initial memory size and increment for buffer used to collect lexemes. */ #define TOKEN_INIT 10 #define TOKEN_INCR 10 /* TYPEDEF: State * PURPOSE: State names for the DFA. Private to the Scanner. * * States less than ST__ACCEPTING_BEGIN are "going" states; not accepting. * States greater than ST__ACCEPTING_BEGIN cause the DFA to stop. * Dummy state ST__ACCEPTING_BEGIN is only used by the ACCEPTING() macro; * this dummy state divides the states into "not accepting" and "accepting". * * Accepting states are numbered above 128 so that the accepting state * numbers look quite different from the other states. The other states * (non-accepting) must be consecutively numbered from zero so that * they may be used directly as indexes into a 2-D DFA Transition Table. * * The ST__END state isn't used; it's just there as a place holder * for silly compilers that don't want trailing commas inside enums. */ typedef enum { /* States < ST__ACCEPTING_BEGIN keep the DFA going. * These states must match EXACTLY with the corresponding * slices of the 2-D DFA Transition Table used by next_state(). */ ST_START=0, /* start */ ST_INID=1, /* inside identifier */ //FIXME-- carefully add your other non-accepting DFA states here /* States > ST__ACCEPTING_BEGIN cause the DFA to accept and stop. * These states don't have to be in any order; but, each * accepting state needs to be handled by the scanner() function. */ #define ACCEPTING(s) ((s) > ST__ACCEPTING_BEGIN) ST__ACCEPTING_BEGIN=128, ST_ERROR, /* a DFA error (unknown transition) */ ST_GOTEOF, /* got EOF */ ST_GOTID, /* got identifier */ //FIXME-- add your new accepting DFA states here, in any order ST__END /* for compiler syntax: dummy entry without comma */ } State; /* DEFINITIONS: static data, global to this scanner source file * PURPOSE: * Here is the static information private to the scanner. * All of this data is initialized by scanner_init() so that * it can be used by scanner() without having any arguments passed. */ static FILE *scanfd = NULL; /* the open input stream */ static char *scanfnam = NULL; /* name of open input stream */ static Buffer *scanbuf = NULL; /* lexeme buffer */ static long linecounter = -999; /* initialize line counter with garbage */ /* FUNCTION: token_name * PURPOSE: * Here are the English names of the TokenType enum, and the function * that turns the enum into the English name. This list must match * exactly the TokenType enum in scanner.h. * * The validity check in the function helps detect cases where we add * to the TokenType and forget to update the tokenName list here as well. */ static char *tokenName[] = { "Unknown", /* each of these must match, in order, a TokenType */ "Error", "EOF", "Junk", "Identifier", "Unsigned Integer", //FIXME-- add the strings that match your new Token types here }; char * token_name( /* RET: English description of token */ TokenType token /* IN: token number from Scanner */ ){ /* run-time validity check on names vs. TokenType enum */ if( NEL(tokenName) != TT_DIMENSION ) eprintf("Number of lines in tokenName %d; expecting %d", NEL(tokenName), TT_DIMENSION); if( token < 0 || token >= NEL(tokenName) ){ eprintf("Unknown token name for token %d; max is %d", token, NEL(tokenName)); token = 0; /* a safe index number */ } return tokenName[token]; } /* FUNCTION: scanner_ungetc * PURPOSE: * Put the character back in the input stream using stdio's ungetc(). * If it was a newline, decrement the line count, because we * will count the newline again next time we read it. * Ignore the character if it is the EOF constant, since * you can't push back an EOF. */ static void scanner_ungetc( int ch, /* IN: character to put back */ FILE *fd /* IN: stream to receive ch */ ){ //FIXME-- your code for this function goes here } /* FUNCTION: scanner_init * PURPOSE: * Initialize the scanner: * Do checking to prevent duplicate or incorrect calls to this function. * Record a copy of the input stream file descriptor and the input stream * file name pointer in the static storage in this file. * Allocate a lexeme buffer for the scanner; the pointer to the * buffer is also recorded in the static storage in this file. * Start counting lines at line one. * Do a dummy call of token_name() to make sure it works. */ void scanner_init( FILE *fd, /* IN: open file descriptor to read */ char *fname /* IN: name of open file */ ){ //FIXME-- your code for this function goes here linecounter = 1; printf( "scanner_init: The Scanner defines %d token types\n", TT_DIMENSION ); printf( "scanner_init: Reading from '%s'.\n", scanfnam); (void) token_name(0); /* test validity of token names */ } /* FUNCTION: scanner_term * PURPOSE: * Terminate the scanner: Undo scanner_init(). * Do checking to prevent duplicate or incorrect calls to this function. * Put NULL into the the static copies of the file descriptor and file name. * Free the scanner lexeme buffer (the one allocated by scanner_init) * and set its static pointer to NULL. * Check to see if the input stream has been read completely (is at EOF); * print a warning if not true. * Put garbage in the line counter. */ void scanner_term(void){ printf( "scanner_term: The Scanner terminated in '%s' at line %ld\n", scanfnam, linecounter ); //FIXME-- your code for this function goes here linecounter = -999; /* set counter back to garbage */ } /* FUNCTION: next_state * PURPOSE: * Given the current state and an input symbol * (which will be a character or EOF) return the next state. * * The Transition Table of the DFA gives the design of this function. * We program the table as a two-dimensional array 'table[][]' here. * * Since we have more character classes than we have states, the * table looks best on paper if rows are classes and columns are states. * * The table is indexed using a character class (row) and * a current state (column). The first part of the next_state function * turns a character into a row index (character class) for the table. * * The order of the columns (current state) must exactly match the * order of non-accepting states in the State enum, since those state * values are used to index the columns of this array directly. * * Make sure that EOF always causes a transition to an accepting * state, or the scanner may loop reading the same EOF over and over. */ static State table[?][?] = { //FIXME-- Your transition table goes here. //FIXME-- Label the rows and columns with comments, so that you //FIXME-- can read the table. }; static State next_state( /* RET: the next DFA state */ State current, /* IN: the current DFA state */ int ch /* IN: the character just read, or EOF */ ){ unsigned int row = 999; /* char class row index into above table */ //FIXME-- the code to turn the incoming character into a row index goes here /* Make sure the indicies into the table are valid, to * catch programming errors that index outside the table. */ if( row >= NEL(table) || current < 0 || current >= NEL(table[0]) ){ eprintf("Error: Invalid table index: row %d current %d\n", row, current); abort(); exit( EXIT_FAILURE ); /*NOTREACHED*/ } return table[row][current]; /* chars are rows; states are columns */ } /* FUNCTION: scanner * PURPOSE: * The Scanner: Implement a Deterministic Finite Automaton. * * Move from one state to another depending on the current state * and the input symbol (a character read from the stream, or EOF). * Save some of the characters read in a lexeme buffer. * Return the token found. * * The stream descriptor and file name from which we are reading * must be set by scanner_init() before scanner() is called. * A buffer for the lexeme must also be initialized there. * * Note that since you can't push back EOF, any reading of EOF * will set the feof() flag on the input fd. If EOF ends a token, * the stream EOF flag will be set but the TT_EOF token itself won't be * returned until the *next* call to the scanner. We check to see * if the stream is already at EOF and don't call getc() again. */ Token scanner( /* RET: the next Token from the input stream */ void ){ State currentstate; /* the Current State */ State nextstate; /* the Next State */ Token token; /* token to be built and returned */ int ch; /* character read from stream, or EOF */ /* Check to make sure scanner_init() was called. * Initialize the state of the DFA. * Initialize our return token to garbage; it will be updated. * Clear the scanner lexeme buffer that will hold the lexeme. */ //FIXME-- your code for the above goes here /* Loop reading characters until we end up in an accepting state. * Since some tokens may end at EOF, the scanner may be called * when the input is technically in an EOF condition. Since we * are not allowed to read a stream after EOF, we simply set ch * to be EOF instead of incorrectly calling getc() after EOF. */ while( ! ACCEPTING(currentstate) ){ if( feof(scanfd) ) ch = EOF; /* can't use getc() after EOF */ else { if( (ch = getc(scanfd)) == '\n' ) ++linecounter; } nextstate = next_state(currentstate,ch); /* Save some of the characters into the lexeme buffer. * The characters saved are selected based on the DFA. * Looking at the DFA, we note which state transitions * require us to save characters, and code the simplest * logic that saves chars only for those state transitions. */ //FIXME-- Your code to save some characters in the lexeme buffer goes here. //FIXME-- Save only the characters that label desired DFA state transitions. currentstate = nextstate; } /* ASCII NUL terminate the lexeme in the lexeme buffer. * Set the token structure to point to the lexeme in the buffer. */ //FIXME-- your code for the above goes here /* Find out in which state (currentstate) we accepted. * If that state requires a push-back, do the push-back by * calling the utility function scanner_ungetc(). * For each accepting state, set the returned token type * according to the lexeme we just recognized. * Print an error message if we end up in an accepting state * that isn't handled here -- this helps catch programming errors. */ switch( currentstate ){ //FIXME-- Your code for the above goes here. //FIXME-- You will have one case for every accepting state, plus a default. default: /* Will never see this unless DFA is programmed incorrectly */ eprintf("Scanner: file '%s' line %ld: Unexpected state %d", scanfnam, linecounter, currentstate); abort(); /*NOTREACHED*/ } /* Make sure we record the line counter *after* doing any * ungetc() push-back that might adjust the counter. */ token.lineno = linecounter; #ifdef SCANNER_DEBUG /* DEBUG: print the token type, lineno, and the lexeme */ eprintf("<%s,%ld,'%s'>", token_name(token.type), token.lineno, token.lexeme); #endif /*SCANNER_DEBUG*/ return token; /* returns entire structure */ }