/* PURPOSE: * SCANNER.C: Functions that implement a Lexical Scanner. * scanner_init() must be called before using the scanner(). * When done, call scanner_term() to tidy up. * HISTORY: * Ian D. Allen idallen@freenet.carleton.ca */ //FIXME-- add system includes here //FIXME-- add your local includes here /* Initial character memory size and increment used to collect tokens. */ #define TOKEN_INIT 10 #define TOKEN_INCR 10 /* State names for the DFA. * * States less than ST__ACCEPTING_BEGIN are "going" states; not accepting. * States greater than ST__ACCEPTING_BEGIN cause the DFA to stop. * Dummy state ST__ACCEPTING_BEGIN is only used by the ACCEPTING() macro; * this dummy state divides the states into "not accepting" and "accepting". * * Accepting states are numbered above 128 so that the accepting state * numbers look quite different from the other states. The other states * (non-accepting) must be consecutively numbered from zero so that * they may be used directly as indexes into a 2-D DFA Transition Table. * * The ST__END state isn't used; it's just there as a place holder * for silly compilers that don't want trailing commas inside enums. */ typedef enum { /* States < ST__ACCEPTING_BEGIN keep the DFA going. * These states must match EXACTLY with the corresponding * slices of the 2-D DFA Transition Table used by next_state(). */ ST_START=0, /* start */ ST_INID=1, /* inside identifier */ //FIXME-- carefully add your other non-accepting DFA states here /* States > ST__ACCEPTING_BEGIN cause the DFA to accept and stop. * These states don't have to be in any order; but, each * accepting state needs to be handled by the scanner() function. */ #define ACCEPTING(s) ((s) > ST__ACCEPTING_BEGIN) ST__ACCEPTING_BEGIN=128, ST_ERROR, /* a DFA error (unknown transition) */ ST_GOTEOF, /* got EOF */ ST_GOTID, /* got identifier */ //FIXME-- add your new accepting DFA states here, in any order ST__END /* for compiler syntax: dummy entry without comma */ } State; /* Here is the static information private to the scanner. * All of this data is initialized by scanner_init() so that * it can be used by scanner() without having any arguments passed. */ //FIXME-- put the rest of the static data initialized by scanner_init here static long linecounter = -999; /* initialize counter with garbage */ /* The English names of the TokenType enum, and the function * that turns the enum into the English name. This list must match * exactly the TokenType enum in scanner.h. * The validity check in the function helps detect cases where we add * to the TokenType and forget to update the tokenName list here as well. */ static char *tokenName[] = { "Unknown", /* each of these must match, in order, a TokenType */ "Error", "EOF", "Junk", "Identifier", "Unsigned Integer", //FIXME-- add the strings that match your new Token types here }; char * token_name( /* RET: English description of token */ TokenType token /* IN: token number from Scanner */ ){ /* run-time validity check on names vs. TokenType enum */ if( NEL(tokenName) != TT_DIMENSION ) eprintf("Number of lines in tokenName %d; expecting %d", NEL(tokenName), TT_DIMENSION); if( token < 0 || token >= NEL(tokenName) ){ eprintf("Unknown token name for token %d; max is %d", token, NEL(tokenName)); token = 0; /* a safe index number */ } return tokenName[token]; } /* Put the character back in the input stream using stdio's ungetc(). * If it was a newline, decrement the line count, because we * will count the newline again next time we read it. */ static void scanner_ungetc( int ch, /* IN: character to put back */ FILE *fd /* IN: stream to receive ch */ ){ //FIXME-- your code for this function goes here } /* Initialize the scanner: * Do some checking to prevent duplicate or incorrect calls to this function. * Remember the input stream fd and input stream file name. * Allocate a lexeme buffer for the scanner. * Start counting lines at line one. * Do a dummy call of token_name() to make sure it works. */ void scanner_init( FILE *fd, /* IN: open file descriptor to read */ char *fname /* IN: name of open file */ ){ //FIXME-- your code for this function goes here linecounter = 1; printf( "scanner_init: The Scanner defines %d token types\n", TT_DIMENSION ); printf( "scanner_init: Reading from '%s'.\n", scanfnam); (void) token_name(0); /* test validity of token names */ } /* Terminate the scanner: Undo scanner_init() * Do some checking to prevent duplicate or incorrect calls to this function. * NULL out the file fd and file name. * Free the scanner lexeme buffer (the one allocated by scanner_init). * Check to see if the input stream has been read completely (is at EOF); * print a warning if not true. * Put garbage in the line counter. */ void scanner_term(void){ printf( "scanner_term: The Scanner terminated in '%s' at line %ld\n", scanfnam, linecounter ); //FIXME-- your code for this function goes here linecounter = -999; /* set counter back to garbage */ } /* next_state: Given the current state and an input symbol * (which will be a character or EOF) return the next state. * The Transition Table of the DFA gives the design of this function. * We program the table as a two-dimensional array 'table[][]' here. * * Since we have more character classes than we have states, the * table looks best on paper if rows are classes and columns are states. * * NOTE: The table is indexed using a character class (row) and * a current state (column). The first part of the next_state function * turns a character into a row index (character class) for the table. * * NOTE: The order of the columns (current state) must exactly match the * order of non-accepting states in the State enum, since those state * values are used to index the columns of this array directly. * * Make sure that EOF always causes a transition to an accepting * state, or the scanner may loop reading the same EOF over and over. */ static State table[13][4] = { //FIXME-- your transition table goes here }; static State next_state( /* RET: the next DFA state */ State current, /* IN: the current DFA state */ int ch /* IN: the character just read, or EOF */ ){ unsigned int row = 999; /* char class row index into above table */ //FIXME-- the code to turn the incoming character into a row index goes here /* Make sure the indicies into the table are valid, to * catch programming errors. */ if( row >= NEL(table) || current < 0 || current >= NEL(table[0]) ){ eprintf("Error: Invalid table index: row %d current %d\n", row, current); abort(); exit( EXIT_FAILURE ); } return table[row][current]; } /* The Scanner: * Implement a Deterministic Finite Automaton. * Move from one state to another depending on the current state * and the input symbol (a character read from the stream, or EOF). * * The stream descriptor and file name from which we are reading * must be set by scanner_init() before we are called. A lexeme * buffer must also be initialized there. * * Note that since you can't push back EOF, any reading of EOF * will set the feof() flag on the input fd. If EOF ends a token, * the stream EOF flag will be set but the TT_EOF token itself won't be * returned until the *next* call to the scanner. We check to see * if the stream is already at EOF and don't call getc() again. */ Token scanner( /* RET: the next Token from the input stream */ void ){ State currentstate; /* the Current State */ State nextstate; /* the Next State */ Token token; /* token to be built and returned */ int ch; /* character read from stream, or EOF */ /* Check to make sure scanner_init() was called. * Initialize the state of the DFA. * Initialize our return token to garbage; it will be updated. * Clear the scanner lexeme buffer that will hold the lexeme. */ //FIXME-- your code for the above goes here /* Loop reading characters until we end up in an accepting state. * Since some tokens may end at EOF, the scanner may be called * when the input is techically in an EOF condition. Since we * are not allowed to read a stream after EOF, we simply set ch * to be EOF instead of incorrectly calling getc() after EOF. */ while( ! ACCEPTING(currentstate) ){ if( feof(scanfd) ) ch = EOF; /* can't use getc() after EOF */ else { if( (ch = getc(scanfd)) == '\n' ) ++linecounter; } nextstate = next_state(currentstate,ch); /* Save characters into the lexeme buffer. */ //FIXME-- your code to save characters in the lexeme buffer goes here currentstate = nextstate; } /* NUL terminate the lexeme in the lexeme buffer. * Set the token structure to point to the lexeme in the buffer. */ //FIXME-- your code for the above goes here /* Find out in which state we accepted. * If that state requires a push-back, do the push-back by * calling the utility function scanner_ungetc(). * Set the token type according to the lexeme we recognized. */ //FIXME-- your code for the above goes here /* Make sure we record the line counter *after* doing any * ungetc() push-back that might adjust the counter. */ token.lineno = linecounter; #ifdef SCANNER_DEBUG printf("<%s>", token_name(token.type)); /* print the token types */ #endif /*SCANNER_DEBUG*/ return token; /* returns entire structure */ }