/* * Example program that runs a DFA to skip through a file and find and * print either C++ comments (starting with //) or #-comments in a file. * * Inside comments, any characters are allowed. * Outside comments, only alphanumerics and blanks are allowed. * * Unknown characters are collected by the Scanner * and returned as T_JUNK token types. (One might well simply print and * ignore T_JUNK tokens, rather than returning them to the parser.) * * INDEX to this example file: * 1) data structures for the global TokenType common to the * Driver and the Scanner modules * 2) the Driver Program: main() * 3) the Scanner module: * a) private (static) data structures for the Scanner * b) private (static) Scanner support functions * c) the public Scanner function: lex_gettoken() * * -IAN! idallen@freenet.carleton.ca idallen@freenet.carleton.ca * June 1997 */ #include #include #include #include /* * Here are the defined token types for this grammar, followed * by an array of character strings that describe each token type. * Keep the token types and the array synchronized. */ typedef enum { T_SLASHCOMMENT=0, /* found a slash-type comment */ T_POUNDCOMMENT, /* found a #-comment */ T_EOF, /* EOF on input stream */ T_JUNK /* unknown lexical input */ } TypeList; char *lex_descriptions[] = { "Slash Comment", /* T_SLASHCOMMENT=0 */ "Pound Comment", /* T_POUNDCOMMENT */ "EOF Token", /* T_EOF */ "Unknown Input" /* T_JUNK */ }; /* * This defines the global structure that holds the current token. * Each token has a type. Most tokens have lexeme values * associated with the type. The lexeme values can be different * C data types, so we store each value in a union to save space. */ typedef struct { TypeList type; /* classification of the lexeme */ int line_number; /* unused in this example */ union { char *str; /* points to a string lexeme */ long number; /* unused in this example */ double real; /* unused in this example */ } value; } TokenType; /* * lex_token is the global current token. * The contents are set by the Scanner: lex_gettoken() */ TokenType lex_token; /* GLOBAL */ TokenType *lex_gettoken(void); /* prototype for the Scanner */ main() { TypeList type; /* token type of current token */ /* Loop getting tokens from the Scanner until the Scanner * says the file is finished. Print the tokens found. */ while( (type=lex_gettoken()->type) != T_EOF ){ switch( type ){ case T_SLASHCOMMENT: case T_POUNDCOMMENT: case T_JUNK: printf("Found token '%s' with lexeme '%s'\n", lex_descriptions[(int)type], lex_token.value.str); break; default: printf("Unknown token type %d ('%s')\n", type, lex_descriptions[(int)type]); break; } } printf("%s found. Goodbye.\n", lex_descriptions[(int)T_EOF]); return(0); } /*************************************************************************** * * The whole Scanner / Lexical Analyser follows. * Normally this would be in a separate file, to hide the private * functions and private data structures. */ #define CH_SLASH '/' /* slash character */ #define CH_POUND '#' /* octothorpe == pound symbol */ #define CH_NEWLINE '\n' /* newline */ #define LEX_SIZE 100 /* static buffer size for example */ typedef enum { FALSE=0, TRUE } Boolean; /* Next is the typedef for the states in the transition table. * Note that the order of these states must exactly match the * state rows in the table itself, since we index the table using * these values. Accepting states don't need rows in the table. * * The state transition table is indexed by state on the left and by * type of character on the top. Character class "legal" is defined * (for this example) to be alphanumerics and blanks. * Note that inside comments, no characters are errors. * Note that most any character gets you out of the S_INERR state. * * lex_column(): A private utility function for the Scanner. * It converts a character to a column index for the transition table. * Keep this function near the table, so if you change the function * you remember to change the table to match. Note that we test for * newline before isspace(), so that a newline gets its own column. */ typedef enum { /* These first are intermediate states */ S_START=0, /* start state */ S_INSL1, /* found a leading / */ S_INSL2, /* found a second / */ S_INPOU, /* found a leading # */ S_INERR, /* start of invalid lexical input */ /* These next are Accepting states */ S_ENDSL, /* end of // comment */ S_ENDPOU, /* end of # comment */ S_ENDERR, /* end of invalid lexical input */ } State; static State lex_table[][5] = { /* '/', '#', '\n', legal, other*/ /*S_START*/ { S_INSL1, S_INPOU, S_START, S_START, S_INERR }, /*S_INSL1*/ { S_INSL2, S_INPOU, S_START, S_START, S_INERR }, /*S_INSL2*/ { S_INSL2, S_INSL2, S_ENDSL, S_INSL2, S_INSL2 }, /*S_INPOU*/ { S_INPOU, S_INPOU, S_ENDPOU, S_INPOU, S_INPOU }, /*S_INERR*/ { S_ENDERR, S_ENDERR, S_ENDERR, S_ENDERR, S_INERR }, }; static int lex_column( int ch ) { switch( ch ){ case CH_SLASH: return 0; /* column 0 */ case CH_POUND: return 1; /* column 1 */ case CH_NEWLINE: return 2; /* column 2 */ default:; /*see below*/ } if( isascii(ch) && (isalnum(ch)||isspace(ch)) ) return 3; /* "legal" char */ return 4; /* "other" char */ } /* NextState(): A private utility function for the Scanner. * This function uses a transition table to find the next state. * The current character is turned into a table column index first. */ static State NextState( State state, int ch ) { return lex_table[state][ lex_column(ch) ]; } /* makeprintable(): A private utility function for the Scanner. * Return a string containing the printable character, or * else the HEX value of the unprintable character. * NOTE: Static storage is in use here; be careful how you use this. */ static char * makeprintable( int ch ) { static char buf[10]; /* to hold the sprintf output */ sprintf(buf, (isascii(ch)&&isprint(ch)) ? "%c" : "<%02x>", ch); return buf; } /* lex_gettoken(): The actual public Scanner function. * Call this repeatedly from a driver program until it returns T_EOF. * This example doesn't use dynamic memory or check for buffer overflow. * A real Scanner would do one, the other, or both. */ /*public*/ TokenType * lex_gettoken() { State current; /* current state */ State next; /* next state */ int ch; /* current input char */ Boolean done; /* to detect accept states */ static char lexbuf[LEX_SIZE]; /* return lexeme; must be static! */ /* See if we've hit EOF already. Return the T_EOF token if so. * (You aren't allowed to read files after hitting EOF.) */ if( feof(stdin) ){ lex_token.type = T_EOF; return &lex_token; } /* We are not yet at EOF. * Loop getting characters and run the DFA on them. * Note how input is only read in *one* place. * The DFA handles all the state transitions. */ current = S_START; /* start at the START state */ done = FALSE; /* haven't yet found an accept state */ lexbuf[0] = '\0'; /* start with empty lexeme buffer */ while( done == FALSE && (ch=fgetc(stdin)) != EOF ){ next = NextState(current,ch); /* Start of ACTIONS section. * We put semantic actions here that do things based * on where we are in the DFA. We can know where we are * by examining "current", "next", and "ch". */ switch( next ){ case S_INSL2: case S_INPOU: /* Save all the comment characters */ /* BAD CODE: NO CHECK FOR OVERFLOW HERE! */ if( current == next ) strcat(lexbuf,makeprintable(ch)); break; case S_INERR: /* Collect all invalid lexical input too */ /* BAD CODE: NO CHECK FOR OVERFLOW HERE! */ strcat(lexbuf,makeprintable(ch)); break; default:; /*do nothing*/ } /* We are done with all the semantic actions. * We quit the loop by setting done=TRUE * when the current state becomes an accepting state: */ current = next; switch( current ){ case S_ENDSL: case S_ENDPOU: done = TRUE; break; case S_ENDERR: /* the DFA requires a "put it back" action here */ ungetc(ch,stdin); done = TRUE; break; default:; /*do nothing*/ } } /* * The DFA loop ended. * Decide what to do depending on what state we are in. * If we are in an accepting state, simply return what we found. * We could be in *any* state when we hit EOF; handle them all: */ switch( current ){ case S_INSL2: /* EOF found inside slash comment */ /* BAD CODE: NO CHECK FOR OVERFLOW HERE! */ strcat(lexbuf,""); /*FALLTHROUGH*/ case S_ENDSL: /* an accepting state */ lex_token.type = T_SLASHCOMMENT; lex_token.value.str = lexbuf; break; case S_INPOU: /* EOF found inside pound comment */ /* BAD CODE: NO CHECK FOR OVERFLOW HERE! */ strcat(lexbuf,""); /*FALLTHROUGH*/ case S_ENDPOU: /* an accepting state */ lex_token.type = T_POUNDCOMMENT; lex_token.value.str = lexbuf; break; case S_INERR: /* EOF found collecting unknown characters */ /* BAD CODE: NO CHECK FOR OVERFLOW HERE! */ strcat(lexbuf,""); /*FALLTHROUGH*/ case S_ENDERR: /* an accepting state */ lex_token.type = T_JUNK; lex_token.value.str = lexbuf; break; default: /* any other state */ lex_token.type = T_EOF; break; } return &lex_token; }