/*
 * Example program that runs a DFA to skip through a file and find and
 * print either C++ comments (starting with //) or #-comments in a file.
 *
 * Inside comments, any characters are allowed.
 * Outside comments, only alphanumerics and blanks are allowed.
 *
 * Unknown characters are collected by the Scanner
 * and returned as T_JUNK token types.  (One might well simply print and
 * ignore T_JUNK tokens, rather than returning them to the parser.)
 *
 * INDEX to this example file:
 *   1) data structures for the global TokenType common to the
 *      Driver and the Scanner modules
 *   2) the Driver Program: main()
 *   3) the Scanner module:
 *      a) private (static) data structures for the Scanner
 *      b) private (static) Scanner support functions
 *      c) the public Scanner function: lex_gettoken()
 *
 * -IAN! idallen@freenet.carleton.ca   idallen@freenet.carleton.ca
 * June 1997
 */

#include <stdlib.h>
#include <stdio.h>
#include <ctype.h>
#include <string.h>

/*
 * Here are the defined token types for this grammar, followed
 * by an array of character strings that describe each token type.
 * Keep the token types and the array synchronized.
 */
typedef enum {
        T_SLASHCOMMENT=0,       /* found a slash-type comment */
        T_POUNDCOMMENT,         /* found a #-comment */
        T_EOF,                  /* EOF on input stream */
        T_JUNK                  /* unknown lexical input */
} TypeList;

char *lex_descriptions[] = {
        "Slash Comment",        /* T_SLASHCOMMENT=0 */
        "Pound Comment",        /* T_POUNDCOMMENT */
        "EOF Token",            /* T_EOF */
        "Unknown Input"         /* T_JUNK */
};
 
/*
 * This defines the global structure that holds the current token.
 * Each token has a type.  Most tokens have lexeme values
 * associated with the type.  The lexeme values can be different
 * C data types, so we store each value in a union to save space.
 */
typedef struct {
        TypeList type;          /* classification of the lexeme */
        int line_number;        /* unused in this example */
        union {
                char *str;      /* points to a string lexeme */
                long number;    /* unused in this example */
                double real;    /* unused in this example */
        } value;
} TokenType;

/*
 * lex_token is the global current token.
 * The contents are set by the Scanner: lex_gettoken()
 */
TokenType lex_token;            /* GLOBAL */
 
TokenType *lex_gettoken(void);      /* prototype for the Scanner */

main()
{
        TypeList type;          /* token type of current token */

        /* Loop getting tokens from the Scanner until the Scanner
         * says the file is finished.  Print the tokens found.
         */
        while( (type=lex_gettoken()->type) != T_EOF ){
                switch( type ){
                case T_SLASHCOMMENT:
                case T_POUNDCOMMENT:
                case T_JUNK:
                        printf("Found token '%s' with lexeme '%s'\n",
                                lex_descriptions[(int)type],
                                lex_token.value.str);
                        break;
                default:
                        printf("Unknown token type %d ('%s')\n",
                                type, lex_descriptions[(int)type]);
                        break;
                }
        }
        printf("%s found.  Goodbye.\n", lex_descriptions[(int)T_EOF]);

        return(0);
}

/***************************************************************************
 *
 * The whole Scanner / Lexical Analyser follows.
 * Normally this would be in a separate file, to hide the private
 * functions and private data structures.
 */

#define CH_SLASH        '/'     /* slash character */
#define CH_POUND        '#'     /* octothorpe == pound symbol */
#define CH_NEWLINE      '\n'    /* newline */

#define LEX_SIZE 100            /* static buffer size for example */

typedef enum { FALSE=0, TRUE } Boolean;

/* Next is the typedef for the states in the transition table.
 * Note that the order of these states must exactly match the
 * state rows in the table itself, since we index the table using
 * these values.  Accepting states don't need rows in the table.
 *
 * The state transition table is indexed by state on the left and by
 * type of character on the top.  Character class "legal" is defined
 * (for this example) to be alphanumerics and blanks.
 * Note that inside comments, no characters are errors.
 * Note that most any character gets you out of the S_INERR state.
 *
 * lex_column(): A private utility function for the Scanner.
 * It converts a character to a column index for the transition table.
 * Keep this function near the table, so if you change the function
 * you remember to change the table to match.  Note that we test for
 * newline before isspace(), so that a newline gets its own column.
 */
typedef enum {
        /* These first are intermediate states */
        S_START=0,      /* start state */
        S_INSL1,        /* found a leading / */
        S_INSL2,        /* found a second / */
        S_INPOU,        /* found a leading # */
        S_INERR,        /* start of invalid lexical input */
        /* These next are Accepting states */
        S_ENDSL,        /* end of // comment */
        S_ENDPOU,       /* end of # comment */
        S_ENDERR,       /* end of invalid lexical input */
} State;

static State lex_table[][5] = {
/*                '/',      '#',     '\n',     legal,    other*/
/*S_START*/ { S_INSL1,  S_INPOU,  S_START,   S_START,  S_INERR },
/*S_INSL1*/ { S_INSL2,  S_INPOU,  S_START,   S_START,  S_INERR },
/*S_INSL2*/ { S_INSL2,  S_INSL2,  S_ENDSL,   S_INSL2,  S_INSL2 },
/*S_INPOU*/ { S_INPOU,  S_INPOU,  S_ENDPOU,  S_INPOU,  S_INPOU },
/*S_INERR*/ { S_ENDERR, S_ENDERR, S_ENDERR,  S_ENDERR, S_INERR },
};

        static int
lex_column( int ch )
{
        switch( ch ){
        case CH_SLASH:   return 0;              /* column 0 */
        case CH_POUND:   return 1;              /* column 1 */
        case CH_NEWLINE: return 2;              /* column 2 */
        default:; /*see below*/
        }
        if( isascii(ch) && (isalnum(ch)||isspace(ch)) )
                return 3;                       /* "legal" char */
        return 4;                               /* "other" char */
}

/* NextState(): A private utility function for the Scanner.
 * This function uses a transition table to find the next state.
 * The current character is turned into a table column index first.
 */
        static State
NextState( State state, int ch )
{
        return lex_table[state][ lex_column(ch) ];
}

/* makeprintable(): A private utility function for the Scanner.
 * Return a string containing the printable character, or
 * else the HEX value of the unprintable character.
 * NOTE: Static storage is in use here; be careful how you use this.
 */
        static char *
makeprintable( int ch )
{
        static char buf[10];    /* to hold the sprintf output */

        sprintf(buf, (isascii(ch)&&isprint(ch))
                ? "%c"
                : "<%02x>",
                ch);
        return buf;
}

/* lex_gettoken(): The actual public Scanner function.
 * Call this repeatedly from a driver program until it returns T_EOF.
 * This example doesn't use dynamic memory or check for buffer overflow.
 * A real Scanner would do one, the other, or both.
 */
        /*public*/ TokenType *
lex_gettoken()
{
        State current;                  /* current state */
        State next;                     /* next state */
        int ch;                         /* current input char */
        Boolean done;                   /* to detect accept states */
        static char lexbuf[LEX_SIZE];   /* return lexeme; must be static! */

        /* See if we've hit EOF already.  Return the T_EOF token if so.
         * (You aren't allowed to read files after hitting EOF.)
         */
        if( feof(stdin) ){
                lex_token.type = T_EOF;
                return &lex_token;
        }

        /* We are not yet at EOF.
         * Loop getting characters and run the DFA on them.
         * Note how input is only read in *one* place.
         * The DFA handles all the state transitions.
         */
        current = S_START;              /* start at the START state */
        done = FALSE;                   /* haven't yet found an accept state */
        lexbuf[0] = '\0';               /* start with empty lexeme buffer */

        while( done == FALSE && (ch=fgetc(stdin)) != EOF ){

                next = NextState(current,ch);

                /* Start of ACTIONS section.
                 * We put semantic actions here that do things based
                 * on where we are in the DFA.  We can know where we are
                 * by examining "current", "next", and "ch".
                 */

                switch( next ){
                case S_INSL2:
                case S_INPOU:
                        /* Save all the comment characters */
                        /* BAD CODE: NO CHECK FOR OVERFLOW HERE! */
                        if( current == next )
                                strcat(lexbuf,makeprintable(ch));
                        break;
                case S_INERR:
                        /* Collect all invalid lexical input too */
                        /* BAD CODE: NO CHECK FOR OVERFLOW HERE! */
                        strcat(lexbuf,makeprintable(ch));
                        break;
                default:; /*do nothing*/
                }

                /* We are done with all the semantic actions.
                 * We quit the loop by setting done=TRUE
                 * when the current state becomes an accepting state:
                 */
                current = next;

                switch( current ){
                case S_ENDSL:
                case S_ENDPOU:
                        done = TRUE;
                        break;
                case S_ENDERR:
                        /* the DFA requires a "put it back" action here */
                        ungetc(ch,stdin);
                        done = TRUE;
                        break;
                default:; /*do nothing*/
                }
        }

        /*
         * The DFA loop ended.
         * Decide what to do depending on what state we are in.
         * If we are in an accepting state, simply return what we found.
         * We could be in *any* state when we hit EOF; handle them all:
         */
        switch( current ){
        case S_INSL2:           /* EOF found inside slash comment */
                /* BAD CODE: NO CHECK FOR OVERFLOW HERE! */
                strcat(lexbuf,"<EOF>");
                /*FALLTHROUGH*/
        case S_ENDSL:           /* an accepting state */
                lex_token.type = T_SLASHCOMMENT;
                lex_token.value.str = lexbuf;
                break;
        case S_INPOU:           /* EOF found inside pound comment */
                /* BAD CODE: NO CHECK FOR OVERFLOW HERE! */
                strcat(lexbuf,"<EOF>");
                /*FALLTHROUGH*/
        case S_ENDPOU:          /* an accepting state */
                lex_token.type = T_POUNDCOMMENT;
                lex_token.value.str = lexbuf;
                break;
        case S_INERR:           /* EOF found collecting unknown characters */
                /* BAD CODE: NO CHECK FOR OVERFLOW HERE! */
                strcat(lexbuf,"<EOF>");
                /*FALLTHROUGH*/
        case S_ENDERR:          /* an accepting state */
                lex_token.type = T_JUNK;
                lex_token.value.str = lexbuf;
                break;
        default:                /* any other state */
                lex_token.type = T_EOF;
                break;
        }

        return &lex_token;
}