/** Structures and functions for grouping lexemes into tokens. The tokenizer
* reads through an array of lexemes (generated by the lexer) and groups them
* into tokens based on their structure. In addition, some lexemes with
* semantic meaning (such as integers, floats, strings, and booleans) will have
* their values extracted and stored.
*
* \file tokenizer.h
*
* \author Justin J. Meza
*
* \date 2010 */
#ifndef __TOKENIZER_H__
#define __TOKENIZER_H__
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include "lexer.h"
#undef DEBUG
/** Denotes the type of token present. All of the token type names are
 * self-explanatory and correspond either to the semantic type of the token's
 * data (in the case of TT_INTEGER, TT_FLOAT, TT_STRING, or TT_IDENTIFIER) or
 * to the lexemes which make up the particular token. */
typedef enum {
    TT_INTEGER,
    TT_FLOAT,
    TT_STRING,
    TT_IDENTIFIER,
    TT_BOOLEAN,
    TT_IT,
    TT_NOOB,
    TT_NUMBR,
    TT_NUMBAR,
    TT_TROOF,
    TT_YARN,
    TT_EOF,
    TT_NEWLINE,
    TT_HAI,
    TT_KTHXBYE,
    TT_HASA,
    TT_ITZA,
    TT_ITZ,
    TT_RNOOB,
    TT_R,
    TT_ANYR,
    TT_AN,
    TT_SUMOF,
    TT_DIFFOF,
    TT_PRODUKTOF,
    TT_QUOSHUNTOF,
    TT_MODOF,
    TT_BIGGROF,
    TT_SMALLROF,
    TT_BOTHOF,
    TT_EITHEROF,
    TT_WONOF,
    TT_NOT,
    TT_MKAY,
    TT_ALLOF,
    TT_ANYOF,
    TT_BOTHSAEM,
    TT_DIFFRINT,
    TT_MAEK,
    TT_A,
    TT_ISNOWA,
    TT_VISIBLE,
    TT_SMOOSH,
    TT_BANG,
    TT_GIMMEH,
    TT_ORLY,
    TT_YARLY,
    TT_MEBBE,
    TT_NOWAI,
    TT_OIC,
    TT_WTF,
    TT_OMG,
    TT_OMGWTF,
    TT_GTFO,
    TT_IMINYR,
    TT_UPPIN,
    TT_NERFIN,
    TT_YR,
    TT_TIL,
    TT_WILE,
    TT_IMOUTTAYR,
    TT_HOWDUZ,
    TT_IFUSAYSO,
    TT_FOUNDYR,
    TT_ENDOFTOKENS
} TokenType;
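/** Maps each TokenType to the keyword text that produces it in LOLCODE
 * source. The array is indexed by TokenType, so its order must match the
 * enum above exactly. Entries for value-carrying or structural token types
 * (such as TT_INTEGER, TT_STRING, TT_EOF, and TT_NEWLINE) are left empty
 * because those tokens have no fixed keyword. */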
static const char *keywords[] = {
    "", /* TT_INTEGER */
    "", /* TT_FLOAT */
    "", /* TT_STRING */
    "", /* TT_IDENTIFIER */
    "", /* TT_BOOLEAN */
    "IT", /* TT_IT */
    "NOOB", /* TT_NOOB */
    "NUMBR", /* TT_NUMBR */
    "NUMBAR", /* TT_NUMBAR */
    "TROOF", /* TT_TROOF */
    "YARN", /* TT_YARN */
    "", /* TT_EOF */
    "", /* TT_NEWLINE */
    "HAI", /* TT_HAI */
    "KTHXBYE", /* TT_KTHXBYE */
    "HAS A", /* TT_HASA */
    "ITZ A", /* TT_ITZA */
    "ITZ", /* TT_ITZ */
    "R NOOB", /* TT_RNOOB */
    "R", /* TT_R */
    "AN YR", /* TT_ANYR */
    "AN", /* TT_AN */
    "SUM OF", /* TT_SUMOF */
    "DIFF OF", /* TT_DIFFOF */
    "PRODUKT OF", /* TT_PRODUKTOF */
    "QUOSHUNT OF", /* TT_QUOSHUNTOF */
    "MOD OF", /* TT_MODOF */
    "BIGGR OF", /* TT_BIGGROF */
    "SMALLR OF", /* TT_SMALLROF */
    "BOTH OF", /* TT_BOTHOF */
    "EITHER OF", /* TT_EITHEROF */
    "WON OF", /* TT_WONOF */
    "NOT", /* TT_NOT */
    "MKAY", /* TT_MKAY */
    "ALL OF", /* TT_ALLOF */
    "ANY OF", /* TT_ANYOF */
    "BOTH SAEM", /* TT_BOTHSAEM */
    "DIFFRINT", /* TT_DIFFRINT */
    "MAEK", /* TT_MAEK */
    "A", /* TT_A */
    "IS NOW A", /* TT_ISNOWA */
    "VISIBLE", /* TT_VISIBLE */
    "SMOOSH", /* TT_SMOOSH */
    "!", /* TT_BANG */
    "GIMMEH", /* TT_GIMMEH */
    "O RLY?", /* TT_ORLY */
    "YA RLY", /* TT_YARLY */
    "MEBBE", /* TT_MEBBE */
    "NO WAI", /* TT_NOWAI */
    "OIC", /* TT_OIC */
    "WTF?", /* TT_WTF */
    "OMG", /* TT_OMG */
    "OMGWTF", /* TT_OMGWTF */
    "GTFO", /* TT_GTFO */
    "IM IN YR", /* TT_IMINYR */
    "UPPIN", /* TT_UPPIN */
    "NERFIN", /* TT_NERFIN */
    "YR", /* TT_YR */
    "TIL", /* TT_TIL */
    "WILE", /* TT_WILE */
    "IM OUTTA YR", /* TT_IMOUTTAYR */
    "HOW DUZ", /* TT_HOWDUZ */
    "IF U SAY SO", /* TT_IFUSAYSO */
    "FOUND YR", /* TT_FOUNDYR */
    "", /* TT_ENDOFTOKENS */
};
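/* Several keywords above span multiple space-separated words (for example
 * "SUM OF" or "IM OUTTA YR"), so matching them requires consuming several
 * consecutive lexemes. The helper below is a minimal sketch of that idea,
 * written against a plain array of lexeme images to stay self-contained; it
 * is illustrative only and is not the project's acceptLexemes()
 * implementation, whose exact behavior is not shown in this header.
 *
 *     #include <string.h>
 *
 *     // Returns the number of lexemes consumed if `keyword` matches the
 *     // lexemes starting at `start`, or 0 if it does not match.
 *     static unsigned int matchKeyword(const char **lexemes, unsigned int count,
 *                                      unsigned int start, const char *keyword)
 *     {
 *         unsigned int n = start;
 *         const char *k = keyword;
 *         while (*k) {
 *             size_t len = strcspn(k, " ");       // length of the next keyword word
 *             if (n >= count) return 0;           // ran out of lexemes
 *             if (strlen(lexemes[n]) != len) return 0;
 *             if (strncmp(lexemes[n], k, len) != 0) return 0;
 *             n++;
 *             k += len;
 *             if (*k == ' ') k++;                 // skip the separating space
 *         }
 *         return n - start;
 *     }
 */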
/** Stores the data associated with a Token structure. */
typedef union {
    int i;   /**< Integer data. */
    float f; /**< Floating point data. */
} TokenData;
/** Stores a token and any value parsed by the tokenizer. */
typedef struct {
    TokenType type;    /**< The type of token. */
    TokenData data;    /**< The stored data of type \a type. */
    char *image;       /**< The array of characters from the lexer which correspond to the token. */
    const char *fname; /**< A pointer to the name of the file containing the token. */
    unsigned int line; /**< The line number from the source file that the token occurred on. */
} Token;
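/* An illustrative sketch of building a token by hand with createToken(),
 * declared below. The order of the two string parameters (token image, then
 * source file name) and the fact that the caller fills in the parsed value
 * afterwards are assumptions based on the Token structure above, not
 * guarantees made by this header.
 *
 *     Token *t = createToken(TT_INTEGER, "42", "example.lol", 1);
 *     if (t) {
 *         t->data.i = 42;   // value-carrying tokens store their parsed value here
 *         deleteToken(t);
 *     }
 */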
int isInteger(const char *);     /**< Checks whether a string of characters represents an integer value. */
int isFloat(const char *);       /**< Checks whether a string of characters represents a floating point value. */
int isString(const char *);      /**< Checks whether a string of characters represents a string literal. */
int isIdentifier(const char *);  /**< Checks whether a string of characters represents a valid identifier. */
Token *createToken(TokenType, const char *, const char *, unsigned int);  /**< Creates a token with a given type, image, source file name, and line number. */
void deleteToken(Token *);       /**< Frees the memory associated with a single token. */
Token *addToken(Token ***, unsigned int *, Token *);  /**< Appends a token to a dynamically sized token array, updating its length. */
void deleteTokens(Token **);     /**< Frees an array of tokens. */
unsigned int acceptLexemes(LexemeList *, unsigned int, const char *);  /**< Compares the lexemes starting at a given position against a (possibly multi-word) string of characters. */
Token *isKeyword(LexemeList *, unsigned int *);  /**< Attempts to match a keyword token at the position pointed to by the second argument. */
Token **tokenizeLexemes(LexemeList *);  /**< Converts a list of lexemes into an array of tokens. */
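/* A minimal end-to-end usage sketch (illustrative only): tokenize a
 * LexemeList produced by the lexer and walk the resulting token array. The
 * assumption that the array is terminated by a TT_ENDOFTOKENS token is
 * suggested by the enum above but not guaranteed by this header.
 *
 *     LexemeList *lexemes = ...;               // produced by the lexer (lexer.h)
 *     Token **tokens = tokenizeLexemes(lexemes);
 *     if (tokens) {
 *         unsigned int n;
 *         for (n = 0; tokens[n]->type != TT_ENDOFTOKENS; n++) {
 *             if (tokens[n]->type == TT_INTEGER)
 *                 printf("%s:%u: integer %d\n",
 *                        tokens[n]->fname, tokens[n]->line, tokens[n]->data.i);
 *         }
 *         deleteTokens(tokens);
 *     }
 */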
#endif /* __TOKENIZER_H__ */