/** Structures and functions for grouping lexemes into tokens. The tokenizer
* reads through an array of lexemes (generated by the lexer) and groups them
* into tokens based on their structure. In addition, some lexemes with
* semantic meaning (such as integers, floats, strings, and booleans) will have
* their values extracted and stored.
*
* \file tokenizer.h
*
* \author Justin J. Meza
*
* \date 2010 */
#ifndef __TOKENIZER_H__
#define __TOKENIZER_H__
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include "lexer.h"
#undef DEBUG
/** Denotes the type of token present. All of the token type names are
 * self-explanatory and correspond either to the semantic type of the token's
 * data (in the case of TT_INTEGER, TT_FLOAT, TT_STRING, or TT_IDENTIFIER) or
 * to the lexemes which make up the particular token. */
typedef enum {
    TT_INTEGER,
    TT_FLOAT,
    TT_STRING,
    TT_IDENTIFIER,
    TT_BOOLEAN,
    TT_IT,
    TT_NOOB,
    TT_NUMBR,
    TT_NUMBAR,
    TT_TROOF,
    TT_YARN,
    TT_EOF,
    TT_NEWLINE,
    TT_HAI,
    TT_KTHXBYE,
    TT_HASA,
    TT_ITZA,
    TT_ITZ,
    TT_RNOOB,
    TT_R,
    TT_ANYR,
    TT_AN,
    TT_SUMOF,
    TT_DIFFOF,
    TT_PRODUKTOF,
    TT_QUOSHUNTOF,
    TT_MODOF,
    TT_BIGGROF,
    TT_SMALLROF,
    TT_BOTHOF,
    TT_EITHEROF,
    TT_WONOF,
    TT_NOT,
    TT_MKAY,
    TT_ALLOF,
    TT_ANYOF,
    TT_BOTHSAEM,
    TT_DIFFRINT,
    TT_MAEK,
    TT_A,
    TT_ISNOWA,
    TT_VISIBLE,
    TT_SMOOSH,
    TT_BANG,
    TT_GIMMEH,
    TT_ORLY,
    TT_YARLY,
    TT_MEBBE,
    TT_NOWAI,
    TT_OIC,
    TT_WTF,
    TT_OMG,
    TT_OMGWTF,
    TT_GTFO,
    TT_IMINYR,
    TT_UPPIN,
    TT_NERFIN,
    TT_YR,
    TT_TIL,
    TT_WILE,
    TT_IMOUTTAYR,
    TT_HOWDUZ,
    TT_IFUSAYSO,
    TT_FOUNDYR,
    TT_ENDOFTOKENS
} TokenType;
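/** Maps each TokenType to the keyword text that produces it in LOLCODE
 * source. The array is indexed by TokenType, so its order must match the
 * enum above exactly. Entries for value-carrying or structural token types
 * (such as TT_INTEGER, TT_STRING, TT_EOF, and TT_NEWLINE) are left empty
 * because those tokens have no fixed keyword. */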
static const char *keywords[] = {
    "", /* TT_INTEGER */
    "", /* TT_FLOAT */
    "", /* TT_STRING */
    "", /* TT_IDENTIFIER */
    "", /* TT_BOOLEAN */
    "IT", /* TT_IT */
    "NOOB", /* TT_NOOB */
    "NUMBR", /* TT_NUMBR */
    "NUMBAR", /* TT_NUMBAR */
    "TROOF", /* TT_TROOF */
    "YARN", /* TT_YARN */
    "", /* TT_EOF */
    "", /* TT_NEWLINE */
    "HAI", /* TT_HAI */
    "KTHXBYE", /* TT_KTHXBYE */
    "HAS A", /* TT_HASA */
    "ITZ A", /* TT_ITZA */
    "ITZ", /* TT_ITZ */
    "R NOOB", /* TT_RNOOB */
    "R", /* TT_R */
    "AN YR", /* TT_ANYR */
    "AN", /* TT_AN */
    "SUM OF", /* TT_SUMOF */
    "DIFF OF", /* TT_DIFFOF */
    "PRODUKT OF", /* TT_PRODUKTOF */
    "QUOSHUNT OF", /* TT_QUOSHUNTOF */
    "MOD OF", /* TT_MODOF */
    "BIGGR OF", /* TT_BIGGROF */
    "SMALLR OF", /* TT_SMALLROF */
    "BOTH OF", /* TT_BOTHOF */
    "EITHER OF", /* TT_EITHEROF */
    "WON OF", /* TT_WONOF */
    "NOT", /* TT_NOT */
    "MKAY", /* TT_MKAY */
    "ALL OF", /* TT_ALLOF */
    "ANY OF", /* TT_ANYOF */
    "BOTH SAEM", /* TT_BOTHSAEM */
    "DIFFRINT", /* TT_DIFFRINT */
    "MAEK", /* TT_MAEK */
    "A", /* TT_A */
    "IS NOW A", /* TT_ISNOWA */
    "VISIBLE", /* TT_VISIBLE */
    "SMOOSH", /* TT_SMOOSH */
    "!", /* TT_BANG */
    "GIMMEH", /* TT_GIMMEH */
    "O RLY?", /* TT_ORLY */
    "YA RLY", /* TT_YARLY */
    "MEBBE", /* TT_MEBBE */
    "NO WAI", /* TT_NOWAI */
    "OIC", /* TT_OIC */
    "WTF?", /* TT_WTF */
    "OMG", /* TT_OMG */
    "OMGWTF", /* TT_OMGWTF */
    "GTFO", /* TT_GTFO */
    "IM IN YR", /* TT_IMINYR */
    "UPPIN", /* TT_UPPIN */
    "NERFIN", /* TT_NERFIN */
    "YR", /* TT_YR */
    "TIL", /* TT_TIL */
    "WILE", /* TT_WILE */
    "IM OUTTA YR", /* TT_IMOUTTAYR */
    "HOW DUZ", /* TT_HOWDUZ */
    "IF U SAY SO", /* TT_IFUSAYSO */
    "FOUND YR", /* TT_FOUNDYR */
    "", /* TT_ENDOFTOKENS */
};
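/* Several keywords above span multiple space-separated words (for example
 * "SUM OF" or "IM OUTTA YR"), so matching them requires consuming several
 * consecutive lexemes. The helper below is a minimal sketch of that idea,
 * written against a plain array of lexeme images to stay self-contained; it
 * is illustrative only and is not the project's acceptLexemes()
 * implementation, whose exact behavior is not shown in this header.
 *
 *     #include <string.h>
 *
 *     // Returns the number of lexemes consumed if `keyword` matches the
 *     // lexemes starting at `start`, or 0 if it does not match.
 *     static unsigned int matchKeyword(const char **lexemes, unsigned int count,
 *                                      unsigned int start, const char *keyword)
 *     {
 *         unsigned int n = start;
 *         const char *k = keyword;
 *         while (*k) {
 *             size_t len = strcspn(k, " ");       // length of the next keyword word
 *             if (n >= count) return 0;           // ran out of lexemes
 *             if (strlen(lexemes[n]) != len) return 0;
 *             if (strncmp(lexemes[n], k, len) != 0) return 0;
 *             n++;
 *             k += len;
 *             if (*k == ' ') k++;                 // skip the separating space
 *         }
 *         return n - start;
 *     }
 */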
/** Stores the data associated with a Token structure. */
typedef union {
    int i;   /**< Integer data. */
    float f; /**< Floating point data. */
} TokenData;
/** Stores a token and any value parsed by the tokenizer. */
typedef struct {
    TokenType type;    /**< The type of token. */
    TokenData data;    /**< The stored data of type \a type. */
    char *image;       /**< The array of characters from the lexer which correspond to the token. */
    const char *fname; /**< A pointer to the name of the file containing the token. */
    unsigned int line; /**< The line number from the source file that the token occurred on. */
} Token;
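/* An illustrative sketch of building a token by hand with createToken(),
 * declared below. The order of the two string parameters (token image, then
 * source file name) and the fact that the caller fills in the parsed value
 * afterwards are assumptions based on the Token structure above, not
 * guarantees made by this header.
 *
 *     Token *t = createToken(TT_INTEGER, "42", "example.lol", 1);
 *     if (t) {
 *         t->data.i = 42;   // value-carrying tokens store their parsed value here
 *         deleteToken(t);
 *     }
 */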
int isInteger(const char *);     /**< Checks whether a string of characters represents an integer value. */
int isFloat(const char *);       /**< Checks whether a string of characters represents a floating point value. */
int isString(const char *);      /**< Checks whether a string of characters represents a string literal. */
int isIdentifier(const char *);  /**< Checks whether a string of characters represents a valid identifier. */
Token *createToken(TokenType, const char *, const char *, unsigned int);  /**< Creates a token with a given type, image, source file name, and line number. */
void deleteToken(Token *);       /**< Frees the memory associated with a single token. */
Token *addToken(Token ***, unsigned int *, Token *);  /**< Appends a token to a dynamically sized token array, updating its length. */
void deleteTokens(Token **);     /**< Frees an array of tokens. */
unsigned int acceptLexemes(LexemeList *, unsigned int, const char *);  /**< Compares the lexemes starting at a given position against a (possibly multi-word) string of characters. */
Token *isKeyword(LexemeList *, unsigned int *);  /**< Attempts to match a keyword token at the position pointed to by the second argument. */
Token **tokenizeLexemes(LexemeList *);  /**< Converts a list of lexemes into an array of tokens. */
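/* A minimal end-to-end usage sketch (illustrative only): tokenize a
 * LexemeList produced by the lexer and walk the resulting token array. The
 * assumption that the array is terminated by a TT_ENDOFTOKENS token is
 * suggested by the enum above but not guaranteed by this header.
 *
 *     LexemeList *lexemes = ...;               // produced by the lexer (lexer.h)
 *     Token **tokens = tokenizeLexemes(lexemes);
 *     if (tokens) {
 *         unsigned int n;
 *         for (n = 0; tokens[n]->type != TT_ENDOFTOKENS; n++) {
 *             if (tokens[n]->type == TT_INTEGER)
 *                 printf("%s:%u: integer %d\n",
 *                        tokens[n]->fname, tokens[n]->line, tokens[n]->data.i);
 *         }
 *         deleteTokens(tokens);
 *     }
 */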
#endif /* __TOKENIZER_H__ */