/** Structures and functions for grouping lexemes into tokens.  The tokenizer
  * reads through an array of lexemes (generated by the lexer) and groups them
  * into tokens based on their structure.  In addition, some lexemes with
  * semantic meaning (such as integers, floats, strings, and booleans) will have
  * their values extracted and stored.
  *
  * \file   tokenizer.h
  *
  * \author Justin J. Meza
  *
  * \date   2010 */

#ifndef __TOKENIZER_H__
#define __TOKENIZER_H__

#include <stdlib.h>
#include <stdio.h>
#include <string.h>

#include "lexer.h"

/* Make sure DEBUG is not defined from this point on. */
#undef DEBUG

/** Denotes the type of token present.  All of the token type names are
  * self-explanatory and correspond to either the semantic type of token data
  * (in the case of TT_INTEGER, TT_FLOAT, TT_STRING, TT_IDENTIFIER, or
  * TT_BOOLEAN) or the lexemes which make up the particular token.
  *
  * \note The \c keywords table in this file is annotated entry-by-entry with
  *       these names in the same order, so any enumerator added, removed or
  *       moved here needs the matching change in that table. */
typedef enum {
    /* Tokens that carry a value rather than a fixed keyword. */
    TT_INTEGER,
    TT_FLOAT,
    TT_STRING,
    TT_IDENTIFIER,
    TT_BOOLEAN,

    /* Implicit variable and type names. */
    TT_IT,
    TT_NOOB,
    TT_NUMBR,
    TT_NUMBAR,
    TT_TROOF,
    TT_YARN,

    /* Structural tokens with no keyword text. */
    TT_EOF,
    TT_NEWLINE,

    /* Program delimiters. */
    TT_HAI,
    TT_KTHXBYE,

    /* Declaration and assignment. */
    TT_HASA,
    TT_ITZA,
    TT_ITZ,
    TT_RNOOB,
    TT_R,

    /* Argument separators. */
    TT_ANYR,
    TT_AN,

    /* Arithmetic operators. */
    TT_SUMOF,
    TT_DIFFOF,
    TT_PRODUKTOF,
    TT_QUOSHUNTOF,
    TT_MODOF,
    TT_BIGGROF,
    TT_SMALLROF,

    /* Boolean operators. */
    TT_BOTHOF,
    TT_EITHEROF,
    TT_WONOF,
    TT_NOT,
    TT_MKAY,
    TT_ALLOF,
    TT_ANYOF,

    /* Comparison operators. */
    TT_BOTHSAEM,
    TT_DIFFRINT,

    /* Casting. */
    TT_MAEK,
    TT_A,
    TT_ISNOWA,

    /* Input, output and string concatenation. */
    TT_VISIBLE,
    TT_SMOOSH,
    TT_BANG,
    TT_GIMMEH,

    /* Conditionals. */
    TT_ORLY,
    TT_YARLY,
    TT_MEBBE,
    TT_NOWAI,
    TT_OIC,

    /* Switch statements. */
    TT_WTF,
    TT_OMG,
    TT_OMGWTF,
    TT_GTFO,

    /* Loops. */
    TT_IMINYR,
    TT_UPPIN,
    TT_NERFIN,
    TT_YR,
    TT_TIL,
    TT_WILE,
    TT_IMOUTTAYR,

    /* Functions. */
    TT_HOWDUZ,
    TT_IFUSAYSO,
    TT_FOUNDYR,

    /* NOTE(review): presumably a sentinel marking the end of the token
     * types -- keep it as the last enumerator; confirm against the users
     * of this enum. */
    TT_ENDOFTOKENS
} TokenType;

/** Keyword text for every token type, one entry per TokenType enumerator in
  * declaration order (each entry is annotated with the enumerator it belongs
  * to).  Token types that have no fixed keyword text map to the empty string.
  * The words of a multi-word keyword are separated by a single space.
  *
  * Because the table is \c static and defined in a header, every translation
  * unit that includes this file gets its own private copy. */
static const char *keywords[] = {
    "",            /* TT_INTEGER */
    "",            /* TT_FLOAT */
    "",            /* TT_STRING */
    "",            /* TT_IDENTIFIER */
    "",            /* TT_BOOLEAN */
    "IT",          /* TT_IT */
    "NOOB",        /* TT_NOOB */
    "NUMBR",       /* TT_NUMBR */
    "NUMBAR",      /* TT_NUMBAR */
    "TROOF",       /* TT_TROOF */
    "YARN",        /* TT_YARN */
    "",            /* TT_EOF */
    "",            /* TT_NEWLINE */
    "HAI",         /* TT_HAI */
    "KTHXBYE",     /* TT_KTHXBYE */
    "HAS A",       /* TT_HASA */
    "ITZ A",       /* TT_ITZA */
    "ITZ",         /* TT_ITZ */
    "R NOOB",      /* TT_RNOOB */
    "R",           /* TT_R */
    "AN YR",       /* TT_ANYR */
    "AN",          /* TT_AN */
    "SUM OF",      /* TT_SUMOF */
    "DIFF OF",     /* TT_DIFFOF */
    "PRODUKT OF",  /* TT_PRODUKTOF */
    "QUOSHUNT OF", /* TT_QUOSHUNTOF */
    "MOD OF",      /* TT_MODOF */
    "BIGGR OF",    /* TT_BIGGROF */
    "SMALLR OF",   /* TT_SMALLROF */
    "BOTH OF",     /* TT_BOTHOF */
    "EITHER OF",   /* TT_EITHEROF */
    "WON OF",      /* TT_WONOF */
    "NOT",         /* TT_NOT */
    "MKAY",        /* TT_MKAY */
    "ALL OF",      /* TT_ALLOF */
    "ANY OF",      /* TT_ANYOF */
    "BOTH SAEM",   /* TT_BOTHSAEM */
    "DIFFRINT",    /* TT_DIFFRINT */
    "MAEK",        /* TT_MAEK */
    "A",           /* TT_A */
    "IS NOW A",    /* TT_ISNOWA */
    "VISIBLE",     /* TT_VISIBLE */
    "SMOOSH",      /* TT_SMOOSH */
    "!",           /* TT_BANG */
    "GIMMEH",      /* TT_GIMMEH */
    "O RLY?",      /* TT_ORLY */
    "YA RLY",      /* TT_YARLY */
    "MEBBE",       /* TT_MEBBE */
    "NO WAI",      /* TT_NOWAI */
    "OIC",         /* TT_OIC */
    "WTF?",        /* TT_WTF */
    "OMG",         /* TT_OMG */
    "OMGWTF",      /* TT_OMGWTF */
    "GTFO",        /* TT_GTFO */
    "IM IN YR",    /* TT_IMINYR */
    "UPPIN",       /* TT_UPPIN */
    "NERFIN",      /* TT_NERFIN */
    "YR",          /* TT_YR */
    "TIL",         /* TT_TIL */
    "WILE",        /* TT_WILE */
    "IM OUTTA YR", /* TT_IMOUTTAYR */
    "HOW DUZ",     /* TT_HOWDUZ */
    "IF U SAY SO", /* TT_IFUSAYSO */
    "FOUND YR",    /* TT_FOUNDYR */
    "",            /* TT_ENDOFTOKENS */
};

/** Stores the data associated with a Token structure.  The members share
  * storage, so only the member most recently written holds a meaningful
  * value. */
typedef union {
    int i;   /**< Integer data. */
    float f; /**< Floating point data. */
} TokenData;

/** Stores a token and any value parsed by the tokenizer. */
|
|
typedef struct {
|
|
TokenType type; /**< The type of token. */
|
|
TokenData data; /**< The stored data of type \a type. */
|
|
char *image; /**< The array of characters from the lexer which correspond to the token. */
|
|
const char *fname; /**< A pointer to the name of the file containing the token. */
|
|
unsigned int line; /**< The line number from the source file that the token occurred on. */
|
|
} Token;
|
|
|
|
int isInteger(const char *);
|
|
int isFloat(const char *);
|
|
int isString(const char *);
|
|
int isIdentifier(const char *);
|
|
Token *createToken(TokenType, const char *, const char *, unsigned int);
|
|
void deleteToken(Token *);
|
|
Token *addToken(Token ***, unsigned int *, Token*);
|
|
void deleteTokens(Token **);
|
|
unsigned int acceptLexemes(LexemeList *, unsigned int, const char *);
|
|
Token *isKeyword(LexemeList *, unsigned int *);
|
|
Token **tokenizeLexemes(LexemeList *);
|
|
|
|
#endif /* __TOKENIZER_H__ */