lci/tokenizer.c

470 lines
12 KiB
C

#include "tokenizer.h"
/**
 * Keyword images, indexed by token type.
 *
 * isKeyword() walks the token types from 0 up to (but excluding)
 * TT_ENDOFTOKENS and hands keywords[type] to acceptLexemes(), so entry N of
 * this table must describe the token type whose value is N. The trailing
 * comment on each line names the token type that slot is meant for.
 *
 * Each entry is a sequence of space-delimited words; every word is compared
 * against one lexeme. Empty strings are placeholders for slots that are not
 * matched as keywords (literals, identifiers, end of file, newline).
 *
 * isKeyword() stops at the first entry that matches, so the order of entries
 * is significant; e.g. "R NOOB" is listed before "R" and "AN YR" before "AN".
 */
static const char *keywords[] = {
"", /* TT_INTEGER */
"", /* TT_FLOAT */
"", /* TT_STRING */
"", /* TT_IDENTIFIER */
"", /* TT_BOOLEAN */
"IT", /* TT_IT */
"NOOB", /* TT_NOOB */
"NUMBR", /* TT_NUMBR */
"NUMBAR", /* TT_NUMBAR */
"TROOF", /* TT_TROOF */
"YARN", /* TT_YARN */
"BUKKIT", /* TT_BUKKIT */
"", /* TT_EOF */
"", /* TT_NEWLINE */
"HAI", /* TT_HAI */
"KTHXBYE", /* TT_KTHXBYE */
"HAS A", /* TT_HASA */
"ITZ A", /* TT_ITZA */
"ITZ", /* TT_ITZ */
"R NOOB", /* TT_RNOOB */
"R", /* TT_R */
"AN YR", /* TT_ANYR */
"AN", /* TT_AN */
"SUM OF", /* TT_SUMOF */
"DIFF OF", /* TT_DIFFOF */
"PRODUKT OF", /* TT_PRODUKTOF */
"QUOSHUNT OF", /* TT_QUOSHUNTOF */
"MOD OF", /* TT_MODOF */
"BIGGR OF", /* TT_BIGGROF */
"SMALLR OF", /* TT_SMALLROF */
"BOTH OF", /* TT_BOTHOF */
"EITHER OF", /* TT_EITHEROF */
"WON OF", /* TT_WONOF */
"NOT", /* TT_NOT */
"MKAY", /* TT_MKAY */
"ALL OF", /* TT_ALLOF */
"ANY OF", /* TT_ANYOF */
"BOTH SAEM", /* TT_BOTHSAEM */
"DIFFRINT", /* TT_DIFFRINT */
"MAEK", /* TT_MAEK */
"A", /* TT_A */
"IS NOW A", /* TT_ISNOWA */
"VISIBLE", /* TT_VISIBLE */
"SMOOSH", /* TT_SMOOSH */
"!", /* TT_BANG */
"GIMMEH", /* TT_GIMMEH */
"O RLY?", /* TT_ORLY */
"YA RLY", /* TT_YARLY */
"MEBBE", /* TT_MEBBE */
"NO WAI", /* TT_NOWAI */
"OIC", /* TT_OIC */
"WTF?", /* TT_WTF */
"OMG", /* TT_OMG */
"OMGWTF", /* TT_OMGWTF */
"GTFO", /* TT_GTFO */
"IM IN YR", /* TT_IMINYR */
"UPPIN", /* TT_UPPIN */
"NERFIN", /* TT_NERFIN */
"YR", /* TT_YR */
"TIL", /* TT_TIL */
"WILE", /* TT_WILE */
"IM OUTTA YR", /* TT_IMOUTTAYR */
"HOW IZ", /* TT_HOWIZ */
"IZ", /* TT_IZ */
"IF U SAY SO", /* TT_IFUSAYSO */
"FOUND YR", /* TT_FOUNDYR */
"SRS", /* TT_SRS */
"'Z", /* TT_APOSTROPHEZ */
"BUKKIT", /* TT_BUKKIT -- NOTE(review): this label and image also appear on the entry after TT_YARN; confirm which TokenType constant this slot corresponds to in tokenizer.h */
"" /* TT_ENDOFTOKENS */
};
/**
 * Checks if a string follows the format for an integer. Specifically, it
 * checks if the string matches the regular expression:
 * (-[0-9]+|[1-9][0-9]*|0).
 *
 * An unsigned literal may not carry leading zeros ("0" itself is the only
 * unsigned literal starting with 0). A sign must be followed by at least one
 * digit, so a lone "-" is not an integer.
 *
 * \param [in] image The string to check. Must not be NULL.
 *
 * \retval 0 \a image does not match the pattern for an integer.
 *
 * \retval 1 \a image matches the pattern for an integer.
 */
int isInteger(const char *image)
{
	/* <ctype.h> functions require values representable as unsigned char */
	const unsigned char *cur = (const unsigned char *)image;
	if (*cur == '-') {
		/* A sign on its own is not a number */
		cur++;
		if (!isdigit(*cur)) return 0;
	}
	else if (*cur == '0') {
		/* Without a sign, a leading zero is only valid as exactly "0" */
		return cur[1] == '\0';
	}
	else if (!isdigit(*cur)) {
		return 0;
	}
	/* At least one digit is guaranteed here; consume the rest */
	while (isdigit(*cur)) cur++;
	/* Anything left over means trailing garbage */
	return *cur == '\0';
}
/**
 * Checks if a string follows the format for a decimal. Specifically, it checks
 * if the string matches the regular expression: (-?[0-9]*\.[0-9]*), with two
 * additional constraints: the first character must be '-' or a digit, and the
 * string must contain at least one digit (so "-." is not a decimal).
 *
 * \param [in] image The string to check. Must not be NULL.
 *
 * \retval 0 \a image does not match the pattern for a decimal.
 *
 * \retval 1 \a image matches the pattern for a decimal.
 */
int isFloat(const char *image)
{
	/* <ctype.h> functions require values representable as unsigned char */
	const unsigned char *cur = (const unsigned char *)image;
	unsigned int digits = 0;
	/* A decimal starts with a sign or a digit, never with the point */
	if (*cur != '-' && !isdigit(*cur)) return 0;
	if (*cur == '-') cur++;
	/* Integral part */
	for (; isdigit(*cur); cur++) digits++;
	/* The decimal point is mandatory; without it this is not a decimal */
	if (*cur != '.') return 0;
	cur++;
	/* Fractional part */
	for (; isdigit(*cur); cur++) digits++;
	/* Reject trailing garbage and digit-less images such as "-." */
	return digits > 0 && *cur == '\0';
}
/**
 * Tests whether a lexeme image looks like a string literal, i.e. whether it
 * matches the regular expression (".*"): at least two characters long, with a
 * double quote as both its first and its last character.
 *
 * \param [in] image The string to check.
 *
 * \retval 0 \a image does not match the pattern for a string.
 *
 * \retval 1 \a image matches the pattern for a string.
 */
int isString(const char *image)
{
	const size_t n = strlen(image);
	/* Too short to hold both an opening and a closing quote */
	if (n < 2) return 0;
	if (image[0] != '"') return 0;
	return image[n - 1] == '"';
}
/**
 * Checks if a string follows the format for an identifier. Specifically, it
 * checks if the string matches the regular expression: ([a-zA-Z][a-zA-Z0-9_]*).
 *
 * Character classes are evaluated with isalpha() and isalnum(), so they follow
 * the current C locale.
 *
 * \param [in] image The string to check. May be NULL.
 *
 * \retval 0 \a image is NULL or does not match the pattern for an identifier.
 *
 * \retval 1 \a image matches the pattern for an identifier.
 */
int isIdentifier(const char *image)
{
	/* <ctype.h> functions require values representable as unsigned char;
	 * passing a negative plain char is undefined behavior */
	const unsigned char *cur = (const unsigned char *)image;
	/* First character must be alphabetic */
	if (!cur || !isalpha(*cur)) return 0;
	/* Remaining characters must be alphanumeric or underscores */
	for (cur++; *cur; cur++) {
		if (!isalnum(*cur) && *cur != '_') return 0;
	}
	return 1;
}
/**
 * Allocates a token and fills it in.
 *
 * The token receives its own heap copy of \a image. \a fname is stored as-is
 * (the pointer is shared, not duplicated), so the caller must keep it alive
 * for as long as the token is in use.
 *
 * \param [in] type The type of token to create.
 *
 * \param [in] image The string that represents the token.
 *
 * \param [in] fname The name of the file containing the token.
 *
 * \param [in] line The number of the line containing the token.
 *
 * \return A pointer to a new token with the desired properties. Release it
 * with deleteToken().
 *
 * \retval NULL Memory allocation failed.
 */
Token *createToken(TokenType type,
		const char *image,
		const char *fname,
		unsigned int line)
{
	Token *token = malloc(sizeof(Token));
	if (token == NULL) {
		perror("malloc");
		return NULL;
	}
	token->type = type;
	/* Room for the image plus its terminating NUL */
	token->image = malloc(sizeof(char) * (strlen(image) + 1));
	if (token->image == NULL) {
		free(token);
		perror("malloc");
		return NULL;
	}
	strcpy(token->image, image);
	token->line = line;
	/**
	 * \note fname is not copied because only one copy is stored for all
	 * Token structures that share it.
	 */
	token->fname = fname;
	return token;
}
/**
 * Releases a token created by createToken().
 *
 * \param [in,out] token The token to delete. NULL is accepted and ignored.
 *
 * \post The memory at \a token and its image will be freed. The file name the
 * token refers to is not freed.
 */
void deleteToken(Token *token)
{
	if (token != NULL) {
		free(token->image);
		free(token);
	}
}
/**
 * Appends a token to a dynamically grown array of token pointers.
 *
 * The array is enlarged by exactly one slot with realloc(); on failure the
 * array and its size are left untouched.
 *
 * \param [in,out] list The list of tokens to add \a token to.
 *
 * \param [in,out] num The number of tokens in \a list.
 *
 * \param [in] token The token to add to \a list.
 *
 * \post \a token will be added to the end of \a list and the size of \a list
 * will be updated.
 *
 * \retval 0 Memory allocation failed.
 *
 * \retval 1 \a token was added to \a list.
 */
int addToken(Token ***list,
		unsigned int *num,
		Token *token)
{
	const unsigned int count = *num;
	Token **grown = realloc(*list, sizeof(Token *) * (count + 1));
	if (grown == NULL) {
		perror("realloc");
		return 0;
	}
	/* The new slot is the last one of the enlarged array */
	grown[count] = token;
	*list = grown;
	*num = count + 1;
#ifdef DEBUG
	fprintf(stderr, "Adding token type %d [%s]\n", token->type, token->image);
#endif
	return 1;
}
/**
 * Deletes a list of tokens.
 *
 * \param [in,out] list The list of tokens to delete. It must be terminated by
 * a NULL entry; tokens are deleted up to (not including) that entry. A NULL
 * \a list is accepted and ignored, mirroring deleteToken().
 *
 * \post The memory at \a list and all of its members will be freed.
 */
void deleteTokens(Token **list)
{
	Token **tok;
	/* Nothing to release; avoids dereferencing a NULL list below */
	if (!list) return;
	for (tok = list; *tok; tok++) {
		deleteToken(*tok);
	}
	free(list);
}
/**
 * Matches lexemes against a string. Traverses \a lexemes starting at \a start
 * and compares lexeme images to space-delimited substrings from \a match.
 *
 * Every word of \a match must equal one whole lexeme image: a lexeme that
 * merely starts with the word (e.g. "HASX" against "HAS") does not match.
 * Lexemes past the end of the list are never read; running out of lexemes
 * before \a match is exhausted counts as a mismatch.
 *
 * \param [in] lexemes The list of lexemes to match from.
 *
 * \param [in] start The index within \a lexemes to start matching at.
 *
 * \param [in] match A string of space-delimited substrings to match.
 *
 * \return The number of lexemes matched.
 *
 * \retval 0 The lexemes starting at \a start do not match \a match.
 */
unsigned int acceptLexemes(LexemeList *lexemes,
		unsigned int start,
		const char *match)
{
	unsigned int offset = 0;
	const char *word;
	if (start >= lexemes->num) return 0;
	/* word walks the image of the lexeme currently being compared */
	word = lexemes->lexemes[start]->image;
	for (; *match; match++) {
		if (*match == ' ') {
			/* A word boundary in match must coincide with the end
			 * of the current lexeme, otherwise the lexeme is only
			 * a longer word sharing this prefix */
			if (*word != '\0') return 0;
			offset++;
			/* No lexeme left for the next word */
			if (start + offset >= lexemes->num) return 0;
			word = lexemes->lexemes[start + offset]->image;
			continue;
		}
		/* Also fails when the lexeme ends before the word does */
		if (*word != *match) return 0;
		word++;
	}
	/* The last lexeme must be fully consumed as well */
	return *word == '\0' ? offset + 1 : 0;
}
/**
 * Tries to recognize a keyword at a given position in a list of lexemes.
 *
 * The keyword table is scanned in token-type order and the first entry whose
 * words match the lexemes beginning at \a start wins. A token of that type is
 * then created, carrying the keyword text as its image and the file name and
 * line number of the first matched lexeme.
 *
 * \param [in] lexemes A list of lexemes to search for keywords in.
 *
 * \param [in,out] start The position within \a lexemes to begin searching for
 * keywords.
 *
 * \post If a keyword is not found, \a start will not be modified. Otherwise,
 * \a start will be incremented by the number of lexemes matched minus one, so
 * that it refers to the last lexeme of the keyword.
 *
 * \return A pointer to the token containing the matched keyword.
 *
 * \retval NULL No keywords were found or there was an error allocating memory.
 */
Token *isKeyword(LexemeList *lexemes,
		unsigned int *start)
{
	Lexeme *first = lexemes->lexemes[*start];
	const char *fname = first->fname;
	unsigned int line = first->line;
	TokenType type = 0;
	while (type != TT_ENDOFTOKENS) {
		unsigned int matched = acceptLexemes(lexemes,
				*start, keywords[type]);
		if (matched) {
			/* Skip the extra lexemes of a multi-word keyword */
			*start += (matched - 1);
			return createToken(type, keywords[type], fname, line);
		}
		type++;
	}
	return NULL;
}
/**
 * Frees the first \a num tokens of a token array and then the array itself.
 * Unlike deleteTokens(), the array does not have to be NULL-terminated, so this
 * can release a list that is still under construction (including an empty,
 * NULL one).
 */
static void freeTokenArray(Token **list, unsigned int num)
{
	unsigned int n;
	for (n = 0; n < num; n++) deleteToken(list[n]);
	free(list);
}
/**
 * Converts a list of lexemes into tokens. Also parses integers, floats, and
 * strings into tokens with semantic meaning.
 *
 * Leading newlines and runs of consecutive newlines are collapsed, and the
 * lexeme sequence CAN HAS STDIO? is silently skipped.
 *
 * \param [in] list A list of lexemes to tokenize.
 *
 * \return A NULL-terminated list of tokens generated from \a list. Release it
 * with deleteTokens().
 *
 * \retval NULL An unrecognized token was encountered or memory allocation
 * failed. Any tokens created up to that point are freed.
 */
Token **tokenizeLexemes(LexemeList *list)
{
	void *mem = NULL;
	Token **ret = NULL;
	unsigned int retsize = 0;
	unsigned int n;
	for (n = 0; n < list->num; n++) {
		Lexeme *lexeme = list->lexemes[n];
		const char *image = lexeme->image;
		const char *fname = lexeme->fname;
		unsigned int line = lexeme->line;
		Token *token = NULL;
		/* String */
		if (isString(image)) {
			token = createToken(TT_STRING, image, fname, line);
		}
		/* Float */
		else if (isFloat(image)) {
			token = createToken(TT_FLOAT, image, fname, line);
			/* Only parse into the token if it was allocated */
			if (token && sscanf(image, "%f", &(token->data.f)) != 1) {
				fprintf(stderr, "Expected floating point decimal value.\n");
			}
		}
		/* Integer */
		else if (isInteger(image)) {
			token = createToken(TT_INTEGER, image, fname, line);
			if (token && sscanf(image, "%i", &(token->data.i)) != 1) {
				fprintf(stderr, "Expected integer value.\n");
			}
		}
		/* FAIL */
		else if (!strcmp(image, "FAIL")) {
			token = createToken(TT_BOOLEAN, "FAIL", fname, line);
			if (token) token->data.i = 0;
		}
		/* WIN */
		else if (!strcmp(image, "WIN")) {
			token = createToken(TT_BOOLEAN, "WIN", fname, line);
			if (token) token->data.i = 1;
		}
		/* CAN HAS STDIO? */
		/* The bound is written as n + 2 < num because num - 2 would
		 * wrap around for lists with fewer than two lexemes. */
		else if (n + 2 < list->num
				&& !strcmp(image, "CAN")
				&& !strcmp(list->lexemes[n + 1]->image, "HAS")
				&& !strcmp(list->lexemes[n + 2]->image, "STDIO?")) {
			n += 2;
			/* Just for fun; not actually in spec */
			continue;
		}
		/* Newline */
		/* Note that the spec is unclear as to whether a command *must*
		 * follow a comma. For now, we let commas end a line. */
		else if (!strcmp(image, "\n")) {
			/* Note that we ignore any initial newlines */
			if (retsize < 1) {
#ifdef DEBUG
				fprintf(stderr, "Skipping initial newline.\n");
#endif
				continue;
			}
			else if (ret[retsize - 1]->type == TT_NEWLINE) {
#ifdef DEBUG
				fprintf(stderr, "Skipping duplicate newline.\n");
#endif
				continue;
			}
			else {
				token = createToken(TT_NEWLINE, "end of line", fname, line);
			}
		}
		/* Keyword */
		/* On a match, isKeyword advances n past the extra lexemes of a
		 * multi-word keyword. */
		else if ((token = isKeyword(list, &n))) {
		}
		/* Identifier */
		/* This must be placed after keyword parsing or else most
		 * keywords would be tokenized as identifiers. */
		else if (isIdentifier(image)) {
			token = createToken(TT_IDENTIFIER, image, fname, line);
		}
		/* EOF */
		else if (!strcmp(image, "$")) {
			token = createToken(TT_EOF, "end of file", fname, line);
		}
		else {
			fprintf(stderr, "%s:%u: unknown token at: %s\n", fname, line, image);
			/* Clean up; ret may still be NULL if this was the very
			 * first lexeme */
			freeTokenArray(ret, retsize);
			return NULL;
		}
		/* Every branch that reaches this point tried to create a token;
		 * NULL means createToken failed (and already reported it).
		 * Storing NULL would prematurely terminate the list. */
		if (!token) {
			freeTokenArray(ret, retsize);
			return NULL;
		}
		/* Add the token to the token array */
		if (!addToken(&ret, &retsize, token)) {
			/* Clean up; on failure addToken leaves ret and retsize
			 * unchanged, so token is not part of the array */
			deleteToken(token);
			freeTokenArray(ret, retsize);
			return NULL;
		}
	}
	/* Append the NULL terminator that deleteTokens relies on */
	mem = realloc(ret, sizeof(Token *) * (retsize + 1));
	if (!mem) {
		perror("realloc");
		freeTokenArray(ret, retsize);
		return NULL;
	}
	ret = mem;
	ret[retsize] = NULL;
	return ret;
}