lci/tokenizer.c

#include "tokenizer.h"

/**
 * Keyword images, indexed by token type.  isKeyword() walks this table from
 * index 0 up to (but not including) TT_ENDOFTOKENS and uses the index of the
 * first matching entry as the type of the token it creates, so:
 *
 * - entries must stay in the same order as the token type enumeration;
 * - a keyword that is a leading part of another one (e.g. "R" and "R NOOB",
 *   "AN" and "AN YR", "ITZ" and "ITZ A") must come \b after the longer form,
 *   because the first match wins;
 * - multi-word keywords are written with single spaces, each word being
 *   compared against one lexeme by acceptLexemes().
 *
 * Token types that are not keywords (literals, identifiers, end of file, ...)
 * use the empty string as a placeholder.
 *
 * \note NOTE(review): "BUKKIT" appears twice below, both times labelled
 * TT_BUKKIT.  Confirm against the enumeration in tokenizer.h that the table
 * and the enumeration have the same number of entries in the same order.
 */
static const char *keywords[] = {
	"",            /* TT_INTEGER */
	"",            /* TT_FLOAT */
	"",            /* TT_STRING */
	"",            /* TT_IDENTIFIER */
	"",            /* TT_BOOLEAN */
	"IT",          /* TT_IT */
	"NOOB",        /* TT_NOOB */
	"NUMBR",       /* TT_NUMBR */
	"NUMBAR",      /* TT_NUMBAR */
	"TROOF",       /* TT_TROOF */
	"YARN",        /* TT_YARN */
	"BUKKIT",      /* TT_BUKKIT */
	"",            /* TT_EOF */
	"",            /* TT_NEWLINE */
	"HAI",         /* TT_HAI */
	"KTHXBYE",     /* TT_KTHXBYE */
	"HAS A",       /* TT_HASA */
	"ITZ A",       /* TT_ITZA */
	"ITZ",         /* TT_ITZ */
	"R NOOB",      /* TT_RNOOB */
	"R",           /* TT_R */
	"AN YR",       /* TT_ANYR */
	"AN",          /* TT_AN */
	"SUM OF",      /* TT_SUMOF */
	"DIFF OF",     /* TT_DIFFOF */
	"PRODUKT OF",  /* TT_PRODUKTOF */
	"QUOSHUNT OF", /* TT_QUOSHUNTOF */
	"MOD OF",      /* TT_MODOF */
	"BIGGR OF",    /* TT_BIGGROF */
	"SMALLR OF",   /* TT_SMALLROF */
	"BOTH OF",     /* TT_BOTHOF */
	"EITHER OF",   /* TT_EITHEROF */
	"WON OF",      /* TT_WONOF */
	"NOT",         /* TT_NOT */
	"MKAY",        /* TT_MKAY */
	"ALL OF",      /* TT_ALLOF */
	"ANY OF",      /* TT_ANYOF */
	"BOTH SAEM",   /* TT_BOTHSAEM */
	"DIFFRINT",    /* TT_DIFFRINT */
	"MAEK",        /* TT_MAEK */
	"A",           /* TT_A */
	"IS NOW A",    /* TT_ISNOWA */
	"VISIBLE",     /* TT_VISIBLE */
	"SMOOSH",      /* TT_SMOOSH */
	"!",           /* TT_BANG */
	"GIMMEH",      /* TT_GIMMEH */
	"O RLY?",      /* TT_ORLY */
	"YA RLY",      /* TT_YARLY */
	"MEBBE",       /* TT_MEBBE */
	"NO WAI",      /* TT_NOWAI */
	"OIC",         /* TT_OIC */
	"WTF?",        /* TT_WTF */
	"OMG",         /* TT_OMG */
	"OMGWTF",      /* TT_OMGWTF */
	"GTFO",        /* TT_GTFO */
	"IM IN YR",    /* TT_IMINYR */
	"UPPIN",       /* TT_UPPIN */
	"NERFIN",      /* TT_NERFIN */
	"YR",          /* TT_YR */
	"TIL",         /* TT_TIL */
	"WILE",        /* TT_WILE */
	"IM OUTTA YR", /* TT_IMOUTTAYR */
	"HOW IZ",      /* TT_HOWIZ */
	"IZ",          /* TT_IZ */
	"IF U SAY SO", /* TT_IFUSAYSO */
	"FOUND YR",    /* TT_FOUNDYR */
	"SRS",         /* TT_SRS */
	"'Z",          /* TT_APOSTROPHEZ */
	"BUKKIT",      /* TT_BUKKIT */
	""             /* TT_ENDOFTOKENS */
};

/**
 * Checks if a string follows the format for an integer.  Specifically, it
 * checks if the string matches the regular expression: (-?[1-9][0-9]*|0).
 *
 * A lone minus sign and numbers with leading zeros (including "-0") are
 * rejected, so that everything accepted here is an unambiguous decimal number.
 *
 * \param [in] image The string to check.  May be NULL.
 *
 * \retval 0 \a image is NULL or does not match the pattern for an integer.
 *
 * \retval 1 \a image matches the pattern for an integer.
 */
int isInteger(const char *image)
{
	/* <ctype.h> functions need values representable as unsigned char */
	const unsigned char *cur = (const unsigned char *)image;
	if (!cur) return 0;
	/* A lone zero is the only integer allowed to start with '0' */
	if (cur[0] == '0') return cur[1] == '\0';
	/* Optional sign */
	if (*cur == '-') cur++;
	/* At least one digit must follow, and it may not be a zero */
	if (!isdigit(*cur) || *cur == '0') return 0;
	while (isdigit(*cur)) cur++;
	/* Nothing may trail the digits */
	return *cur == '\0';
}

/**
 * Checks if a string follows the format for a decimal.  Specifically, it checks
 * if the string matches the regular expression: (-?[0-9]+[.][0-9]*).
 *
 * At least one digit is required before the decimal point; digits after it are
 * optional.
 *
 * \param [in] image The string to check.  May be NULL.
 *
 * \retval 0 \a image is NULL or does not match the pattern for a decimal.
 *
 * \retval 1 \a image matches the pattern for a decimal.
 */
int isFloat(const char *image)
{
	/* <ctype.h> functions need values representable as unsigned char */
	const unsigned char *cur = (const unsigned char *)image;
	if (!cur) return 0;
	/* Optional sign */
	if (*cur == '-') cur++;
	/* Integral part: one or more digits */
	if (!isdigit(*cur)) return 0;
	while (isdigit(*cur)) cur++;
	/* Mandatory decimal point */
	if (*cur != '.') return 0;
	cur++;
	/* Fractional part: zero or more digits, then end of string */
	while (isdigit(*cur)) cur++;
	return *cur == '\0';
}

/**
 * Tests whether a string has the shape of a string literal, i.e. whether it is
 * at least two characters long and both begins and ends with a double quote.
 * This corresponds to the regular expression: (".*").
 *
 * \param [in] image The string to check.
 *
 * \retval 0 \a image does not match the pattern for a string.
 *
 * \retval 1 \a image matches the pattern for a string.
 */
int isString(const char *image)
{
	const size_t length = strlen(image);
	/* The shortest literal is the empty one: two quotes and nothing else */
	if (length < 2) return 0;
	if (image[0] != '"') return 0;
	return image[length - 1] == '"';
}

/**
 * Checks if a string follows the format for an identifier.  Specifically, it
 * checks if the string matches the regular expression: ([a-zA-Z][a-zA-Z0-9_]*).
 *
 * \param [in] image The string to check.  May be NULL.
 *
 * \retval 0 \a image is NULL or does not match the pattern for an identifier.
 *
 * \retval 1 \a image matches the pattern for an identifier.
 */
int isIdentifier(const char *image)
{
	/* Read the characters as unsigned char: passing a negative plain char
	 * (e.g. a byte of a multi-byte sequence) to <ctype.h> is undefined. */
	const unsigned char *cur = (const unsigned char *)image;
	/* First character must be alphabetic */
	if (!cur || !isalpha(*cur)) return 0;
	/* Remaining characters must be alphanumeric or underscores */
	for (cur++; *cur; cur++) {
		if (!isalnum(*cur) && *cur != '_') return 0;
	}
	return 1;
}

/**
 * Creates a token.
 *
 * \param [in] type The type of token to create.
 *
 * \param [in] image The string that represents the token.  It is copied, so
 * the caller keeps ownership of its own buffer.
 *
 * \param [in] fname The name of the file containing the token.  Only the
 * pointer is stored, so it must outlive the returned token.
 *
 * \param [in] line The number of the line containing the token.
 *
 * \note The \c data member of the new token is left unset; callers that need
 * it must assign it themselves.
 *
 * \return A pointer to a new token with the desired properties.  Release it
 * with deleteToken().
 *
 * \retval NULL Memory allocation failed (a message is written to stderr).
 */
Token *createToken(TokenType type,
                   const char *image,
                   const char *fname,
                   unsigned int line)
{
	/* Size of the image copy, including its terminator */
	size_t size = strlen(image) + 1;
	Token *ret = malloc(sizeof(Token));
	if (!ret) {
		perror("malloc");
		return NULL;
	}
	ret->image = malloc(sizeof(char) * size);
	if (!(ret->image)) {
		/* Report first: free() is allowed to modify errno, which would
		 * make perror() describe the wrong failure. */
		perror("malloc");
		free(ret);
		return NULL;
	}
	strcpy(ret->image, image);
	ret->type = type;
	/**
	 * \note fname is not copied because only one copy is stored for all
	 * Token structures that share it.
	 */
	ret->fname = fname;
	ret->line = line;
	return ret;
}

/**
 * Releases a token previously obtained from createToken().
 *
 * \param [in,out] token The token to delete.  Passing NULL is allowed and does
 * nothing.
 *
 * \post The token's image and the token itself have been freed.  The file name
 * it refers to is left alone because the token does not own it.
 */
void deleteToken(Token *token)
{
	if (token) {
		free(token->image);
		free(token);
	}
}

/**
 * Appends a token to a dynamically sized array of token pointers.
 *
 * \param [in,out] list The array to append to.  It is grown by one slot with
 * realloc(), so the pointer stored in \a list may change.
 *
 * \param [in,out] num The number of tokens currently in \a list.
 *
 * \param [in] token The token to store in the new last slot.
 *
 * \post On success \a token is the last element of \a list and \a num has been
 * incremented.  On failure \a list and \a num are left as they were.
 *
 * \retval 0 Memory allocation failed (a message is written to stderr).
 *
 * \retval 1 \a token was added to \a list.
 */
int addToken(Token ***list,
             unsigned int *num,
             Token *token)
{
	const unsigned int count = *num;
	Token **grown = realloc(*list, sizeof(Token *) * (count + 1));
	if (grown == NULL) {
		perror("realloc");
		return 0;
	}
	/* Fill the freshly added slot, then publish the new array and size */
	grown[count] = token;
	*list = grown;
	*num = count + 1;
#ifdef DEBUG
	fprintf(stderr, "Adding token type %d [%s]\n", token->type, token->image);
#endif
	return 1;
}

/**
 * Deletes a list of tokens.
 *
 * \param [in,out] list The NULL-terminated list of tokens to delete.  Passing
 * NULL is allowed and does nothing.
 *
 * \post The memory at \a list and all of its members will be freed.
 */
void deleteTokens(Token **list)
{
	Token **tok;
	/* Tolerate a list that was never allocated, like deleteToken() does */
	if (!list) return;
	/* The list ends at the first NULL entry */
	for (tok = list; *tok; tok++)
		deleteToken(*tok);
	free(list);
}

/**
 * Matches lexemes against a string.  Traverses \a lexemes starting at \a start
 * and compares lexeme images to space-delimited substrings from \a match.
 * Every substring must equal the \b whole image of the corresponding lexeme;
 * a lexeme that merely begins with the substring does not match.
 *
 * \param [in] lexemes The list of lexemes to match from.
 *
 * \param [in] start The index within \a lexemes to start matching at.
 *
 * \param [in] match A string of substrings separated by single spaces.
 *
 * \return The number of lexemes matched.
 *
 * \retval 0 The lexemes do not match, or \a lexemes holds fewer lexemes after
 * \a start than \a match has substrings.
 */
unsigned int acceptLexemes(LexemeList *lexemes,
                           unsigned int start,
                           const char *match)
{
	unsigned int offset = 0;
	/* Start of the substring of match currently being compared */
	const char *word = match;
	for (;;) {
		const char *image;
		unsigned int len = 0;
		/* Never read past the end of the lexeme list */
		if (start >= lexemes->num || offset >= lexemes->num - start)
			return 0;
		image = lexemes->lexemes[start + offset]->image;
		/* Compare up to the end of the current substring.  A shorter
		 * image fails on its terminator, so it is never overrun. */
		while (word[len] && word[len] != ' ') {
			if (image[len] != word[len]) return 0;
			len++;
		}
		/* The lexeme must end exactly where the substring does */
		if (image[len] != '\0') return 0;
		/* That was the last substring: everything matched */
		if (word[len] == '\0') return offset + 1;
		/* Step over the space to the next substring and lexeme */
		word += len + 1;
		offset++;
	}
}

/**
 * Checks if the next lexemes in a list comprise a keyword and, if so, generates
 * a new token representing that keyword.  Specifically, \a lexemes is searched,
 * starting at \a start for keywords.  If one is found, an appropriate token is
 * created and returned and \a start is incremented by the number of lexemes
 * matched minus one.
 *
 * Keywords are tried in table order and the first one that matches is used.
 *
 * \param [in] lexemes A list of lexemes to search for keywords in.
 *
 * \param [in,out] start The position within \a lexemes to begin searching for
 * keywords.
 *
 * \post If no token is returned (no keyword found or allocation failure),
 * \a start will not be modified.  Otherwise, \a start will be incremented by
 * the number of lexemes matched minus one.
 *
 * \return A pointer to the token containing the matched keyword.
 *
 * \retval NULL No keywords were found or there was an error allocating memory.
 */
Token *isKeyword(LexemeList *lexemes,
                 unsigned int *start)
{
	TokenType type;
	const char *fname = lexemes->lexemes[*start]->fname;
	unsigned int line = lexemes->lexemes[*start]->line;
	/* For each keyword, */
	for (type = 0; type != TT_ENDOFTOKENS; type++) {
		Token *token;
		/* Check if the start of lexemes match */
		unsigned int num = acceptLexemes(lexemes,
				*start, keywords[type]);
		if (!num) continue;
		/* If so, create a new token for the keyword */
		token = createToken(type, keywords[type], fname, line);
		/* And advance the start, but only once the token exists so
		 * that a failed allocation leaves the position untouched */
		if (token) *start += (num - 1);
		return token;
	}
	return NULL;
}

/**
 * Converts a list of lexemes into tokens.  Also parses integers, floats, and
 * strings into tokens with semantic meaning.
 *
 * Leading and repeated newlines are dropped, and the sequence
 * "CAN HAS STDIO?" is accepted and ignored.
 *
 * \param [in] list A list of lexemes to tokenize.
 *
 * \return A NULL-terminated list of tokens generated from \a list.  Release it
 * with deleteTokens().
 *
 * \retval NULL An unrecognized token was encountered or memory allocation
 * failed.  Everything allocated up to that point has been freed.
 */
Token **tokenizeLexemes(LexemeList *list)
{
	void *mem = NULL;
	Token **ret = NULL;
	unsigned int retsize = 0;
	unsigned int n;
	for (n = 0; n < list->num; n++) {
		Lexeme *lexeme = list->lexemes[n];
		const char *image = lexeme->image;
		const char *fname = lexeme->fname;
		unsigned int line = lexeme->line;
		Token *token = NULL;
		/* String */
		if (isString(image)) {
			token = createToken(TT_STRING, image, fname, line);
		}
		/* Float */
		else if (isFloat(image)) {
			token = createToken(TT_FLOAT, image, fname, line);
			if (token) {
				/* Defined value even if the conversion fails */
				token->data.f = 0;
				if (sscanf(image, "%f", &(token->data.f)) != 1) {
					fprintf(stderr, "Expected floating point decimal value.\n");
				}
			}
		}
		/* Integer */
		else if (isInteger(image)) {
			token = createToken(TT_INTEGER, image, fname, line);
			if (token) {
				/* Defined value even if the conversion fails */
				token->data.i = 0;
				/* %d rather than %i: literals are always decimal
				 * and must not be read as octal or hexadecimal */
				if (sscanf(image, "%d", &(token->data.i)) != 1) {
					fprintf(stderr, "Expected integer value.\n");
				}
			}
		}
		/* FAIL */
		else if (!strcmp(image, "FAIL")) {
			token = createToken(TT_BOOLEAN, "FAIL", fname, line);
			if (token) token->data.i = 0;
		}
		/* WIN */
		else if (!strcmp(image, "WIN")) {
			token = createToken(TT_BOOLEAN, "WIN", fname, line);
			if (token) token->data.i = 1;
		}
		/* CAN HAS STDIO? */
		/* n < list->num holds here, so the subtraction cannot wrap */
		else if (list->num - n > 2
				&& !strcmp(image, "CAN")
				&& !strcmp(list->lexemes[n + 1]->image, "HAS")
				&& !strcmp(list->lexemes[n + 2]->image, "STDIO?")) {
			n += 2;
			/* Just for fun; not actually in spec */
			continue;
		}
		/* Newline */
		/* Note that the spec is unclear as to whether a command *must*
		 * follow a comma.  For now, we let commas end a line. */
		else if (!strcmp(image, "\n")) {
			/* Note that we ignore any initial newlines */
			if (retsize < 1) {
#ifdef DEBUG
				fprintf(stderr, "Skipping initial newline.\n");
#endif
				continue;
			}
			else if (ret[retsize - 1]->type == TT_NEWLINE) {
#ifdef DEBUG
				fprintf(stderr, "Skipping duplicate newline.\n");
#endif
				continue;
			}
			else {
				token = createToken(TT_NEWLINE, "end of line", fname, line);
			}
		}
		/* Keyword */
		/* On a match, isKeyword() also moves n to the keyword's last
		 * lexeme. */
		else if ((token = isKeyword(list, &n))) {
		}
		/* Identifier */
		/* This must be placed after keyword parsing or else most
		 * keywords would be tokenized as identifiers. */
		else if (isIdentifier(image)) {
			token = createToken(TT_IDENTIFIER, image, fname, line);
		}
		/* EOF */
		else if (!strcmp(image, "$")) {
			token = createToken(TT_EOF, "end of file", fname, line);
		}
		else {
			fprintf(stderr, "%s:%u: unknown token at: %s\n", fname, line, image);
			goto fail;
		}

		/* Every branch that gets here tried to create a token; a NULL
		 * one means allocation failed (already reported). */
		if (!token) goto fail;

		/* Add the token to the token array */
		if (!addToken(&ret, &retsize, token)) {
			/* The token never made it into the array */
			deleteToken(token);
			goto fail;
		}
	}
	/* Append the NULL terminator that deleteTokens() and other consumers
	 * rely on */
	mem = realloc(ret, sizeof(Token *) * (retsize + 1));
	if (!mem) {
		perror("realloc");
		goto fail;
	}
	ret = mem;
	ret[retsize] = NULL;
	return ret;

fail:
	/* Clean up: the array is not NULL-terminated yet (and may not even
	 * exist), so release its retsize entries by count. */
	for (n = 0; n < retsize; n++)
		deleteToken(ret[n]);
	free(ret);
	return NULL;
}