386 lines
9.7 KiB
C
386 lines
9.7 KiB
C
#include "tokenizer.h"
|
|
|
|
/**
|
|
* Checks if a string follows the format for an integer. Specifically, it
|
|
* checks if the string matches the regular expression: (-?[1-9][0-9]*|0).
|
|
*
|
|
* \param [in] image The string to check.
|
|
*
|
|
* \retval 0 \a image does not match the pattern for an integer.
|
|
*
|
|
* \retval 1 \a image matches the pattern for an integer.
|
|
*/
|
|
int isInteger(const char *image)
|
|
{
|
|
const char *cur = image;
|
|
if (*cur == '-'
|
|
|| (isdigit(*cur) && *cur != '0')
|
|
|| (*cur == '0' && *(cur + 1) == '\0')) {
|
|
cur++;
|
|
while (isdigit(*cur)) cur++;
|
|
if (*cur == '\0') return 1;
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
/**
|
|
* Checks if a string follows the format for a decimal. Specifically, it checks
|
|
* if the string matches the regular expression: (-?[0-9].[0-9]*).
|
|
*
|
|
* \param [in] image The string to check.
|
|
*
|
|
* \retval 0 \a image does not match the pattern for a decimal.
|
|
*
|
|
* \retval 1 \a image matches the pattern for a decimal.
|
|
*/
|
|
int isFloat(const char *image)
|
|
{
|
|
const char *cur = image;
|
|
if (*cur == '-' || isdigit(*cur)) {
|
|
cur++;
|
|
while (isdigit(*cur)) cur++;
|
|
if (*cur == '.') {
|
|
cur++;
|
|
while (isdigit(*cur)) cur++;
|
|
if (*cur == '\0') return 1;
|
|
}
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
/**
|
|
* Checks if a string follows the format for a string literal. Specifically, it
|
|
* checks if the string matches the regular expression: (".*").
|
|
*
|
|
* \param [in] image The string to check.
|
|
*
|
|
* \retval 0 \a image does not match the pattern for a string.
|
|
*
|
|
* \retval 1 \a image matches the pattern for a string.
|
|
*/
|
|
int isString(const char *image)
|
|
{
|
|
size_t len = strlen(image);
|
|
return (len >= 2 && image[0] == '"' && image[len - 1] == '"');
|
|
}
|
|
|
|
/**
|
|
* Checks if a string follows the format for an identifier. Specifically, it
|
|
* checks if the string matches the regular expression: ([a-zA-Z][a-zA-Z0-9_]*).
|
|
*
|
|
* \param image [in] The string to check.
|
|
*
|
|
* \retval 0 \a image does not match the pattern for an identifier.
|
|
*
|
|
* \retval 1 \a image matches the pattern for an identifier.
|
|
*/
|
|
int isIdentifier(const char *image)
|
|
{
|
|
const char *cur = image;
|
|
/* First character must be alphabetic */
|
|
if (!cur || !isalpha(*cur)) return 0;
|
|
cur++;
|
|
while (*cur) {
|
|
if (isalnum(*cur) || *cur == '_') cur++;
|
|
else return 0;
|
|
}
|
|
return 1;
|
|
}
|
|
|
|
/**
|
|
* Creates a token.
|
|
*
|
|
* \param [in] type The type of token to create.
|
|
*
|
|
* \param [in] image The string that represents the token.
|
|
*
|
|
* \param [in] fname The name of the file containing the token.
|
|
*
|
|
* \param [in] line The number of the line containing the token.
|
|
*
|
|
* \return A pointer to a new token with the desired properties.
|
|
*
|
|
* \retval NULL Memory allocation failed.
|
|
*/
|
|
Token *createToken(TokenType type,
|
|
const char *image,
|
|
const char *fname,
|
|
unsigned int line)
|
|
{
|
|
Token *ret = malloc(sizeof(Token));
|
|
if (!ret) {
|
|
perror("malloc");
|
|
return NULL;
|
|
}
|
|
ret->type = type;
|
|
ret->image = malloc(sizeof(char) * (strlen(image) + 1));
|
|
if (!(ret->image)) {
|
|
free(ret);
|
|
perror("malloc");
|
|
return NULL;
|
|
}
|
|
strcpy(ret->image, image);
|
|
/**
|
|
* \note fname is not copied because only one copy is stored for all
|
|
* Token structures that share it.
|
|
*/
|
|
ret->fname = fname;
|
|
ret->line = line;
|
|
return ret;
|
|
}
|
|
|
|
/**
|
|
* Deletes a token.
|
|
*
|
|
* \param [in,out] token The token to delete.
|
|
*
|
|
* \post The memory at \a token and all of its members will be freed.
|
|
*/
|
|
void deleteToken(Token *token)
|
|
{
|
|
if (!token) return;
|
|
free(token->image);
|
|
free(token);
|
|
}
|
|
|
|
/**
|
|
* Adds a token to a list.
|
|
*
|
|
* \param [in,out] list The list of tokens to add \a token to.
|
|
*
|
|
* \param [in,out] num The number of tokens in \a list.
|
|
*
|
|
* \param [in] token The token to add to \a list.
|
|
*
|
|
* \post \a token will be added to the end of \a list and the size of \a list
|
|
* will be updated.
|
|
*
|
|
* \retval 0 Memory allocation failed.
|
|
*
|
|
* \retval 1 \a token was added to \a list.
|
|
*/
|
|
int addToken(Token ***list,
|
|
unsigned int *num,
|
|
Token *token)
|
|
{
|
|
unsigned int newsize = *num + 1;
|
|
void *mem = realloc(*list, sizeof(Token *) * newsize);
|
|
if (!mem) {
|
|
perror("realloc");
|
|
return 0;
|
|
}
|
|
*list = mem;
|
|
(*list)[*num] = token;
|
|
*num = newsize;
|
|
#ifdef DEBUG
|
|
fprintf(stderr, "Adding token type %d [%s]\n", token->type, token->image);
|
|
#endif
|
|
return 1;
|
|
}
|
|
|
|
/**
|
|
* Deletes a list of tokens.
|
|
*
|
|
* \param list [in,out] The list of tokens to delete.
|
|
*
|
|
* \post The memory at \a list and all of its members will be freed.
|
|
*/
|
|
void deleteTokens(Token **list)
|
|
{
|
|
Token **tok = list;
|
|
while (*tok) {
|
|
deleteToken(*tok);
|
|
tok++;
|
|
}
|
|
free(list);
|
|
}
|
|
|
|
/**
|
|
* Matches lexemes against a string. Traverses \a lexemes starting at \a start
|
|
* and compares lexeme images to space-delimited substrings from \a match.
|
|
*
|
|
* \param lexemes [in] The list of lexemes to match from.
|
|
*
|
|
* \param start [in] The index within \a lexemes to start matching at.
|
|
*
|
|
* \param match [in] A string of space-delimited substrings to match.
|
|
*
|
|
* \return The number of lexemes matched.
|
|
*/
|
|
unsigned int acceptLexemes(LexemeList *lexemes,
|
|
unsigned int start,
|
|
const char *match)
|
|
{
|
|
unsigned int offset = 0;
|
|
unsigned int n;
|
|
unsigned int i;
|
|
for (n = 0, i = 0;
|
|
match[n] || lexemes->lexemes[start + offset]->image[i];
|
|
n++) {
|
|
if (match[n] == ' ') {
|
|
offset++;
|
|
i = 0;
|
|
continue;
|
|
}
|
|
if (lexemes->lexemes[start + offset]->image[i] != match[n])
|
|
return 0;
|
|
i++;
|
|
}
|
|
return offset + 1;
|
|
}
|
|
|
|
/**
|
|
* Checks if the next lexemes in a list comprise a keyword and, if so, generates
|
|
* a new token representing that keyword. Specifically, \a lexemes is searched,
|
|
* starting at \a start for keywords. If one is found, an appropriate token is
|
|
* created and returned and \a start is incremented by the number of lexemes
|
|
* matched minus one.
|
|
*
|
|
* \param lexemes [in] A list of lexemes to search for keywords in.
|
|
*
|
|
* \param start [in,out] The position within \a lexemes to begin searching for
|
|
* keywords.
|
|
*
|
|
* \post If a keyword is not found, \a start will not be modified. Otherwise,
|
|
* \a start will be incremented by the number of lexemes matched minus one.
|
|
*
|
|
* \return A pointer to the token containing the matched keyword.
|
|
*
|
|
* \retval NULL No keywords were found or there was an error allocating memory.
|
|
*/
|
|
Token *isKeyword(LexemeList *lexemes,
|
|
unsigned int *start)
|
|
{
|
|
Token *token = NULL;
|
|
TokenType type;
|
|
const char *fname = lexemes->lexemes[*start]->fname;
|
|
unsigned int line = lexemes->lexemes[*start]->line;
|
|
/* For each keyword, */
|
|
for (type = 0; type != TT_ENDOFTOKENS; type++) {
|
|
/* Check if the start of lexemes match */
|
|
unsigned int num = acceptLexemes(lexemes,
|
|
*start, keywords[type]);
|
|
if (!num) continue;
|
|
/* If so, create a new token for the keyword */
|
|
token = createToken(type, keywords[type], fname, line);
|
|
/* And advance the start */
|
|
*start += (num - 1);
|
|
break;
|
|
}
|
|
return token;
|
|
}
|
|
|
|
/**
|
|
* Converts a list of lexemes into tokens. Also parses integers, floats, and
|
|
* strings into tokens with semantic meaning.
|
|
*
|
|
* \param list [in] A list of lexemes to tokenize.
|
|
*
|
|
* \return A list of tokens generated from \a list.
|
|
*
|
|
* \retval NULL An unrecognized token was encounteres or memory allocation
|
|
* failed.
|
|
*/
|
|
Token **tokenizeLexemes(LexemeList *list)
|
|
{
|
|
void *mem = NULL;
|
|
Token **ret = NULL;
|
|
unsigned int retsize = 0;
|
|
unsigned int n;
|
|
for (n = 0; n < list->num; n++) {
|
|
Lexeme *lexeme = list->lexemes[n];
|
|
const char *image = lexeme->image;
|
|
const char *fname = lexeme->fname;
|
|
unsigned int line = lexeme->line;
|
|
Token *token = NULL;
|
|
/* String */
|
|
if (isString(image)) {
|
|
token = createToken(TT_STRING, image, fname, line);
|
|
}
|
|
/* Float */
|
|
else if (isFloat(image)) {
|
|
token = createToken(TT_FLOAT, image, fname, line);
|
|
if (sscanf(lexeme->image, "%f", &(token->data.f)) != 1)
|
|
error(TK_EXPECTED_FLOATING_POINT, fname, line);
|
|
}
|
|
/* Integer */
|
|
else if (isInteger(image)) {
|
|
token = createToken(TT_INTEGER, image, fname, line);
|
|
if (sscanf(lexeme->image, "%lli", &(token->data.i)) != 1)
|
|
error(TK_EXPECTED_INTEGER, fname, line);
|
|
}
|
|
/* FAIL */
|
|
else if (!strcmp(image, "FAIL")) {
|
|
token = createToken(TT_BOOLEAN, "FAIL", fname, line);
|
|
token->data.i = 0;
|
|
}
|
|
/* WIN */
|
|
else if (!strcmp(image, "WIN")) {
|
|
token = createToken(TT_BOOLEAN, "WIN", fname, line);
|
|
token->data.i = 1;
|
|
}
|
|
/* Newline */
|
|
/* Note that the spec is unclear as to whether a command *must*
|
|
* follow a comma. For now, we let commas end a line. */
|
|
else if (!strcmp(image, "\n")) {
|
|
/* Note that we ignore any initial newlines */
|
|
if (retsize < 1) {
|
|
#ifdef DEBUG
|
|
fprintf(stderr, "Skipping initial newline.\n");
|
|
#endif
|
|
continue;
|
|
}
|
|
else if (ret[retsize - 1]->type == TT_NEWLINE) {
|
|
#ifdef DEBUG
|
|
fprintf(stderr, "Skipping duplicate newline.\n");
|
|
#endif
|
|
continue;
|
|
}
|
|
else {
|
|
token = createToken(TT_NEWLINE, "end of line", fname, line);
|
|
}
|
|
}
|
|
/* Keyword */
|
|
else if ((token = isKeyword(list, &n))) {
|
|
}
|
|
/* Identifier */
|
|
/* This must be placed after keyword parsing or else most
|
|
* keywords would be tokenized as identifiers. */
|
|
else if (isIdentifier(image)) {
|
|
token = createToken(TT_IDENTIFIER, image, fname, line);
|
|
}
|
|
/* EOF */
|
|
else if (!strcmp(image, "$")) {
|
|
token = createToken(TT_EOF, "end of file", fname, line);
|
|
}
|
|
else {
|
|
error(TK_UNKNOWN_TOKEN, fname, line, image);
|
|
/* Clean up */
|
|
deleteToken(ret[retsize - 1]);
|
|
ret[retsize - 1] = NULL;
|
|
deleteTokens(ret);
|
|
return NULL;
|
|
}
|
|
|
|
/* Add the token to the token array */
|
|
if (!addToken(&ret, &retsize, token)) {
|
|
/* Clean up */
|
|
if (token) deleteToken(token);
|
|
deleteToken(ret[retsize - 1]);
|
|
ret[retsize - 1] = NULL;
|
|
deleteTokens(ret);
|
|
return NULL;
|
|
}
|
|
}
|
|
mem = realloc(ret, sizeof(Token *) * ++retsize);
|
|
if (!mem) {
|
|
deleteToken(ret[retsize - 2]);
|
|
ret[retsize - 2] = NULL;
|
|
deleteTokens(ret);
|
|
return NULL;
|
|
}
|
|
ret = mem;
|
|
ret[retsize - 1] = NULL;
|
|
return ret;
|
|
}
|