356 lines
11 KiB
C
356 lines
11 KiB
C
#include "tokenizer.h"
|
|
|
|
/** Checks if a string of characters follows the format for an integer.
 * Specifically, it checks if the string of characters matches the regular
 * expression: [-]?[1-9][0-9]* | 0
 *
 * A sign must be followed by at least one digit, and the only literal allowed
 * to begin with a zero is "0" itself, so "-", "-0" and "007" are all rejected.
 *
 * \note tokenizeLexemes(LexemeList *) reads accepted integers with the "%i"
 *       conversion, for which a leading zero selects octal.
 *
 * \retval 0 The string of characters is not an integer.
 * \retval 1 The string of characters is an integer.
 *
 * \see isFloat(const char *)
 * \see isString(const char *)
 * \see isIdentifier(const char *) */
int isInteger(const char *image) /**< [in] The string of characters to compare. */
{
	const char *cur = image;
	/* A lone zero is the only integer that may start with '0' */
	if (cur[0] == '0' && cur[1] == '\0') return 1;
	/* Optional sign */
	if (*cur == '-') cur++;
	/* The first digit is mandatory and must be nonzero; the cast keeps
	 * isdigit() well defined for bytes outside the ASCII range */
	if (*cur == '0' || !isdigit((unsigned char)*cur)) return 0;
	while (isdigit((unsigned char)*cur)) cur++;
	/* Anything left over means the string is not purely digits */
	return *cur == '\0';
}
|
|
|
|
/** Checks if a string of characters follows the format for a floating
 * point decimal. Specifically, it checks if the string of characters matches
 * the regular expression: [-]?[0-9]+\.[0-9]*
 *
 * At least one digit is required before the decimal point, so "-." and "-.5"
 * are rejected; digits after the decimal point are optional ("1." is valid).
 *
 * \retval 0 The string of characters is not a floating point decimal.
 * \retval 1 The string of characters is a floating point decimal.
 *
 * \see isInteger(const char *)
 * \see isString(const char *)
 * \see isIdentifier(const char *) */
int isFloat(const char *image) /**< [in] The string of characters to compare. */
{
	const char *cur = image;
	/* Optional sign */
	if (*cur == '-') cur++;
	/* Integral part: one or more digits (the cast keeps isdigit() well
	 * defined for bytes outside the ASCII range) */
	if (!isdigit((unsigned char)*cur)) return 0;
	while (isdigit((unsigned char)*cur)) cur++;
	/* Mandatory decimal point */
	if (*cur != '.') return 0;
	cur++;
	/* Fractional part: zero or more digits */
	while (isdigit((unsigned char)*cur)) cur++;
	return *cur == '\0';
}
|
|
|
|
/** Checks if a string of characters follows the format for a string.
 * A string is at least two characters long and both its first and its last
 * character are a double quote. The characters in between are not inspected.
 *
 * \retval 0 The string of characters is not a string.
 * \retval 1 The string of characters is a string.
 *
 * \see isInteger(const char *)
 * \see isFloat(const char *)
 * \see isIdentifier(const char *) */
int isString(const char *image) /**< [in] The string of characters to compare. */
{
	const size_t length = strlen(image);
	/* Too short to hold both an opening and a closing quote */
	if (length < 2) return 0;
	if (image[0] != '"') return 0;
	return image[length - 1] == '"';
}
|
|
|
|
/** Checks if a string of characters follows the format for an identifier.
 * Specifically, it checks if the string of characters matches the regular
 * expression: [a-zA-Z][a-zA-Z0-9_]*
 *
 * A NULL \a image is treated as not being an identifier.
 *
 * \retval 0 The string of characters is not an identifier.
 * \retval 1 The string of characters is an identifier.
 *
 * \see isInteger(const char *)
 * \see isFloat(const char *)
 * \see isString(const char *) */
int isIdentifier(const char *image) /**< [in] The string of characters to compare. */
{
	const char *cur = image;
	/* First character must be alphabetic. The cast to unsigned char keeps the
	 * <ctype.h> calls well defined for bytes outside the ASCII range. */
	if (!cur || !isalpha((unsigned char)*cur)) return 0;
	cur++;
	/* Every remaining character must be alphanumeric or an underscore */
	while (*cur) {
		if (isalnum((unsigned char)*cur) || *cur == '_') cur++;
		/* Proposed LOLCODE Version 1.3 identifiers
		 * Remember to update expression: [a-zA-Z][a-zA-Z0-9]*([!!|!?][a-zA-Z][a-zA-Z0-9]*)*
		else if (*cur == '!' && *(cur + 1) && *(cur + 1) == '!') cur += 2;
		else if (*cur == '!' && *(cur + 1) && *(cur + 1) == '?') cur += 2;
		*/
		else return 0;
	}
	return 1;
}
|
|
|
|
/** Creates a Token structure.
|
|
*
|
|
* \return A pointer to a Token structure with the desired properties.
|
|
*
|
|
* \retval NULL malloc was unable to allocate memory.
|
|
*
|
|
* \see deleteToken(Token *) */
|
|
Token *createToken(TokenType type, /**< [in] The type of token to create. */
|
|
const char *image, /**< [in] The characters from the source file that represent the token. */
|
|
const char *fname, /**< [in] A pointer to the name of the file containing the token. */
|
|
unsigned int line) /**< [in] The line number from the source file that the token occurred on. */
|
|
{
|
|
Token *ret = malloc(sizeof(Token));
|
|
if (!ret) {
|
|
perror("malloc");
|
|
return NULL;
|
|
}
|
|
ret->type = type;
|
|
ret->image = malloc(sizeof(char) * (strlen(image) + 1));
|
|
if (!(ret->image)) {
|
|
free(ret);
|
|
perror("malloc");
|
|
return NULL;
|
|
}
|
|
strcpy(ret->image, image);
|
|
/** \note fname is not copied because it would only one copy is stored
|
|
* for all Token structures that share it. */
|
|
ret->fname = fname;
|
|
ret->line = line;
|
|
return ret;
|
|
}
|
|
|
|
/** Deletes a Token structure.
|
|
*
|
|
* \pre \a token points to a Token structure created by createToken(TokenType, const char *, const char *, unsigned int).
|
|
*
|
|
* \post The memory at \a token and all of its elements will be freed.
|
|
*
|
|
* \see createToken(TokenType, const char *, const char *, unsigned int) */
|
|
void deleteToken(Token *token)
|
|
{
|
|
if (!token) return;
|
|
free(token->image);
|
|
free(token);
|
|
}
|
|
|
|
/** Adds a Token to an array of Token structures.
|
|
*
|
|
* \note \a list may be NULL in which case a new list is created.
|
|
*
|
|
* \pre \a num is the number of elements in \a list.
|
|
*
|
|
* \post \a token will be added on to the end of \a list and the value at \a num
|
|
* will be updated accordingly.
|
|
*
|
|
* \return A pointer to the added Token structure (will be the same as \a token).
|
|
*
|
|
* \retval NULL realloc was unable to allocate memory.
|
|
*
|
|
* \see deleteTokens(Token **) */
|
|
Token *addToken(Token ***list, /**< [in,out] A pointer to a pointer to an array of Token structures to add the new Token onto. */
|
|
unsigned int *num, /**< [in,out] A pointer to the number of elements in \a list. */
|
|
Token *token) /**< [in] A pointer to the Token structure to add to \a list. */
|
|
{
|
|
unsigned int newsize = *num + 1;
|
|
void *mem = realloc(*list, sizeof(Token *) * newsize);
|
|
if (!mem) {
|
|
perror("realloc");
|
|
return NULL;
|
|
}
|
|
*list = mem;
|
|
(*list)[*num] = token;
|
|
*num = newsize;
|
|
#ifdef DEBUG
|
|
fprintf(stderr, "Adding token type %d [%s]\n", token->type, token->image);
|
|
#endif
|
|
return token;
|
|
}
|
|
|
|
/** Deletes an array of Token structures.
|
|
*
|
|
* \pre \a list was created by and contains items added by addToken(Token ***, unsigned int *, TokenType, const char *, unsigned int).
|
|
*
|
|
* \post The memory at \a list and all of its elements will be freed.
|
|
*
|
|
* \see addToken(Token ***, unsigned int *, TokenType, const char *, unsigned int) */
|
|
void deleteTokens(Token **list) /**< [in,out] A pointer to an array of Token structures to be deleted. */
|
|
{
|
|
Token **tok = list;
|
|
while (*tok) {
|
|
deleteToken(*tok);
|
|
tok++;
|
|
}
|
|
free(list);
|
|
}
|
|
|
|
/** Tries to match a sequence of lexemes. Scans through \a lexemes starting at
|
|
* \a start and tries to match space-delimited lexemes from \a match.
|
|
*
|
|
* \pre \a lexemes was created by scanBuffer(const char *, unsigned int, const char *).
|
|
*
|
|
* \return The number of lexemes matched. */
|
|
unsigned int acceptLexemes(LexemeList *lexemes, /**< [in] A pointer to a LexemeList structure to match lexemes from. */
|
|
unsigned int start, /**< [in] The position within \a lexemes to start matching at. */
|
|
const char *match) /**< [in] A pointer to a character array describing the sequence of lexemes to match. */
|
|
{
|
|
unsigned int offset = 0;
|
|
unsigned int n;
|
|
unsigned int i;
|
|
for (n = 0, i = 0; match[n] || lexemes->lexemes[start + offset]->image[i]; n++, i++) {
|
|
if (match[n] == ' ') {
|
|
offset++;
|
|
i = -1;
|
|
continue;
|
|
}
|
|
if (lexemes->lexemes[start + offset]->image[i] != match[n])
|
|
return 0;
|
|
}
|
|
return offset + 1;
|
|
}
|
|
|
|
/** Checks if a sequence of lexemes is a keyword. \a lexemes is searched
|
|
* starting at \a start for keywords. If one is found, the appropriate Token
|
|
* structure is created and returned and the value of \a start is incremented
|
|
* by the number of lexemes matched minus one.
|
|
*
|
|
* \pre \a lexemes was created by scanBuffer(const char *, unsigned int, const char *).
|
|
*
|
|
* \post If a keyword is not found, \a start will be unmodified. Otherwise,
|
|
* \a start will be incremented by the number of lexemes matched minus
|
|
* one.
|
|
*
|
|
* \return A pointer to a newly created keyword Token structure.
|
|
*
|
|
* \retval NULL No keywords were matched or there was an error allocating
|
|
* memory. */
|
|
Token *isKeyword(LexemeList *lexemes, /**< [in] A pointer to a LexemeList structure to search for keywords in. */
|
|
unsigned int *start) /**< [in,out] A pointer to the position within \a lexemes to start checking at. */
|
|
{
|
|
Token *token = NULL;
|
|
TokenType type;
|
|
const char *fname = lexemes->lexemes[*start]->fname;
|
|
unsigned int line = lexemes->lexemes[*start]->line;
|
|
for (type = 0; type != TT_ENDOFTOKENS; type++) {
|
|
int num = acceptLexemes(lexemes, *start, keywords[type]);
|
|
if (!num) continue;
|
|
token = createToken(type, keywords[type], fname, line);
|
|
*start += (num - 1);
|
|
break;
|
|
}
|
|
return token;
|
|
}
|
|
|
|
/** Converts a list of lexemes into tokens. Additionally parses the literal
|
|
* values of integers, floating point decimals, and strings.
|
|
*
|
|
* \pre \a list was created by scanBuffer(const char *, unsigned int, const char *).
|
|
*
|
|
* \return A pointer to an array of Token structures representing the tokenized
|
|
* form of the input lexeme stream.
|
|
*
|
|
* \retval NULL An unrecognized token was encountered or memory allocation
|
|
* failed. */
|
|
Token **tokenizeLexemes(LexemeList *list) /**< [in] A pointer to a LexemeList structure to tokenize. */
|
|
{
|
|
void *mem = NULL;
|
|
Token **ret = NULL;
|
|
unsigned int retsize = 0;
|
|
unsigned int n;
|
|
for (n = 0; n < list->num; n++) {
|
|
Lexeme *lexeme = list->lexemes[n];
|
|
const char *image = lexeme->image;
|
|
const char *fname = lexeme->fname;
|
|
unsigned int line = lexeme->line;
|
|
Token *token = NULL;
|
|
/* String */
|
|
if (isString(image)) {
|
|
token = createToken(TT_STRING, image, fname, line);
|
|
}
|
|
/* Float */
|
|
else if (isFloat(image)) {
|
|
token = createToken(TT_FLOAT, image, fname, line);
|
|
sscanf(lexeme->image, "%f", &(token->data.f));
|
|
}
|
|
/* Integer */
|
|
else if (isInteger(image)) {
|
|
token = createToken(TT_INTEGER, image, fname, line);
|
|
sscanf(lexeme->image, "%i", &(token->data.i));
|
|
}
|
|
/* FAIL */
|
|
else if (!strcmp(image, "FAIL")) {
|
|
token = createToken(TT_BOOLEAN, "FAIL", fname, line);
|
|
token->data.i = 0;
|
|
}
|
|
/* WIN */
|
|
else if (!strcmp(image, "WIN")) {
|
|
token = createToken(TT_BOOLEAN, "WIN", fname, line);
|
|
token->data.i = 1;
|
|
}
|
|
/* CAN HAS STDIO? */
|
|
else if (n < list->num - 2
|
|
&& !strcmp(lexeme->image, "CAN")
|
|
&& !strcmp(list->lexemes[n + 1]->image, "HAS")
|
|
&& !strcmp(list->lexemes[n + 2]->image, "STDIO?")) {
|
|
n += 2;
|
|
/* Just for fun; not actually in spec */
|
|
continue;
|
|
}
|
|
/* Newline */
|
|
/* Note that the spec is unclear as to whether a command *must* follow
|
|
* a comma. For now, we let commas end a line. */
|
|
else if (!strcmp(image, "\n")) {
|
|
/* Note that we ignore any initial newlines */
|
|
if (retsize < 1) {
|
|
#ifdef DEBUG
|
|
fprintf(stderr, "Skipping initial newline.\n");
|
|
#endif
|
|
continue;
|
|
}
|
|
else if (ret[retsize - 1]->type == TT_NEWLINE) {
|
|
#ifdef DEBUG
|
|
fprintf(stderr, "Skipping duplicate newline.\n");
|
|
#endif
|
|
continue;
|
|
}
|
|
else {
|
|
token = createToken(TT_NEWLINE, "end of line", fname, line);
|
|
}
|
|
}
|
|
/* Keyword */
|
|
else if ((token = isKeyword(list, &n))) {
|
|
}
|
|
/* Identifier */
|
|
/* This must be placed after keyword parsing because most
|
|
* keywords look like identifiers. */
|
|
else if (isIdentifier(image)) {
|
|
token = createToken(TT_IDENTIFIER, image, fname, line);
|
|
}
|
|
/* EOF */
|
|
else if (!strcmp(image, "$")) {
|
|
token = createToken(TT_EOF, "end of file", fname, line);
|
|
}
|
|
else {
|
|
fprintf(stderr, "%s:%d: unknown token at: %s\n", fname, line, image);
|
|
/* Clean up */
|
|
deleteToken(ret[retsize - 1]);
|
|
ret[retsize - 1] = NULL;
|
|
deleteTokens(ret);
|
|
return NULL;
|
|
}
|
|
addToken(&ret, &retsize, token);
|
|
}
|
|
mem = realloc(ret, sizeof(Token *) * ++retsize);
|
|
if (!mem) {
|
|
deleteToken(ret[retsize - 2]);
|
|
ret[retsize - 2] = NULL;
|
|
deleteTokens(ret);
|
|
return NULL;
|
|
}
|
|
ret = mem;
|
|
ret[retsize - 1] = NULL;
|
|
return ret;
|
|
}
|