#include "lexer.h"

/**
 * Creates a lexeme.
 *
 * \param [in] image The string that identifies the lexeme.  It is copied, so
 *                   the caller keeps ownership of its own buffer.
 *
 * \param [in] fname The name of the file containing the lexeme.
 *
 * \param [in] line The line number the lexeme occurred on.
 *
 * \return A new lexeme with the desired properties.
 *
 * \retval NULL Memory allocation failed or \a image was NULL.
 */
Lexeme *createLexeme(char *image, const char *fname, unsigned int line)
{
	Lexeme *ret = NULL;
	size_t len;

	if (!image) return NULL;

	ret = malloc(sizeof *ret);
	if (!ret) {
		perror("malloc");
		return NULL;
	}

	len = strlen(image);
	ret->image = malloc(len + 1);
	if (!ret->image) {
		/* Report before freeing so that free() cannot disturb errno. */
		perror("malloc");
		free(ret);
		return NULL;
	}
	memcpy(ret->image, image, len + 1);

	/**
	 * \note \a fname is not copied because only one copy is stored for
	 * all lexemes from the same file.  This is simply to avoid large
	 * numbers of lexemes storing duplicate file name strings.
	 */
	ret->fname = fname;
	ret->line = line;
#ifdef DEBUG
	fprintf(stderr, "Creating lexeme [%s]\n", image);
#endif
	return ret;
}

/**
 * Deletes a lexeme.
 *
 * \param [in,out] lexeme The lexeme to delete.  NULL is ignored.
 */
void deleteLexeme(Lexeme *lexeme)
{
	if (!lexeme) return;
	free(lexeme->image);
	/**
	 * \note We do not free the file name because it is shared between many
	 * lexemes and is freed by whomever created the file name string.
	 */
	free(lexeme);
}

/**
 * Creates a list of lexemes.
 *
 * \return An empty lexeme list.
 *
 * \retval NULL Memory allocation failed.
 */
LexemeList *createLexemeList(void)
{
	LexemeList *p = malloc(sizeof *p);
	if (!p) {
		perror("malloc");
		return NULL;
	}
	p->num = 0;
	p->lexemes = NULL;
	return p;
}

/**
 * Adds a lexeme to a list of lexemes.
 *
 * \param [in,out] list The list of lexemes to add \a lexeme to.
 *
 * \param [in] lexeme The lexeme to add to \a list.
 *
 * \post \a lexeme will be added to the end of \a list and the size of \a list
 * will be updated.  On failure \a list is left unchanged and the caller still
 * owns \a lexeme.
 *
 * \return A pointer to the added lexeme (will be the same as \a lexeme).
 *
 * \retval NULL Memory allocation failed, or \a list or \a lexeme was NULL.
 */
Lexeme *addLexeme(LexemeList *list, Lexeme *lexeme)
{
	unsigned int newsize;
	void *mem = NULL;

	/* A NULL entry could not be told apart from failure by the caller and
	 * would later be dereferenced by scanBuffer(), so refuse to store it. */
	if (!list || !lexeme) return NULL;

	newsize = list->num + 1;
	/* Keep the old array reachable until realloc() is known to succeed. */
	mem = realloc(list->lexemes, sizeof *list->lexemes * newsize);
	if (!mem) {
		perror("realloc");
		return NULL;
	}
	list->lexemes = mem;
	list->lexemes[list->num] = lexeme;
	list->num = newsize;
	return lexeme;
}

/**
 * Deletes a list of lexemes.
 *
 * \param [in,out] list The lexeme list to delete.  NULL is ignored.
 *
 * \post The memory at \a list and all of its members will be freed.
 */
void deleteLexemeList(LexemeList *list)
{
	unsigned int n;
	if (!list) return;
	for (n = 0; n < list->num; n++)
		deleteLexeme(list->lexemes[n]);
	free(list->lexemes);
	free(list);
}

/**
 * Creates a lexeme from \a image and appends it to \a list.
 *
 * \retval 1 The lexeme was created and appended.
 *
 * \retval 0 Creation or insertion failed; \a list is unchanged and nothing is
 *           leaked.
 */
static int emitLexeme(LexemeList *list, char *image, const char *fname,
		unsigned int line)
{
	Lexeme *lex = createLexeme(image, fname, line);
	if (!lex) return 0;
	if (!addLexeme(list, lex)) {
		deleteLexeme(lex);
		return 0;
	}
	return 1;
}

/**
 * Checks whether the text at \a s ends the current token: end of string,
 * whitespace, a comma, a bang, a question mark, an array access ('Z) or an
 * ellipsis (ASCII "..." or UTF-8 U+2026).
 *
 * \retval 1 \a s points at a token delimiter.
 *
 * \retval 0 \a s points at an ordinary token character.
 */
static int atTokenDelimiter(const char *s)
{
	/* The cast is required: passing a negative char (such as a UTF-8 lead
	 * byte where char is signed) to isspace() is undefined behavior. */
	return !*s
			|| isspace((unsigned char)*s)
			|| *s == ','
			|| *s == '!'
			|| *s == '?'
			|| !strncmp(s, "'Z", 2)
			|| !strncmp(s, "...", 3)
			|| !strncmp(s, "\xE2\x80\xA6", 3);
}

/**
 * Measures a line continuation at \a s: an ellipsis (ASCII "..." or UTF-8
 * U+2026) immediately followed by "\r\n", "\r" or "\n".
 *
 * \return The number of bytes occupied by the ellipsis and its newline.
 *
 * \retval 0 \a s does not start a line continuation.
 */
static unsigned int continuationLength(const char *s)
{
	/* Both spellings of the ellipsis are three bytes long. */
	const unsigned int dots = 3;

	if (strncmp(s, "\xE2\x80\xA6", 3) && strncmp(s, "...", 3))
		return 0;
	if (!strncmp(s + dots, "\r\n", 2))
		return dots + 2;
	if (s[dots] == '\r' || s[dots] == '\n')
		return dots + 1;
	return 0;
}

/**
 * Scans a buffer, removing unnecessary characters and grouping characters into
 * lexemes.  Lexemes are strings of characters separated by whitespace (although
 * newline characters are considered separate lexemes).  String literals are
 * handled a bit differently: Starting at the first quotation character,
 * characters are collected until either a non-escaped quotation character is
 * read (i.e., a quotation character not preceded by a colon which itself is not
 * preceded by a colon) or a newline or carriage return character is read,
 * whichever comes first.  This handles the odd (but possible) case of strings
 * such as "::" which print out a single colon.  Also handled are the effects of
 * commas, ellipses, bangs (!), question marks (?), and array accesses ('Z).
 *
 * Scanning also stops at the first NUL byte, and the look-ahead comparisons
 * read a few bytes past the current position, so \a buffer is expected to be
 * NUL-terminated after its \a size characters.
 *
 * \param [in] buffer The characters to turn into lexemes.
 *
 * \param [in] size The number of characters in \a buffer.
 *
 * \param [in] fname The name of the file \a buffer was read from.
 *
 * \return A list of lexemes created from the contents of \a buffer, always
 *         terminated by an end-of-file lexeme ("$").
 *
 * \retval NULL Memory allocation failed or a lexical error was reported
 *              through error().
 */
LexemeList *scanBuffer(const char *buffer, unsigned int size, const char *fname)
{
	const char *start = buffer;
	const char *end = buffer + size;
	LexemeList *list = NULL;
	unsigned int line = 1;

	list = createLexemeList();
	if (!list) return NULL;

	while (start < end) {
		char *temp = NULL;
		unsigned int len = 1;
		unsigned int skip;
		int ok;

		/* Comma (,) is a soft newline */
		if (*start == ',') {
			if (!emitLexeme(list, "\n", fname, line)) goto fail;
			start++;
			continue;
		}

		/* Bang (!) is its own lexeme */
		if (*start == '!') {
			if (!emitLexeme(list, "!", fname, line)) goto fail;
			start++;
			continue;
		}

		/* Apostrophe Z ('Z) is its own lexeme */
		if (!strncmp(start, "'Z", 2)) {
			if (!emitLexeme(list, "'Z", fname, line)) goto fail;
			start += 2;
			continue;
		}

		/* Question mark (?) is its own lexeme */
		if (*start == '?') {
			if (!emitLexeme(list, "?", fname, line)) goto fail;
			start++;
			continue;
		}

		/* Skip over leading whitespace */
		if (isspace((unsigned char)*start)) {
			while (start < end && isspace((unsigned char)*start)) {
				int newline = 0;
				/* Newline is its own lexeme; "\r\n" counts
				 * as a single newline */
				if (!strncmp(start, "\r\n", 2)) {
					newline = 1;
					start++;
				}
				else if (*start == '\r' || *start == '\n') {
					newline = 1;
				}
				if (newline) {
					if (!emitLexeme(list, "\n", fname, line))
						goto fail;
					line++;
				}
				start++;
			}
			/* Start over so that a comma, bang, question mark or
			 * 'Z that follows whitespace is recognized as its own
			 * lexeme instead of being glued to the next token. */
			continue;
		}

		/* Skip over ellipses (...) and newline */
		skip = continuationLength(start);
		if (skip) {
			const char *test;
			start += skip;
			/* Make sure next line is not empty */
			for (test = start; *test && isspace((unsigned char)*test);
					test++) {
				if (*test == '\r' || *test == '\n') {
					error(LX_LINE_CONTINUATION, fname, line);
					goto fail;
				}
			}
			/* The newline was consumed without producing a lexeme
			 * but it still begins a new physical line. */
			line++;
			continue;
		}

		/* Skip over multiple line comments, which may only begin at
		 * the start of a line */
		if ((list->num == 0
				|| *(list->lexemes[list->num - 1]->image) == '\n')
				&& !strncmp(start, "OBTW", 4)) {
			start += 4;
			while (start < end && *start && strncmp(start, "TLDR", 4)) {
				if (!strncmp(start, "\r\n", 2)) {
					start += 2;
					line++;
				}
				else {
					if (*start == '\r' || *start == '\n')
						line++;
					start++;
				}
			}
			/* An unterminated comment extends to the end of the
			 * input; never read past the buffer looking for TLDR. */
			if (start >= end || !*start) break;
			start += 4;
			/* Must end in newline (or a soft newline) */
			while (*start && isspace((unsigned char)*start)
					&& *start != '\r' && *start != '\n')
				start++;
			if (*start == ',' || *start == '\r' || *start == '\n')
				continue;
			error(LX_MULTIPLE_LINE_COMMENT, fname, line);
			goto fail;
		}

		/* Skip over single line comments; the newline itself is left
		 * in place so that it still produces a lexeme */
		if (!strncmp(start, "BTW", 3)) {
			start += 3;
			while (*start && *start != '\r' && *start != '\n')
				start++;
			continue;
		}

		/* We have removed or processed any leading characters at this
		 * point */
		if (!*start) break;

		if (*start == '"') {
			/* Find the end of the string, watching for escape
			 * sequences.  start[len - 2] is only inspected when
			 * start[len - 1] is a colon, which cannot happen for
			 * len == 1 because start[0] is the opening quote. */
			while ((start[len]
					&& start[len] != '\r'
					&& start[len] != '\n'
					&& start[len] != '"')
					|| (start[len] == '"'
					&& start[len - 1] == ':'
					&& start[len - 2] != ':'))
				len++;
			if (start[len] == '"') len++;
			/* Make sure this is the end of the token */
			if (!atTokenDelimiter(start + len)) {
				error(LX_EXPECTED_TOKEN_DELIMITER, fname, line);
				goto fail;
			}
		}
		else {
			/* Scan for the end of the token */
			while (!atTokenDelimiter(start + len))
				len++;
		}

		/* createLexeme() needs a NUL-terminated image, so copy the
		 * token out of the buffer first.  The first len characters
		 * are known to be non-NUL, so memcpy() is sufficient. */
		temp = malloc(len + 1);
		if (!temp) {
			perror("malloc");
			goto fail;
		}
		memcpy(temp, start, len);
		temp[len] = '\0';
		ok = emitLexeme(list, temp, fname, line);
		free(temp);
		if (!ok) goto fail;
		start += len;
	}

	/* Create an end-of-file lexeme */
	if (!emitLexeme(list, "$", fname, line)) goto fail;
	return list;

fail:
	deleteLexemeList(list);
	return NULL;
}