lci/lexer.c

366 lines
9.1 KiB
C

#include "lexer.h"
/**
 * Creates a lexeme.
 *
 * The lexeme stores its own heap-allocated copy of \a image, so the caller
 * may modify or free \a image after this function returns.
 *
 * \param [in] image The NUL-terminated string that identifies the lexeme.
 *
 * \param [in] fname The name of the file containing the lexeme.
 *
 * \param [in] line The line number the lexeme occurred on.
 *
 * \return A new lexeme with the desired properties.
 *
 * \retval NULL \a image was NULL or memory allocation failed.
 */
Lexeme *createLexeme(char *image, const char *fname, unsigned int line)
{
    Lexeme *ret = NULL;
    size_t bytes;

    /* strlen(NULL) is undefined behavior; reject it up front */
    if (!image) return NULL;

    ret = malloc(sizeof *ret);
    if (!ret) {
        perror("malloc");
        return NULL;
    }

    /* Length including the terminating NUL, computed once */
    bytes = strlen(image) + 1;
    ret->image = malloc(bytes);
    if (!(ret->image)) {
        /* Report before free() so that errno still describes the
         * failed allocation */
        perror("malloc");
        free(ret);
        return NULL;
    }
    memcpy(ret->image, image, bytes);

    /**
     * \note \a fname is not copied because only one copy is stored for
     * all lexemes from the same file. This is simply to avoid large
     * numbers of lexemes storing duplicate file name strings.
     */
    ret->fname = fname;
    ret->line = line;
#ifdef DEBUG
    fprintf(stderr, "Creating lexeme [%s]\n", image);
#endif
    return ret;
}
/**
 * Deletes a lexeme.
 *
 * Releases the lexeme's copy of its image string and then the lexeme
 * structure itself. Passing NULL is allowed and does nothing.
 *
 * \param [in,out] lexeme The lexeme to delete.
 */
void deleteLexeme(Lexeme *lexeme)
{
    if (lexeme != NULL) {
        free(lexeme->image);
        /**
         * \note The file name is deliberately not freed here: it is
         * shared between many lexemes and is released by whoever
         * created the file name string.
         */
        free(lexeme);
    }
}
/**
 * Creates a list of lexemes.
 *
 * The returned list holds zero lexemes and has no backing array yet.
 *
 * \return An empty lexeme list.
 *
 * \retval NULL Memory allocation failed.
 */
LexemeList *createLexemeList(void)
{
    LexemeList *list = malloc(sizeof *list);

    if (list == NULL) {
        perror("malloc");
        return NULL;
    }

    list->lexemes = NULL;
    list->num = 0;
    return list;
}
/**
 * Adds a lexeme to a list of lexemes.
 *
 * The list's array is grown by exactly one slot and \a lexeme is stored in
 * the new last slot. If growing fails, \a list is left unchanged.
 *
 * \param [in,out] list The list of lexemes to add \a lexeme to.
 *
 * \param [in] lexeme The lexeme to add to \a list.
 *
 * \post \a lexeme will be added to the end of \a list and the size of \a list
 *       will be updated.
 *
 * \return A pointer to the added lexeme (will be the same as \a lexeme).
 *
 * \retval NULL \a list was NULL or memory allocation failed.
 */
Lexeme *addLexeme(LexemeList *list, Lexeme *lexeme)
{
    if (list != NULL) {
        unsigned int count = list->num + 1;
        void *grown = realloc(list->lexemes, count * sizeof(Lexeme *));

        if (grown != NULL) {
            list->lexemes = grown;
            list->lexemes[count - 1] = lexeme;
            list->num = count;
            return lexeme;
        }
        perror("realloc");
    }
    return NULL;
}
/**
 * Deletes a list of lexemes.
 *
 * Every lexeme stored in the list is deleted with deleteLexeme(), then the
 * backing array and the list structure are released. Passing NULL is allowed
 * and does nothing.
 *
 * \param [in,out] list The lexeme list to delete.
 *
 * \post The memory at \a list and all of its members will be freed.
 */
void deleteLexemeList(LexemeList *list)
{
    if (list != NULL) {
        unsigned int idx = 0;

        while (idx < list->num) {
            deleteLexeme(list->lexemes[idx]);
            idx++;
        }
        free(list->lexemes);
        free(list);
    }
}
/**
 * Creates a lexeme and appends it to a lexeme list (helper for scanBuffer()).
 *
 * \param [in,out] list The list to append the new lexeme to.
 *
 * \param [in] image The string that identifies the lexeme.
 *
 * \param [in] fname The name of the file containing the lexeme.
 *
 * \param [in] line The line number the lexeme occurred on.
 *
 * \retval 1 The lexeme was created and appended.
 *
 * \retval 0 Creating or appending failed; any lexeme created here has been
 *           deleted again and \a list is left for the caller to clean up.
 */
static int scanBufferAppend(LexemeList *list, char *image, const char *fname, unsigned int line)
{
    Lexeme *lex = createLexeme(image, fname, line);
    if (!lex) return 0;
    if (!addLexeme(list, lex)) {
        deleteLexeme(lex);
        return 0;
    }
    return 1;
}

/**
 * Checks whether a token must end at a given position (helper for
 * scanBuffer()).
 *
 * \param [in] pos The position to examine.
 *
 * \return Nonzero if \a pos is at a NUL byte, whitespace, a comma, a bang, a
 *         question mark, an array access ('Z), or an ellipsis (either three
 *         periods or the UTF-8 encoded ellipsis character); zero otherwise.
 */
static int scanBufferAtDelimiter(const char *pos)
{
    /* The cast keeps isspace() defined for bytes >= 0x80 when char is
     * signed */
    return !*pos
            || isspace((unsigned char)*pos)
            || *pos == ','
            || *pos == '!'
            || *pos == '?'
            || !strncmp(pos, "'Z", 2)
            || !strncmp(pos, "...", 3)
            || !strncmp(pos, "\xE2\x80\xA6", 3);
}

/**
 * Scans a buffer, removing unnecessary characters and grouping characters into
 * lexemes. Lexemes are strings of characters separated by whitespace (although
 * newline characters are considered separate lexemes). String literals are
 * handled a bit differently: Starting at the first quotation character,
 * characters are collected until either a non-escaped quotation character is
 * read (i.e., a quotation character not preceded by a colon which itself is not
 * preceded by a colon) or a newline or carriage return character is read,
 * whichever comes first. This handles the odd (but possible) case of strings
 * such as "::" which print out a single colon. Also handled are the effects of
 * commas, ellipses, bangs (!), question marks (?), and array accesses ('Z).
 *
 * The last lexeme of a successfully scanned buffer is always the end-of-file
 * lexeme "$".
 *
 * \param [in] buffer The characters to turn into lexemes.
 *
 * \param [in] size The number of characters in \a buffer.
 *
 * \param [in] fname The name of the file \a buffer was read from.
 *
 * \pre Scanning also stops at a NUL byte and looks a few bytes ahead of the
 *      current position, so \a buffer is expected to be NUL-terminated.
 *
 * \note An OBTW comment with no closing TLDR is treated as running to the
 *       end of the input.
 *
 * \return A list of lexemes created from the contents of \a buffer.
 *
 * \retval NULL Memory allocation failed or a lexical error was reported.
 */
LexemeList *scanBuffer(const char *buffer, unsigned int size, const char *fname)
{
    const char *start = buffer;
    unsigned int line = 1;
    LexemeList *list = createLexemeList();
    if (!list) return NULL;
    while (start < buffer + size) {
        char *temp = NULL;
        unsigned int len = 1;
        int appended;
        /* Comma (,) is a soft newline */
        if (*start == ',') {
            if (!scanBufferAppend(list, "\n", fname, line)) goto fail;
            start++;
            continue;
        }
        /* Bang (!) is its own lexeme */
        if (*start == '!') {
            if (!scanBufferAppend(list, "!", fname, line)) goto fail;
            start++;
            continue;
        }
        /* Apostrophe Z ('Z) is its own lexeme */
        if (!strncmp(start, "'Z", 2)) {
            if (!scanBufferAppend(list, "'Z", fname, line)) goto fail;
            start += 2;
            continue;
        }
        /* Question mark (?) is its own lexeme */
        if (*start == '?') {
            if (!scanBufferAppend(list, "?", fname, line)) goto fail;
            start++;
            continue;
        }
        /* Skip over whitespace one character (or one "\r\n" pair) at a
         * time, restarting the loop afterwards so that a delimiter
         * following whitespace is still recognized by the checks
         * above */
        if (isspace((unsigned char)*start)) {
            int newline = 0;
            /* Newline is its own lexeme */
            if (!strncmp(start, "\r\n", 2)) {
                newline = 1;
                start++;
            }
            else if (*start == '\r' || *start == '\n') {
                newline = 1;
            }
            if (newline) {
                if (!scanBufferAppend(list, "\n", fname, line)) goto fail;
                line++;
            }
            start++;
            continue;
        }
        /* Skip over ellipses (...) and newline */
        if (!strncmp(start, "\xE2\x80\xA6", 3)
                || !strncmp(start, "...", 3)) {
            /* Both ellipsis spellings are three bytes long; only an
             * ellipsis directly followed by a newline continues the
             * line */
            unsigned int skip = 0;
            if (!strncmp(start + 3, "\r\n", 2)) skip = 5;
            else if (start[3] == '\r' || start[3] == '\n') skip = 4;
            if (skip) {
                const char *test = NULL;
                start += skip;
                test = start;
                /* Make sure next line is not empty */
                while (*test && isspace((unsigned char)*test)) {
                    if (*test == '\r' || *test == '\n') {
                        error(LX_LINE_CONTINUATION, fname, line);
                        goto fail;
                    }
                    test++;
                }
                /* A newline was consumed, so later lexemes are on
                 * the next line */
                line++;
                continue;
            }
        }
        /* Skip over comments; OBTW only opens a comment at the start
         * of a (soft) line */
        if ((list->num == 0
                || *(list->lexemes[list->num - 1]->image) == '\n')
                && !strncmp(start, "OBTW", 4)) {
            start += 4;
            /* Stop at the NUL terminator so an unterminated comment
             * cannot run off the end of the buffer */
            while (*start && strncmp(start, "TLDR", 4)) {
                if (!strncmp(start, "\r\n", 2)) {
                    start += 2;
                    line++;
                }
                else if (*start == '\r' || *start == '\n') {
                    start++;
                    line++;
                }
                else {
                    start++;
                }
            }
            /* No closing TLDR: the comment runs to the end of input */
            if (!*start) break;
            start += 4;
            /* Must end in newline */
            while (*start && isspace((unsigned char)*start)
                    && *start != '\r' && *start != '\n')
                start++;
            /* NOTE(review): start == buffer cannot hold here because
             * start has already advanced; possibly end of input was
             * meant to be accepted as well -- confirm */
            if (start == buffer || *start == ',' || *start == '\r' || *start == '\n')
                continue;
            error(LX_MULTIPLE_LINE_COMMENT, fname, line);
            goto fail;
        }
        if (!strncmp(start, "BTW", 3)) {
            /* Single-line comment: discard up to (not including) the
             * end of the line */
            start += 3;
            while (*start && *start != '\r' && *start != '\n')
                start++;
            continue;
        }
        /* We have removed or processed any leading characters at this
         * point */
        if (!*start) break;
        if (*start == '"') {
            /* Find the end of the string, watching for escape
             * sequences. start[len - 2] is only read when
             * start[len - 1] is a colon, which cannot be the opening
             * quotation character, so the read never precedes
             * start. */
            while ((start[len]
                    && start[len] != '\r'
                    && start[len] != '\n'
                    && start[len] != '"')
                    || (start[len] == '"'
                    && start[len - 1] == ':'
                    && start[len - 2] != ':'))
                len++;
            if (start[len] == '"') len++;
            /* Make sure this is the end of the token */
            if (!scanBufferAtDelimiter(start + len)) {
                error(LX_EXPECTED_TOKEN_DELIMITER, fname, line);
                goto fail;
            }
        }
        else {
            /* Scan for the end of the token */
            while (!scanBufferAtDelimiter(start + len))
                len++;
        }
        /* createLexeme() needs a NUL-terminated string, so copy the
         * token out of the buffer first */
        temp = malloc(sizeof(char) * (len + 1));
        if (!temp) {
            perror("malloc");
            goto fail;
        }
        memcpy(temp, start, len);
        temp[len] = '\0';
        appended = scanBufferAppend(list, temp, fname, line);
        free(temp);
        if (!appended) goto fail;
        start += len;
    }
    /* Create an end-of-file lexeme */
    if (!scanBufferAppend(list, "$", fname, line)) goto fail;
    return list;

fail:
    /* Single cleanup path for every allocation or lexical error */
    deleteLexemeList(list);
    return NULL;
}