2010-08-09 07:01:59 +00:00
|
|
|
#include "lexer.h"
|
|
|
|
|
2011-06-15 06:54:12 +00:00
|
|
|
/**
|
|
|
|
* Creates a lexeme.
|
|
|
|
*
|
|
|
|
* \param [in] image The string that identifies the lexeme.
|
|
|
|
*
|
|
|
|
* \param [in] fname The name of the file containing the lexeme.
|
|
|
|
*
|
|
|
|
* \param [in] line The line number the lexeme occurred on.
|
|
|
|
*
|
|
|
|
* \return A new lexeme with the desired properties.
|
|
|
|
*
|
|
|
|
* \retval NULL Memory allocation failed.
|
|
|
|
*/
|
|
|
|
Lexeme *createLexeme(char *image, const char *fname, unsigned int line)
|
2010-08-09 07:01:59 +00:00
|
|
|
{
|
|
|
|
Lexeme *ret = malloc(sizeof(Lexeme));
|
|
|
|
if (!ret) {
|
|
|
|
perror("malloc");
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
ret->image = malloc(sizeof(char) * (strlen(image) + 1));
|
|
|
|
if (!(ret->image)) {
|
|
|
|
free(ret);
|
|
|
|
perror("malloc");
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
strcpy(ret->image, image);
|
2011-06-15 06:54:12 +00:00
|
|
|
/**
|
|
|
|
* \note \a fname is not copied because it only one copy is stored for
|
|
|
|
* all lexemes from the same file. This is simply to avoid large
|
|
|
|
* numbers of lexemes storing duplicate file name strings.
|
|
|
|
*/
|
2010-08-09 07:01:59 +00:00
|
|
|
ret->fname = fname;
|
|
|
|
ret->line = line;
|
|
|
|
#ifdef DEBUG
|
|
|
|
fprintf(stderr, "Creating lexeme [%s]\n", image);
|
|
|
|
#endif
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2011-06-15 06:54:12 +00:00
|
|
|
/**
|
|
|
|
* Deletes a lexeme.
|
|
|
|
*
|
|
|
|
* \param [in,out] lexeme The lexeme to delete.
|
|
|
|
*/
|
2010-08-09 07:01:59 +00:00
|
|
|
void deleteLexeme(Lexeme *lexeme)
|
|
|
|
{
|
|
|
|
if (!lexeme) return;
|
|
|
|
free(lexeme->image);
|
2011-06-15 06:54:12 +00:00
|
|
|
/**
|
|
|
|
* \note We do not free the file name because it is shared between many
|
|
|
|
* lexemes and is freed by whomever created the file name string.
|
|
|
|
*/
|
2010-08-09 07:01:59 +00:00
|
|
|
free(lexeme);
|
|
|
|
}
|
|
|
|
|
2011-06-15 06:54:12 +00:00
|
|
|
/**
|
|
|
|
* Creates a list of lexemes.
|
|
|
|
*
|
|
|
|
* \return An empty lexeme list.
|
|
|
|
*
|
|
|
|
* \retval NULL Memory allocation failed.
|
|
|
|
*/
|
2010-08-09 07:01:59 +00:00
|
|
|
LexemeList *createLexemeList(void)
|
|
|
|
{
|
|
|
|
LexemeList *p = malloc(sizeof(LexemeList));
|
|
|
|
if (!p) {
|
|
|
|
perror("malloc");
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
p->num = 0;
|
|
|
|
p->lexemes = NULL;
|
|
|
|
return p;
|
|
|
|
}
|
|
|
|
|
2011-06-15 06:54:12 +00:00
|
|
|
/**
|
|
|
|
* Adds a lexeme to a list of lexemes.
|
|
|
|
*
|
|
|
|
* \param [in,out] list The list of lexemes to add \a lexeme to.
|
|
|
|
*
|
|
|
|
* \param [in] lexeme The lexeme to add to \a list.
|
|
|
|
*
|
|
|
|
* \post \a lexeme will be added to the end of \a list and the size of \a list
|
|
|
|
* will be updated.
|
|
|
|
*
|
|
|
|
* \return A pointer to the added lexeme (will be the same as \a lexeme).
|
|
|
|
*
|
|
|
|
* \retval NULL Memory allocation failed.
|
|
|
|
*/
|
|
|
|
Lexeme *addLexeme(LexemeList *list, Lexeme *lexeme)
|
2010-08-09 07:01:59 +00:00
|
|
|
{
|
|
|
|
unsigned int newsize;
|
|
|
|
void *mem = NULL;
|
|
|
|
if (!list) return NULL;
|
|
|
|
newsize = list->num + 1;
|
|
|
|
mem = realloc(list->lexemes, sizeof(Lexeme *) * newsize);
|
|
|
|
if (!mem) {
|
|
|
|
perror("realloc");
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
list->lexemes = mem;
|
|
|
|
list->lexemes[list->num] = lexeme;
|
|
|
|
list->num = newsize;
|
|
|
|
return lexeme;
|
|
|
|
}
|
|
|
|
|
2011-06-15 06:54:12 +00:00
|
|
|
/**
|
|
|
|
* Deletes a list of lexemes.
|
|
|
|
*
|
|
|
|
* \param [in,out] list The lexeme list to delete.
|
|
|
|
*
|
|
|
|
* \post The memory at \a list and all of its members will be freed.
|
|
|
|
*/
|
|
|
|
void deleteLexemeList(LexemeList *list)
|
2010-08-09 07:01:59 +00:00
|
|
|
{
|
|
|
|
unsigned int n;
|
|
|
|
if (!list) return;
|
|
|
|
for (n = 0; n < list->num; n++)
|
|
|
|
deleteLexeme(list->lexemes[n]);
|
|
|
|
free(list->lexemes);
|
|
|
|
free(list);
|
|
|
|
}
|
|
|
|
|
2011-06-15 06:54:12 +00:00
|
|
|
/**
|
|
|
|
* Scans a buffer, removing unnecessary characters and grouping characters into
|
|
|
|
* lexemes. Lexemes are strings of characters separated by whitespace (although
|
|
|
|
* newline characters are considered separate lexemes). String literals are
|
|
|
|
* handled a bit differently: Starting at the first quotation character,
|
|
|
|
* characters are collected until either a non-escaped quotation character is
|
|
|
|
* read (i.e., a quotation character not preceded by a colon which itself is not
|
|
|
|
* preceded by a colon) or a newline or carriage return character is read,
|
|
|
|
* whichever comes first. This handles the odd (but possible) case of strings
|
|
|
|
* such as "::" which print out a single colon. Also handled are the effects of
|
|
|
|
* commas, ellipses, bangs (!), and array accesses ('Z).
|
|
|
|
*
|
|
|
|
* \param [in] buffer The characters to turn into lexemes.
|
|
|
|
*
|
|
|
|
* \param [in] size The number of characters in \a buffer.
|
|
|
|
*
|
|
|
|
* \param [in] fname The name of the file \a buffer was read from.
|
|
|
|
*
|
|
|
|
* \return A list of lexemes created from the contents of \a buffer.
|
|
|
|
*/
|
|
|
|
LexemeList *scanBuffer(const char *buffer, unsigned int size, const char *fname)
|
2010-08-09 07:01:59 +00:00
|
|
|
{
|
|
|
|
const char *start = buffer;
|
|
|
|
LexemeList *list = NULL;
|
|
|
|
unsigned int line = 1;
|
|
|
|
list = createLexemeList();
|
|
|
|
if (!list) return NULL;
|
|
|
|
while (start < buffer + size) {
|
|
|
|
char *temp = NULL;
|
2011-06-15 06:54:12 +00:00
|
|
|
unsigned int len = 1;
|
2010-08-09 07:01:59 +00:00
|
|
|
/* Comma (,) is a soft newline */
|
|
|
|
if (*start == ',') {
|
2011-06-15 06:54:12 +00:00
|
|
|
Lexeme *lex = createLexeme("\n", fname, line);
|
2010-08-09 07:01:59 +00:00
|
|
|
if (!lex) {
|
|
|
|
deleteLexemeList(list);
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
if (!addLexeme(list, lex)) {
|
|
|
|
deleteLexeme(lex);
|
|
|
|
deleteLexemeList(list);
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
start++;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
/* Bang (!) is its own lexeme */
|
|
|
|
if (*start == '!') {
|
2011-06-15 06:54:12 +00:00
|
|
|
Lexeme *lex = createLexeme("!", fname, line);
|
2010-08-09 07:01:59 +00:00
|
|
|
if (!lex) {
|
|
|
|
deleteLexemeList(list);
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
if (!addLexeme(list, lex)) {
|
|
|
|
deleteLexeme(lex);
|
|
|
|
deleteLexemeList(list);
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
start++;
|
|
|
|
continue;
|
|
|
|
}
|
2011-06-15 06:54:12 +00:00
|
|
|
/* Apostrophe Z ('Z) is its own lexeme */
|
|
|
|
if (!strncmp(start, "'Z", 2)) {
|
|
|
|
Lexeme *lex = createLexeme("'Z", fname, line);
|
|
|
|
if (!lex) {
|
|
|
|
deleteLexemeList(list);
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
if (!addLexeme(list, lex)) {
|
|
|
|
deleteLexeme(lex);
|
|
|
|
deleteLexemeList(list);
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
start += 2;
|
|
|
|
continue;
|
|
|
|
}
|
2013-02-24 20:44:33 +00:00
|
|
|
/* Question mark (?) is its own lexeme */
|
|
|
|
if (*start == '?') {
|
|
|
|
Lexeme *lex = createLexeme("?", fname, line);
|
|
|
|
if (!lex) {
|
|
|
|
deleteLexemeList(list);
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
if (!addLexeme(list, lex)) {
|
|
|
|
deleteLexeme(lex);
|
|
|
|
deleteLexemeList(list);
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
start++;
|
|
|
|
continue;
|
|
|
|
}
|
2010-08-09 07:01:59 +00:00
|
|
|
/* Skip over leading whitespace */
|
|
|
|
while (isspace(*start)) {
|
|
|
|
unsigned int newline = 0;
|
|
|
|
/* Newline is its own lexeme */
|
|
|
|
if (!strncmp(start, "\r\n", 2)) {
|
|
|
|
newline = 1;
|
|
|
|
start++;
|
|
|
|
}
|
|
|
|
else if (*start == '\r' || *start == '\n') {
|
|
|
|
newline = 1;
|
|
|
|
}
|
|
|
|
if (newline) {
|
2011-06-15 06:54:12 +00:00
|
|
|
Lexeme *lex = createLexeme("\n", fname, line);
|
2010-08-09 07:01:59 +00:00
|
|
|
if (!lex) {
|
|
|
|
deleteLexemeList(list);
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
if (!addLexeme(list, lex)) {
|
|
|
|
deleteLexeme(lex);
|
|
|
|
deleteLexemeList(list);
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
line++;
|
|
|
|
}
|
|
|
|
start++;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
/* Skip over ellipses (...) and newline */
|
|
|
|
if ((!strncmp(start, "\xE2\x80\xA6\r\n", 5) && (start += 5))
|
|
|
|
|| (!strncmp(start, "\xE2\x80\xA6\r", 4) && (start += 4))
|
|
|
|
|| (!strncmp(start, "\xE2\x80\xA6\n", 4) && (start += 4))
|
|
|
|
|| (!strncmp(start, "...\r\n", 5) && (start += 5))
|
|
|
|
|| (!strncmp(start, "...\r", 4) && (start += 4))
|
|
|
|
|| (!strncmp(start, "...\n", 4) && (start += 4))) {
|
|
|
|
const char *test = start;
|
|
|
|
/* Make sure next line is not empty */
|
|
|
|
while (*test && isspace(*test)) {
|
|
|
|
if (*test == '\r' || *test == '\n') {
|
2012-12-13 04:53:54 +00:00
|
|
|
error(LX_LINE_CONTINUATION, fname, line);
|
2010-08-09 07:01:59 +00:00
|
|
|
deleteLexemeList(list);
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
test++;
|
|
|
|
}
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
/* Skip over comments */
|
|
|
|
if ((list->num == 0
|
|
|
|
|| *(list->lexemes[list->num - 1]->image) == '\n')
|
|
|
|
&& !strncmp(start, "OBTW", 4)) {
|
|
|
|
start += 4;
|
|
|
|
while (strncmp(start, "TLDR", 4)) {
|
|
|
|
if ((!strncmp(start, "\r\n", 2) && (start += 2))
|
|
|
|
|| (*start == '\r' && start++)
|
|
|
|
|| (*start == '\n' && start++))
|
|
|
|
line++;
|
|
|
|
else
|
|
|
|
start++;
|
|
|
|
}
|
|
|
|
start += 4;
|
|
|
|
/* Must end in newline */
|
|
|
|
while (*start && isspace(*start) && *start != '\r' && *start != '\n')
|
|
|
|
start++;
|
|
|
|
if (start == buffer || *start == ',' || *start == '\r' || *start == '\n')
|
|
|
|
continue;
|
2012-12-13 04:53:54 +00:00
|
|
|
error(LX_MULTIPLE_LINE_COMMENT, fname, line);
|
2010-08-09 07:01:59 +00:00
|
|
|
deleteLexemeList(list);
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
if (!strncmp(start, "BTW", 3)) {
|
|
|
|
start += 3;
|
|
|
|
while (*start && *start != '\r' && *start != '\n')
|
|
|
|
start++;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
/* We have removed or processed any leading characters at this
|
|
|
|
* point */
|
|
|
|
if (!*start) break;
|
|
|
|
if (*start == '"') {
|
|
|
|
/* Find the end of the string, watching for escape
|
|
|
|
* sequences */
|
|
|
|
while ((start[len]
|
|
|
|
&& *(start + len) != '\r'
|
|
|
|
&& *(start + len) != '\n'
|
|
|
|
&& *(start + len) != '"')
|
2010-08-27 17:50:13 +00:00
|
|
|
|| (*(start + len) == '"'
|
|
|
|
&& *(start + len - 1) == ':'
|
|
|
|
&& *(start + len - 2) != ':'))
|
2010-08-09 07:01:59 +00:00
|
|
|
len++;
|
|
|
|
if (*(start + len) == '"') len++;
|
2010-08-27 17:50:13 +00:00
|
|
|
/* Make sure this is the end of the token */
|
|
|
|
if (start[len] && !isspace(start[len])
|
|
|
|
&& *(start + len) != ','
|
|
|
|
&& *(start + len) != '!'
|
2013-02-24 20:44:33 +00:00
|
|
|
&& *(start + len) != '?'
|
2011-06-15 06:54:12 +00:00
|
|
|
&& strncmp(start + len, "'Z", 2)
|
2010-08-27 17:50:13 +00:00
|
|
|
&& strncmp(start + len, "...", 3)
|
|
|
|
&& strncmp(start + len, "\xE2\x80\xA6", 3)) {
|
2012-12-13 04:53:54 +00:00
|
|
|
error(LX_EXPECTED_TOKEN_DELIMITER, fname, line);
|
2010-08-27 17:50:13 +00:00
|
|
|
deleteLexemeList(list);
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
/* Scan for the end of the token */
|
|
|
|
while (start[len] && !isspace(start[len])
|
|
|
|
&& *(start + len) != ','
|
|
|
|
&& *(start + len) != '!'
|
2013-02-24 20:44:33 +00:00
|
|
|
&& *(start + len) != '?'
|
2011-06-15 06:54:12 +00:00
|
|
|
&& strncmp(start + len, "'Z", 2)
|
2010-08-27 17:50:13 +00:00
|
|
|
&& strncmp(start + len, "...", 3)
|
|
|
|
&& strncmp(start + len, "\xE2\x80\xA6", 3))
|
|
|
|
len++;
|
2010-08-09 07:01:59 +00:00
|
|
|
}
|
|
|
|
temp = malloc(sizeof(char) * (len + 1));
|
|
|
|
if (!temp) {
|
|
|
|
perror("malloc");
|
|
|
|
deleteLexemeList(list);
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
strncpy(temp, start, len);
|
|
|
|
temp[len] = '\0';
|
2011-06-15 06:54:12 +00:00
|
|
|
Lexeme *lex = createLexeme(temp, fname, line);
|
2010-08-09 07:01:59 +00:00
|
|
|
if (!lex) {
|
|
|
|
free(temp);
|
|
|
|
deleteLexemeList(list);
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
if (!addLexeme(list, lex)) {
|
|
|
|
free(temp);
|
|
|
|
deleteLexeme(lex);
|
|
|
|
deleteLexemeList(list);
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
free(temp);
|
|
|
|
start += len;
|
|
|
|
}
|
|
|
|
/* Create an end-of-file lexeme */
|
2011-06-15 06:54:12 +00:00
|
|
|
Lexeme *lex = createLexeme("$", fname, line);
|
2010-08-09 07:01:59 +00:00
|
|
|
if (!lex) {
|
|
|
|
deleteLexemeList(list);
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
if (!addLexeme(list, lex)) {
|
|
|
|
deleteLexeme(lex);
|
|
|
|
deleteLexemeList(list);
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
return list;
|
|
|
|
}
|