libweb/src/HTML/tokenizer.c

251 lines
7.8 KiB
C

#include <LibWeb/HTML/tokenizer.h>
#include <ctype.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// Current state of the tokenizer state machine. next_token() dispatches on it
// and set_tokenizing() is the helper that assigns it.
tokenizing_state_t state = DATA_STATE;
// State recorded just before next_token() switches to CHARACTER_REFERENCE_STATE
// (written in its DATA_STATE and RCDATA_STATE branches). Nothing in the visible
// code reads it back yet.
tokenizing_state_t return_state = DATA_STATE;
// Byte offset of the next unread character in the string being tokenized.
// Advanced by next_char() and consume(), rewound by reconsume().
size_t position = 0;

/*
 * Reads the character at the cursor and advances the cursor by one.
 *
 * string: NUL-terminated input being tokenized.
 *
 * Returns the byte at `position` as a value in 0..255, or (uint32_t)EOF when
 * `string` is NULL or the cursor is at or beyond the terminator. The cursor is
 * not advanced when end of input is reported, so repeated calls keep
 * returning (uint32_t)EOF.
 */
uint32_t next_char(char *string)
{
    // strlen() is evaluated on every call so that a cursor sitting beyond the
    // terminator is reported as end of input instead of being dereferenced.
    if (string == NULL || position >= strlen(string)) {
        return (uint32_t)EOF;
    }
    // Read through unsigned char: where plain char is signed, a byte >= 0x80
    // would otherwise be sign-extended to 0xFFFFFFxx, and 0xFF would compare
    // equal to (uint32_t)EOF.
    return (unsigned char)string[position++];
}
/*
 * Makes input_state the current tokenizer state by storing it in the
 * file-level `state` variable that next_token() dispatches on.
 */
void set_tokenizing(tokenizing_state_t input_state)
{
    state = input_state;
}
/*
 * Steps the cursor back by one character and switches the tokenizer to
 * reconsume_state, so that state reads the same character again.
 *
 * Only meaningful after next_char() actually consumed a character: when
 * next_char() reported end of input it did not advance the cursor, and
 * stepping back would replay the last real character instead.
 */
void reconsume(tokenizing_state_t reconsume_state)
{
    // `position` is unsigned; decrementing it at 0 would wrap to SIZE_MAX.
    if (position > 0) {
        position--;
    }
    set_tokenizing(reconsume_state);
}
/*
 * Reports whether the input at the cursor starts with `string`.
 *
 * string: NUL-terminated text to look for (compared byte for byte).
 * src:    input being tokenized; inspected from index `position` onwards.
 *
 * Returns true when every character of `string` equals the corresponding
 * character of `src` starting at the cursor, which is trivially the case for
 * an empty `string`. The cursor itself is not moved.
 */
bool next_few_characters_are(char *string, char *src)
{
    size_t offset = 0;

    while (string[offset] != '\0') {
        if (src[position + offset] != string[offset]) {
            return false;
        }
        offset++;
    }
    return true;
}
/*
 * Advances the cursor past `string`: `position` grows by strlen(string).
 * The input itself is not inspected, so the caller is expected to have
 * verified the match first (next_token() does so with
 * next_few_characters_are()).
 */
void consume(char *string)
{
    const size_t consumed_length = strlen(string);

    position = position + consumed_length;
}
/*
 * Allocates a new heap string holding the single character `c`.
 *
 * Passing '\0' yields an empty string. The buffer is two bytes long: the
 * character followed by a terminator.
 *
 * Returns the malloc()ed buffer, to be released with free(), or NULL when the
 * allocation fails.
 */
char *init_str(char c)
{
    char *buffer = malloc(2);

    if (buffer == NULL) {
        return NULL;
    }
    buffer[0] = c;
    buffer[1] = '\0';
    return buffer;
}
/*
 * Appends the character `c` to the heap string `string`.
 *
 * string: buffer obtained from malloc()/realloc() (for example via
 *         init_str()), or NULL, which is treated as an empty string.
 * c:      character to add at the end.
 *
 * Returns the possibly relocated buffer; the caller must continue with the
 * returned pointer instead of `string`. If the buffer cannot be grown, the
 * original `string` is returned unchanged and still valid, so the usual
 * `s = append(s, c)` pattern neither leaks nor dereferences NULL.
 */
char *append(char *string, char c)
{
    const size_t len = (string != NULL) ? strlen(string) : 0;

    // Room for the existing text, the new character and the terminator.
    char *grown = realloc(string, len + 2);
    if (grown == NULL) {
        return string;
    }
    grown[len] = c;
    grown[len + 1] = '\0';
    return grown;
}
/*
 * True when c is an ASCII letter. The <ctype.h> classifiers are only defined
 * for unsigned char values and EOF, so the range is checked before calling
 * isalpha(); (uint32_t)EOF and any non-ASCII value yield false.
 */
static bool tokenizer_is_alpha(uint32_t c)
{
    return c <= 0x7F && isalpha((int)c) != 0;
}

/*
 * True when c is ASCII whitespace according to isspace(); range-checked
 * before the call for the same reason as tokenizer_is_alpha().
 */
static bool tokenizer_is_space(uint32_t c)
{
    return c <= 0x7F && isspace((int)c) != 0;
}

/*
 * Runs the tokenizer state machine over `string`, starting at the shared
 * cursor, until one token is complete and returns it.
 *
 * The current state lives in the file-level `state` variable, so it carries
 * over from one call to the next.
 *
 * string: NUL-terminated input; the same buffer is expected on every call
 *         because the cursor is shared.
 *
 * Returns a token whose type is one of:
 *   CHARACTER           one input character in token.character.data
 *   START_TAG, END_TAG  token.tag.name is a heap string built with
 *                       init_str()/append(); token.tag.self_closing is false
 *   DOCTYPE             token.doctype.name is a heap string built the same way
 *   END_OF_FILE         the input is exhausted
 *   UNKNOWN             a construct or state this function does not implement
 *                       (a message is written to stderr)
 *
 * This function never frees a name it returns, so releasing it is presumably
 * up to the caller. A tag or DOCTYPE name that is cut short by the end of the
 * input is freed here and END_OF_FILE is returned instead.
 */
token_t next_token(char *string)
{
    const uint32_t end_of_input = (uint32_t)EOF;
    token_t token = {0};
    token.type = UNKNOWN;
    uint32_t current_input_character = end_of_input;

    for (;;) {
        switch (state) {
        case DATA_STATE:
            current_input_character = next_char(string);
            if (current_input_character == '&') {
                return_state = DATA_STATE;
                set_tokenizing(CHARACTER_REFERENCE_STATE);
            } else if (current_input_character == '<') {
                set_tokenizing(TAG_OPEN_STATE);
            } else if (current_input_character == '\0') {
                // TODO: This is an unexpected-null-character parse error. Emit the current input character as a character token.
            } else if (current_input_character == end_of_input) {
                // Emit an end-of-file token.
                token.type = END_OF_FILE;
                return token;
            } else {
                // Emit the current input character as a character token.
                token.type = CHARACTER;
                token.character.data = current_input_character;
                return token;
            }
            break;

        case RCDATA_STATE:
            current_input_character = next_char(string);
            if (current_input_character == '&') {
                return_state = RCDATA_STATE;
                set_tokenizing(CHARACTER_REFERENCE_STATE);
            } else if (current_input_character == '<') {
                // Switch to the RCDATA less-than sign state.
                set_tokenizing(RCDATA_LESS_SIGN_STATE);
            } else if (current_input_character == '\0') {
                // TODO: This is an unexpected-null-character parse error. Emit a U+FFFD REPLACEMENT CHARACTER character token.
            } else if (current_input_character == end_of_input) {
                // Emit an end-of-file token.
                token.type = END_OF_FILE;
                return token;
            } else {
                // Emit the current input character as a character token.
                token.type = CHARACTER;
                token.character.data = current_input_character;
                return token;
            }
            break;

        case RAWTEXT_STATE:
            current_input_character = next_char(string);
            if (current_input_character == '<') {
                set_tokenizing(RAWTEXT_LESS_SIGN_STATE);
            } else if (current_input_character == '\0') {
                // TODO: This is an unexpected-null-character parse error. Emit a U+FFFD REPLACEMENT CHARACTER character token.
            } else if (current_input_character == end_of_input) {
                // Emit an end-of-file token.
                token.type = END_OF_FILE;
                return token;
            } else {
                // Emit the current input character as a character token.
                token.type = CHARACTER;
                token.character.data = current_input_character;
                return token;
            }
            break;

        case SCRIPT_DATA_STATE:
            current_input_character = next_char(string);
            if (current_input_character == '<') {
                set_tokenizing(SCRIPT_DATA_LESS_SIGN_STATE);
            } else if (current_input_character == '\0') {
                // TODO: This is an unexpected-null-character parse error. Emit a U+FFFD REPLACEMENT CHARACTER character token.
            } else if (current_input_character == end_of_input) {
                // Emit an end-of-file token.
                token.type = END_OF_FILE;
                return token;
            } else {
                // Emit the current input character as a character token.
                token.type = CHARACTER;
                token.character.data = current_input_character;
                return token;
            }
            break;

        case PLAINTEXT_STATE:
            current_input_character = next_char(string);
            if (current_input_character == '\0') {
                // TODO: This is an unexpected-null-character parse error. Emit a U+FFFD REPLACEMENT CHARACTER character token.
            } else if (current_input_character == end_of_input) {
                // Emit an end-of-file token.
                token.type = END_OF_FILE;
                return token;
            } else {
                // Emit the current input character as a character token.
                token.type = CHARACTER;
                token.character.data = current_input_character;
                return token;
            }
            break;

        case TAG_OPEN_STATE:
            current_input_character = next_char(string);
            if (current_input_character == '!') {
                set_tokenizing(MARKUP_DECLARATION_OPEN_STATE);
            } else if (current_input_character == '/') {
                set_tokenizing(END_TAG_OPEN_STATE);
            } else if (tokenizer_is_alpha(current_input_character)) {
                // Start a tag with an empty name; TAG_NAME_STATE re-reads this
                // letter and appends it.
                token.type = START_TAG;
                token.tag.name = init_str('\0');
                token.tag.self_closing = false;
                reconsume(TAG_NAME_STATE);
            } else {
                // The '<' did not open any markup: emit it as text and let the
                // data state handle whatever followed it. At end of input
                // nothing was consumed, so there is nothing to step back over.
                // TODO: '?' should start a bogus comment instead.
                if (current_input_character == end_of_input) {
                    set_tokenizing(DATA_STATE);
                } else {
                    reconsume(DATA_STATE);
                }
                token.type = CHARACTER;
                token.character.data = '<';
                return token;
            }
            break;

        case MARKUP_DECLARATION_OPEN_STATE:
            // The keyword is matched in its all-upper and all-lower spellings.
            // TODO: the HTML Standard asks for an ASCII case-insensitive match,
            // so mixed-case spellings are still rejected.
            if (next_few_characters_are("DOCTYPE", string)
                || next_few_characters_are("doctype", string)) {
                consume("DOCTYPE");
                set_tokenizing(DOCTYPE_STATE);
                break;
            }
            if (next_few_characters_are("--", string)) {
                fprintf(stderr, "Not implemented: MARKUP_DECLARATION_OPEN_STATE, '-'");
            } else {
                fprintf(stderr, "Not implemented\n");
            }
            // No input is consumed in this state, so staying here would repeat
            // the same check forever. Report the construct once and fall back
            // to the data state so that later calls make progress.
            set_tokenizing(DATA_STATE);
            token.type = UNKNOWN;
            return token;

        case DOCTYPE_STATE:
            current_input_character = next_char(string);
            if (tokenizer_is_space(current_input_character)) {
                set_tokenizing(BEFORE_DOCTYPE_NAME_STATE);
            } else if (current_input_character == end_of_input) {
                // TODO: the HTML Standard also emits a force-quirks DOCTYPE
                // token before the end-of-file token.
                set_tokenizing(DATA_STATE);
                token.type = END_OF_FILE;
                return token;
            } else {
                // Missing whitespace after the keyword: the character already
                // belongs to the next state.
                reconsume(BEFORE_DOCTYPE_NAME_STATE);
            }
            break;

        case BEFORE_DOCTYPE_NAME_STATE:
            current_input_character = next_char(string);
            if (tokenizer_is_space(current_input_character)) {
                // Skip whitespace before the name.
            } else if (current_input_character == '>') {
                // "<!DOCTYPE>": emit a DOCTYPE token with an empty name rather
                // than starting a name with '>'.
                set_tokenizing(DATA_STATE);
                token.type = DOCTYPE;
                token.doctype.name = init_str('\0');
                return token;
            } else if (current_input_character == end_of_input) {
                set_tokenizing(DATA_STATE);
                token.type = END_OF_FILE;
                return token;
            } else {
                token.type = DOCTYPE;
                token.doctype.name = init_str((char)current_input_character);
                set_tokenizing(DOCTYPE_NAME_STATE);
            }
            break;

        case DOCTYPE_NAME_STATE:
            current_input_character = next_char(string);
            if (tokenizer_is_space(current_input_character)) {
                // This switch has no AFTER_DOCTYPE_NAME_STATE case, so the next
                // iteration reports it through the default branch.
                set_tokenizing(AFTER_DOCTYPE_NAME_STATE);
            } else if (current_input_character == '>') {
                set_tokenizing(DATA_STATE);
                // Emit the current DOCTYPE token.
                return token;
            } else if (current_input_character == end_of_input) {
                // The declaration never closed: drop the partial name instead
                // of appending the end-of-input marker to it forever.
                free(token.doctype.name);
                token = (token_t){0};
                token.type = END_OF_FILE;
                set_tokenizing(DATA_STATE);
                return token;
            } else {
                token.doctype.name = append(token.doctype.name, (char)current_input_character);
            }
            break;

        case TAG_NAME_STATE:
            current_input_character = next_char(string);
            if (tokenizer_is_space(current_input_character)) {
                // TODO: Switch to the before attribute name state.
            } else if (current_input_character == '>') {
                set_tokenizing(DATA_STATE);
                // Emit the current tag token.
                return token;
            } else if (current_input_character == end_of_input) {
                // The tag never closed: drop the partial name instead of
                // appending the end-of-input marker to it forever.
                free(token.tag.name);
                token = (token_t){0};
                token.type = END_OF_FILE;
                set_tokenizing(DATA_STATE);
                return token;
            } else {
                token.tag.name = append(token.tag.name, (char)current_input_character);
            }
            break;

        case END_TAG_OPEN_STATE:
            current_input_character = next_char(string);
            if (tokenizer_is_alpha(current_input_character)) {
                token.type = END_TAG;
                token.tag.name = init_str('\0');
                // A newly created tag token starts with the self-closing flag unset.
                token.tag.self_closing = false;
                reconsume(TAG_NAME_STATE);
            } else if (current_input_character == '>') {
                // "</>" carries no tag name: nothing is emitted for it.
                set_tokenizing(DATA_STATE);
            } else if (current_input_character == end_of_input) {
                // TODO: the HTML Standard also emits the pending "</" as
                // character tokens before the end-of-file token.
                set_tokenizing(DATA_STATE);
                token.type = END_OF_FILE;
                return token;
            }
            // TODO: any other character should start a bogus comment; for now
            // it is skipped and this state reads the next character.
            break;

        default:
            fprintf(stderr, "Not implemented\n");
            token.type = UNKNOWN;
            return token;
        }
    }
}