251 lines
7.8 KiB
C
251 lines
7.8 KiB
C
#include <LibWeb/HTML/tokenizer.h>
|
|
#include <ctype.h>
|
|
#include <stdbool.h>
|
|
#include <stdint.h>
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
|
|
/* Current tokenizer state: next_token() dispatches on it and it is written through set_tokenizing(). Starts in DATA_STATE. */
tokenizing_state_t state = DATA_STATE;
|
|
/* State recorded by next_token() just before it switches to CHARACTER_REFERENCE_STATE; it is only written, never read, in the code visible here. */
tokenizing_state_t return_state = DATA_STATE;
|
|
/* Index of the next unread byte of the input string; advanced by next_char() and consume(), stepped back by reconsume(). */
size_t position = 0;
|
|
|
|
uint32_t next_char(char *string) {
|
|
if (position >= strlen(string))
|
|
return EOF;
|
|
return string[position++];
|
|
}
|
|
|
|
/*
 * Make `input_state` the current tokenizer state.
 *
 * The only effect is to overwrite the global `state`, which next_token()
 * dispatches on.
 */
void set_tokenizing(tokenizing_state_t input_state)
{
    state = input_state;
}
|
|
|
|
/*
 * Un-read the most recently consumed byte and switch to `reconsume_state`,
 * so that state sees the same byte again on its next read.
 *
 * `position` is a size_t, so calling this while it is 0 wraps it around;
 * callers are expected to call it only right after a byte was consumed.
 */
void reconsume(tokenizing_state_t reconsume_state)
{
    set_tokenizing(reconsume_state);
    position -= 1;
}
|
|
|
|
bool next_few_characters_are(char *string, char *src) {
|
|
for (size_t i = 0; i < strlen(string); i++) {
|
|
if (string[i] != src[position+i])
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
void consume(char *string) {
|
|
position += strlen(string);
|
|
}
|
|
|
|
/*
 * Allocate a new heap string holding the single character `c`.
 *
 * Passing '\0' yields an empty string that still has room for one character.
 *
 * Returns a malloc()-allocated, NUL-terminated 2-byte buffer that the caller
 * owns and must free(), or NULL if the allocation fails.
 */
char *init_str(char c) {
    char *str = malloc(2);
    if (str == NULL) {
        return NULL;
    }

    str[0] = c;
    str[1] = '\0';
    return str;
}
|
|
|
|
/*
 * Append the character `c` to the heap string `string`, growing it in place.
 *
 * Intended to be used as `s = append(s, c);`.
 *
 * string: NUL-terminated buffer obtained from malloc()/realloc() (for example
 *         from init_str()), or NULL, which is treated as an empty string.
 *         The pointer passed in must not be used after the call.
 * c:      character to add. Appending '\0' leaves the visible string
 *         unchanged.
 *
 * Returns the (possibly moved) buffer, owned by the caller. If memory cannot
 * be obtained, the original buffer is freed and NULL is returned, so the
 * `s = append(s, c)` pattern never leaks.
 */
char *append(char *string, char c) {
    const size_t len = (string != NULL) ? strlen(string) : 0;

    /* Room for the existing text, the new character and the terminator. */
    char *grown = realloc(string, len + 2);
    if (grown == NULL) {
        /* realloc leaves the old block allocated on failure; release it. */
        free(string);
        return NULL;
    }

    grown[len] = c;
    grown[len + 1] = '\0';
    return grown;
}
|
|
|
|
token_t next_token(char *string) {
|
|
token_t token = {0};
|
|
token.type = UNKNOWN;
|
|
|
|
uint32_t current_input_character = EOF;
|
|
|
|
for (;;) {
|
|
switch(state) {
|
|
case DATA_STATE:
|
|
current_input_character = next_char(string);
|
|
if (current_input_character == '&') {
|
|
return_state = DATA_STATE;
|
|
set_tokenizing(CHARACTER_REFERENCE_STATE);
|
|
continue;
|
|
} else if (current_input_character == '<') {
|
|
set_tokenizing(TAG_OPEN_STATE);
|
|
continue;
|
|
} else if (current_input_character == '\0') {
|
|
// TODO: This is an unexpected-null-character parse error. Emit the current input character as a character token.
|
|
continue;
|
|
} else if (current_input_character == (uint32_t)EOF) {
|
|
token.type = END_OF_FILE;
|
|
// Emit an end-of-file token.
|
|
return token;
|
|
} else {
|
|
// Emit the current input character as a character token.
|
|
token.type = CHARACTER;
|
|
token.character.data = current_input_character;
|
|
return token;
|
|
}
|
|
break;
|
|
case RCDATA_STATE:
|
|
current_input_character = next_char(string);
|
|
if (current_input_character == '&') {
|
|
return_state = RCDATA_STATE;
|
|
set_tokenizing(CHARACTER_REFERENCE_STATE);
|
|
continue;
|
|
} else if (current_input_character == '<') {
|
|
// Switch to the RCDATA less-than sign state.
|
|
set_tokenizing(RCDATA_LESS_SIGN_STATE);
|
|
continue;
|
|
} else if (current_input_character == '\0') {
|
|
// TODO: This is an unexpected-null-character parse error. Emit a U+FFFD REPLACEMENT CHARACTER character token.
|
|
continue;
|
|
} else if (current_input_character == (uint32_t)EOF) {
|
|
token.type = END_OF_FILE;
|
|
// Emit an end-of-file token.
|
|
return token;
|
|
} else {
|
|
// Emit the current input character as a character token.
|
|
token.type = CHARACTER;
|
|
token.character.data = current_input_character;
|
|
return token;
|
|
}
|
|
break;
|
|
case RAWTEXT_STATE:
|
|
current_input_character = next_char(string);
|
|
if (current_input_character == '<') {
|
|
set_tokenizing(RAWTEXT_LESS_SIGN_STATE);
|
|
continue;
|
|
} else if (current_input_character == '\0') {
|
|
// TODO: This is an unexpected-null-character parse error. Emit a U+FFFD REPLACEMENT CHARACTER character token.
|
|
continue;
|
|
} else if (current_input_character == (uint32_t)EOF) {
|
|
token.type = END_OF_FILE;
|
|
// Emit an end-of-file token.
|
|
return token;
|
|
} else {
|
|
// Emit the current input character as a character token.
|
|
token.type = CHARACTER;
|
|
token.character.data = current_input_character;
|
|
return token;
|
|
}
|
|
break;
|
|
case SCRIPT_DATA_STATE:
|
|
current_input_character = next_char(string);
|
|
if (current_input_character == '<') {
|
|
set_tokenizing(SCRIPT_DATA_LESS_SIGN_STATE);
|
|
continue;
|
|
} else if (current_input_character == '\0') {
|
|
// TODO: This is an unexpected-null-character parse error. Emit a U+FFFD REPLACEMENT CHARACTER character token.
|
|
continue;
|
|
} else if (current_input_character == (uint32_t)EOF) {
|
|
token.type = END_OF_FILE;
|
|
// Emit an end-of-file token.
|
|
return token;
|
|
} else {
|
|
// Emit the current input character as a character token.
|
|
token.type = CHARACTER;
|
|
token.character.data = current_input_character;
|
|
return token;
|
|
}
|
|
break;
|
|
case PLAINTEXT_STATE:
|
|
current_input_character = next_char(string);
|
|
if (current_input_character == '\0') {
|
|
// TODO: This is an unexpected-null-character parse error. Emit a U+FFFD REPLACEMENT CHARACTER character token.
|
|
continue;
|
|
} else if (current_input_character == (uint32_t)EOF) {
|
|
// Emit an end-of-file token.
|
|
token.type = END_OF_FILE;
|
|
return token;
|
|
} else {
|
|
// Emit the current input character as a character token.
|
|
token.type = CHARACTER;
|
|
token.character.data = current_input_character;
|
|
return token;
|
|
}
|
|
break;
|
|
case TAG_OPEN_STATE:
|
|
current_input_character = next_char(string);
|
|
if (current_input_character == '!') {
|
|
set_tokenizing(MARKUP_DECLARATION_OPEN_STATE);
|
|
continue;
|
|
} else if (current_input_character == '/') {
|
|
set_tokenizing(END_TAG_OPEN_STATE);
|
|
continue;
|
|
} else if (isalpha(current_input_character)) {
|
|
token.type = START_TAG;
|
|
token.tag.name = init_str('\0');
|
|
token.tag.self_closing = false;
|
|
reconsume(TAG_NAME_STATE);
|
|
}
|
|
break;
|
|
case MARKUP_DECLARATION_OPEN_STATE:
|
|
if (next_few_characters_are("--", string)) {
|
|
fprintf(stderr, "Not implemented: MARKUP_DECLARATION_OPEN_STATE, '-'");
|
|
} else if (next_few_characters_are("DOCTYPE", string)) {
|
|
consume("DOCTYPE");
|
|
set_tokenizing(DOCTYPE_STATE);
|
|
continue;
|
|
}
|
|
break;
|
|
case DOCTYPE_STATE:
|
|
current_input_character = next_char(string);
|
|
if (isspace(current_input_character)) {
|
|
set_tokenizing(BEFORE_DOCTYPE_NAME_STATE);
|
|
continue;
|
|
}
|
|
break;
|
|
case BEFORE_DOCTYPE_NAME_STATE:
|
|
current_input_character = next_char(string);
|
|
if (isspace(current_input_character)) {
|
|
continue;
|
|
} else {
|
|
token.type = DOCTYPE;
|
|
token.doctype.name = init_str(current_input_character);
|
|
set_tokenizing(DOCTYPE_NAME_STATE);
|
|
continue;
|
|
}
|
|
break;
|
|
case DOCTYPE_NAME_STATE:
|
|
current_input_character = next_char(string);
|
|
if (isspace(current_input_character)) {
|
|
set_tokenizing(AFTER_DOCTYPE_NAME_STATE);
|
|
continue;
|
|
} else if (current_input_character == '>') {
|
|
set_tokenizing(DATA_STATE);
|
|
// Emit the current DOCTYPE token.
|
|
return token;
|
|
continue;
|
|
} else {
|
|
token.doctype.name = append(token.doctype.name, current_input_character);
|
|
}
|
|
break;
|
|
case TAG_NAME_STATE:
|
|
current_input_character = next_char(string);
|
|
if (isspace(current_input_character)) {
|
|
// TODO: Switch to the before attribute name state.
|
|
} else if (current_input_character == '>') {
|
|
set_tokenizing(DATA_STATE);
|
|
// Emit the current tag token.
|
|
return token;
|
|
continue;
|
|
} else {
|
|
token.tag.name = append(token.tag.name, current_input_character);
|
|
}
|
|
break;
|
|
case END_TAG_OPEN_STATE:
|
|
current_input_character = next_char(string);
|
|
if (isalpha(current_input_character)) {
|
|
token.type = END_TAG;
|
|
token.tag.name = init_str('\0');
|
|
token.tag.self_closing = true;
|
|
reconsume(TAG_NAME_STATE);
|
|
}
|
|
break;
|
|
default:
|
|
fprintf(stderr, "Not implemented\n");
|
|
token.type = UNKNOWN;
|
|
return token;
|
|
break;
|
|
}
|
|
}
|
|
}
|