/*
 * libweb/src/HTML/parser.c
 *
 * 495 lines
 * 15 KiB
 * C
 */

#include <LibWeb/HTML/parser.h>
#include <LibWeb/HTML/token.h>
#include <LibWeb/HTML/tokenizer.h>
#include <LibWeb/HTML/stack.h>
#include <stdbool.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <ctype.h>
/*
 * Evaluates to non-zero when the tag name of the variable called `token`
 * in the expanding scope is equal to `name`. The macro deliberately captures
 * `token` from the call site, so it can only be used where such a variable
 * is visible.
 */
#define tag_name_is(name) \
    (strcmp(tag_name(token), name) == 0)
/* Root of the tree being built; assigned from init_document_node() in parse(). */
Node *document_node;
/* Node the mode handlers pass as the insertion target; updated as elements are opened or popped. */
Node *current_node;
/* The head element once handle_before_head() has inserted it, NULL before that. */
Node *head_element = NULL;
/* Reserved for the form element pointer; not written anywhere in this part of the file. */
Node *form_element = NULL;
/* Insertion mode used by parse() to dispatch each token. */
insertion_mode_t insertion_mode = INITIAL_STATE;
/* Set to true by handlers to make the loop in parse() return. */
bool stop_parsing = false; // TODO: implement function for end parsing
/* Cleared when a body start tag or a non-whitespace character is seen. */
bool frameset_ok = true;
/*
 * Switch the tree builder to another insertion mode.
 *
 * mode: the mode that the next call dispatching on the global
 *       `insertion_mode` will use.
 */
void set_insertion_mode_state(insertion_mode_t mode)
{
    insertion_mode = mode;
}
/*
 * Print a textual outline of a node tree to stdout.
 *
 * root:   first node to print; its following siblings are printed too.
 *         NULL prints nothing.
 * spaces: number of leading blanks for `root` and its siblings; every level
 *         of children is indented by one more blank.
 *
 * Each node occupies one line. Only element, text, document and doctype
 * nodes print a label; every other node type produces an indented empty line.
 */
void dump_tree(Node *root, size_t spaces) {
    // Siblings are walked with a loop, children with recursion.
    for (Node *cur = root; cur != NULL; cur = cur->next_sibling) {
        for (size_t col = 0; col < spaces; col++)
            putchar(' ');

        switch (cur->node_type) {
        case ELEMENT_NODE:
            printf("%s", cur->owner_document.document_element.local_name);
            break;
        case TEXT_NODE:
            printf("#text: %s", cur->owner_document.text.data.data);
            break;
        case DOCUMENT_NODE:
            printf("Document");
            break;
        case DOCUMENT_TYPE_NODE:
            printf("DOCTYPE: %s", cur->owner_document.doctype.name);
            break;
        default:
            // Attribute, CDATA, entity, PI, comment, fragment and notation
            // nodes have no label yet.
            break;
        }

        putchar('\n');
        dump_tree(cur->first_child, spaces + 1);
    }
}
// https://html.spec.whatwg.org/multipage/parsing.html#appropriate-place-for-inserting-a-node
/*
 * Pick the node that marks where new content for `node` should go.
 *
 * Returns the last child of `node` when it has one, otherwise `node` itself.
 * `node` must not be NULL.
 *
 * Not implemented yet (see the linked algorithm):
 *  - TODO: optional override target.
 *  - TODO: foster parenting when the target is a table, tbody, tfoot, thead
 *    or tr element.
 *  - TODO: redirecting into a template element's template contents.
 */
Node *find_appropriate_place_for_inserting_node(Node *node) { // TODO: optional
    Node *target = node;
    // FIXME: simplified stand-in for the "adjusted insertion location".
    return (target->last_child != NULL) ? target->last_child : target;
}
/*
 * Create an element for `token` and insert it as the last child of `node`.
 *
 * node:  the insertion target (normally the current node); must not be NULL.
 * token: tag token handed to make_node() to build the element.
 *
 * Returns the new element, which has also been passed to push()
 * (presumably the stack of open elements — confirm against stack.h).
 */
Node *insert_element(Node *node, token_t token) {
    Node *adjusted_insertion_location = find_appropriate_place_for_inserting_node(node);

    // The location helper returns either `node` itself (no children yet) or
    // its last child. In both cases the new element belongs under `node`;
    // using location->parent_node unconditionally would attach it as a
    // sibling of an empty target (or to a NULL parent for a root target).
    Node *parent = (adjusted_insertion_location == node)
                       ? node
                       : adjusted_insertion_location->parent_node;

    Node *element = make_node(ELEMENT_NODE, token); // FIXME
    // FIXME: If the parser was not created as part of the HTML fragment parsing algorithm, then push a new element queue onto element's relevant agent's custom element reactions stack.
    append_child(parent, element);
    push(element);
    return element;
}
/*
 * Insert the character carried by `token` under `node`.
 *
 * If the last child of `node` is a text node, the character is appended to
 * that node's text; otherwise a new text node is created and appended as
 * the last child of `node`.
 *
 * node:  insertion target (normally the current node); must not be NULL.
 * token: a character token.
 */
void insert_character(Node *node, token_t token) {
    // The helpers receive the character as a string, so it must be
    // NUL-terminated.
    char data[2];
    data[0] = character_data(token);
    data[1] = '\0';

    Node *adjusted_insertion_location = find_appropriate_place_for_inserting_node(node);
    // FIXME
    // TODO: characters must not be inserted directly into a Document node.

    if (adjusted_insertion_location->node_type == TEXT_NODE) { // FIXME
        adjusted_insertion_location->owner_document.text
            = text_append(adjusted_insertion_location->owner_document.text, data);
        return;
    }

    // Start from a zeroed Text so text_constructor() never sees indeterminate
    // bytes. NOTE(review): confirm text_constructor() copies `data`, which
    // lives on this function's stack.
    Text text;
    memset(&text, 0, sizeof text);
    text = text_constructor(text, data);
    Node *text_node = make_text_node(text);

    // The location is either `node` or its last (non-text) child; the new
    // text node is a child of `node` in both cases.
    append_child(node, text_node);
}
/*
 * Report whether `string` equals one of the first `len` entries of `list`.
 *
 * list:   array of NUL-terminated strings (at least `len` entries).
 * string: NUL-terminated string to look for.
 * len:    number of entries to examine; 0 always yields false.
 *
 * Comparison is exact (case-sensitive), via strcmp().
 */
bool is_in_list(char *list[], char *string, size_t len) {
    char **end = list + len;
    for (char **entry = list; entry != end; entry++) {
        if (strcmp(*entry, string) == 0) {
            return true;
        }
    }
    return false;
}
/*
 * Generate implied end tags: while the node on top of the stack is a dd, dt,
 * li, optgroup, option, p, rb, rp, rt or rtc element, pop it.
 *
 * On return `current_node` is the node left on top of the stack.
 *
 * NOTE(review): assumes last_node_on_stack() yields NULL for an empty stack;
 * the loop stops in that case instead of dereferencing it.
 */
void generate_implied_end_tags() {
    static char *implied[] = {
        "dd", "dt", "li", "optgroup", "option", "p", "rb", "rp", "rt", "rtc"
    };
    const size_t implied_count = sizeof implied / sizeof implied[0];

    Node *node = last_node_on_stack();
    while (node != NULL &&
           is_in_list(implied, node->owner_document.document_element.local_name, implied_count)) {
        pop();
        // Re-read the top of the stack instead of relying on what pop()
        // returns, so the next iteration always inspects the new top.
        node = last_node_on_stack();
    }
    current_node = node;
}
/*
 * Close a p element.
 *
 * If the node on top of the stack is a p element it is popped and
 * `current_node` becomes the new top of the stack. If the top is anything
 * else (a parse error), nothing happens.
 *
 * Simplified with respect to the specification:
 *  - TODO: generate implied end tags, except for p elements, first.
 *  - TODO: when the current node is not a p element, pop elements until a p
 *    element has been popped. This needs the "has a p element in button
 *    scope" check first, otherwise the whole stack could be emptied.
 */
void close_p_element() {
    Node *node = last_node_on_stack();
    if (node == NULL || strcmp(node->owner_document.document_element.local_name, "p") != 0)
        return; // Parse error.

    // The guard above guarantees the top of the stack is the p element.
    pop();
    current_node = last_node_on_stack();
}
// Forward declaration: the mode handlers below call back into the dispatcher
// to process a token using the rules of another insertion mode.
int process_token_using(insertion_mode_t insertion_mode, token_t token);
/*
 * "initial" insertion mode.
 *
 *  - whitespace characters are ignored;
 *  - comments are ignored (TODO: insert as the last child of the Document);
 *  - a DOCTYPE token becomes a doctype node under the current node and the
 *    mode switches to "before html";
 *  - anything else switches to "before html" and reprocesses the token
 *    there, so documents without a DOCTYPE still make progress.
 */
void handle_initial(token_t token) {
    if (is_character(token) && isspace(character_data(token))) {
        return;
    }

    if (is_comment(token)) {
        // Insert a comment as the last child of the Document object.
        return;
    }

    if (is_doctype(token)) {
        // TODO: report a parse error when the name is not "html", a public
        // identifier is present, or the system identifier is neither missing
        // nor "about:legacy-compat". FIXME: a previous attempt segfaulted —
        // the identifier fields may be NULL.
        Node *node = make_node(DOCUMENT_TYPE_NODE, token);
        append_child(current_node, node);
        // TODO: Then, if the document is not an iframe srcdoc document, and the parser cannot change the mode flag is false, and the DOCTYPE token matches one of the conditions in the following list, then set the Document to quirks mode.
        // TODO: Otherwise, if the same preconditions hold and the DOCTYPE token matches the limited-quirks conditions, set the Document to limited-quirks mode.
        // TODO: The system identifier and public identifier strings must be compared in an ASCII case-insensitive manner. A system identifier whose value is the empty string is not considered missing for the purposes of the conditions above.
        set_insertion_mode_state(BEFORE_HTML_STATE);
        return;
    }

    // Anything else: parse error (unless iframe srcdoc); TODO: quirks mode.
    set_insertion_mode_state(BEFORE_HTML_STATE);
    process_token_using(BEFORE_HTML_STATE, token);
}
/*
 * "before html" insertion mode.
 *
 *  - DOCTYPE tokens, comments and whitespace characters are ignored
 *    (TODO: comments should be inserted into the Document);
 *  - an <html> start tag creates the root element from the token;
 *  - end tags other than head/body/html/br are ignored (parse error);
 *  - anything else creates an implied <html> element and then reprocesses
 *    the token in "before head" mode.
 *
 * In both element-creating cases the new element is appended to the
 * document, becomes `current_node`, is pushed, and the mode becomes
 * "before head".
 */
void handle_before_html(token_t token) {
    if (is_doctype(token)) {
        // Parse error
        return;
    }
    if (is_comment(token)) {
        // TODO: Insert a comment as the last child of the Document object.
        return;
    }
    if (is_character(token) && isspace(character_data(token))) {
        return;
    }

    if (is_start_tag(token) && tag_name_is("html")) {
        // Create an element for the token in the HTML namespace, with the Document as the intended parent.
        Node *html_node = make_node(ELEMENT_NODE, token);
        // Append it to the Document object.
        append_child(document_node, html_node);
        current_node = html_node;
        push(html_node);
        set_insertion_mode_state(BEFORE_HEAD_STATE);
        return;
    }

    if (is_end_tag(token) &&
        !is_in_list((char*[]){"head", "body", "html", "br"}, tag_name(token), 4)) {
        // Parse error
        return;
    }

    // Anything else (including </head>, </body>, </html>, </br>).
    // Zero the synthetic token so make_node() never reads indeterminate
    // fields; only the tag name is meaningful here.
    token_t html_token;
    memset(&html_token, 0, sizeof html_token);
    html_token.tag.name = "html";

    Node *html_node = make_node(ELEMENT_NODE, html_token);
    append_child(document_node, html_node);
    current_node = html_node;
    push(html_node);
    set_insertion_mode_state(BEFORE_HEAD_STATE);

    // Reprocess the token that triggered the implied <html>.
    process_token_using(BEFORE_HEAD_STATE, token);
}
/*
 * "before head" insertion mode.
 *
 *  - whitespace characters, comments and DOCTYPE tokens are ignored
 *    (TODO: comments should be inserted);
 *  - an <html> start tag is processed using the "in body" rules;
 *  - a <head> start tag inserts the head element from the token;
 *  - end tags other than head/body/html/br are ignored (parse error);
 *  - anything else inserts an implied <head> element and then reprocesses
 *    the token in "in head" mode.
 *
 * Whenever a head element is inserted it becomes `current_node` and
 * `head_element`, and the mode becomes "in head".
 */
void handle_before_head(token_t token) {
    if (is_character(token) && isspace(character_data(token))) {
        return;
    }
    if (is_comment(token)) {
        // Insert a comment.
        return;
    }
    if (is_doctype(token)) {
        // Parse error.
        return;
    }

    if (is_start_tag(token) && tag_name_is("html")) {
        process_token_using(IN_BODY_STATE, token);
        return;
    }

    if (is_start_tag(token) && tag_name_is("head")) {
        Node *head_node = insert_element(current_node, token);
        current_node = head_node;
        head_element = head_node;
        set_insertion_mode_state(IN_HEAD_STATE);
        return;
    }

    if (is_end_tag(token) &&
        !is_in_list((char*[]){"head", "body", "html", "br"}, tag_name(token), 4)) {
        // Parse error.
        return;
    }

    // Anything else (including </head>, </body>, </html>, </br>).
    // Zero the synthetic token so insert_element() never reads indeterminate
    // fields; only the tag name is meaningful here.
    token_t head_token;
    memset(&head_token, 0, sizeof head_token);
    head_token.tag.name = "head";

    Node *head_node = insert_element(current_node, head_token);
    current_node = head_node;
    head_element = head_node;
    set_insertion_mode_state(IN_HEAD_STATE);

    // Reprocess the token that triggered the implied <head>.
    process_token_using(IN_HEAD_STATE, token);
}
/*
 * "in head" insertion mode.
 *
 *  - whitespace characters are inserted under the current node;
 *  - comments and DOCTYPE tokens are ignored (TODO: insert comments);
 *  - an <html> start tag is processed using the "in body" rules;
 *  - a </head> end tag pops the head element, makes the new top of the stack
 *    the current node and switches to "after head".
 *
 * Every other token is currently dropped.
 * TODO: head content elements (title, meta, link, script, ...) and the
 * "anything else" rule.
 */
void handle_in_head(token_t token) {
    if (is_character(token) && isspace(character_data(token))) {
        insert_character(current_node, token);
        return;
    } else if (is_comment(token)) {
        // TODO: Insert a comment
        return;
    } else if (is_doctype(token)) {
        // Parse error.
        return;
    } else if (is_start_tag(token) && tag_name_is("html")) {
        process_token_using(IN_BODY_STATE, token);
    } else if (is_end_tag(token) && tag_name_is("head")) {
        pop();
        // The head element is closed: later insertions must target whatever
        // is now on top of the stack, not the popped head.
        current_node = last_node_on_stack();
        set_insertion_mode_state(AFTER_HEAD_STATE);
        return;
    }
}
/*
 * "after head" insertion mode.
 *
 *  - whitespace characters are inserted under the current node;
 *  - comments and DOCTYPE tokens are ignored (TODO: insert comments);
 *  - an <html> start tag is processed using the "in body" rules;
 *  - a <body> start tag inserts the body element, makes it the current node,
 *    clears `frameset_ok` and switches to "in body".
 *
 * Every other token is currently dropped.
 */
void handle_after_head(token_t token) {
    if (is_character(token) && isspace(character_data(token))) {
        insert_character(current_node, token);
        return;
    }

    if (is_comment(token)) {
        // TODO: Insert a comment
        return;
    }

    if (is_doctype(token)) {
        // Parse error.
        return;
    }

    if (!is_start_tag(token)) {
        return;
    }

    if (tag_name_is("html")) {
        process_token_using(IN_BODY_STATE, token);
        return;
    }

    if (tag_name_is("body")) {
        current_node = insert_element(current_node, token);
        // TODO: Set the Document's awaiting parser-inserted body flag to false.
        frameset_ok = false;
        set_insertion_mode_state(IN_BODY_STATE);
    }
}
/*
 * "in body" insertion mode (partial implementation).
 *
 *  - NUL characters are ignored (parse error);
 *  - other characters are inserted under the current node; a non-whitespace
 *    character also clears `frameset_ok`;
 *  - comments and DOCTYPE tokens are ignored (TODO: insert comments);
 *  - block-level start tags (address ... ul) insert an element that becomes
 *    the current node;
 *  - heading start tags (h1-h6) first pop an open heading that is on top of
 *    the stack (parse error), then insert the new heading;
 *  - </p> closes a p element;
 *  - heading end tags generate implied end tags and, when the top of the
 *    stack has the same tag name, pop it;
 *  - </body> switches to "after body".
 *
 * Every other token is currently dropped.
 */
void handle_in_body(token_t token) {
    static char *block_tags[] = {
        "address", "article", "aside", "blockquote",
        "center", "details", "dialog", "dir",
        "div", "dl", "fieldset", "figcaption",
        "figure", "footer", "header", "hgroup",
        "main", "menu", "nav", "ol", "p",
        "section", "summary", "ul"
    };
    static char *heading_tags[] = { "h1", "h2", "h3", "h4", "h5", "h6" };
    const size_t block_count = sizeof block_tags / sizeof block_tags[0];
    const size_t heading_count = sizeof heading_tags / sizeof heading_tags[0];

    if (is_character(token)) {
        if (character_data(token) == '\0') {
            // Parse error.
            return;
        }
        // TODO: Reconstruct the active formatting elements, if any.
        insert_character(current_node, token);
        if (!isspace(character_data(token))) { // FIXME
            frameset_ok = false;
        }
        return;
    }

    if (is_comment(token)) {
        // TODO: Insert a comment.
        return;
    }

    if (is_doctype(token)) {
        // Parse error.
        return;
    }

    if (is_start_tag(token) && is_in_list(block_tags, tag_name(token), block_count)) {
        // TODO: If the stack of open elements has a p element in button scope, then close a p element.
        current_node = insert_element(current_node, token);
        return;
    }

    if (is_start_tag(token) && is_in_list(heading_tags, tag_name(token), heading_count)) {
        // TODO: If the stack of open elements has a p element in button scope, then close a p element.
        Node *top = last_node_on_stack();
        if (top != NULL &&
            is_in_list(heading_tags, top->owner_document.document_element.local_name, heading_count)) {
            // Parse error: headings do not nest. Drop the open heading, then
            // carry on and insert the new one.
            pop();
            current_node = last_node_on_stack();
        }
        current_node = insert_element(current_node, token);
        return;
    }

    if (is_end_tag(token) && tag_name_is("p")) {
        // TODO: If the stack of open elements does not have a p element in button scope, then this is a parse error; insert an HTML element for a "p" start tag token with no attributes.
        close_p_element();
        return;
    }

    if (is_end_tag(token) && is_in_list(heading_tags, tag_name(token), heading_count)) {
        // TODO: If the stack of open elements does not have an element in scope that is an HTML element and whose tag name is one of "h1", "h2", "h3", "h4", "h5", or "h6", then this is a parse error; ignore the token.
        generate_implied_end_tags();
        Node *top = last_node_on_stack();
        if (top == NULL ||
            strcmp(top->owner_document.document_element.local_name, token.tag.name) != 0)
            return; // Parse error.
        // TODO: without the scope check only a heading that is on top of the
        // stack can be closed safely.
        pop();
        current_node = last_node_on_stack();
        return;
    }

    if (is_end_tag(token) && tag_name_is("body")) {
        // TODO: If the stack of open elements does not have a body element in scope, this is a parse error; ignore the token.
        // TODO: Otherwise, if there is a node in the stack of open elements that is not either a dd element, a dt element, an li element, an optgroup element, an option element, a p element, an rb element, an rp element, an rt element, an rtc element, a tbody element, a td element, a tfoot element, a th element, a thead element, a tr element, the body element, or the html element, then this is a parse error.
        set_insertion_mode_state(AFTER_BODY_STATE);
        return;
    }
}
/*
 * "after body" insertion mode.
 *
 *  - whitespace characters are processed using the "in body" rules;
 *  - an </html> end tag switches to "after after body";
 *  - end-of-file sets `stop_parsing`.
 *
 * Every other token is currently dropped.
 */
void handle_after_body(token_t token) {
    if (is_character(token) && isspace(character_data(token))) {
        process_token_using(IN_BODY_STATE, token);
        return;
    }

    if (is_end_tag(token) && tag_name_is("html")) {
        // TODO: If the parser was created as part of the HTML fragment parsing algorithm, this is a parse error; ignore the token. (fragment case)
        set_insertion_mode_state(AFTER_AFTER_BODY_STATE);
        return;
    }

    if (is_eof(token)) {
        stop_parsing = true;
    }
}
/*
 * "after after body" insertion mode: end-of-file sets `stop_parsing`;
 * every other token is currently dropped.
 */
void handle_after_after_body(token_t token) {
    if (!is_eof(token)) {
        return;
    }
    stop_parsing = true;
}
/*
 * Process `token` using the rules of the given insertion mode.
 *
 * insertion_mode: mode whose handler should run. The parameter shadows the
 *                 global of the same name, so a handler can run another
 *                 mode's rules without switching modes.
 * token:          token to process.
 *
 * Returns 0 for every known mode (including modes that have no handler yet,
 * where the token is dropped) and -1 for an unknown mode value, in which
 * case a message is written to stderr and `stop_parsing` is set.
 */
int process_token_using(insertion_mode_t insertion_mode, token_t token) {
    switch (insertion_mode) {
    case INITIAL_STATE:
        handle_initial(token);
        return 0;
    case BEFORE_HTML_STATE:
        handle_before_html(token);
        return 0;
    case BEFORE_HEAD_STATE:
        handle_before_head(token);
        return 0;
    case IN_HEAD_STATE:
        handle_in_head(token);
        return 0;
    case AFTER_HEAD_STATE:
        handle_after_head(token);
        return 0;
    case IN_BODY_STATE:
        handle_in_body(token);
        return 0;
    case AFTER_BODY_STATE:
        handle_after_body(token);
        return 0;

    // Modes without a handler yet: the token is dropped.
    case IN_HEAD_NOSCRIPT_STATE:
    case TEXT_STATE:
    case IN_TABLE_STATE:
    case IN_TABLE_TEXT_STATE:
    case IN_CAPTION_STATE:
    case IN_COLUMN_GROUP_STATE:
    case IN_TABLE_BODY_STATE:
    case IN_ROW_STATE:
    case IN_CELL_STATE:
    case IN_SELECT_STATE:
    case IN_SELECT_IN_TABLE_STATE:
    case IN_TEMPLATE_STATE:
        return 0;

    // NOTE(review): the frameset modes share the "after after body" handler;
    // confirm this grouping is intended.
    case IN_FRAMESET_STATE:
    case AFTER_FRAMESET_STATE:
    case AFTER_AFTER_BODY_STATE:
        handle_after_after_body(token);
        return 0;

    case AFTER_AFTER_FRAMESET_STATE:
        stop_parsing = true;
        return 0;

    default:
        break;
    }

    fprintf(stderr, "Not handled parsing mode\n");
    stop_parsing = true;
    return -1;
}
Node *parse(char *data) { // FIXME
document_node = init_document_node();
current_node = document_node;
token_t token;
for (;;) {
token = next_token(data);
process_token_using(insertion_mode, token);
if (stop_parsing) {
return document_node;
}
}
}