From 702e2782f2a531b24defbd6eb688bdac76918ee0 Mon Sep 17 00:00:00 2001 From: lexborisov Date: Fri, 2 Sep 2016 10:29:18 +0400 Subject: [PATCH] Change the rules for the skip-whitespace parse flag MyHTML_TREE_PARSE_FLAGS_SKIP_WHITESPACE_TOKEN still skips whitespace-only tokens, but no longer skips them inside RCDATA, RAWTEXT, CDATA and PLAINTEXT content --- include/myhtml/api.h | 15 ++++++++++++--- include/myhtml/myosi.h | 13 +++++++++++-- source/myhtml/api.h | 15 ++++++++++++--- source/myhtml/myosi.h | 13 +++++++++++-- source/myhtml/tokenizer.c | 9 ++++++--- 5 files changed, 52 insertions(+), 13 deletions(-) diff --git a/include/myhtml/api.h b/include/myhtml/api.h index fed7a73..d23ef09 100644 --- a/include/myhtml/api.h +++ b/include/myhtml/api.h @@ -370,7 +370,15 @@ enum myhtml_tags { // base /* Very important!!! - see modest/myosi.h:modest_status_t + + for myhtml 0..00ffff; MyHTML_STATUS_OK == 0x000000 + for mycss and modules 010000..01ffff; MyCSS_STATUS_OK == 0x000000 + for modest 020000..02ffff; MODEST_STATUS_OK == 0x000000 + for myrender 030000..03ffff; MyRENDER_STATUS_OK == 0x000000 + for mydom 040000..04ffff; MyDOM_STATUS_OK == 0x000000 + for mynetwork 050000..05ffff; MyNETWORK_STATUS_OK == 0x000000 + for myecma 060000..06ffff; MyECMA_STATUS_OK == 0x000000 + not occupied 070000.. 
*/ enum myhtml_status { MyHTML_STATUS_OK = 0x0000, @@ -426,7 +434,8 @@ enum myhtml_status { } typedef myhtml_status_t; -#define MYHTML_FAILED(_status_) ((_status_) != MyHTML_STATUS_OK) +#define MYHTML_FAILED(_status_) ((_status_) != MyHTML_STATUS_OK) /* deprecated */ +#define MyHTML_FAILED(_status_) ((_status_) != MyHTML_STATUS_OK) /** * @struct myhtml namespace @@ -463,7 +472,7 @@ enum myhtml_tree_parse_flags { MyHTML_TREE_PARSE_FLAGS_CLEAN = 0x000, MyHTML_TREE_PARSE_FLAGS_WITHOUT_BUILD_TREE = 0x001, MyHTML_TREE_PARSE_FLAGS_WITHOUT_PROCESS_TOKEN = 0x003, - MyHTML_TREE_PARSE_FLAGS_SKIP_WHITESPACE_TOKEN = 0x004, + MyHTML_TREE_PARSE_FLAGS_SKIP_WHITESPACE_TOKEN = 0x004, /* skip ws token, but not for RCDATA, RAWTEXT, CDATA and PLAINTEXT */ MyHTML_TREE_PARSE_FLAGS_WITHOUT_DOCTYPE_IN_TREE = 0x008, } typedef myhtml_tree_parse_flags_t; diff --git a/include/myhtml/myosi.h b/include/myhtml/myosi.h index 1a09327..c8ea00c 100644 --- a/include/myhtml/myosi.h +++ b/include/myhtml/myosi.h @@ -102,6 +102,7 @@ extern "C" { sizeof(strcn) * myhtml->sizen); \ } +#define MyHTML_FAILED(_status_) ((_status_) != MyHTML_STATUS_OK) // encoding // https://encoding.spec.whatwg.org/#the-encoding @@ -203,7 +204,7 @@ enum myhtml_tree_parse_flags { MyHTML_TREE_PARSE_FLAGS_CLEAN = 0x000, MyHTML_TREE_PARSE_FLAGS_WITHOUT_BUILD_TREE = 0x001, MyHTML_TREE_PARSE_FLAGS_WITHOUT_PROCESS_TOKEN = 0x003, - MyHTML_TREE_PARSE_FLAGS_SKIP_WHITESPACE_TOKEN = 0x004, + MyHTML_TREE_PARSE_FLAGS_SKIP_WHITESPACE_TOKEN = 0x004, /* skip ws token, but not for RCDATA, RAWTEXT, CDATA and PLAINTEXT */ MyHTML_TREE_PARSE_FLAGS_WITHOUT_DOCTYPE_IN_TREE = 0x008, } typedef myhtml_tree_parse_flags_t; @@ -375,7 +376,15 @@ enum myhtml_insertion_mode { // base /* Very important!!! 
- see modest/myosi.h:modest_status_t + + for myhtml 0..00ffff; MyHTML_STATUS_OK == 0x000000 + for mycss and modules 010000..01ffff; MyCSS_STATUS_OK == 0x000000 + for modest 020000..02ffff; MODEST_STATUS_OK == 0x000000 + for myrender 030000..03ffff; MyRENDER_STATUS_OK == 0x000000 + for mydom 040000..04ffff; MyDOM_STATUS_OK == 0x000000 + for mynetwork 050000..05ffff; MyNETWORK_STATUS_OK == 0x000000 + for myecma 060000..06ffff; MyECMA_STATUS_OK == 0x000000 + not occupied 070000.. */ enum myhtml_status { MyHTML_STATUS_OK = 0x0000, diff --git a/source/myhtml/api.h b/source/myhtml/api.h index fed7a73..d23ef09 100644 --- a/source/myhtml/api.h +++ b/source/myhtml/api.h @@ -370,7 +370,15 @@ enum myhtml_tags { // base /* Very important!!! - see modest/myosi.h:modest_status_t + + for myhtml 0..00ffff; MyHTML_STATUS_OK == 0x000000 + for mycss and modules 010000..01ffff; MyCSS_STATUS_OK == 0x000000 + for modest 020000..02ffff; MODEST_STATUS_OK == 0x000000 + for myrender 030000..03ffff; MyRENDER_STATUS_OK == 0x000000 + for mydom 040000..04ffff; MyDOM_STATUS_OK == 0x000000 + for mynetwork 050000..05ffff; MyNETWORK_STATUS_OK == 0x000000 + for myecma 060000..06ffff; MyECMA_STATUS_OK == 0x000000 + not occupied 070000.. 
*/ enum myhtml_status { MyHTML_STATUS_OK = 0x0000, @@ -426,7 +434,8 @@ enum myhtml_status { } typedef myhtml_status_t; -#define MYHTML_FAILED(_status_) ((_status_) != MyHTML_STATUS_OK) +#define MYHTML_FAILED(_status_) ((_status_) != MyHTML_STATUS_OK) /* deprecated */ +#define MyHTML_FAILED(_status_) ((_status_) != MyHTML_STATUS_OK) /** * @struct myhtml namespace @@ -463,7 +472,7 @@ enum myhtml_tree_parse_flags { MyHTML_TREE_PARSE_FLAGS_CLEAN = 0x000, MyHTML_TREE_PARSE_FLAGS_WITHOUT_BUILD_TREE = 0x001, MyHTML_TREE_PARSE_FLAGS_WITHOUT_PROCESS_TOKEN = 0x003, - MyHTML_TREE_PARSE_FLAGS_SKIP_WHITESPACE_TOKEN = 0x004, + MyHTML_TREE_PARSE_FLAGS_SKIP_WHITESPACE_TOKEN = 0x004, /* skip ws token, but not for RCDATA, RAWTEXT, CDATA and PLAINTEXT */ MyHTML_TREE_PARSE_FLAGS_WITHOUT_DOCTYPE_IN_TREE = 0x008, } typedef myhtml_tree_parse_flags_t; diff --git a/source/myhtml/myosi.h b/source/myhtml/myosi.h index 1a09327..c8ea00c 100644 --- a/source/myhtml/myosi.h +++ b/source/myhtml/myosi.h @@ -102,6 +102,7 @@ extern "C" { sizeof(strcn) * myhtml->sizen); \ } +#define MyHTML_FAILED(_status_) ((_status_) != MyHTML_STATUS_OK) // encoding // https://encoding.spec.whatwg.org/#the-encoding @@ -203,7 +204,7 @@ enum myhtml_tree_parse_flags { MyHTML_TREE_PARSE_FLAGS_CLEAN = 0x000, MyHTML_TREE_PARSE_FLAGS_WITHOUT_BUILD_TREE = 0x001, MyHTML_TREE_PARSE_FLAGS_WITHOUT_PROCESS_TOKEN = 0x003, - MyHTML_TREE_PARSE_FLAGS_SKIP_WHITESPACE_TOKEN = 0x004, + MyHTML_TREE_PARSE_FLAGS_SKIP_WHITESPACE_TOKEN = 0x004, /* skip ws token, but not for RCDATA, RAWTEXT, CDATA and PLAINTEXT */ MyHTML_TREE_PARSE_FLAGS_WITHOUT_DOCTYPE_IN_TREE = 0x008, } typedef myhtml_tree_parse_flags_t; @@ -375,7 +376,15 @@ enum myhtml_insertion_mode { // base /* Very important!!! 
- see modest/myosi.h:modest_status_t + + for myhtml 0..00ffff; MyHTML_STATUS_OK == 0x000000 + for mycss and modules 010000..01ffff; MyCSS_STATUS_OK == 0x000000 + for modest 020000..02ffff; MODEST_STATUS_OK == 0x000000 + for myrender 030000..03ffff; MyRENDER_STATUS_OK == 0x000000 + for mydom 040000..04ffff; MyDOM_STATUS_OK == 0x000000 + for mynetwork 050000..05ffff; MyNETWORK_STATUS_OK == 0x000000 + for myecma 060000..06ffff; MyECMA_STATUS_OK == 0x000000 + not occupied 070000.. */ enum myhtml_status { MyHTML_STATUS_OK = 0x0000, diff --git a/source/myhtml/tokenizer.c b/source/myhtml/tokenizer.c index 1d18c38..605a9a3 100644 --- a/source/myhtml/tokenizer.c +++ b/source/myhtml/tokenizer.c @@ -438,7 +438,8 @@ bool _myhtml_tokenizer_state_andata_end_tag_name(myhtml_tree_t* tree, myhtml_tok token_node->element_begin = tmp_begin; token_node->element_length = token_node->raw_length; token_node->type |= type; - token_node->tag_id = MyHTML_TAG__TEXT; + token_node->type ^= (token_node->type & MyHTML_TOKEN_TYPE_WHITESPACE); + token_node->tag_id = MyHTML_TAG__TEXT; myhtml_queue_add(tree, *html_offset, token_node); token_node = tree->current_token_node; @@ -648,9 +649,10 @@ size_t myhtml_tokenizer_state_plaintext(myhtml_tree_t* tree, myhtml_token_node_t if((token_node->type & MyHTML_TOKEN_TYPE_PLAINTEXT) == 0) token_node->type |= MyHTML_TOKEN_TYPE_PLAINTEXT; - token_node->raw_begin = (html_offset + tree->global_offset); + token_node->type ^= (token_node->type & MyHTML_TOKEN_TYPE_WHITESPACE); + token_node->raw_begin = (html_offset + tree->global_offset); token_node->raw_length = token_node->element_length = (html_size + tree->global_offset) - token_node->raw_begin; - token_node->tag_id = MyHTML_TAG__TEXT; + token_node->tag_id = MyHTML_TAG__TEXT; myhtml_tokenizer_state_set(tree) = MyHTML_TOKENIZER_STATE_DATA; myhtml_queue_add(tree, html_size, token_node); @@ -871,6 +873,7 @@ size_t myhtml_tokenizer_state_markup_declaration_open(myhtml_tree_t* tree, myhtm token_node->raw_begin += 7; 
token_node->raw_length = 0; token_node->tag_id = MyHTML_TAG__TEXT; + token_node->type ^= (token_node->type & MyHTML_TOKEN_TYPE_WHITESPACE); return html_offset; }