Modest/source/myhtml/myhtml.c
2016-10-14 17:42:01 +03:00

1504 lines
47 KiB
C

/*
Copyright (C) 2015-2016 Alexander Borisov
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Author: lex.borisov@gmail.com (Alexander Borisov)
*/
#include "myhtml/myhtml.h"
/*
 * Allocates storage for the marker node and stores the result in
 * myhtml->marker. The node is reset with myhtml_tree_node_clean() only
 * when the allocation returned a non-NULL pointer.
 */
void myhtml_init_marker(myhtml_t* myhtml)
{
    myhtml_tree_node_t *marker_node = (myhtml_tree_node_t*)myhtml_malloc(sizeof(myhtml_tree_node_t));

    myhtml->marker = marker_node;

    if(marker_node == NULL)
        return;

    myhtml_tree_node_clean(marker_node);
}
/*
 * Releases the marker node with myhtml_free() if one is set.
 * The myhtml->marker field itself is not reset.
 */
void myhtml_destroy_marker(myhtml_t* myhtml)
{
    if(myhtml->marker == NULL)
        return;

    myhtml_free(myhtml->marker);
}
/*
 * Allocates an (uninitialised) myhtml_t object with myhtml_malloc().
 * Returns whatever the allocator returned; the fields are not set here.
 */
myhtml_t * myhtml_create(void)
{
    myhtml_t *instance = (myhtml_t*)myhtml_malloc(sizeof(myhtml_t));

    return instance;
}
/*
 * Initialises a myhtml_t object: marker node, tokenizer states, rules and
 * the thread pool that matches the requested parse mode.
 *
 * myhtml       - object obtained from myhtml_create()
 * opt          - parse mode; any value other than PARSE_MODE_SINGLE and
 *                PARSE_MODE_ALL_IN_ONE is handled as PARSE_MODE_SEPARATELY
 * thread_count - number of batch workers in separate mode (0 is raised to 1);
 *                passed straight to mythread_init() in builds without threads
 * queue_size   - not used by this function
 *
 * Returns the first non-zero status reported by a failing step, otherwise
 * the status of the last thread-creation call.
 *
 * NOTE(review): the result of myhtml_init_marker() is not checked here;
 * myhtml->marker may be NULL after a failed allocation.
 */
myhtml_status_t myhtml_init(myhtml_t* myhtml, enum myhtml_options opt, size_t thread_count, size_t queue_size)
{
    myhtml_status_t status;

    myhtml_init_marker(myhtml);

    status = myhtml_tokenizer_state_init(myhtml);
    if(status) {
        /* keep the pointers myhtml_destroy() inspects in a defined state */
        myhtml->insertion_func = NULL;
        myhtml->thread = NULL;
        return status;
    }

    status = myhtml_rules_init(myhtml);
    if(status) {
        myhtml->thread = NULL;
        return status;
    }

    myhtml->opt = opt;
    myhtml->thread = mythread_create();

    if(myhtml->thread == NULL)
        return MyHTML_STATUS_THREAD_ERROR_MEMORY_ALLOCATION;

#ifdef MyHTML_BUILD_WITHOUT_THREADS
    status = mythread_init(myhtml->thread, NULL, thread_count);
    if(status)
        return status;
#else /* MyHTML_BUILD_WITHOUT_THREADS */
    switch (opt) {
        case MyHTML_OPTIONS_PARSE_MODE_SINGLE:
            status = mythread_init(myhtml->thread, "lastmac", 0);
            if(status)
                return status;

            myhtml->thread->context = mythread_queue_list_create(&status);
            if(status)
                return status;
            break;

        case MyHTML_OPTIONS_PARSE_MODE_ALL_IN_ONE:
            status = mythread_init(myhtml->thread, "lastmac", 1);
            if(status)
                return status;

            myhtml->thread->context = mythread_queue_list_create(&status);
            if(status)
                return status;

            myhread_create_stream(myhtml->thread, mythread_function_queue_stream, myhtml_parser_worker_stream, MyTHREAD_OPT_STOP, &status);
            break;

        default:
            /* default: MyHTML_OPTIONS_PARSE_MODE_SEPARATELY */
            if(thread_count == 0)
                thread_count = 1;

            /* one extra thread for the stream in addition to the batch workers */
            status = mythread_init(myhtml->thread, "lastmac", (thread_count + 1));
            if(status)
                return status;

            myhtml->thread->context = mythread_queue_list_create(&status);
            if(status)
                return status;

            myhread_create_stream(myhtml->thread, mythread_function_queue_stream, myhtml_parser_stream, MyTHREAD_OPT_STOP, &status);
            /* The batch call below receives the same status pointer, so a
             * stream failure must be reported before it can be overwritten. */
            if(status)
                return status;

            myhread_create_batch(myhtml->thread, mythread_function_queue_batch, myhtml_parser_worker, MyTHREAD_OPT_STOP, &status, thread_count);
            break;
    }
#endif /* MyHTML_BUILD_WITHOUT_THREADS */

    myhtml_clean(myhtml);

    return status;
}
/*
 * Resets the object's thread state by passing myhtml->thread to
 * mythread_clean(). Nothing else in the object is touched here.
 */
void myhtml_clean(myhtml_t* myhtml)
{
    mythread_clean(
        myhtml->thread
    );
}
/*
 * Tears down a myhtml_t object: marker node, thread pool (plus its queue
 * list in threaded builds), tokenizer states, the insertion function table
 * and finally the object itself. A NULL argument is accepted.
 *
 * Always returns NULL.
 */
myhtml_t* myhtml_destroy(myhtml_t* myhtml)
{
    if(myhtml != NULL) {
        myhtml_destroy_marker(myhtml);

        if(myhtml->thread != NULL) {
#ifndef MyHTML_BUILD_WITHOUT_THREADS
            /* read the context before myhtml->thread is replaced below */
            mythread_queue_list_t* pending_queues = myhtml->thread->context;
#endif
            myhtml->thread = mythread_destroy(myhtml->thread, mythread_queue_wait_all_for_done, true);
#ifndef MyHTML_BUILD_WITHOUT_THREADS
            mythread_queue_list_destroy(pending_queues);
#endif
        }

        myhtml_tokenizer_state_destroy(myhtml);

        if(myhtml->insertion_func != NULL)
            myhtml_free(myhtml->insertion_func);

        myhtml_free(myhtml);
    }

    return NULL;
}
/*
 * Parses a complete HTML buffer into the tree.
 * A tree that still carries the PARSE_END flag is cleaned first. The
 * encoding is applied with myhtml_encoding_set() before tokenizing.
 *
 * Returns the status of myhtml_tokenizer_begin() if it is non-zero,
 * otherwise the status of myhtml_tokenizer_end().
 */
myhtml_status_t myhtml_parse(myhtml_tree_t* tree, myhtml_encoding_t encoding, const char* html, size_t html_size)
{
    if((tree->flags & MyHTML_TREE_FLAGS_PARSE_END) != 0)
        myhtml_tree_clean(tree);

    myhtml_encoding_set(tree, encoding);

    myhtml_status_t begin_status = myhtml_tokenizer_begin(tree, html, html_size);
    if(begin_status == 0)
        return myhtml_tokenizer_end(tree);

    return begin_status;
}
/*
 * Parses a complete HTML buffer as a fragment.
 * A tag_id of 0 falls back to MyHTML_TAG_DIV and a namespace of 0 to
 * MyHTML_NAMESPACE_HTML. A tree carrying the PARSE_END flag is cleaned first.
 *
 * Returns MyHTML_STATUS_TOKENIZER_ERROR_FRAGMENT_INIT when the fragment
 * cannot be initialised, otherwise the tokenizer begin/end status.
 */
myhtml_status_t myhtml_parse_fragment(myhtml_tree_t* tree, myhtml_encoding_t encoding, const char* html, size_t html_size, myhtml_tag_id_t tag_id, enum myhtml_namespace ns)
{
    if((tree->flags & MyHTML_TREE_FLAGS_PARSE_END) != 0)
        myhtml_tree_clean(tree);

    if(tag_id == 0) {
        tag_id = MyHTML_TAG_DIV;
    }

    if(ns == 0) {
        ns = MyHTML_NAMESPACE_HTML;
    }

    if(myhtml_tokenizer_fragment_init(tree, tag_id, ns) == NULL)
        return MyHTML_STATUS_TOKENIZER_ERROR_FRAGMENT_INIT;

    myhtml_encoding_set(tree, encoding);

    myhtml_status_t begin_status = myhtml_tokenizer_begin(tree, html, html_size);
    if(begin_status == 0)
        return myhtml_tokenizer_end(tree);

    return begin_status;
}
/*
 * Same sequence as myhtml_parse(), but sets MyHTML_TREE_FLAGS_SINGLE_MODE
 * on the tree before tokenizing.
 *
 * Returns the status of myhtml_tokenizer_begin() if it is non-zero,
 * otherwise the status of myhtml_tokenizer_end().
 */
myhtml_status_t myhtml_parse_single(myhtml_tree_t* tree, myhtml_encoding_t encoding, const char* html, size_t html_size)
{
    if((tree->flags & MyHTML_TREE_FLAGS_PARSE_END) != 0)
        myhtml_tree_clean(tree);

    tree->flags |= MyHTML_TREE_FLAGS_SINGLE_MODE;

    myhtml_encoding_set(tree, encoding);

    myhtml_status_t begin_status = myhtml_tokenizer_begin(tree, html, html_size);
    if(begin_status == 0)
        return myhtml_tokenizer_end(tree);

    return begin_status;
}
/*
 * Same sequence as myhtml_parse_fragment(), but sets
 * MyHTML_TREE_FLAGS_SINGLE_MODE on the tree before the fragment is
 * initialised. Zero tag_id / ns fall back to DIV / HTML.
 *
 * Returns MyHTML_STATUS_TOKENIZER_ERROR_FRAGMENT_INIT when the fragment
 * cannot be initialised, otherwise the tokenizer begin/end status.
 */
myhtml_status_t myhtml_parse_fragment_single(myhtml_tree_t* tree, myhtml_encoding_t encoding, const char* html, size_t html_size, myhtml_tag_id_t tag_id, enum myhtml_namespace ns)
{
    if((tree->flags & MyHTML_TREE_FLAGS_PARSE_END) != 0)
        myhtml_tree_clean(tree);

    if(tag_id == 0) {
        tag_id = MyHTML_TAG_DIV;
    }

    if(ns == 0) {
        ns = MyHTML_NAMESPACE_HTML;
    }

    tree->flags |= MyHTML_TREE_FLAGS_SINGLE_MODE;

    if(myhtml_tokenizer_fragment_init(tree, tag_id, ns) == NULL)
        return MyHTML_STATUS_TOKENIZER_ERROR_FRAGMENT_INIT;

    myhtml_encoding_set(tree, encoding);

    myhtml_status_t begin_status = myhtml_tokenizer_begin(tree, html, html_size);
    if(begin_status == 0)
        return myhtml_tokenizer_end(tree);

    return begin_status;
}
/*
 * Feeds one chunk of HTML to the tokenizer.
 * A tree that still carries the PARSE_END flag is cleaned first.
 *
 * Returns the status of myhtml_tokenizer_chunk().
 */
myhtml_status_t myhtml_parse_chunk(myhtml_tree_t* tree, const char* html, size_t html_size)
{
    if((tree->flags & MyHTML_TREE_FLAGS_PARSE_END) != 0)
        myhtml_tree_clean(tree);

    myhtml_status_t chunk_status = myhtml_tokenizer_chunk(tree, html, html_size);

    return chunk_status;
}
/*
 * Feeds one chunk of HTML to the tokenizer in fragment mode.
 * Zero tag_id / ns fall back to MyHTML_TAG_DIV / MyHTML_NAMESPACE_HTML;
 * the fragment is (re)initialised on every call.
 *
 * Returns MyHTML_STATUS_TOKENIZER_ERROR_FRAGMENT_INIT when the fragment
 * cannot be initialised, otherwise the status of myhtml_tokenizer_chunk().
 */
myhtml_status_t myhtml_parse_chunk_fragment(myhtml_tree_t* tree, const char* html, size_t html_size, myhtml_tag_id_t tag_id, enum myhtml_namespace ns)
{
    if((tree->flags & MyHTML_TREE_FLAGS_PARSE_END) != 0)
        myhtml_tree_clean(tree);

    if(tag_id == 0) {
        tag_id = MyHTML_TAG_DIV;
    }

    if(ns == 0) {
        ns = MyHTML_NAMESPACE_HTML;
    }

    if(myhtml_tokenizer_fragment_init(tree, tag_id, ns) == NULL)
        return MyHTML_STATUS_TOKENIZER_ERROR_FRAGMENT_INIT;

    myhtml_status_t chunk_status = myhtml_tokenizer_chunk(tree, html, html_size);

    return chunk_status;
}
/*
 * Chunk parsing in single mode: makes sure MyHTML_TREE_FLAGS_SINGLE_MODE
 * is set on the tree, then delegates to myhtml_parse_chunk().
 */
myhtml_status_t myhtml_parse_chunk_single(myhtml_tree_t* tree, const char* html, size_t html_size)
{
    if(!(tree->flags & MyHTML_TREE_FLAGS_SINGLE_MODE)) {
        tree->flags |= MyHTML_TREE_FLAGS_SINGLE_MODE;
    }

    return myhtml_parse_chunk(tree, html, html_size);
}
/*
 * Fragment chunk parsing in single mode: makes sure
 * MyHTML_TREE_FLAGS_SINGLE_MODE is set on the tree, then delegates to
 * myhtml_parse_chunk_fragment().
 */
myhtml_status_t myhtml_parse_chunk_fragment_single(myhtml_tree_t* tree, const char* html, size_t html_size, myhtml_tag_id_t tag_id, enum myhtml_namespace ns)
{
    if(!(tree->flags & MyHTML_TREE_FLAGS_SINGLE_MODE)) {
        tree->flags |= MyHTML_TREE_FLAGS_SINGLE_MODE;
    }

    return myhtml_parse_chunk_fragment(tree, html, html_size, tag_id, ns);
}
/*
 * Finishes chunked parsing by calling myhtml_tokenizer_end() and
 * returning its status.
 */
myhtml_status_t myhtml_parse_chunk_end(myhtml_tree_t* tree)
{
    myhtml_status_t end_status = myhtml_tokenizer_end(tree);

    return end_status;
}
/*
 * Stores the requested encoding in both tree->encoding_usereq and
 * tree->encoding. Values at or above MyHTML_ENCODING_LAST_ENTRY are
 * ignored and leave the tree unchanged.
 */
void myhtml_encoding_set(myhtml_tree_t* tree, myhtml_encoding_t encoding)
{
    if(encoding < MyHTML_ENCODING_LAST_ENTRY) {
        tree->encoding_usereq = encoding;
        tree->encoding        = encoding;
    }
}
/* Returns the value currently stored in tree->encoding. */
myhtml_encoding_t myhtml_encoding_get(myhtml_tree_t* tree)
{
    myhtml_encoding_t current = tree->encoding;

    return current;
}
/*
* Nodes
*/
/*
 * Walks `node`, its following siblings and all of their descendants,
 * appending every node whose tag_id equals `tag_id` to `collection`.
 * The collection is grown through myhtml_collection_check_size() whenever
 * its length reaches its size.
 *
 * Returns MyHTML_STATUS_OK, or the first non-OK status reported while
 * growing the collection -- including failures in nested calls, which
 * were previously dropped and let the walk continue on a full list.
 */
myhtml_status_t myhtml_get_nodes_by_tag_id_in_scope_find_recursion(myhtml_tree_node_t *node, myhtml_collection_t *collection, myhtml_tag_id_t tag_id)
{
    while(node) {
        if(node->tag_id == tag_id) {
            collection->list[ collection->length ] = node;
            collection->length++;

            if(collection->length >= collection->size)
            {
                myhtml_status_t mystatus = myhtml_collection_check_size(collection, 1024, 0);

                if(mystatus != MyHTML_STATUS_OK)
                    return mystatus;
            }
        }

        if(node->child) {
            /* stop on the first failure in the subtree instead of ignoring it */
            myhtml_status_t child_status = myhtml_get_nodes_by_tag_id_in_scope_find_recursion(node->child, collection, tag_id);

            if(child_status != MyHTML_STATUS_OK)
                return child_status;
        }

        node = node->next;
    }

    return MyHTML_STATUS_OK;
}
/*
 * Collects the descendants of `node` whose tag id equals `tag_id`.
 *
 * tree       - not used by this function
 * collection - collection to append to, or NULL to create one (1024 slots)
 * node       - scope root; only its children are searched. NULL yields NULL.
 * status     - optional; receives the creation or search status
 *
 * Returns the collection (NULL when it had to be created and creation
 * failed). The list is NULL-terminated only when a free slot exists:
 * after a failed grow the length can equal the size, and writing the
 * terminator would land one element past the allocation.
 */
myhtml_collection_t * myhtml_get_nodes_by_tag_id_in_scope(myhtml_tree_t* tree, myhtml_collection_t *collection, myhtml_tree_node_t *node, myhtml_tag_id_t tag_id, myhtml_status_t *status)
{
    if(node == NULL)
        return NULL;

    myhtml_status_t mystatus = MyHTML_STATUS_OK;

    if(collection == NULL) {
        collection = myhtml_collection_create(1024, &mystatus);
    }

    if(mystatus) {
        if(status)
            *status = mystatus;

        return collection;
    }

    if(node->child)
        mystatus = myhtml_get_nodes_by_tag_id_in_scope_find_recursion(node->child, collection, tag_id);

    /* never write the terminator outside the allocated list */
    if(collection->length < collection->size)
        collection->list[collection->length] = NULL;

    if(status)
        *status = mystatus;

    return collection;
}
/*
 * Resolves a tag name to its id and delegates to
 * myhtml_get_nodes_by_tag_id_in_scope().
 *
 * When the name is unknown (the lookup returns NULL) no node can match:
 * the collection is returned unchanged and *status, if given, is set to
 * MyHTML_STATUS_OK. Previously the NULL lookup result was dereferenced.
 */
myhtml_collection_t * myhtml_get_nodes_by_name_in_scope(myhtml_tree_t* tree, myhtml_collection_t *collection, myhtml_tree_node_t *node, const char* html, size_t length, myhtml_status_t *status)
{
    const myhtml_tag_context_t *tag_ctx = myhtml_tag_get_by_name(tree->tags, html, length);

    if(tag_ctx == NULL) {
        if(status)
            *status = MyHTML_STATUS_OK;

        return collection;
    }

    return myhtml_get_nodes_by_tag_id_in_scope(tree, collection, node, tag_ctx->id, status);
}
/*
 * Collects all nodes with the given tag id using the tree's tag index.
 *
 * collection - collection to append to, or NULL to create one sized for
 *              the indexed count plus 128 spare slots
 * status     - optional; receives MyHTML_STATUS_OK or the failure status
 *
 * Returns the collection. When the index holds no node for the tag the
 * collection is returned unchanged (possibly NULL). When a new collection
 * cannot be created NULL is returned with the error in *status; the
 * original code dereferenced the NULL result in that case.
 *
 * NOTE(review): when an existing collection is passed in, its length is
 * not advanced by this function after the nodes are written -- confirm
 * against myhtml_collection_check_size() whether that is intended.
 */
myhtml_collection_t * myhtml_get_nodes_by_tag_id(myhtml_tree_t* tree, myhtml_collection_t *collection, myhtml_tag_id_t tag_id, myhtml_status_t *status)
{
    myhtml_tag_index_entry_t *index_tag = myhtml_tag_index_entry(tree->indexes->tags, tag_id);
    myhtml_tag_index_node_t *index_node = myhtml_tag_index_first(tree->indexes->tags, tag_id);

    if(index_tag->count == 0) {
        if(status)
            *status = MyHTML_STATUS_OK;

        return collection;
    }

    myhtml_status_t mystatus = MyHTML_STATUS_OK;
    size_t idx = 0;

    if(collection == NULL) {
        collection = myhtml_collection_create((index_tag->count + 128), &mystatus);

        /* creation failure leaves collection NULL and mystatus set */
        if(collection != NULL)
            collection->length += index_tag->count;
    }
    else {
        idx = collection->length;
        mystatus = myhtml_collection_check_size(collection, index_tag->count, 128);
    }

    if(mystatus) {
        if(status)
            *status = mystatus;

        return collection;
    }

    while (index_node)
    {
        collection->list[idx] = index_node->node;
        idx++;

        index_node = index_node->next;
    }

    collection->list[idx] = NULL;

    if(status)
        *status = mystatus;

    return collection;
}
/*
 * Resolves a tag name to its id and delegates to
 * myhtml_get_nodes_by_tag_id().
 *
 * When the name is unknown (the lookup returns NULL) no node can match:
 * the collection is returned unchanged and *status, if given, is set to
 * MyHTML_STATUS_OK. Previously the NULL lookup result was dereferenced.
 */
myhtml_collection_t * myhtml_get_nodes_by_name(myhtml_tree_t* tree, myhtml_collection_t *collection, const char* html, size_t length, myhtml_status_t *status)
{
    const myhtml_tag_context_t *tag_ctx = myhtml_tag_get_by_name(tree->tags, html, length);

    if(tag_ctx == NULL) {
        if(status)
            *status = MyHTML_STATUS_OK;

        return collection;
    }

    return myhtml_get_nodes_by_tag_id(tree, collection, tag_ctx->id, status);
}
/*
* Manipulate Nodes
*/
/*
 * Returns the first node of interest in the tree:
 *  - normal parse:   the first child of the document (document -> html)
 *  - fragment parse: the first child of that node (document -> html -> element)
 * Returns NULL when the document, or for fragments its first child, is missing.
 */
myhtml_tree_node_t * myhtml_node_first(myhtml_tree_t* tree)
{
    myhtml_tree_node_t *top;

    if(tree->document == NULL)
        return NULL;

    top = tree->document->child;

    if(!tree->fragment)
        return top;

    return top ? top->child : NULL;
}
/* Returns node->next. `node` must not be NULL. */
myhtml_tree_node_t * myhtml_node_next(myhtml_tree_node_t *node)
{
    myhtml_tree_node_t *result = node->next;

    return result;
}
/* Returns node->prev. `node` must not be NULL. */
myhtml_tree_node_t * myhtml_node_prev(myhtml_tree_node_t *node)
{
    myhtml_tree_node_t *result = node->prev;

    return result;
}
/* Returns node->parent. `node` must not be NULL. */
myhtml_tree_node_t * myhtml_node_parent(myhtml_tree_node_t *node)
{
    myhtml_tree_node_t *result = node->parent;

    return result;
}
/* Returns node->child. `node` must not be NULL. */
myhtml_tree_node_t * myhtml_node_child(myhtml_tree_node_t *node)
{
    myhtml_tree_node_t *result = node->child;

    return result;
}
/* Returns node->last_child. `node` must not be NULL. */
myhtml_tree_node_t * myhtml_node_last_child(myhtml_tree_node_t *node)
{
    myhtml_tree_node_t *result = node->last_child;

    return result;
}
/*
 * Creates a tree node through myhtml_tree_node_create() and assigns the
 * given tag id and namespace to it.
 *
 * Returns the new node, or NULL when the underlying creation returned
 * NULL (the result used to be dereferenced unconditionally).
 */
myhtml_tree_node_t * myhtml_node_create(myhtml_tree_t* tree, myhtml_tag_id_t tag_id, enum myhtml_namespace ns)
{
    myhtml_tree_node_t *node = myhtml_tree_node_create(tree);

    if(node == NULL)
        return NULL;

    node->tag_id = tag_id;
    node->ns     = ns;

    return node;
}
/* Public wrapper: forwards to myhtml_tree_node_remove() and returns its result. */
myhtml_tree_node_t * myhtml_node_remove(myhtml_tree_t* tree, myhtml_tree_node_t *node)
{
    myhtml_tree_node_t *result = myhtml_tree_node_remove(tree, node);

    return result;
}
/* Public wrapper: forwards to myhtml_tree_node_delete(). */
void myhtml_node_delete(myhtml_tree_t* tree, myhtml_tree_node_t *node)
{
    myhtml_tree_node_delete(
        tree,
        node
    );
}
/* Public wrapper: forwards to myhtml_tree_node_delete_recursive(). */
void myhtml_node_delete_recursive(myhtml_tree_t* tree, myhtml_tree_node_t *node)
{
    myhtml_tree_node_delete_recursive(
        tree,
        node
    );
}
/* Public wrapper: forwards to myhtml_tree_node_free(). */
void myhtml_node_free(myhtml_tree_t* tree, myhtml_tree_node_t *node)
{
    myhtml_tree_node_free(
        tree,
        node
    );
}
/*
 * Inserts `node` before `target` via myhtml_tree_node_insert_before().
 * Returns `node`, or NULL when either pointer is NULL (nothing is inserted).
 */
myhtml_tree_node_t * myhtml_node_insert_before(myhtml_tree_t* tree, myhtml_tree_node_t *target, myhtml_tree_node_t *node)
{
    if(target != NULL && node != NULL) {
        myhtml_tree_node_insert_before(tree, target, node);
        return node;
    }

    return NULL;
}
/*
 * Inserts `node` after `target` via myhtml_tree_node_insert_after().
 * Returns `node`, or NULL when either pointer is NULL (nothing is inserted).
 */
myhtml_tree_node_t * myhtml_node_insert_after(myhtml_tree_t* tree, myhtml_tree_node_t *target, myhtml_tree_node_t *node)
{
    if(target != NULL && node != NULL) {
        myhtml_tree_node_insert_after(tree, target, node);
        return node;
    }

    return NULL;
}
/*
 * Adds `node` as a child of `target` via myhtml_tree_node_add_child().
 * Returns `node`, or NULL when either pointer is NULL (nothing is added).
 */
myhtml_tree_node_t * myhtml_node_append_child(myhtml_tree_t* tree, myhtml_tree_node_t *target, myhtml_tree_node_t *node)
{
    if(target != NULL && node != NULL) {
        myhtml_tree_node_add_child(tree, target, node);
        return node;
    }

    return NULL;
}
/*
 * Asks the tree for the appropriate insertion place relative to `target`
 * (with tree->foster_parenting switched on for the duration of that call)
 * and inserts `node` there using the reported insertion mode.
 *
 * Returns `node`, or NULL when either pointer is NULL.
 */
myhtml_tree_node_t * myhtml_node_insert_to_appropriate_place(myhtml_tree_t* tree, myhtml_tree_node_t *target, myhtml_tree_node_t *node)
{
    if(target == NULL || node == NULL)
        return NULL;

    enum myhtml_tree_insertion_mode insert_mode;
    myhtml_tree_node_t *place;

    /* the flag is only raised around the lookup and always lowered again */
    tree->foster_parenting = true;
    place = myhtml_tree_appropriate_place_inserting_in_tree(tree, target, &insert_mode);
    tree->foster_parenting = false;

    myhtml_tree_node_insert_by_mode(tree, place, node, insert_mode);

    return node;
}
/*
 * Replaces the text stored in the node's token string.
 *
 * A token is allocated for the node when it has none. The string buffer
 * is initialised when empty, re-created when its size is smaller than
 * `length`, and otherwise reused with its length reset to 0. The text is
 * appended as-is for UTF-8 and through the converting append for any
 * other encoding. The token's raw_begin / raw_length are reset to 0.
 *
 * Returns the token string, or NULL when `node` is NULL, `encoding` is
 * not below MyHTML_ENCODING_LAST_ENTRY, or the token allocation reports
 * a non-zero status.
 */
myhtml_string_t * myhtml_node_text_set(myhtml_tree_t* tree, myhtml_tree_node_t *node, const char* text, size_t length, myhtml_encoding_t encoding)
{
    if(node == NULL || encoding >= MyHTML_ENCODING_LAST_ENTRY)
        return NULL;

    if(node->token == NULL) {
        mcobject_async_status_t alloc_status;

        node->token = (myhtml_token_node_t*)mcobject_async_malloc(tree->token->nodes_obj, tree->mcasync_token_id, &alloc_status);
        if(alloc_status)
            return NULL;

        myhtml_token_node_clean(node->token);
    }

    myhtml_string_t *target_str = &node->token->str;

    if(target_str->data == NULL) {
        myhtml_string_init(tree->mchar, tree->mchar_node_id, target_str, (length + 2));
    }
    else if(target_str->size < length) {
        /* existing buffer is too small: release it and start over */
        mchar_async_free(tree->mchar, target_str->node_idx, target_str->data);
        myhtml_string_init(tree->mchar, tree->mchar_node_id, target_str, length);
    }
    else {
        target_str->length = 0;
    }

    if(encoding == MyHTML_ENCODING_UTF_8)
        myhtml_string_append(target_str, text, length);
    else
        myhtml_string_append_with_convert_encoding(target_str, text, length, encoding);

    node->token->raw_begin  = 0;
    node->token->raw_length = 0;

    return target_str;
}
/*
 * Replaces the text stored in the node's token string, running the input
 * through myhtml_data_process() / myhtml_data_process_end() with the given
 * encoding instead of appending it directly.
 *
 * Token and buffer preparation mirrors myhtml_node_text_set(): a token is
 * allocated when missing; the buffer is initialised, re-created when
 * smaller than `length`, or reused with its length reset to 0. The token's
 * raw_begin / raw_length are reset to 0.
 *
 * Returns the token string, or NULL when `node` is NULL, `encoding` is
 * not below MyHTML_ENCODING_LAST_ENTRY, or the token allocation reports
 * a non-zero status.
 */
myhtml_string_t * myhtml_node_text_set_with_charef(myhtml_tree_t* tree, myhtml_tree_node_t *node, const char* text, size_t length, myhtml_encoding_t encoding)
{
    if(node == NULL || encoding >= MyHTML_ENCODING_LAST_ENTRY)
        return NULL;

    if(node->token == NULL) {
        mcobject_async_status_t alloc_status;

        node->token = (myhtml_token_node_t*)mcobject_async_malloc(tree->token->nodes_obj, tree->mcasync_token_id, &alloc_status);
        if(alloc_status)
            return NULL;

        myhtml_token_node_clean(node->token);
    }

    myhtml_string_t *target_str = &node->token->str;

    if(target_str->data == NULL) {
        myhtml_string_init(tree->mchar, tree->mchar_node_id, target_str, (length + 2));
    }
    else if(target_str->size < length) {
        /* existing buffer is too small: release it and start over */
        mchar_async_free(tree->mchar, target_str->node_idx, target_str->data);
        myhtml_string_init(tree->mchar, tree->mchar_node_id, target_str, length);
    }
    else {
        target_str->length = 0;
    }

    myhtml_data_process_entry_t process;
    myhtml_data_process_entry_clean(&process);

    process.encoding = encoding;
    myhtml_encoding_result_clean(&process.res);

    myhtml_data_process(&process, target_str, text, length);
    myhtml_data_process_end(&process, target_str);

    node->token->raw_begin  = 0;
    node->token->raw_length = 0;

    return target_str;
}
/* Returns node->token. `node` must not be NULL. */
myhtml_token_node_t* myhtml_node_token(myhtml_tree_node_t *node)
{
    myhtml_token_node_t *result = node->token;

    return result;
}
/* Returns node->ns. `node` must not be NULL. */
myhtml_namespace_t myhtml_node_namespace(myhtml_tree_node_t *node)
{
    myhtml_namespace_t result = node->ns;

    return result;
}
/* Stores `ns` in node->ns. `node` must not be NULL. */
void myhtml_node_namespace_set(myhtml_tree_node_t *node, myhtml_namespace_t ns)
{
    (*node).ns = ns;
}
/* Returns node->tag_id. `node` must not be NULL. */
myhtml_tag_id_t myhtml_node_tag_id(myhtml_tree_node_t *node)
{
    myhtml_tag_id_t result = node->tag_id;

    return result;
}
/*
 * Looks up the name of a tag by its id.
 *
 * length - optional; set to the name length, or to 0 when nothing is found
 *
 * Returns the tag name, or NULL when `tree` or tree->tags is NULL or the
 * id has no entry.
 */
const char * myhtml_tag_name_by_id(myhtml_tree_t* tree, myhtml_tag_id_t tag_id, size_t *length)
{
    const myhtml_tag_context_t *entry = NULL;

    if(length != NULL)
        *length = 0;

    if(tree != NULL && tree->tags != NULL)
        entry = myhtml_tag_get_by_id(tree->tags, tag_id);

    if(entry == NULL)
        return NULL;

    if(length != NULL)
        *length = entry->name_length;

    return entry->name;
}
/*
 * Looks up the id of a tag by its name.
 * Returns MyHTML_TAG__UNDEF when `tree` or tree->tags is NULL or the name
 * has no entry.
 */
myhtml_tag_id_t myhtml_tag_id_by_name(myhtml_tree_t* tree, const char *tag_name, size_t length)
{
    const myhtml_tag_context_t *entry = NULL;

    if(tree != NULL && tree->tags != NULL)
        entry = myhtml_tag_get_by_name(tree->tags, tag_name, length);

    if(entry != NULL)
        return entry->id;

    return MyHTML_TAG__UNDEF;
}
/*
 * Reports whether the node's token type has the
 * MyHTML_TOKEN_TYPE_CLOSE_SELF bit set. A node without a token yields false.
 */
bool myhtml_node_is_close_self(myhtml_tree_node_t *node)
{
    if(node->token == NULL)
        return false;

    return (node->token->type & MyHTML_TOKEN_TYPE_CLOSE_SELF);
}
/* Returns the first attribute of the node's token, or NULL when the node has no token. */
myhtml_tree_attr_t * myhtml_node_attribute_first(myhtml_tree_node_t *node)
{
    if(node->token == NULL)
        return NULL;

    return node->token->attr_first;
}
/* Returns the last attribute of the node's token, or NULL when the node has no token. */
myhtml_tree_attr_t * myhtml_node_attribute_last(myhtml_tree_node_t *node)
{
    if(node->token == NULL)
        return NULL;

    return node->token->attr_last;
}
/*
 * Returns the text held in the node's token string.
 *
 * length - optional; receives the text length, or 0 when there is no text
 *
 * Returns NULL when the node has no token, or the string is empty or has
 * no data buffer.
 */
const char * myhtml_node_text(myhtml_tree_node_t *node, size_t *length)
{
    myhtml_token_node_t *token = node->token;

    if(token == NULL || token->str.length == 0 || token->str.data == NULL) {
        if(length)
            *length = 0;

        return NULL;
    }

    if(length)
        *length = token->str.length;

    return token->str.data;
}
/* Returns the address of the node's token string, or NULL when the node or its token is NULL. */
myhtml_string_t * myhtml_node_string(myhtml_tree_node_t *node)
{
    if(node == NULL || node->token == NULL)
        return NULL;

    return &node->token->str;
}
/*
 * Returns the token's raw_begin / raw_length pair as a position.
 * Yields {0, 0} when the node or its token is NULL.
 */
myhtml_position_t myhtml_node_raw_pasition(myhtml_tree_node_t *node)
{
    myhtml_position_t position = {0, 0};

    if(node && node->token)
        position = (myhtml_position_t){node->token->raw_begin, node->token->raw_length};

    return position;
}
/*
 * Returns the token's element_begin / element_length pair as a position.
 * Yields {0, 0} when the node or its token is NULL.
 */
myhtml_position_t myhtml_node_element_pasition(myhtml_tree_node_t *node)
{
    myhtml_position_t position = {0, 0};

    if(node && node->token)
        position = (myhtml_position_t){node->token->element_begin, node->token->element_length};

    return position;
}
/* Stores the caller's pointer in node->data. `node` must not be NULL. */
void myhtml_node_set_data(myhtml_tree_node_t *node, void* data)
{
    (*node).data = data;
}
/* Returns the pointer stored in node->data. `node` must not be NULL. */
void * myhtml_node_get_data(myhtml_tree_node_t *node)
{
    void *user_data = node->data;

    return user_data;
}
/*
 * Walks `node`, its following siblings and all of their descendants,
 * appending a node to `collection` for every attribute whose key has
 * length `key_len` and matches `key` case-insensitively.
 * The collection is grown by myhtml_collection_check_size() whenever its
 * length reaches its size.
 *
 * Returns MyHTML_STATUS_OK, or the first non-zero status reported while
 * growing the collection (at any depth).
 */
myhtml_status_t myhtml_get_nodes_by_attribute_key_recursion(myhtml_tree_t *tree, myhtml_tree_node_t* node, myhtml_collection_t* collection, const char* key, size_t key_len)
{
    for(; node != NULL; node = node->next)
    {
        if(node->token && node->token->attr_first) {
            myhtml_tree_attr_t *cur;

            for(cur = node->token->attr_first; cur != NULL; cur = cur->next) {
                myhtml_string_t *cur_key = &cur->key;

                if(cur_key->length != key_len || myhtml_strncasecmp(cur_key->data, key, key_len) != 0)
                    continue;

                collection->list[ collection->length ] = node;
                collection->length++;

                if(collection->length >= collection->size) {
                    myhtml_status_t grow_status = myhtml_collection_check_size(collection, 1024, 0);

                    if(grow_status)
                        return grow_status;
                }
            }
        }

        if(node->child) {
            myhtml_status_t sub_status = myhtml_get_nodes_by_attribute_key_recursion(tree, node->child, collection, key, key_len);

            if(sub_status)
                return sub_status;
        }
    }

    return MyHTML_STATUS_OK;
}
/*
 * Collects nodes that carry an attribute with the given key.
 *
 * collection - collection to append to, or NULL to create one (1024 slots)
 * scope_node - node to start the walk from; NULL means tree->node_html
 * status     - optional; written by collection creation, and overwritten
 *              only when the walk reports a non-zero status
 *
 * Returns the collection, or NULL when a new collection could not be created.
 */
myhtml_collection_t * myhtml_get_nodes_by_attribute_key(myhtml_tree_t *tree, myhtml_collection_t* collection, myhtml_tree_node_t* scope_node, const char* key, size_t key_len, myhtml_status_t* status)
{
    if(collection == NULL) {
        collection = myhtml_collection_create(1024, status);

        if(collection == NULL || (status != NULL && *status))
            return NULL;
    }

    myhtml_tree_node_t *start = (scope_node != NULL) ? scope_node : tree->node_html;

    myhtml_status_t walk_status = myhtml_get_nodes_by_attribute_key_recursion(tree, start, collection, key, key_len);

    if(status != NULL && walk_status)
        *status = walk_status;

    return collection;
}
/* find by attribute value; case-sensitive */
/* Case-sensitive exact match: true when `str` has length `value_len` and equals `value`. */
bool myhtml_get_nodes_by_attribute_value_recursion_eq(myhtml_string_t* str, const char* value, size_t value_len)
{
    if(str->length != value_len)
        return false;

    return myhtml_strncmp(str->data, value, value_len) == 0;
}
/*
 * Case-sensitive match of `value` as one whitespace-separated token of
 * `str` (the `[attr~=value]` style of lookup).
 *
 * A candidate at offset i matches when it is preceded by whitespace (or
 * starts the string) and is followed by whitespace or the end of the
 * string.
 *
 * Fixes over the previous version:
 *  - the trailing boundary was only examined when `i > value_len`, so a
 *    token in the middle such as "bc" in "a bc d" was missed;
 *  - the loop bound was the unsigned expression `(length - i) >= value_len`,
 *    which never becomes false for value_len == 0 and ran past the buffer.
 */
bool myhtml_get_nodes_by_attribute_value_recursion_whitespace_separated(myhtml_string_t* str, const char* value, size_t value_len)
{
    if(str->length < value_len)
        return false;

    const char *data = str->data;

    /* last offset at which `value` still fits into the string */
    const size_t last_start = str->length - value_len;

    /* candidate at the very beginning of the string */
    if(myhtml_strncmp(data, value, value_len) == 0) {
        if((str->length > value_len && myhtml_utils_whithspace(data[value_len], ==, ||)) || str->length == value_len)
            return true;
    }

    for(size_t i = 1; i <= last_start; i++)
    {
        if(myhtml_utils_whithspace(data[(i - 1)], ==, ||)) {
            if(myhtml_strncmp(&data[i], value, value_len) == 0) {
                /* i < last_start guarantees data[i + value_len] is inside the string */
                if(i == last_start || myhtml_utils_whithspace(data[(i + value_len)], ==, ||))
                    return true;
            }
        }
    }

    return false;
}
/* Case-sensitive prefix match: true when `str` is at least `value_len` long and starts with `value`. */
bool myhtml_get_nodes_by_attribute_value_recursion_begin(myhtml_string_t* str, const char* value, size_t value_len)
{
    if(str->length >= value_len)
        return myhtml_strncmp(str->data, value, value_len) == 0;

    return false;
}
/*
 * Case-sensitive suffix match: true when `str` is at least `value_len`
 * long and its last `value_len` bytes equal `value`.
 *
 * The previous offset, `length - (length - value_len)`, simplifies to
 * `value_len`, so the comparison started `value_len` bytes into the
 * string instead of `value_len` bytes before its end.
 */
bool myhtml_get_nodes_by_attribute_value_recursion_end(myhtml_string_t* str, const char* value, size_t value_len)
{
    if(str->length < value_len)
        return false;

    const size_t suffix_start = str->length - value_len;

    return myhtml_strncmp(&str->data[suffix_start], value, value_len) == 0;
}
/*
 * Case-sensitive substring match: true when `value` occurs at any offset
 * of `str`. A string shorter than `value_len` never matches.
 */
bool myhtml_get_nodes_by_attribute_value_recursion_contain(myhtml_string_t* str, const char* value, size_t value_len)
{
    if(str->length < value_len)
        return false;

    const char *haystack = str->data;
    size_t offset = 0;

    while((str->length - offset) >= value_len)
    {
        if(myhtml_strncmp(&haystack[offset], value, value_len) == 0)
            return true;

        offset++;
    }

    return false;
}
/*
 * Case-sensitive hyphen-prefix match: true when `str` equals `value`, or
 * starts with `value` immediately followed by '-'.
 */
bool myhtml_get_nodes_by_attribute_value_recursion_hyphen_separated(myhtml_string_t* str, const char* value, size_t value_len)
{
    if(str->length < value_len)
        return false;

    if(myhtml_strncmp(str->data, value, value_len) != 0)
        return false;

    /* prefix matches: accept an exact match or a '-' right after the prefix */
    return str->length == value_len || str->data[value_len] == '-';
}
/* find by attribute value; case-insensitive */
/* Case-insensitive exact match: true when `str` has length `value_len` and equals `value` ignoring case. */
bool myhtml_get_nodes_by_attribute_value_recursion_eq_i(myhtml_string_t* str, const char* value, size_t value_len)
{
    if(str->length != value_len)
        return false;

    return myhtml_strncasecmp(str->data, value, value_len) == 0;
}
/*
 * Case-insensitive match of `value` as one whitespace-separated token of
 * `str` (the `[attr~=value i]` style of lookup).
 *
 * A candidate at offset i matches when it is preceded by whitespace (or
 * starts the string) and is followed by whitespace or the end of the
 * string.
 *
 * Fixes over the previous version:
 *  - the trailing boundary was only examined when `i > value_len`, so a
 *    token in the middle such as "bc" in "a bc d" was missed;
 *  - the loop bound was the unsigned expression `(length - i) >= value_len`,
 *    which never becomes false for value_len == 0 and ran past the buffer.
 */
bool myhtml_get_nodes_by_attribute_value_recursion_whitespace_separated_i(myhtml_string_t* str, const char* value, size_t value_len)
{
    if(str->length < value_len)
        return false;

    const char *data = str->data;

    /* last offset at which `value` still fits into the string */
    const size_t last_start = str->length - value_len;

    /* candidate at the very beginning of the string */
    if(myhtml_strncasecmp(data, value, value_len) == 0) {
        if((str->length > value_len && myhtml_utils_whithspace(data[value_len], ==, ||)) || str->length == value_len)
            return true;
    }

    for(size_t i = 1; i <= last_start; i++)
    {
        if(myhtml_utils_whithspace(data[(i - 1)], ==, ||)) {
            if(myhtml_strncasecmp(&data[i], value, value_len) == 0) {
                /* i < last_start guarantees data[i + value_len] is inside the string */
                if(i == last_start || myhtml_utils_whithspace(data[(i + value_len)], ==, ||))
                    return true;
            }
        }
    }

    return false;
}
/* Case-insensitive prefix match: true when `str` is at least `value_len` long and starts with `value`. */
bool myhtml_get_nodes_by_attribute_value_recursion_begin_i(myhtml_string_t* str, const char* value, size_t value_len)
{
    if(str->length >= value_len)
        return myhtml_strncasecmp(str->data, value, value_len) == 0;

    return false;
}
/*
 * Case-insensitive suffix match: true when `str` is at least `value_len`
 * long and its last `value_len` bytes equal `value` ignoring case.
 *
 * The previous offset, `length - (length - value_len)`, simplifies to
 * `value_len`, so the comparison started `value_len` bytes into the
 * string instead of `value_len` bytes before its end.
 */
bool myhtml_get_nodes_by_attribute_value_recursion_end_i(myhtml_string_t* str, const char* value, size_t value_len)
{
    if(str->length < value_len)
        return false;

    const size_t suffix_start = str->length - value_len;

    return myhtml_strncasecmp(&str->data[suffix_start], value, value_len) == 0;
}
/*
 * Case-insensitive substring match: true when `value` occurs at any
 * offset of `str`. A string shorter than `value_len` never matches.
 */
bool myhtml_get_nodes_by_attribute_value_recursion_contain_i(myhtml_string_t* str, const char* value, size_t value_len)
{
    if(str->length < value_len)
        return false;

    const char *haystack = str->data;
    size_t offset = 0;

    while((str->length - offset) >= value_len)
    {
        if(myhtml_strncasecmp(&haystack[offset], value, value_len) == 0)
            return true;

        offset++;
    }

    return false;
}
/*
 * Case-insensitive hyphen-prefix match: true when `str` equals `value`
 * ignoring case, or starts with `value` immediately followed by '-'.
 */
bool myhtml_get_nodes_by_attribute_value_recursion_hyphen_separated_i(myhtml_string_t* str, const char* value, size_t value_len)
{
    if(str->length < value_len)
        return false;

    if(myhtml_strncasecmp(str->data, value, value_len) != 0)
        return false;

    /* prefix matches: accept an exact match or a '-' right after the prefix */
    return str->length == value_len || str->data[value_len] == '-';
}
/* find by attribute value; basic functions */
/*
 * Walks `node`, its following siblings and all of their descendants,
 * appending a node to `collection` for every attribute (regardless of
 * key) whose value satisfies `func_eq(value_string, value, value_len)`.
 * The collection is grown by myhtml_collection_check_size() whenever its
 * length reaches its size.
 *
 * Returns MyHTML_STATUS_OK, or the first non-zero status reported while
 * growing the collection (at any depth).
 */
myhtml_status_t myhtml_get_nodes_by_attribute_value_recursion(myhtml_tree_t *tree, myhtml_tree_node_t* node,
                                                              myhtml_collection_t* collection,
                                                              myhtml_attribute_value_find_f func_eq,
                                                              const char* value, size_t value_len)
{
    for(; node != NULL; node = node->next)
    {
        if(node->token && node->token->attr_first) {
            myhtml_tree_attr_t *cur;

            for(cur = node->token->attr_first; cur != NULL; cur = cur->next) {
                myhtml_string_t *cur_value = &cur->value;

                if(!func_eq(cur_value, value, value_len))
                    continue;

                collection->list[ collection->length ] = node;
                collection->length++;

                if(collection->length >= collection->size) {
                    myhtml_status_t grow_status = myhtml_collection_check_size(collection, 1024, 0);

                    if(grow_status)
                        return grow_status;
                }
            }
        }

        if(node->child) {
            myhtml_status_t sub_status = myhtml_get_nodes_by_attribute_value_recursion(tree, node->child, collection, func_eq, value, value_len);

            if(sub_status)
                return sub_status;
        }
    }

    return MyHTML_STATUS_OK;
}
/*
 * Walks `node`, its following siblings and all of their descendants,
 * appending a node to `collection` for every attribute whose key matches
 * `key` (same length, case-insensitive) and whose value satisfies
 * `func_eq(value_string, value, value_len)`.
 * The collection is grown by myhtml_collection_check_size() whenever its
 * length reaches its size.
 *
 * Returns MyHTML_STATUS_OK, or the first non-zero status reported while
 * growing the collection (at any depth).
 */
myhtml_status_t myhtml_get_nodes_by_attribute_value_recursion_by_key(myhtml_tree_t *tree, myhtml_tree_node_t* node,
                                                                     myhtml_collection_t* collection,
                                                                     myhtml_attribute_value_find_f func_eq,
                                                                     const char* key, size_t key_len,
                                                                     const char* value, size_t value_len)
{
    for(; node != NULL; node = node->next)
    {
        if(node->token && node->token->attr_first) {
            myhtml_tree_attr_t *cur;

            for(cur = node->token->attr_first; cur != NULL; cur = cur->next) {
                myhtml_string_t *cur_key   = &cur->key;
                myhtml_string_t *cur_value = &cur->value;

                /* the key must match before the value predicate is consulted */
                if(cur_key->length != key_len || myhtml_strncasecmp(cur_key->data, key, key_len) != 0)
                    continue;

                if(!func_eq(cur_value, value, value_len))
                    continue;

                collection->list[ collection->length ] = node;
                collection->length++;

                if(collection->length >= collection->size) {
                    myhtml_status_t grow_status = myhtml_collection_check_size(collection, 1024, 0);

                    if(grow_status)
                        return grow_status;
                }
            }
        }

        if(node->child) {
            myhtml_status_t sub_status = myhtml_get_nodes_by_attribute_value_recursion_by_key(tree, node->child, collection, func_eq,
                                                                                              key, key_len, value, value_len);

            if(sub_status)
                return sub_status;
        }
    }

    return MyHTML_STATUS_OK;
}
/*
 * Shared driver for the attribute-value lookups.
 *
 * collection - collection to append to, or NULL to create one (1024 slots)
 * node       - node to start the walk from; NULL means tree->node_html
 * func_eq    - predicate applied to each attribute value
 * key        - when non-NULL with a non-zero key_len, only attributes with
 *              this key are tested; otherwise every attribute is tested
 * status     - optional; written by collection creation, and overwritten
 *              only when the walk reports a non-zero status
 *
 * Returns the collection, or NULL when a new collection could not be created.
 */
myhtml_collection_t * _myhtml_get_nodes_by_attribute_value(myhtml_tree_t *tree, myhtml_collection_t* collection, myhtml_tree_node_t* node,
                                                           myhtml_attribute_value_find_f func_eq,
                                                           const char* key, size_t key_len,
                                                           const char* value, size_t value_len,
                                                           myhtml_status_t* status)
{
    if(collection == NULL) {
        collection = myhtml_collection_create(1024, status);

        if(collection == NULL || (status != NULL && *status))
            return NULL;
    }

    myhtml_tree_node_t *start = (node != NULL) ? node : tree->node_html;
    myhtml_status_t walk_status;

    if(key == NULL || key_len == 0) {
        walk_status = myhtml_get_nodes_by_attribute_value_recursion(tree, start, collection, func_eq, value, value_len);
    }
    else {
        walk_status = myhtml_get_nodes_by_attribute_value_recursion_by_key(tree, start, collection, func_eq, key, key_len, value, value_len);
    }

    if(status != NULL && walk_status)
        *status = walk_status;

    return collection;
}
/*
 * Finds nodes with an attribute value equal to `value`.
 * `case_insensitive` selects the _eq_i predicate instead of _eq; all
 * other arguments are forwarded to _myhtml_get_nodes_by_attribute_value().
 */
myhtml_collection_t * myhtml_get_nodes_by_attribute_value(myhtml_tree_t *tree, myhtml_collection_t* collection, myhtml_tree_node_t* node,
                                                          bool case_insensitive,
                                                          const char* key, size_t key_len,
                                                          const char* value, size_t value_len,
                                                          myhtml_status_t* status)
{
    myhtml_attribute_value_find_f matcher = case_insensitive
        ? myhtml_get_nodes_by_attribute_value_recursion_eq_i
        : myhtml_get_nodes_by_attribute_value_recursion_eq;

    return _myhtml_get_nodes_by_attribute_value(tree, collection, node, matcher,
                                                key, key_len, value, value_len, status);
}
/*
 * Finds nodes with an attribute value containing `value` as a
 * whitespace-separated token.
 * `case_insensitive` selects the _whitespace_separated_i predicate; all
 * other arguments are forwarded to _myhtml_get_nodes_by_attribute_value().
 */
myhtml_collection_t * myhtml_get_nodes_by_attribute_value_whitespace_separated(myhtml_tree_t *tree, myhtml_collection_t* collection, myhtml_tree_node_t* node,
                                                                               bool case_insensitive,
                                                                               const char* key, size_t key_len,
                                                                               const char* value, size_t value_len,
                                                                               myhtml_status_t* status)
{
    myhtml_attribute_value_find_f matcher = case_insensitive
        ? myhtml_get_nodes_by_attribute_value_recursion_whitespace_separated_i
        : myhtml_get_nodes_by_attribute_value_recursion_whitespace_separated;

    return _myhtml_get_nodes_by_attribute_value(tree, collection, node, matcher,
                                                key, key_len, value, value_len, status);
}
/*
 * Finds nodes with an attribute value that begins with `value`.
 * `case_insensitive` selects the _begin_i predicate; all other arguments
 * are forwarded to _myhtml_get_nodes_by_attribute_value().
 */
myhtml_collection_t * myhtml_get_nodes_by_attribute_value_begin(myhtml_tree_t *tree, myhtml_collection_t* collection, myhtml_tree_node_t* node,
                                                                bool case_insensitive,
                                                                const char* key, size_t key_len,
                                                                const char* value, size_t value_len,
                                                                myhtml_status_t* status)
{
    myhtml_attribute_value_find_f matcher = case_insensitive
        ? myhtml_get_nodes_by_attribute_value_recursion_begin_i
        : myhtml_get_nodes_by_attribute_value_recursion_begin;

    return _myhtml_get_nodes_by_attribute_value(tree, collection, node, matcher,
                                                key, key_len, value, value_len, status);
}
/*
 * Finds nodes using the "end" attribute-value predicate.
 * `case_insensitive` selects the _end_i predicate instead of _end; all
 * other arguments are forwarded to _myhtml_get_nodes_by_attribute_value().
 */
myhtml_collection_t * myhtml_get_nodes_by_attribute_value_end(myhtml_tree_t *tree, myhtml_collection_t* collection, myhtml_tree_node_t* node,
                                                              bool case_insensitive,
                                                              const char* key, size_t key_len,
                                                              const char* value, size_t value_len,
                                                              myhtml_status_t* status)
{
    myhtml_attribute_value_find_f matcher = case_insensitive
        ? myhtml_get_nodes_by_attribute_value_recursion_end_i
        : myhtml_get_nodes_by_attribute_value_recursion_end;

    return _myhtml_get_nodes_by_attribute_value(tree, collection, node, matcher,
                                                key, key_len, value, value_len, status);
}
/*
 * Finds nodes with an attribute value that contains `value`.
 * `case_insensitive` selects the _contain_i predicate; all other
 * arguments are forwarded to _myhtml_get_nodes_by_attribute_value().
 */
myhtml_collection_t * myhtml_get_nodes_by_attribute_value_contain(myhtml_tree_t *tree, myhtml_collection_t* collection, myhtml_tree_node_t* node,
                                                                  bool case_insensitive,
                                                                  const char* key, size_t key_len,
                                                                  const char* value, size_t value_len,
                                                                  myhtml_status_t* status)
{
    myhtml_attribute_value_find_f matcher = case_insensitive
        ? myhtml_get_nodes_by_attribute_value_recursion_contain_i
        : myhtml_get_nodes_by_attribute_value_recursion_contain;

    return _myhtml_get_nodes_by_attribute_value(tree, collection, node, matcher,
                                                key, key_len, value, value_len, status);
}
/*
 * Finds nodes with an attribute value equal to `value` or starting with
 * `value` followed by '-'.
 * `case_insensitive` selects the _hyphen_separated_i predicate; all other
 * arguments are forwarded to _myhtml_get_nodes_by_attribute_value().
 */
myhtml_collection_t * myhtml_get_nodes_by_attribute_value_hyphen_separated(myhtml_tree_t *tree, myhtml_collection_t* collection, myhtml_tree_node_t* node,
                                                                           bool case_insensitive,
                                                                           const char* key, size_t key_len,
                                                                           const char* value, size_t value_len,
                                                                           myhtml_status_t* status)
{
    myhtml_attribute_value_find_f matcher = case_insensitive
        ? myhtml_get_nodes_by_attribute_value_recursion_hyphen_separated_i
        : myhtml_get_nodes_by_attribute_value_recursion_hyphen_separated;

    return _myhtml_get_nodes_by_attribute_value(tree, collection, node, matcher,
                                                key, key_len, value, value_len, status);
}
/*
* Attributes
*/
/* Returns attr->next. `attr` must not be NULL. */
myhtml_tree_attr_t * myhtml_attribute_next(myhtml_tree_attr_t *attr)
{
    myhtml_tree_attr_t *result = attr->next;

    return result;
}
/* Returns attr->prev. `attr` must not be NULL. */
myhtml_tree_attr_t * myhtml_attribute_prev(myhtml_tree_attr_t *attr)
{
    myhtml_tree_attr_t *result = attr->prev;

    return result;
}
/* Returns attr->ns. `attr` must not be NULL. */
enum myhtml_namespace myhtml_attribute_namespace(myhtml_tree_attr_t *attr)
{
    return (*attr).ns;
}
/* Stores `ns` in attr->ns. `attr` must not be NULL. */
void myhtml_attribute_namespace_set(myhtml_tree_attr_t *attr, myhtml_namespace_t ns)
{
    (*attr).ns = ns;
}
/*
 * Returns the attribute's key text.
 *
 * length - optional; receives the key length, or 0 when there is no key
 *
 * Returns NULL when the key has no data buffer or a length of 0.
 */
const char * myhtml_attribute_key(myhtml_tree_attr_t *attr, size_t *length)
{
    if(attr->key.data == NULL || attr->key.length == 0) {
        if(length)
            *length = 0;

        return NULL;
    }

    if(length)
        *length = attr->key.length;

    return attr->key.data;
}
/*
 * Returns the attribute's value text.
 *
 * length - optional; receives the value length, or 0 when there is no value
 *
 * Returns NULL when the value has no data buffer or a length of 0.
 */
const char * myhtml_attribute_value(myhtml_tree_attr_t *attr, size_t *length)
{
    if(attr->value.data == NULL || attr->value.length == 0) {
        if(length)
            *length = 0;

        return NULL;
    }

    if(length)
        *length = attr->value.length;

    return attr->value.data;
}
/* Returns the address of the attribute's key string, or NULL for a NULL attribute. */
myhtml_string_t * myhtml_attribute_key_string(myhtml_tree_attr_t* attr)
{
    if(attr == NULL)
        return NULL;

    return &attr->key;
}
/* Returns the address of the attribute's value string, or NULL for a NULL attribute. */
myhtml_string_t * myhtml_attribute_value_string(myhtml_tree_attr_t* attr)
{
    if(attr == NULL)
        return NULL;

    return &attr->value;
}
/*
 * Looks up an attribute of the node by key through
 * myhtml_token_attr_by_name(). Returns NULL when the node or its token is NULL.
 */
myhtml_tree_attr_t * myhtml_attribute_by_key(myhtml_tree_node_t *node, const char *key, size_t key_len)
{
    if(node != NULL && node->token != NULL)
        return myhtml_token_attr_by_name(node->token, key, key_len);

    return NULL;
}
/*
 * Appends an attribute (key/value, converted from `encoding`) to the
 * node's token, allocating and cleaning a token first when the node has
 * none.
 *
 * Returns the result of
 * myhtml_token_node_attr_append_with_convert_encoding(), or NULL when
 * `node` is NULL or the token allocation reports a non-zero status.
 */
myhtml_tree_attr_t * myhtml_attribute_add(myhtml_tree_t *tree, myhtml_tree_node_t *node, const char *key, size_t key_len, const char *value, size_t value_len, myhtml_encoding_t encoding)
{
    if(node == NULL)
        return NULL;

    if(node->token == NULL) {
        mcobject_async_status_t alloc_status;

        node->token = (myhtml_token_node_t*)mcobject_async_malloc(tree->token->nodes_obj, tree->mcasync_token_id, &alloc_status);
        if(alloc_status)
            return NULL;

        myhtml_token_node_clean(node->token);
    }

    myhtml_tree_attr_t *added = myhtml_token_node_attr_append_with_convert_encoding(tree->token, node->token, key, key_len,
                                                                                    value, value_len, tree->mcasync_token_id, encoding);
    return added;
}
/*
 * Removes `attr` from the node's token via myhtml_token_attr_remove() and
 * returns that call's result. Returns NULL when the node or its token is NULL.
 */
myhtml_tree_attr_t * myhtml_attribute_remove(myhtml_tree_node_t *node, myhtml_tree_attr_t *attr)
{
    if(node != NULL && node->token != NULL)
        return myhtml_token_attr_remove(node->token, attr);

    return NULL;
}
/*
 * Removes the attribute with the given key from the node's token via
 * myhtml_token_attr_remove_by_name() and returns that call's result.
 * Returns NULL when the node or its token is NULL.
 */
myhtml_tree_attr_t * myhtml_attribute_remove_by_key(myhtml_tree_node_t *node, const char *key, size_t key_len)
{
    if(node != NULL && node->token != NULL)
        return myhtml_token_attr_remove_by_name(node->token, key, key_len);

    return NULL;
}
/*
 * Remove `attr` from `node` and release it.
 *
 * The attribute is first detached from the node's token and then handed
 * to myhtml_attribute_free(). Nothing happens when `node` is NULL or has
 * no token.
 */
void myhtml_attribute_delete(myhtml_tree_t *tree, myhtml_tree_node_t *node, myhtml_tree_attr_t *attr)
{
    if(node && node->token) {
        myhtml_token_attr_remove(node->token, attr);
        myhtml_attribute_free(tree, attr);
    }
}
/*
 * Release an attribute and the buffers of its key and value strings.
 *
 * Each string buffer that is present is returned through
 * mchar_async_free() using the allocator and node index recorded in that
 * string; the attribute object itself is then returned to the tree's
 * attribute object pool. `tree` and `attr` are dereferenced
 * unconditionally.
 */
void myhtml_attribute_free(myhtml_tree_t *tree, myhtml_tree_attr_t *attr)
{
    myhtml_string_t *key_str   = &attr->key;
    myhtml_string_t *value_str = &attr->value;

    if(key_str->data)
        mchar_async_free(key_str->mchar, key_str->node_idx, key_str->data);

    if(value_str->data)
        mchar_async_free(value_str->mchar, value_str->node_idx, value_str->data);

    mcobject_async_free(tree->token->attr_obj, attr);
}
/*
 * Report the raw position of the attribute key as a
 * {raw_key_begin, raw_key_length} pair.
 *
 * Returns {0, 0} when `attr` is NULL.
 */
myhtml_position_t myhtml_attribute_key_raw_position(myhtml_tree_attr_t *attr)
{
    if(attr == NULL)
        return (myhtml_position_t){0, 0};

    return (myhtml_position_t){attr->raw_key_begin, attr->raw_key_length};
}
/*
 * Report the raw position of the attribute value as a
 * {raw_value_begin, raw_value_length} pair.
 *
 * Returns {0, 0} when `attr` is NULL.
 */
myhtml_position_t myhtml_attribute_value_raw_position(myhtml_tree_attr_t *attr)
{
    if(attr == NULL)
        return (myhtml_position_t){0, 0};

    return (myhtml_position_t){attr->raw_value_begin, attr->raw_value_length};
}
/*
* Collections
*/
/*
 * Allocate an empty collection with room for `size` node pointers.
 *
 * size:   initial capacity of the node-pointer list.
 * status: optional out-parameter; receives MyHTML_STATUS_OK on success or
 *         MyHTML_STATUS_ERROR_MEMORY_ALLOCATION when either allocation
 *         fails.
 *
 * Returns the new collection (length 0), or NULL on allocation failure.
 * A partially built collection is freed before returning NULL.
 */
myhtml_collection_t * myhtml_collection_create(size_t size, myhtml_status_t *status)
{
    myhtml_status_t outcome = MyHTML_STATUS_ERROR_MEMORY_ALLOCATION;
    myhtml_collection_t *created = (myhtml_collection_t*)myhtml_malloc(sizeof(myhtml_collection_t));

    if(created != NULL) {
        created->size   = size;
        created->length = 0;
        created->list   = (myhtml_tree_node_t **)myhtml_malloc(sizeof(myhtml_tree_node_t*) * size);

        if(created->list != NULL) {
            outcome = MyHTML_STATUS_OK;
        }
        else {
            /* The list could not be allocated: drop the header too. */
            myhtml_free(created);
            created = NULL;
        }
    }

    if(status)
        *status = outcome;

    return created;
}
/*
 * Make sure the collection can take `need` more node pointers.
 *
 * collection:  collection to grow; dereferenced unconditionally.
 * need:        number of additional entries the caller is about to store.
 * upto_length: extra capacity to reserve on top of `need` when the list
 *              has to be reallocated.
 *
 * When length + need reaches the current capacity, the list is
 * reallocated to length + need + upto_length + 1 entries. Only the
 * capacity (`size`) and the list pointer change; `length` keeps counting
 * the entries actually stored.
 *
 * Returns MyHTML_STATUS_OK, or MyHTML_STATUS_ERROR_MEMORY_ALLOCATION when
 * the reallocation fails (the collection is left untouched in that case).
 */
myhtml_status_t myhtml_collection_check_size(myhtml_collection_t *collection, size_t need, size_t upto_length)
{
    if((collection->length + need) >= collection->size)
    {
        size_t tmp_size = collection->length + need + upto_length + 1;
        myhtml_tree_node_t **tmp = (myhtml_tree_node_t **)myhtml_realloc(collection->list, sizeof(myhtml_tree_node_t*) * tmp_size);

        if(tmp == NULL)
            return MyHTML_STATUS_ERROR_MEMORY_ALLOCATION;

        /* Do not touch collection->length here: upto_length is a reserve
           amount, not an element count. Overwriting length with it would
           either drop stored nodes or expose uninitialised slots. */
        collection->size = tmp_size;
        collection->list = tmp;
    }

    return MyHTML_STATUS_OK;
}
/*
 * Empty the collection by resetting its length to 0.
 *
 * The list buffer and its capacity are left as they are. A NULL
 * collection is ignored.
 */
void myhtml_collection_clean(myhtml_collection_t *collection)
{
    if(collection == NULL)
        return;

    collection->length = 0;
}
/*
 * Free a collection and its node-pointer list.
 *
 * Accepts NULL. Always returns NULL, so the result can be assigned back
 * to the caller's pointer.
 */
myhtml_collection_t * myhtml_collection_destroy(myhtml_collection_t *collection)
{
    if(collection != NULL) {
        if(collection->list)
            myhtml_free(collection->list);

        myhtml_free(collection);
    }

    return NULL;
}
/* queue */
/*
 * Attach `token` to the tree's current queue node, obtain a new current
 * queue node, and obtain a new current token node whose raw_begin and
 * element_begin are set to tree->global_offset + begin.
 *
 * tree:  tree whose current_qnode and current_token_node are advanced.
 * begin: offset, relative to tree->global_offset, stored as the start of
 *        the next token.
 * token: token to attach to the current queue node.
 */
void myhtml_queue_add(myhtml_tree_t *tree, size_t begin, myhtml_token_node_t* token)
{
mythread_queue_node_t *qnode = tree->current_qnode;
/* With SKIP_WHITESPACE_TOKEN set, a text token flagged as whitespace is not
   queued: it is cleaned, given the new start offset, and the function
   returns early (presumably so the same token node is reused for the
   next token; confirm against the tokenizer). */
if(tree->parse_flags & MyHTML_TREE_PARSE_FLAGS_SKIP_WHITESPACE_TOKEN) {
if(token && token->tag_id == MyHTML_TAG__TEXT && token->type & MyHTML_TOKEN_TYPE_WHITESPACE)
{
myhtml_token_node_clean(token);
token->raw_begin = token->element_begin = (tree->global_offset + begin);
return;
}
}
/* NOTE(review): qnode is dereferenced here without a NULL check, yet it is
   tested with if(qnode) further down; confirm whether current_qnode can
   ever be NULL at this point. */
qnode->token = token;
#ifndef MyHTML_BUILD_WITHOUT_THREADS
if(tree->flags & MyHTML_TREE_FLAGS_SINGLE_MODE) {
/* Single mode: the worker and stream handlers are called directly here,
   then the next queue node is taken from tree->queue. */
myhtml_parser_worker(0, qnode);
myhtml_parser_stream(0, qnode);
tree->current_qnode = mythread_queue_node_malloc_limit(tree->myhtml->thread, tree->queue, 4, NULL);
}
else {
/* Otherwise only the next queue node is taken, from tree->queue_entry;
   the handlers are not called from this function. */
tree->current_qnode = mythread_queue_node_malloc_round(tree->myhtml->thread, tree->queue_entry, NULL);
}
#else
/* Build without threads: same sequence as the single-mode branch above. */
myhtml_parser_worker(0, qnode);
myhtml_parser_stream(0, qnode);
tree->current_qnode = mythread_queue_node_malloc_limit(tree->myhtml->thread, tree->queue, 4, NULL);
#endif /* MyHTML_BUILD_WITHOUT_THREADS */
/* NOTE(review): the allocation result is used without a NULL check and the
   status argument is passed as NULL; confirm the allocators cannot fail. */
tree->current_qnode->tree = tree;
tree->current_qnode->prev = qnode;
if(qnode)
myhtml_tokenizer_calc_current_namespace(tree, token);
/* The next line reads tree->current_token_node, so this call is expected to
   store a new token node there (presumably a macro; verify its definition). */
myhtml_token_node_malloc(tree->token, tree->current_token_node, tree->token->mcasync_token_id);
tree->current_token_node->raw_begin = tree->current_token_node->element_begin = (tree->global_offset + begin);
}
/*
 * Compare the first `size` bytes of `ab` against `to_lowercase` after
 * folding ASCII 'A'..'Z' in `to_lowercase` to lower case.
 *
 * Only `to_lowercase` is folded; `ab` is compared as-is. NUL bytes are not
 * treated specially, so both buffers must hold at least `size` bytes.
 *
 * Returns true when all `size` bytes match (always true for size 0),
 * false at the first mismatch.
 */
bool myhtml_utils_strcmp(const char* ab, const char* to_lowercase, size_t size)
{
    size_t pos;

    for(pos = 0; pos < size; pos++) {
        const char raw = to_lowercase[pos];
        unsigned char folded = (unsigned char)raw;

        /* 0x41..0x5a is 'A'..'Z'; OR-ing 0x60 maps it onto 'a'..'z'. */
        if(raw > 0x40 && raw < 0x5b)
            folded = (unsigned char)(raw | 0x60);

        if(folded != (unsigned char)ab[pos])
            return false;
    }

    return true;
}
/*
 * Tell whether `node` is an element with tag `tag_id` in the HTML
 * namespace.
 *
 * Returns false for a NULL node.
 */
bool myhtml_is_html_node(myhtml_tree_node_t *node, myhtml_tag_id_t tag_id)
{
    return node != NULL
        && node->tag_id == tag_id
        && node->ns == MyHTML_NAMESPACE_HTML;
}
/* version */
myhtml_version_t myhtml_version(void)
{
return (myhtml_version_t){MyHTML_VERSION_MAJOR, MyHTML_VERSION_MINOR, MyHTML_VERSION_PATCH};
}