/*
 Mirror of https://github.com/lexborisov/Modest
 (synced 2024-11-22 05:41:32 +03:00; 322 lines, 15 KiB, C).
*/
/*
 Copyright (C) 2015-2017 Alexander Borisov
 
 This library is free software; you can redistribute it and/or
 modify it under the terms of the GNU Lesser General Public
 License as published by the Free Software Foundation; either
 version 2.1 of the License, or (at your option) any later version.
 
 This library is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 Lesser General Public License for more details.
 
 You should have received a copy of the GNU Lesser General Public
 License along with this library; if not, write to the Free Software
 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 
 Author: lex.borisov@gmail.com (Alexander Borisov)
*/

#ifndef MyHTML_MYOSI_H
#define MyHTML_MYOSI_H
#pragma once

/*
 * MyCORE_STR and mycore_string_t are used further down but are not declared
 * in this header; presumably they come from this include -- confirm.
 */
#include <mycore/myosi.h>

/* Numeric components of the MyHTML version. */
#define MyHTML_VERSION_MAJOR 4
#define MyHTML_VERSION_MINOR 0
#define MyHTML_VERSION_PATCH 5

/*
 * Version as adjacent string literals: MAJOR "." MINOR "." PATCH.
 * MyCORE_STR is not declared in this file; this assumes it expands its
 * argument before stringifying it (giving "4.0.5") -- confirm against its
 * definition.
 */
#define MyHTML_VERSION_STRING MyCORE_STR(MyHTML_VERSION_MAJOR) MyCORE_STR(.) MyCORE_STR(MyHTML_VERSION_MINOR) MyCORE_STR(.) MyCORE_STR(MyHTML_VERSION_PATCH)

#ifdef __cplusplus
extern "C" {
#endif

/*
 * Evaluates to non-zero when _status_ is anything other than
 * MyHTML_STATUS_OK. The argument is evaluated exactly once.
 */
#define MyHTML_FAILED(_status_) ((_status_) != MyHTML_STATUS_OK)

// char references
/* Opaque here: only the struct tag is declared, the layout lives elsewhere. */
typedef struct myhtml_data_process_entry myhtml_data_process_entry_t;

// tree
/*
 * Tree flags. Every non-zero value is a distinct single bit, so values can
 * be combined with bitwise OR and tested with bitwise AND.
 */
enum myhtml_tree_flags {
    MyHTML_TREE_FLAGS_CLEAN = 0x000,
    MyHTML_TREE_FLAGS_SCRIPT = 0x001,
    MyHTML_TREE_FLAGS_FRAMESET_OK = 0x002,
    MyHTML_TREE_FLAGS_IFRAME_SRCDOC = 0x004,
    MyHTML_TREE_FLAGS_ALREADY_STARTED = 0x008,
    MyHTML_TREE_FLAGS_SINGLE_MODE = 0x010,
    MyHTML_TREE_FLAGS_PARSE_END = 0x020,
    MyHTML_TREE_FLAGS_PARSE_FLAG = 0x040,
    MyHTML_TREE_FLAGS_PARSE_FLAG_EMIT_NEWLINE = 0x080
};

/*
 * Tree parse flags, also exposed as myhtml_tree_parse_flags_t.
 *
 * WITHOUT_PROCESS_TOKEN is 0x003, i.e. 0x002 combined with the
 * WITHOUT_BUILD_TREE bit (0x001), so a mask test for WITHOUT_BUILD_TREE
 * also succeeds when WITHOUT_PROCESS_TOKEN is set.
 */
typedef enum myhtml_tree_parse_flags {
    MyHTML_TREE_PARSE_FLAGS_CLEAN = 0x000,
    MyHTML_TREE_PARSE_FLAGS_WITHOUT_BUILD_TREE = 0x001,
    MyHTML_TREE_PARSE_FLAGS_WITHOUT_PROCESS_TOKEN = 0x003,
    MyHTML_TREE_PARSE_FLAGS_SKIP_WHITESPACE_TOKEN = 0x004, /* skip ws token, but not for RCDATA, RAWTEXT, CDATA and PLAINTEXT */
    MyHTML_TREE_PARSE_FLAGS_WITHOUT_DOCTYPE_IN_TREE = 0x008
} myhtml_tree_parse_flags_t;

/*
 * Forward declarations for the tree types. Only the struct tags are
 * introduced here, so in this header the types can be used through
 * pointers only.
 */
typedef struct myhtml_tree_temp_tag_name myhtml_tree_temp_tag_name_t;
typedef struct myhtml_tree_insertion_list myhtml_tree_insertion_list_t;
typedef struct myhtml_tree_token_list myhtml_tree_token_list_t;
typedef struct myhtml_tree_list myhtml_tree_list_t;
typedef struct myhtml_tree_doctype myhtml_tree_doctype_t;
typedef struct myhtml_async_args myhtml_async_args_t;

typedef struct myhtml_tree_node myhtml_tree_node_t;
typedef struct myhtml_tree myhtml_tree_t;

// token
/*
 * Token type flags. OPEN is zero; every other value is a distinct single
 * bit, so several types can be OR-ed into one mask.
 */
enum myhtml_token_type {
    MyHTML_TOKEN_TYPE_OPEN = 0x000,
    MyHTML_TOKEN_TYPE_CLOSE = 0x001,
    MyHTML_TOKEN_TYPE_CLOSE_SELF = 0x002,
    MyHTML_TOKEN_TYPE_DONE = 0x004,
    MyHTML_TOKEN_TYPE_WHITESPACE = 0x008,
    MyHTML_TOKEN_TYPE_RCDATA = 0x010,
    MyHTML_TOKEN_TYPE_RAWTEXT = 0x020,
    MyHTML_TOKEN_TYPE_SCRIPT = 0x040,
    MyHTML_TOKEN_TYPE_PLAINTEXT = 0x080,
    MyHTML_TOKEN_TYPE_CDATA = 0x100,
    MyHTML_TOKEN_TYPE_DATA = 0x200,
    MyHTML_TOKEN_TYPE_COMMENT = 0x400,
    MyHTML_TOKEN_TYPE_NULL = 0x800
};

/* Index types for tokens and token attributes; both are plain size_t. */
typedef size_t myhtml_token_index_t;
typedef size_t myhtml_token_attr_index_t;

/* Forward declarations for the token types (struct tags only). */
typedef struct myhtml_token_replacement_entry myhtml_token_replacement_entry_t;
typedef struct myhtml_token_namespace_replacement myhtml_token_namespace_replacement_t;

typedef struct myhtml_token_attr myhtml_token_attr_t;
typedef struct myhtml_token_node myhtml_token_node_t;
typedef struct myhtml_token myhtml_token_t;

// tags
/*
 * Tag category flags. UNDEF is zero; every other value is a distinct single
 * bit, so one tag can carry several categories in one mask.
 */
enum myhtml_tag_categories {
    MyHTML_TAG_CATEGORIES_UNDEF = 0x000,
    MyHTML_TAG_CATEGORIES_ORDINARY = 0x001,
    MyHTML_TAG_CATEGORIES_SPECIAL = 0x002,
    MyHTML_TAG_CATEGORIES_FORMATTING = 0x004,
    MyHTML_TAG_CATEGORIES_SCOPE = 0x008,
    MyHTML_TAG_CATEGORIES_SCOPE_LIST_ITEM = 0x010,
    MyHTML_TAG_CATEGORIES_SCOPE_BUTTON = 0x020,
    MyHTML_TAG_CATEGORIES_SCOPE_TABLE = 0x040,
    MyHTML_TAG_CATEGORIES_SCOPE_SELECT = 0x080
};

/* Forward declarations for the tag index types (struct tags only). */
typedef struct myhtml_tag_index_node myhtml_tag_index_node_t;
typedef struct myhtml_tag_index_entry myhtml_tag_index_entry_t;
typedef struct myhtml_tag_index myhtml_tag_index_t;

/* Tag identifier; a plain size_t. */
typedef size_t myhtml_tag_id_t;

typedef struct myhtml_tag myhtml_tag_t;

// stream
/* Forward declarations for the stream buffer types (struct tags only). */
typedef struct myhtml_stream_buffer_entry myhtml_stream_buffer_entry_t;
typedef struct myhtml_stream_buffer myhtml_stream_buffer_t;

// parse
/*
 * Tokenizer states, numbered consecutively from 0x000 to 0x045.
 *
 * FIRST_ENTRY aliases DATA (0x000) and LAST_ENTRY (0x046) is one past the
 * highest real state, so [FIRST_ENTRY, LAST_ENTRY) spans every state.
 * NOTE(review): the dense numbering suggests the values index a state
 * table -- confirm against the tokenizer before renumbering anything.
 */
enum myhtml_tokenizer_state {
    MyHTML_TOKENIZER_STATE_DATA = 0x000,
    MyHTML_TOKENIZER_STATE_CHARACTER_REFERENCE_IN_DATA = 0x001,
    MyHTML_TOKENIZER_STATE_RCDATA = 0x002,
    MyHTML_TOKENIZER_STATE_CHARACTER_REFERENCE_IN_RCDATA = 0x003,
    MyHTML_TOKENIZER_STATE_RAWTEXT = 0x004,
    MyHTML_TOKENIZER_STATE_SCRIPT_DATA = 0x005,
    MyHTML_TOKENIZER_STATE_PLAINTEXT = 0x006,
    MyHTML_TOKENIZER_STATE_TAG_OPEN = 0x007,
    MyHTML_TOKENIZER_STATE_END_TAG_OPEN = 0x008,
    MyHTML_TOKENIZER_STATE_TAG_NAME = 0x009,
    MyHTML_TOKENIZER_STATE_RCDATA_LESS_THAN_SIGN = 0x00a,
    MyHTML_TOKENIZER_STATE_RCDATA_END_TAG_OPEN = 0x00b,
    MyHTML_TOKENIZER_STATE_RCDATA_END_TAG_NAME = 0x00c,
    MyHTML_TOKENIZER_STATE_RAWTEXT_LESS_THAN_SIGN = 0x00d,
    MyHTML_TOKENIZER_STATE_RAWTEXT_END_TAG_OPEN = 0x00e,
    MyHTML_TOKENIZER_STATE_RAWTEXT_END_TAG_NAME = 0x00f,
    MyHTML_TOKENIZER_STATE_SCRIPT_DATA_LESS_THAN_SIGN = 0x010,
    MyHTML_TOKENIZER_STATE_SCRIPT_DATA_END_TAG_OPEN = 0x011,
    MyHTML_TOKENIZER_STATE_SCRIPT_DATA_END_TAG_NAME = 0x012,
    MyHTML_TOKENIZER_STATE_SCRIPT_DATA_ESCAPE_START = 0x013,
    MyHTML_TOKENIZER_STATE_SCRIPT_DATA_ESCAPE_START_DASH = 0x014,
    MyHTML_TOKENIZER_STATE_SCRIPT_DATA_ESCAPED = 0x015,
    MyHTML_TOKENIZER_STATE_SCRIPT_DATA_ESCAPED_DASH = 0x016,
    MyHTML_TOKENIZER_STATE_SCRIPT_DATA_ESCAPED_DASH_DASH = 0x017,
    MyHTML_TOKENIZER_STATE_SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN = 0x018,
    MyHTML_TOKENIZER_STATE_SCRIPT_DATA_ESCAPED_END_TAG_OPEN = 0x019,
    MyHTML_TOKENIZER_STATE_SCRIPT_DATA_ESCAPED_END_TAG_NAME = 0x01a,
    MyHTML_TOKENIZER_STATE_SCRIPT_DATA_DOUBLE_ESCAPE_START = 0x01b,
    MyHTML_TOKENIZER_STATE_SCRIPT_DATA_DOUBLE_ESCAPED = 0x01c,
    MyHTML_TOKENIZER_STATE_SCRIPT_DATA_DOUBLE_ESCAPED_DASH = 0x01d,
    MyHTML_TOKENIZER_STATE_SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH = 0x01e,
    MyHTML_TOKENIZER_STATE_SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN = 0x01f,
    MyHTML_TOKENIZER_STATE_SCRIPT_DATA_DOUBLE_ESCAPE_END = 0x020,
    MyHTML_TOKENIZER_STATE_BEFORE_ATTRIBUTE_NAME = 0x021,
    MyHTML_TOKENIZER_STATE_ATTRIBUTE_NAME = 0x022,
    MyHTML_TOKENIZER_STATE_AFTER_ATTRIBUTE_NAME = 0x023,
    MyHTML_TOKENIZER_STATE_BEFORE_ATTRIBUTE_VALUE = 0x024,
    MyHTML_TOKENIZER_STATE_ATTRIBUTE_VALUE_DOUBLE_QUOTED = 0x025,
    MyHTML_TOKENIZER_STATE_ATTRIBUTE_VALUE_SINGLE_QUOTED = 0x026,
    MyHTML_TOKENIZER_STATE_ATTRIBUTE_VALUE_UNQUOTED = 0x027,
    MyHTML_TOKENIZER_STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE = 0x028,
    MyHTML_TOKENIZER_STATE_AFTER_ATTRIBUTE_VALUE_QUOTED = 0x029,
    MyHTML_TOKENIZER_STATE_SELF_CLOSING_START_TAG = 0x02a,
    MyHTML_TOKENIZER_STATE_BOGUS_COMMENT = 0x02b,
    MyHTML_TOKENIZER_STATE_MARKUP_DECLARATION_OPEN = 0x02c,
    MyHTML_TOKENIZER_STATE_COMMENT_START = 0x02d,
    MyHTML_TOKENIZER_STATE_COMMENT_START_DASH = 0x02e,
    MyHTML_TOKENIZER_STATE_COMMENT = 0x02f,
    MyHTML_TOKENIZER_STATE_COMMENT_END_DASH = 0x030,
    MyHTML_TOKENIZER_STATE_COMMENT_END = 0x031,
    MyHTML_TOKENIZER_STATE_COMMENT_END_BANG = 0x032,
    MyHTML_TOKENIZER_STATE_DOCTYPE = 0x033,
    MyHTML_TOKENIZER_STATE_BEFORE_DOCTYPE_NAME = 0x034,
    MyHTML_TOKENIZER_STATE_DOCTYPE_NAME = 0x035,
    MyHTML_TOKENIZER_STATE_AFTER_DOCTYPE_NAME = 0x036,
    MyHTML_TOKENIZER_STATE_AFTER_DOCTYPE_PUBLIC_KEYWORD = 0x037,
    MyHTML_TOKENIZER_STATE_BEFORE_DOCTYPE_PUBLIC_IDENTIFIER = 0x038,
    MyHTML_TOKENIZER_STATE_DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED = 0x039,
    MyHTML_TOKENIZER_STATE_DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED = 0x03a,
    MyHTML_TOKENIZER_STATE_AFTER_DOCTYPE_PUBLIC_IDENTIFIER = 0x03b,
    MyHTML_TOKENIZER_STATE_BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS = 0x03c,
    MyHTML_TOKENIZER_STATE_AFTER_DOCTYPE_SYSTEM_KEYWORD = 0x03d,
    MyHTML_TOKENIZER_STATE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER = 0x03e,
    MyHTML_TOKENIZER_STATE_DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED = 0x03f,
    MyHTML_TOKENIZER_STATE_DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED = 0x040,
    MyHTML_TOKENIZER_STATE_AFTER_DOCTYPE_SYSTEM_IDENTIFIER = 0x041,
    MyHTML_TOKENIZER_STATE_BOGUS_DOCTYPE = 0x042,
    MyHTML_TOKENIZER_STATE_CDATA_SECTION = 0x043,
    MyHTML_TOKENIZER_STATE_CUSTOM_AFTER_DOCTYPE_NAME_A_Z = 0x044,
    MyHTML_TOKENIZER_STATE_PARSE_ERROR_STOP = 0x045,
    MyHTML_TOKENIZER_STATE_FIRST_ENTRY = MyHTML_TOKENIZER_STATE_DATA,
    MyHTML_TOKENIZER_STATE_LAST_ENTRY = 0x046
};

/*
 * Tree-construction insertion modes, numbered consecutively from 0x000 to
 * 0x016. LAST_ENTRY (0x017) is one past the highest real mode.
 */
enum myhtml_insertion_mode {
    MyHTML_INSERTION_MODE_INITIAL = 0x000,
    MyHTML_INSERTION_MODE_BEFORE_HTML = 0x001,
    MyHTML_INSERTION_MODE_BEFORE_HEAD = 0x002,
    MyHTML_INSERTION_MODE_IN_HEAD = 0x003,
    MyHTML_INSERTION_MODE_IN_HEAD_NOSCRIPT = 0x004,
    MyHTML_INSERTION_MODE_AFTER_HEAD = 0x005,
    MyHTML_INSERTION_MODE_IN_BODY = 0x006,
    MyHTML_INSERTION_MODE_TEXT = 0x007,
    MyHTML_INSERTION_MODE_IN_TABLE = 0x008,
    MyHTML_INSERTION_MODE_IN_TABLE_TEXT = 0x009,
    MyHTML_INSERTION_MODE_IN_CAPTION = 0x00a,
    MyHTML_INSERTION_MODE_IN_COLUMN_GROUP = 0x00b,
    MyHTML_INSERTION_MODE_IN_TABLE_BODY = 0x00c,
    MyHTML_INSERTION_MODE_IN_ROW = 0x00d,
    MyHTML_INSERTION_MODE_IN_CELL = 0x00e,
    MyHTML_INSERTION_MODE_IN_SELECT = 0x00f,
    MyHTML_INSERTION_MODE_IN_SELECT_IN_TABLE = 0x010,
    MyHTML_INSERTION_MODE_IN_TEMPLATE = 0x011,
    MyHTML_INSERTION_MODE_AFTER_BODY = 0x012,
    MyHTML_INSERTION_MODE_IN_FRAMESET = 0x013,
    MyHTML_INSERTION_MODE_AFTER_FRAMESET = 0x014,
    MyHTML_INSERTION_MODE_AFTER_AFTER_BODY = 0x015,
    MyHTML_INSERTION_MODE_AFTER_AFTER_FRAMESET = 0x016,
    MyHTML_INSERTION_MODE_LAST_ENTRY = 0x017
};

// base
/*
 Very important!!!
 See mycore/myosi.h:mystatus_t
*/
/*
 * Status codes, also exposed as myhtml_status_t. OK is zero, which is what
 * MyHTML_FAILED compares against. The module-specific codes are grouped by
 * name prefix into separate numeric ranges (RULES 0x9064, TOKENIZER 0x912c+,
 * TAGS 0x9190+, TREE 0x91f4+, ATTR 0x9258+, STREAM_BUFFER 0x9300+).
 * NOTE(review): the reference above implies these values must stay
 * compatible with mystatus_t -- confirm before adding or renumbering codes.
 */
typedef enum myhtml_status {
    MyHTML_STATUS_OK = 0x0000,
    MyHTML_STATUS_ERROR = 0x0001,
    MyHTML_STATUS_ERROR_MEMORY_ALLOCATION = 0x0002,
    MyHTML_STATUS_RULES_ERROR_MEMORY_ALLOCATION = 0x9064,
    MyHTML_STATUS_TOKENIZER_ERROR_MEMORY_ALLOCATION = 0x912c,
    MyHTML_STATUS_TOKENIZER_ERROR_FRAGMENT_INIT = 0x912d,
    MyHTML_STATUS_TAGS_ERROR_MEMORY_ALLOCATION = 0x9190,
    MyHTML_STATUS_TAGS_ERROR_MCOBJECT_CREATE = 0x9191,
    MyHTML_STATUS_TAGS_ERROR_MCOBJECT_MALLOC = 0x9192,
    MyHTML_STATUS_TAGS_ERROR_MCOBJECT_CREATE_NODE = 0x9193,
    MyHTML_STATUS_TAGS_ERROR_CACHE_MEMORY_ALLOCATION = 0x9194,
    MyHTML_STATUS_TAGS_ERROR_INDEX_MEMORY_ALLOCATION = 0x9195,
    MyHTML_STATUS_TREE_ERROR_MEMORY_ALLOCATION = 0x91f4,
    MyHTML_STATUS_TREE_ERROR_MCOBJECT_CREATE = 0x91f5,
    MyHTML_STATUS_TREE_ERROR_MCOBJECT_INIT = 0x91f6,
    MyHTML_STATUS_TREE_ERROR_MCOBJECT_CREATE_NODE = 0x91f7,
    MyHTML_STATUS_TREE_ERROR_INCOMING_BUFFER_CREATE = 0x91f8,
    MyHTML_STATUS_ATTR_ERROR_ALLOCATION = 0x9258,
    MyHTML_STATUS_ATTR_ERROR_CREATE = 0x9259,
    MyHTML_STATUS_STREAM_BUFFER_ERROR_CREATE = 0x9300,
    MyHTML_STATUS_STREAM_BUFFER_ERROR_INIT = 0x9301,
    MyHTML_STATUS_STREAM_BUFFER_ENTRY_ERROR_CREATE = 0x9302,
    MyHTML_STATUS_STREAM_BUFFER_ENTRY_ERROR_INIT = 0x9303,
    MyHTML_STATUS_STREAM_BUFFER_ERROR_ADD_ENTRY = 0x9304
} myhtml_status_t;

/*
 * Namespace identifiers, also exposed as myhtml_namespace_t.
 * ANY and LAST_ENTRY deliberately share the value 0x07, one past the last
 * concrete namespace (XMLNS, 0x06).
 */
typedef enum myhtml_namespace {
    MyHTML_NAMESPACE_UNDEF = 0x00,
    MyHTML_NAMESPACE_HTML = 0x01,
    MyHTML_NAMESPACE_MATHML = 0x02,
    MyHTML_NAMESPACE_SVG = 0x03,
    MyHTML_NAMESPACE_XLINK = 0x04,
    MyHTML_NAMESPACE_XML = 0x05,
    MyHTML_NAMESPACE_XMLNS = 0x06,

    /* MyHTML_NAMESPACE_ANY == MyHTML_NAMESPACE_LAST_ENTRY */
    MyHTML_NAMESPACE_ANY = 0x07,
    MyHTML_NAMESPACE_LAST_ENTRY = 0x07
} myhtml_namespace_t;

/*
 * Parser options. DEFAULT is zero; each PARSE_MODE_* value is a distinct
 * single bit.
 */
enum myhtml_options {
    MyHTML_OPTIONS_DEFAULT = 0x00,
    MyHTML_OPTIONS_PARSE_MODE_SINGLE = 0x01,
    MyHTML_OPTIONS_PARSE_MODE_ALL_IN_ONE = 0x02,
    MyHTML_OPTIONS_PARSE_MODE_SEPARATELY = 0x04
};

/* A span described by its start offset and its length; also myhtml_position_t. */
typedef struct myhtml_position {
    size_t begin;
    size_t length;
} myhtml_position_t;

/* Version triple (major, minor, patch); also exposed as myhtml_version_t. */
typedef struct myhtml_version {
    int major;
    int minor;
    int patch;
} myhtml_version_t;

/* Tree attributes are the same type as token attributes (plain alias). */
typedef myhtml_token_attr_t myhtml_tree_attr_t;
/* Forward declarations (struct tags only). */
typedef struct myhtml_collection myhtml_collection_t;
typedef struct myhtml myhtml_t;

// parser state function
|
|
typedef size_t (*myhtml_tokenizer_state_f)(myhtml_tree_t* tree, myhtml_token_node_t* token_node, const char* html, size_t html_offset, size_t html_size);
|
|
|
|
// parser insertion mode function
|
|
typedef bool (*myhtml_insertion_f)(myhtml_tree_t* tree, myhtml_token_node_t* token);
|
|
|
|
// char references state
|
|
typedef size_t (*myhtml_data_process_state_f)(myhtml_data_process_entry_t* charef, mycore_string_t* str, const char* data, size_t offset, size_t size);
|
|
|
|
// callback functions
|
|
typedef void* (*myhtml_callback_token_f)(myhtml_tree_t* tree, myhtml_token_node_t* token, void* ctx);
|
|
typedef void (*myhtml_callback_tree_node_f)(myhtml_tree_t* tree, myhtml_tree_node_t* node, void* ctx);
|
|
|
|
// find attribute value functions
|
|
typedef bool (*myhtml_attribute_value_find_f)(mycore_string_t* str_key, const char* value, size_t value_len);
|
|
|
|
#ifdef __cplusplus
} /* extern "C" */
#endif

#endif /* MyHTML_MYOSI_H */