From 32876b578fce0abac1fc4c68c05c5389795e28da Mon Sep 17 00:00:00 2001 From: lexborisov Date: Fri, 17 Feb 2017 14:49:33 +0300 Subject: [PATCH] Little changes for api --- include/myhtml/api.h | 17 ++++++++++ include/myhtml/encoding.h | 1 + source/myhtml/api.h | 17 ++++++++++ source/myhtml/encoding.h | 1 + source/myhtml/encoding_detect.c | 5 ++- source/myhtml/url.h | 60 +++++++++++++++++++++++++++++++++ source/myhtml/utils/mhash.c | 4 +-- 7 files changed, 100 insertions(+), 5 deletions(-) create mode 100644 source/myhtml/url.h diff --git a/include/myhtml/api.h b/include/myhtml/api.h index e7eb4a6..138fa2f 100755 --- a/include/myhtml/api.h +++ b/include/myhtml/api.h @@ -2081,6 +2081,23 @@ myhtml_encoding_name_by_id(myhtml_encoding_t encoding, size_t *length); myhtml_encoding_t myhtml_encoding_prescan_stream_to_determine_encoding(const char *data, size_t data_size); +/** + * Extracting character encoding from string. Find "charset=" and see encoding. + * For example: "text/html; charset=windows-1251". 
Return MyHTML_ENCODING_WINDOWS_1251 + * + * + * See https://html.spec.whatwg.org/multipage/infrastructure.html#algorithm-for-extracting-a-character-encoding-from-a-meta-element + * + * @param[in] data + * @param[in] data length + * @param[out] return encoding + * + * @return true if encoding found + */ +bool +myhtml_encoding_extracting_character_encoding_from_charset(const char *data, size_t data_size, + myhtml_encoding_t *encoding); + /*********************************************************************************** * * MyHTML_STRING diff --git a/include/myhtml/encoding.h b/include/myhtml/encoding.h index 1ac3745..b9755fd 100644 --- a/include/myhtml/encoding.h +++ b/include/myhtml/encoding.h @@ -154,6 +154,7 @@ const myhtml_encoding_detect_name_entry_t * myhtml_encoding_name_entry_by_name(c bool myhtml_encoding_by_name(const char *name, size_t length, myhtml_encoding_t *encoding); const char * myhtml_encoding_name_by_id(myhtml_encoding_t encoding, size_t *length); +bool myhtml_encoding_extracting_character_encoding_from_charset(const char *data, size_t data_size, myhtml_encoding_t *encoding); myhtml_encoding_t myhtml_encoding_prescan_stream_to_determine_encoding(const char *data, size_t data_size); #ifdef __cplusplus diff --git a/source/myhtml/api.h b/source/myhtml/api.h index e7eb4a6..138fa2f 100755 --- a/source/myhtml/api.h +++ b/source/myhtml/api.h @@ -2081,6 +2081,23 @@ myhtml_encoding_name_by_id(myhtml_encoding_t encoding, size_t *length); myhtml_encoding_t myhtml_encoding_prescan_stream_to_determine_encoding(const char *data, size_t data_size); +/** + * Extracting character encoding from string. Find "charset=" and see encoding. + * For example: "text/html; charset=windows-1251". 
Return MyHTML_ENCODING_WINDOWS_1251 + * + * + * See https://html.spec.whatwg.org/multipage/infrastructure.html#algorithm-for-extracting-a-character-encoding-from-a-meta-element + * + * @param[in] data + * @param[in] data length + * @param[out] return encoding + * + * @return true if encoding found + */ +bool +myhtml_encoding_extracting_character_encoding_from_charset(const char *data, size_t data_size, + myhtml_encoding_t *encoding); + /*********************************************************************************** * * MyHTML_STRING diff --git a/source/myhtml/encoding.h b/source/myhtml/encoding.h index e575690..7dab732 100644 --- a/source/myhtml/encoding.h +++ b/source/myhtml/encoding.h @@ -154,6 +154,7 @@ const myhtml_encoding_detect_name_entry_t * myhtml_encoding_name_entry_by_name(c bool myhtml_encoding_by_name(const char *name, size_t length, myhtml_encoding_t *encoding); const char * myhtml_encoding_name_by_id(myhtml_encoding_t encoding, size_t *length); +bool myhtml_encoding_extracting_character_encoding_from_charset(const char *data, size_t data_size, myhtml_encoding_t *encoding); myhtml_encoding_t myhtml_encoding_prescan_stream_to_determine_encoding(const char *data, size_t data_size); #ifdef __cplusplus diff --git a/source/myhtml/encoding_detect.c b/source/myhtml/encoding_detect.c index 4dc0842..561bbf6 100644 --- a/source/myhtml/encoding_detect.c +++ b/source/myhtml/encoding_detect.c @@ -406,8 +406,7 @@ const char * myhtml_encoding_name_by_id(myhtml_encoding_t encoding, size_t *leng the user agent either runs out of bytes (meaning the position pointer created in the first step below goes beyond the end of the byte stream obtained so far) or reaches its end condition, then abort the prescan a byte stream to determine its encoding algorithm unsuccessfully. 
*/ - -bool myhtml_encoding_algorithm_extracting_character_encoding_from_meta_element(const char *data, size_t data_size, myhtml_encoding_t *encoding) +bool myhtml_encoding_extracting_character_encoding_from_charset(const char *data, size_t data_size, myhtml_encoding_t *encoding) { *encoding = MyHTML_ENCODING_NOT_DETERMINED; @@ -754,7 +753,7 @@ bool myhtml_encoding_prescan_stream_to_determine_encoding_check_meta(const unsig if((is_exists & 2) == 0) { is_exists |= 2; - if(myhtml_encoding_algorithm_extracting_character_encoding_from_meta_element((const char*)(&udata[ attr.value_begin ]), attr.value_length, encoding)) { + if(myhtml_encoding_extracting_character_encoding_from_charset((const char*)(&udata[ attr.value_begin ]), attr.value_length, encoding)) { need_pragma = 2; } } diff --git a/source/myhtml/url.h b/source/myhtml/url.h new file mode 100644 index 0000000..3f8130d --- /dev/null +++ b/source/myhtml/url.h @@ -0,0 +1,60 @@ +/* + Copyright (C) 2016 Alexander Borisov + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + + Author: lex.borisov@gmail.com (Alexander Borisov) +*/ + +#ifndef MyHTML_URL_H +#define MyHTML_URL_H +#pragma once + +#ifdef __cplusplus +extern "C" { +#endif + +#include "myhtml/myosi.h" +#include "myhtml/mystring.h" +#include "myhtml/url/scheme.h" + +typedef struct myhtml_url myhtml_url_t; + +struct myhtml_url { + const myhtml_url_scheme_entry_t* scheme; + + char* href; + char* origin; + char* protocol; + char* username; + char* password; + char* host; + char* hostname; + char* port; + char* pathname; + char* search; + char* hash; + + mchar_async_t* mchar; + size_t node_idx; +}; + +myhtml_status_t myhtml_url_parse(myhtml_url_t* url, mchar_async_t* mchar, size_t node_id); + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* MyHTML_URL_H */ diff --git a/source/myhtml/utils/mhash.c b/source/myhtml/utils/mhash.c index 1929f26..f798b5e 100644 --- a/source/myhtml/utils/mhash.c +++ b/source/myhtml/utils/mhash.c @@ -47,7 +47,7 @@ myhtml_status_t myhtml_utils_mhash_init(myhtml_utils_mhash_t* mhash, size_t tabl { mhash->mchar_obj = mchar_async_create(128, 4096); if(mhash->mchar_obj == NULL) - return MyHTML_STATUS_ATTR_ERROR_ALLOCATION; + return MyHTML_STATUS_ERROR_MEMORY_ALLOCATION; mhash->mchar_node = mchar_async_node_add(mhash->mchar_obj); @@ -56,7 +56,7 @@ myhtml_status_t myhtml_utils_mhash_init(myhtml_utils_mhash_t* mhash, size_t tabl mhash->table = myhtml_calloc(table_size, sizeof(myhtml_utils_mhash_entry_t*)); if(mhash->table == NULL) - return MyHTML_STATUS_ATTR_ERROR_ALLOCATION; + return MyHTML_STATUS_ERROR_MEMORY_ALLOCATION; if(max_depth < 1) max_depth = 1;