166 lines
4.8 KiB
C
166 lines
4.8 KiB
C
/*
|
|
* Copyright 2005 John M Bell <jmb202@ecs.soton.ac.uk>
|
|
*
|
|
* This file is part of NetSurf, http://www.netsurf-browser.org/
|
|
*
|
|
* NetSurf is free software; you can redistribute it and/or modify
|
|
* it under the terms of the GNU General Public License as published by
|
|
* the Free Software Foundation; version 2 of the License.
|
|
*
|
|
* NetSurf is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
* GNU General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU General Public License
|
|
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
*/
|
|
|
|
/** \file
|
|
* UTF-8 manipulation functions (interface).
|
|
*/
|
|
|
|
#ifndef _NETSURF_UTILS_UTF8_H_
|
|
#define _NETSURF_UTILS_UTF8_H_
|
|
|
|
#include <stdbool.h>
|
|
#include <stdint.h>
|
|
|
|
#include "utils/errors.h"
|
|
|
|
/**
|
|
* Convert a UTF-8 multibyte sequence into a single UCS4 character
|
|
*
|
|
* Encoding of UCS values outside the UTF-16 plane has been removed from
|
|
* RFC3629. This function conforms to RFC2279, however.
|
|
*
|
|
* \param[in] s The sequence to process
|
|
* \param[in] l Length of sequence
|
|
* \return UCS4 character
|
|
*/
|
|
uint32_t utf8_to_ucs4(const char *s, size_t l);
|
|
|
|
/**
|
|
* Convert a single UCS4 character into a UTF-8 multibyte sequence
|
|
*
|
|
* Encoding of UCS values outside the UTF-16 plane has been removed from
|
|
* RFC3629. This function conforms to RFC2279, however.
|
|
*
|
|
* \param c The character to process (0 <= c <= 0x7FFFFFFF)
|
|
* \param s Pointer to 6 byte long output buffer
|
|
* \return Length of multibyte sequence
|
|
*/
|
|
size_t utf8_from_ucs4(uint32_t c, char *s);
|
|
|
|
|
|
/**
|
|
* Calculate the length (in characters) of a NULL-terminated UTF-8 string
|
|
*
|
|
* \param s The string
|
|
* \return Length of string
|
|
*/
|
|
size_t utf8_length(const char *s);
|
|
|
|
/**
|
|
* Calculated the length (in characters) of a bounded UTF-8 string
|
|
*
|
|
* \param s The string
|
|
* \param l Maximum length of input (in bytes)
|
|
* \return Length of string, in characters
|
|
*/
|
|
size_t utf8_bounded_length(const char *s, size_t l);
|
|
|
|
/**
|
|
* Calculate the length (in bytes) of a bounded UTF-8 string
|
|
*
|
|
* \param s The string
|
|
* \param l Maximum length of input (in bytes)
|
|
* \param c Maximum number of characters to measure
|
|
* \return Length of string, in bytes
|
|
*/
|
|
size_t utf8_bounded_byte_length(const char *s, size_t l, size_t c);
|
|
|
|
/**
|
|
* Calculate the length (in bytes) of a UTF-8 character
|
|
*
|
|
* \param s Pointer to start of character
|
|
* \return Length of character, in bytes
|
|
*/
|
|
size_t utf8_char_byte_length(const char *s);
|
|
|
|
|
|
/**
|
|
* Find previous legal UTF-8 char in string
|
|
*
|
|
* \param s The string
|
|
* \param o Offset in the string to start at
|
|
* \return Offset of first byte of previous legal character
|
|
*/
|
|
size_t utf8_prev(const char *s, size_t o);
|
|
|
|
/**
|
|
* Find next legal UTF-8 char in string
|
|
*
|
|
* \param s The string
|
|
* \param l Maximum offset in string
|
|
* \param o Offset in the string to start at
|
|
* \return Offset of first byte of next legal character
|
|
*/
|
|
size_t utf8_next(const char *s, size_t l, size_t o);
|
|
|
|
|
|
/**
|
|
* Convert a UTF8 string into the named encoding
|
|
*
|
|
* \param string The NULL-terminated string to convert
|
|
* \param encname The encoding name (suitable for passing to iconv)
|
|
* \param len Length of input string to consider (in bytes), or 0
|
|
* \param result Pointer to location to store result (allocated on heap)
|
|
* \return standard nserror value
|
|
*/
|
|
nserror utf8_to_enc(const char *string, const char *encname,
|
|
size_t len, char **result);
|
|
|
|
/**
|
|
* Convert a string in the named encoding into a UTF-8 string
|
|
*
|
|
* \param string The NULL-terminated string to convert
|
|
* \param encname The encoding name (suitable for passing to iconv)
|
|
* \param len Length of input string to consider (in bytes), or 0
|
|
* \param result Pointer to location to store result (allocated on heap)
|
|
* \param result_len The length of the data placed in result.
|
|
* \return standard nserror value
|
|
*/
|
|
nserror utf8_from_enc(const char *string, const char *encname,
|
|
size_t len, char **result, size_t *result_len);
|
|
|
|
/**
|
|
* Convert a UTF-8 encoded string into a string of the given encoding,
|
|
* applying HTML escape sequences where necessary.
|
|
*
|
|
* \param string String to convert (NUL-terminated)
|
|
* \param encname Name of encoding to convert to
|
|
* \param len Length, in bytes, of the input string, or 0
|
|
* \param result Pointer to location to receive result
|
|
* \return standard nserror code
|
|
*/
|
|
nserror utf8_to_html(const char *string, const char *encname,
|
|
size_t len, char **result);
|
|
|
|
/**
|
|
* Save the given utf8 text to a file, converting to local encoding.
|
|
*
|
|
* \param utf8_text text to save to file
|
|
* \param path pathname to save to
|
|
* \return true iff the save succeeded
|
|
*/
|
|
bool utf8_save_text(const char *utf8_text, const char *path);
|
|
|
|
|
|
/**
|
|
* Finalise the UTF-8 library
|
|
*/
|
|
nserror utf8_finalise(void);
|
|
|
|
#endif
|