mc/lib/strutil/strutilutf8.c
Andrew Borodin 7257f794d2 Update template for .c files.
Add section for forward declarations of local functions. This section is
located before file scope variables because functions can be used in
structures (see find.c for example):

/*** forward declarations (file scope functions) *************************************************/

/* button callbacks */
static int start_stop (WButton * button, int action);
static int find_do_view_file (WButton * button, int action);
static int find_do_edit_file (WButton * button, int action);

/*** file scope variables ************************************************************************/

static struct
{
    ...
    bcback_fn callback;
} fbuts[] =
{
    ...
    { B_STOP, NORMAL_BUTTON, N_("S&uspend"), 0, 0, NULL, start_stop },
    ...
    { B_VIEW, NORMAL_BUTTON, N_("&View - F3"), 0, 0, NULL, find_do_view_file },
    { B_VIEW, NORMAL_BUTTON, N_("&Edit - F4"), 0, 0, NULL, find_do_edit_file }
};

Signed-off-by: Andrew Borodin <aborodin@vmail.ru>
2023-03-19 20:34:24 +03:00

1522 lines
40 KiB
C

/*
UTF-8 strings utilities
Copyright (C) 2007-2023
Free Software Foundation, Inc.
Written by:
Rostislav Benes, 2007
This file is part of the Midnight Commander.
The Midnight Commander is free software: you can redistribute it
and/or modify it under the terms of the GNU General Public License as
published by the Free Software Foundation, either version 3 of the License,
or (at your option) any later version.
The Midnight Commander is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include <config.h>
#include <stdlib.h>
#include <langinfo.h>
#include <limits.h> /* MB_LEN_MAX */
#include <string.h>
#include "lib/global.h"
#include "lib/strutil.h"
/* UTF-8 handling in this file relies on functions provided by GLib */
/*** global variables ****************************************************************************/
/*** file scope macro definitions ****************************************************************/
/*** file scope type declarations ****************************************************************/
/* Cursor state shared by the utf8_tool_* helpers while they assemble an output string. */
struct utf8_tool
{
char *actual;                   /* current write position in the output buffer */
size_t remain;                  /* number of bytes still free at 'actual' */
const char *checked;            /* current read position in the source text */
int ident;                      /* running column counter; also used by callers as a padding amount */
gboolean compose;               /* set when a combining mark has been copied to the output */
};
/* Result of str_utf8_make_make_term_form(): sanitized text plus its measurements. */
struct term_form
{
char text[BUF_MEDIUM * MB_LEN_MAX];     /* NUL-terminated, valid and printable UTF-8 text */
size_t width;                   /* number of terminal columns counted for 'text' */
gboolean compose;               /* TRUE if 'text' contains at least one combining mark */
};
/*** forward declarations (file scope functions) *************************************************/
/*** file scope variables ************************************************************************/
/* UTF-8 encoding of U+FFFD (REPLACEMENT CHARACTER), emitted in place of undecodable bytes */
static const char replch[] = "\xEF\xBF\xBD";
/* --------------------------------------------------------------------------------------------- */
/*** file scope functions ************************************************************************/
/* --------------------------------------------------------------------------------------------- */
/**
 * Tell whether a code point is a mark.
 *
 * @param uni code point to classify
 * @return TRUE if GLib classifies @uni as a spacing, enclosing or non-spacing mark,
 *         FALSE otherwise
 */
static gboolean
str_unichar_iscombiningmark (gunichar uni)
{
    switch (g_unichar_type (uni))
    {
    case G_UNICODE_SPACING_MARK:
    case G_UNICODE_ENCLOSING_MARK:
    case G_UNICODE_NON_SPACING_MARK:
        return TRUE;
    default:
        return FALSE;
    }
}
/* --------------------------------------------------------------------------------------------- */
/**
 * Append the replacement-character sequence (replch) to a GString.
 *
 * @param buffer string that receives the sequence
 */
static void
str_utf8_insert_replace_char (GString * buffer)
{
    const char *const replacement = replch;

    g_string_append (buffer, replacement);
}
/* --------------------------------------------------------------------------------------------- */
/**
 * Check a whole NUL-terminated string for UTF-8 validity.
 *
 * @param text NUL-terminated string to check
 * @return result of g_utf8_validate() over the complete string
 */
static gboolean
str_utf8_is_valid_string (const char *text)
{
    /* -1: the string is NUL-terminated; NULL: position of the first error is not needed */
    const gboolean valid = g_utf8_validate (text, -1, NULL);

    return valid;
}
/* --------------------------------------------------------------------------------------------- */
/**
 * Classify the character starting at @ch.
 *
 * @param ch   pointer to the first byte of the character
 * @param size value handed to g_utf8_get_char_validated() as its length limit
 * @return 1 if a character was decoded; -1 or -2 when GLib reports the
 *         corresponding (gunichar) -1 / (gunichar) -2 error marker
 */
static int
str_utf8_is_valid_char (const char *ch, size_t size)
{
    const gunichar decoded = g_utf8_get_char_validated (ch, size);

    if (decoded == (gunichar) (-2))
        return (-2);
    if (decoded == (gunichar) (-1))
        return (-1);
    return 1;
}
/* --------------------------------------------------------------------------------------------- */
/**
 * Move *text forward by one UTF-8 character (no validation is performed).
 *
 * @param text in/out cursor into a UTF-8 string
 */
static void
str_utf8_cnext_char (const char **text)
{
    const char *cursor = *text;

    cursor = g_utf8_next_char (cursor);
    *text = cursor;
}
/* --------------------------------------------------------------------------------------------- */
/**
 * Move *text backward by one UTF-8 character (no validation is performed).
 *
 * @param text in/out cursor into a UTF-8 string
 */
static void
str_utf8_cprev_char (const char **text)
{
    const char *cursor = *text;

    cursor = g_utf8_prev_char (cursor);
    *text = cursor;
}
/* --------------------------------------------------------------------------------------------- */
/**
 * Move *text forward by one character, tolerating invalid input:
 * a byte that does not start a valid character is stepped over on its own.
 *
 * @param text in/out cursor into a possibly invalid UTF-8 string
 */
static void
str_utf8_cnext_char_safe (const char **text)
{
    const char *cursor = *text;

    if (str_utf8_is_valid_char (cursor, -1) != 1)
    {
        /* not decodable: advance by a single byte */
        *text = cursor + 1;
        return;
    }

    *text = g_utf8_next_char (cursor);
}
/* --------------------------------------------------------------------------------------------- */
/**
 * Move *text backward by one character, tolerating invalid input.
 *
 * The candidate found by g_utf8_prev_char() is accepted only if stepping
 * forward from it (with the safe variant) lands exactly on the original
 * position; otherwise the cursor moves back by a single byte.
 *
 * @param text in/out cursor into a possibly invalid UTF-8 string
 */
static void
str_utf8_cprev_char_safe (const char **text)
{
    const char *const origin = *text;
    const char *const candidate = g_utf8_prev_char (origin);
    const char *probe = candidate;

    str_utf8_cnext_char_safe (&probe);

    *text = (probe == origin) ? candidate : origin - 1;
}
/* --------------------------------------------------------------------------------------------- */
/**
 * Make a string valid UTF-8 in place: every byte that does not start a
 * decodable character is overwritten with '?'.
 *
 * @param text NUL-terminated buffer to repair (modified in place)
 */
static void
str_utf8_fix_string (char *text)
{
    char *cursor = text;

    while (*cursor != '\0')
    {
        const gunichar uni = g_utf8_get_char_validated (cursor, -1);

        if (uni == (gunichar) (-1) || uni == (gunichar) (-2))
        {
            /* undecodable byte: replace it and continue with the next byte */
            *cursor = '?';
            cursor++;
            continue;
        }

        cursor = g_utf8_next_char (cursor);
    }
}
/* --------------------------------------------------------------------------------------------- */
/**
 * Test the character at @text with g_unichar_isspace().
 *
 * @param text pointer to the first byte of the character
 * @return result of g_unichar_isspace() for the decoded value
 */
static gboolean
str_utf8_isspace (const char *text)
{
    return g_unichar_isspace (g_utf8_get_char_validated (text, -1));
}
/* --------------------------------------------------------------------------------------------- */
/**
 * Test the character at @text with g_unichar_ispunct().
 *
 * @param text pointer to the first byte of the character
 * @return result of g_unichar_ispunct() for the decoded value
 */
static gboolean
str_utf8_ispunct (const char *text)
{
    return g_unichar_ispunct (g_utf8_get_char_validated (text, -1));
}
/* --------------------------------------------------------------------------------------------- */
/**
 * Test the character at @text with g_unichar_isalnum().
 *
 * @param text pointer to the first byte of the character
 * @return result of g_unichar_isalnum() for the decoded value
 */
static gboolean
str_utf8_isalnum (const char *text)
{
    return g_unichar_isalnum (g_utf8_get_char_validated (text, -1));
}
/* --------------------------------------------------------------------------------------------- */
/**
 * Test the character at @text with g_unichar_isdigit().
 *
 * @param text pointer to the first byte of the character
 * @return result of g_unichar_isdigit() for the decoded value
 */
static gboolean
str_utf8_isdigit (const char *text)
{
    return g_unichar_isdigit (g_utf8_get_char_validated (text, -1));
}
/* --------------------------------------------------------------------------------------------- */
/**
 * Test the character at @ch with g_unichar_isprint().
 *
 * @param ch pointer to the first byte of the character
 * @return result of g_unichar_isprint() for the decoded value
 */
static gboolean
str_utf8_isprint (const char *ch)
{
    return g_unichar_isprint (g_utf8_get_char_validated (ch, -1));
}
/* --------------------------------------------------------------------------------------------- */
/**
 * Test whether the character at @ch is a combining mark.
 *
 * @param ch pointer to the first byte of the character
 * @return result of str_unichar_iscombiningmark() for the decoded value
 */
static gboolean
str_utf8_iscombiningmark (const char *ch)
{
    return str_unichar_iscombiningmark (g_utf8_get_char_validated (ch, -1));
}
/* --------------------------------------------------------------------------------------------- */
/**
 * Advance *text past one character together with the combining marks that follow it.
 *
 * @param text in/out cursor into the string
 * @return number of single-character steps taken (0 if already at the end)
 */
static int
str_utf8_cnext_noncomb_char (const char **text)
{
    int steps = 0;

    if ((*text)[0] == '\0')
        return steps;

    do
    {
        str_utf8_cnext_char_safe (text);
        steps++;
    }
    /* keep going only while we are sitting on a combining mark and text remains */
    while (str_utf8_iscombiningmark (*text) && (*text)[0] != '\0');

    return steps;
}
/* --------------------------------------------------------------------------------------------- */
/**
 * Move *text backward until it rests on a character that is not a combining
 * mark, never going before @begin.
 *
 * @param text  in/out cursor into the string
 * @param begin start of the string; the cursor stops here at the latest
 * @return number of single-character steps taken (0 if already at @begin)
 */
static int
str_utf8_cprev_noncomb_char (const char **text, const char *begin)
{
    int steps = 0;

    if (*text == begin)
        return steps;

    do
    {
        str_utf8_cprev_char_safe (text);
        steps++;
    }
    /* continue only while on a combining mark and not yet at the beginning */
    while (str_utf8_iscombiningmark (*text) && *text != begin);

    return steps;
}
/* --------------------------------------------------------------------------------------------- */
/**
 * Write the upper-case form of the character at @text into an output buffer.
 *
 * @param text   pointer to the source character
 * @param out    in/out write position; advanced by the number of bytes written
 * @param remain in/out free space at *out; reduced by the number of bytes written
 * @return FALSE if the source is not decodable or the encoded result needs
 *         at least *remain bytes; TRUE after a successful write
 */
static gboolean
str_utf8_toupper (const char *text, char **out, size_t * remain)
{
    const gunichar source = g_utf8_get_char_validated (text, -1);
    gunichar upper;
    size_t bytes;

    if (source == (gunichar) (-2) || source == (gunichar) (-1))
        return FALSE;

    upper = g_unichar_toupper (source);

    /* first call with NULL only measures the encoded length */
    bytes = g_unichar_to_utf8 (upper, NULL);
    if (*remain <= bytes)
        return FALSE;

    bytes = g_unichar_to_utf8 (upper, *out);
    *out += bytes;
    *remain -= bytes;

    return TRUE;
}
/* --------------------------------------------------------------------------------------------- */
/**
 * Write the lower-case form of the character at @text into an output buffer.
 *
 * @param text   pointer to the source character
 * @param out    in/out write position; advanced by the number of bytes written
 * @param remain in/out free space at *out; reduced by the number of bytes written
 * @return FALSE if the source is not decodable or the encoded result needs
 *         at least *remain bytes; TRUE after a successful write
 */
static gboolean
str_utf8_tolower (const char *text, char **out, size_t * remain)
{
    const gunichar source = g_utf8_get_char_validated (text, -1);
    gunichar lower;
    size_t bytes;

    if (source == (gunichar) (-2) || source == (gunichar) (-1))
        return FALSE;

    lower = g_unichar_tolower (source);

    /* first call with NULL only measures the encoded length */
    bytes = g_unichar_to_utf8 (lower, NULL);
    if (*remain <= bytes)
        return FALSE;

    bytes = g_unichar_to_utf8 (lower, *out);
    *out += bytes;
    *remain -= bytes;

    return TRUE;
}
/* --------------------------------------------------------------------------------------------- */
/**
 * Count the characters of a string that may contain invalid UTF-8.
 * Each valid character counts as one, and so does every invalid byte.
 *
 * @param text NUL-terminated string
 * @return number of characters
 */
static int
str_utf8_length (const char *text)
{
    int count = 0;
    const char *segment;
    const char *bad;

    /* each pass handles one valid run followed by the invalid byte that ended it */
    for (segment = text; !g_utf8_validate (segment, -1, &bad) && segment[0] != '\0';
         segment = bad + 1)
    {
        if (segment != bad)
            count += g_utf8_strlen (segment, bad - segment);
        count++;
    }

    /* the whole string was valid: let GLib count it in one go */
    if (segment == text)
        return g_utf8_strlen (text, -1);

    /* valid tail after the last invalid byte */
    if (segment[0] != '\0' && segment != bad)
        count += g_utf8_strlen (segment, bad - segment);

    return count;
}
/* --------------------------------------------------------------------------------------------- */
/**
 * Count characters like str_utf8_length(), but look at no more than @size bytes.
 *
 * @param text NUL-terminated string, possibly containing invalid UTF-8
 * @param size byte budget; it is handed to g_utf8_strlen() unchanged when the
 *             whole string is valid
 * @return number of characters counted
 */
static int
str_utf8_length2 (const char *text, int size)
{
    int count = 0;
    const char *segment;
    const char *bad;

    /* each pass handles one valid run followed by the invalid byte that ended it */
    for (segment = text;
         !g_utf8_validate (segment, -1, &bad) && segment[0] != '\0' && size > 0;
         segment = bad + 1)
    {
        if (segment != bad)
        {
            count += g_utf8_strlen (segment, MIN (bad - segment, size));
            size -= bad - segment;
        }

        /* the invalid byte counts only if it is still inside the budget */
        if (size > 0)
            count++;
        size--;
    }

    /* nothing invalid was encountered: count the string directly */
    if (segment == text)
        return g_utf8_strlen (text, size);

    /* valid tail after the last invalid byte */
    if (segment[0] != '\0' && segment != bad && size > 0)
        count += g_utf8_strlen (segment, MIN (bad - segment, size));

    return count;
}
/* --------------------------------------------------------------------------------------------- */
/**
 * Count characters, treating a base character and its trailing combining
 * marks as a single unit.
 *
 * @param text NUL-terminated string
 * @return number of units
 */
static int
str_utf8_length_noncomb (const char *text)
{
    const char *cursor = text;
    int units;

    for (units = 0; cursor[0] != '\0'; units++)
        str_utf8_cnext_noncomb_char (&cursor);

    return units;
}
/* --------------------------------------------------------------------------------------------- */
/* Disabled helper (excluded from the build by "#if 0"): steps *string over one
 * UTF-8 character, reduces *left by the number of bytes skipped and appends
 * '?' to buffer. */
#if 0
static void
str_utf8_questmark_sustb (char **string, size_t * left, GString * buffer)
{
char *next;
next = g_utf8_next_char (*string);
(*left) -= next - (*string);
(*string) = next;
g_string_append_c (buffer, '?');
}
#endif
/* --------------------------------------------------------------------------------------------- */
/**
 * Produce a newly allocated message for a conversion error.
 *
 * @param mcerror error object, may be NULL
 * @param def_msg fallback text used when @mcerror is NULL, may be NULL
 * @return copy of mcerror->message if @mcerror is given, otherwise a copy of
 *         @def_msg, or of "" when @def_msg is NULL too; release with g_free()
 */
static gchar *
str_utf8_conv_gerror_message (GError * mcerror, const char *def_msg)
{
    const char *message;

    if (mcerror != NULL)
        message = mcerror->message;
    else if (def_msg != NULL)
        message = def_msg;
    else
        message = "";

    return g_strdup (message);
}
/* --------------------------------------------------------------------------------------------- */
/**
 * Append @string to @buffer, converting it with @coder unless the coder is
 * the str_cnv_not_convert marker.
 *
 * @param coder  conversion descriptor, or str_cnv_not_convert for a plain copy
 * @param string source bytes
 * @param size   length passed on to g_string_append_len() / str_nconvert()
 * @param buffer destination string
 * @return ESTR_SUCCESS for the plain copy, otherwise the result of str_nconvert()
 */
static estr_t
str_utf8_vfs_convert_to (GIConv coder, const char *string, int size, GString * buffer)
{
    if (coder != str_cnv_not_convert)
        return str_nconvert (coder, string, size, buffer);

    g_string_append_len (buffer, string, size);
    return ESTR_SUCCESS;
}
/* --------------------------------------------------------------------------------------------- */
/* utility function, that makes string valid in utf8 and all characters printable
* return width of string too */
static const struct term_form *
str_utf8_make_make_term_form (const char *text, size_t length)
{
static struct term_form result;
gunichar uni;
size_t left;
char *actual;
result.text[0] = '\0';
result.width = 0;
result.compose = FALSE;
actual = result.text;
/* check if text start with combining character,
* add space at begin in this case */
if (length != 0 && text[0] != '\0')
{
uni = g_utf8_get_char_validated (text, -1);
if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2))
&& str_unichar_iscombiningmark (uni))
{
actual[0] = ' ';
actual++;
result.width++;
result.compose = TRUE;
}
}
while (length != 0 && text[0] != '\0')
{
uni = g_utf8_get_char_validated (text, -1);
if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
{
if (g_unichar_isprint (uni))
{
left = g_unichar_to_utf8 (uni, actual);
actual += left;
if (str_unichar_iscombiningmark (uni))
result.compose = TRUE;
else
{
result.width++;
if (g_unichar_iswide (uni))
result.width++;
}
}
else
{
actual[0] = '.';
actual++;
result.width++;
}
text = g_utf8_next_char (text);
}
else
{
text++;
/*actual[0] = '?'; */
memcpy (actual, replch, strlen (replch));
actual += strlen (replch);
result.width++;
}
if (length != (size_t) (-1))
length--;
}
actual[0] = '\0';
return &result;
}
/* --------------------------------------------------------------------------------------------- */
/**
 * Return a terminal-safe version of @text (see str_utf8_make_make_term_form()),
 * normalized with G_NORMALIZE_DEFAULT_COMPOSE when it contains combining marks.
 *
 * @param text NUL-terminated source string
 * @return pointer to a static buffer that is overwritten by the next call
 */
static const char *
str_utf8_term_form (const char *text)
{
    static char result[BUF_MEDIUM * MB_LEN_MAX];
    const struct term_form *pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));

    if (!pre_form->compose)
    {
        /* no combining marks: the sanitized text is used as it is */
        g_strlcpy (result, pre_form->text, sizeof (result));
    }
    else
    {
        char *composed = g_utf8_normalize (pre_form->text, -1, G_NORMALIZE_DEFAULT_COMPOSE);

        g_strlcpy (result, composed, sizeof (result));
        g_free (composed);
    }

    return result;
}
/* --------------------------------------------------------------------------------------------- */
/**
 * Utility function that copies every remaining character from tool->checked
 * to tool->actual.
 *
 * tool->compose is reset first and becomes true as soon as a combining mark
 * has been seen.
 *
 * @param tool copy state; checked, actual and remain are advanced
 * @return TRUE when the end of the source was reached, FALSE when the output
 *         buffer ran out of space (the character that did not fit is not consumed)
 */
static gboolean
utf8_tool_copy_chars_to_end (struct utf8_tool *tool)
{
    tool->compose = FALSE;

    for (; tool->checked[0] != '\0'; tool->checked = g_utf8_next_char (tool->checked))
    {
        const gunichar uni = g_utf8_get_char (tool->checked);
        size_t bytes;

        if (!tool->compose)
            tool->compose = str_unichar_iscombiningmark (uni);

        /* measure first, write only if there is room */
        bytes = g_unichar_to_utf8 (uni, NULL);
        if (tool->remain <= bytes)
            return FALSE;

        bytes = g_unichar_to_utf8 (uni, tool->actual);
        tool->actual += bytes;
        tool->remain -= bytes;
    }

    return TRUE;
}
/* --------------------------------------------------------------------------------------------- */
/**
 * Utility function that copies characters from tool->checked to tool->actual
 * for as long as tool->ident does not grow beyond @to_ident.
 *
 * Combining marks are copied without adding width and set tool->compose;
 * other characters add one column, wide ones two.
 *
 * @param tool     copy state; checked, actual, remain and ident are advanced
 * @param to_ident column limit
 * @return FALSE when the output buffer ran out of space, TRUE otherwise
 *         (limit reached or source exhausted)
 */
static gboolean
utf8_tool_copy_chars_to (struct utf8_tool *tool, int to_ident)
{
    tool->compose = FALSE;

    for (; tool->checked[0] != '\0'; tool->checked = g_utf8_next_char (tool->checked))
    {
        const gunichar uni = g_utf8_get_char (tool->checked);
        size_t bytes;
        int columns;

        if (str_unichar_iscombiningmark (uni))
        {
            tool->compose = TRUE;
            columns = 0;
        }
        else
        {
            columns = g_unichar_iswide (uni) ? 2 : 1;
            /* this character would cross the limit: stop in front of it */
            if (tool->ident + columns > to_ident)
                return TRUE;
        }

        bytes = g_unichar_to_utf8 (uni, NULL);
        if (tool->remain <= bytes)
            return FALSE;

        bytes = g_unichar_to_utf8 (uni, tool->actual);
        tool->actual += bytes;
        tool->remain -= bytes;
        tool->ident += columns;
    }

    return TRUE;
}
/* --------------------------------------------------------------------------------------------- */
/**
 * Utility function that appends @count spaces at tool->actual.
 *
 * @param tool  output state; actual and remain are advanced
 * @param count number of spaces; values <= 0 are accepted and do nothing
 * @return 0 if the spaces do not fit into the remaining space, 1 otherwise
 */
static int
utf8_tool_insert_space (struct utf8_tool *tool, int count)
{
    if (count > 0)
    {
        if (tool->remain <= (gsize) count)
            return 0;

        memset (tool->actual, ' ', count);
        tool->actual += count;
        tool->remain -= count;
    }

    return 1;
}
/* --------------------------------------------------------------------------------------------- */
/**
 * Utility function that appends one single-byte character at tool->actual.
 *
 * @param tool output state; actual and remain are advanced
 * @param ch   byte to append
 * @return 0 if no more than one byte of space is left, 1 after a successful write
 */
static int
utf8_tool_insert_char (struct utf8_tool *tool, char ch)
{
    if (tool->remain <= 1)
        return 0;

    *tool->actual = ch;
    tool->actual++;
    tool->remain--;

    return 1;
}
/* --------------------------------------------------------------------------------------------- */
/**
 * Utility function that skips characters in tool->checked until tool->ident
 * is greater than or equal to @to_ident, and then also skips any combining
 * marks found at the resulting position.
 *
 * @param tool     state; checked and ident are advanced
 * @param to_ident column to reach
 * @return always TRUE
 */
static gboolean
utf8_tool_skip_chars_to (struct utf8_tool *tool, int to_ident)
{
    gunichar uni;

    for (; tool->ident < to_ident && tool->checked[0] != '\0';
         tool->checked = g_utf8_next_char (tool->checked))
    {
        uni = g_utf8_get_char (tool->checked);

        /* combining marks take no column of their own */
        if (str_unichar_iscombiningmark (uni))
            continue;

        tool->ident += g_unichar_iswide (uni) ? 2 : 1;
    }

    /* do not start the visible part with marks that belong to a skipped character */
    for (uni = g_utf8_get_char (tool->checked); str_unichar_iscombiningmark (uni);
         uni = g_utf8_get_char (tool->checked))
        tool->checked = g_utf8_next_char (tool->checked);

    return TRUE;
}
/* --------------------------------------------------------------------------------------------- */
/**
 * Replace the contents of @buffer with its G_NORMALIZE_DEFAULT_COMPOSE form.
 *
 * @param buffer NUL-terminated string, rewritten in place
 * @param size   capacity of @buffer in bytes (limit for g_strlcpy())
 */
static void
utf8_tool_compose (char *buffer, size_t size)
{
    char *normalized = g_utf8_normalize (buffer, -1, G_NORMALIZE_DEFAULT_COMPOSE);

    g_strlcpy (buffer, normalized, size);
    g_free (normalized);
}
/* --------------------------------------------------------------------------------------------- */
/**
 * Lay out @text in exactly @width terminal columns according to @just_mode.
 *
 * Three cases are handled:
 *  - the text fits: it is padded with spaces on the left and/or right;
 *  - the text is too wide and IS_FIT (just_mode) holds: the middle part is
 *    replaced by '~';
 *  - the text is too wide otherwise: it is cut, and for J_CENTER / J_RIGHT the
 *    visible window is shifted by a negative start offset.
 *
 * @param text      NUL-terminated source string
 * @param width     number of columns available
 * @param just_mode alignment mode, examined through HIDE_FIT() and IS_FIT()
 * @return pointer to a static buffer that is overwritten by the next call
 */
static const char *
str_utf8_fit_to_term (const char *text, int width, align_crt_t just_mode)
{
    static char result[BUF_MEDIUM * MB_LEN_MAX];
    const struct term_form *pre_form;
    struct utf8_tool tool;

    pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
    tool.checked = pre_form->text;
    tool.actual = result;
    tool.remain = sizeof (result);
    tool.compose = FALSE;

    if (pre_form->width <= (gsize) width)
    {
        /* text fits: tool.ident is the amount of left padding */
        switch (HIDE_FIT (just_mode))
        {
        case J_CENTER_LEFT:
        case J_CENTER:
            tool.ident = (width - pre_form->width) / 2;
            break;
        case J_RIGHT:
            tool.ident = width - pre_form->width;
            break;
        default:
            tool.ident = 0;
            break;
        }

        utf8_tool_insert_space (&tool, tool.ident);
        utf8_tool_copy_chars_to_end (&tool);
        utf8_tool_insert_space (&tool, width - pre_form->width - tool.ident);
    }
    else if (IS_FIT (just_mode))
    {
        /* too wide: keep the head and the tail, put '~' in between */
        tool.ident = 0;
        utf8_tool_copy_chars_to (&tool, width / 2);
        utf8_tool_insert_char (&tool, '~');

        tool.ident = 0;
        utf8_tool_skip_chars_to (&tool, pre_form->width - width + 1);
        utf8_tool_copy_chars_to_end (&tool);
        utf8_tool_insert_space (&tool, width - (pre_form->width - tool.ident + 1));
    }
    else
    {
        /* too wide, plain cut: a negative tool.ident is the number of columns
         * to skip before the visible window starts */
        switch (HIDE_FIT (just_mode))
        {
        case J_CENTER:
            /* Here pre_form->width > width. The difference must not be halved
             * in unsigned arithmetic: where size_t is 32 bits wide the quotient
             * would become a large positive int. Compute the (rounded-up) half
             * of the overflow explicitly and negate it. */
            tool.ident = -(int) ((pre_form->width - (gsize) width + 1) / 2);
            break;
        case J_RIGHT:
            tool.ident = width - pre_form->width;
            break;
        default:
            tool.ident = 0;
            break;
        }

        utf8_tool_skip_chars_to (&tool, 0);
        utf8_tool_insert_space (&tool, tool.ident);
        utf8_tool_copy_chars_to (&tool, width);
        utf8_tool_insert_space (&tool, width - tool.ident);
    }

    tool.actual[0] = '\0';
    if (tool.compose)
        utf8_tool_compose (result, sizeof (result));

    return result;
}
/* --------------------------------------------------------------------------------------------- */
/**
 * Shorten @text to at most @width columns by cutting its beginning.
 *
 * Text that fits is copied unchanged. Otherwise the result starts with dots
 * (up to three) followed by the tail of the text; for @width <= 3 it consists
 * of @width dots only.
 *
 * @param text  NUL-terminated source string
 * @param width number of columns available; values below 1 give an empty string
 * @return pointer to a static buffer that is overwritten by the next call
 */
static const char *
str_utf8_term_trim (const char *text, int width)
{
    static char result[BUF_MEDIUM * MB_LEN_MAX];
    const struct term_form *pre_form;
    struct utf8_tool tool;

    if (width < 1)
    {
        result[0] = '\0';
        return result;
    }

    pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));

    tool.actual = result;
    tool.remain = sizeof (result);
    tool.checked = pre_form->text;
    tool.compose = FALSE;

    if ((gsize) width >= pre_form->width)
        utf8_tool_copy_chars_to_end (&tool);
    else
    {
        const int dots = (width <= 3) ? width : 3;

        memset (tool.actual, '.', dots);
        tool.actual += dots;
        tool.remain -= dots;

        if (width > 3)
        {
            /* skip as many leading columns as needed to make room for the dots */
            tool.ident = 0;
            utf8_tool_skip_chars_to (&tool, pre_form->width - width + 3);
            utf8_tool_copy_chars_to_end (&tool);
        }
    }

    tool.actual[0] = '\0';
    if (tool.compose)
        utf8_tool_compose (result, sizeof (result));

    return result;
}
/* --------------------------------------------------------------------------------------------- */
/**
 * Width in terminal columns of the first @length characters of @text.
 *
 * @param text   NUL-terminated source string
 * @param length maximal number of characters to consider
 * @return width computed by str_utf8_make_make_term_form()
 */
static int
str_utf8_term_width2 (const char *text, size_t length)
{
    return str_utf8_make_make_term_form (text, length)->width;
}
/* --------------------------------------------------------------------------------------------- */
static int
str_utf8_term_width1 (const char *text)
{
return str_utf8_term_width2 (text, (size_t) (-1));
}
/* --------------------------------------------------------------------------------------------- */
/**
 * Number of terminal columns taken by the character at @text.
 *
 * @param text pointer to the first byte of the character
 * @return 0 for a combining mark, 2 for a wide character, 1 otherwise
 */
static int
str_utf8_term_char_width (const char *text)
{
    const gunichar uni = g_utf8_get_char_validated (text, -1);

    if (str_unichar_iscombiningmark (uni))
        return 0;

    return g_unichar_iswide (uni) ? 2 : 1;
}
/* --------------------------------------------------------------------------------------------- */
/**
 * Extract the part of @text that occupies the columns [start, start + width)
 * and pad it with spaces to @width columns.
 *
 * @param text  NUL-terminated source string
 * @param start first column to show
 * @param width number of columns of the result
 * @return pointer to a static buffer that is overwritten by the next call
 */
static const char *
str_utf8_term_substring (const char *text, int start, int width)
{
    static char result[BUF_MEDIUM * MB_LEN_MAX];
    const struct term_form *pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
    struct utf8_tool tool;

    tool.actual = result;
    tool.remain = sizeof (result);
    tool.checked = pre_form->text;
    tool.compose = FALSE;

    /* a negative counter makes the skip below consume 'start' columns */
    tool.ident = -start;
    utf8_tool_skip_chars_to (&tool, 0);
    if (tool.ident < 0)
        tool.ident = 0;

    utf8_tool_insert_space (&tool, tool.ident);
    utf8_tool_copy_chars_to (&tool, width);
    utf8_tool_insert_space (&tool, width - tool.ident);

    *tool.actual = '\0';
    if (tool.compose)
        utf8_tool_compose (result, sizeof (result));

    return result;
}
/* --------------------------------------------------------------------------------------------- */
/**
 * Shorten @text to at most @width columns by replacing its middle part with '~'.
 * Text that already fits is returned in its sanitized form.
 *
 * @param text  NUL-terminated source string
 * @param width number of columns available
 * @return pointer to a static buffer that is overwritten by the next call
 */
static const char *
str_utf8_trunc (const char *text, int width)
{
    static char result[MC_MAXPATHLEN * MB_LEN_MAX * 2];
    const struct term_form *pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
    struct utf8_tool tool;

    tool.actual = result;
    tool.remain = sizeof (result);
    tool.checked = pre_form->text;
    tool.compose = FALSE;

    if (pre_form->width > (gsize) width)
    {
        /* head: the first width / 2 columns */
        tool.ident = 0;
        utf8_tool_copy_chars_to (&tool, width / 2);
        utf8_tool_insert_char (&tool, '~');

        /* tail: skip what has to be dropped, then take the rest */
        tool.ident = 0;
        utf8_tool_skip_chars_to (&tool, pre_form->width - width + 1);
        utf8_tool_copy_chars_to_end (&tool);
    }
    else
        utf8_tool_copy_chars_to_end (&tool);

    *tool.actual = '\0';
    if (tool.compose)
        utf8_tool_compose (result, sizeof (result));

    return result;
}
/* --------------------------------------------------------------------------------------------- */
/**
 * Convert a character offset into a byte offset.
 *
 * For invalid input the computation is done on a repaired copy
 * (see str_utf8_fix_string()).
 *
 * @param text   NUL-terminated string
 * @param length offset in characters
 * @return offset in bytes
 */
static int
str_utf8_offset_to_pos (const char *text, size_t length)
{
    GString *fixed;
    int pos;

    if (str_utf8_is_valid_string (text))
        return g_utf8_offset_to_pointer (text, length) - text;

    fixed = g_string_new (text);
    str_utf8_fix_string (fixed->str);
    pos = g_utf8_offset_to_pointer (fixed->str, length) - fixed->str;
    g_string_free (fixed, TRUE);

    return pos;
}
/* --------------------------------------------------------------------------------------------- */
/**
 * Find the index of the character that covers terminal column @pos.
 *
 * Undecodable bytes and non-printable characters count as one column,
 * combining marks as none, wide characters as two.
 *
 * @param text NUL-terminated string
 * @param pos  column, starting at 0
 * @return character index; the total number of characters if the text is
 *         narrower than @pos columns
 */
static int
str_utf8_column_to_pos (const char *text, size_t pos)
{
    int index;
    int columns = 0;

    for (index = 0; text[0] != '\0'; index++)
    {
        const gunichar uni = g_utf8_get_char_validated (text, MB_LEN_MAX);

        if (uni == (gunichar) (-1) || uni == (gunichar) (-2))
        {
            /* undecodable byte */
            text++;
            columns++;
        }
        else
        {
            if (!g_unichar_isprint (uni))
                columns++;
            else if (!str_unichar_iscombiningmark (uni))
                columns += g_unichar_iswide (uni) ? 2 : 1;

            text = g_utf8_next_char (text);
        }

        if ((gsize) columns > pos)
            return index;
    }

    return index;
}
/* --------------------------------------------------------------------------------------------- */
/**
 * Prepare a search pattern: normalize it with G_NORMALIZE_ALL, after case
 * folding it when the search is case insensitive.
 *
 * @param needle   pattern, may be NULL
 * @param case_sen TRUE for a case sensitive search
 * @return newly allocated pattern, or NULL if @needle is NULL
 */
static char *
str_utf8_create_search_needle (const char *needle, gboolean case_sen)
{
    char *folded;
    char *prepared;

    if (needle == NULL)
        return NULL;

    if (!case_sen)
    {
        folded = g_utf8_casefold (needle, -1);
        prepared = g_utf8_normalize (folded, -1, G_NORMALIZE_ALL);
        g_free (folded);
        return prepared;
    }

    return g_utf8_normalize (needle, -1, G_NORMALIZE_ALL);
}
/* --------------------------------------------------------------------------------------------- */
/**
 * Free a pattern created by str_utf8_create_search_needle().
 *
 * @param needle   pattern to free
 * @param case_sen unused
 */
static void
str_utf8_release_search_needle (char *needle, gboolean case_sen)
{
    (void) case_sen;            /* both kinds of needle are released the same way */

    g_free (needle);
}
/* --------------------------------------------------------------------------------------------- */
/**
 * Find the first occurrence of a prepared pattern in @text.
 *
 * The search runs on a normalized (and, if requested, case-folded) copy of
 * @text. A hit is accepted only if it does not begin on a combining mark
 * (unless it is at the very start) and is not directly followed by one.
 * The position is then mapped back to @text by walking both strings in
 * parallel, one base character (with its marks) at a time.
 *
 * @param text     NUL-terminated string to search in
 * @param search   pattern prepared by str_utf8_create_search_needle()
 * @param case_sen TRUE for a case sensitive search
 * @return pointer into @text, or NULL if nothing was found
 */
static const char *
str_utf8_search_first (const char *text, const char *search, gboolean case_sen)
{
    char *fold_text;
    char *deco_text;
    const char *hit;
    const char *result = NULL;

    fold_text = case_sen ? (char *) text : g_utf8_casefold (text, -1);
    deco_text = g_utf8_normalize (fold_text, -1, G_NORMALIZE_ALL);

    for (hit = g_strstr_len (deco_text, -1, search); hit != NULL;
         hit = g_strstr_len (hit, -1, search))
    {
        const gboolean clean_start = (hit == deco_text) || !str_utf8_iscombiningmark (hit);

        if (clean_start && !str_utf8_iscombiningmark (hit + strlen (search)))
        {
            const char *walker = deco_text;

            /* translate the position in the normalized copy back to the original */
            result = text;
            while (walker < hit)
            {
                str_utf8_cnext_noncomb_char (&walker);
                str_utf8_cnext_noncomb_char (&result);
            }
            break;
        }

        /* hit splits a character from its marks: retry one character further */
        str_utf8_cnext_char (&hit);
    }

    g_free (deco_text);
    if (!case_sen)
        g_free (fold_text);

    return result;
}
/* --------------------------------------------------------------------------------------------- */
/**
 * Find the last occurrence of a prepared pattern in @text.
 *
 * Works like str_utf8_search_first(), but searches from the end; a rejected
 * hit is removed from further consideration by cutting the working copy at
 * its position.
 *
 * @param text     NUL-terminated string to search in
 * @param search   pattern prepared by str_utf8_create_search_needle()
 * @param case_sen TRUE for a case sensitive search
 * @return pointer into @text, or NULL if nothing was found
 */
static const char *
str_utf8_search_last (const char *text, const char *search, gboolean case_sen)
{
    char *fold_text;
    char *deco_text;
    char *hit;
    const char *result = NULL;

    fold_text = case_sen ? (char *) text : g_utf8_casefold (text, -1);
    deco_text = g_utf8_normalize (fold_text, -1, G_NORMALIZE_ALL);

    while ((hit = g_strrstr_len (deco_text, -1, search)) != NULL)
    {
        const gboolean clean_start = (hit == deco_text) || !str_utf8_iscombiningmark (hit);

        if (clean_start && !str_utf8_iscombiningmark (hit + strlen (search)))
        {
            const char *walker = deco_text;

            /* translate the position in the normalized copy back to the original */
            result = text;
            while (walker < hit)
            {
                str_utf8_cnext_noncomb_char (&walker);
                str_utf8_cnext_noncomb_char (&result);
            }
            break;
        }

        /* hit splits a character from its marks: truncate and search again */
        hit[0] = '\0';
    }

    g_free (deco_text);
    if (!case_sen)
        g_free (fold_text);

    return result;
}
/* --------------------------------------------------------------------------------------------- */
/**
 * Normalize a string with G_NORMALIZE_ALL, tolerating invalid UTF-8:
 * valid runs are normalized, invalid bytes are copied through unchanged.
 *
 * @param text NUL-terminated string
 * @return newly allocated string; release with g_free()
 */
static char *
str_utf8_normalize (const char *text)
{
    GString *fixed;
    const char *segment;
    const char *stop;

    /* g_utf8_normalize() is heavyweight: it converts UTF-8 to UCS-4, normalizes
     * and converts back. File names are ASCII-only in most cases, and for ASCII
     * normalization is a no-op, so look for the first byte with the high bit
     * set and return a plain copy if there is none. */
    stop = text;
    while (*stop != '\0' && (*stop & 0x80) == 0)
        stop++;
    if (*stop == '\0')
        return g_strndup (text, stop - text);

    fixed = g_string_sized_new (4);

    /* each pass handles one valid run followed by the invalid byte that ended it */
    for (segment = text; !g_utf8_validate (segment, -1, &stop) && segment[0] != '\0';
         segment = stop + 1)
    {
        if (segment != stop)
        {
            char *piece = g_utf8_normalize (segment, stop - segment, G_NORMALIZE_ALL);

            g_string_append (fixed, piece);
            g_free (piece);
        }
        g_string_append_c (fixed, stop[0]);
    }

    /* nothing invalid was found: normalize the whole string at once */
    if (segment == text)
    {
        g_string_free (fixed, TRUE);
        return g_utf8_normalize (text, -1, G_NORMALIZE_ALL);
    }

    /* valid tail after the last invalid byte */
    if (segment[0] != '\0' && segment != stop)
    {
        char *piece = g_utf8_normalize (segment, stop - segment, G_NORMALIZE_ALL);

        g_string_append (fixed, piece);
        g_free (piece);
    }

    return g_string_free (fixed, FALSE);
}
/* --------------------------------------------------------------------------------------------- */
/**
 * Case-fold and normalize (G_NORMALIZE_ALL) a string, tolerating invalid
 * UTF-8: valid runs are converted, invalid bytes are copied through unchanged.
 *
 * @param text NUL-terminated string
 * @return newly allocated string; release with g_free()
 */
static char *
str_utf8_casefold_normalize (const char *text)
{
    GString *fixed;
    const char *segment;
    const char *stop;
    char *folded;
    char *piece;

    fixed = g_string_sized_new (4);

    /* each pass handles one valid run followed by the invalid byte that ended it */
    for (segment = text; !g_utf8_validate (segment, -1, &stop) && segment[0] != '\0';
         segment = stop + 1)
    {
        if (segment != stop)
        {
            folded = g_utf8_casefold (segment, stop - segment);
            piece = g_utf8_normalize (folded, -1, G_NORMALIZE_ALL);
            g_string_append (fixed, piece);
            g_free (piece);
            g_free (folded);
        }
        g_string_append_c (fixed, stop[0]);
    }

    /* nothing invalid was found: convert the whole string at once */
    if (segment == text)
    {
        char *whole;

        folded = g_utf8_casefold (text, -1);
        whole = g_utf8_normalize (folded, -1, G_NORMALIZE_ALL);
        g_free (folded);
        g_string_free (fixed, TRUE);
        return whole;
    }

    /* valid tail after the last invalid byte */
    if (segment[0] != '\0' && segment != stop)
    {
        folded = g_utf8_casefold (segment, stop - segment);
        piece = g_utf8_normalize (folded, -1, G_NORMALIZE_ALL);
        g_string_append (fixed, piece);
        g_free (piece);
        g_free (folded);
    }

    return g_string_free (fixed, FALSE);
}
/* --------------------------------------------------------------------------------------------- */
/**
 * Compare two strings after normalization.
 *
 * @param t1 first string
 * @param t2 second string
 * @return strcmp() result for the normalized forms
 */
static int
str_utf8_compare (const char *t1, const char *t2)
{
    char *const left = str_utf8_normalize (t1);
    char *const right = str_utf8_normalize (t2);
    const int order = strcmp (left, right);

    g_free (left);
    g_free (right);

    return order;
}
/* --------------------------------------------------------------------------------------------- */
/**
 * Compare two strings after normalization, looking only at as many bytes as
 * the shorter normalized form has.
 *
 * @param t1 first string
 * @param t2 second string
 * @return strncmp() result over the common length
 */
static int
str_utf8_ncompare (const char *t1, const char *t2)
{
    char *const left = str_utf8_normalize (t1);
    char *const right = str_utf8_normalize (t2);
    const size_t left_len = strlen (left);
    const size_t right_len = strlen (right);
    const size_t common = (left_len < right_len) ? left_len : right_len;
    const int order = strncmp (left, right, common);

    g_free (left);
    g_free (right);

    return order;
}
/* --------------------------------------------------------------------------------------------- */
/**
 * Compare two strings ignoring case (after case folding and normalization).
 *
 * @param t1 first string
 * @param t2 second string
 * @return strcmp() result for the converted forms
 */
static int
str_utf8_casecmp (const char *t1, const char *t2)
{
    char *const left = str_utf8_casefold_normalize (t1);
    char *const right = str_utf8_casefold_normalize (t2);
    const int order = strcmp (left, right);

    g_free (left);
    g_free (right);

    return order;
}
/* --------------------------------------------------------------------------------------------- */
/**
 * Compare two strings ignoring case, looking only at as many bytes as the
 * shorter converted form has.
 *
 * @param t1 first string
 * @param t2 second string
 * @return strncmp() result over the common length
 */
static int
str_utf8_ncasecmp (const char *t1, const char *t2)
{
    char *const left = str_utf8_casefold_normalize (t1);
    char *const right = str_utf8_casefold_normalize (t2);
    const size_t left_len = strlen (left);
    const size_t right_len = strlen (right);
    const size_t common = (left_len < right_len) ? left_len : right_len;
    const int order = strncmp (left, right, common);

    g_free (left);
    g_free (right);

    return order;
}
/* --------------------------------------------------------------------------------------------- */
/**
 * Determine how much of @prefix matches the beginning of @text.
 * Both strings are normalized first and compared character by character.
 *
 * @param text   string to examine
 * @param prefix expected beginning
 * @return length in bytes of the matching part, measured in the normalized @prefix
 */
static int
str_utf8_prefix (const char *text, const char *prefix)
{
    char *const norm_text = str_utf8_normalize (text);
    char *const norm_prefix = str_utf8_normalize (prefix);
    const char *text_cur = norm_text;
    const char *prefix_cur = norm_prefix;
    const char *text_next = norm_text;
    const char *prefix_next = norm_prefix;
    int matched;

    while (text_cur[0] != '\0' && prefix_cur[0] != '\0')
    {
        str_utf8_cnext_char_safe (&text_next);
        str_utf8_cnext_char_safe (&prefix_next);

        /* characters differ if their byte lengths or their bytes differ */
        if (text_next - text_cur != prefix_next - prefix_cur
            || strncmp (text_cur, prefix_cur, text_next - text_cur) != 0)
            break;

        text_cur = text_next;
        prefix_cur = prefix_next;
    }

    matched = prefix_cur - norm_prefix;

    g_free (norm_text);
    g_free (norm_prefix);

    return matched;
}
/* --------------------------------------------------------------------------------------------- */
/**
 * Determine how much of @prefix matches the beginning of @text, ignoring case.
 * Both strings are case-folded and normalized first and then compared
 * character by character.
 *
 * @param text   string to examine
 * @param prefix expected beginning
 * @return length in bytes of the matching part, measured in the converted @prefix
 */
static int
str_utf8_caseprefix (const char *text, const char *prefix)
{
    char *const norm_text = str_utf8_casefold_normalize (text);
    char *const norm_prefix = str_utf8_casefold_normalize (prefix);
    const char *text_cur = norm_text;
    const char *prefix_cur = norm_prefix;
    const char *text_next = norm_text;
    const char *prefix_next = norm_prefix;
    int matched;

    while (text_cur[0] != '\0' && prefix_cur[0] != '\0')
    {
        str_utf8_cnext_char_safe (&text_next);
        str_utf8_cnext_char_safe (&prefix_next);

        /* characters differ if their byte lengths or their bytes differ */
        if (text_next - text_cur != prefix_next - prefix_cur
            || strncmp (text_cur, prefix_cur, text_next - text_cur) != 0)
            break;

        text_cur = text_next;
        prefix_cur = prefix_next;
    }

    matched = prefix_cur - norm_prefix;

    g_free (norm_text);
    g_free (norm_prefix);

    return matched;
}
/* --------------------------------------------------------------------------------------------- */
/**
 * Build a sort key for @text.
 *
 * In case sensitive mode the key is simply the normalized text and @keygen is
 * not used. Otherwise the text is case-folded and run through @keygen; a
 * leading '.' is kept literally in front of the key, and invalid bytes are
 * copied through unchanged between the keys of the valid runs.
 *
 * @param text     NUL-terminated string
 * @param case_sen TRUE for a case sensitive key
 * @param keygen   key generator applied to case-folded valid runs
 * @return newly allocated key; release with g_free()
 */
static char *
str_utf8_create_key_gen (const char *text, gboolean case_sen,
                         gchar * (*keygen) (const gchar * text, gssize size))
{
    gboolean dot;
    GString *fixed;
    const char *segment;
    const char *stop;
    char *folded;
    char *key;

    if (case_sen)
        return str_utf8_normalize (text);

    dot = text[0] == '.';
    fixed = g_string_sized_new (16);

    segment = text;
    if (dot)
    {
        /* keep the leading dot out of the generated key */
        segment = text + 1;
        g_string_append_c (fixed, '.');
    }

    /* each pass handles one valid run followed by the invalid byte that ended it */
    for (; !g_utf8_validate (segment, -1, &stop) && segment[0] != '\0'; segment = stop + 1)
    {
        if (segment != stop)
        {
            folded = g_utf8_casefold (segment, stop - segment);
            key = keygen (folded, -1);
            g_string_append (fixed, key);
            g_free (key);
            g_free (folded);
        }
        g_string_append_c (fixed, stop[0]);
    }

    if (segment == text)
    {
        /* no leading dot and nothing invalid: the key of the whole text is the result */
        folded = g_utf8_casefold (segment, -1);
        key = keygen (folded, -1);
        g_free (folded);
        g_string_free (fixed, TRUE);
        return key;
    }

    if (dot && (segment == text + 1))
    {
        /* leading dot and nothing invalid: append the key of everything after the dot */
        folded = g_utf8_casefold (segment, -1);
        key = keygen (folded, -1);
        g_string_append (fixed, key);
        g_free (key);
        g_free (folded);
    }
    else if (segment[0] != '\0' && segment != stop)
    {
        /* valid tail after the last invalid byte */
        folded = g_utf8_casefold (segment, stop - segment);
        key = keygen (folded, -1);
        g_string_append (fixed, key);
        g_free (key);
        g_free (folded);
    }

    return g_string_free (fixed, FALSE);
}
/* --------------------------------------------------------------------------------------------- */
/**
 * Build a sort key using g_utf8_collate_key() as the key generator.
 *
 * @param text     NUL-terminated string
 * @param case_sen TRUE for a case sensitive key
 * @return newly allocated key, see str_utf8_create_key_gen()
 */
static char *
str_utf8_create_key (const char *text, gboolean case_sen)
{
    char *key;

    key = str_utf8_create_key_gen (text, case_sen, g_utf8_collate_key);
    return key;
}
/* --------------------------------------------------------------------------------------------- */
#ifdef MC__USE_STR_UTF8_CREATE_KEY_FOR_FILENAME
/**
 * Build a sort key using g_utf8_collate_key_for_filename() as the key generator.
 *
 * @param text     NUL-terminated string
 * @param case_sen TRUE for a case sensitive key
 * @return newly allocated key, see str_utf8_create_key_gen()
 */
static char *
str_utf8_create_key_for_filename (const char *text, gboolean case_sen)
{
    char *key;

    key = str_utf8_create_key_gen (text, case_sen, g_utf8_collate_key_for_filename);
    return key;
}
#endif
/* --------------------------------------------------------------------------------------------- */
/**
 * Compare two sort keys.
 *
 * @param t1       first key
 * @param t2       second key
 * @param case_sen unused
 * @return strcmp() result for the two keys
 */
static int
str_utf8_key_collate (const char *t1, const char *t2, gboolean case_sen)
{
    int order;

    (void) case_sen;            /* keys are compared bytewise in both modes */

    order = strcmp (t1, t2);
    return order;
}
/* --------------------------------------------------------------------------------------------- */
/**
 * Free a sort key.
 *
 * @param key      key to free
 * @param case_sen unused
 */
static void
str_utf8_release_key (char *key, gboolean case_sen)
{
    (void) case_sen;            /* both kinds of key are released the same way */

    g_free (key);
}
/* --------------------------------------------------------------------------------------------- */
/*** public functions ****************************************************************************/
/* --------------------------------------------------------------------------------------------- */
/* Build the table of string operations for UTF-8: every member assigned below
 * points to the corresponding str_utf8_* function of this file.
 * The table is returned by value. */
struct str_class
str_utf8_init (void)
{
struct str_class result;
/* conversion helpers */
result.conv_gerror_message = str_utf8_conv_gerror_message;
result.vfs_convert_to = str_utf8_vfs_convert_to;
result.insert_replace_char = str_utf8_insert_replace_char;
/* validation and cursor movement */
result.is_valid_string = str_utf8_is_valid_string;
result.is_valid_char = str_utf8_is_valid_char;
result.cnext_char = str_utf8_cnext_char;
result.cprev_char = str_utf8_cprev_char;
result.cnext_char_safe = str_utf8_cnext_char_safe;
result.cprev_char_safe = str_utf8_cprev_char_safe;
result.cnext_noncomb_char = str_utf8_cnext_noncomb_char;
result.cprev_noncomb_char = str_utf8_cprev_noncomb_char;
/* character classification and case conversion */
result.char_isspace = str_utf8_isspace;
result.char_ispunct = str_utf8_ispunct;
result.char_isalnum = str_utf8_isalnum;
result.char_isdigit = str_utf8_isdigit;
result.char_isprint = str_utf8_isprint;
result.char_iscombiningmark = str_utf8_iscombiningmark;
result.char_toupper = str_utf8_toupper;
result.char_tolower = str_utf8_tolower;
/* lengths and repair */
result.length = str_utf8_length;
result.length2 = str_utf8_length2;
result.length_noncomb = str_utf8_length_noncomb;
result.fix_string = str_utf8_fix_string;
/* terminal output */
result.term_form = str_utf8_term_form;
result.fit_to_term = str_utf8_fit_to_term;
result.term_trim = str_utf8_term_trim;
result.term_width2 = str_utf8_term_width2;
result.term_width1 = str_utf8_term_width1;
result.term_char_width = str_utf8_term_char_width;
result.term_substring = str_utf8_term_substring;
result.trunc = str_utf8_trunc;
/* position conversion */
result.offset_to_pos = str_utf8_offset_to_pos;
result.column_to_pos = str_utf8_column_to_pos;
/* searching */
result.create_search_needle = str_utf8_create_search_needle;
result.release_search_needle = str_utf8_release_search_needle;
result.search_first = str_utf8_search_first;
result.search_last = str_utf8_search_last;
/* comparison and prefixes */
result.compare = str_utf8_compare;
result.ncompare = str_utf8_ncompare;
result.casecmp = str_utf8_casecmp;
result.ncasecmp = str_utf8_ncasecmp;
result.prefix = str_utf8_prefix;
result.caseprefix = str_utf8_caseprefix;
/* sort keys */
result.create_key = str_utf8_create_key;
#ifdef MC__USE_STR_UTF8_CREATE_KEY_FOR_FILENAME
/* case insensitive sort files in "a1 a2 a10" order */
result.create_key_for_filename = str_utf8_create_key_for_filename;
#else
/* case insensitive sort files in "a1 a10 a2" order */
result.create_key_for_filename = str_utf8_create_key;
#endif
result.key_collate = str_utf8_key_collate;
result.release_key = str_utf8_release_key;
return result;
}
/* --------------------------------------------------------------------------------------------- */