diff --git a/src/chars.c b/src/chars.c index d36faaee..eb070794 100644 --- a/src/chars.c +++ b/src/chars.c @@ -35,8 +35,6 @@ static bool use_utf8 = FALSE; /* Whether we've enabled UTF-8 support. */ -static const char *const bad_mbchar = "\xEF\xBF\xBD"; -static const int bad_mbchar_len = 3; /* Enable UTF-8 support. */ void utf8_init(void) @@ -230,38 +228,32 @@ char control_mbrep(const char *c) return control_rep(*c); } -/* c is a multibyte non-control character. We return that multibyte - * character. If crep is an invalid multibyte sequence, it will be - * replaced with Unicode 0xFFFD (Replacement Character). */ -char *mbrep(const char *c, char *crep, int *crep_len) +/* Assess how many bytes the given (multibyte) character occupies. Return -1 + * if the byte sequence is invalid, and return the number of bytes minus 8 + * when the byte sequence encodes an invalid codepoint. */ +int length_of_char(const char *c) { - assert(c != NULL && crep != NULL && crep_len != NULL); + assert(c != NULL); #ifdef ENABLE_UTF8 if (use_utf8) { wchar_t wc; + int charlen = mbtowc(&wc, c, MB_CUR_MAX); - /* Reject invalid Unicode characters. */ - if (mbtowc(&wc, c, MB_CUR_MAX) < 0 || !is_valid_unicode(wc)) { + /* If the sequence is invalid... */ + if (charlen < 0) { mbtowc_reset(); - *crep_len = bad_mbchar_len; - strncpy(crep, bad_mbchar, *crep_len); - } else { - *crep_len = wctomb(crep, wc); - - if (*crep_len < 0) { - wctomb_reset(); - *crep_len = 0; - } + return -1; } + + /* If the codepoint is invalid... */ + if (!is_valid_unicode(wc)) + return charlen - 8; + else + return charlen; } else #endif - { - *crep_len = 1; - *crep = *c; - } - - return crep; + return 1; } /* This function is equivalent to wcwidth() for multibyte characters. */ diff --git a/src/proto.h b/src/proto.h index 3d154210..470df16a 100644 --- a/src/proto.h +++ b/src/proto.h @@ -188,7 +188,7 @@ bool is_punct_mbchar(const char *c); bool is_word_mbchar(const char *c, bool allow_punct); char control_rep(const signed char c); char control_mbrep(const char *c); -char *mbrep(const char *c, char *crep, int *crep_len); +int length_of_char(const char *c); int mbwidth(const char *c); int mb_cur_max(void); char *make_mbchar(long chr, int *chr_mb_len); diff --git a/src/winio.c b/src/winio.c index 3bfac99f..0ad2140c 100644 --- a/src/winio.c +++ b/src/winio.c @@ -1780,6 +1780,8 @@ char *display_string(const char *buf, size_t start_col, size_t len, bool } while (*buf != '\0') { + int charlength; + if (*buf == ' ') { /* Show a space as a visible character, or as a space. */ #ifndef NANO_TINY @@ -1792,6 +1794,8 @@ char *display_string(const char *buf, size_t start_col, size_t len, bool #endif converted[index++] = ' '; start_col++; + buf++; + continue; } else if (*buf == '\t') { /* Show a tab as a visible character, or as as a space. */ #ifndef NANO_TINY @@ -1809,30 +1813,46 @@ char *display_string(const char *buf, size_t start_col, size_t len, bool converted[index++] = ' '; start_col++; } + buf++; + continue; + } + + charlength = length_of_char(buf); + /* If buf contains a control character, represent it. */ - } else if (is_cntrl_mbchar(buf)) { + if (is_cntrl_mbchar(buf)) { converted[index++] = '^'; converted[index++] = control_mbrep(buf); start_col += 2; - /* If buf contains a non-control character, interpret it. If buf - * contains an invalid multibyte sequence, display it as such. */ - } else { - char *character = charalloc(mb_cur_max()); - int charlen, i; - character = mbrep(buf, character, &charlen); - - for (i = 0; i < charlen; i++) - converted[index++] = character[i]; - - start_col += mbwidth(character); - - free(character); - - if (mbwidth(buf) > 1) - seen_wide = TRUE; + buf += charlength; + continue; } - buf += parse_mbchar(buf, NULL, NULL); + /* If buf contains a valid non-control character, simply copy it. */ + if (charlength > 0) { + int width = mbwidth(buf); + + for (; charlength > 0; charlength--) + converted[index++] = *(buf++); + + start_col += width; + if (width > 1) + seen_wide = TRUE; + + continue; + } + + /* Represent an invalid sequence with the Replacement Character. */ + converted[index++] = '\xEF'; + converted[index++] = '\xBF'; + converted[index++] = '\xBD'; + + start_col += 1; + buf++; + + /* For invalid codepoints, skip extra bytes. */ + if (charlength < -1) + buf += charlength + 7; } /* Null-terminate converted. */