From 279de0c8eb3cb186914799ccc5ee94ea97f56de4 Mon Sep 17 00:00:00 2001 From: Chris Angelico Date: Sat, 7 Jun 2014 15:28:35 +1000 Subject: [PATCH] Formatting/layout improvements - introduce macros for UTF-8 byte detection, add braces. No functional changes. --- py/builtin.c | 4 ++-- py/misc.h | 2 ++ py/objstr.c | 29 +++++++++++++++++++++-------- py/qstr.c | 14 ++++++++++---- 4 files changed, 35 insertions(+), 14 deletions(-) diff --git a/py/builtin.c b/py/builtin.c index ded8e522f1..f5102feb59 100644 --- a/py/builtin.c +++ b/py/builtin.c @@ -360,12 +360,12 @@ STATIC mp_obj_t mp_builtin_ord(mp_obj_t o_in) { uint len, charlen; const char *str = mp_obj_str_get_data_len(o_in, &len, &charlen); if (charlen == 1) { - if (MP_OBJ_IS_STR(o_in) && (*str & 0x80)) { + if (MP_OBJ_IS_STR(o_in) && UTF8_IS_NONASCII(*str)) { machine_int_t ord = *str++ & 0x7F; for (machine_int_t mask = 0x40; ord & mask; mask >>= 1) { ord &= ~mask; } - while ((*str & 0xC0) == 0x80) { + while (UTF8_IS_CONT(*str)) { ord = (ord << 6) | (*str++ & 0x3F); } return mp_obj_new_int(ord); diff --git a/py/misc.h b/py/misc.h index fd54147efd..f2d375d251 100644 --- a/py/misc.h +++ b/py/misc.h @@ -100,6 +100,8 @@ bool unichar_isupper(unichar c); bool unichar_islower(unichar c); unichar unichar_tolower(unichar c); unichar unichar_toupper(unichar c); +#define UTF8_IS_NONASCII(ch) ((ch) & 0x80) +#define UTF8_IS_CONT(ch) (((ch) & 0xC0) == 0x80) /** variable string *********************************************/ diff --git a/py/objstr.c b/py/objstr.c index 9f6a9d5771..21b0b3a19b 100644 --- a/py/objstr.c +++ b/py/objstr.c @@ -109,7 +109,7 @@ void mp_str_print_quoted(void (*print)(void *env, const char *fmt, ...), void *e for (machine_int_t mask = 0x40; ord & mask; mask >>= 1) { ord &= ~mask; } - while ((*s & 0xC0) == 0x80) { + while (UTF8_IS_CONT(*s)) { ord = (ord << 6) | (*s++ & 0x3F); } --s; // s will be incremented by the main loop @@ -398,12 +398,22 @@ STATIC mp_obj_t str_subscr(mp_obj_t self_in, mp_obj_t index, mp_obj_t value) { // Assumes that the string is correctly formed - will run past the // end of the buffer if there aren't that many characters in it const char *s; - for (s=(const char *)self_data; index_val; ++s) - if ((*s & 0xC0) != 0x80) --index_val; - while ((*s & 0xC0) == 0x80) ++s; // Skip continuation bytes after the last lead byte + for (s=(const char *)self_data; index_val; ++s) { + if (!UTF8_IS_CONT(*s)) { + --index_val; + } + } + // Skip continuation bytes after the last lead byte + while (UTF8_IS_CONT(*s)) { + ++s; + } int len = 1; - if (*s & 0x80) - for (char mask = 0x40; *s & mask; mask >>= 1) ++len; // Count the number of 1 bits (after the first) + if (UTF8_IS_NONASCII(*s)) { + // Count the number of 1 bits (after the first) + for (char mask = 0x40; *s & mask; mask >>= 1) { + ++len; + } + } return mp_obj_new_str(s, len, true); // This will create a one-character string } } else { @@ -1769,8 +1779,11 @@ mp_obj_t mp_obj_new_str_of_type(const mp_obj_type_t *type, const byte* data, uin // Count non-continuation bytes so we know how long the string is in characters. const byte *endptr, *top = data + len; uint charlen = 0; - for (endptr = data; endptr < top; ++endptr) - if ((*endptr & 0xC0) != 0x80) ++charlen; + for (endptr = data; endptr < top; ++endptr) { + if (!UTF8_IS_CONT(*endptr)) { + ++charlen; + } + } o->charlen = charlen; } else { // For byte strings, the 'character' length (really the "exposed length" or "Python length") equals the byte length. diff --git a/py/qstr.c b/py/qstr.c index 4013a67f79..5637aea77d 100644 --- a/py/qstr.c +++ b/py/qstr.c @@ -162,8 +162,11 @@ qstr qstr_from_strn(const char *str, uint len) { machine_uint_t hash = qstr_compute_hash((const byte*)str, len); byte *q_ptr = m_new(byte, 7 + len + 1); uint charlen = 0; - for (const char *s = str; s < str + len; ++s) - if ((*s & 0xC0) != 0x80) ++charlen; + for (const char *s = str; s < str + len; ++s) { + if (!UTF8_IS_CONT(*s)) { + ++charlen; + } + } q_ptr[0] = hash; q_ptr[1] = hash >> 8; q_ptr[2] = len; @@ -195,8 +198,11 @@ qstr qstr_build_end(byte *q_ptr) { q_ptr[0] = hash; q_ptr[1] = hash >> 8; uint charlen = 0; - for (const byte *s = str; s < str + len; ++s) - if ((*s & 0xC0) != 0x80) ++charlen; + for (const byte *s = str; s < str + len; ++s) { + if (!UTF8_IS_CONT(*s)) { + ++charlen; + } + } q_ptr[4] = charlen; q_ptr[5] = charlen >> 8; q_ptr[6] = 1;