Formatting/layout improvements - introduce macros for UTF-8 byte detection, add braces. No functional changes.
This commit is contained in:
parent
f1911f53d5
commit
279de0c8eb
|
@ -360,12 +360,12 @@ STATIC mp_obj_t mp_builtin_ord(mp_obj_t o_in) {
|
|||
uint len, charlen;
|
||||
const char *str = mp_obj_str_get_data_len(o_in, &len, &charlen);
|
||||
if (charlen == 1) {
|
||||
if (MP_OBJ_IS_STR(o_in) && (*str & 0x80)) {
|
||||
if (MP_OBJ_IS_STR(o_in) && UTF8_IS_NONASCII(*str)) {
|
||||
machine_int_t ord = *str++ & 0x7F;
|
||||
for (machine_int_t mask = 0x40; ord & mask; mask >>= 1) {
|
||||
ord &= ~mask;
|
||||
}
|
||||
while ((*str & 0xC0) == 0x80) {
|
||||
while (UTF8_IS_CONT(*str)) {
|
||||
ord = (ord << 6) | (*str++ & 0x3F);
|
||||
}
|
||||
return mp_obj_new_int(ord);
|
||||
|
|
|
@ -100,6 +100,8 @@ bool unichar_isupper(unichar c);
|
|||
bool unichar_islower(unichar c);
|
||||
unichar unichar_tolower(unichar c);
|
||||
unichar unichar_toupper(unichar c);
|
||||
// Tests whether byte `ch` has its top bit (0x80) set, i.e. it is not a 7-bit
// ASCII byte but part of a multi-byte UTF-8 sequence (lead or continuation).
// Evaluates to 0 or 0x80 (not 0/1), so use the result only as a truth value.
#define UTF8_IS_NONASCII(ch) ((ch) & 0x80)
|
||||
// Tests whether byte `ch` is a UTF-8 continuation byte, i.e. its top two bits
// are 10 (bit pattern 10xxxxxx). Evaluates to 1 if so, 0 otherwise.
#define UTF8_IS_CONT(ch) (((ch) & 0xC0) == 0x80)
|
||||
|
||||
/** variable string *********************************************/
|
||||
|
||||
|
|
29
py/objstr.c
29
py/objstr.c
|
@ -109,7 +109,7 @@ void mp_str_print_quoted(void (*print)(void *env, const char *fmt, ...), void *e
|
|||
for (machine_int_t mask = 0x40; ord & mask; mask >>= 1) {
|
||||
ord &= ~mask;
|
||||
}
|
||||
while ((*s & 0xC0) == 0x80) {
|
||||
while (UTF8_IS_CONT(*s)) {
|
||||
ord = (ord << 6) | (*s++ & 0x3F);
|
||||
}
|
||||
--s; // s will be incremented by the main loop
|
||||
|
@ -398,12 +398,22 @@ STATIC mp_obj_t str_subscr(mp_obj_t self_in, mp_obj_t index, mp_obj_t value) {
|
|||
// Assumes that the string is correctly formed - will run past the
|
||||
// end of the buffer if there aren't that many characters in it
|
||||
const char *s;
|
||||
for (s=(const char *)self_data; index_val; ++s)
|
||||
if ((*s & 0xC0) != 0x80) --index_val;
|
||||
while ((*s & 0xC0) == 0x80) ++s; // Skip continuation bytes after the last lead byte
|
||||
for (s=(const char *)self_data; index_val; ++s) {
|
||||
if (!UTF8_IS_CONT(*s)) {
|
||||
--index_val;
|
||||
}
|
||||
}
|
||||
// Skip continuation bytes after the last lead byte
|
||||
while (UTF8_IS_CONT(*s)) {
|
||||
++s;
|
||||
}
|
||||
int len = 1;
|
||||
if (*s & 0x80)
|
||||
for (char mask = 0x40; *s & mask; mask >>= 1) ++len; // Count the number of 1 bits (after the first)
|
||||
if (UTF8_IS_NONASCII(*s)) {
|
||||
// Count the number of 1 bits (after the first)
|
||||
for (char mask = 0x40; *s & mask; mask >>= 1) {
|
||||
++len;
|
||||
}
|
||||
}
|
||||
return mp_obj_new_str(s, len, true); // This will create a one-character string
|
||||
}
|
||||
} else {
|
||||
|
@ -1769,8 +1779,11 @@ mp_obj_t mp_obj_new_str_of_type(const mp_obj_type_t *type, const byte* data, uin
|
|||
// Count non-continuation bytes so we know how long the string is in characters.
|
||||
const byte *endptr, *top = data + len;
|
||||
uint charlen = 0;
|
||||
for (endptr = data; endptr < top; ++endptr)
|
||||
if ((*endptr & 0xC0) != 0x80) ++charlen;
|
||||
for (endptr = data; endptr < top; ++endptr) {
|
||||
if (!UTF8_IS_CONT(*endptr)) {
|
||||
++charlen;
|
||||
}
|
||||
}
|
||||
o->charlen = charlen;
|
||||
} else {
|
||||
// For byte strings, the 'character' length (really the "exposed length" or "Python length") equals the byte length.
|
||||
|
|
14
py/qstr.c
14
py/qstr.c
|
@ -162,8 +162,11 @@ qstr qstr_from_strn(const char *str, uint len) {
|
|||
machine_uint_t hash = qstr_compute_hash((const byte*)str, len);
|
||||
byte *q_ptr = m_new(byte, 7 + len + 1);
|
||||
uint charlen = 0;
|
||||
for (const char *s = str; s < str + len; ++s)
|
||||
if ((*s & 0xC0) != 0x80) ++charlen;
|
||||
for (const char *s = str; s < str + len; ++s) {
|
||||
if (!UTF8_IS_CONT(*s)) {
|
||||
++charlen;
|
||||
}
|
||||
}
|
||||
q_ptr[0] = hash;
|
||||
q_ptr[1] = hash >> 8;
|
||||
q_ptr[2] = len;
|
||||
|
@ -195,8 +198,11 @@ qstr qstr_build_end(byte *q_ptr) {
|
|||
q_ptr[0] = hash;
|
||||
q_ptr[1] = hash >> 8;
|
||||
uint charlen = 0;
|
||||
for (const byte *s = str; s < str + len; ++s)
|
||||
if ((*s & 0xC0) != 0x80) ++charlen;
|
||||
for (const byte *s = str; s < str + len; ++s) {
|
||||
if (!UTF8_IS_CONT(*s)) {
|
||||
++charlen;
|
||||
}
|
||||
}
|
||||
q_ptr[4] = charlen;
|
||||
q_ptr[5] = charlen >> 8;
|
||||
q_ptr[6] = 1;
|
||||
|
|
Loading…
Reference in New Issue