objstr: Record character length separately from byte length

CAUTION: Buggy, may crash - qstr does not yet record a character length and needs equivalent functionality too
2014-06-06 13:15:32 +10:00 · 2014-06-06 13:15:32 +10:00 · 47c234584d
commit 47c234584d
parent b0f41c72af
2 changed files with 21 additions and 11 deletions
--- a/py/objstr.c
+++ b/py/objstr.c
@ -52,6 +52,10 @@ const mp_obj_t mp_const_empty_bytes;
 // use this macro to extract the string data and length
 #define GET_STR_DATA_LEN_FLAGS(str_obj_in, str_data, str_len, str_flags) const byte *str_data; uint str_len; char str_flags; if (MP_OBJ_IS_QSTR(str_obj_in)) { str_data = qstr_data(MP_OBJ_QSTR_VALUE(str_obj_in), &str_len, &str_flags); } else { str_len = ((mp_obj_str_t*)str_obj_in)->len; str_data = ((mp_obj_str_t*)str_obj_in)->data; str_flags = ((mp_obj_str_t*)str_obj_in)->flags; }

+// use this macro to extract the string data, lengths, and flags
+// NOTE: Currently buggy for qstr, which doesn't record a charlen: str_charlen is left at (uint)-1 for qstr inputs
+#define GET_STR_INFO(str_obj_in, str_data, str_len, str_charlen, str_flags) const byte *str_data; uint str_len, str_charlen = -1; char str_flags; if (MP_OBJ_IS_QSTR(str_obj_in)) { str_data = qstr_data(MP_OBJ_QSTR_VALUE(str_obj_in), &str_len, &str_flags); } else { str_len = ((mp_obj_str_t*)str_obj_in)->len; str_charlen = ((mp_obj_str_t*)str_obj_in)->charlen; str_data = ((mp_obj_str_t*)str_obj_in)->data; str_flags = ((mp_obj_str_t*)str_obj_in)->flags; }
+
 // don't use this macro, it's only for conversions
 #define GET_STR_DATA_LEN(str_obj_in, str_data, str_len) GET_STR_DATA_LEN_FLAGS(str_obj_in, str_data, str_len, str_data ## _flags); assert(str_data ## _flags == 1);

@ -355,7 +359,7 @@ uncomparable:

 STATIC mp_obj_t str_subscr(mp_obj_t self_in, mp_obj_t index, mp_obj_t value) {
    mp_obj_type_t *type = mp_obj_get_type(self_in);
-    GET_STR_DATA_LEN(self_in, self_data, self_len);
+    GET_STR_INFO(self_in, self_data, self_len, self_charlen, self_flags);
    if (value == MP_OBJ_SENTINEL) {
        // load
 #if MICROPY_PY_BUILTINS_SLICE
@ -368,7 +372,7 @@ STATIC mp_obj_t str_subscr(mp_obj_t self_in, mp_obj_t index, mp_obj_t value) {
            return mp_obj_new_str_of_type(type, self_data + slice.start, slice.stop - slice.start);
        }
 #endif
-        uint index_val = mp_get_index(type, self_len, index, false);
+        uint index_val = mp_get_index(type, self_charlen, index, false);
        if (type == &mp_type_bytes) {
            return MP_OBJ_NEW_SMALL_INT((mp_small_int_t)self_data[index_val]);
        } else {
@ -377,8 +381,11 @@ STATIC mp_obj_t str_subscr(mp_obj_t self_in, mp_obj_t index, mp_obj_t value) {
            // end of the buffer if there aren't that many characters in it
            const char *s;
            for (s=(const char *)self_data; index_val; ++s)
-                if ((*s&0xC0) != 0x80) --index_val;
-            return mp_obj_new_str(s, 1, true);
+                if ((*s & 0xC0) != 0x80) --index_val;
+            int len = 1;
+            if (*s > 0x7f)
+                for (char mask = 0x40; *s & mask; mask >>= 1) ++len; // Count the number of 1 bits (after the first)
+            return mp_obj_new_str(s, len, true); // This will create a one-character string
        }
    } else {
        return MP_OBJ_NULL; // op not supported
@ -1710,7 +1717,7 @@ const mp_obj_type_t mp_type_bytes = {
 };

 // the zero-length bytes
-STATIC const mp_obj_str_t empty_bytes_obj = {{&mp_type_bytes}, 0, 0, 1, NULL};
+STATIC const mp_obj_str_t empty_bytes_obj = {{&mp_type_bytes}, 0, 0, 0, 1, NULL};
 const mp_obj_t mp_const_empty_bytes = (mp_obj_t)&empty_bytes_obj;

 mp_obj_t mp_obj_str_builder_start(const mp_obj_type_t *type, uint len, byte **data) {
@ -1739,12 +1746,12 @@ mp_obj_t mp_obj_new_str_of_type(const mp_obj_type_t *type, const byte* data, uin
    o->len = len;
    o->flags = 1;
    if (data) {
-        // Calculate the byte length used by 'len' characters (by counting non-continuation bytes)
+        // Count non-continuation bytes so we know how long the string is in characters.
        const byte *endptr, *top = data + len;
-        uint lenleft = len;
-        for (endptr = data; endptr < top && lenleft; ++endptr)
-            if ((*endptr & 0xC0) != 0x80) --lenleft;
-        len = endptr - data; // Work with the byte length now (the object's length is stored above)
+        uint charlen = 0;
+        for (endptr = data; endptr < top; ++endptr)
+            if ((*endptr & 0xC0) != 0x80) ++charlen;
+        o->charlen = charlen;
        o->hash = qstr_compute_hash(data, len);
        byte *p = m_new(byte, len + 1);
        o->data = p;
--- a/py/objstr.h
+++ b/py/objstr.h
@ -30,11 +30,14 @@ typedef struct _mp_obj_str_t {
    machine_uint_t hash : 16;
    // len == number of bytes used in data, alloc = len + 1 because (at the moment) we also append a null byte
    machine_uint_t len : 16;
+    // charlen == number of characters in the string - charlen <= len (equal for pure-ASCII data), and is the value returned by len() in Python
+    machine_uint_t charlen : 16;
    char flags; //Currently unused, always 1. Will later get markers eg ASCII-only.
    const void *data; //Character data is encoded UTF-8 and should not be blindly indexed.
 } mp_obj_str_t;

-#define MP_DEFINE_STR_OBJ(obj_name, str) mp_obj_str_t obj_name = {{&mp_type_str}, 0, sizeof(str) - 1, 1, (const byte*)str};
+// This is valid ONLY for pure-ASCII strings!
+#define MP_DEFINE_STR_OBJ(obj_name, str) mp_obj_str_t obj_name = {{&mp_type_str}, 0, sizeof(str) - 1, sizeof(str) - 1, 1, (const byte*)str};

 mp_obj_t mp_obj_str_format(uint n_args, const mp_obj_t *args);
 mp_obj_t mp_obj_new_str_of_type(const mp_obj_type_t *type, const byte* data, uint len);