objstr: Record character length separately from byte length

CAUTION: Buggy, may crash stuff - qstr needs equivalent functionality too
This commit is contained in:
Chris Angelico 2014-06-06 13:15:32 +10:00
parent b0f41c72af
commit 47c234584d
2 changed files with 21 additions and 11 deletions

View File

@ -52,6 +52,10 @@ const mp_obj_t mp_const_empty_bytes;
// Extract a string object's data pointer, byte length and flags.
// The macro declares str_data, str_len and str_flags in the enclosing scope:
// a qstr is resolved through qstr_data(), any other object is read directly
// from its mp_obj_str_t fields.
#define GET_STR_DATA_LEN_FLAGS(str_obj_in, str_data, str_len, str_flags) \
    const byte *str_data; \
    uint str_len; \
    char str_flags; \
    if (!MP_OBJ_IS_QSTR(str_obj_in)) { \
        str_data = ((mp_obj_str_t*)str_obj_in)->data; \
        str_len = ((mp_obj_str_t*)str_obj_in)->len; \
        str_flags = ((mp_obj_str_t*)str_obj_in)->flags; \
    } else { \
        str_data = qstr_data(MP_OBJ_QSTR_VALUE(str_obj_in), &str_len, &str_flags); \
    }
// Extract a string object's data pointer, byte length, character length and
// flags.  The macro declares str_data, str_len, str_charlen and str_flags in
// the enclosing scope (so it cannot be wrapped in do { } while (0)).
// For a non-qstr object all four values are read from its mp_obj_str_t fields.
// qstr_data() only reports the byte length and flags, so for a qstr the
// character length is computed here by counting every byte that is not a
// UTF-8 continuation byte (10xxxxxx).  This costs one pass over the data per
// use, but avoids leaving str_charlen at (uint)-1, which would make any
// bounds check against it meaningless.
#define GET_STR_INFO(str_obj_in, str_data, str_len, str_charlen, str_flags) \
    const byte *str_data; \
    uint str_len, str_charlen = 0; \
    char str_flags; \
    if (MP_OBJ_IS_QSTR(str_obj_in)) { \
        str_data = qstr_data(MP_OBJ_QSTR_VALUE(str_obj_in), &str_len, &str_flags); \
        for (uint str_len ## _i = 0; str_len ## _i < str_len; ++str_len ## _i) { \
            if ((str_data[str_len ## _i] & 0xC0) != 0x80) { \
                ++str_charlen; \
            } \
        } \
    } else { \
        str_len = ((mp_obj_str_t*)str_obj_in)->len; \
        str_charlen = ((mp_obj_str_t*)str_obj_in)->charlen; \
        str_data = ((mp_obj_str_t*)str_obj_in)->data; \
        str_flags = ((mp_obj_str_t*)str_obj_in)->flags; \
    }
// Conversion-only helper: do not use it in new code.
// Expands GET_STR_DATA_LEN_FLAGS with the flags stored in a variable named
// <str_data>_flags, then asserts that those flags equal 1.
#define GET_STR_DATA_LEN(str_obj_in, str_data, str_len) \
    GET_STR_DATA_LEN_FLAGS(str_obj_in, str_data, str_len, str_data ## _flags); \
    assert(str_data ## _flags == 1);
@ -355,7 +359,7 @@ uncomparable:
STATIC mp_obj_t str_subscr(mp_obj_t self_in, mp_obj_t index, mp_obj_t value) {
mp_obj_type_t *type = mp_obj_get_type(self_in);
GET_STR_DATA_LEN(self_in, self_data, self_len);
GET_STR_INFO(self_in, self_data, self_len, self_charlen, self_flags);
if (value == MP_OBJ_SENTINEL) {
// load
#if MICROPY_PY_BUILTINS_SLICE
@ -368,7 +372,7 @@ STATIC mp_obj_t str_subscr(mp_obj_t self_in, mp_obj_t index, mp_obj_t value) {
return mp_obj_new_str_of_type(type, self_data + slice.start, slice.stop - slice.start);
}
#endif
uint index_val = mp_get_index(type, self_len, index, false);
uint index_val = mp_get_index(type, self_charlen, index, false);
if (type == &mp_type_bytes) {
return MP_OBJ_NEW_SMALL_INT((mp_small_int_t)self_data[index_val]);
} else {
@ -377,8 +381,11 @@ STATIC mp_obj_t str_subscr(mp_obj_t self_in, mp_obj_t index, mp_obj_t value) {
// end of the buffer if there aren't that many characters in it
const char *s;
for (s=(const char *)self_data; index_val; ++s)
if ((*s&0xC0) != 0x80) --index_val;
return mp_obj_new_str(s, 1, true);
if ((*s & 0xC0) != 0x80) --index_val;
int len = 1;
if (*s > 0x7f)
for (char mask = 0x40; *s & mask; mask >>= 1) ++len; // Count the number of 1 bits (after the first)
return mp_obj_new_str(s, len, true); // This will create a one-character string
}
} else {
return MP_OBJ_NULL; // op not supported
@ -1710,7 +1717,7 @@ const mp_obj_type_t mp_type_bytes = {
};
// the zero-length bytes
STATIC const mp_obj_str_t empty_bytes_obj = {{&mp_type_bytes}, 0, 0, 1, NULL};
STATIC const mp_obj_str_t empty_bytes_obj = {{&mp_type_bytes}, 0, 0, 0, 1, NULL};
const mp_obj_t mp_const_empty_bytes = (mp_obj_t)&empty_bytes_obj;
mp_obj_t mp_obj_str_builder_start(const mp_obj_type_t *type, uint len, byte **data) {
@ -1739,12 +1746,12 @@ mp_obj_t mp_obj_new_str_of_type(const mp_obj_type_t *type, const byte* data, uin
o->len = len;
o->flags = 1;
if (data) {
// Calculate the byte length used by 'len' characters (by counting non-continuation bytes)
// Count non-continuation bytes so we know how long the string is in characters.
const byte *endptr, *top = data + len;
uint lenleft = len;
for (endptr = data; endptr < top && lenleft; ++endptr)
if ((*endptr & 0xC0) != 0x80) --lenleft;
len = endptr - data; // Work with the byte length now (the object's length is stored above)
uint charlen = 0;
for (endptr = data; endptr < top; ++endptr)
if ((*endptr & 0xC0) != 0x80) ++charlen;
o->charlen = charlen;
o->hash = qstr_compute_hash(data, len);
byte *p = m_new(byte, len + 1);
o->data = p;

View File

@ -30,11 +30,14 @@ typedef struct _mp_obj_str_t {
machine_uint_t hash : 16;
// len == number of bytes used in data, alloc = len + 1 because (at the moment) we also append a null byte
machine_uint_t len : 16;
// charlen == number of characters in the string - charlen <= len (equal for pure-ASCII data), and is the value returned by len() in Python
machine_uint_t charlen : 16;
char flags; //Currently unused, always 1. Will later get markers eg ASCII-only.
const void *data; //Character data is encoded UTF-8 and should not be blindly indexed.
} mp_obj_str_t;
#define MP_DEFINE_STR_OBJ(obj_name, str) mp_obj_str_t obj_name = {{&mp_type_str}, 0, sizeof(str) - 1, 1, (const byte*)str};
// This is valid ONLY for pure-ASCII strings!
// Defines an mp_obj_str_t named obj_name from the string literal str.
// The initializer is positional: the inner brace group holds &mp_type_str,
// followed by hash (0), len, charlen, flags (1) and the data pointer.
// len and charlen are both set to sizeof(str) - 1, i.e. one character per
// byte, which is why a literal containing multi-byte UTF-8 sequences would
// get a wrong charlen.
#define MP_DEFINE_STR_OBJ(obj_name, str) mp_obj_str_t obj_name = {{&mp_type_str}, 0, sizeof(str) - 1, sizeof(str) - 1, 1, (const byte*)str};
mp_obj_t mp_obj_str_format(uint n_args, const mp_obj_t *args);
mp_obj_t mp_obj_new_str_of_type(const mp_obj_type_t *type, const byte* data, uint len);