objstr: Record character length separately from byte length
CAUTION: Buggy, may crash stuff - qstr needs equivalent functionality too
This commit is contained in:
parent
b0f41c72af
commit
47c234584d
27
py/objstr.c
27
py/objstr.c
@ -52,6 +52,10 @@ const mp_obj_t mp_const_empty_bytes;
|
||||
// use this macro to extract the string data and length
|
||||
#define GET_STR_DATA_LEN_FLAGS(str_obj_in, str_data, str_len, str_flags) const byte *str_data; uint str_len; char str_flags; if (MP_OBJ_IS_QSTR(str_obj_in)) { str_data = qstr_data(MP_OBJ_QSTR_VALUE(str_obj_in), &str_len, &str_flags); } else { str_len = ((mp_obj_str_t*)str_obj_in)->len; str_data = ((mp_obj_str_t*)str_obj_in)->data; str_flags = ((mp_obj_str_t*)str_obj_in)->flags; }
|
||||
|
||||
// use this macro to extract the string data, lengths, and flags
|
||||
// NOTE: Currently buggy as regards qstr, which doesn't record a charlen
|
||||
#define GET_STR_INFO(str_obj_in, str_data, str_len, str_charlen, str_flags) const byte *str_data; uint str_len, str_charlen = -1; char str_flags; if (MP_OBJ_IS_QSTR(str_obj_in)) { str_data = qstr_data(MP_OBJ_QSTR_VALUE(str_obj_in), &str_len, &str_flags); } else { str_len = ((mp_obj_str_t*)str_obj_in)->len; str_charlen = ((mp_obj_str_t*)str_obj_in)->charlen; str_data = ((mp_obj_str_t*)str_obj_in)->data; str_flags = ((mp_obj_str_t*)str_obj_in)->flags; }
|
||||
|
||||
// don't use this macro, it's only for conversions
|
||||
#define GET_STR_DATA_LEN(str_obj_in, str_data, str_len) GET_STR_DATA_LEN_FLAGS(str_obj_in, str_data, str_len, str_data ## _flags); assert(str_data ## _flags == 1);
|
||||
|
||||
@ -355,7 +359,7 @@ uncomparable:
|
||||
|
||||
STATIC mp_obj_t str_subscr(mp_obj_t self_in, mp_obj_t index, mp_obj_t value) {
|
||||
mp_obj_type_t *type = mp_obj_get_type(self_in);
|
||||
GET_STR_DATA_LEN(self_in, self_data, self_len);
|
||||
GET_STR_INFO(self_in, self_data, self_len, self_charlen, self_flags);
|
||||
if (value == MP_OBJ_SENTINEL) {
|
||||
// load
|
||||
#if MICROPY_PY_BUILTINS_SLICE
|
||||
@ -368,7 +372,7 @@ STATIC mp_obj_t str_subscr(mp_obj_t self_in, mp_obj_t index, mp_obj_t value) {
|
||||
return mp_obj_new_str_of_type(type, self_data + slice.start, slice.stop - slice.start);
|
||||
}
|
||||
#endif
|
||||
uint index_val = mp_get_index(type, self_len, index, false);
|
||||
uint index_val = mp_get_index(type, self_charlen, index, false);
|
||||
if (type == &mp_type_bytes) {
|
||||
return MP_OBJ_NEW_SMALL_INT((mp_small_int_t)self_data[index_val]);
|
||||
} else {
|
||||
@ -377,8 +381,11 @@ STATIC mp_obj_t str_subscr(mp_obj_t self_in, mp_obj_t index, mp_obj_t value) {
|
||||
// end of the buffer if there aren't that many characters in it
|
||||
const char *s;
|
||||
for (s=(const char *)self_data; index_val; ++s)
|
||||
if ((*s&0xC0) != 0x80) --index_val;
|
||||
return mp_obj_new_str(s, 1, true);
|
||||
if ((*s & 0xC0) != 0x80) --index_val;
|
||||
int len = 1;
|
||||
if (*s > 0x7f)
|
||||
for (char mask = 0x40; *s & mask; mask >>= 1) ++len; // Count the number of 1 bits (after the first)
|
||||
return mp_obj_new_str(s, len, true); // This will create a one-character string
|
||||
}
|
||||
} else {
|
||||
return MP_OBJ_NULL; // op not supported
|
||||
@ -1710,7 +1717,7 @@ const mp_obj_type_t mp_type_bytes = {
|
||||
};
|
||||
|
||||
// the zero-length bytes
|
||||
STATIC const mp_obj_str_t empty_bytes_obj = {{&mp_type_bytes}, 0, 0, 1, NULL};
|
||||
STATIC const mp_obj_str_t empty_bytes_obj = {{&mp_type_bytes}, 0, 0, 0, 1, NULL};
|
||||
const mp_obj_t mp_const_empty_bytes = (mp_obj_t)&empty_bytes_obj;
|
||||
|
||||
mp_obj_t mp_obj_str_builder_start(const mp_obj_type_t *type, uint len, byte **data) {
|
||||
@ -1739,12 +1746,12 @@ mp_obj_t mp_obj_new_str_of_type(const mp_obj_type_t *type, const byte* data, uin
|
||||
o->len = len;
|
||||
o->flags = 1;
|
||||
if (data) {
|
||||
// Calculate the byte length used by 'len' characters (by counting non-continuation bytes)
|
||||
// Count non-continuation bytes so we know how long the string is in characters.
|
||||
const byte *endptr, *top = data + len;
|
||||
uint lenleft = len;
|
||||
for (endptr = data; endptr < top && lenleft; ++endptr)
|
||||
if ((*endptr & 0xC0) != 0x80) --lenleft;
|
||||
len = endptr - data; // Work with the byte length now (the object's length is stored above)
|
||||
uint charlen = 0;
|
||||
for (endptr = data; endptr < top; ++endptr)
|
||||
if ((*endptr & 0xC0) != 0x80) ++charlen;
|
||||
o->charlen = charlen;
|
||||
o->hash = qstr_compute_hash(data, len);
|
||||
byte *p = m_new(byte, len + 1);
|
||||
o->data = p;
|
||||
|
@ -30,11 +30,14 @@ typedef struct _mp_obj_str_t {
|
||||
machine_uint_t hash : 16;
|
||||
// len == number of bytes used in data, alloc = len + 1 because (at the moment) we also append a null byte
|
||||
machine_uint_t len : 16;
|
||||
// charlen == number of characters in the string - charlen <= len - 1, and is the value returned by len() in Python
|
||||
machine_uint_t charlen : 16;
|
||||
char flags; //Currently unused, always 1. Will later get markers eg ASCII-only.
|
||||
const void *data; //Character data is encoded UTF-8 and should not be blindly indexed.
|
||||
} mp_obj_str_t;
|
||||
|
||||
#define MP_DEFINE_STR_OBJ(obj_name, str) mp_obj_str_t obj_name = {{&mp_type_str}, 0, sizeof(str) - 1, 1, (const byte*)str};
|
||||
// This is valid ONLY for pure-ASCII strings!
|
||||
#define MP_DEFINE_STR_OBJ(obj_name, str) mp_obj_str_t obj_name = {{&mp_type_str}, 0, sizeof(str) - 1, sizeof(str) - 1, 1, (const byte*)str};
|
||||
|
||||
mp_obj_t mp_obj_str_format(uint n_args, const mp_obj_t *args);
|
||||
mp_obj_t mp_obj_new_str_of_type(const mp_obj_type_t *type, const byte* data, uint len);
|
||||
|
Loading…
Reference in New Issue
Block a user