py: Add stream reading of n unicode chars; unicode support by default.

With unicode enabled, this patch allows reading a fixed number of characters from text-mode streams; eg file.read(5) will read 5 unicode chars, which can made of more than 5 bytes. For an ASCII stream (ie no chars > 127) it only needs to do 1 read. If there are lots of non-ASCII chars in a stream, then it needs multiple reads of the underlying object. Adds a new test for this case. Enables unicode support by default on unix and stmhal ports.
2014-07-16 11:45:10 +01:00 · 2014-07-16 11:45:10 +01:00 · 1694bc733d
commit 1694bc733d
parent 02bc882c3d
6 changed files with 103 additions and 4 deletions
--- a/py/stream.c
+++ b/py/stream.c
@ -67,6 +67,9 @@ STATIC mp_obj_t stream_read(uint n_args, const mp_obj_t *args) {
        nlr_raise(mp_obj_new_exception_msg(&mp_type_OSError, "Operation not supported"));
    }
    // What to do if sz < -1?  Python docs don't specify this case.
    // CPython does a readall, but here we silently let negatives through,
    // and they will cause a MemoryError.
    mp_int_t sz;
    if (n_args == 1 || ((sz = mp_obj_get_int(args[1])) == -1)) {
        return stream_readall(args[0]);
@ -74,7 +77,90 @@ STATIC mp_obj_t stream_read(uint n_args, const mp_obj_t *args) {
    #if MICROPY_PY_BUILTINS_STR_UNICODE
    if (!o->type->stream_p->is_bytes) {
-        mp_not_implemented("Reading from unicode text streams by character count");
+        // We need to read sz number of unicode characters.  Because we don't have any
        // buffering, and because the stream API can only read bytes, we must read here
        // in units of bytes and must never over read.  If we want sz chars, then reading
        // sz bytes will never over-read, so we follow this approach, in a loop to keep
        // reading until we have exactly enough chars.  This will be 1 read for text
        // with ASCII-only chars, and about 2 reads for text with a couple of non-ASCII
        // chars.  For text with lots of non-ASCII chars, it'll be pretty inefficient
        // in time and memory.
        vstr_t vstr;
        vstr_init(&vstr, sz);
        mp_uint_t more_bytes = sz;
        mp_uint_t last_buf_offset = 0;
        while (more_bytes > 0) {
            char *p = vstr_add_len(&vstr, more_bytes);
            if (p == NULL) {
                nlr_raise(mp_obj_new_exception_msg_varg(&mp_type_MemoryError, "out of memory"));
            }
            int error;
            mp_int_t out_sz = o->type->stream_p->read(o, p, more_bytes, &error);
            if (out_sz == -1) {
                vstr_cut_tail_bytes(&vstr, more_bytes);
                if (is_nonblocking_error(error)) {
                    // With non-blocking streams, we read as much as we can.
                    // If we read nothing, return None, just like read().
                    // Otherwise, return data read so far.
                    // TODO what if we have read only half a non-ASCII char?
                    if (vstr.len == 0) {
                        vstr_clear(&vstr);
                        return mp_const_none;
                    }
                    break;
                }
                nlr_raise(mp_obj_new_exception_msg_varg(&mp_type_OSError, "[Errno %d]", error));
            }
            if (out_sz == 0) {
                // Finish reading.
                // TODO what if we have read only half a non-ASCII char?
                vstr_cut_tail_bytes(&vstr, more_bytes);
                break;
            }
            // count chars from bytes just read
            for (mp_uint_t off = last_buf_offset;;) {
                byte b = vstr.buf[off];
                int n;
                if (!UTF8_IS_NONASCII(b)) {
                    // 1-byte ASCII char
                    n = 1;
                } else if ((b & 0xe0) == 0xc0) {
                    // 2-byte char
                    n = 2;
                } else if ((b & 0xf0) == 0xe0) {
                    // 3-byte char
                    n = 3;
                } else if ((b & 0xf8) == 0xf0) {
                    // 4-byte char
                    n = 4;
                } else {
                    // TODO
                    n = 5;
                }
                if (off + n <= vstr.len) {
                    // got a whole char in n bytes
                    off += n;
                    sz -= 1;
                    last_buf_offset = off;
                    if (off >= vstr.len) {
                        more_bytes = sz;
                        break;
                    }
                } else {
                    // didn't get a whole char, so work out how many extra bytes are needed for
                    // this partial char, plus bytes for additional chars that we want
                    more_bytes = (off + n - vstr.len) + (sz - 1);
                    break;
                }
            }
        }
        mp_obj_t ret = mp_obj_new_str_of_type(&mp_type_str, (byte*)vstr.buf, vstr.len);
        vstr_clear(&vstr);
        return ret;
    }
    #endif
--- a/stmhal/mpconfigport.h
+++ b/stmhal/mpconfigport.h
@ -44,7 +44,7 @@
 */
 #define MICROPY_ENABLE_LFN          (1)
 #define MICROPY_LFN_CODE_PAGE       (437) /* 1=SFN/ANSI 437=LFN/U.S.(OEM) */
-#define MICROPY_PY_BUILTINS_STR_UNICODE (0)
+#define MICROPY_PY_BUILTINS_STR_UNICODE (1)
 #define MICROPY_PY_BUILTINS_FROZENSET (1)
 #define MICROPY_PY_SYS_EXIT         (1)
 #define MICROPY_PY_SYS_STDFILES     (1)
--- a/tests/run-tests
+++ b/tests/run-tests
@ -134,7 +134,7 @@ def main():
        if args.test_dirs is None:
            if pyb is None:
                # run PC tests
-                test_dirs = ('basics', 'micropython', 'float', 'import', 'io', 'misc')
+                test_dirs = ('basics', 'micropython', 'float', 'import', 'io', 'misc', 'unicode')
            else:
                # run pyboard tests
                test_dirs = ('basics', 'micropython', 'float', 'pyb', 'pybnative', 'inlineasm')
--- a/tests/unicode/data/utf-8_2.txt
+++ b/tests/unicode/data/utf-8_2.txt
@ -0,0 +1 @@
 aαbβcγdδ
--- a/tests/unicode/file2.py
+++ b/tests/unicode/file2.py
@ -0,0 +1,12 @@
 # test reading a given number of characters
 def do(mode):
    f = open('unicode/data/utf-8_2.txt', mode)
    print(f.read(1))
    print(f.read(1))
    print(f.read(2))
    print(f.read(4))
    f.close()
 do('rb')
 do('rt')
--- a/unix/mpconfigport.h
+++ b/unix/mpconfigport.h
@ -43,7 +43,7 @@
 #define MICROPY_LONGINT_IMPL        (MICROPY_LONGINT_IMPL_MPZ)
 #define MICROPY_STREAMS_NON_BLOCK   (1)
 #define MICROPY_OPT_COMPUTED_GOTO   (1)
-#define MICROPY_PY_BUILTINS_STR_UNICODE (0)
+#define MICROPY_PY_BUILTINS_STR_UNICODE (1)
 #define MICROPY_PY_BUILTINS_FROZENSET (1)
 #define MICROPY_PY_SYS_EXIT         (1)
 #define MICROPY_PY_SYS_PLATFORM     "linux"