Merge pull request #759 from micropython/unicode-read-chars
py: Add stream reading of n unicode chars; unicode support by default.
This commit is contained in:
commit
2c0701101b
88
py/stream.c
88
py/stream.c
|
@ -67,6 +67,9 @@ STATIC mp_obj_t stream_read(uint n_args, const mp_obj_t *args) {
|
|||
nlr_raise(mp_obj_new_exception_msg(&mp_type_OSError, "Operation not supported"));
|
||||
}
|
||||
|
||||
// What to do if sz < -1? Python docs don't specify this case.
|
||||
// CPython does a readall, but here we silently let negatives through,
|
||||
// and they will cause a MemoryError.
|
||||
mp_int_t sz;
|
||||
if (n_args == 1 || ((sz = mp_obj_get_int(args[1])) == -1)) {
|
||||
return stream_readall(args[0]);
|
||||
|
@ -74,7 +77,90 @@ STATIC mp_obj_t stream_read(uint n_args, const mp_obj_t *args) {
|
|||
|
||||
#if MICROPY_PY_BUILTINS_STR_UNICODE
|
||||
if (!o->type->stream_p->is_bytes) {
|
||||
mp_not_implemented("Reading from unicode text streams by character count");
|
||||
// We need to read sz number of unicode characters. Because we don't have any
|
||||
// buffering, and because the stream API can only read bytes, we must read here
|
||||
// in units of bytes and must never over-read. If we want sz chars, then reading
|
||||
// sz bytes will never over-read, so we follow this approach, in a loop to keep
|
||||
// reading until we have exactly enough chars. This will be 1 read for text
|
||||
// with ASCII-only chars, and about 2 reads for text with a couple of non-ASCII
|
||||
// chars. For text with lots of non-ASCII chars, it'll be pretty inefficient
|
||||
// in time and memory.
|
||||
|
||||
vstr_t vstr;
|
||||
vstr_init(&vstr, sz);
|
||||
mp_uint_t more_bytes = sz;
|
||||
mp_uint_t last_buf_offset = 0;
|
||||
while (more_bytes > 0) {
|
||||
char *p = vstr_add_len(&vstr, more_bytes);
|
||||
if (p == NULL) {
|
||||
nlr_raise(mp_obj_new_exception_msg_varg(&mp_type_MemoryError, "out of memory"));
|
||||
}
|
||||
int error;
|
||||
mp_int_t out_sz = o->type->stream_p->read(o, p, more_bytes, &error);
|
||||
if (out_sz == -1) {
|
||||
vstr_cut_tail_bytes(&vstr, more_bytes);
|
||||
if (is_nonblocking_error(error)) {
|
||||
// With non-blocking streams, we read as much as we can.
|
||||
// If we read nothing, return None, just like read().
|
||||
// Otherwise, return data read so far.
|
||||
// TODO what if we have read only half a non-ASCII char?
|
||||
if (vstr.len == 0) {
|
||||
vstr_clear(&vstr);
|
||||
return mp_const_none;
|
||||
}
|
||||
break;
|
||||
}
|
||||
nlr_raise(mp_obj_new_exception_msg_varg(&mp_type_OSError, "[Errno %d]", error));
|
||||
}
|
||||
|
||||
if (out_sz == 0) {
|
||||
// Finish reading.
|
||||
// TODO what if we have read only half a non-ASCII char?
|
||||
vstr_cut_tail_bytes(&vstr, more_bytes);
|
||||
break;
|
||||
}
|
||||
|
||||
// count chars from bytes just read
|
||||
for (mp_uint_t off = last_buf_offset;;) {
|
||||
byte b = vstr.buf[off];
|
||||
int n;
|
||||
if (!UTF8_IS_NONASCII(b)) {
|
||||
// 1-byte ASCII char
|
||||
n = 1;
|
||||
} else if ((b & 0xe0) == 0xc0) {
|
||||
// 2-byte char
|
||||
n = 2;
|
||||
} else if ((b & 0xf0) == 0xe0) {
|
||||
// 3-byte char
|
||||
n = 3;
|
||||
} else if ((b & 0xf8) == 0xf0) {
|
||||
// 4-byte char
|
||||
n = 4;
|
||||
} else {
|
||||
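// TODO: b is not a valid UTF-8 lead byte here (it is a continuation byte
// 0x80..0xbf or 0xf8..0xff); decide how to handle invalid input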
|
||||
n = 5;
|
||||
}
|
||||
if (off + n <= vstr.len) {
|
||||
// got a whole char in n bytes
|
||||
off += n;
|
||||
sz -= 1;
|
||||
last_buf_offset = off;
|
||||
if (off >= vstr.len) {
|
||||
more_bytes = sz;
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
// didn't get a whole char, so work out how many extra bytes are needed for
|
||||
// this partial char, plus bytes for additional chars that we want
|
||||
more_bytes = (off + n - vstr.len) + (sz - 1);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
mp_obj_t ret = mp_obj_new_str_of_type(&mp_type_str, (byte*)vstr.buf, vstr.len);
|
||||
vstr_clear(&vstr);
|
||||
return ret;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
|
|
@ -44,7 +44,7 @@
|
|||
*/
|
||||
#define MICROPY_ENABLE_LFN (1)
|
||||
#define MICROPY_LFN_CODE_PAGE (437) /* 1=SFN/ANSI 437=LFN/U.S.(OEM) */
|
||||
#define MICROPY_PY_BUILTINS_STR_UNICODE (0)
|
||||
#define MICROPY_PY_BUILTINS_STR_UNICODE (1)
|
||||
#define MICROPY_PY_BUILTINS_FROZENSET (1)
|
||||
#define MICROPY_PY_SYS_EXIT (1)
|
||||
#define MICROPY_PY_SYS_STDFILES (1)
|
||||
|
|
|
@ -134,7 +134,7 @@ def main():
|
|||
if args.test_dirs is None:
|
||||
if pyb is None:
|
||||
# run PC tests
|
||||
test_dirs = ('basics', 'micropython', 'float', 'import', 'io', 'misc')
|
||||
test_dirs = ('basics', 'micropython', 'float', 'import', 'io', 'misc', 'unicode')
|
||||
else:
|
||||
# run pyboard tests
|
||||
test_dirs = ('basics', 'micropython', 'float', 'pyb', 'pybnative', 'inlineasm')
|
||||
|
|
|
@ -0,0 +1 @@
|
|||
aαbβcγdδ
|
|
@ -0,0 +1,12 @@
|
|||
# test reading a given number of characters
|
||||
|
||||
def do(mode):
    """Read the UTF-8 sample file in successive chunks and print each chunk.

    The file is opened with the given ``mode`` and read with sizes
    1, 1, 2 and 4, printing the result of every ``read()`` call.  In a
    binary mode the sizes count bytes; in a text mode they count
    characters, which is the behaviour this test exercises.

    Args:
        mode: Mode string passed straight to ``open()`` (the callers in
            this file use ``'rb'`` and ``'rt'``).
    """
    # Use a context manager so the file is closed even if a read or print
    # raises part-way through the sequence.
    with open('unicode/data/utf-8_2.txt', mode) as f:
        for count in (1, 1, 2, 4):
            print(f.read(count))
|
||||
|
||||
do('rb')
|
||||
do('rt')
|
|
@ -43,7 +43,7 @@
|
|||
#define MICROPY_LONGINT_IMPL (MICROPY_LONGINT_IMPL_MPZ)
|
||||
#define MICROPY_STREAMS_NON_BLOCK (1)
|
||||
#define MICROPY_OPT_COMPUTED_GOTO (1)
|
||||
#define MICROPY_PY_BUILTINS_STR_UNICODE (0)
|
||||
#define MICROPY_PY_BUILTINS_STR_UNICODE (1)
|
||||
#define MICROPY_PY_BUILTINS_FROZENSET (1)
|
||||
#define MICROPY_PY_SYS_EXIT (1)
|
||||
#define MICROPY_PY_SYS_PLATFORM "linux"
|
||||
|
|
Loading…
Reference in New Issue