From 5bea90c2e3328c12ad28e7ca513a9d7c859798b1 Mon Sep 17 00:00:00 2001 From: stephanelsmith Date: Tue, 15 Aug 2023 01:40:59 +0000 Subject: [PATCH] py/stream.c: Fixed stream write unicode count. The current implementation of mp_stream_rw always returns the number of bytes written. CPython returns the number of characters written when a str is written to a text stream. This commit makes writes to text streams return the number of unicode characters written, so that the two implementations agree. Signed-off-by: stephanelsmith --- py/stream.c | 39 +++++++++++++++++++++++++++- tests/unicode/unicode_write_count.py | 7 +++++ 2 files changed, 45 insertions(+), 1 deletion(-) create mode 100644 tests/unicode/unicode_write_count.py diff --git a/py/stream.c b/py/stream.c index ac0234ac13..fe5a70c59f 100644 --- a/py/stream.c +++ b/py/stream.c @@ -54,10 +54,15 @@ mp_uint_t mp_stream_rw(mp_obj_t stream, void *buf_, mp_uint_t size, int *errcode io_func = stream_p->read; } + #if MICROPY_PY_BUILTINS_STR_UNICODE + uint32_t i_residue = 0; + #endif + *errcode = 0; mp_uint_t done = 0; while (size > 0) { mp_uint_t out_sz = io_func(stream, buf, size, errcode); + // For read, out_sz == 0 means EOF. For write, it's unspecified // what it means, but we don't make any progress, so returning // is still the best option. 
@@ -74,11 +79,43 @@ mp_uint_t mp_stream_rw(mp_obj_t stream, void *buf_, mp_uint_t size, int *errcode if (flags & MP_STREAM_RW_ONCE) { return out_sz; } + #if MICROPY_PY_BUILTINS_STR_UNICODE + if (stream_p->is_text && (flags & MP_STREAM_RW_WRITE)) { + // On text writes, the returned count is the number of unicode + // characters written (vs bytes written) + uint32_t i = i_residue; + while (i < out_sz) { + uint8_t b = *(buf + i); + done += 1; + if (!UTF8_IS_NONASCII(b)) { + // 1-byte ASCII char + i += 1; + } else if ((b & 0xe0) == 0xc0) { + // 2-byte char + i += 2; + } else if ((b & 0xf0) == 0xe0) { + // 3-byte char + i += 3; + } else if ((b & 0xf8) == 0xf0) { + // 4-byte char + i += 4; + } else { + // TODO + i += 5; + } + } + i_residue = i - out_sz; + } else { + done += out_sz; + } + #else + done += out_sz; + #endif // MICROPY_PY_BUILTINS_STR_UNICODE buf += out_sz; size -= out_sz; - done += out_sz; } + return done; } diff --git a/tests/unicode/unicode_write_count.py b/tests/unicode/unicode_write_count.py new file mode 100644 index 0000000000..5933fe9e41 --- /dev/null +++ b/tests/unicode/unicode_write_count.py @@ -0,0 +1,7 @@ +import sys + +n_text = sys.stdout.write("🚀\n") +sys.stdout.write("{}\n".format(n_text)) + +n_text = sys.stdout.write("1🚀2a3α4b5β6c7γ8d9δ0ぁ1🙐\n") +sys.stdout.write("{}\n".format(n_text))