From e90b85cc98a24003f2d673bab2c255ab3dce66e7 Mon Sep 17 00:00:00 2001 From: Jeff Epler Date: Mon, 5 Sep 2022 07:58:04 -0500 Subject: [PATCH] extmod/modure: Convert byte offsets to unicode indices when necessary. And add a test. Fixes issue #9202. Signed-off-by: Jeff Epler --- extmod/modure.c | 16 ++++++++++++++++ tests/unicode/unicode_ure.py | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+) create mode 100644 tests/unicode/unicode_ure.py diff --git a/extmod/modure.c b/extmod/modure.c index 799fef13b1..a674d66499 100644 --- a/extmod/modure.c +++ b/extmod/modure.c @@ -33,6 +33,10 @@ #include "py/objstr.h" #include "py/stackctrl.h" +#if MICROPY_PY_BUILTINS_STR_UNICODE +#include "py/unicode.h" +#endif + #if MICROPY_PY_URE #define re1_5_stack_chk() MP_STACK_CHECK() @@ -121,6 +125,18 @@ STATIC void match_span_helper(size_t n_args, const mp_obj_t *args, mp_obj_t span e = self->caps[no * 2 + 1] - begin; } + #if MICROPY_PY_BUILTINS_STR_UNICODE + if (mp_obj_get_type(self->str) == &mp_type_str) { + const byte *begin = (const byte *)mp_obj_str_get_str(self->str); + if (s != -1) { + s = utf8_ptr_to_index(begin, begin + s); + } + if (e != -1) { + e = utf8_ptr_to_index(begin, begin + e); + } + } + #endif + span[0] = mp_obj_new_int(s); span[1] = mp_obj_new_int(e); } diff --git a/tests/unicode/unicode_ure.py b/tests/unicode/unicode_ure.py new file mode 100644 index 0000000000..5a5dc60054 --- /dev/null +++ b/tests/unicode/unicode_ure.py @@ -0,0 +1,32 @@ +# test match.span() for unicode strings + +try: + import ure as re +except ImportError: + try: + import re + except ImportError: + print("SKIP") + raise SystemExit + +try: + m = re.match(".", "a") + m.span +except AttributeError: + print("SKIP") + raise SystemExit + + +def print_spans(match): + print("----") + try: + i = 0 + while True: + print(match.span(i), match.start(i), match.end(i)) + i += 1 + except IndexError: + pass + + +m = re.match(r"([0-9]*)(([a-z]*)([0-9]*))", "1234\u2764567") +print_spans(m)