From 1fc5e17987ab347f6dc3eed6dc419f290f0808ec Mon Sep 17 00:00:00 2001 From: Andrew Leech Date: Sun, 9 Jan 2022 07:43:26 +1100 Subject: [PATCH 1/3] lib/re1.5: Allow null characters in regex patterns. Signed-off-by: Andrew Leech --- lib/re1.5/compilecode.c | 45 +++++++++++++++++++++++++---------------- lib/re1.5/re1.5.h | 4 ++-- 2 files changed, 30 insertions(+), 19 deletions(-) diff --git a/lib/re1.5/compilecode.c b/lib/re1.5/compilecode.c index 513a155970..c393718f0e 100644 --- a/lib/re1.5/compilecode.c +++ b/lib/re1.5/compilecode.c @@ -21,19 +21,20 @@ static void _emit_checked(int at, char *code, int val, bool *err) { } } -static const char *_compilecode(const char *re, ByteProg *prog, int sizecode) +static const char *_compilecode(const char *re, size_t len, ByteProg *prog, int sizecode) { char *code = sizecode ? NULL : prog->insts; bool err = false; int start = PC; int term = PC; int alt_label = 0; - - for (; *re && *re != ')'; re++) { + const char *re_top = re + len; + + while (re < re_top && *re != ')') { switch (*re) { case '\\': re++; - if (!*re) return NULL; // Trailing backslash + if (re >= re_top) return NULL; // Trailing backslash if (MATCH_NAMED_CLASS_CHAR(*re)) { term = PC; EMIT(PC++, NamedClass); @@ -57,18 +58,22 @@ static const char *_compilecode(const char *re, ByteProg *prog, int sizecode) int cnt; term = PC; re++; + if (re >= re_top) return NULL; // Trailing bracket if (*re == '^') { EMIT(PC++, ClassNot); re++; + if (re >= re_top) return NULL; // Trailing ^ } else { EMIT(PC++, Class); } PC++; // Skip # of pair byte prog->len++; for (cnt = 0; *re != ']'; re++, cnt++) { + if (re >= re_top) return NULL; // Missing closing bracket char c = *re; if (c == '\\') { ++re; + if (re >= re_top) return NULL; // Trailing backslash c = *re; if (MATCH_NAMED_CLASS_CHAR(c)) { c = RE15_CLASS_NAMED_CLASS_INDICATOR; @@ -76,7 +81,7 @@ static const char *_compilecode(const char *re, ByteProg *prog, int sizecode) } } if (!c) return NULL; - if (re[1] == '-' && re[2] != ']') { + if (re_top - re > 2 && re[1] == '-' && re[2] != ']') { re += 2; } emit_char_pair: @@ -89,7 +94,7 @@ static const char *_compilecode(const char *re, ByteProg *prog, int sizecode) case '(': { term = PC; int sub = 0; - int capture = re[1] != '?' || re[2] != ':'; + int capture = re_top - re > 2 && (re[1] != '?' || re[2] != ':'); if (capture) { sub = ++prog->sub; @@ -97,11 +102,13 @@ static const char *_compilecode(const char *re, ByteProg *prog, int sizecode) EMIT_CHECKED(PC++, 2 * sub); prog->len++; } else { - re += 2; + re += 2; } - re = _compilecode(re + 1, prog, sizecode); - if (re == NULL || *re != ')') return NULL; // error, or no matching paren + re++; + if (re >= re_top) return NULL; // Trailing bracket + re = _compilecode(re, re_top - re, prog, sizecode); + if (re == NULL || re >= re_top || *re != ')') return NULL; // error, or no matching paren if (capture) { EMIT(PC++, Save); @@ -114,7 +121,7 @@ static const char *_compilecode(const char *re, ByteProg *prog, int sizecode) case '?': if (PC == term) return NULL; // nothing to repeat INSERT_CODE(term, 2, PC); - if (re[1] == '?') { + if (re_top - re > 1 && re[1] == '?') { EMIT(term, RSplit); re++; } else { @@ -130,7 +137,7 @@ static const char *_compilecode(const char *re, ByteProg *prog, int sizecode) EMIT(PC, Jmp); EMIT_CHECKED(PC + 1, REL(PC, term)); PC += 2; - if (re[1] == '?') { + if (re_top - re > 1 && re[1] == '?') { EMIT(term, RSplit); re++; } else { @@ -142,7 +149,7 @@ static const char *_compilecode(const char *re, ByteProg *prog, int sizecode) break; case '+': if (PC == term) return NULL; // nothing to repeat - if (re[1] == '?') { + if (re_top - re > 1 && re[1] == '?') { EMIT(PC, Split); re++; } else { @@ -176,27 +183,31 @@ static const char *_compilecode(const char *re, ByteProg *prog, int sizecode) term = PC; break; } + re++; } if (alt_label) { EMIT_CHECKED(alt_label, REL(alt_label, PC) + 1); } - return err ? NULL : re; + if (err) { + return NULL; + } + return re; } -int re1_5_sizecode(const char *re) +int re1_5_sizecode(const char *re, size_t len) { ByteProg dummyprog = { // Save 0, Save 1, Match; more bytes for "search" (vs "match") prefix code .bytelen = 5 + NON_ANCHORED_PREFIX }; - if (_compilecode(re, &dummyprog, /*sizecode*/1) == NULL) return -1; + if (_compilecode(re, len, &dummyprog, /*sizecode*/1) == NULL) return -1; return dummyprog.bytelen; } -int re1_5_compilecode(ByteProg *prog, const char *re) +int re1_5_compilecode(ByteProg *prog, const char *re, size_t len) { prog->len = 0; prog->bytelen = 0; @@ -216,7 +227,7 @@ int re1_5_compilecode(ByteProg *prog, const char *re) prog->insts[prog->bytelen++] = 0; prog->len++; - re = _compilecode(re, prog, /*sizecode*/0); + re = _compilecode(re, len, prog, /*sizecode*/0); if (re == NULL || *re) return 1; prog->insts[prog->bytelen++] = Save; diff --git a/lib/re1.5/re1.5.h b/lib/re1.5/re1.5.h index b1ec01cbc5..cc0c52e42f 100644 --- a/lib/re1.5/re1.5.h +++ b/lib/re1.5/re1.5.h @@ -146,8 +146,8 @@ int re1_5_recursiveloopprog(ByteProg*, Subject*, const char**, int, int); int re1_5_recursiveprog(ByteProg*, Subject*, const char**, int, int); int re1_5_thompsonvm(ByteProg*, Subject*, const char**, int, int); -int re1_5_sizecode(const char *re); -int re1_5_compilecode(ByteProg *prog, const char *re); +int re1_5_sizecode(const char *re, size_t len); +int re1_5_compilecode(ByteProg *prog, const char *re, size_t len); void re1_5_dumpcode(ByteProg *prog); void cleanmarks(ByteProg *prog); int _re1_5_classmatch(const char *pc, const char *sp); From 5deef8005b84367d583ec66eacf74eaadd848441 Mon Sep 17 00:00:00 2001 From: Andrew Leech Date: Tue, 11 Jan 2022 12:19:33 +1100 Subject: [PATCH 2/3] extmod/re: Use buffer protocol for data to search through. Signed-off-by: Andrew Leech --- extmod/modre.c | 54 +++++++++++++++++++++++++++------------- tests/extmod/re1.py | 17 +++++++++++++ tests/extmod/re_split.py | 5 ++++ tests/extmod/re_sub.py | 10 +++++++- 4 files changed, 68 insertions(+), 18 deletions(-) diff --git a/extmod/modre.c b/extmod/modre.c index 7f00b1c23c..8697e1c6ef 100644 --- a/extmod/modre.c +++ b/extmod/modre.c @@ -80,7 +80,13 @@ STATIC mp_obj_t match_group(mp_obj_t self_in, mp_obj_t no_in) { // no match for this group return mp_const_none; } - return mp_obj_new_str_of_type(mp_obj_get_type(self->str), + const mp_obj_type_t *str_type = mp_obj_get_type(self->str); + if (str_type != &mp_type_str) { + // bytes, bytearray etc. args should return bytes + str_type = &mp_type_bytes; + } + + return mp_obj_new_str_of_type(str_type, (const byte *)start, self->caps[no * 2 + 1] - start); } MP_DEFINE_CONST_FUN_OBJ_2(match_group_obj, match_group); @@ -120,7 +126,9 @@ STATIC void match_span_helper(size_t n_args, const mp_obj_t *args, mp_obj_t span const char *start = self->caps[no * 2]; if (start != NULL) { // have a match for this group - const char *begin = mp_obj_str_get_str(self->str); + mp_buffer_info_t bufinfo; + mp_get_buffer_raise(self->str, &bufinfo, MP_BUFFER_READ); + const char *begin = bufinfo.buf; s = start - begin; e = self->caps[no * 2 + 1] - begin; } @@ -203,9 +211,10 @@ STATIC mp_obj_t re_exec(bool is_anchored, uint n_args, const mp_obj_t *args) { self = MP_OBJ_TO_PTR(mod_re_compile(1, args)); } Subject subj; - size_t len; - subj.begin_line = subj.begin = mp_obj_str_get_data(args[1], &len); - subj.end = subj.begin + len; + mp_buffer_info_t bufinfo; + mp_get_buffer_raise(args[1], &bufinfo, MP_BUFFER_READ); + subj.begin_line = subj.begin = bufinfo.buf; + subj.end = subj.begin + bufinfo.len; int caps_num = (self->re.sub + 1) * 2; mp_obj_match_t *match = m_new_obj_var(mp_obj_match_t, char *, caps_num); // cast is a workaround for a bug in msvc: it treats const char** as a const pointer instead of a pointer to pointer to const char @@ -235,10 +244,15 @@ MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(re_search_obj, 2, 4, re_search); STATIC mp_obj_t re_split(size_t n_args, const mp_obj_t *args) { mp_obj_re_t *self = MP_OBJ_TO_PTR(args[0]); Subject subj; - size_t len; + mp_buffer_info_t bufinfo; const mp_obj_type_t *str_type = mp_obj_get_type(args[1]); - subj.begin_line = subj.begin = mp_obj_str_get_data(args[1], &len); - subj.end = subj.begin + len; + if (str_type != &mp_type_str) { + // bytes, bytearray etc. args should return bytes + str_type = &mp_type_bytes; + } + mp_get_buffer_raise(args[1], &bufinfo, MP_BUFFER_READ); + subj.begin_line = subj.begin = bufinfo.buf; + subj.end = subj.begin + bufinfo.len; int caps_num = (self->re.sub + 1) * 2; int maxsplit = 0; @@ -294,11 +308,11 @@ STATIC mp_obj_t re_sub_helper(size_t n_args, const mp_obj_t *args) { // Note: flags are currently ignored } - size_t where_len; - const char *where_str = mp_obj_str_get_data(where, &where_len); Subject subj; - subj.begin_line = subj.begin = where_str; - subj.end = subj.begin + where_len; + mp_buffer_info_t bufinfo; + mp_get_buffer_raise(where, &bufinfo, MP_BUFFER_READ); + subj.begin_line = subj.begin = bufinfo.buf; + subj.end = subj.begin + bufinfo.len; int caps_num = (self->re.sub + 1) * 2; vstr_t vstr_return; @@ -327,10 +341,13 @@ STATIC mp_obj_t re_sub_helper(size_t n_args, const mp_obj_t *args) { vstr_add_strn(&vstr_return, subj.begin, match->caps[0] - subj.begin); // Get replacement string - const char *repl = mp_obj_str_get_str((mp_obj_is_callable(replace) ? mp_call_function_1(replace, MP_OBJ_FROM_PTR(match)) : replace)); + mp_obj_t repl_obj = (mp_obj_is_callable(replace) ? mp_call_function_1(replace, MP_OBJ_FROM_PTR(match)) : replace); + mp_get_buffer_raise(repl_obj, &bufinfo, MP_BUFFER_READ); + const char *repl = bufinfo.buf; + const char *repl_top = repl + bufinfo.len; // Append replacement string to result, substituting any regex groups - while (*repl != '\0') { + while (repl < repl_top) { if (*repl == '\\') { ++repl; bool is_g_format = false; @@ -423,8 +440,11 @@ STATIC MP_DEFINE_CONST_OBJ_TYPE( STATIC mp_obj_t mod_re_compile(size_t n_args, const mp_obj_t *args) { (void)n_args; - const char *re_str = mp_obj_str_get_str(args[0]); - int size = re1_5_sizecode(re_str); + + mp_buffer_info_t bufinfo; + mp_get_buffer_raise(args[0], &bufinfo, MP_BUFFER_READ); + const char *re_str = bufinfo.buf; + int size = re1_5_sizecode(re_str, bufinfo.len); if (size == -1) { goto error; } @@ -435,7 +455,7 @@ STATIC mp_obj_t mod_re_compile(size_t n_args, const mp_obj_t *args) { flags = mp_obj_get_int(args[1]); } #endif - int error = re1_5_compilecode(&o->re, re_str); + int error = re1_5_compilecode(&o->re, re_str, bufinfo.len); if (error != 0) { error: mp_raise_ValueError(MP_ERROR_TEXT("error in regex")); diff --git a/tests/extmod/re1.py b/tests/extmod/re1.py index 7e3839ae24..4fd5820a15 100644 --- a/tests/extmod/re1.py +++ b/tests/extmod/re1.py @@ -93,6 +93,23 @@ m = re.match(rb"a+?", b"ab") print(m.group(0)) print("===") +# bytearray / memoryview objects +m = re.match(rb"a.", bytearray(b"ab")) +print(m.group(0)) +m = re.match(rb"a.", memoryview(b"ab")) +print(m.group(0)) +# While micropython supports bytearray pattern, cpython does not. +# m = re.match(bytearray(b"a."), b"ab") +# print(m.group(0)) +print("===") + +# null chars +m = re.match("ab.d", "ab\x00d") +print(list(m.group(0))) +m = re.match("ab\x00d", "ab\x00d") +print(list(m.group(0))) +print("===") + # escaping m = re.match(r"a\.c", "a.c") print(m.group(0) if m else "") diff --git a/tests/extmod/re_split.py b/tests/extmod/re_split.py index 7769e1a121..486b1c3881 100644 --- a/tests/extmod/re_split.py +++ b/tests/extmod/re_split.py @@ -38,3 +38,8 @@ print(s) r = re.compile("^ab|cab") s = r.split("abababcabab") print(s) + +# bytearray objects +r = re.compile(b"x") +s = r.split(bytearray(b"fooxbar")) +print(s) diff --git a/tests/extmod/re_sub.py b/tests/extmod/re_sub.py index 229c0e63ee..779d32374f 100644 --- a/tests/extmod/re_sub.py +++ b/tests/extmod/re_sub.py @@ -26,6 +26,13 @@ def A(): print(re.sub("a", A(), "aBCBABCDabcda.")) + +def B(): + return bytearray(b"B") + + +print(re.sub(b"a", B(), b"aBCBABCDabcda.")) + print( re.sub( r"def\s+([a-zA-Z_][a-zA-Z_0-9]*)\s*\(\s*\):", @@ -61,10 +68,11 @@ try: except: print("invalid group") -# Module function takes str/bytes/re. +# Module function takes str/bytes/re/bytearray. print(re.sub("a", "a", "a")) print(re.sub(b".", b"a", b"a")) print(re.sub(re.compile("a"), "a", "a")) +print(re.sub(b"a", bytearray(b"b"), bytearray(b"a"))) try: re.sub(123, "a", "a") except TypeError: From 8e354bf903066a0698e029d0f5abbe5987274b62 Mon Sep 17 00:00:00 2001 From: Andrew Leech Date: Wed, 7 Jun 2023 10:07:14 +1000 Subject: [PATCH 3/3] lib/re1.5: Minor improvement in code size. Signed-off-by: Andrew Leech --- lib/re1.5/compilecode.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/lib/re1.5/compilecode.c b/lib/re1.5/compilecode.c index c393718f0e..c6858b49dd 100644 --- a/lib/re1.5/compilecode.c +++ b/lib/re1.5/compilecode.c @@ -10,7 +10,7 @@ #define INSERT_CODE(at, num, pc) \ ((code ? memmove(code + at + num, code + at, pc - at) : 0), pc += num) #define REL(at, to) (to - at - 2) -#define EMIT(at, byte) (code ? (code[at] = byte) : (at)) +#define EMIT(at, byte) {int _at = at; code ? (code[_at] = byte) : (0);} #define EMIT_CHECKED(at, byte) (_emit_checked(at, code, byte, &err)) #define PC (prog->bytelen) @@ -29,8 +29,9 @@ static const char *_compilecode(const char *re, size_t len, ByteProg *prog, int int term = PC; int alt_label = 0; const char *re_top = re + len; - - while (re < re_top && *re != ')') { + int remain; + + while ((remain = re_top - re) && *re != ')') { switch (*re) { case '\\': re++; @@ -80,8 +81,7 @@ static const char *_compilecode(const char *re, size_t len, ByteProg *prog, int goto emit_char_pair; } } - if (!c) return NULL; - if (re_top - re > 2 && re[1] == '-' && re[2] != ']') { + if (remain > 2 && re[1] == '-' && re[2] != ']') { re += 2; } emit_char_pair: @@ -94,7 +94,7 @@ static const char *_compilecode(const char *re, size_t len, ByteProg *prog, int case '(': { term = PC; int sub = 0; - int capture = re_top - re > 2 && (re[1] != '?' || re[2] != ':'); + int capture = remain > 2 && (re[1] != '?' || re[2] != ':'); if (capture) { sub = ++prog->sub; @@ -107,8 +107,8 @@ static const char *_compilecode(const char *re, size_t len, ByteProg *prog, int re++; if (re >= re_top) return NULL; // Trailing bracket - re = _compilecode(re, re_top - re, prog, sizecode); - if (re == NULL || re >= re_top || *re != ')') return NULL; // error, or no matching paren + re = _compilecode(re, remain, prog, sizecode); + if (re == NULL || *re != ')') return NULL; // error, or no matching paren if (capture) { EMIT(PC++, Save); @@ -121,7 +121,7 @@ static const char *_compilecode(const char *re, size_t len, ByteProg *prog, int case '?': if (PC == term) return NULL; // nothing to repeat INSERT_CODE(term, 2, PC); - if (re_top - re > 1 && re[1] == '?') { + if (remain > 1 && re[1] == '?') { EMIT(term, RSplit); re++; } else { @@ -137,7 +137,7 @@ static const char *_compilecode(const char *re, size_t len, ByteProg *prog, int EMIT(PC, Jmp); EMIT_CHECKED(PC + 1, REL(PC, term)); PC += 2; - if (re_top - re > 1 && re[1] == '?') { + if (remain > 1 && re[1] == '?') { EMIT(term, RSplit); re++; } else { @@ -149,7 +149,7 @@ static const char *_compilecode(const char *re, size_t len, ByteProg *prog, int break; case '+': if (PC == term) return NULL; // nothing to repeat - if (re_top - re > 1 && re[1] == '?') { + if (remain > 1 && re[1] == '?') { EMIT(PC, Split); re++; } else {