From 1fc5e17987ab347f6dc3eed6dc419f290f0808ec Mon Sep 17 00:00:00 2001
From: Andrew Leech <andrew@alelec.net>
Date: Sun, 9 Jan 2022 07:43:26 +1100
Subject: [PATCH 1/3] lib/re1.5: Allow null characters in regex patterns.

Signed-off-by: Andrew Leech <andrew@alelec.net>
---
 lib/re1.5/compilecode.c | 45 +++++++++++++++++++++++++----------------
 lib/re1.5/re1.5.h       |  4 ++--
 2 files changed, 30 insertions(+), 19 deletions(-)

diff --git a/lib/re1.5/compilecode.c b/lib/re1.5/compilecode.c
index 513a155970..c393718f0e 100644
--- a/lib/re1.5/compilecode.c
+++ b/lib/re1.5/compilecode.c
@@ -21,19 +21,20 @@ static void _emit_checked(int at, char *code, int val, bool *err) {
     }
 }
 
-static const char *_compilecode(const char *re, ByteProg *prog, int sizecode)
+static const char *_compilecode(const char *re, size_t len, ByteProg *prog, int sizecode)
 {
     char *code = sizecode ? NULL : prog->insts;
     bool err = false;
     int start = PC;
     int term = PC;
     int alt_label = 0;
-
-    for (; *re && *re != ')'; re++) {
+    const char *re_top = re + len;
+    
+    while (re < re_top && *re != ')') {
         switch (*re) {
         case '\\':
             re++;
-            if (!*re) return NULL; // Trailing backslash
+            if (re >= re_top) return NULL; // Trailing backslash
             if (MATCH_NAMED_CLASS_CHAR(*re)) {
                 term = PC;
                 EMIT(PC++, NamedClass);
@@ -57,18 +58,22 @@ static const char *_compilecode(const char *re, ByteProg *prog, int sizecode)
             int cnt;
             term = PC;
             re++;
+            if (re >= re_top) return NULL; // Trailing bracket
             if (*re == '^') {
                 EMIT(PC++, ClassNot);
                 re++;
+                if (re >= re_top) return NULL; // Trailing ^
             } else {
                 EMIT(PC++, Class);
             }
             PC++; // Skip # of pair byte
             prog->len++;
             for (cnt = 0; *re != ']'; re++, cnt++) {
+                if (re >= re_top) return NULL; // Missing closing bracket
                 char c = *re;
                 if (c == '\\') {
                     ++re;
+                    if (re >= re_top) return NULL; // Trailing backslash
                     c = *re;
                     if (MATCH_NAMED_CLASS_CHAR(c)) {
                         c = RE15_CLASS_NAMED_CLASS_INDICATOR;
@@ -76,7 +81,7 @@ static const char *_compilecode(const char *re, ByteProg *prog, int sizecode)
                     }
                 }
                 if (!c) return NULL;
-                if (re[1] == '-' && re[2] != ']') {
+                if (re_top - re > 2 && re[1] == '-' && re[2] != ']') {
                     re += 2;
                 }
             emit_char_pair:
@@ -89,7 +94,7 @@ static const char *_compilecode(const char *re, ByteProg *prog, int sizecode)
         case '(': {
             term = PC;
             int sub = 0;
-            int capture = re[1] != '?' || re[2] != ':';
+            int capture = re_top - re > 2 && (re[1] != '?' || re[2] != ':');
 
             if (capture) {
                 sub = ++prog->sub;
@@ -97,11 +102,13 @@ static const char *_compilecode(const char *re, ByteProg *prog, int sizecode)
                 EMIT_CHECKED(PC++, 2 * sub);
                 prog->len++;
             } else {
-                    re += 2;
+                re += 2;
             }
 
-            re = _compilecode(re + 1, prog, sizecode);
-            if (re == NULL || *re != ')') return NULL; // error, or no matching paren
+            re++;
+            if (re >= re_top) return NULL; // Trailing bracket
+            re = _compilecode(re, re_top - re, prog, sizecode);
+            if (re == NULL || re >= re_top || *re != ')') return NULL; // error, or no matching paren
 
             if (capture) {
                 EMIT(PC++, Save);
@@ -114,7 +121,7 @@ static const char *_compilecode(const char *re, ByteProg *prog, int sizecode)
         case '?':
             if (PC == term) return NULL; // nothing to repeat
             INSERT_CODE(term, 2, PC);
-            if (re[1] == '?') {
+            if (re_top - re > 1 && re[1] == '?') {
                 EMIT(term, RSplit);
                 re++;
             } else {
@@ -130,7 +137,7 @@ static const char *_compilecode(const char *re, ByteProg *prog, int sizecode)
             EMIT(PC, Jmp);
             EMIT_CHECKED(PC + 1, REL(PC, term));
             PC += 2;
-            if (re[1] == '?') {
+            if (re_top - re > 1 && re[1] == '?') {
                 EMIT(term, RSplit);
                 re++;
             } else {
@@ -142,7 +149,7 @@ static const char *_compilecode(const char *re, ByteProg *prog, int sizecode)
             break;
         case '+':
             if (PC == term) return NULL; // nothing to repeat
-            if (re[1] == '?') {
+            if (re_top - re > 1 && re[1] == '?') {
                 EMIT(PC, Split);
                 re++;
             } else {
@@ -176,27 +183,31 @@ static const char *_compilecode(const char *re, ByteProg *prog, int sizecode)
             term = PC;
             break;
         }
+        re++;
     }
 
     if (alt_label) {
         EMIT_CHECKED(alt_label, REL(alt_label, PC) + 1);
     }
-    return err ? NULL : re;
+    if (err) {
+        return NULL;
+    }
+    return re;
 }
 
-int re1_5_sizecode(const char *re)
+int re1_5_sizecode(const char *re, size_t len)
 {
     ByteProg dummyprog = {
          // Save 0, Save 1, Match; more bytes for "search" (vs "match") prefix code
         .bytelen = 5 + NON_ANCHORED_PREFIX
     };
 
-    if (_compilecode(re, &dummyprog, /*sizecode*/1) == NULL) return -1;
+    if (_compilecode(re, len, &dummyprog, /*sizecode*/1) == NULL) return -1;
 
     return dummyprog.bytelen;
 }
 
-int re1_5_compilecode(ByteProg *prog, const char *re)
+int re1_5_compilecode(ByteProg *prog, const char *re, size_t len)
 {
     prog->len = 0;
     prog->bytelen = 0;
@@ -216,7 +227,7 @@ int re1_5_compilecode(ByteProg *prog, const char *re)
     prog->insts[prog->bytelen++] = 0;
     prog->len++;
 
-    re = _compilecode(re, prog, /*sizecode*/0);
-    if (re == NULL || *re) return 1;
+    const char *re_end = _compilecode(re, len, prog, /*sizecode*/0); // pattern is length-delimited, not NUL-terminated
+    if (re_end == NULL || re_end != re + len) return 1; // error, or stopped early at an unmatched ')'
 
     prog->insts[prog->bytelen++] = Save;
diff --git a/lib/re1.5/re1.5.h b/lib/re1.5/re1.5.h
index b1ec01cbc5..cc0c52e42f 100644
--- a/lib/re1.5/re1.5.h
+++ b/lib/re1.5/re1.5.h
@@ -146,8 +146,8 @@ int re1_5_recursiveloopprog(ByteProg*, Subject*, const char**, int, int);
 int re1_5_recursiveprog(ByteProg*, Subject*, const char**, int, int);
 int re1_5_thompsonvm(ByteProg*, Subject*, const char**, int, int);
 
-int re1_5_sizecode(const char *re);
-int re1_5_compilecode(ByteProg *prog, const char *re);
+int re1_5_sizecode(const char *re, size_t len);
+int re1_5_compilecode(ByteProg *prog, const char *re, size_t len);
 void re1_5_dumpcode(ByteProg *prog);
 void cleanmarks(ByteProg *prog);
 int _re1_5_classmatch(const char *pc, const char *sp);

From 5deef8005b84367d583ec66eacf74eaadd848441 Mon Sep 17 00:00:00 2001
From: Andrew Leech <andrew@alelec.net>
Date: Tue, 11 Jan 2022 12:19:33 +1100
Subject: [PATCH 2/3] extmod/re: Use buffer protocol for data to search
 through.

Signed-off-by: Andrew Leech <andrew@alelec.net>
---
 extmod/modre.c           | 54 +++++++++++++++++++++++++++-------------
 tests/extmod/re1.py      | 17 +++++++++++++
 tests/extmod/re_split.py |  5 ++++
 tests/extmod/re_sub.py   | 10 +++++++-
 4 files changed, 68 insertions(+), 18 deletions(-)

diff --git a/extmod/modre.c b/extmod/modre.c
index 7f00b1c23c..8697e1c6ef 100644
--- a/extmod/modre.c
+++ b/extmod/modre.c
@@ -80,7 +80,13 @@ STATIC mp_obj_t match_group(mp_obj_t self_in, mp_obj_t no_in) {
         // no match for this group
         return mp_const_none;
     }
-    return mp_obj_new_str_of_type(mp_obj_get_type(self->str),
+    const mp_obj_type_t *str_type = mp_obj_get_type(self->str);
+    if (str_type != &mp_type_str) {
+        // bytes, bytearray etc. args should return bytes
+        str_type = &mp_type_bytes;
+    }
+
+    return mp_obj_new_str_of_type(str_type,
         (const byte *)start, self->caps[no * 2 + 1] - start);
 }
 MP_DEFINE_CONST_FUN_OBJ_2(match_group_obj, match_group);
@@ -120,7 +126,9 @@ STATIC void match_span_helper(size_t n_args, const mp_obj_t *args, mp_obj_t span
     const char *start = self->caps[no * 2];
     if (start != NULL) {
         // have a match for this group
-        const char *begin = mp_obj_str_get_str(self->str);
+        mp_buffer_info_t bufinfo;
+        mp_get_buffer_raise(self->str, &bufinfo, MP_BUFFER_READ);
+        const char *begin = bufinfo.buf;
         s = start - begin;
         e = self->caps[no * 2 + 1] - begin;
     }
@@ -203,9 +211,10 @@ STATIC mp_obj_t re_exec(bool is_anchored, uint n_args, const mp_obj_t *args) {
         self = MP_OBJ_TO_PTR(mod_re_compile(1, args));
     }
     Subject subj;
-    size_t len;
-    subj.begin_line = subj.begin = mp_obj_str_get_data(args[1], &len);
-    subj.end = subj.begin + len;
+    mp_buffer_info_t bufinfo;
+    mp_get_buffer_raise(args[1], &bufinfo, MP_BUFFER_READ);
+    subj.begin_line = subj.begin = bufinfo.buf;
+    subj.end = subj.begin + bufinfo.len;
     int caps_num = (self->re.sub + 1) * 2;
     mp_obj_match_t *match = m_new_obj_var(mp_obj_match_t, char *, caps_num);
     // cast is a workaround for a bug in msvc: it treats const char** as a const pointer instead of a pointer to pointer to const char
@@ -235,10 +244,15 @@ MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(re_search_obj, 2, 4, re_search);
 STATIC mp_obj_t re_split(size_t n_args, const mp_obj_t *args) {
     mp_obj_re_t *self = MP_OBJ_TO_PTR(args[0]);
     Subject subj;
-    size_t len;
+    mp_buffer_info_t bufinfo;
     const mp_obj_type_t *str_type = mp_obj_get_type(args[1]);
-    subj.begin_line = subj.begin = mp_obj_str_get_data(args[1], &len);
-    subj.end = subj.begin + len;
+    if (str_type != &mp_type_str) {
+        // bytes, bytearray etc. args should return bytes
+        str_type = &mp_type_bytes;
+    }
+    mp_get_buffer_raise(args[1], &bufinfo, MP_BUFFER_READ);
+    subj.begin_line = subj.begin = bufinfo.buf;
+    subj.end = subj.begin + bufinfo.len;
     int caps_num = (self->re.sub + 1) * 2;
 
     int maxsplit = 0;
@@ -294,11 +308,11 @@ STATIC mp_obj_t re_sub_helper(size_t n_args, const mp_obj_t *args) {
         // Note: flags are currently ignored
     }
 
-    size_t where_len;
-    const char *where_str = mp_obj_str_get_data(where, &where_len);
     Subject subj;
-    subj.begin_line = subj.begin = where_str;
-    subj.end = subj.begin + where_len;
+    mp_buffer_info_t bufinfo;
+    mp_get_buffer_raise(where, &bufinfo, MP_BUFFER_READ);
+    subj.begin_line = subj.begin = bufinfo.buf;
+    subj.end = subj.begin + bufinfo.len;
     int caps_num = (self->re.sub + 1) * 2;
 
     vstr_t vstr_return;
@@ -327,10 +341,13 @@ STATIC mp_obj_t re_sub_helper(size_t n_args, const mp_obj_t *args) {
         vstr_add_strn(&vstr_return, subj.begin, match->caps[0] - subj.begin);
 
         // Get replacement string
-        const char *repl = mp_obj_str_get_str((mp_obj_is_callable(replace) ? mp_call_function_1(replace, MP_OBJ_FROM_PTR(match)) : replace));
+        mp_obj_t repl_obj = (mp_obj_is_callable(replace) ? mp_call_function_1(replace, MP_OBJ_FROM_PTR(match)) : replace);
+        mp_get_buffer_raise(repl_obj, &bufinfo, MP_BUFFER_READ);
+        const char *repl = bufinfo.buf;
+        const char *repl_top = repl + bufinfo.len;
 
         // Append replacement string to result, substituting any regex groups
-        while (*repl != '\0') {
+        while (repl < repl_top) {
             if (*repl == '\\') {
                 ++repl;
                 bool is_g_format = false;
@@ -423,8 +440,11 @@ STATIC MP_DEFINE_CONST_OBJ_TYPE(
 
 STATIC mp_obj_t mod_re_compile(size_t n_args, const mp_obj_t *args) {
     (void)n_args;
-    const char *re_str = mp_obj_str_get_str(args[0]);
-    int size = re1_5_sizecode(re_str);
+
+    mp_buffer_info_t bufinfo;
+    mp_get_buffer_raise(args[0], &bufinfo, MP_BUFFER_READ);
+    const char *re_str = bufinfo.buf;
+    int size = re1_5_sizecode(re_str, bufinfo.len);
     if (size == -1) {
         goto error;
     }
@@ -435,7 +455,7 @@ STATIC mp_obj_t mod_re_compile(size_t n_args, const mp_obj_t *args) {
         flags = mp_obj_get_int(args[1]);
     }
     #endif
-    int error = re1_5_compilecode(&o->re, re_str);
+    int error = re1_5_compilecode(&o->re, re_str, bufinfo.len);
     if (error != 0) {
     error:
         mp_raise_ValueError(MP_ERROR_TEXT("error in regex"));
diff --git a/tests/extmod/re1.py b/tests/extmod/re1.py
index 7e3839ae24..4fd5820a15 100644
--- a/tests/extmod/re1.py
+++ b/tests/extmod/re1.py
@@ -93,6 +93,23 @@ m = re.match(rb"a+?", b"ab")
 print(m.group(0))
 print("===")
 
+# bytearray / memoryview objects
+m = re.match(rb"a.", bytearray(b"ab"))
+print(m.group(0))
+m = re.match(rb"a.", memoryview(b"ab"))
+print(m.group(0))
+# MicroPython accepts a bytearray pattern, but CPython does not.
+# m = re.match(bytearray(b"a."), b"ab")
+# print(m.group(0))
+print("===")
+
+# null chars
+m = re.match("ab.d", "ab\x00d")
+print(list(m.group(0)))
+m = re.match("ab\x00d", "ab\x00d")
+print(list(m.group(0)))
+print("===")
+
 # escaping
 m = re.match(r"a\.c", "a.c")
 print(m.group(0) if m else "")
diff --git a/tests/extmod/re_split.py b/tests/extmod/re_split.py
index 7769e1a121..486b1c3881 100644
--- a/tests/extmod/re_split.py
+++ b/tests/extmod/re_split.py
@@ -38,3 +38,8 @@ print(s)
 r = re.compile("^ab|cab")
 s = r.split("abababcabab")
 print(s)
+
+# bytearray objects
+r = re.compile(b"x")
+s = r.split(bytearray(b"fooxbar"))
+print(s)
diff --git a/tests/extmod/re_sub.py b/tests/extmod/re_sub.py
index 229c0e63ee..779d32374f 100644
--- a/tests/extmod/re_sub.py
+++ b/tests/extmod/re_sub.py
@@ -26,6 +26,13 @@ def A():
 
 print(re.sub("a", A(), "aBCBABCDabcda."))
 
+
+def B():
+    return bytearray(b"B")
+
+
+print(re.sub(b"a", B(), b"aBCBABCDabcda."))
+
 print(
     re.sub(
         r"def\s+([a-zA-Z_][a-zA-Z_0-9]*)\s*\(\s*\):",
@@ -61,10 +68,11 @@ try:
 except:
     print("invalid group")
 
-# Module function takes str/bytes/re.
+# Module function takes str/bytes/re/bytearray.
 print(re.sub("a", "a", "a"))
 print(re.sub(b".", b"a", b"a"))
 print(re.sub(re.compile("a"), "a", "a"))
+print(re.sub(b"a", bytearray(b"b"), bytearray(b"a")))
 try:
     re.sub(123, "a", "a")
 except TypeError:

From 8e354bf903066a0698e029d0f5abbe5987274b62 Mon Sep 17 00:00:00 2001
From: Andrew Leech <andrew.leech@planetinnovation.com.au>
Date: Wed, 7 Jun 2023 10:07:14 +1000
Subject: [PATCH 3/3] lib/re1.5: Minor improvement in code size.

Signed-off-by: Andrew Leech <andrew@alelec.net>
---
 lib/re1.5/compilecode.c | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/lib/re1.5/compilecode.c b/lib/re1.5/compilecode.c
index c393718f0e..c6858b49dd 100644
--- a/lib/re1.5/compilecode.c
+++ b/lib/re1.5/compilecode.c
@@ -10,7 +10,7 @@
 #define INSERT_CODE(at, num, pc) \
     ((code ? memmove(code + at + num, code + at, pc - at) : 0), pc += num)
 #define REL(at, to) (to - at - 2)
-#define EMIT(at, byte) (code ? (code[at] = byte) : (at))
+#define EMIT(at, byte) do { int _at = (at); if (code) { code[_at] = (byte); } } while (0) // `at` is always evaluated once
 #define EMIT_CHECKED(at, byte) (_emit_checked(at, code, byte, &err))
 #define PC (prog->bytelen)
 
@@ -29,8 +29,9 @@ static const char *_compilecode(const char *re, size_t len, ByteProg *prog, int
     int term = PC;
     int alt_label = 0;
     const char *re_top = re + len;
-    
-    while (re < re_top && *re != ')') {
+    int remain;
+
+    while ((remain = re_top - re) && *re != ')') {
         switch (*re) {
         case '\\':
             re++;
@@ -80,8 +81,7 @@ static const char *_compilecode(const char *re, size_t len, ByteProg *prog, int
                         goto emit_char_pair;
                     }
                 }
-                if (!c) return NULL;
-                if (re_top - re > 2 && re[1] == '-' && re[2] != ']') {
+                if (re_top - re > 2 && re[1] == '-' && re[2] != ']') { // use live distance: remain is stale inside the class loop
                     re += 2;
                 }
             emit_char_pair:
@@ -94,7 +94,7 @@ static const char *_compilecode(const char *re, size_t len, ByteProg *prog, int
         case '(': {
             term = PC;
             int sub = 0;
-            int capture = re_top - re > 2 && (re[1] != '?' || re[2] != ':');
+            int capture = !(remain > 2 && re[1] == '?' && re[2] == ':'); // only a complete "(?:" prefix is non-capturing
 
             if (capture) {
                 sub = ++prog->sub;
@@ -107,8 +107,8 @@ static const char *_compilecode(const char *re, size_t len, ByteProg *prog, int
 
             re++;
             if (re >= re_top) return NULL; // Trailing bracket
-            re = _compilecode(re, re_top - re, prog, sizecode);
-            if (re == NULL || re >= re_top || *re != ')') return NULL; // error, or no matching paren
+            re = _compilecode(re, re_top - re, prog, sizecode); // remain is stale here: re has advanced past the group opener
+            if (re == NULL || re >= re_top || *re != ')') return NULL; // error, or no matching paren
 
             if (capture) {
                 EMIT(PC++, Save);
@@ -121,7 +121,7 @@ static const char *_compilecode(const char *re, size_t len, ByteProg *prog, int
         case '?':
             if (PC == term) return NULL; // nothing to repeat
             INSERT_CODE(term, 2, PC);
-            if (re_top - re > 1 && re[1] == '?') {
+            if (remain > 1 && re[1] == '?') {
                 EMIT(term, RSplit);
                 re++;
             } else {
@@ -137,7 +137,7 @@ static const char *_compilecode(const char *re, size_t len, ByteProg *prog, int
             EMIT(PC, Jmp);
             EMIT_CHECKED(PC + 1, REL(PC, term));
             PC += 2;
-            if (re_top - re > 1 && re[1] == '?') {
+            if (remain > 1 && re[1] == '?') {
                 EMIT(term, RSplit);
                 re++;
             } else {
@@ -149,7 +149,7 @@ static const char *_compilecode(const char *re, size_t len, ByteProg *prog, int
             break;
         case '+':
             if (PC == term) return NULL; // nothing to repeat
-            if (re_top - re > 1 && re[1] == '?') {
+            if (remain > 1 && re[1] == '?') {
                 EMIT(PC, Split);
                 re++;
             } else {