kopia lustrzana https://github.com/micropython/micropython
Merge 8e354bf903
into 5114f2c1ea
commit
4bfc4cc33e
|
@ -80,7 +80,13 @@ static mp_obj_t match_group(mp_obj_t self_in, mp_obj_t no_in) {
|
|||
// no match for this group
|
||||
return mp_const_none;
|
||||
}
|
||||
return mp_obj_new_str_of_type(mp_obj_get_type(self->str),
|
||||
const mp_obj_type_t *str_type = mp_obj_get_type(self->str);
|
||||
if (str_type != &mp_type_str) {
|
||||
// bytes, bytearray etc. args should return bytes
|
||||
str_type = &mp_type_bytes;
|
||||
}
|
||||
|
||||
return mp_obj_new_str_of_type(str_type,
|
||||
(const byte *)start, self->caps[no * 2 + 1] - start);
|
||||
}
|
||||
MP_DEFINE_CONST_FUN_OBJ_2(match_group_obj, match_group);
|
||||
|
@ -120,7 +126,9 @@ static void match_span_helper(size_t n_args, const mp_obj_t *args, mp_obj_t span
|
|||
const char *start = self->caps[no * 2];
|
||||
if (start != NULL) {
|
||||
// have a match for this group
|
||||
const char *begin = mp_obj_str_get_str(self->str);
|
||||
mp_buffer_info_t bufinfo;
|
||||
mp_get_buffer_raise(self->str, &bufinfo, MP_BUFFER_READ);
|
||||
const char *begin = bufinfo.buf;
|
||||
s = start - begin;
|
||||
e = self->caps[no * 2 + 1] - begin;
|
||||
}
|
||||
|
@ -203,9 +211,10 @@ static mp_obj_t re_exec(bool is_anchored, uint n_args, const mp_obj_t *args) {
|
|||
self = MP_OBJ_TO_PTR(mod_re_compile(1, args));
|
||||
}
|
||||
Subject subj;
|
||||
size_t len;
|
||||
subj.begin_line = subj.begin = mp_obj_str_get_data(args[1], &len);
|
||||
subj.end = subj.begin + len;
|
||||
mp_buffer_info_t bufinfo;
|
||||
mp_get_buffer_raise(args[1], &bufinfo, MP_BUFFER_READ);
|
||||
subj.begin_line = subj.begin = bufinfo.buf;
|
||||
subj.end = subj.begin + bufinfo.len;
|
||||
int caps_num = (self->re.sub + 1) * 2;
|
||||
mp_obj_match_t *match = m_new_obj_var(mp_obj_match_t, caps, char *, caps_num);
|
||||
// cast is a workaround for a bug in msvc: it treats const char** as a const pointer instead of a pointer to pointer to const char
|
||||
|
@ -235,10 +244,15 @@ MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(re_search_obj, 2, 4, re_search);
|
|||
static mp_obj_t re_split(size_t n_args, const mp_obj_t *args) {
|
||||
mp_obj_re_t *self = MP_OBJ_TO_PTR(args[0]);
|
||||
Subject subj;
|
||||
size_t len;
|
||||
mp_buffer_info_t bufinfo;
|
||||
const mp_obj_type_t *str_type = mp_obj_get_type(args[1]);
|
||||
subj.begin_line = subj.begin = mp_obj_str_get_data(args[1], &len);
|
||||
subj.end = subj.begin + len;
|
||||
if (str_type != &mp_type_str) {
|
||||
// bytes, bytearray etc. args should return bytes
|
||||
str_type = &mp_type_bytes;
|
||||
}
|
||||
mp_get_buffer_raise(args[1], &bufinfo, MP_BUFFER_READ);
|
||||
subj.begin_line = subj.begin = bufinfo.buf;
|
||||
subj.end = subj.begin + bufinfo.len;
|
||||
int caps_num = (self->re.sub + 1) * 2;
|
||||
|
||||
int maxsplit = 0;
|
||||
|
@ -294,11 +308,11 @@ static mp_obj_t re_sub_helper(size_t n_args, const mp_obj_t *args) {
|
|||
// Note: flags are currently ignored
|
||||
}
|
||||
|
||||
size_t where_len;
|
||||
const char *where_str = mp_obj_str_get_data(where, &where_len);
|
||||
Subject subj;
|
||||
subj.begin_line = subj.begin = where_str;
|
||||
subj.end = subj.begin + where_len;
|
||||
mp_buffer_info_t bufinfo;
|
||||
mp_get_buffer_raise(where, &bufinfo, MP_BUFFER_READ);
|
||||
subj.begin_line = subj.begin = bufinfo.buf;
|
||||
subj.end = subj.begin + bufinfo.len;
|
||||
int caps_num = (self->re.sub + 1) * 2;
|
||||
|
||||
vstr_t vstr_return;
|
||||
|
@ -327,10 +341,13 @@ static mp_obj_t re_sub_helper(size_t n_args, const mp_obj_t *args) {
|
|||
vstr_add_strn(&vstr_return, subj.begin, match->caps[0] - subj.begin);
|
||||
|
||||
// Get replacement string
|
||||
const char *repl = mp_obj_str_get_str((mp_obj_is_callable(replace) ? mp_call_function_1(replace, MP_OBJ_FROM_PTR(match)) : replace));
|
||||
mp_obj_t repl_obj = (mp_obj_is_callable(replace) ? mp_call_function_1(replace, MP_OBJ_FROM_PTR(match)) : replace);
|
||||
mp_get_buffer_raise(repl_obj, &bufinfo, MP_BUFFER_READ);
|
||||
const char *repl = bufinfo.buf;
|
||||
const char *repl_top = repl + bufinfo.len;
|
||||
|
||||
// Append replacement string to result, substituting any regex groups
|
||||
while (*repl != '\0') {
|
||||
while (repl < repl_top) {
|
||||
if (*repl == '\\') {
|
||||
++repl;
|
||||
bool is_g_format = false;
|
||||
|
@ -423,8 +440,11 @@ static MP_DEFINE_CONST_OBJ_TYPE(
|
|||
|
||||
static mp_obj_t mod_re_compile(size_t n_args, const mp_obj_t *args) {
|
||||
(void)n_args;
|
||||
const char *re_str = mp_obj_str_get_str(args[0]);
|
||||
int size = re1_5_sizecode(re_str);
|
||||
|
||||
mp_buffer_info_t bufinfo;
|
||||
mp_get_buffer_raise(args[0], &bufinfo, MP_BUFFER_READ);
|
||||
const char *re_str = bufinfo.buf;
|
||||
int size = re1_5_sizecode(re_str, bufinfo.len);
|
||||
if (size == -1) {
|
||||
goto error;
|
||||
}
|
||||
|
@ -435,7 +455,7 @@ static mp_obj_t mod_re_compile(size_t n_args, const mp_obj_t *args) {
|
|||
flags = mp_obj_get_int(args[1]);
|
||||
}
|
||||
#endif
|
||||
int error = re1_5_compilecode(&o->re, re_str);
|
||||
int error = re1_5_compilecode(&o->re, re_str, bufinfo.len);
|
||||
if (error != 0) {
|
||||
error:
|
||||
mp_raise_ValueError(MP_ERROR_TEXT("error in regex"));
|
||||
|
|
|
@ -10,7 +10,7 @@
|
|||
// Open a gap of `num` bytes at offset `at` by shifting the code in [at, pc)
// upwards (skipped when `code` is NULL), then advance `pc` by `num`.
// Every parameter is parenthesized so compound arguments expand correctly.
#define INSERT_CODE(at, num, pc) \
    ((code ? memmove(code + (at) + (num), code + (at), (pc) - (at)) : 0), (pc) += (num))
|
||||
// Offset from `at` to `to`, less 2. Parameters are parenthesized so that
// compound expressions passed as arguments keep their intended grouping.
#define REL(at, to) ((to) - (at) - 2)
|
||||
#define EMIT(at, byte) (code ? (code[at] = byte) : (at))
|
||||
// Store `byte` at offset `at` of the output buffer when `code` is non-NULL.
// `at` is evaluated exactly once in either case, so side effects such as
// PC++ still occur when nothing is written. Wrapped in do/while(0) so the
// macro acts as a single statement (safe in an unbraced if/else).
#define EMIT(at, byte) do { int emit_at_ = (at); if (code) { code[emit_at_] = (byte); } } while (0)
|
||||
#define EMIT_CHECKED(at, byte) (_emit_checked(at, code, byte, &err))
|
||||
#define PC (prog->bytelen)
|
||||
|
||||
|
@ -21,19 +21,21 @@ static void _emit_checked(int at, char *code, int val, bool *err) {
|
|||
}
|
||||
}
|
||||
|
||||
static const char *_compilecode(const char *re, ByteProg *prog, int sizecode)
|
||||
static const char *_compilecode(const char *re, size_t len, ByteProg *prog, int sizecode)
|
||||
{
|
||||
char *code = sizecode ? NULL : prog->insts;
|
||||
bool err = false;
|
||||
int start = PC;
|
||||
int term = PC;
|
||||
int alt_label = 0;
|
||||
const char *re_top = re + len;
|
||||
int remain;
|
||||
|
||||
for (; *re && *re != ')'; re++) {
|
||||
while ((remain = re_top - re) && *re != ')') {
|
||||
switch (*re) {
|
||||
case '\\':
|
||||
re++;
|
||||
if (!*re) return NULL; // Trailing backslash
|
||||
if (re >= re_top) return NULL; // Trailing backslash
|
||||
if (MATCH_NAMED_CLASS_CHAR(*re)) {
|
||||
term = PC;
|
||||
EMIT(PC++, NamedClass);
|
||||
|
@ -57,26 +59,29 @@ static const char *_compilecode(const char *re, ByteProg *prog, int sizecode)
|
|||
int cnt;
|
||||
term = PC;
|
||||
re++;
|
||||
if (re >= re_top) return NULL; // Trailing bracket
|
||||
if (*re == '^') {
|
||||
EMIT(PC++, ClassNot);
|
||||
re++;
|
||||
if (re >= re_top) return NULL; // Trailing ^
|
||||
} else {
|
||||
EMIT(PC++, Class);
|
||||
}
|
||||
PC++; // Skip # of pair byte
|
||||
prog->len++;
|
||||
for (cnt = 0; *re != ']'; re++, cnt++) {
|
||||
if (re >= re_top) return NULL; // Missing closing bracket
|
||||
char c = *re;
|
||||
if (c == '\\') {
|
||||
++re;
|
||||
if (re >= re_top) return NULL; // Trailing backslash
|
||||
c = *re;
|
||||
if (MATCH_NAMED_CLASS_CHAR(c)) {
|
||||
c = RE15_CLASS_NAMED_CLASS_INDICATOR;
|
||||
goto emit_char_pair;
|
||||
}
|
||||
}
|
||||
if (!c) return NULL;
|
||||
if (re[1] == '-' && re[2] != ']') {
|
||||
if (remain > 2 && re[1] == '-' && re[2] != ']') {
|
||||
re += 2;
|
||||
}
|
||||
emit_char_pair:
|
||||
|
@ -89,7 +94,7 @@ static const char *_compilecode(const char *re, ByteProg *prog, int sizecode)
|
|||
case '(': {
|
||||
term = PC;
|
||||
int sub = 0;
|
||||
int capture = re[1] != '?' || re[2] != ':';
|
||||
int capture = remain > 2 && (re[1] != '?' || re[2] != ':');
|
||||
|
||||
if (capture) {
|
||||
sub = ++prog->sub;
|
||||
|
@ -97,10 +102,12 @@ static const char *_compilecode(const char *re, ByteProg *prog, int sizecode)
|
|||
EMIT_CHECKED(PC++, 2 * sub);
|
||||
prog->len++;
|
||||
} else {
|
||||
re += 2;
|
||||
re += 2;
|
||||
}
|
||||
|
||||
re = _compilecode(re + 1, prog, sizecode);
|
||||
re++;
|
||||
if (re >= re_top) return NULL; // Trailing bracket
|
||||
re = _compilecode(re, remain, prog, sizecode);
|
||||
if (re == NULL || *re != ')') return NULL; // error, or no matching paren
|
||||
|
||||
if (capture) {
|
||||
|
@ -114,7 +121,7 @@ static const char *_compilecode(const char *re, ByteProg *prog, int sizecode)
|
|||
case '?':
|
||||
if (PC == term) return NULL; // nothing to repeat
|
||||
INSERT_CODE(term, 2, PC);
|
||||
if (re[1] == '?') {
|
||||
if (remain > 1 && re[1] == '?') {
|
||||
EMIT(term, RSplit);
|
||||
re++;
|
||||
} else {
|
||||
|
@ -130,7 +137,7 @@ static const char *_compilecode(const char *re, ByteProg *prog, int sizecode)
|
|||
EMIT(PC, Jmp);
|
||||
EMIT_CHECKED(PC + 1, REL(PC, term));
|
||||
PC += 2;
|
||||
if (re[1] == '?') {
|
||||
if (remain > 1 && re[1] == '?') {
|
||||
EMIT(term, RSplit);
|
||||
re++;
|
||||
} else {
|
||||
|
@ -142,7 +149,7 @@ static const char *_compilecode(const char *re, ByteProg *prog, int sizecode)
|
|||
break;
|
||||
case '+':
|
||||
if (PC == term) return NULL; // nothing to repeat
|
||||
if (re[1] == '?') {
|
||||
if (remain > 1 && re[1] == '?') {
|
||||
EMIT(PC, Split);
|
||||
re++;
|
||||
} else {
|
||||
|
@ -176,27 +183,31 @@ static const char *_compilecode(const char *re, ByteProg *prog, int sizecode)
|
|||
term = PC;
|
||||
break;
|
||||
}
|
||||
re++;
|
||||
}
|
||||
|
||||
if (alt_label) {
|
||||
EMIT_CHECKED(alt_label, REL(alt_label, PC) + 1);
|
||||
}
|
||||
return err ? NULL : re;
|
||||
if (err) {
|
||||
return NULL;
|
||||
}
|
||||
return re;
|
||||
}
|
||||
|
||||
int re1_5_sizecode(const char *re)
|
||||
int re1_5_sizecode(const char *re, size_t len)
|
||||
{
|
||||
ByteProg dummyprog = {
|
||||
// Save 0, Save 1, Match; more bytes for "search" (vs "match") prefix code
|
||||
.bytelen = 5 + NON_ANCHORED_PREFIX
|
||||
};
|
||||
|
||||
if (_compilecode(re, &dummyprog, /*sizecode*/1) == NULL) return -1;
|
||||
if (_compilecode(re, len, &dummyprog, /*sizecode*/1) == NULL) return -1;
|
||||
|
||||
return dummyprog.bytelen;
|
||||
}
|
||||
|
||||
int re1_5_compilecode(ByteProg *prog, const char *re)
|
||||
int re1_5_compilecode(ByteProg *prog, const char *re, size_t len)
|
||||
{
|
||||
prog->len = 0;
|
||||
prog->bytelen = 0;
|
||||
|
@ -216,7 +227,7 @@ int re1_5_compilecode(ByteProg *prog, const char *re)
|
|||
prog->insts[prog->bytelen++] = 0;
|
||||
prog->len++;
|
||||
|
||||
re = _compilecode(re, prog, /*sizecode*/0);
|
||||
re = _compilecode(re, len, prog, /*sizecode*/0);
|
||||
if (re == NULL || *re) return 1;
|
||||
|
||||
prog->insts[prog->bytelen++] = Save;
|
||||
|
|
|
@ -146,8 +146,8 @@ int re1_5_recursiveloopprog(ByteProg*, Subject*, const char**, int, int);
|
|||
int re1_5_recursiveprog(ByteProg*, Subject*, const char**, int, int);
|
||||
int re1_5_thompsonvm(ByteProg*, Subject*, const char**, int, int);
|
||||
|
||||
int re1_5_sizecode(const char *re);
|
||||
int re1_5_compilecode(ByteProg *prog, const char *re);
|
||||
int re1_5_sizecode(const char *re, size_t len);
|
||||
int re1_5_compilecode(ByteProg *prog, const char *re, size_t len);
|
||||
void re1_5_dumpcode(ByteProg *prog);
|
||||
void cleanmarks(ByteProg *prog);
|
||||
int _re1_5_classmatch(const char *pc, const char *sp);
|
||||
|
|
|
@ -93,6 +93,23 @@ m = re.match(rb"a+?", b"ab")
|
|||
print(m.group(0))
|
||||
print("===")
|
||||
|
||||
# bytearray / memoryview objects
|
||||
m = re.match(rb"a.", bytearray(b"ab"))
|
||||
print(m.group(0))
|
||||
m = re.match(rb"a.", memoryview(b"ab"))
|
||||
print(m.group(0))
|
||||
# MicroPython accepts a bytearray pattern, but CPython does not.
|
||||
# m = re.match(bytearray(b"a."), b"ab")
|
||||
# print(m.group(0))
|
||||
print("===")
|
||||
|
||||
# null chars
|
||||
m = re.match("ab.d", "ab\x00d")
|
||||
print(list(m.group(0)))
|
||||
m = re.match("ab\x00d", "ab\x00d")
|
||||
print(list(m.group(0)))
|
||||
print("===")
|
||||
|
||||
# escaping
|
||||
m = re.match(r"a\.c", "a.c")
|
||||
print(m.group(0) if m else "")
|
||||
|
|
|
@ -38,3 +38,8 @@ print(s)
|
|||
r = re.compile("^ab|cab")
|
||||
s = r.split("abababcabab")
|
||||
print(s)
|
||||
|
||||
# bytearray objects
|
||||
r = re.compile(b"x")
|
||||
s = r.split(bytearray(b"fooxbar"))
|
||||
print(s)
|
||||
|
|
|
@ -26,6 +26,13 @@ def A():
|
|||
|
||||
print(re.sub("a", A(), "aBCBABCDabcda."))
|
||||
|
||||
|
||||
def B():
|
||||
return bytearray(b"B")
|
||||
|
||||
|
||||
print(re.sub(b"a", B(), b"aBCBABCDabcda."))
|
||||
|
||||
print(
|
||||
re.sub(
|
||||
r"def\s+([a-zA-Z_][a-zA-Z_0-9]*)\s*\(\s*\):",
|
||||
|
@ -61,10 +68,11 @@ try:
|
|||
except:
|
||||
print("invalid group")
|
||||
|
||||
# Module function takes str/bytes/re.
|
||||
# Module function takes str/bytes/re/bytearray.
|
||||
print(re.sub("a", "a", "a"))
|
||||
print(re.sub(b".", b"a", b"a"))
|
||||
print(re.sub(re.compile("a"), "a", "a"))
|
||||
print(re.sub(b"a", bytearray(b"b"), bytearray(b"a")))
|
||||
try:
|
||||
re.sub(123, "a", "a")
|
||||
except TypeError:
|
||||
|
|
Ładowanie…
Reference in New Issue