extmod/modure: Add ure.sub() function and method, and tests.
This feature is controlled at compile time by MICROPY_PY_URE_SUB, disabled by default. Thanks to @dmazzella for the original patch for this feature; see #3770.
This commit is contained in:
parent
1e9b871d29
commit
e30a5fc7bc
128
extmod/modure.c
128
extmod/modure.c
|
@ -249,10 +249,127 @@ STATIC mp_obj_t re_split(size_t n_args, const mp_obj_t *args) {
|
||||||
}
|
}
|
||||||
MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(re_split_obj, 2, 3, re_split);
|
MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(re_split_obj, 2, 3, re_split);
|
||||||
|
|
||||||
|
#if MICROPY_PY_URE_SUB
|
||||||
|
|
||||||
|
// Shared implementation of sub() for both the module-level function and the
// compiled-regex method: substitutes matches of a compiled regex in a subject
// string and returns the resulting string object.
//
// self_in: the compiled regex object (an mp_obj_re_t).
// n_args, args: the caller's argument array.  args[1] is the replacement
//     (either a string, or a callable which is invoked with the match object
//     and whose return value is used as the replacement string), args[2] is
//     the subject string, and the optional args[3] is the maximum number of
//     substitutions (a value <= 0, the default, means no limit).  args[0] is
//     not read here and a flags argument, if given, is currently ignored.
//
// Inside the replacement string "\<number>" and "\g<number>" are replaced by
// the text captured by that group; a group that did not take part in the
// match contributes nothing.  A backslash that does not introduce a group
// reference is itself dropped.  Raises IndexError if the group number is not
// one the pattern has.
//
// Substitution stops at the first failed or empty match.  If no substitution
// was made at all, the original subject object is returned as-is.
STATIC mp_obj_t re_sub_helper(mp_obj_t self_in, size_t n_args, const mp_obj_t *args) {
    mp_obj_re_t *self = MP_OBJ_TO_PTR(self_in);
    mp_obj_t replace = args[1];
    mp_obj_t where = args[2];
    mp_int_t count = 0;
    if (n_args > 3) {
        count = mp_obj_get_int(args[3]);
        // Note: flags are currently ignored
    }

    size_t where_len;
    const char *where_str = mp_obj_str_get_data(where, &where_len);
    Subject subj;
    subj.begin = where_str;
    subj.end = subj.begin + where_len;
    // Two capture slots (start and end pointer) per group, plus the whole match
    int caps_num = (self->re.sub + 1) * 2;

    vstr_t vstr_return;
    vstr_return.buf = NULL; // We'll init the vstr after the first match
    mp_obj_match_t *match = mp_local_alloc(sizeof(mp_obj_match_t) + caps_num * sizeof(char*));
    match->base.type = &match_type;
    match->num_matches = caps_num / 2; // caps_num counts start and end pointers
    match->str = where;

    for (;;) {
        // cast is a workaround for a bug in msvc: it treats const char** as a const pointer instead of a pointer to pointer to const char
        memset((char*)match->caps, 0, caps_num * sizeof(char*));
        int res = re1_5_recursiveloopprog(&self->re, &subj, match->caps, caps_num, false);

        // If we didn't have a match, or had an empty match, it's time to stop
        if (!res || match->caps[0] == match->caps[1]) {
            break;
        }

        // Initialise the vstr if it's not already; the initial allocation is
        // sized to the text that precedes the first match
        if (vstr_return.buf == NULL) {
            vstr_init(&vstr_return, match->caps[0] - subj.begin);
        }

        // Add pre-match string
        vstr_add_strn(&vstr_return, subj.begin, match->caps[0] - subj.begin);

        // Get replacement string; a callable is invoked once per match
        const char *repl = mp_obj_str_get_str((mp_obj_is_callable(replace) ? mp_call_function_1(replace, MP_OBJ_FROM_PTR(match)) : replace));

        // Append replacement string to result, substituting any regex groups
        while (*repl != '\0') {
            if (*repl == '\\') {
                ++repl;
                bool is_g_format = false;
                if (*repl == 'g' && repl[1] == '<') {
                    // Group specified with syntax "\g<number>"
                    repl += 2;
                    is_g_format = true;
                }

                if ('0' <= *repl && *repl <= '9') {
                    // Group specified with syntax "\g<number>" or "\number"
                    const unsigned int num_groups = (unsigned int)match->num_matches;
                    unsigned int match_no = 0;
                    do {
                        unsigned int digit = (unsigned int)(*repl++ - '0');
                        // Once the number is out of range it can only grow, so
                        // stop accumulating (but keep consuming digits): going
                        // on could wrap the unsigned value round to a valid
                        // group number and hide the error.
                        if (match_no < num_groups) {
                            match_no = match_no * 10 + digit;
                        }
                    } while ('0' <= *repl && *repl <= '9');
                    if (is_g_format && *repl == '>') {
                        ++repl;
                    }

                    if (match_no >= num_groups) {
                        nlr_raise(mp_obj_new_exception_arg1(&mp_type_IndexError, MP_OBJ_NEW_SMALL_INT(match_no)));
                    }

                    const char *start_match = match->caps[match_no * 2];
                    if (start_match != NULL) {
                        // Add the substring matched by group
                        const char *end_match = match->caps[match_no * 2 + 1];
                        vstr_add_strn(&vstr_return, start_match, end_match - start_match);
                    }
                }
            } else {
                // Just add the current byte from the replacement string
                vstr_add_byte(&vstr_return, *repl++);
            }
        }

        // Move start pointer to end of last match
        subj.begin = match->caps[1];

        // Stop substitutions if count was given and gets to 0
        if (count > 0 && --count == 0) {
            break;
        }
    }

    mp_local_free(match);

    if (vstr_return.buf == NULL) {
        // Optimisation for case of no substitutions
        return where;
    }

    // Add post-match string
    vstr_add_strn(&vstr_return, subj.begin, subj.end - subj.begin);

    // The result has the same type as the subject object
    return mp_obj_new_str_from_vstr(mp_obj_get_type(where), &vstr_return);
}
|
||||||
|
|
||||||
|
// Method form of sub(), called on a compiled regex object: args[0] is that
// object and the whole argument array is forwarded unchanged to the helper.
STATIC mp_obj_t re_sub(size_t n_args, const mp_obj_t *args) {
    mp_obj_t compiled_re = args[0];
    return re_sub_helper(compiled_re, n_args, args);
}
|
||||||
|
MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(re_sub_obj, 3, 5, re_sub);
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
// Name-to-function-object table used to build re_locals_dict: match, search
// and split, plus sub when MICROPY_PY_URE_SUB is enabled.
STATIC const mp_rom_map_elem_t re_locals_dict_table[] = {
|
STATIC const mp_rom_map_elem_t re_locals_dict_table[] = {
|
||||||
{ MP_ROM_QSTR(MP_QSTR_match), MP_ROM_PTR(&re_match_obj) },
|
{ MP_ROM_QSTR(MP_QSTR_match), MP_ROM_PTR(&re_match_obj) },
|
||||||
{ MP_ROM_QSTR(MP_QSTR_search), MP_ROM_PTR(&re_search_obj) },
|
{ MP_ROM_QSTR(MP_QSTR_search), MP_ROM_PTR(&re_search_obj) },
|
||||||
{ MP_ROM_QSTR(MP_QSTR_split), MP_ROM_PTR(&re_split_obj) },
|
{ MP_ROM_QSTR(MP_QSTR_split), MP_ROM_PTR(&re_split_obj) },
|
||||||
|
// The sub entry only exists when the feature is compiled in
#if MICROPY_PY_URE_SUB
|
||||||
|
{ MP_ROM_QSTR(MP_QSTR_sub), MP_ROM_PTR(&re_sub_obj) },
|
||||||
|
#endif
|
||||||
};
|
};
|
||||||
|
|
||||||
STATIC MP_DEFINE_CONST_DICT(re_locals_dict, re_locals_dict_table);
|
STATIC MP_DEFINE_CONST_DICT(re_locals_dict, re_locals_dict_table);
|
||||||
|
@ -307,11 +424,22 @@ STATIC mp_obj_t mod_re_search(size_t n_args, const mp_obj_t *args) {
|
||||||
}
|
}
|
||||||
MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(mod_re_search_obj, 2, 4, mod_re_search);
|
MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(mod_re_search_obj, 2, 4, mod_re_search);
|
||||||
|
|
||||||
|
#if MICROPY_PY_URE_SUB
|
||||||
|
// Module-level form of sub(): args[0] is the pattern, which is compiled
// first; the compiled object and the untouched argument array are then
// passed on to the shared helper.
STATIC mp_obj_t mod_re_sub(size_t n_args, const mp_obj_t *args) {
    // Only the first argument (the pattern) is given to the compiler
    mp_obj_t compiled_re = mod_re_compile(1, args);
    return re_sub_helper(compiled_re, n_args, args);
}
|
||||||
|
MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(mod_re_sub_obj, 3, 5, mod_re_sub);
|
||||||
|
#endif
|
||||||
|
|
||||||
// Module globals table: the module name (ure), the compile/match/search
// functions, sub when MICROPY_PY_URE_SUB is enabled, and the DEBUG constant.
STATIC const mp_rom_map_elem_t mp_module_re_globals_table[] = {
|
STATIC const mp_rom_map_elem_t mp_module_re_globals_table[] = {
|
||||||
{ MP_ROM_QSTR(MP_QSTR___name__), MP_ROM_QSTR(MP_QSTR_ure) },
|
{ MP_ROM_QSTR(MP_QSTR___name__), MP_ROM_QSTR(MP_QSTR_ure) },
|
||||||
{ MP_ROM_QSTR(MP_QSTR_compile), MP_ROM_PTR(&mod_re_compile_obj) },
|
{ MP_ROM_QSTR(MP_QSTR_compile), MP_ROM_PTR(&mod_re_compile_obj) },
|
||||||
{ MP_ROM_QSTR(MP_QSTR_match), MP_ROM_PTR(&mod_re_match_obj) },
|
{ MP_ROM_QSTR(MP_QSTR_match), MP_ROM_PTR(&mod_re_match_obj) },
|
||||||
{ MP_ROM_QSTR(MP_QSTR_search), MP_ROM_PTR(&mod_re_search_obj) },
|
{ MP_ROM_QSTR(MP_QSTR_search), MP_ROM_PTR(&mod_re_search_obj) },
|
||||||
|
// The sub entry only exists when the feature is compiled in
#if MICROPY_PY_URE_SUB
|
||||||
|
{ MP_ROM_QSTR(MP_QSTR_sub), MP_ROM_PTR(&mod_re_sub_obj) },
|
||||||
|
#endif
|
||||||
{ MP_ROM_QSTR(MP_QSTR_DEBUG), MP_ROM_INT(FLAG_DEBUG) },
|
{ MP_ROM_QSTR(MP_QSTR_DEBUG), MP_ROM_INT(FLAG_DEBUG) },
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
|
@ -1150,6 +1150,10 @@ typedef double mp_float_t;
|
||||||
#define MICROPY_PY_URE_MATCH_SPAN_START_END (0)
|
#define MICROPY_PY_URE_MATCH_SPAN_START_END (0)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
// Whether to provide the ure.sub() function and the sub() method of compiled
// regex objects; off unless the build configuration defines it otherwise.
#ifndef MICROPY_PY_URE_SUB
|
||||||
|
#define MICROPY_PY_URE_SUB (0)
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifndef MICROPY_PY_UHEAPQ
|
#ifndef MICROPY_PY_UHEAPQ
|
||||||
#define MICROPY_PY_UHEAPQ (0)
|
#define MICROPY_PY_UHEAPQ (0)
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -0,0 +1,61 @@
|
||||||
|
# Test re.sub(): callable and string replacements, numbered group references
# in both "\N" and "\g<N>" form, a substitution count, and invalid groups.

try:
    import ure as re
except ImportError:
    try:
        import re
    except ImportError:
        print('SKIP')
        raise SystemExit

# sub() is an optional feature, so skip if this build does not have it
try:
    re.sub
except AttributeError:
    print('SKIP')
    raise SystemExit


def multiply(m):
    # m is a match object; return the whole match doubled, as a string
    return str(int(m.group(0)) * 2)

# callable replacement, given as a named function and as a lambda
# (raw strings so that "\d" reaches the regex engine, not the string parser)
print(re.sub(r"\d+", multiply, "10 20 30 40 50"))

print(re.sub(r"\d+", lambda m: str(int(m.group(0)) // 2), "10 20 30 40 50"))

def A():
    return "A"

# replacement is the string returned by A(), not the function itself
print(re.sub('a', A(), 'aBCBABCDabcda.'))

# "\1" group reference in the replacement, several matches in the subject
print(
    re.sub(
        r'def\s+([a-zA-Z_][a-zA-Z_0-9]*)\s*\(\s*\):',
        'static PyObject*\npy_\\1(void){\n return;\n}\n',
        '\n\ndef myfunc():\n\ndef myfunc1():\n\ndef myfunc2():'
    )
)

# sub() as a method of a compiled regex, mixing "\g<N>" and "\N" references
print(
    re.compile(
        '(calzino) (blu|bianco|verde) e (scarpa) (blu|bianco|verde)'
    ).sub(
        r'\g<1> colore \2 con \g<3> colore \4? ...',
        'calzino blu e scarpa verde'
    )
)

# no matches at all
print(re.sub('a', 'b', 'c'))

# with maximum substitution count specified
print(re.sub('a', 'b', '1a2a3a', 2))

# invalid group
try:
    re.sub('(a)', 'b\\2', 'a')
except Exception:
    print('invalid group')

# invalid group with very large number (to test overflow in uPy)
try:
    re.sub('(a)', 'b\\199999999999999999999999999999999999999', 'a')
except Exception:
    print('invalid group')
|
|
@ -0,0 +1,19 @@
|
||||||
|
# test re.sub with unmatched groups, behaviour changed in CPython 3.5
|
||||||
|
|
||||||
|
try:
    import ure as re
except ImportError:
    try:
        import re
    except ImportError:
        print('SKIP')
        raise SystemExit

# sub() is optional, so skip when the regex module in use does not provide it
if not hasattr(re, 'sub'):
    print('SKIP')
    raise SystemExit

# first group matches, second optional group doesn't so is replaced with a blank
pattern = r'(a)(b)?'
template = r'\2-\1'
subject = '1a2'
print(re.sub(pattern, template, subject))
|
|
@ -0,0 +1 @@
|
||||||
|
1-a2
|
Loading…
Reference in New Issue