Berry add ``import re`` regex module

This commit is contained in:
Stephan Hadinger 2021-11-25 22:57:37 +01:00
parent 33383d1ce4
commit 2ae03c6e43
24 changed files with 2321 additions and 0 deletions

View File

@ -13,6 +13,7 @@ All notable changes to this project will be documented in this file.
- Berry support for neopixel (WS2812, SK6812)
- Command ``IfxPeriod `` to overrule ``Teleperiod`` for Influx messages (#13750)
- OTA over HTTPS (ESP32x only)
- Berry add ``import re`` regex module
### Changed
- ESP8266 Gratuitous ARP enabled and set to 60 seconds (#13623)

View File

@ -25,6 +25,7 @@ be_extern_native_module(strict);
/* Tasmota specific */
be_extern_native_module(python_compat);
be_extern_native_module(re);
be_extern_native_module(persist);
be_extern_native_module(autoconf);
be_extern_native_module(tapp);
@ -87,6 +88,7 @@ BERRY_LOCAL const bntvmodule* const be_module_table[] = {
/* user-defined modules register start */
&be_native_module(python_compat),
&be_native_module(re),
&be_native_module(path),
&be_native_module(persist),
#ifdef USE_AUTOCONF

View File

@ -0,0 +1,218 @@
/********************************************************************
* Tasmota lib
*
* To use: `import re`
*
* Regex using re1.5
*******************************************************************/
#include "be_constobj.h"
#include "be_mem.h"
#include "re1.5.h"
/********************************************************************
# Berry skeleton for `re` module
#
class re_pattern
var _p # comobj containing the compiled bytecode for the pattern
def search() end
def match() end
def split() end
end
re = module("re")
re.compile = def (regex_str) end # native
re.match = def (regex_str, str) end # native
re.search = def (regex_str, str) end # native
*******************************************************************/
extern const bclass be_class_re_pattern;
/* Destructor registered with be_newcomobj() for compiled regex bytecode:
 * releases the native buffer passed as argument 1 (if any) and returns nil. */
int be_free_comobj(bvm* vm) {
  if (be_top(vm) > 0) {
    void *bytecode = be_tocomptr(vm, 1);
    if (bytecode != NULL) {
      be_os_free(bytecode);
    }
  }
  be_return_nil(vm);
}
// Native functions be_const_func()
// Berry: `re.compile(pattern:string) -> instance(re_pattern)`
//
// Compiles `pattern` to re1.5 bytecode and returns a new `re_pattern`
// instance whose `_p` member is a comobj owning that bytecode (released
// through be_free_comobj).
// Raises `type_error` if the argument is not a string, `internal_error`
// if the regex cannot be compiled, `memory_error` if the bytecode buffer
// cannot be allocated.
int be_re_compile(bvm *vm) {
  int32_t argc = be_top(vm); // Get the number of arguments
  if (argc >= 1 && be_isstring(vm, 1)) {
    const char * regex_str = be_tostring(vm, 1);
    // First pass only computes the bytecode size (negative = error code)
    int sz = re1_5_sizecode(regex_str);
    if (sz < 0) {
      be_raise(vm, "internal_error", "error in regex");
    }

    ByteProg *code = be_os_malloc(sizeof(ByteProg) + sz);
    if (code == NULL) {
      be_raise(vm, "memory_error", "cannot allocate regex bytecode");
    }
    int ret = re1_5_compilecode(code, regex_str);
    if (ret != 0) {
      // Nothing owns the buffer yet: release it before raising
      be_os_free(code);
      be_raise(vm, "internal_error", "error in regex");
    }

    // Instantiate re_pattern and hand ownership of the bytecode to `_p`
    be_pushntvclass(vm, &be_class_re_pattern);
    be_call(vm, 0);
    be_newcomobj(vm, code, &be_free_comobj);
    be_setmember(vm, -2, "_p");
    be_pop(vm, 1);
    be_return(vm);
  }
  be_raise(vm, "type_error", NULL);
}
// Run compiled bytecode `code` against `hay` and build the Berry result.
//
// is_anchored: btrue to match only at the start of `hay` ("match"),
//              bfalse to look for a match anywhere ("search").
// Returns nil to Berry when there is no match, otherwise a list holding the
// whole match followed by the capture groups. A group that did not take
// part in the match is reported as nil.
int be_re_match_search_run(bvm *vm, ByteProg *code, const char *hay, bbool is_anchored) {
  Subject subj = {hay, hay + strlen(hay)};

  // Two pointers (start, end) for the whole match and for each group
  int sub_els = (code->sub + 1) * 2;
  const char *sub[sub_els];
  // The matcher only writes the slots of groups it goes through; clear the
  // array so that the other slots are reliably NULL
  memset(sub, 0, sub_els * sizeof sub[0]);

  if (!re1_5_recursiveloopprog(code, &subj, sub, sub_els, is_anchored)) {
    be_return_nil(vm); // no match
  }

  be_newobject(vm, "list");
  // Drop trailing groups that were not set
  int k;
  for (k = sub_els; k > 0; k--) {
    if (sub[k-1]) {
      break;
    }
  }
  for (int i = 0; i < k; i += 2) {
    if (sub[i] != NULL && sub[i+1] != NULL) {
      be_pushnstring(vm, sub[i], sub[i+1] - sub[i]);
    } else {
      be_pushnil(vm); // group did not participate in the match
    }
    be_data_push(vm, -2);
    be_pop(vm, 1);
  }
  be_pop(vm, 1); // remove list
  be_return(vm); // return list object
}
// Shared implementation of `re.match` and `re.search`:
// argument 1 is the regex, argument 2 the string to scan.
// The regex is compiled into a temporary buffer that is released once the
// result has been built.
// Raises `type_error` on bad arguments, `internal_error` on an invalid
// regex, `memory_error` if the bytecode buffer cannot be allocated.
int be_re_match_search(bvm *vm, bbool is_anchored) {
  int32_t argc = be_top(vm); // Get the number of arguments
  if (argc >= 2 && be_isstring(vm, 1) && be_isstring(vm, 2)) {
    const char * regex_str = be_tostring(vm, 1);
    const char * hay = be_tostring(vm, 2);
    int sz = re1_5_sizecode(regex_str);
    if (sz < 0) {
      be_raise(vm, "internal_error", "error in regex");
    }

    ByteProg *code = be_os_malloc(sizeof(ByteProg) + sz);
    if (code == NULL) {
      be_raise(vm, "memory_error", "cannot allocate regex bytecode");
    }
    int ret = re1_5_compilecode(code, regex_str);
    if (ret != 0) {
      be_os_free(code);
      be_raise(vm, "internal_error", "error in regex");
    }
    // The result list holds copies of the matched text, so the bytecode is
    // no longer needed once the run is over
    int result = be_re_match_search_run(vm, code, hay, is_anchored);
    be_os_free(code);
    return result;
  }
  be_raise(vm, "type_error", NULL);
}
// Berry: `re.match(regex:string, s:string) -> list(string) or nil`
// Anchored variant: delegates to be_re_match_search() with is_anchored = btrue.
int be_re_match(bvm *vm) {
return be_re_match_search(vm, btrue);
}
// Berry: `re.search(regex:string, s:string) -> list(string) or nil`
// Non-anchored variant: delegates to be_re_match_search() with is_anchored = bfalse.
int be_re_search(bvm *vm) {
return be_re_match_search(vm, bfalse);
}
// Berry: `re_pattern.search(s:string) -> list(string)`
// Non-anchored run of the bytecode stored in `_p` against `s`.
int re_pattern_search(bvm *vm) {
  if (be_top(vm) >= 2 && be_isstring(vm, 2)) {
    const char *subject = be_tostring(vm, 2);
    ByteProg *prog;

    be_getmember(vm, 1, "_p");
    prog = (ByteProg*) be_tocomptr(vm, -1);
    return be_re_match_search_run(vm, prog, subject, bfalse);
  }
  be_raise(vm, "type_error", NULL);
}
// Berry: `re_pattern.match(s:string) -> list(string)`
// Anchored run of the bytecode stored in `_p` against `s`.
int re_pattern_match(bvm *vm) {
  if (be_top(vm) >= 2 && be_isstring(vm, 2)) {
    const char *subject = be_tostring(vm, 2);
    ByteProg *prog;

    be_getmember(vm, 1, "_p");
    prog = (ByteProg*) be_tocomptr(vm, -1);
    return be_re_match_search_run(vm, prog, subject, btrue);
  }
  be_raise(vm, "type_error", NULL);
}
// Berry: `re_pattern.split(s:string) -> list(string)`
// Splits `s` at every occurrence of the pattern and returns the pieces
// between the occurrences (the last piece is whatever remains after the
// final occurrence, possibly the whole string).
// Raises `type_error` if `s` is not a string and `internal_error` when the
// pattern produces an empty or incomplete match.
int re_pattern_split(bvm *vm) {
int32_t argc = be_top(vm); // Get the number of arguments
if (argc >= 2 && be_isstring(vm, 2)) {
const char * hay = be_tostring(vm, 2);
be_getmember(vm, 1, "_p");
ByteProg * code = (ByteProg*) be_tocomptr(vm, -1);
// subj.begin is advanced past each separator as the loop progresses
Subject subj = {hay, hay + strlen(hay)};
int sub_els = (code->sub + 1) * 2;
const char *sub[sub_els];
be_newobject(vm, "list");
while (1) {
if (!re1_5_recursiveloopprog(code, &subj, sub, sub_els, bfalse)) {
// no further separator: the remainder is the last element
be_pushnstring(vm, subj.begin, subj.end - subj.begin);
be_data_push(vm, -2);
be_pop(vm, 1);
break;
}
// sub[0]/sub[1] delimit the separator; an empty separator would never
// advance subj.begin, so it is rejected instead of looping forever
if (sub[0] == NULL || sub[1] == NULL || sub[0] == sub[1]) {
be_raise(vm, "internal_error", "can't match");
}
// text between the previous separator and this one
be_pushnstring(vm, subj.begin, sub[0] - subj.begin);
be_data_push(vm, -2);
be_pop(vm, 1);
subj.begin = sub[1];
}
be_pop(vm, 1); // remove list
be_return(vm); // return list object
}
be_raise(vm, "type_error", NULL);
}
/********************************************************************
** Solidified module: re
********************************************************************/
// Constant module table for `re`: binds the Berry-visible names `compile`,
// `search` and `match` to the native functions be_re_compile, be_re_search
// and be_re_match.
// NOTE(review): the numeric arguments of be_nested_key() look like generated
// values (hash, name length, chaining index) - regenerate rather than edit
// by hand; confirm against the solidification tooling.
be_local_module(re,
"re",
be_nested_map(3,
( (struct bmapnode*) &(const bmapnode[]) {
{ be_nested_key("compile", 1000265118, 7, -1), be_const_func(be_re_compile) },
{ be_nested_key("search", -2144130903, 6, 2), be_const_func(be_re_search) },
{ be_nested_key("match", 2116038550, 5, -1), be_const_func(be_re_match) },
}))
);
BE_EXPORT_VARIABLE be_define_const_native_module(re);
/********************************************************************/
// ===================================================================
/********************************************************************
** Solidified class: re_pattern
********************************************************************/
// Constant class table for `re_pattern`: one instance variable `_p` (slot 0,
// holds the compiled bytecode) and the native methods `search`, `match`
// and `split`.
// NOTE(review): the numeric arguments of be_nested_key() look like generated
// values - regenerate rather than edit by hand; confirm against the
// solidification tooling.
be_local_class(re_pattern,
1,
NULL,
be_nested_map(4,
( (struct bmapnode*) &(const bmapnode[]) {
{ be_nested_key("_p", 1594591802, 2, -1), be_const_var(0) },
{ be_nested_key("search", -2144130903, 6, -1), be_const_func(re_pattern_search) },
{ be_nested_key("match", 2116038550, 5, 0), be_const_func(re_pattern_match) },
{ be_nested_key("split", -2017972765, 5, -1), be_const_func(re_pattern_split) },
})),
(be_nested_const_str("re_pattern", 2041968961, 10))
);
/*******************************************************************/

View File

@ -0,0 +1,27 @@
// Copyright (c) 2007-2009 Russ Cox, Google Inc. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
// * Neither the name of Google, Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

View File

@ -0,0 +1,44 @@
# Copyright 2007-2009 Russ Cox. All Rights Reserved.
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.
CC=gcc
CFLAGS=-g -Wall -Os
# Uncomment when developing/testing
#CFLAGS=-DDEBUG -g -Wall -O0

TARGET=re

# No line continuation after the last entry, so that the list can never
# swallow whatever line follows it.
OFILES=\
	backtrack.o\
	compile.o\
	main.o\
	pike.o\
	recursive.o\
	recursiveloop.o\
	sub.o\
	thompson.o\
	compilecode.o\
	dumpcode.o\
	charclass.o\
	cleanmarks.o\
	util.o\
	y.tab.o

HFILES=\
	re1.5.h

# These targets do not produce files of the same name
.PHONY: test clean

$(TARGET): $(OFILES)
	$(CC) $(CFLAGS) -o $(TARGET) $(OFILES)

%.o: %.c $(HFILES)
	$(CC) -c $(CFLAGS) $*.c

y.tab.h y.tab.c: parse.y
	bison -v -y parse.y

test: $(TARGET)
	./run-tests $(TFLAGS)

clean:
	rm -f *.o core $(TARGET) y.tab.[ch] y.output

48
lib/libesp32/re1.5/README Normal file
View File

@ -0,0 +1,48 @@
What is re1.5?
==============
re1 (http://code.google.com/p/re1/) is a "toy regular expression implementation"
by Russ Cox, featuring simplicity and minimal code size unheard of in other
implementations. re2 (http://code.google.com/p/re2/) is "an efficient,
principled regular expression library" by the same author. It is robust,
full-featured, and ... bloated, compared to re1.
re1.5 is an attempt to start with re1 codebase and add features required for
minimalistic real-world use, while sticking to the minimal code size and
memory use.
Why?
====
re1.5 is intended for use in highly constrained, e.g. embedded, environments,
where offering familiar high-level string matching functionality is still
beneficial.
Features
========
* Like re1, re1.5 retains a design where a compiled expression can be executed
(matched) by multiple backends, each with its own distinctive design and
runtime properties (complexity and memory usage).
* Unlike re1, regexes are compiled to memory-efficient bytecode. Exact size
of the bytecode can be found prior to compilation (for memory allocation).
* External API functions feature namespace prefix to improve clarity and
avoid name clashes when integrating into other projects.
* Matchers are NUL-char clean and take size of the input string as a param.
* Support for quoted chars in regex.
* Support for ^, $ assertions in regex.
* Support for "match" vs "search" operations, as common in other regex APIs.
* Support for named character classes: \d \D \s \S \w \W.
TODO
====
* Support for repetition operator {n} and {n,m}.
* Support for Unicode (UTF-8).
* Support for matching flags like case-insensitive, dot matches all,
multiline, etc.
* Support for more assertions like \A, \Z.
Author and License
==================
re1.5 is maintained by Paul Sokolovsky pfalcon at users.sourceforge.net and
licensed under BSD license, just as the original re1.

View File

@ -0,0 +1,117 @@
// Copyright 2007-2009 Russ Cox. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "re1.5.h"
typedef struct Thread Thread;

/* One saved backtracking state: where to resume in the program (pc),
 * where to resume in the input (sp) and the capture state at that point. */
struct Thread
{
	char *pc;
	const char *sp;
	Sub *sub;
};

/* Bundle the three components of a backtracking state into a Thread value. */
static Thread
thread(char *pc, const char *sp, Sub *sub)
{
	Thread state;

	state.pc = pc;
	state.sp = sp;
	state.sub = sub;
	return state;
}
/*
 * Backtracking matcher: runs the bytecode in `prog` against `input`,
 * keeping alternative states on an explicit stack of at most MAX entries.
 *
 * subp/nsubp: output array of capture pointers, filled only on success.
 * is_anchored: forwarded to HANDLE_ANCHORED() to select the entry point.
 * Returns 1 on match, 0 when every alternative failed. Calls
 * re1_5_fatal() if the stack of pending states overflows or an unknown
 * opcode is met.
 */
int
re1_5_backtrack(ByteProg *prog, Subject *input, const char **subp, int nsubp, int is_anchored)
{
enum { MAX = 1000 };
Thread ready[MAX];
int i, nready;
char *pc;
const char *sp;
Sub *sub;
int off;
/* queue initial thread */
sub = newsub(nsubp);
for(i=0; i<nsubp; i++)
sub->sub[i] = nil;
ready[0] = thread(HANDLE_ANCHORED(prog->insts, is_anchored), input->begin, sub);
nready = 1;
/* run threads in stack order */
while(nready > 0) {
--nready; /* pop state for next thread to run */
pc = ready[nready].pc;
sp = ready[nready].sp;
sub = ready[nready].sub;
assert(sub->ref > 0);
for(;;) {
if(inst_is_consumer(*pc)) {
// If we need to match a character, but there's none left, it's fail
if(sp >= input->end)
goto Dead;
}
switch(*pc++) {
case Char:
if(*sp != *pc++)
goto Dead;
/* fall through: a matching Char consumes one input char like Any */
case Any:
sp++;
continue;
case Class:
case ClassNot:
if (!_re1_5_classmatch(pc, sp))
goto Dead;
/* skip the count byte and the count * 2 range bytes */
pc += *(unsigned char*)pc * 2 + 1;
sp++;
continue;
case NamedClass:
if (!_re1_5_namedclassmatch(pc, sp))
goto Dead;
pc++;
sp++;
continue;
case Match:
/* copy captures out; states still on the stack are not visited */
for(i=0; i<nsubp; i++)
subp[i] = sub->sub[i];
decref(sub);
return 1;
case Jmp:
/* one-byte signed offset, relative to the byte after the operand */
off = (signed char)*pc++;
pc = pc + off;
continue;
case Split:
/* continue with the next instruction, save the jump target for later */
if(nready >= MAX)
re1_5_fatal("backtrack overflow");
off = (signed char)*pc++;
ready[nready++] = thread(pc + off, sp, incref(sub));
// pc = pc->x; /* continue current thread */
continue;
case RSplit:
/* continue at the jump target, save the next instruction for later */
if(nready >= MAX)
re1_5_fatal("backtrack overflow");
off = (signed char)*pc++;
ready[nready++] = thread(pc, sp, incref(sub));
pc = pc + off;
continue;
case Save:
/* record the current input position in capture slot `off` */
off = (unsigned char)*pc++;
sub = update(sub, off, sp);
continue;
case Bol:
if(sp != input->begin)
goto Dead;
continue;
case Eol:
if(sp != input->end)
goto Dead;
continue;
default:
re1_5_fatal("backtrack");
}
}
Dead:
/* this alternative failed: release its captures and try the next one */
decref(sub);
}
return 0;
}

View File

@ -0,0 +1,33 @@
#include "re1.5.h"
// Test the input character at `sp` against a character class.
//
// pc points to the "cnt" byte that follows a Class/ClassNot opcode; it is
// followed by `cnt` (low, high) byte pairs, each an inclusive range.
// Returns non-zero when the character is accepted: inside one of the ranges
// for Class, outside all of them for ClassNot.
//
// All bytes are read as unsigned char so that the result does not depend on
// the signedness of plain `char` (a count or a bound >= 0x80 must not
// become negative).
int _re1_5_classmatch(const char *pc, const char *sp)
{
    int is_positive = (pc[-1] == Class);
    const unsigned char *range = (const unsigned char *)pc;
    const unsigned char ch = (unsigned char)*sp;
    int cnt = *range++;
    while (cnt--) {
        if (ch >= range[0] && ch <= range[1]) return is_positive;
        range += 2;
    }
    return !is_positive;
}
// Test the input character at `sp` against a named class.
//
// pc points to the class letter: d/s/w accept digits / whitespace / word
// characters, the upper-case letter accepts the complement.
// Returns 1 when the character is accepted, 0 otherwise.
int _re1_5_namedclassmatch(const char *pc, const char *sp)
{
    const char name = *pc;
    const char ch = *sp;
    int in_class;

    // Fold the class letter to lower case to select the character set
    switch (name | 0x20) {
    case 'd':
        in_class = (ch >= '0' && ch <= '9');
        break;
    case 's':
        in_class = (ch == ' ' || (ch >= '\t' && ch <= '\r'));
        break;
    default: // w
        in_class = ((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z')
                    || (ch >= '0' && ch <= '9') || ch == '_');
        break;
    }

    // Bit 5 distinguishes lower case (positive class) from upper case
    // (negated class)
    int result_if_member = (name >> 5) & 1;
    return in_class ? result_if_member : (result_if_member ^ 1);
}

View File

@ -0,0 +1,39 @@
// Copyright 2014 Paul Sokolovsky.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "re1.5.h"
/*
 * Clear the top bit (0x80) of every opcode byte in `prog`.
 * Operand bytes are skipped, not modified. The pike VM's addthread() sets
 * that bit on an opcode to mark it as "already on list".
 */
void
cleanmarks(ByteProg *prog)
{
char *pc = prog->insts;
char *end = pc + prog->bytelen;
while (pc < end) {
*pc &= 0x7f;
switch (*pc) {
case Class:
case ClassNot:
/* skip the range pairs; the count byte is skipped by the fall-through */
pc += (unsigned char)pc[1] * 2;
/* fall through */
case NamedClass:
case Jmp:
case Split:
case RSplit:
case Save:
case Char:
/* these opcodes carry one operand byte */
pc++;
break;
#ifdef DEBUG
case Bol:
case Eol:
case Any:
case Match:
break;
default:
printf("Unknown instruction 0x%02x pc %ld\n", (unsigned char)*pc, pc - prog->insts);
re1_5_fatal("cleanmarks");
#endif
}
/* step over the opcode byte itself */
pc++;
}
}

View File

@ -0,0 +1,179 @@
// Copyright 2007-2009 Russ Cox. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifdef DEBUG
#include "re1.5.h"
static Inst *pc;
static int count(Regexp*);
static void emit(Regexp*);
/*
 * Compile parse tree `r` into a freshly allocated Prog.
 * The instruction array lives in the same allocation, right after the Prog
 * header, and is terminated by a Match instruction.
 */
Prog*
compile(Regexp *r)
{
	/* one extra slot for the final Match */
	int ninst = count(r) + 1;
	Prog *prog = mal(sizeof *prog + ninst*sizeof prog->start[0]);

	prog->start = (Inst*)(prog+1);

	/* emit() appends through the file-level cursor `pc` */
	pc = prog->start;
	emit(r);
	pc->opcode = Match;
	pc++;

	prog->len = pc - prog->start;
	return prog;
}
// Number of instructions emit() generates for the tree rooted at r:
// the node's own instructions plus those of its operands.
static int
count(Regexp *r)
{
	switch(r->type) {
	default:
		re1_5_fatal("bad count");
		/* falls into Alt, as before, should re1_5_fatal() ever return */
	case Alt:
		return count(r->left) + count(r->right) + 2;
	case Cat:
		return count(r->left) + count(r->right);
	case Lit:
	case Dot:
		return 1;
	case Paren:
	case Star:
		return count(r->left) + 2;
	case Quest:
	case Plus:
		return count(r->left) + 1;
	}
}
/*
 * Append the instructions for the tree rooted at `r` at the file-level
 * cursor `pc`, advancing it. For Split, `x` is the first branch and `y` the
 * second; the non-greedy forms (r->n set) are obtained by swapping them.
 */
static void
emit(Regexp *r)
{
Inst *p1, *p2, *t;
switch(r->type) {
default:
re1_5_fatal("bad emit");
case Alt:
/* split L1, L2; L1: left; jmp L3; L2: right; L3: */
pc->opcode = Split;
p1 = pc++;
p1->x = pc;
emit(r->left);
pc->opcode = Jmp;
p2 = pc++;
p1->y = pc;
emit(r->right);
p2->x = pc;
break;
case Cat:
emit(r->left);
emit(r->right);
break;
case Lit:
pc->opcode = Char;
pc->c = r->ch;
pc++;
break;
case Dot:
pc++->opcode = Any;
break;
case Paren:
/* save 2n; body; save 2n+1 */
pc->opcode = Save;
pc->n = 2*r->n;
pc++;
emit(r->left);
pc->opcode = Save;
pc->n = 2*r->n + 1;
pc++;
break;
case Quest:
/* split L1, L2; L1: body; L2: */
pc->opcode = Split;
p1 = pc++;
p1->x = pc;
emit(r->left);
p1->y = pc;
if(r->n) { // non-greedy
t = p1->x;
p1->x = p1->y;
p1->y = t;
}
break;
case Star:
/* L0: split L1, L2; L1: body; jmp L0; L2: */
pc->opcode = Split;
p1 = pc++;
p1->x = pc;
emit(r->left);
pc->opcode = Jmp;
pc->x = p1;
pc++;
p1->y = pc;
if(r->n) { // non-greedy
t = p1->x;
p1->x = p1->y;
p1->y = t;
}
break;
case Plus:
/* L1: body; split L1, L2; L2: */
p1 = pc;
emit(r->left);
pc->opcode = Split;
pc->x = p1;
p2 = pc;
pc++;
p2->y = pc;
if(r->n) { // non-greedy
t = p2->x;
p2->x = p2->y;
p2->y = t;
}
break;
}
}
/*
 * Print one line per instruction of `p` to stdout; jump targets are shown
 * as instruction indices. Calls re1_5_fatal() on an unknown opcode.
 */
void
printprog(Prog *p)
{
/* local cursor; shadows the file-level `pc` used by emit() */
Inst *pc, *e;
pc = p->start;
e = p->start + p->len;
for(; pc < e; pc++) {
switch(pc->opcode) {
default:
re1_5_fatal("printprog");
case Split:
printf("%2d. split %d, %d\n", (int)(pc-p->start), (int)(pc->x-p->start), (int)(pc->y-p->start));
break;
case Jmp:
printf("%2d. jmp %d\n", (int)(pc-p->start), (int)(pc->x-p->start));
break;
case Char:
printf("%2d. char %c\n", (int)(pc-p->start), pc->c);
break;
case Any:
printf("%2d. any\n", (int)(pc-p->start));
break;
case Match:
printf("%2d. match\n", (int)(pc-p->start));
break;
case Save:
printf("%2d. save %d\n", (int)(pc-p->start), pc->n);
}
}
}
#endif //DEBUG

View File

@ -0,0 +1,256 @@
// Copyright 2014-2019 Paul Sokolovsky.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "re1.5.h"
// Helpers for _compilecode(). They all rely on the local variables `code`
// (NULL during the size-only pass) and `prog` being in scope.
// INSERT_CODE: open a gap of `num` bytes at offset `at` by moving the bytes
// up to `pc`, then advance `pc` by `num`. Only `pc` moves when code is NULL.
#define INSERT_CODE(at, num, pc) \
((code ? memmove(code + at + num, code + at, pc - at) : (void)0), pc += num)
// REL: jump operand for an instruction at `at` targeting `to` (offsets are
// relative to the end of the 2-byte instruction).
#define REL(at, to) (to - at - 2)
// EMIT: store `byte` at offset `at`; `at` is evaluated exactly once either
// way, so EMIT(PC++, x) advances PC during the size-only pass too.
#define EMIT(at, byte) (code ? (code[at] = byte) : (void)(at))
// PC: current write offset, i.e. the number of bytes produced so far.
#define PC (prog->bytelen)
// Compile the regex text at *re_loc into bytecode appended to `prog`.
//
// re_loc:   in: start of the (sub)expression; out: where parsing stopped
//           (the terminating NUL, an unmatched ')' or the offending char).
// prog:     program being built; bytelen/len/sub are updated in both modes.
// sizecode: non-zero for the size-only pass - nothing is written to
//           prog->insts, only the counters advance.
//
// Parsing stops at the end of the string or at a ')' so that the function
// can call itself for a parenthesised group.
// Returns RE1_5_SUCCESS, or a negative RE1_5_* error code.
//
// Jump operands are single bytes (see REL), which limits how far apart a
// jump and its target can be.
static int _compilecode(const char **re_loc, ByteProg *prog, int sizecode)
{
    const char *re = *re_loc;
    char *code = sizecode ? NULL : prog->insts;
    int start = PC;     // start of this (sub)expression, for '|'
    int term = PC;      // start of the last term, for ? * +
    int alt_label = 0;  // offset of the pending Jmp operand of the last '|'

    for (; *re && *re != ')'; re++) {
        switch (*re) {
        case '\\': {
            re++;
            if (!*re) goto syntax_error; // Trailing backslash
            char c = *re | 0x20;
            if (c == 'd' || c == 's' || c == 'w') {
                term = PC;
                EMIT(PC++, NamedClass);
                EMIT(PC++, *re);
                prog->len++;
                break;
            }
            if ((c >= '0' && c <= '9') || (c >= 'a' && c <= 'z')) {
                goto unsupported_escape;
            }
        }
        // fall through: any other escaped char is a literal
        default:
            term = PC;
            EMIT(PC++, Char);
            EMIT(PC++, *re);
            prog->len++;
            break;
        case '.':
            term = PC;
            EMIT(PC++, Any);
            prog->len++;
            break;
        case '[': {
            int cnt;
            term = PC;
            re++;
            if (*re == '^') {
                EMIT(PC++, ClassNot);
                re++;
            } else {
                EMIT(PC++, Class);
            }
            PC++; // Skip "# of pairs" byte
            prog->len++;
            for (cnt = 0; *re != ']'; re++, cnt++) {
                if (!*re) goto syntax_error;
                if (*re == '\\') {
                    re++;
                    if (!*re) goto syntax_error;
                    if (*re != '\\' && *re != ']') goto unsupported_escape;
                }
                // Each entry is a (low, high) pair; a single char is stored
                // as the range c-c
                EMIT(PC++, *re);
                if (re[1] == '-' && re[2] != ']') {
                    re += 2;
                    // "[a-" with nothing after the dash: stop here, or the
                    // loop increment would step past the terminating NUL
                    if (!*re) goto syntax_error;
                }
                EMIT(PC++, *re);
            }
            EMIT(term + 1, cnt);
            break;
        }
        case '(': {
            term = PC;
            int sub;
            int capture = 1;
            re++;
            if (*re == '?') {
                re++;
                if (*re == ':') {
                    // (?:...) non-capturing group
                    capture = 0;
                    re++;
                } else {
                    *re_loc = re;
                    return RE1_5_UNSUPPORTED_SYNTAX;
                }
            }

            if (capture) {
                sub = ++prog->sub;
                EMIT(PC++, Save);
                EMIT(PC++, 2 * sub);
                prog->len++;
            }

            int res = _compilecode(&re, prog, sizecode);
            *re_loc = re;
            if (res < 0) return res;
            if (*re != ')') return RE1_5_SYNTAX_ERROR;

            if (capture) {
                EMIT(PC++, Save);
                EMIT(PC++, 2 * sub + 1);
                prog->len++;
            }

            break;
        }
        case '{':
            *re_loc = re;
            return RE1_5_UNSUPPORTED_SYNTAX;
        case '?':
            if (PC == term) goto syntax_error; // nothing to repeat
            // Insert a split in front of the term: try it or skip it
            INSERT_CODE(term, 2, PC);
            if (re[1] == '?') {
                EMIT(term, RSplit);
                re++;
            } else {
                EMIT(term, Split);
            }
            EMIT(term + 1, REL(term, PC));
            prog->len++;
            term = PC;
            break;
        case '*':
            if (PC == term) goto syntax_error; // nothing to repeat
            // Split in front of the term, jump back to the split after it
            INSERT_CODE(term, 2, PC);
            EMIT(PC, Jmp);
            EMIT(PC + 1, REL(PC, term));
            PC += 2;
            if (re[1] == '?') {
                EMIT(term, RSplit);
                re++;
            } else {
                EMIT(term, Split);
            }
            EMIT(term + 1, REL(term, PC));
            prog->len += 2;
            term = PC;
            break;
        case '+':
            if (PC == term) goto syntax_error; // nothing to repeat
            // Split after the term that loops back to its start
            if (re[1] == '?') {
                EMIT(PC, Split);
                re++;
            } else {
                EMIT(PC, RSplit);
            }
            EMIT(PC + 1, REL(PC, term));
            PC += 2;
            prog->len++;
            term = PC;
            break;
        case '|':
            // Patch the jump of the previous alternative, if any
            if (alt_label) {
                EMIT(alt_label, REL(alt_label, PC) + 1);
            }
            INSERT_CODE(start, 2, PC);
            EMIT(PC++, Jmp);
            alt_label = PC++;
            EMIT(start, Split);
            EMIT(start + 1, REL(start, PC));
            prog->len += 2;
            term = PC;
            break;
        case '^':
            EMIT(PC++, Bol);
            prog->len++;
            term = PC;
            break;
        case '$':
            EMIT(PC++, Eol);
            prog->len++;
            term = PC;
            break;
        }
    }

    if (alt_label) {
        EMIT(alt_label, REL(alt_label, PC) + 1);
    }

    *re_loc = re;
    return RE1_5_SUCCESS;

syntax_error:
    *re_loc = re;
    return RE1_5_SYNTAX_ERROR;

unsupported_escape:
    *re_loc = re;
    return RE1_5_UNSUPPORTED_ESCAPE;
}
// Size-only pass over `re`.
// Returns the number of bytecode bytes re1_5_compilecode() will need, or a
// negative RE1_5_* error code if the regex is not accepted.
int re1_5_sizecode(const char *re)
{
    // Fixed overhead: Save 0, Save 1, Match; more bytes for "search"
    // (vs "match") prefix code
    ByteProg sizer = { .bytelen = 5 + NON_ANCHORED_PREFIX };

    int status = _compilecode(&re, &sizer, /*sizecode*/1);
    if (status < 0) {
        return status;
    }
    // The parser must have consumed the whole string
    return *re ? RE1_5_SYNTAX_ERROR : sizer.bytelen;
}
// Compile `re` into `prog`, whose instruction area must be large enough for
// the size reported by re1_5_sizecode().
// Layout: search prefix, Save 0, <regex body>, Save 1, Match.
// Returns RE1_5_SUCCESS or a negative RE1_5_* error code.
int re1_5_compilecode(ByteProg *prog, const char *re)
{
    // Code implementing non-anchored operation ("search"): optionally
    // consume any char and retry. For anchored operation ("match"), this
    // code will be just skipped.
    // TODO: Implement search in much more efficient manner
    static const signed char search_prefix[] = { RSplit, 3, Any, Jmp, -5 };

    prog->sub = 0;
    prog->bytelen = 0;
    prog->len = 0;

    for (unsigned i = 0; i < sizeof search_prefix; i++) {
        prog->insts[prog->bytelen++] = search_prefix[i];
    }
    prog->len += 3;

    // Capture slot 0: start of the whole match
    prog->insts[prog->bytelen++] = Save;
    prog->insts[prog->bytelen++] = 0;
    prog->len++;

    int status = _compilecode(&re, prog, /*sizecode*/0);
    if (status < 0) {
        return status;
    }
    // If unparsed chars left
    if (*re) {
        return RE1_5_SYNTAX_ERROR;
    }

    // Capture slot 1: end of the whole match
    prog->insts[prog->bytelen++] = Save;
    prog->insts[prog->bytelen++] = 1;
    prog->len++;

    prog->insts[prog->bytelen++] = Match;
    prog->len++;

    return RE1_5_SUCCESS;
}
#if 0
// Disabled manual test driver. It is stale: it calls re1_5_compilecode()
// with a single argument and treats the result as a ByteProg pointer,
// whereas the function above takes (ByteProg *, const char *) and returns
// a status code. Update before re-enabling.
int main(int argc, char *argv[])
{
int pc = 0;
ByteProg *code = re1_5_compilecode(argv[1]);
re1_5_dumpcode(code);
}
#endif

View File

@ -0,0 +1,65 @@
// Copyright 2014 Paul Sokolovsky.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "re1.5.h"
// Print a human-readable listing of the bytecode in `prog` to stdout, one
// instruction per line, followed by the byte and instruction totals.
// Jump targets are shown as absolute byte offsets, with the raw relative
// operand in parentheses.
void re1_5_dumpcode(ByteProg *prog)
{
    int pc = 0;
    char *code = prog->insts;
    while (pc < prog->bytelen) {
        printf("%2d: ", pc);
        switch(code[pc++]) {
        default:
            assert(0);
//            re1_5_fatal("printprog");
        case Split:
            printf("split %d (%d)\n", pc + (signed char)code[pc] + 1, (signed char)code[pc]);
            pc++;
            break;
        case RSplit:
            printf("rsplit %d (%d)\n", pc + (signed char)code[pc] + 1, (signed char)code[pc]);
            pc++;
            break;
        case Jmp:
            printf("jmp %d (%d)\n", pc + (signed char)code[pc] + 1, (signed char)code[pc]);
            pc++;
            break;
        case Char:
            printf("char %c\n", code[pc++]);
            break;
        case Any:
            printf("any\n");
            break;
        case Class:
        case ClassNot: {
            // The pair count and the range bounds are raw bytes: read them
            // as unsigned so that values >= 0x80 are neither turned into a
            // negative count nor printed sign-extended
            int num = (unsigned char)code[pc];
            printf("class%s %d", (code[pc - 1] == ClassNot ? "not" : ""), num);
            pc++;
            while (num--) {
                printf(" 0x%02x-0x%02x", (unsigned)(unsigned char)code[pc],
                       (unsigned)(unsigned char)code[pc + 1]);
                pc += 2;
            }
            printf("\n");
            break;
        }
        case NamedClass:
            printf("namedclass %c\n", code[pc++]);
            break;
        case Match:
            printf("match\n");
            break;
        case Save:
            printf("save %d\n", (unsigned char)code[pc++]);
            break;
        case Bol:
            printf("assert bol\n");
            break;
        case Eol:
            printf("assert eol\n");
            break;
        }
    }
    printf("Bytes: %d, insts: %d\n", prog->bytelen, prog->len);
}

View File

@ -0,0 +1,13 @@
{
"name": "re1.5",
"keywords": "esp32, re",
"description": "Regex",
"version": "0.9",
"repository":
{
"type": "git",
"url": "https://github.com/pfalcon/re1.5"
},
"frameworks": "*",
"platforms": "*"
}

150
lib/libesp32/re1.5/main.c Normal file
View File

@ -0,0 +1,150 @@
// Copyright 2007-2009 Russ Cox. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "re1.5.h"
// Table of the available matching engines: the name accepted by the -e
// option and the matcher entry point. All entry points share the signature
// (program, subject, capture array, capture array size, is_anchored).
struct {
char *name;
int (*fn)(ByteProg*, Subject*, const char**, int, int);
} tab[] = {
{"recursive", re1_5_recursiveprog},
{"recursiveloop", re1_5_recursiveloopprog},
{"backtrack", re1_5_backtrack},
{"thompson", re1_5_thompsonvm},
{"pike", re1_5_pikevm},
};
#ifdef DEBUG
int debug;
#endif
const char *re_engine;
/* Print the command-line help to stderr and terminate with exit status 2. */
void
usage(void)
{
	fputs("Usage: re [-hmd] [-e ENGINE] <regexp> <string>...\n"
	      "-h: Print help message and exit\n"
	      "-m: String is anchored\n"
	      "-e ENGINE: Specify one of: recursive recursiveloop backtrack thompson pike\n",
	      stderr);
#ifdef DEBUG
	fputs("-d: Print debug messages\n", stderr);
#endif
	exit(2);
}
/*
 * Command-line test driver: compiles argv's regex once, then runs every
 * engine of `tab` (or only the one selected with -e) on each subject
 * string and prints the capture offsets.
 */
int
main(int argc, char **argv)
{
	int i, j, k, l;
	int is_anchored = 0;

	argv++;
	argc--;
	while (argc > 0 && argv[0][0] == '-') {
		char *arg;
		for (arg = &argv[0][1]; *arg; arg++) {
			switch (*arg) {
			case 'h':
				usage();
				break;
			case 'm':
				is_anchored = 1;
				break;
#ifdef DEBUG
			case 'd':
				debug = 1;
				break;
#endif
			case 'e':
				if (argv[1] == NULL)
					re1_5_fatal("-e: Missing Regex engine argument");
				if (re_engine)
					re1_5_fatal("-e: Regex engine already specified");
				re_engine = argv[1];
				/* the engine name is consumed as a separate argument */
				argv++;
				argc--;
				break;
			default:
				re1_5_fatal("Unknown flag");
			}
		}
		argv++;
		argc--;
	}

	if(argc < 2)
		usage();

#ifdef ODEBUG
	// Old and unmaintained code
	Regexp *re = parse(argv[0]);
	printre(re);
	printf("\n");

	Prog *prog = compile(re);
	printprog(prog);
	printf("=============\n");
#endif
	int sz = re1_5_sizecode(argv[0]);
#ifdef DEBUG
	if (debug) printf("Precalculated size: %d\n", sz);
#endif
	/* every negative value is an error code, not only -1 */
	if (sz < 0) {
		re1_5_fatal("Error in regexp");
	}

	ByteProg *code = malloc(sizeof(ByteProg) + sz);
	if (code == NULL) {
		re1_5_fatal("Out of memory");
	}
	int ret = re1_5_compilecode(code, argv[0]);
	if (ret != 0) {
		re1_5_fatal("Error in regexp");
	}

	/* (start, end) for the whole match and for each group */
	int sub_els = (code->sub + 1) * 2;
#ifdef DEBUG
	if (debug) re1_5_dumpcode(code);
#endif
	const char *sub[sub_els];
	int engine_found = 0;
	for(i=1; i<argc; i++) {
		printf("#%d %s\n", i, argv[i]);
		for(j=0; j<nelem(tab); j++) {
			Subject subj = {argv[i], argv[i] + strlen(argv[i])};
			if (re_engine) {
				if (0 != strcmp(re_engine, tab[j].name))
					continue;
				engine_found = 1;
			}
			printf("%s ", tab[j].name);
			memset(sub, 0, sub_els * sizeof sub[0]);
			if(!tab[j].fn(code, &subj, sub, sub_els, is_anchored)) {
				printf("-no match-\n");
				continue;
			}
			printf("match");
			/* skip trailing groups that were not set */
			for(k=sub_els; k>0; k--)
				if(sub[k-1])
					break;
			for(l=0; l<k; l+=2) {
				printf(" (");
				if(sub[l] == nil)
					printf("?");
				else
					printf("%d", (int)(sub[l] - argv[i]));
				printf(",");
				if(sub[l+1] == nil)
					printf("?");
				else
					printf("%d", (int)(sub[l+1] - argv[i]));
				printf(")");
			}
			printf("\n");
		}
		if (re_engine && !engine_found)
			re1_5_fatal("-e: Unknown engine name");
	}

	free(code);
	return 0;
}

229
lib/libesp32/re1.5/parse.y Normal file
View File

@ -0,0 +1,229 @@
// Copyright 2007-2009 Russ Cox. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
%{
#include "re1.5.h"
static int yylex(void);
static void yyerror(char*);
static Regexp *parsed_regexp;
static int nparen;
%}
%union {
Regexp *re;
int c;
int nparen;
}
%token <c> CHAR EOL
%type <re> alt concat repeat single line
%type <nparen> count
%%
/* A complete regex: store the tree in parsed_regexp and stop parsing. */
line: alt EOL
{
parsed_regexp = $1;
return 1;
}
/* Alternation: concat ('|' concat)* */
alt:
concat
| alt '|' concat
{
$$ = reg(Alt, $1, $3);
}
;
/* Concatenation: one or more repeats in sequence */
concat:
repeat
| concat repeat
{
$$ = reg(Cat, $1, $2);
}
;
/* Optional repetition operator; a trailing '?' sets n = 1 (non-greedy) */
repeat:
single
| single '*'
{
$$ = reg(Star, $1, nil);
}
| single '*' '?'
{
$$ = reg(Star, $1, nil);
$$->n = 1;
}
| single '+'
{
$$ = reg(Plus, $1, nil);
}
| single '+' '?'
{
$$ = reg(Plus, $1, nil);
$$->n = 1;
}
| single '?'
{
$$ = reg(Quest, $1, nil);
}
| single '?' '?'
{
$$ = reg(Quest, $1, nil);
$$->n = 1;
}
;
/* Empty rule used to number a capturing group when its '(' is read */
count:
{
$$ = ++nparen;
}
;
/* Atom: capturing group, non-capturing group, literal char or '.' */
single:
'(' count alt ')'
{
$$ = reg(Paren, $3, nil);
$$->n = $2;
}
| '(' '?' ':' alt ')'
{
$$ = $4;
}
| CHAR
{
$$ = reg(Lit, nil, nil);
$$->ch = $1;
}
| '.'
{
$$ = reg(Dot, nil, nil);
}
;
%%
static char *input;
static Regexp *parsed_regexp;
static int nparen;
int gen;
/*
 * Lexer: hands out the regex text one character at a time.
 * Operator characters are returned as themselves, anything else as CHAR
 * with the character in yylval.c; EOL marks the end of the input.
 */
static int
yylex(void)
{
	if(input == NULL || *input == 0)
		return EOL;

	int ch = *input++;
	if(strchr("|*+?():.", ch) != NULL)
		return ch;

	yylval.c = ch;
	return CHAR;
}
/* Parser error hook: forwards the message to re1_5_fatal(). */
static void
yyerror(char *s)
{
re1_5_fatal(s);
}
/*
 * Parse regex text `s` into a tree.
 * The result is wrapped in a Paren node with n == 0, i.e. capture group 0
 * covering the whole expression. Errors are reported through yyerror().
 *
 * The tree is returned as is: no leading non-greedy ".*" is prepended for
 * unanchored matching (the statements that did so were unreachable and
 * have been removed).
 */
Regexp*
parse(char *s)
{
	input = s;
	parsed_regexp = nil;
	nparen = 0;
	if(yyparse() != 1)
		yyerror("did not parse");
	if(parsed_regexp == nil)
		yyerror("parser nil");

	return reg(Paren, parsed_regexp, nil);	// $0 parens
}
/*
 * Allocate a parse-tree node of the given type with the given operands.
 * The remaining fields are left as mal() returns them.
 */
Regexp*
reg(int type, Regexp *left, Regexp *right)
{
	Regexp *node = mal(sizeof *node);

	node->right = right;
	node->left = left;
	node->type = type;
	return node;
}
/*
 * Print the tree rooted at `r` to stdout in prefix form, e.g.
 * Cat(Lit(a), NgStar(Dot)). Non-greedy repetitions get an "Ng" prefix,
 * unknown node types print as "???".
 */
void
printre(Regexp *r)
{
	const char *binary = nil;	/* opening text of a two-operand node */
	const char *unary = nil;	/* opening text of a repetition node */

	switch(r->type) {
	case Alt:
		binary = "Alt(";
		break;
	case Cat:
		binary = "Cat(";
		break;
	case Star:
		unary = "Star(";
		break;
	case Plus:
		unary = "Plus(";
		break;
	case Quest:
		unary = "Quest(";
		break;
	case Lit:
		printf("Lit(%c)", r->ch);
		return;
	case Dot:
		printf("Dot");
		return;
	case Paren:
		printf("Paren(%d, ", r->n);
		printre(r->left);
		printf(")");
		return;
	default:
		printf("???");
		return;
	}

	if(binary != nil) {
		printf("%s", binary);
		printre(r->left);
		printf(", ");
		printre(r->right);
		printf(")");
		return;
	}

	/* repetition: n != 0 marks the non-greedy variant */
	if(r->n)
		printf("Ng");
	printf("%s", unary);
	printre(r->left);
	printf(")");
}

176
lib/libesp32/re1.5/pike.c Normal file
View File

@ -0,0 +1,176 @@
// Copyright 2007-2009 Russ Cox. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "re1.5.h"
typedef struct Thread Thread;
/* One VM thread: the instruction to execute next and its capture state. */
struct Thread
{
char *pc;
Sub *sub;
};
typedef struct ThreadList ThreadList;
/* List of threads: n entries are in use in t[]. threadlist() allocates
 * room for additional entries past the declared single element. */
struct ThreadList
{
int n;
Thread t[1];
};
/* Bundle an instruction pointer and a capture state into a Thread value. */
static Thread
thread(char *pc, Sub *sub)
{
	Thread state;

	state.pc = pc;
	state.sub = sub;
	return state;
}
/* Allocate a ThreadList with room for n threads on top of the one that is
 * part of the struct itself. */
static ThreadList*
threadlist(int n)
{
	ThreadList *list = mal(sizeof(ThreadList) + n*sizeof(Thread));

	return list;
}
/*
 * Add thread `t` to list `l` for input position `sp`.
 * Instructions that do not consume input (Jmp, Split, RSplit, Save, Bol,
 * Eol) are followed immediately and recursively, so only the remaining
 * opcodes end up on the list. The top bit of an opcode byte marks it as
 * already visited; cleanmarks() clears those marks.
 */
static void
addthread(ThreadList *l, Thread t, Subject *input, const char *sp)
{
int off;
if(*t.pc & 0x80) {
decref(t.sub);
return; // already on list
}
*t.pc |= 0x80;
switch(*t.pc & 0x7f) {
default:
l->t[l->n] = t;
l->n++;
break;
case Jmp:
off = (signed char)t.pc[1];
t.pc += 2;
addthread(l, thread(t.pc + off, t.sub), input, sp);
break;
case Split:
/* fall-through branch first, jump target second */
off = (signed char)t.pc[1];
t.pc += 2;
addthread(l, thread(t.pc, incref(t.sub)), input, sp);
addthread(l, thread(t.pc + off, t.sub), input, sp);
break;
case RSplit:
/* jump target first, fall-through branch second */
off = (signed char)t.pc[1];
t.pc += 2;
addthread(l, thread(t.pc + off, incref(t.sub)), input, sp);
addthread(l, thread(t.pc, t.sub), input, sp);
break;
case Save:
off = (unsigned char)t.pc[1];
t.pc += 2;
addthread(l, thread(t.pc, update(t.sub, off, sp)), input, sp);
break;
case Bol:
/* NOTE(review): when the assertion fails the thread is dropped without a
 * decref(t.sub) - confirm whether that reference is meant to be released */
if(sp == input->begin)
addthread(l, thread(t.pc + 1, t.sub), input, sp);
break;
case Eol:
if(sp == input->end)
addthread(l, thread(t.pc + 1, t.sub), input, sp);
break;
}
}
/*
 * Pike VM matcher: advances all live threads in lock step, one input
 * character at a time, using two thread lists (current and next).
 *
 * subp/nsubp: output array of capture pointers; cleared on entry and
 *             filled from the winning thread on success.
 * is_anchored: forwarded to HANDLE_ANCHORED() to select the entry point.
 * Returns 1 on match, 0 otherwise.
 *
 * NOTE(review): clist and nlist are obtained from threadlist() and are not
 * released anywhere in this function - confirm whether mal() memory is
 * expected to be freed by the caller or not at all.
 */
int
re1_5_pikevm(ByteProg *prog, Subject *input, const char **subp, int nsubp, int is_anchored)
{
int i, len;
ThreadList *clist, *nlist, *tmp;
char *pc;
const char *sp;
Sub *sub, *matched;
matched = nil;
for(i=0; i<nsubp; i++)
subp[i] = nil;
sub = newsub(nsubp);
for(i=0; i<nsubp; i++)
sub->sub[i] = nil;
/* each list is sized by the instruction count of the program */
len = prog->len;
clist = threadlist(len);
nlist = threadlist(len);
cleanmarks(prog);
addthread(clist, thread(HANDLE_ANCHORED(prog->insts, is_anchored), sub), input, input->begin);
matched = 0;
for(sp=input->begin;; sp++) {
/* no live thread left: nothing more can match */
if(clist->n == 0)
break;
// printf("%d(%02x).", (int)(sp - input->begin), *sp & 0xFF);
cleanmarks(prog);
for(i=0; i<clist->n; i++) {
pc = clist->t[i].pc;
sub = clist->t[i].sub;
// printf(" %d", (int)(pc - prog->insts));
if (inst_is_consumer(*pc & 0x7f)) {
// If we need to match a character, but there's none left,
// it's fail (we don't schedule current thread for continuation)
if(sp >= input->end) {
decref(sub);
continue;
}
}
switch(*pc++ & 0x7f) {
case Char:
if(*sp != *pc++) {
decref(sub);
break;
}
/* fall through: a matching Char continues like Any */
case Any:
addthread:
addthread(nlist, thread(pc, sub), input, sp+1);
break;
case Class:
case ClassNot:
if (!_re1_5_classmatch(pc, sp)) {
decref(sub);
break;
}
/* skip the count byte and the count * 2 range bytes */
pc += *(unsigned char*)pc * 2 + 1;
goto addthread;
case NamedClass:
if (!_re1_5_namedclassmatch(pc, sp)) {
decref(sub);
break;
}
pc++;
goto addthread;
case Match:
/* remember this match and drop the threads after it in the list */
if(matched)
decref(matched);
matched = sub;
for(i++; i < clist->n; i++)
decref(clist->t[i].sub);
goto BreakFor;
// Jmp, Split, Save handled in addthread, so that
// machine execution matches what a backtracker would do.
// This is discussed (but not shown as code) in
// Regular Expression Matching: the Virtual Machine Approach.
}
}
BreakFor:
// printf("\n");
/* the next list becomes current; the old current list is reused, emptied */
tmp = clist;
clist = nlist;
nlist = tmp;
nlist->n = 0;
//if(*sp == '\0')
// break;
}
if(matched) {
for(i=0; i<nsubp; i++)
subp[i] = matched->sub[i];
decref(matched);
return 1;
}
return 0;
}

162
lib/libesp32/re1.5/re1.5.h Normal file
View File

@ -0,0 +1,162 @@
// Copyright 2007-2009 Russ Cox. All Rights Reserved.
// Copyright 2014-2019 Paul Sokolovsky.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef _RE1_5_REGEXP__H
#define _RE1_5_REGEXP__H
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdarg.h>
#include <assert.h>
// Null pointer constant used throughout the regexp sources.
#define nil ((void*)0)
// Element count of a true array (not meaningful on a pointer).
#define nelem(x) (sizeof(x)/sizeof((x)[0]))
// Forward typedefs so the struct definitions below can refer to one
// another (and to themselves) by their short names.
typedef struct Regexp Regexp;
typedef struct Prog Prog;
typedef struct ByteProg ByteProg;
typedef struct Inst Inst;
typedef struct Subject Subject;
// Regular-expression node as returned by parse()/reg() declared below.
// NOTE(review): nothing in this part of the sources reads these fields;
// the per-field notes are inferred from names only and need confirming.
struct Regexp
{
int type;	// presumably one of the Regexp.type enum values below
int n;	// NOTE(review): meaning not visible here — confirm
int ch;	// presumably the character of a literal node — confirm
Regexp *left;	// presumably first (or only) operand — confirm
Regexp *right;	// presumably second operand — confirm
};
// Values for Regexp.type. Numbering starts at 1, so 0 is not a valid kind.
enum /* Regexp.type */
{
Alt = 1,
Cat,
Lit,
Dot,
Paren,
Quest,
Star,
Plus,
};
// Regexp construction / debugging helpers (defined outside this header).
Regexp *parse(char*);
Regexp *reg(int type, Regexp *left, Regexp *right);
void printre(Regexp*);
// Error hook. The prototype is skipped when the build already provides
// re1_5_fatal as a macro.
#ifndef re1_5_fatal
void re1_5_fatal(char*);
#endif
// Hook invoked by the recursive matchers on entry; expands to nothing
// unless the build defines it.
#ifndef re1_5_stack_chk
#define re1_5_stack_chk()
#endif
// Allocation helper used by the matchers for thread lists and Sub objects.
void *mal(int);
// Program expressed as Inst structures (see compile()/printprog() below).
// NOTE(review): presumably `start` is the first instruction and `len` the
// instruction count — not used in the visible sources, confirm.
struct Prog
{
Inst *start;
int len;
};
// Compiled program in bytecode form, as consumed by the re1_5_* engines.
struct ByteProg
{
int bytelen;	// presumably the size of insts[] in bytes — confirm
int len;	// the Pike/Thompson VMs size their thread lists from this
int sub;	// presumably the number of capture slots used — confirm
char insts[0];	// bytecode starts here (zero-length trailing array)
};
// One instruction of a Prog.
// NOTE(review): only `opcode` is tied to something visible here (the
// Inst.opcode enum below); the other fields are not used in the visible
// sources, so their roles are left undocumented — confirm before relying
// on them.
struct Inst
{
int opcode;	// one of the Inst.opcode values below
int c;
int n;
Inst *x;
Inst *y;
int gen; // global state, oooh!
};
// Bytecode opcodes. The values are grouped into ranges so that a whole
// class of instructions can be recognised with a compare or a mask.
// All values fit in 7 bits; the matchers use bit 0x80 of an opcode byte
// as a temporary "visited" mark.
enum /* Inst.opcode */
{
    // Instructions which consume input bytes (and thus fail if none left)
    CONSUMERS = 1,
    Char = CONSUMERS,
    Any,
    Class,
    ClassNot,
    NamedClass,

    // Zero-width assertions on the subject position
    ASSERTS = 0x50,
    Bol = ASSERTS,
    Eol,

    // Instructions which take relative offset as arg
    JUMPS = 0x60,
    Jmp = JUMPS,
    Split,
    RSplit,

    // Other (special) instructions
    Save = 0x7e,
    Match = 0x7f,
};

// Non-zero for every opcode below ASSERTS, i.e. the consuming group.
#define inst_is_consumer(inst) ((inst) < ASSERTS)
// Non-zero for opcodes whose high nibble is JUMPS (0x60..0x6f).
// The mask needs its own parentheses: `==` binds tighter than `&`, so
// without them the test reduces to `(inst) & 0` and is never true.
#define inst_is_jump(inst) (((inst) & 0x70) == JUMPS)
// Regexp -> Prog compiler and its debug printer (defined elsewhere).
Prog *compile(Regexp*);
void printprog(Prog*);
// NOTE(review): presumably the generation counter compared against
// Inst.gen — confirm.
extern int gen;
// Capacity of Sub.sub[]: the number of capture pointers one Sub can hold.
enum {
MAXSUB = 20
};
// Reference-counted set of capture pointers shared between threads.
typedef struct Sub Sub;
struct Sub
{
int ref;	// reference count; managed by newsub()/incref()/decref()
int nsub;	// number of slots in use, as passed to newsub()
const char *sub[MAXSUB];	// capture pointers; sub[0] doubles as the free-list link once released
};
// Sub lifecycle helpers (sub.c).
Sub *newsub(int n);	// obtain a Sub with ref == 1 and nsub == n
Sub *incref(Sub*);	// add a reference, returns its argument
Sub *copy(Sub*);	// NOTE(review): no definition in the visible sub.c — confirm it exists
Sub *update(Sub*, int, const char*);	// set one slot, copying first if the Sub is shared
void decref(Sub*);	// drop a reference; at zero the Sub goes onto the free list
// Text being matched. The matchers treat a position >= end as "no input
// left", so `end` is the first position past the subject.
struct Subject {
const char *begin;	// first character of the subject
const char *end;	// where the subject stops (exclusive)
};
// Number of leading bytecode bytes skipped for an anchored run.
// NOTE(review): presumably the length of the compiled "skip any prefix"
// preamble — confirm against the bytecode compiler.
#define NON_ANCHORED_PREFIX 5
// Entry point into `bytecode`: past the prefix when is_anchored is
// non-zero, the very first byte otherwise.
#define HANDLE_ANCHORED(bytecode, is_anchored) ((is_anchored) ? (bytecode) + NON_ANCHORED_PREFIX : (bytecode))
// Matching engines. All take (program, subject, capture array, number of
// entries in the capture array, is_anchored).
int re1_5_backtrack(ByteProg*, Subject*, const char**, int, int);
int re1_5_pikevm(ByteProg*, Subject*, const char**, int, int);
int re1_5_recursiveloopprog(ByteProg*, Subject*, const char**, int, int);
int re1_5_recursiveprog(ByteProg*, Subject*, const char**, int, int);
int re1_5_thompsonvm(ByteProg*, Subject*, const char**, int, int);
// Return codes for re1_5_sizecode() and re1_5_compilecode()
// (success is 0, every error is negative)
enum {
RE1_5_SUCCESS = 0,
RE1_5_SYNTAX_ERROR = -2,
RE1_5_UNSUPPORTED_ESCAPE = -3,
RE1_5_UNSUPPORTED_SYNTAX = -4,
};
// Bytecode compiler entry points and debug dump (defined elsewhere).
int re1_5_sizecode(const char *re);
int re1_5_compilecode(ByteProg *prog, const char *re);
void re1_5_dumpcode(ByteProg *prog);
// Called by the Pike/Thompson VMs before each step to reset the 0x80
// "visited" marks they set on opcode bytes.
void cleanmarks(ByteProg *prog);
// Character-class tests: pc addresses the class operand, sp the subject
// character; a non-zero result means the character is accepted.
int _re1_5_classmatch(const char *pc, const char *sp);
int _re1_5_namedclassmatch(const char *pc, const char *sp);
#endif /*_RE1_5_REGEXP__H*/

View File

@ -0,0 +1,79 @@
// Copyright 2007-2009 Russ Cox. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "re1.5.h"
// Backtracking matcher that recurses for every single instruction.
//
//   pc     next instruction to execute
//   sp     current position in the subject
//   subp   capture array with nsubp entries; a Save writes its slot and
//          restores the previous value if the continuation fails
//
// Returns 1 on match, 0 on failure, -1 after reporting an unknown opcode.
static int
recursive(char *pc, const char *sp, Subject *input, const char **subp, int nsubp)
{
    const char *saved;
    int arg;
    int op = *pc;

    // A consuming instruction cannot succeed once the subject is used up.
    if(inst_is_consumer(op) && sp >= input->end)
        return 0;

    re1_5_stack_chk();
    pc++;	// pc now addresses the operand, if the opcode has one
    switch(op) {
    case Char:
        if(*sp != *pc)
            return 0;
        return recursive(pc + 1, sp + 1, input, subp, nsubp);
    case Any:
        return recursive(pc, sp + 1, input, subp, nsubp);
    case Class:
    case ClassNot:
        if(_re1_5_classmatch(pc, sp)) {
            // operand is a count byte followed by that many byte pairs
            pc += *(unsigned char*)pc * 2 + 1;
            return recursive(pc, sp + 1, input, subp, nsubp);
        }
        return 0;
    case NamedClass:
        if(_re1_5_namedclassmatch(pc, sp))
            return recursive(pc + 1, sp + 1, input, subp, nsubp);
        return 0;
    case Match:
        return 1;
    case Jmp:
        arg = (signed char)*pc++;
        return recursive(pc + arg, sp, input, subp, nsubp);
    case Split:
        // try the fall-through path first, then the jump target
        arg = (signed char)*pc++;
        if(recursive(pc, sp, input, subp, nsubp))
            return 1;
        return recursive(pc + arg, sp, input, subp, nsubp);
    case RSplit:
        // try the jump target first, then the fall-through path
        arg = (signed char)*pc++;
        if(recursive(pc + arg, sp, input, subp, nsubp))
            return 1;
        return recursive(pc, sp, input, subp, nsubp);
    case Save:
        arg = (unsigned char)*pc++;
        if(arg >= nsubp)	// the caller did not ask for this slot
            return recursive(pc, sp, input, subp, nsubp);
        saved = subp[arg];
        subp[arg] = sp;
        if(recursive(pc, sp, input, subp, nsubp))
            return 1;
        subp[arg] = saved;	// undo on backtrack
        return 0;
    case Bol:
        if(sp == input->begin)
            return recursive(pc, sp, input, subp, nsubp);
        return 0;
    case Eol:
        if(sp == input->end)
            return recursive(pc, sp, input, subp, nsubp);
        return 0;
    default:
        break;
    }
    re1_5_fatal("recursive");
    return -1;
}
// Public entry point of the fully recursive backtracker. Matching starts
// at input->begin; entries of subp not reached by a Save are left as the
// caller set them.
int
re1_5_recursiveprog(ByteProg *prog, Subject *input, const char **subp, int nsubp, int is_anchored)
{
    char *entry = HANDLE_ANCHORED(prog->insts, is_anchored);

    return recursive(entry, input->begin, input, subp, nsubp);
}

View File

@ -0,0 +1,86 @@
// Copyright 2007-2009 Russ Cox. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "re1.5.h"
// Backtracking matcher that loops over straight-line instructions and
// recurses only where an alternative (Split/RSplit) or a capture that may
// need undoing (Save) is involved.
//
//   pc     next instruction to execute
//   sp     current position in the subject
//   subp   capture array with nsubp entries
//
// Returns 1 on match, 0 on failure.
static int
recursiveloop(char *pc, const char *sp, Subject *input, const char **subp, int nsubp)
{
    const char *saved;
    int arg;
    int op;

    re1_5_stack_chk();
    while(1) {
        op = *pc;
        // A consuming instruction cannot succeed once the subject is used up.
        if(inst_is_consumer(op) && sp >= input->end)
            return 0;
        pc++;	// pc now addresses the operand, if the opcode has one
        switch(op) {
        case Char:
            if(*sp != *pc)
                return 0;
            pc++;
            sp++;
            continue;
        case Any:
            sp++;
            continue;
        case Class:
        case ClassNot:
            if(!_re1_5_classmatch(pc, sp))
                return 0;
            // operand is a count byte followed by that many byte pairs
            pc += *(unsigned char*)pc * 2 + 1;
            sp++;
            continue;
        case NamedClass:
            if(!_re1_5_namedclassmatch(pc, sp))
                return 0;
            pc++;
            sp++;
            continue;
        case Match:
            return 1;
        case Jmp:
            arg = (signed char)*pc++;
            pc += arg;
            continue;
        case Split:
            // fall-through path first; on failure carry on at the target
            arg = (signed char)*pc++;
            if(recursiveloop(pc, sp, input, subp, nsubp))
                return 1;
            pc += arg;
            continue;
        case RSplit:
            // jump target first; on failure carry on with the fall-through
            arg = (signed char)*pc++;
            if(recursiveloop(pc + arg, sp, input, subp, nsubp))
                return 1;
            continue;
        case Save:
            arg = (unsigned char)*pc++;
            if(arg >= nsubp)	// the caller did not ask for this slot
                continue;
            saved = subp[arg];
            subp[arg] = sp;
            if(recursiveloop(pc, sp, input, subp, nsubp))
                return 1;
            subp[arg] = saved;	// undo on backtrack
            return 0;
        case Bol:
            if(sp != input->begin)
                return 0;
            continue;
        case Eol:
            if(sp != input->end)
                return 0;
            continue;
        default:
            // unknown opcode
            re1_5_fatal("recursiveloop");
            continue;
        }
    }
}
// Public entry point of the loop-based backtracker. Matching starts at
// input->begin; entries of subp not reached by a Save are left as the
// caller set them.
int
re1_5_recursiveloopprog(ByteProg *prog, Subject *input, const char **subp, int nsubp, int is_anchored)
{
    char *entry = HANDLE_ANCHORED(prog->insts, is_anchored);

    return recursiveloop(entry, input->begin, input, subp, nsubp);
}

164
lib/libesp32/re1.5/run-tests Executable file
View File

@ -0,0 +1,164 @@
#! /usr/bin/env python3
# Path of the executable under test; main() runs it once per test case.
RE_EXEC = "./re"
# Each entry is (kind, regex, subject). kind is "match" (main() then passes
# "-m" to RE_EXEC and compares against re.match) or "search" (compared
# against re.search).
test_suite = [
# basics
("search", r"abc", "abcdef"),
("search", r"cde", "abcdef"),
("search", r"abc*", "abdef"),
("search", r"abc*", "abcccdef"),
("search", r"abc+", "abdef"),
("search", r"abc+", "abcccdef"),
# match
("match", r"abc", "abcdef"),
("match", r"abc*", "abdef"),
# search vs match distinction
("match", r"a*", "baa"),
("search", r"a*", "baa"),
# nested group matching
("match", r"(([0-9]*)([a-z]*)[0-9]*)", "1234hello567"),
("match", r"([0-9]*)(([a-z]*)([0-9]*))", "1234hello567"),
# non-capturing groups
("match", r"(([0-9]*)(?:[a-z]*)[0-9]*)", "1234hello568"),
("match", r"(?:[0-9]*)(([a-z]*)(?:[0-9]*))", "1234hello568"),
("match", r"([0-9]*)(?:([a-z]*)(?:[0-9]*))", "1234hello568"),
("match", r"(?:)", "1234hello568"),
("match", r"1?:", "1:"),
# named character classes
("match", r"\d+", "123abc456"),
("match", r"\s+", " \t123abc456"),
("match", r"\w+", "123abc_456 abc"),
("match", r"(\w+)\s+(\w+)", "ABC \t123hello456 abc"),
("match", r"(\S+)\s+(\D+)", "ABC \thello abc456 abc"),
("match", r"(([0-9]*)([a-z]*)\d*)", "123hello456"),
# classes
("match", r"[a]*", "a"),
("search", r"([yab]*)(e*)([cd])", "xyac"),
("search", r"([yab]*)(e*)([^y]?)$", "xyac"),
("match", r"[-]*", "--"),
("match", r"[-a]*", "-a-b"),
("match", r"[-ab]*", "-a-b"),
("match", r"[-a-c]*", "-a-b-d-"),
("match", r"[a-]*", "-a-b"),
("match", r"[ab-]*", "-a-b"),
("match", r"[a-c-]*", "-a-b-d-"),
# escaped metacharacters
("match", r"(\?:)", ":"),
("match", r"\(?:", "(:"),
# non-greedy
("match", r"a(b??)(b*)c", "abbc"),
("match", r"a(b+?)(b*)c", "abbbc"),
("match", r"a(b*?)(b*)c", "abbbbc"),
# greedy
("match", r"a(b?)(b*)c", "abbc"),
("match", r"a(b+)(b*)c", "abbbc"),
("match", r"a(b*)(b*)c", "abbbbc"),
# errors
("search", r"?", ""),
("search", r"*", ""),
("search", r"+", ""),
("search", r"[", ""),
("search", r"(", ""),
("search", r")", ""),
("search", "\\", ""),
("search", "|+", ""),
("search", "|*", ""),
("search", "|?", ""),
("search", "^*", ""),
("search", "$*", ""),
("search", "a*+", ""),
("search", "a*?", ""),
("search", "a**", ""),
]
import re
import sre_constants
import subprocess
from collections import OrderedDict
def parse_result(string, res):
    """Decode one engine's output line.

    ``res`` is a bytes line of the form ``b"<engine> <outcome>"`` and
    ``string`` is the subject that was matched.

    Returns ``(engine_name, result)`` where ``result`` is
    ``None`` for ``-no match-``, ``b"REGEX ERROR"`` for a regex error, or
    a tuple with one subject slice per reported ``(start,end)`` pair.
    """
    name, rest = res.split(b" ", 1)
    if rest == b"-no match-":
        return name, None
    if rest == b"REGEX ERROR":
        return name, rest
    assert rest.startswith(b"match ")
    rest = rest[6:]
    # NOTE(review): eval() runs on text printed by RE_EXEC. That is only
    # acceptable while the binary is a trusted local build; consider
    # ast.literal_eval if that ever stops being true.
    tuples = [eval(t) for t in rest.split()]
    matches = tuple(string[t[0]:t[1]] for t in tuples)
    return name, matches
def fit_str(string, width):
    """Return ``string`` unchanged if it fits in ``width`` characters,
    otherwise its first ``width - 2`` characters followed by ``".."``."""
    if len(string) <= width:
        return string
    else:
        return string[:width - 2] + ".."
def main():
    """Run every entry of ``test_suite`` through Python's ``re`` and through
    RE_EXEC, print pass/FAIL per engine and a per-engine summary."""
    engine_stats = OrderedDict()
    for kind, regex, string in test_suite:
        # run Python re to get correct result
        try:
            if kind == "match":
                py_res = re.match(regex, string)
            else:
                py_res = re.search(regex, string)
            if py_res is not None:
                py_res = (py_res.group(0),) + py_res.groups()
        except re.error:
            # re.error is the public name of the exception; the
            # sre_constants module is deprecated.
            py_res = b"REGEX ERROR"

        # run our code
        try:
            args = (["-m"] if kind == "match" else []) + [regex, string]
            output = subprocess.check_output([RE_EXEC]+args, stderr=subprocess.STDOUT)
            engine_lines = output.split(b'\n')[1:-1]  # split lines, remove first and last
        except subprocess.CalledProcessError as e:
            if e.returncode == 2 and e.output == b"fatal error: Error in regexp\n":
                engine_lines = [b"recursive REGEX ERROR", b"recursiveloop REGEX ERROR", b"backtrack REGEX ERROR", b"thompson REGEX ERROR", b"pike REGEX ERROR"]
            else:
                raise

        # check result of each engine
        for engine in engine_lines:
            engine_name, re_res = parse_result(string, engine)
            try:
                stats = engine_stats[engine_name]
            except KeyError:
                engine_stats[engine_name] = stats = [0, 0]

            # Thompson algo offers just boolean match/no match status
            py_res_cur = py_res
            re_res_cur = re_res
            if engine_name == b"thompson":
                if py_res is not None:
                    py_res_cur = True
                if re_res is not None:
                    re_res_cur = True

            if py_res_cur == re_res_cur:
                print("pass ", end="")
                stats[0] += 1
            else:
                print("FAIL ", end="")
                stats[1] += 1
            print("%s %-25s %-20s" % (kind[0], fit_str(regex, 25), fit_str(string, 20)))

    print("Ran %d tests, results:" % len(test_suite))
    for name, stats in engine_stats.items():
        print("%15s %2d pass %2d fail" % (str(name, encoding='utf8'), stats[0], stats[1]))
# Run the suite only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

55
lib/libesp32/re1.5/sub.c Normal file
View File

@ -0,0 +1,55 @@
// Copyright 2007-2009 Russ Cox. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "re1.5.h"
// Head of the free list of released Sub objects. Entries are chained
// through sub[0]: decref() pushes, newsub() pops.
Sub *freesub;
Sub*
newsub(int n)
{
Sub *s;
s = freesub;
if(s != nil)
freesub = (Sub*)s->sub[0];
else
s = mal(sizeof *s);
s->nsub = n;
s->ref = 1;
return s;
}
// Take an additional reference on s and hand s back, so the call can be
// used inline as an argument.
Sub*
incref(Sub *s)
{
    s->ref += 1;
    return s;
}
// Record p in capture slot i and return the Sub that now holds it.
//
// The caller gives up its reference to s and owns the returned Sub
// instead. If s is shared (ref > 1) the captures are first copied into a
// fresh Sub so other holders are unaffected; otherwise s is modified in
// place and returned.
//
// A slot index outside the capture set (negative, >= s->nsub, or beyond
// the MAXSUB capacity of sub[]) is ignored and s is returned untouched.
// Without this check a pattern with more groups than the Sub can hold
// would write past the end of sub[].
Sub*
update(Sub *s, int i, const char *p)
{
    Sub *s1;
    int j;

    if(i < 0 || i >= s->nsub || i >= MAXSUB)
        return s;

    if(s->ref > 1) {
        s1 = newsub(s->nsub);
        for(j=0; j<s->nsub; j++)
            s1->sub[j] = s->sub[j];
        s->ref--;
        s = s1;
    }
    s->sub[i] = p;
    return s;
}
// Give up one reference to s. When the last one goes, s is pushed onto
// the free list (linked through sub[0]) for newsub() to reuse; the memory
// itself is not returned to the allocator.
void
decref(Sub *s)
{
    s->ref--;
    if(s->ref != 0)
        return;

    s->sub[0] = (char*)freesub;
    freesub = s;
}

View File

@ -0,0 +1,152 @@
// Copyright 2007-2009 Russ Cox. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "re1.5.h"
// One thread of the Thompson VM: just a position in the bytecode
// (no capture information is carried per thread).
typedef struct Thread Thread;
struct Thread
{
char *pc;	// instruction this thread will execute next
};
// Growable-by-allocation list of threads; threadlist() allocates the
// extra room behind the single declared element.
typedef struct ThreadList ThreadList;
struct ThreadList
{
int n;	// number of threads currently stored in t[]
Thread t[1];	// first slot; further slots follow in the same allocation
};
static Thread
thread(char *pc)
{
Thread t = {pc};
return t;
}
// Allocate an empty thread list. ThreadList already embeds one Thread
// slot, so the block has room for n more on top of that one.
// mal() returns zero-filled memory, hence the list starts with n == 0.
static ThreadList*
threadlist(int n)
{
    ThreadList *list;

    list = mal(sizeof(ThreadList) + n * sizeof(Thread));
    return list;
}
// Append thread t to list l and then follow every non-consuming
// instruction reachable from it (Jmp, Split, RSplit, Save, Bol, Eol),
// appending those targets too. Every visited instruction is stored on the
// list, including the non-consuming ones.
//
// The 0x80 bit of an opcode byte marks "already visited during this step";
// the caller clears the marks with cleanmarks() between steps.
static void
addthread(ThreadList *l, Thread t, Subject *input, const char *sp)
{
    char *next;
    int rel;

    if(*t.pc & 0x80)
        return;	// already on list
    *t.pc |= 0x80;
    l->t[l->n++] = t;

    switch(*t.pc & 0x7f) {
    case Jmp:
        rel = (signed char)t.pc[1];
        next = t.pc + 2;
        addthread(l, thread(next + rel), input, sp);
        break;
    case Split:
        // fall-through path first, then the jump target
        rel = (signed char)t.pc[1];
        next = t.pc + 2;
        addthread(l, thread(next), input, sp);
        addthread(l, thread(next + rel), input, sp);
        break;
    case RSplit:
        // jump target first, then the fall-through path
        rel = (signed char)t.pc[1];
        next = t.pc + 2;
        addthread(l, thread(next + rel), input, sp);
        addthread(l, thread(next), input, sp);
        break;
    case Save:
        // captures are not tracked by this engine: skip opcode and slot byte
        next = t.pc + 2;
        addthread(l, thread(next), input, sp);
        break;
    case Bol:
        if(sp == input->begin)
            addthread(l, thread(t.pc + 1), input, sp);
        break;
    case Eol:
        if(sp == input->end - 1)
            addthread(l, thread(t.pc + 1), input, sp);
        break;
    }
}
// Run the compiled program over the subject, stepping all threads in
// lock-step. This engine only answers "did it match": no per-group
// captures are tracked.
//
//   subp/nsubp  out: every entry is set to nil first; subp[0] is then set
//               to input->begin (if nsubp >= 1) and, on a match, subp[1]
//               to the position at which Match was executed (if nsubp >= 2)
//   is_anchored non-zero to start past the non-anchored prefix
//
// Returns 1 if a match was found, 0 otherwise.
int
re1_5_thompsonvm(ByteProg *prog, Subject *input, const char **subp, int nsubp, int is_anchored)
{
    int i, len, matched;
    ThreadList *clist, *nlist, *tmp;
    char *pc;
    const char *sp;

    for(i=0; i<nsubp; i++)
        subp[i] = nil;

    len = prog->len;
    clist = threadlist(len);
    nlist = threadlist(len);

    if(nsubp >= 1)
        subp[0] = input->begin;
    cleanmarks(prog);
    addthread(clist, thread(HANDLE_ANCHORED(prog->insts, is_anchored)), input, input->begin);
    matched = 0;
    for(sp=input->begin;; sp++) {
        if(clist->n == 0)
            break;
        cleanmarks(prog);
        for(i=0; i<clist->n; i++) {
            pc = clist->t[i].pc;
            if (inst_is_consumer(*pc & 0x7f)) {
                // If we need to match a character, but there's none left,
                // it's fail (we don't schedule current thread for continuation)
                if(sp >= input->end)
                    continue;
            }
            switch(*pc++ & 0x7f) {
            case Char:
                if(*sp != *pc++)
                    break;
                /* fallthrough */
            case Any:
            addthread:
                addthread(nlist, thread(pc), input, sp);
                break;
            case Class:
            case ClassNot:
                if (!_re1_5_classmatch(pc, sp))
                    break;
                pc += *(unsigned char*)pc * 2 + 1;
                goto addthread;
            case NamedClass:
                if (!_re1_5_namedclassmatch(pc, sp))
                    break;
                pc++;
                goto addthread;
            case Match:
                if(nsubp >= 2)
                    subp[1] = sp;
                matched = 1;
                goto BreakFor;
            // Jmp, Split, Save handled in addthread, so that
            // machine execution matches what a backtracker would do.
            // This is discussed (but not shown as code) in
            // Regular Expression Matching: the Virtual Machine Approach.
            }
        }
    BreakFor:
        tmp = clist;
        clist = nlist;
        nlist = tmp;
        nlist->n = 0;
    }

    // Release the two thread lists so repeated calls do not leak them.
    free(clist);
    free(nlist);
    return matched;
}

24
lib/libesp32/re1.5/util.c Normal file
View File

@ -0,0 +1,24 @@
// Copyright 2007-2009 Russ Cox. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "re1.5.h"
// Report an unrecoverable error on stderr and terminate the process with
// exit status 2. Does not return.
void
re1_5_fatal(char *msg)
{
    enum { FATAL_EXIT_STATUS = 2 };

    fprintf(stderr, "fatal error: %s\n", msg);
    exit(FATAL_EXIT_STATUS);
}
// Allocate n bytes of zero-filled memory. Allocation failure is reported
// through re1_5_fatal(), so callers do not check the result.
void*
mal(int n)
{
    void *block = malloc(n);

    if(block == nil)
        re1_5_fatal("out of memory");
    return memset(block, 0, n);	// memset hands back its first argument
}

View File

@ -23,6 +23,8 @@
#include <berry.h>
#include <LList.h>
#include "re1.5.h"
#define BERRY_CONSOLE_CMD_DELIMITER "\x01"
typedef LList_elt<char[0]> log_elt; // store the string after the header to avoid double allocation if we had used char*