diff --git a/CHANGELOG.md b/CHANGELOG.md index 0bdc15b4a..4dc82aa10 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,7 @@ All notable changes to this project will be documented in this file. - Berry support for neopixel (WS2812, SK6812) - Command ``IfxPeriod `` to overrule ``Teleperiod`` for Influx messages (#13750) - OTA over HTTPS (ESP32x only) +- Berry add ``import re`` regex module ### Changed - ESP8266 Gratuitous ARP enabled and set to 60 seconds (#13623) diff --git a/lib/libesp32/Berry/default/be_modtab.c b/lib/libesp32/Berry/default/be_modtab.c index 5c704b142..5f04457ac 100644 --- a/lib/libesp32/Berry/default/be_modtab.c +++ b/lib/libesp32/Berry/default/be_modtab.c @@ -25,6 +25,7 @@ be_extern_native_module(strict); /* Tasmota specific */ be_extern_native_module(python_compat); +be_extern_native_module(re); be_extern_native_module(persist); be_extern_native_module(autoconf); be_extern_native_module(tapp); @@ -87,6 +88,7 @@ BERRY_LOCAL const bntvmodule* const be_module_table[] = { /* user-defined modules register start */ &be_native_module(python_compat), + &be_native_module(re), &be_native_module(path), &be_native_module(persist), #ifdef USE_AUTOCONF diff --git a/lib/libesp32/Berry/default/be_re_lib.c b/lib/libesp32/Berry/default/be_re_lib.c new file mode 100644 index 000000000..2c94c3440 --- /dev/null +++ b/lib/libesp32/Berry/default/be_re_lib.c @@ -0,0 +1,218 @@ +/******************************************************************** + * Tasmota lib + * + * To use: `import re` + * + * Regex using re1.5 + *******************************************************************/ +#include "be_constobj.h" +#include "be_mem.h" +#include "re1.5.h" + +/******************************************************************** +# Berry skeleton for `re` module +# + +class re_pattern + var _p # comobj containing the compiled bytecode for the pattern + + def search() end + def match() end + def split() end +end + +re = module("re") + +re.compile = def (regex_str) end # 
native +re.match = def (regex_str, str) end # native +re.search = def (regex_str, str) end # native + + +*******************************************************************/ + +extern const bclass be_class_re_pattern; + +int be_free_comobj(bvm* vm) { + int argc = be_top(vm); + if (argc > 0) { + void * obj = be_tocomptr(vm, 1); + if (obj != NULL) { be_os_free(obj); } + } + be_return_nil(vm); +} + +// Native functions be_const_func() +// Berry: `re.compile(pattern:string) -> instance(be_pattern)` +int be_re_compile(bvm *vm) { + int32_t argc = be_top(vm); // Get the number of arguments + if (argc >= 1 && be_isstring(vm, 1)) { + const char * regex_str = be_tostring(vm, 1); + int sz = re1_5_sizecode(regex_str); + if (sz < 0) { + be_raise(vm, "internal_error", "error in regex"); + } + + ByteProg *code = be_os_malloc(sizeof(ByteProg) + sz); + int ret = re1_5_compilecode(code, regex_str); + if (ret != 0) { + be_raise(vm, "internal_error", "error in regex"); + } + be_pushntvclass(vm, &be_class_re_pattern); + be_call(vm, 0); + be_newcomobj(vm, code, &be_free_comobj); + be_setmember(vm, -2, "_p"); + be_pop(vm, 1); + be_return(vm); + } + be_raise(vm, "type_error", NULL); +} + + +int be_re_match_search_run(bvm *vm, ByteProg *code, const char *hay, bbool is_anchored) { + Subject subj = {hay, hay + strlen(hay)}; + + int sub_els = (code->sub + 1) * 2; + const char *sub[sub_els]; + + if (!re1_5_recursiveloopprog(code, &subj, sub, sub_els, is_anchored)) { + be_return_nil(vm); // no match + } + + be_newobject(vm, "list"); + int k; + for(k = sub_els; k > 0; k--) + if(sub[k-1]) + break; + for (int i = 0; i < k; i += 2) { + be_pushnstring(vm, sub[i], sub[i+1] - sub[i]); + be_data_push(vm, -2); + be_pop(vm, 1); + } + be_pop(vm, 1); // remove list + be_return(vm); // return list object +} + +int be_re_match_search(bvm *vm, bbool is_anchored) { + int32_t argc = be_top(vm); // Get the number of arguments + if (argc >= 2 && be_isstring(vm, 1) && be_isstring(vm, 2)) { + const char * regex_str 
= be_tostring(vm, 1); + const char * hay = be_tostring(vm, 2); + int sz = re1_5_sizecode(regex_str); + if (sz < 0) { + be_raise(vm, "internal_error", "error in regex"); + } + + ByteProg *code = be_os_malloc(sizeof(ByteProg) + sz); + int ret = re1_5_compilecode(code, regex_str); + if (ret != 0) { + be_raise(vm, "internal_error", "error in regex"); + } + return be_re_match_search_run(vm, code, hay, is_anchored); + } + be_raise(vm, "type_error", NULL); +} + +// Berry: `re.match(value:int | s:string) -> nil` +int be_re_match(bvm *vm) { + return be_re_match_search(vm, btrue); +} +// Berry: `re.search(value:int | s:string) -> nil` +int be_re_search(bvm *vm) { + return be_re_match_search(vm, bfalse); +} + +// Berry: `re_pattern.search(s:string) -> list(string)` +int re_pattern_search(bvm *vm) { + int32_t argc = be_top(vm); // Get the number of arguments + if (argc >= 2 && be_isstring(vm, 2)) { + const char * hay = be_tostring(vm, 2); + be_getmember(vm, 1, "_p"); + ByteProg * code = (ByteProg*) be_tocomptr(vm, -1); + return be_re_match_search_run(vm, code, hay, bfalse); + } + be_raise(vm, "type_error", NULL); +} + +// Berry: `re_pattern.match(s:string) -> list(string)` +int re_pattern_match(bvm *vm) { + int32_t argc = be_top(vm); // Get the number of arguments + if (argc >= 2 && be_isstring(vm, 2)) { + const char * hay = be_tostring(vm, 2); + be_getmember(vm, 1, "_p"); + ByteProg * code = (ByteProg*) be_tocomptr(vm, -1); + return be_re_match_search_run(vm, code, hay, btrue); + } + be_raise(vm, "type_error", NULL); +} + +// Berry: `re_pattern.split(s:string) -> list(string)` +int re_pattern_split(bvm *vm) { + int32_t argc = be_top(vm); // Get the number of arguments + if (argc >= 2 && be_isstring(vm, 2)) { + const char * hay = be_tostring(vm, 2); + be_getmember(vm, 1, "_p"); + ByteProg * code = (ByteProg*) be_tocomptr(vm, -1); + + Subject subj = {hay, hay + strlen(hay)}; + + int sub_els = (code->sub + 1) * 2; + const char *sub[sub_els]; + + be_newobject(vm, "list"); + while 
(1) { + if (!re1_5_recursiveloopprog(code, &subj, sub, sub_els, bfalse)) { + be_pushnstring(vm, subj.begin, subj.end - subj.begin); + be_data_push(vm, -2); + be_pop(vm, 1); + break; + } + + if (sub[0] == NULL || sub[1] == NULL || sub[0] == sub[1]) { + be_raise(vm, "internal_error", "can't match"); + } + be_pushnstring(vm, subj.begin, sub[0] - subj.begin); + be_data_push(vm, -2); + be_pop(vm, 1); + subj.begin = sub[1]; + } + be_pop(vm, 1); // remove list + be_return(vm); // return list object + + } + be_raise(vm, "type_error", NULL); +} + +/******************************************************************** +** Solidified module: re +********************************************************************/ +be_local_module(re, + "re", + be_nested_map(3, + ( (struct bmapnode*) &(const bmapnode[]) { + { be_nested_key("compile", 1000265118, 7, -1), be_const_func(be_re_compile) }, + { be_nested_key("search", -2144130903, 6, 2), be_const_func(be_re_search) }, + { be_nested_key("match", 2116038550, 5, -1), be_const_func(be_re_match) }, + })) +); +BE_EXPORT_VARIABLE be_define_const_native_module(re); +/********************************************************************/ + +// =================================================================== + +/******************************************************************** +** Solidified class: re_pattern +********************************************************************/ +be_local_class(re_pattern, + 1, + NULL, + be_nested_map(4, + ( (struct bmapnode*) &(const bmapnode[]) { + { be_nested_key("_p", 1594591802, 2, -1), be_const_var(0) }, + { be_nested_key("search", -2144130903, 6, -1), be_const_func(re_pattern_search) }, + { be_nested_key("match", 2116038550, 5, 0), be_const_func(re_pattern_match) }, + { be_nested_key("split", -2017972765, 5, -1), be_const_func(re_pattern_split) }, + })), + (be_nested_const_str("re_pattern", 2041968961, 10)) +); +/*******************************************************************/ + diff --git 
a/lib/libesp32/re1.5/LICENSE b/lib/libesp32/re1.5/LICENSE new file mode 100644 index 000000000..85c4185d4 --- /dev/null +++ b/lib/libesp32/re1.5/LICENSE @@ -0,0 +1,27 @@ +// Copyright (c) 2007-2009 Russ Cox, Google Inc. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google, Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/lib/libesp32/re1.5/Makefile b/lib/libesp32/re1.5/Makefile new file mode 100644 index 000000000..6fe6d3f3c --- /dev/null +++ b/lib/libesp32/re1.5/Makefile @@ -0,0 +1,44 @@ +# Copyright 2007-2009 Russ Cox. All Rights Reserved. 
+# Use of this source code is governed by a BSD-style +# license that can be found in the LICENSE file. + +CC=gcc + +CFLAGS=-g -Wall -Os +# Comment out when developing/testing +#CFLAGS=-DDEBUG -g -Wall -O0 + +TARGET=re +OFILES=\ + backtrack.o\ + compile.o\ + main.o\ + pike.o\ + recursive.o\ + recursiveloop.o\ + sub.o\ + thompson.o\ + compilecode.o\ + dumpcode.o\ + charclass.o\ + cleanmarks.o\ + util.o\ + y.tab.o\ + +HFILES=\ + re1.5.h\ + +$(TARGET): $(OFILES) + $(CC) $(CFLAGS) -o $(TARGET) $(OFILES) + +%.o: %.c $(HFILES) + $(CC) -c $(CFLAGS) $*.c + +y.tab.h y.tab.c: parse.y + bison -v -y parse.y + +test: $(TARGET) + ./run-tests $(TFLAGS) + +clean: + rm -f *.o core $(TARGET) y.tab.[ch] y.output diff --git a/lib/libesp32/re1.5/README b/lib/libesp32/re1.5/README new file mode 100644 index 000000000..d116d330d --- /dev/null +++ b/lib/libesp32/re1.5/README @@ -0,0 +1,48 @@ +What is re1.5? +============== + +re1 (http://code.google.com/p/re1/) is a "toy regular expression implementation" +by Russ Cox, featuring simplicity and minimal code size unheard of in other +implementations. re2 (http://code.google.com/p/re2/) is "an efficient, +principled regular expression library" by the same author. It is robust, +full-featured, and ... bloated, compared to re1. + +re1.5 is an attempt to start with the re1 codebase and add features required for +minimalistic real-world use, while sticking to the minimal code size and +memory use. + +Why? +==== +re1.5 is intended for use in highly constrained, e.g. embedded, environments, +where offering familiar high-level string matching functionality is still +beneficial. + +Features +======== + +* Like re1, re1.5 retains a design where a compiled expression can be executed +(matched) by multiple backends, each with its own distinctive design and +runtime properties (complexity and memory usage). +* Unlike re1, regexes are compiled to memory-efficient bytecode. Exact size +of the bytecode can be found prior to compilation (for memory allocation). 
+* External API functions feature namespace prefix to improve clarity and +avoid name clashes when integrating into other projects. +* Matchers are NUL-char clean and take size of the input string as a param. +* Support for quoted chars in regex. +* Support for ^, $ assertions in regex. +* Support for "match" vs "search" operations, as common in other regex APIs. +* Support for named character classes: \d \D \s \S \w \W. + +TODO +==== + +* Support for repetition operator {n} and {n,m}. +* Support for Unicode (UTF-8). +* Support for matching flags like case-insensitive, dot matches all, +multiline, etc. +* Support for more assertions like \A, \Z. + +Author and License +================== +re1.5 is maintained by Paul Sokolovsky pfalcon at users.sourceforge.net and +licensed under BSD license, just as the original re1. diff --git a/lib/libesp32/re1.5/backtrack.c b/lib/libesp32/re1.5/backtrack.c new file mode 100644 index 000000000..160069c9d --- /dev/null +++ b/lib/libesp32/re1.5/backtrack.c @@ -0,0 +1,117 @@ +// Copyright 2007-2009 Russ Cox. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include "re1.5.h" + +typedef struct Thread Thread; +struct Thread +{ + char *pc; + const char *sp; + Sub *sub; +}; + +static Thread +thread(char *pc, const char *sp, Sub *sub) +{ + Thread t = {pc, sp, sub}; + return t; +} + +int +re1_5_backtrack(ByteProg *prog, Subject *input, const char **subp, int nsubp, int is_anchored) +{ + enum { MAX = 1000 }; + Thread ready[MAX]; + int i, nready; + char *pc; + const char *sp; + Sub *sub; + int off; + + /* queue initial thread */ + sub = newsub(nsubp); + for(i=0; isub[i] = nil; + ready[0] = thread(HANDLE_ANCHORED(prog->insts, is_anchored), input->begin, sub); + nready = 1; + + /* run threads in stack order */ + while(nready > 0) { + --nready; /* pop state for next thread to run */ + pc = ready[nready].pc; + sp = ready[nready].sp; + sub = ready[nready].sub; + assert(sub->ref > 0); + for(;;) { + if(inst_is_consumer(*pc)) { + // If we need to match a character, but there's none left, it's fail + if(sp >= input->end) + goto Dead; + } + switch(*pc++) { + case Char: + if(*sp != *pc++) + goto Dead; + case Any: + sp++; + continue; + case Class: + case ClassNot: + if (!_re1_5_classmatch(pc, sp)) + goto Dead; + pc += *(unsigned char*)pc * 2 + 1; + sp++; + continue; + case NamedClass: + if (!_re1_5_namedclassmatch(pc, sp)) + goto Dead; + pc++; + sp++; + continue; + case Match: + for(i=0; isub[i]; + decref(sub); + return 1; + case Jmp: + off = (signed char)*pc++; + pc = pc + off; + continue; + case Split: + if(nready >= MAX) + re1_5_fatal("backtrack overflow"); + off = (signed char)*pc++; + ready[nready++] = thread(pc + off, sp, incref(sub)); +// pc = pc->x; /* continue current thread */ + continue; + case RSplit: + if(nready >= MAX) + re1_5_fatal("backtrack overflow"); + off = (signed char)*pc++; + ready[nready++] = thread(pc, sp, incref(sub)); + pc = pc + off; + continue; + case Save: + off = (unsigned char)*pc++; + sub = update(sub, off, sp); + continue; + case Bol: + if(sp != input->begin) + goto Dead; + continue; + case Eol: + 
/*
 * Test the subject character *sp against the named class whose letter is
 * stored at *pc ('d', 's' or 'w'; the upper-case letter is the negated class).
 * Any letter other than d/s is treated as the word class.
 * Returns 1 when the character satisfies the (possibly negated) class,
 * 0 otherwise.
 */
int _re1_5_namedclassmatch(const char *pc, const char *sp)
{
    const char name = *pc;
    const char ch = *sp;
    // bit 5 distinguishes lower case (positive class) from upper case (negated)
    const int positive = (name >> 5) & 1;
    const int kind = name | 0x20;   // fold the class letter to lower case
    int member;

    switch (kind) {
    case 'd':
        member = (ch >= '0' && ch <= '9');
        break;
    case 's':
        member = (ch == ' ' || (ch >= '\t' && ch <= '\r'));
        break;
    default: // w
        member = ((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z') ||
                  (ch >= '0' && ch <= '9') || ch == '_');
        break;
    }

    return member ? positive : (positive ^ 1);
}
+ +#include "re1.5.h" + +void +cleanmarks(ByteProg *prog) +{ + char *pc = prog->insts; + char *end = pc + prog->bytelen; + while (pc < end) { + *pc &= 0x7f; + switch (*pc) { + case Class: + case ClassNot: + pc += (unsigned char)pc[1] * 2; + case NamedClass: + case Jmp: + case Split: + case RSplit: + case Save: + case Char: + pc++; + break; +#ifdef DEBUG + case Bol: + case Eol: + case Any: + case Match: + break; + default: + printf("Unknown instruction 0x%02x pc %ld\n", (unsigned char)*pc, pc - prog->insts); + re1_5_fatal("cleanmarks"); +#endif + } + pc++; + } +} diff --git a/lib/libesp32/re1.5/compile.c b/lib/libesp32/re1.5/compile.c new file mode 100644 index 000000000..c5a995388 --- /dev/null +++ b/lib/libesp32/re1.5/compile.c @@ -0,0 +1,179 @@ +// Copyright 2007-2009 Russ Cox. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. +#ifdef DEBUG + +#include "re1.5.h" + +static Inst *pc; +static int count(Regexp*); +static void emit(Regexp*); + +Prog* +compile(Regexp *r) +{ + int n; + Prog *p; + + n = count(r) + 1; + p = mal(sizeof *p + n*sizeof p->start[0]); + p->start = (Inst*)(p+1); + pc = p->start; + emit(r); + pc->opcode = Match; + pc++; + p->len = pc - p->start; + return p; +} + +// how many instructions does r need? 
+static int +count(Regexp *r) +{ + switch(r->type) { + default: + re1_5_fatal("bad count"); + case Alt: + return 2 + count(r->left) + count(r->right); + case Cat: + return count(r->left) + count(r->right); + case Lit: + case Dot: + return 1; + case Paren: + return 2 + count(r->left); + case Quest: + return 1 + count(r->left); + case Star: + return 2 + count(r->left); + case Plus: + return 1 + count(r->left); + } +} + +static void +emit(Regexp *r) +{ + Inst *p1, *p2, *t; + + switch(r->type) { + default: + re1_5_fatal("bad emit"); + + case Alt: + pc->opcode = Split; + p1 = pc++; + p1->x = pc; + emit(r->left); + pc->opcode = Jmp; + p2 = pc++; + p1->y = pc; + emit(r->right); + p2->x = pc; + break; + + case Cat: + emit(r->left); + emit(r->right); + break; + + case Lit: + pc->opcode = Char; + pc->c = r->ch; + pc++; + break; + + case Dot: + pc++->opcode = Any; + break; + + case Paren: + pc->opcode = Save; + pc->n = 2*r->n; + pc++; + emit(r->left); + pc->opcode = Save; + pc->n = 2*r->n + 1; + pc++; + break; + + case Quest: + pc->opcode = Split; + p1 = pc++; + p1->x = pc; + emit(r->left); + p1->y = pc; + if(r->n) { // non-greedy + t = p1->x; + p1->x = p1->y; + p1->y = t; + } + break; + + case Star: + pc->opcode = Split; + p1 = pc++; + p1->x = pc; + emit(r->left); + pc->opcode = Jmp; + pc->x = p1; + pc++; + p1->y = pc; + if(r->n) { // non-greedy + t = p1->x; + p1->x = p1->y; + p1->y = t; + } + break; + + case Plus: + p1 = pc; + emit(r->left); + pc->opcode = Split; + pc->x = p1; + p2 = pc; + pc++; + p2->y = pc; + if(r->n) { // non-greedy + t = p2->x; + p2->x = p2->y; + p2->y = t; + } + break; + } +} + +void +printprog(Prog *p) +{ + Inst *pc, *e; + + pc = p->start; + e = p->start + p->len; + + for(; pc < e; pc++) { + switch(pc->opcode) { + default: + re1_5_fatal("printprog"); + case Split: + printf("%2d. split %d, %d\n", (int)(pc-p->start), (int)(pc->x-p->start), (int)(pc->y-p->start)); + break; + case Jmp: + printf("%2d. 
jmp %d\n", (int)(pc-p->start), (int)(pc->x-p->start)); + break; + case Char: + printf("%2d. char %c\n", (int)(pc-p->start), pc->c); + break; + case Any: + printf("%2d. any\n", (int)(pc-p->start)); + break; + case Match: + printf("%2d. match\n", (int)(pc-p->start)); + break; + case Save: + printf("%2d. save %d\n", (int)(pc-p->start), pc->n); + } + } +} + +#endif //DEBUG diff --git a/lib/libesp32/re1.5/compilecode.c b/lib/libesp32/re1.5/compilecode.c new file mode 100644 index 000000000..50ee74a13 --- /dev/null +++ b/lib/libesp32/re1.5/compilecode.c @@ -0,0 +1,256 @@ +// Copyright 2014-2019 Paul Sokolovsky. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "re1.5.h" + +#define INSERT_CODE(at, num, pc) \ + ((code ? memmove(code + at + num, code + at, pc - at) : (void)0), pc += num) +#define REL(at, to) (to - at - 2) +#define EMIT(at, byte) (code ? (code[at] = byte) : (void)(at)) +#define PC (prog->bytelen) + +static int _compilecode(const char **re_loc, ByteProg *prog, int sizecode) +{ + const char *re = *re_loc; + char *code = sizecode ? 
NULL : prog->insts; + int start = PC; + int term = PC; + int alt_label = 0; + + for (; *re && *re != ')'; re++) { + switch (*re) { + case '\\': { + re++; + if (!*re) goto syntax_error; // Trailing backslash + char c = *re | 0x20; + if (c == 'd' || c == 's' || c == 'w') { + term = PC; + EMIT(PC++, NamedClass); + EMIT(PC++, *re); + prog->len++; + break; + } + if ((c >= '0' && c <= '9') || (c >= 'a' && c <= 'z')) { + goto unsupported_escape; + } + } + default: + term = PC; + EMIT(PC++, Char); + EMIT(PC++, *re); + prog->len++; + break; + case '.': + term = PC; + EMIT(PC++, Any); + prog->len++; + break; + case '[': { + int cnt; + term = PC; + re++; + if (*re == '^') { + EMIT(PC++, ClassNot); + re++; + } else { + EMIT(PC++, Class); + } + PC++; // Skip "# of pairs" byte + prog->len++; + for (cnt = 0; *re != ']'; re++, cnt++) { + if (!*re) goto syntax_error; + if (*re == '\\') { + re++; + if (!*re) goto syntax_error; + if (*re != '\\' && *re != ']') goto unsupported_escape; + } + EMIT(PC++, *re); + if (re[1] == '-' && re[2] != ']') { + re += 2; + } + EMIT(PC++, *re); + } + EMIT(term + 1, cnt); + break; + } + case '(': { + term = PC; + int sub; + int capture = 1; + re++; + if (*re == '?') { + re++; + if (*re == ':') { + capture = 0; + re++; + } else { + *re_loc = re; + return RE1_5_UNSUPPORTED_SYNTAX; + } + } + + if (capture) { + sub = ++prog->sub; + EMIT(PC++, Save); + EMIT(PC++, 2 * sub); + prog->len++; + } + + int res = _compilecode(&re, prog, sizecode); + *re_loc = re; + if (res < 0) return res; + if (*re != ')') return RE1_5_SYNTAX_ERROR; + + if (capture) { + EMIT(PC++, Save); + EMIT(PC++, 2 * sub + 1); + prog->len++; + } + + break; + } + case '{': + *re_loc = re; + return RE1_5_UNSUPPORTED_SYNTAX; + case '?': + if (PC == term) goto syntax_error; // nothing to repeat + INSERT_CODE(term, 2, PC); + if (re[1] == '?') { + EMIT(term, RSplit); + re++; + } else { + EMIT(term, Split); + } + EMIT(term + 1, REL(term, PC)); + prog->len++; + term = PC; + break; + case '*': + if 
(PC == term) goto syntax_error; // nothing to repeat + INSERT_CODE(term, 2, PC); + EMIT(PC, Jmp); + EMIT(PC + 1, REL(PC, term)); + PC += 2; + if (re[1] == '?') { + EMIT(term, RSplit); + re++; + } else { + EMIT(term, Split); + } + EMIT(term + 1, REL(term, PC)); + prog->len += 2; + term = PC; + break; + case '+': + if (PC == term) goto syntax_error; // nothing to repeat + if (re[1] == '?') { + EMIT(PC, Split); + re++; + } else { + EMIT(PC, RSplit); + } + EMIT(PC + 1, REL(PC, term)); + PC += 2; + prog->len++; + term = PC; + break; + case '|': + if (alt_label) { + EMIT(alt_label, REL(alt_label, PC) + 1); + } + INSERT_CODE(start, 2, PC); + EMIT(PC++, Jmp); + alt_label = PC++; + EMIT(start, Split); + EMIT(start + 1, REL(start, PC)); + prog->len += 2; + term = PC; + break; + case '^': + EMIT(PC++, Bol); + prog->len++; + term = PC; + break; + case '$': + EMIT(PC++, Eol); + prog->len++; + term = PC; + break; + } + } + + if (alt_label) { + EMIT(alt_label, REL(alt_label, PC) + 1); + } + + *re_loc = re; + return RE1_5_SUCCESS; + +syntax_error: + *re_loc = re; + return RE1_5_SYNTAX_ERROR; + +unsupported_escape: + *re_loc = re; + return RE1_5_UNSUPPORTED_ESCAPE; +} + +int re1_5_sizecode(const char *re) +{ + ByteProg dummyprog = { + // Save 0, Save 1, Match; more bytes for "search" (vs "match") prefix code + .bytelen = 5 + NON_ANCHORED_PREFIX + }; + + int res = _compilecode(&re, &dummyprog, /*sizecode*/1); + if (res < 0) return res; + // If unparsed chars left + if (*re) return RE1_5_SYNTAX_ERROR; + + return dummyprog.bytelen; +} + +int re1_5_compilecode(ByteProg *prog, const char *re) +{ + prog->len = 0; + prog->bytelen = 0; + prog->sub = 0; + + // Add code to implement non-anchored operation ("search"). + // For anchored operation ("match"), this code will be just skipped. 
+ // TODO: Implement search in much more efficient manner + prog->insts[prog->bytelen++] = RSplit; + prog->insts[prog->bytelen++] = 3; + prog->insts[prog->bytelen++] = Any; + prog->insts[prog->bytelen++] = Jmp; + prog->insts[prog->bytelen++] = -5; + prog->len += 3; + + prog->insts[prog->bytelen++] = Save; + prog->insts[prog->bytelen++] = 0; + prog->len++; + + int res = _compilecode(&re, prog, /*sizecode*/0); + if (res < 0) return res; + // If unparsed chars left + if (*re) return RE1_5_SYNTAX_ERROR; + + prog->insts[prog->bytelen++] = Save; + prog->insts[prog->bytelen++] = 1; + prog->len++; + + prog->insts[prog->bytelen++] = Match; + prog->len++; + + return RE1_5_SUCCESS; +} + +#if 0 +int main(int argc, char *argv[]) +{ + int pc = 0; + ByteProg *code = re1_5_compilecode(argv[1]); + re1_5_dumpcode(code); +} +#endif diff --git a/lib/libesp32/re1.5/dumpcode.c b/lib/libesp32/re1.5/dumpcode.c new file mode 100644 index 000000000..d7781d849 --- /dev/null +++ b/lib/libesp32/re1.5/dumpcode.c @@ -0,0 +1,65 @@ +// Copyright 2014 Paul Sokolovsky. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "re1.5.h" + +void re1_5_dumpcode(ByteProg *prog) +{ + int pc = 0; + char *code = prog->insts; + while (pc < prog->bytelen) { + printf("%2d: ", pc); + switch(code[pc++]) { + default: + assert(0); +// re1_5_fatal("printprog"); + case Split: + printf("split %d (%d)\n", pc + (signed char)code[pc] + 1, (signed char)code[pc]); + pc++; + break; + case RSplit: + printf("rsplit %d (%d)\n", pc + (signed char)code[pc] + 1, (signed char)code[pc]); + pc++; + break; + case Jmp: + printf("jmp %d (%d)\n", pc + (signed char)code[pc] + 1, (signed char)code[pc]); + pc++; + break; + case Char: + printf("char %c\n", code[pc++]); + break; + case Any: + printf("any\n"); + break; + case Class: + case ClassNot: { + int num = code[pc]; + printf("class%s %d", (code[pc - 1] == ClassNot ? 
"not" : ""), num); + pc++; + while (num--) { + printf(" 0x%02x-0x%02x", code[pc], code[pc + 1]); + pc += 2; + } + printf("\n"); + break; + } + case NamedClass: + printf("namedclass %c\n", code[pc++]); + break; + case Match: + printf("match\n"); + break; + case Save: + printf("save %d\n", (unsigned char)code[pc++]); + break; + case Bol: + printf("assert bol\n"); + break; + case Eol: + printf("assert eol\n"); + break; + } + } + printf("Bytes: %d, insts: %d\n", prog->bytelen, prog->len); +} diff --git a/lib/libesp32/re1.5/library.json b/lib/libesp32/re1.5/library.json new file mode 100644 index 000000000..8233b2142 --- /dev/null +++ b/lib/libesp32/re1.5/library.json @@ -0,0 +1,13 @@ +{ + "name": "re1.5", + "keywords": "esp32, re", + "description": "Regex", + "version": "0.9", + "repository": + { + "type": "git", + "url": "https://github.com/pfalcon/re1.5" + }, + "frameworks": "*", + "platforms": "*" +} \ No newline at end of file diff --git a/lib/libesp32/re1.5/main.c b/lib/libesp32/re1.5/main.c new file mode 100644 index 000000000..7d688adf1 --- /dev/null +++ b/lib/libesp32/re1.5/main.c @@ -0,0 +1,150 @@ +// Copyright 2007-2009 Russ Cox. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
/*
 * Print the command-line synopsis to stderr and exit with status 2.
 * main() calls this for -h and when fewer than two operands remain
 * (the regular expression followed by at least one subject string).
 */
void
usage(void)
{
	fprintf(stderr, "Usage: re [-hmd] [-e ENGINE] <regexp> <string>...\n"
		"-h: Print help message and exit\n"
		"-m: String is anchored\n"
		"-e ENGINE: Specify one of: recursive recursiveloop backtrack thompson pike\n");
#ifdef DEBUG
	fprintf(stderr,
		"-d: Print debug messages\n");
#endif
	exit(2);
}
for(i=1; i0; k--) + if(sub[k-1]) + break; + for(l=0; l CHAR EOL +%type alt concat repeat single line +%type count + +%% + +line: alt EOL + { + parsed_regexp = $1; + return 1; + } + +alt: + concat +| alt '|' concat + { + $$ = reg(Alt, $1, $3); + } +; + +concat: + repeat +| concat repeat + { + $$ = reg(Cat, $1, $2); + } +; + +repeat: + single +| single '*' + { + $$ = reg(Star, $1, nil); + } +| single '*' '?' + { + $$ = reg(Star, $1, nil); + $$->n = 1; + } +| single '+' + { + $$ = reg(Plus, $1, nil); + } +| single '+' '?' + { + $$ = reg(Plus, $1, nil); + $$->n = 1; + } +| single '?' + { + $$ = reg(Quest, $1, nil); + } +| single '?' '?' + { + $$ = reg(Quest, $1, nil); + $$->n = 1; + } +; + +count: + { + $$ = ++nparen; + } +; + +single: + '(' count alt ')' + { + $$ = reg(Paren, $3, nil); + $$->n = $2; + } +| '(' '?' ':' alt ')' + { + $$ = $4; + } +| CHAR + { + $$ = reg(Lit, nil, nil); + $$->ch = $1; + } +| '.' + { + $$ = reg(Dot, nil, nil); + } +; + +%% + +static char *input; +static Regexp *parsed_regexp; +static int nparen; +int gen; + +static int +yylex(void) +{ + int c; + + if(input == NULL || *input == 0) + return EOL; + c = *input++; + if(strchr("|*+?():.", c)) + return c; + yylval.c = c; + return CHAR; +} + +static void +yyerror(char *s) +{ + re1_5_fatal(s); +} + + +Regexp* +parse(char *s) +{ + Regexp *r, *dotstar; + + input = s; + parsed_regexp = nil; + nparen = 0; + if(yyparse() != 1) + yyerror("did not parse"); + if(parsed_regexp == nil) + yyerror("parser nil"); + + r = reg(Paren, parsed_regexp, nil); // $0 parens + return r; + dotstar = reg(Star, reg(Dot, nil, nil), nil); + dotstar->n = 1; // non-greedy + return reg(Cat, dotstar, r); +} + +Regexp* +reg(int type, Regexp *left, Regexp *right) +{ + Regexp *r; + + r = mal(sizeof *r); + r->type = type; + r->left = left; + r->right = right; + return r; +} + +void +printre(Regexp *r) +{ + switch(r->type) { + default: + printf("???"); + break; + + case Alt: + printf("Alt("); + printre(r->left); + printf(", "); + 
printre(r->right); + printf(")"); + break; + + case Cat: + printf("Cat("); + printre(r->left); + printf(", "); + printre(r->right); + printf(")"); + break; + + case Lit: + printf("Lit(%c)", r->ch); + break; + + case Dot: + printf("Dot"); + break; + + case Paren: + printf("Paren(%d, ", r->n); + printre(r->left); + printf(")"); + break; + + case Star: + if(r->n) + printf("Ng"); + printf("Star("); + printre(r->left); + printf(")"); + break; + + case Plus: + if(r->n) + printf("Ng"); + printf("Plus("); + printre(r->left); + printf(")"); + break; + + case Quest: + if(r->n) + printf("Ng"); + printf("Quest("); + printre(r->left); + printf(")"); + break; + } +} diff --git a/lib/libesp32/re1.5/pike.c b/lib/libesp32/re1.5/pike.c new file mode 100644 index 000000000..9f060fa11 --- /dev/null +++ b/lib/libesp32/re1.5/pike.c @@ -0,0 +1,176 @@ +// Copyright 2007-2009 Russ Cox. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "re1.5.h" + +typedef struct Thread Thread; +struct Thread +{ + char *pc; + Sub *sub; +}; + +typedef struct ThreadList ThreadList; +struct ThreadList +{ + int n; + Thread t[1]; +}; + +static Thread +thread(char *pc, Sub *sub) +{ + Thread t = {pc, sub}; + return t; +} + +static ThreadList* +threadlist(int n) +{ + return mal(sizeof(ThreadList)+n*sizeof(Thread)); +} + +static void +addthread(ThreadList *l, Thread t, Subject *input, const char *sp) +{ + int off; + if(*t.pc & 0x80) { + decref(t.sub); + return; // already on list + } + *t.pc |= 0x80; + + switch(*t.pc & 0x7f) { + default: + l->t[l->n] = t; + l->n++; + break; + case Jmp: + off = (signed char)t.pc[1]; + t.pc += 2; + addthread(l, thread(t.pc + off, t.sub), input, sp); + break; + case Split: + off = (signed char)t.pc[1]; + t.pc += 2; + addthread(l, thread(t.pc, incref(t.sub)), input, sp); + addthread(l, thread(t.pc + off, t.sub), input, sp); + break; + case RSplit: + off = (signed char)t.pc[1]; + t.pc += 2; + 
addthread(l, thread(t.pc + off, incref(t.sub)), input, sp); + addthread(l, thread(t.pc, t.sub), input, sp); + break; + case Save: + off = (unsigned char)t.pc[1]; + t.pc += 2; + addthread(l, thread(t.pc, update(t.sub, off, sp)), input, sp); + break; + case Bol: + if(sp == input->begin) + addthread(l, thread(t.pc + 1, t.sub), input, sp); + break; + case Eol: + if(sp == input->end) + addthread(l, thread(t.pc + 1, t.sub), input, sp); + break; + } +} + +int +re1_5_pikevm(ByteProg *prog, Subject *input, const char **subp, int nsubp, int is_anchored) +{ + int i, len; + ThreadList *clist, *nlist, *tmp; + char *pc; + const char *sp; + Sub *sub, *matched; + + matched = nil; + for(i=0; isub[i] = nil; + + len = prog->len; + clist = threadlist(len); + nlist = threadlist(len); + + cleanmarks(prog); + addthread(clist, thread(HANDLE_ANCHORED(prog->insts, is_anchored), sub), input, input->begin); + matched = 0; + for(sp=input->begin;; sp++) { + if(clist->n == 0) + break; + // printf("%d(%02x).", (int)(sp - input->begin), *sp & 0xFF); + cleanmarks(prog); + for(i=0; in; i++) { + pc = clist->t[i].pc; + sub = clist->t[i].sub; + // printf(" %d", (int)(pc - prog->insts)); + if (inst_is_consumer(*pc & 0x7f)) { + // If we need to match a character, but there's none left, + // it's fail (we don't schedule current thread for continuation) + if(sp >= input->end) { + decref(sub); + continue; + } + } + switch(*pc++ & 0x7f) { + case Char: + if(*sp != *pc++) { + decref(sub); + break; + } + case Any: + addthread: + addthread(nlist, thread(pc, sub), input, sp+1); + break; + case Class: + case ClassNot: + if (!_re1_5_classmatch(pc, sp)) { + decref(sub); + break; + } + pc += *(unsigned char*)pc * 2 + 1; + goto addthread; + case NamedClass: + if (!_re1_5_namedclassmatch(pc, sp)) { + decref(sub); + break; + } + pc++; + goto addthread; + case Match: + if(matched) + decref(matched); + matched = sub; + for(i++; i < clist->n; i++) + decref(clist->t[i].sub); + goto BreakFor; + // Jmp, Split, Save handled in 
addthread, so that + // machine execution matches what a backtracker would do. + // This is discussed (but not shown as code) in + // Regular Expression Matching: the Virtual Machine Approach. + } + } + BreakFor: + // printf("\n"); + tmp = clist; + clist = nlist; + nlist = tmp; + nlist->n = 0; + //if(*sp == '\0') + // break; + } + if(matched) { + for(i=0; isub[i]; + decref(matched); + return 1; + } + return 0; +} diff --git a/lib/libesp32/re1.5/re1.5.h b/lib/libesp32/re1.5/re1.5.h new file mode 100644 index 000000000..a9f255d03 --- /dev/null +++ b/lib/libesp32/re1.5/re1.5.h @@ -0,0 +1,162 @@ +// Copyright 2007-2009 Russ Cox. All Rights Reserved. +// Copyright 2014-2019 Paul Sokolovsky. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef _RE1_5_REGEXP__H +#define _RE1_5_REGEXP__H + +#include +#include +#include +#include +#include + +#define nil ((void*)0) +#define nelem(x) (sizeof(x)/sizeof((x)[0])) + +typedef struct Regexp Regexp; +typedef struct Prog Prog; +typedef struct ByteProg ByteProg; +typedef struct Inst Inst; +typedef struct Subject Subject; + +struct Regexp +{ + int type; + int n; + int ch; + Regexp *left; + Regexp *right; +}; + +enum /* Regexp.type */ +{ + Alt = 1, + Cat, + Lit, + Dot, + Paren, + Quest, + Star, + Plus, +}; + +Regexp *parse(char*); +Regexp *reg(int type, Regexp *left, Regexp *right); +void printre(Regexp*); +#ifndef re1_5_fatal +void re1_5_fatal(char*); +#endif +#ifndef re1_5_stack_chk +#define re1_5_stack_chk() +#endif +void *mal(int); + +struct Prog +{ + Inst *start; + int len; +}; + +struct ByteProg +{ + int bytelen; + int len; + int sub; + char insts[0]; +}; + +struct Inst +{ + int opcode; + int c; + int n; + Inst *x; + Inst *y; + int gen; // global state, oooh! 
+}; + +enum /* Inst.opcode */ +{ + // Instructions which consume input bytes (and thus fail if none left) + CONSUMERS = 1, + Char = CONSUMERS, + Any, + Class, + ClassNot, + NamedClass, + + ASSERTS = 0x50, + Bol = ASSERTS, + Eol, + + // Instructions which take relative offset as arg + JUMPS = 0x60, + Jmp = JUMPS, + Split, + RSplit, + + // Other (special) instructions + Save = 0x7e, + Match = 0x7f, +}; + +#define inst_is_consumer(inst) ((inst) < ASSERTS) +#define inst_is_jump(inst) (((inst) & 0x70) == JUMPS) /* true for Jmp/Split/RSplit (0x60..0x6f); the mask is parenthesized because == binds tighter than & */ + +Prog *compile(Regexp*); +void printprog(Prog*); + +extern int gen; + +enum { + MAXSUB = 20 +}; + +typedef struct Sub Sub; + +struct Sub +{ + int ref; + int nsub; + const char *sub[MAXSUB]; +}; + +Sub *newsub(int n); +Sub *incref(Sub*); +Sub *copy(Sub*); +Sub *update(Sub*, int, const char*); +void decref(Sub*); + +struct Subject { + const char *begin; + const char *end; +}; + + +#define NON_ANCHORED_PREFIX 5 +#define HANDLE_ANCHORED(bytecode, is_anchored) ((is_anchored) ? (bytecode) + NON_ANCHORED_PREFIX : (bytecode)) + +int re1_5_backtrack(ByteProg*, Subject*, const char**, int, int); +int re1_5_pikevm(ByteProg*, Subject*, const char**, int, int); +int re1_5_recursiveloopprog(ByteProg*, Subject*, const char**, int, int); +int re1_5_recursiveprog(ByteProg*, Subject*, const char**, int, int); +int re1_5_thompsonvm(ByteProg*, Subject*, const char**, int, int); + +// Return codes for re1_5_sizecode() and re1_5_compilecode() +enum { + RE1_5_SUCCESS = 0, + RE1_5_SYNTAX_ERROR = -2, + RE1_5_UNSUPPORTED_ESCAPE = -3, + RE1_5_UNSUPPORTED_SYNTAX = -4, +}; + +int re1_5_sizecode(const char *re); +int re1_5_compilecode(ByteProg *prog, const char *re); +void re1_5_dumpcode(ByteProg *prog); +void cleanmarks(ByteProg *prog); +int _re1_5_classmatch(const char *pc, const char *sp); +int _re1_5_namedclassmatch(const char *pc, const char *sp); + +#endif /*_RE1_5_REGEXP__H*/ diff --git a/lib/libesp32/re1.5/recursive.c b/lib/libesp32/re1.5/recursive.c new file mode 100644 index
000000000..466518ce3 --- /dev/null +++ b/lib/libesp32/re1.5/recursive.c @@ -0,0 +1,79 @@ +// Copyright 2007-2009 Russ Cox. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "re1.5.h" + +static int +recursive(char *pc, const char *sp, Subject *input, const char **subp, int nsubp) +{ + const char *old; + int off; + + if(inst_is_consumer(*pc)) { + // If we need to match a character, but there's none left, it's fail + if(sp >= input->end) + return 0; + } + + re1_5_stack_chk(); + + switch(*pc++) { + case Char: + if(*sp != *pc++) + return 0; + case Any: + return recursive(pc, sp+1, input, subp, nsubp); + case Class: + case ClassNot: + if (!_re1_5_classmatch(pc, sp)) + return 0; + pc += *(unsigned char*)pc * 2 + 1; + return recursive(pc, sp+1, input, subp, nsubp); + case NamedClass: + if (!_re1_5_namedclassmatch(pc, sp)) + return 0; + return recursive(pc+1, sp+1, input, subp, nsubp); + case Match: + return 1; + case Jmp: + off = (signed char)*pc++; + return recursive(pc + off, sp, input, subp, nsubp); + case Split: + off = (signed char)*pc++; + if(recursive(pc, sp, input, subp, nsubp)) + return 1; + return recursive(pc + off, sp, input, subp, nsubp); + case RSplit: + off = (signed char)*pc++; + if(recursive(pc + off, sp, input, subp, nsubp)) + return 1; + return recursive(pc, sp, input, subp, nsubp); + case Save: + off = (unsigned char)*pc++; + if(off >= nsubp) + return recursive(pc, sp, input, subp, nsubp); + old = subp[off]; + subp[off] = sp; + if(recursive(pc, sp, input, subp, nsubp)) + return 1; + subp[off] = old; + return 0; + case Bol: + if(sp != input->begin) + return 0; + return recursive(pc, sp, input, subp, nsubp); + case Eol: + if(sp != input->end) + return 0; + return recursive(pc, sp, input, subp, nsubp); + } + re1_5_fatal("recursive"); + return -1; +} + +int +re1_5_recursiveprog(ByteProg *prog, Subject *input, const char **subp, int nsubp, int is_anchored) +{ + return 
recursive(HANDLE_ANCHORED(prog->insts, is_anchored), input->begin, input, subp, nsubp); +} diff --git a/lib/libesp32/re1.5/recursiveloop.c b/lib/libesp32/re1.5/recursiveloop.c new file mode 100644 index 000000000..bb337decf --- /dev/null +++ b/lib/libesp32/re1.5/recursiveloop.c @@ -0,0 +1,86 @@ +// Copyright 2007-2009 Russ Cox. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "re1.5.h" + +static int +recursiveloop(char *pc, const char *sp, Subject *input, const char **subp, int nsubp) +{ + const char *old; + int off; + + re1_5_stack_chk(); + + for(;;) { + if(inst_is_consumer(*pc)) { + // If we need to match a character, but there's none left, it's fail + if(sp >= input->end) + return 0; + } + switch(*pc++) { + case Char: + if(*sp != *pc++) + return 0; + case Any: + sp++; + continue; + case Class: + case ClassNot: + if (!_re1_5_classmatch(pc, sp)) + return 0; + pc += *(unsigned char*)pc * 2 + 1; + sp++; + continue; + case NamedClass: + if (!_re1_5_namedclassmatch(pc, sp)) + return 0; + pc++; + sp++; + continue; + case Match: + return 1; + case Jmp: + off = (signed char)*pc++; + pc = pc + off; + continue; + case Split: + off = (signed char)*pc++; + if(recursiveloop(pc, sp, input, subp, nsubp)) + return 1; + pc = pc + off; + continue; + case RSplit: + off = (signed char)*pc++; + if(recursiveloop(pc + off, sp, input, subp, nsubp)) + return 1; + continue; + case Save: + off = (unsigned char)*pc++; + if(off >= nsubp) { + continue; + } + old = subp[off]; + subp[off] = sp; + if(recursiveloop(pc, sp, input, subp, nsubp)) + return 1; + subp[off] = old; + return 0; + case Bol: + if(sp != input->begin) + return 0; + continue; + case Eol: + if(sp != input->end) + return 0; + continue; + } + re1_5_fatal("recursiveloop"); + } +} + +int +re1_5_recursiveloopprog(ByteProg *prog, Subject *input, const char **subp, int nsubp, int is_anchored) +{ + return recursiveloop(HANDLE_ANCHORED(prog->insts, 
is_anchored), input->begin, input, subp, nsubp); +} diff --git a/lib/libesp32/re1.5/run-tests b/lib/libesp32/re1.5/run-tests new file mode 100755 index 000000000..af5568658 --- /dev/null +++ b/lib/libesp32/re1.5/run-tests @@ -0,0 +1,164 @@ +#! /usr/bin/env python3 + +RE_EXEC = "./re" + +test_suite = [ + # basics + ("search", r"abc", "abcdef"), + ("search", r"cde", "abcdef"), + ("search", r"abc*", "abdef"), + ("search", r"abc*", "abcccdef"), + ("search", r"abc+", "abdef"), + ("search", r"abc+", "abcccdef"), + + # match + ("match", r"abc", "abcdef"), + ("match", r"abc*", "abdef"), + + # search vs match distinction + ("match", r"a*", "baa"), + ("search", r"a*", "baa"), + + # nested group matching + ("match", r"(([0-9]*)([a-z]*)[0-9]*)", "1234hello567"), + ("match", r"([0-9]*)(([a-z]*)([0-9]*))", "1234hello567"), + + # non-capturing groups + ("match", r"(([0-9]*)(?:[a-z]*)[0-9]*)", "1234hello568"), + ("match", r"(?:[0-9]*)(([a-z]*)(?:[0-9]*))", "1234hello568"), + ("match", r"([0-9]*)(?:([a-z]*)(?:[0-9]*))", "1234hello568"), + ("match", r"(?:)", "1234hello568"), + ("match", r"1?:", "1:"), + + # named character classes + ("match", r"\d+", "123abc456"), + ("match", r"\s+", " \t123abc456"), + ("match", r"\w+", "123abc_456 abc"), + ("match", r"(\w+)\s+(\w+)", "ABC \t123hello456 abc"), + ("match", r"(\S+)\s+(\D+)", "ABC \thello abc456 abc"), + ("match", r"(([0-9]*)([a-z]*)\d*)", "123hello456"), + + # classes + ("match", r"[a]*", "a"), + ("search", r"([yab]*)(e*)([cd])", "xyac"), + ("search", r"([yab]*)(e*)([^y]?)$", "xyac"), + ("match", r"[-]*", "--"), + ("match", r"[-a]*", "-a-b"), + ("match", r"[-ab]*", "-a-b"), + ("match", r"[-a-c]*", "-a-b-d-"), + ("match", r"[a-]*", "-a-b"), + ("match", r"[ab-]*", "-a-b"), + ("match", r"[a-c-]*", "-a-b-d-"), + + # escaped metacharacters + ("match", r"(\?:)", ":"), + ("match", r"\(?:", "(:"), + + # non-greedy + ("match", r"a(b??)(b*)c", "abbc"), + ("match", r"a(b+?)(b*)c", "abbbc"), + ("match", r"a(b*?)(b*)c", "abbbbc"), + + # greedy + 
("match", r"a(b?)(b*)c", "abbc"), + ("match", r"a(b+)(b*)c", "abbbc"), + ("match", r"a(b*)(b*)c", "abbbbc"), + + # errors + ("search", r"?", ""), + ("search", r"*", ""), + ("search", r"+", ""), + ("search", r"[", ""), + ("search", r"(", ""), + ("search", r")", ""), + ("search", "\\", ""), + ("search", "|+", ""), + ("search", "|*", ""), + ("search", "|?", ""), + ("search", "^*", ""), + ("search", "$*", ""), + ("search", "a*+", ""), + ("search", "a*?", ""), + ("search", "a**", ""), +] + +import re +import sre_constants +import subprocess +from collections import OrderedDict + +def parse_result(string, res): + name, rest = res.split(b" ", 1) + if rest == b"-no match-": + return name, None + if rest == b"REGEX ERROR": + return name, rest + assert rest.startswith(b"match ") + rest = rest[6:] + tuples = [eval(t) for t in rest.split()] + matches = tuple(string[t[0]:t[1]] for t in tuples) + return name, matches + +def fit_str(string, width): + if len(string) <= width: + return string + else: + return string[:width - 2] + ".." 
+ +def main(): + engine_stats = OrderedDict() + for kind, regex, string in test_suite: + # run Python re to get correct result + try: + if kind == "match": + py_res = re.match(regex, string) + else: + py_res = re.search(regex, string) + if py_res is not None: + py_res = (py_res.group(0),) + py_res.groups() + except sre_constants.error: + py_res = b"REGEX ERROR" + + # run our code + try: + args = (["-m"] if kind == "match" else []) + [regex, string] + re_res = subprocess.check_output([RE_EXEC]+args, stderr=subprocess.STDOUT) + re_res = re_res.split(b'\n')[1:-1] # split lines, remove first and last + except subprocess.CalledProcessError as e: + if e.returncode == 2 and e.output == b"fatal error: Error in regexp\n": + re_res = [b"recursive REGEX ERROR", b"recursiveloop REGEX ERROR", b"backtrack REGEX ERROR", b"thompson REGEX ERROR", b"pike REGEX ERROR"] + else: + raise + + # check result of each engine + for engine in re_res: + engine_name, re_res = parse_result(string, engine) + try: + stats = engine_stats[engine_name] + except KeyError: + engine_stats[engine_name] = stats = [0, 0] + + # Thompson algo offers just boolean match/no match status + py_res_cur = py_res + re_res_cur = re_res + if engine_name == b"thompson": + if py_res is not None: + py_res_cur = True + if re_res is not None: + re_res_cur = True + + if py_res_cur == re_res_cur: + print("pass ", end="") + stats[0] += 1 + else: + print("FAIL ", end="") + stats[1] += 1 + + print("%s %-25s %-20s" % (kind[0], fit_str(regex, 25), fit_str(string, 20))) + + print("Ran %d tests, results:" % len(test_suite)) + for name, stats in engine_stats.items(): + print("%15s %2d pass %2d fail" % (str(name, encoding='utf8'), stats[0], stats[1])) + +if __name__ == "__main__": + main() diff --git a/lib/libesp32/re1.5/sub.c b/lib/libesp32/re1.5/sub.c new file mode 100644 index 000000000..7402b1175 --- /dev/null +++ b/lib/libesp32/re1.5/sub.c @@ -0,0 +1,55 @@ +// Copyright 2007-2009 Russ Cox. All Rights Reserved. 
+// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "re1.5.h" + +Sub *freesub; + +Sub* +newsub(int n) +{ + Sub *s; + + s = freesub; + if(s != nil) + freesub = (Sub*)s->sub[0]; + else + s = mal(sizeof *s); + s->nsub = n; + s->ref = 1; + return s; +} + +Sub* +incref(Sub *s) +{ + s->ref++; + return s; +} + +Sub* +update(Sub *s, int i, const char *p) /* store p in slot i; returns the Sub the caller must use from now on */ +{ + Sub *s1; + int j; + + if(s->ref > 1) { /* s has other holders: write into a private copy and drop our reference to s */ + s1 = newsub(s->nsub); + for(j=0; j<s->nsub; j++) /* carry over all nsub slots of the shared Sub */ + s1->sub[j] = s->sub[j]; + s->ref--; + s = s1; + } + s->sub[i] = p; + return s; +} + +void +decref(Sub *s) +{ + if(--s->ref == 0) { + s->sub[0] = (char*)freesub; + freesub = s; + } +} diff --git a/lib/libesp32/re1.5/thompson.c b/lib/libesp32/re1.5/thompson.c new file mode 100644 index 000000000..ef2051b0e --- /dev/null +++ b/lib/libesp32/re1.5/thompson.c @@ -0,0 +1,152 @@ +// Copyright 2007-2009 Russ Cox. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include "re1.5.h" + +typedef struct Thread Thread; +struct Thread +{ + char *pc; +}; + +typedef struct ThreadList ThreadList; +struct ThreadList +{ + int n; + Thread t[1]; +}; + +static Thread +thread(char *pc) +{ + Thread t = {pc}; + return t; +} + +static ThreadList* +threadlist(int n) +{ + return mal(sizeof(ThreadList)+n*sizeof(Thread)); +} + +static void +addthread(ThreadList *l, Thread t, Subject *input, const char *sp) /* sp: subject position at which thread t will run */ +{ + int off; + if(*t.pc & 0x80) + return; // already on list + + *t.pc |= 0x80; + l->t[l->n] = t; + l->n++; + + switch(*t.pc & 0x7f) { + case Jmp: + off = (signed char)t.pc[1]; + t.pc += 2; + addthread(l, thread(t.pc + off), input, sp); + break; + case Split: + off = (signed char)t.pc[1]; + t.pc += 2; + addthread(l, thread(t.pc), input, sp); + addthread(l, thread(t.pc + off), input, sp); + break; + case RSplit: + off = (signed char)t.pc[1]; + t.pc += 2; + addthread(l, thread(t.pc + off), input, sp); + addthread(l, thread(t.pc), input, sp); + break; + case Save: + off = (unsigned char)t.pc[1]; + t.pc += 2; + addthread(l, thread(t.pc), input, sp); + break; + case Bol: + if(sp == input->begin) + addthread(l, thread(t.pc + 1), input, sp); + break; + case Eol: /* callers pass the position the thread resumes at, so end of input is exactly input->end (same test as pike.c) */ + if(sp == input->end) + addthread(l, thread(t.pc + 1), input, sp); + break; + } +} + +int +re1_5_thompsonvm(ByteProg *prog, Subject *input, const char **subp, int nsubp, int is_anchored) /* returns 1 on match, 0 otherwise; only subp[0]/subp[1] (overall match bounds) are filled in */ +{ + int i, len, matched; + ThreadList *clist, *nlist, *tmp; + char *pc; + const char *sp; + + for(i=0; i<nsubp; i++) + subp[i] = nil; + + len = prog->len; + clist = threadlist(len); + nlist = threadlist(len); + + if(nsubp >= 1) + subp[0] = input->begin; + cleanmarks(prog); + addthread(clist, thread(HANDLE_ANCHORED(prog->insts, is_anchored)), input, input->begin); + matched = 0; + for(sp=input->begin;; sp++) { + if(clist->n == 0) + break; + // printf("%d(%02x).", (int)(sp - input->begin), *sp & 0xFF); + cleanmarks(prog); + for(i=0; i<clist->n; i++) { + pc = clist->t[i].pc; + // printf(" %d", (int)(pc - prog->insts)); + if (inst_is_consumer(*pc & 0x7f)) 
{ + // If we need to match a character, but there's none left, + // it's fail (we don't schedule current thread for continuation) + if(sp >= input->end) + continue; + } + switch(*pc++ & 0x7f) { + case Char: + if(*sp != *pc++) + break; + case Any: + addthread: + addthread(nlist, thread(pc), input, sp+1); /* one byte consumed: the follow-up thread runs at sp+1, so Bol/Eol see the right position */ + break; + case Class: + case ClassNot: + if (!_re1_5_classmatch(pc, sp)) + break; + pc += *(unsigned char*)pc * 2 + 1; + goto addthread; + case NamedClass: + if (!_re1_5_namedclassmatch(pc, sp)) + break; + pc++; + goto addthread; + case Match: + if(nsubp >= 2) + subp[1] = sp; + matched = 1; + goto BreakFor; + // Jmp, Split, Save handled in addthread, so that + // machine execution matches what a backtracker would do. + // This is discussed (but not shown as code) in + // Regular Expression Matching: the Virtual Machine Approach. + } + } + BreakFor: + // printf("\n"); + tmp = clist; + clist = nlist; + nlist = tmp; + nlist->n = 0; + //if(sp >= input->end) + // break; + } + return matched; +} diff --git a/lib/libesp32/re1.5/util.c b/lib/libesp32/re1.5/util.c new file mode 100644 index 000000000..5b72b662a --- /dev/null +++ b/lib/libesp32/re1.5/util.c @@ -0,0 +1,24 @@ +// Copyright 2007-2009 Russ Cox. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include "re1.5.h" + +void +re1_5_fatal(char *msg) /* prints "fatal error: <msg>" to stderr, then exits the process with status 2 */ +{ + fprintf(stderr, "fatal error: %s\n", msg); + exit(2); +} + +void* +mal(int n) /* returns n bytes of zero-filled heap memory; reports "out of memory" through re1_5_fatal when malloc fails */ +{ + void *v; + + v = malloc(n); + if(v == nil) + re1_5_fatal("out of memory"); + memset(v, 0, n); + return v; +} diff --git a/tasmota/xdrv_52_0_berry_struct.ino b/tasmota/xdrv_52_0_berry_struct.ino index 1c3991b80..61ff080c5 100644 --- a/tasmota/xdrv_52_0_berry_struct.ino +++ b/tasmota/xdrv_52_0_berry_struct.ino @@ -23,6 +23,8 @@ #include #include +#include "re1.5.h" + #define BERRY_CONSOLE_CMD_DELIMITER "\x01" typedef LList_elt log_elt; // store the string after the header to avoid double allocation if we had used char*