mirror of https://github.com/arendst/Tasmota.git
Merge pull request #13802 from s-hadinger/berry_re
Berry add ``import re`` regex module
This commit is contained in:
commit
8256244b94
|
@ -13,6 +13,7 @@ All notable changes to this project will be documented in this file.
|
|||
- Berry support for neopixel (WS2812, SK6812)
|
||||
- Command ``IfxPeriod`` to overrule ``Teleperiod`` for Influx messages (#13750)
|
||||
- OTA over HTTPS (ESP32x only)
|
||||
- Berry add ``import re`` regex module
|
||||
|
||||
### Changed
|
||||
- ESP8266 Gratuitous ARP enabled and set to 60 seconds (#13623)
|
||||
|
|
|
@ -25,6 +25,7 @@ be_extern_native_module(strict);
|
|||
|
||||
/* Tasmota specific */
|
||||
be_extern_native_module(python_compat);
|
||||
be_extern_native_module(re);
|
||||
be_extern_native_module(persist);
|
||||
be_extern_native_module(autoconf);
|
||||
be_extern_native_module(tapp);
|
||||
|
@ -87,6 +88,7 @@ BERRY_LOCAL const bntvmodule* const be_module_table[] = {
|
|||
/* user-defined modules register start */
|
||||
|
||||
&be_native_module(python_compat),
|
||||
&be_native_module(re),
|
||||
&be_native_module(path),
|
||||
&be_native_module(persist),
|
||||
#ifdef USE_AUTOCONF
|
||||
|
|
|
@ -0,0 +1,218 @@
|
|||
/********************************************************************
|
||||
* Tasmota lib
|
||||
*
|
||||
* To use: `import re`
|
||||
*
|
||||
* Regex using re1.5
|
||||
*******************************************************************/
|
||||
#include "be_constobj.h"
|
||||
#include "be_mem.h"
|
||||
#include "re1.5.h"
|
||||
|
||||
/********************************************************************
|
||||
# Berry skeleton for `re` module
|
||||
#
|
||||
|
||||
class re_pattern
|
||||
var _p # comobj containing the compiled bytecode for the pattern
|
||||
|
||||
def search() end
|
||||
def match() end
|
||||
def split() end
|
||||
end
|
||||
|
||||
re = module("re")
|
||||
|
||||
re.compile = def (regex_str) end # native
|
||||
re.match = def (regex_str, str) end # native
|
||||
re.search = def (regex_str, str) end # native
|
||||
|
||||
|
||||
*******************************************************************/
|
||||
|
||||
extern const bclass be_class_re_pattern;
|
||||
|
||||
/*
 * Destructor callback for the common object that wraps a compiled regex.
 * It is the function handed to be_newcomobj() when a pattern is compiled.
 * Releases the native buffer referenced by argument 1, if there is one.
 * Always returns nil to Berry.
 */
int be_free_comobj(bvm* vm) {
  if (be_top(vm) >= 1) {
    void *payload = be_tocomptr(vm, 1);
    if (payload) {
      be_os_free(payload);
    }
  }
  be_return_nil(vm);
}
|
||||
|
||||
// Native functions be_const_func()
|
||||
// Berry: `re.compile(pattern:string) -> instance(be_pattern)`
|
||||
int be_re_compile(bvm *vm) {
|
||||
int32_t argc = be_top(vm); // Get the number of arguments
|
||||
if (argc >= 1 && be_isstring(vm, 1)) {
|
||||
const char * regex_str = be_tostring(vm, 1);
|
||||
int sz = re1_5_sizecode(regex_str);
|
||||
if (sz < 0) {
|
||||
be_raise(vm, "internal_error", "error in regex");
|
||||
}
|
||||
|
||||
ByteProg *code = be_os_malloc(sizeof(ByteProg) + sz);
|
||||
int ret = re1_5_compilecode(code, regex_str);
|
||||
if (ret != 0) {
|
||||
be_raise(vm, "internal_error", "error in regex");
|
||||
}
|
||||
be_pushntvclass(vm, &be_class_re_pattern);
|
||||
be_call(vm, 0);
|
||||
be_newcomobj(vm, code, &be_free_comobj);
|
||||
be_setmember(vm, -2, "_p");
|
||||
be_pop(vm, 1);
|
||||
be_return(vm);
|
||||
}
|
||||
be_raise(vm, "type_error", NULL);
|
||||
}
|
||||
|
||||
|
||||
/*
 * Run an already compiled pattern against `hay` and return the result to Berry.
 *
 *   code         compiled re1.5 program
 *   hay          NUL-terminated subject string
 *   is_anchored  btrue for "match" semantics, bfalse for "search"
 *
 * Returns nil to Berry when there is no match. Otherwise returns a list of
 * strings: element 0 is the whole match, followed by the capture groups.
 * Trailing groups that were never set are dropped. A group in the middle of
 * the list that was never set is reported as an empty string.
 */
int be_re_match_search_run(bvm *vm, ByteProg *code, const char *hay, bbool is_anchored) {
  Subject subj = {hay, hay + strlen(hay)};

  int sub_els = (code->sub + 1) * 2;
  const char *sub[sub_els];
  // A VLA cannot take an initializer. The trimming loop below tests slots
  // for NULL, so every slot must start as NULL rather than indeterminate.
  for (int i = 0; i < sub_els; i++) {
    sub[i] = NULL;
  }

  if (!re1_5_recursiveloopprog(code, &subj, sub, sub_els, is_anchored)) {
    be_return_nil(vm);        // no match
  }

  be_newobject(vm, "list");
  // k = index just past the last slot that was actually set
  int k;
  for (k = sub_els; k > 0; k--) {
    if (sub[k-1]) {
      break;
    }
  }
  // Slots come in (start, end) pairs; sub_els is even so i+1 is always valid
  for (int i = 0; i < k; i += 2) {
    if (sub[i] != NULL && sub[i+1] != NULL) {
      be_pushnstring(vm, sub[i], sub[i+1] - sub[i]);
    } else {
      be_pushnstring(vm, "", 0);   // group did not take part in the match
    }
    be_data_push(vm, -2);
    be_pop(vm, 1);
  }
  be_pop(vm, 1);  // remove list
  be_return(vm);  // return list object
}
|
||||
|
||||
/*
 * Shared implementation of `re.match` and `re.search`.
 * Compiles the pattern in argument 1, runs it once against the string in
 * argument 2, and then discards the compiled program.
 *
 *   is_anchored  btrue for `re.match`, bfalse for `re.search`
 *
 * Raises:
 *   type_error      unless both arguments are strings
 *   internal_error  if the pattern cannot be sized or compiled
 *   memory_error    if the bytecode buffer cannot be allocated
 */
int be_re_match_search(bvm *vm, bbool is_anchored) {
  int32_t argc = be_top(vm); // Get the number of arguments
  if (argc >= 2 && be_isstring(vm, 1) && be_isstring(vm, 2)) {
    const char * regex_str = be_tostring(vm, 1);
    const char * hay = be_tostring(vm, 2);
    int sz = re1_5_sizecode(regex_str);
    if (sz < 0) {
      be_raise(vm, "internal_error", "error in regex");
    }

    ByteProg *code = be_os_malloc(sizeof(ByteProg) + sz);
    if (code == NULL) {
      be_raise(vm, "memory_error", "cannot allocate regex");
    }
    int ret = re1_5_compilecode(code, regex_str);
    if (ret != 0) {
      // be_raise does not return here, so free the buffer first
      be_os_free(code);
      be_raise(vm, "internal_error", "error in regex");
    }
    // The program is only needed for this one call. The result list is built
    // from positions inside `hay`, so nothing refers to `code` afterwards.
    int result = be_re_match_search_run(vm, code, hay, is_anchored);
    be_os_free(code);
    return result;
  }
  be_raise(vm, "type_error", NULL);
}
|
||||
|
||||
// Berry: `re.match(value:int | s:string) -> nil`
|
||||
int be_re_match(bvm *vm) {
|
||||
return be_re_match_search(vm, btrue);
|
||||
}
|
||||
// Berry: `re.search(value:int | s:string) -> nil`
|
||||
int be_re_search(bvm *vm) {
|
||||
return be_re_match_search(vm, bfalse);
|
||||
}
|
||||
|
||||
// Berry: `re_pattern.search(s:string) -> list(string)`
|
||||
int re_pattern_search(bvm *vm) {
|
||||
int32_t argc = be_top(vm); // Get the number of arguments
|
||||
if (argc >= 2 && be_isstring(vm, 2)) {
|
||||
const char * hay = be_tostring(vm, 2);
|
||||
be_getmember(vm, 1, "_p");
|
||||
ByteProg * code = (ByteProg*) be_tocomptr(vm, -1);
|
||||
return be_re_match_search_run(vm, code, hay, bfalse);
|
||||
}
|
||||
be_raise(vm, "type_error", NULL);
|
||||
}
|
||||
|
||||
// Berry: `re_pattern.match(s:string) -> list(string)`
|
||||
int re_pattern_match(bvm *vm) {
|
||||
int32_t argc = be_top(vm); // Get the number of arguments
|
||||
if (argc >= 2 && be_isstring(vm, 2)) {
|
||||
const char * hay = be_tostring(vm, 2);
|
||||
be_getmember(vm, 1, "_p");
|
||||
ByteProg * code = (ByteProg*) be_tocomptr(vm, -1);
|
||||
return be_re_match_search_run(vm, code, hay, btrue);
|
||||
}
|
||||
be_raise(vm, "type_error", NULL);
|
||||
}
|
||||
|
||||
// Berry: `re_pattern.split(s:string) -> list(string)`
|
||||
int re_pattern_split(bvm *vm) {
|
||||
int32_t argc = be_top(vm); // Get the number of arguments
|
||||
if (argc >= 2 && be_isstring(vm, 2)) {
|
||||
const char * hay = be_tostring(vm, 2);
|
||||
be_getmember(vm, 1, "_p");
|
||||
ByteProg * code = (ByteProg*) be_tocomptr(vm, -1);
|
||||
|
||||
Subject subj = {hay, hay + strlen(hay)};
|
||||
|
||||
int sub_els = (code->sub + 1) * 2;
|
||||
const char *sub[sub_els];
|
||||
|
||||
be_newobject(vm, "list");
|
||||
while (1) {
|
||||
if (!re1_5_recursiveloopprog(code, &subj, sub, sub_els, bfalse)) {
|
||||
be_pushnstring(vm, subj.begin, subj.end - subj.begin);
|
||||
be_data_push(vm, -2);
|
||||
be_pop(vm, 1);
|
||||
break;
|
||||
}
|
||||
|
||||
if (sub[0] == NULL || sub[1] == NULL || sub[0] == sub[1]) {
|
||||
be_raise(vm, "internal_error", "can't match");
|
||||
}
|
||||
be_pushnstring(vm, subj.begin, sub[0] - subj.begin);
|
||||
be_data_push(vm, -2);
|
||||
be_pop(vm, 1);
|
||||
subj.begin = sub[1];
|
||||
}
|
||||
be_pop(vm, 1); // remove list
|
||||
be_return(vm); // return list object
|
||||
|
||||
}
|
||||
be_raise(vm, "type_error", NULL);
|
||||
}
|
||||
|
||||
/********************************************************************
|
||||
** Solidified module: re
|
||||
********************************************************************/
|
||||
/* Constant member table of the Berry module "re": binds the names
 * "compile", "search" and "match" to be_re_compile, be_re_search and
 * be_re_match. The numbers inside be_nested_key() appear to be a
 * precomputed hash, the key length and a chain index (TODO confirm), so
 * this table should be regenerated rather than edited by hand. */
be_local_module(re,
|
||||
"re",
|
||||
be_nested_map(3,
|
||||
( (struct bmapnode*) &(const bmapnode[]) {
|
||||
{ be_nested_key("compile", 1000265118, 7, -1), be_const_func(be_re_compile) },
|
||||
{ be_nested_key("search", -2144130903, 6, 2), be_const_func(be_re_search) },
|
||||
{ be_nested_key("match", 2116038550, 5, -1), be_const_func(be_re_match) },
|
||||
}))
|
||||
);
|
||||
BE_EXPORT_VARIABLE be_define_const_native_module(re);
|
||||
/********************************************************************/
|
||||
|
||||
// ===================================================================
|
||||
|
||||
/********************************************************************
|
||||
** Solidified class: re_pattern
|
||||
********************************************************************/
|
||||
/* Constant definition of the Berry class "re_pattern": one instance
 * variable "_p" (slot 0) plus the native methods "search", "match" and
 * "split". The numbers inside be_nested_key() appear to be a precomputed
 * hash, the key length and a chain index (TODO confirm), so this table
 * should be regenerated rather than edited by hand. */
be_local_class(re_pattern,
|
||||
1,
|
||||
NULL,
|
||||
be_nested_map(4,
|
||||
( (struct bmapnode*) &(const bmapnode[]) {
|
||||
{ be_nested_key("_p", 1594591802, 2, -1), be_const_var(0) },
|
||||
{ be_nested_key("search", -2144130903, 6, -1), be_const_func(re_pattern_search) },
|
||||
{ be_nested_key("match", 2116038550, 5, 0), be_const_func(re_pattern_match) },
|
||||
{ be_nested_key("split", -2017972765, 5, -1), be_const_func(re_pattern_split) },
|
||||
})),
|
||||
(be_nested_const_str("re_pattern", 2041968961, 10))
|
||||
);
|
||||
/*******************************************************************/
|
||||
|
|
@ -0,0 +1,27 @@
|
|||
// Copyright (c) 2007-2009 Russ Cox, Google Inc. All rights reserved.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above
|
||||
// copyright notice, this list of conditions and the following disclaimer
|
||||
// in the documentation and/or other materials provided with the
|
||||
// distribution.
|
||||
// * Neither the name of Google, Inc. nor the names of its
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
@ -0,0 +1,44 @@
|
|||
# Copyright 2007-2009 Russ Cox. All Rights Reserved.
|
||||
# Use of this source code is governed by a BSD-style
|
||||
# license that can be found in the LICENSE file.
|
||||
|
||||
CC=gcc
|
||||
|
||||
CFLAGS=-g -Wall -Os
|
||||
# Uncomment the next line (it overrides the CFLAGS above) when developing/testing
|
||||
#CFLAGS=-DDEBUG -g -Wall -O0
|
||||
|
||||
TARGET=re
|
||||
OFILES=\
|
||||
backtrack.o\
|
||||
compile.o\
|
||||
main.o\
|
||||
pike.o\
|
||||
recursive.o\
|
||||
recursiveloop.o\
|
||||
sub.o\
|
||||
thompson.o\
|
||||
compilecode.o\
|
||||
dumpcode.o\
|
||||
charclass.o\
|
||||
cleanmarks.o\
|
||||
util.o\
|
||||
y.tab.o\
|
||||
|
||||
HFILES=\
|
||||
re1.5.h\
|
||||
|
||||
$(TARGET): $(OFILES)
|
||||
$(CC) $(CFLAGS) -o $(TARGET) $(OFILES)
|
||||
|
||||
%.o: %.c $(HFILES)
|
||||
$(CC) -c $(CFLAGS) $*.c
|
||||
|
||||
y.tab.h y.tab.c: parse.y
|
||||
bison -v -y parse.y
|
||||
|
||||
test: $(TARGET)
|
||||
./run-tests $(TFLAGS)
|
||||
|
||||
clean:
|
||||
rm -f *.o core $(TARGET) y.tab.[ch] y.output
|
|
@ -0,0 +1,48 @@
|
|||
What is re1.5?
|
||||
==============
|
||||
|
||||
re1 (http://code.google.com/p/re1/) is a "toy regular expression implementation"
|
||||
by Russ Cox, featuring simplicity and minimal code size unheard of in other
|
||||
implementations. re2 (http://code.google.com/p/re2/) is "an efficient,
|
||||
principled regular expression library" by the same author. It is robust,
|
||||
full-featured, and ... bloated, compared to re1.
|
||||
|
||||
re1.5 is an attempt to start with re1 codebase and add features required for
|
||||
minimalistic real-world use, while sticking to the minimal code size and
|
||||
memory use.
|
||||
|
||||
Why?
|
||||
====
|
||||
re1.5 is intended for use in highly constrained, e.g. embedded, environments,
|
||||
where offering familiar high-level string matching functionality is still
|
||||
beneficial.
|
||||
|
||||
Features
|
||||
========
|
||||
|
||||
* Like re1, re1.5 retains a design where a compiled expression can be executed
|
||||
(matched) by multiple backends, each with its own distinctive design and
|
||||
runtime properties (complexity and memory usage).
|
||||
* Unlike re1, regexes are compiled to memory-efficient bytecode. Exact size
|
||||
of the bytecode can be found prior to compilation (for memory allocation).
|
||||
* External API functions feature a namespace prefix to improve clarity and
|
||||
avoid name clashes when integrating into other projects.
|
||||
* Matchers are NUL-char clean and take size of the input string as a param.
|
||||
* Support for quoted chars in regex.
|
||||
* Support for ^, $ assertions in regex.
|
||||
* Support for "match" vs "search" operations, as common in other regex APIs.
|
||||
* Support for named character classes: \d \D \s \S \w \W.
|
||||
|
||||
TODO
|
||||
====
|
||||
|
||||
* Support for repetition operator {n} and {n,m}.
|
||||
* Support for Unicode (UTF-8).
|
||||
* Support for matching flags like case-insensitive, dot matches all,
|
||||
multiline, etc.
|
||||
* Support for more assertions like \A, \Z.
|
||||
|
||||
Author and License
|
||||
==================
|
||||
re1.5 is maintained by Paul Sokolovsky pfalcon at users.sourceforge.net and
|
||||
licensed under BSD license, just as the original re1.
|
|
@ -0,0 +1,117 @@
|
|||
// Copyright 2007-2009 Russ Cox. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#include "re1.5.h"
|
||||
|
||||
typedef struct Thread Thread;
|
||||
struct Thread
|
||||
{
|
||||
char *pc;
|
||||
const char *sp;
|
||||
Sub *sub;
|
||||
};
|
||||
|
||||
static Thread
|
||||
thread(char *pc, const char *sp, Sub *sub)
|
||||
{
|
||||
Thread t = {pc, sp, sub};
|
||||
return t;
|
||||
}
|
||||
|
||||
int
|
||||
re1_5_backtrack(ByteProg *prog, Subject *input, const char **subp, int nsubp, int is_anchored)
|
||||
{
|
||||
enum { MAX = 1000 };
|
||||
Thread ready[MAX];
|
||||
int i, nready;
|
||||
char *pc;
|
||||
const char *sp;
|
||||
Sub *sub;
|
||||
int off;
|
||||
|
||||
/* queue initial thread */
|
||||
sub = newsub(nsubp);
|
||||
for(i=0; i<nsubp; i++)
|
||||
sub->sub[i] = nil;
|
||||
ready[0] = thread(HANDLE_ANCHORED(prog->insts, is_anchored), input->begin, sub);
|
||||
nready = 1;
|
||||
|
||||
/* run threads in stack order */
|
||||
while(nready > 0) {
|
||||
--nready; /* pop state for next thread to run */
|
||||
pc = ready[nready].pc;
|
||||
sp = ready[nready].sp;
|
||||
sub = ready[nready].sub;
|
||||
assert(sub->ref > 0);
|
||||
for(;;) {
|
||||
if(inst_is_consumer(*pc)) {
|
||||
// If we need to match a character, but there's none left, it's fail
|
||||
if(sp >= input->end)
|
||||
goto Dead;
|
||||
}
|
||||
switch(*pc++) {
|
||||
case Char:
|
||||
if(*sp != *pc++)
|
||||
goto Dead;
|
||||
case Any:
|
||||
sp++;
|
||||
continue;
|
||||
case Class:
|
||||
case ClassNot:
|
||||
if (!_re1_5_classmatch(pc, sp))
|
||||
goto Dead;
|
||||
pc += *(unsigned char*)pc * 2 + 1;
|
||||
sp++;
|
||||
continue;
|
||||
case NamedClass:
|
||||
if (!_re1_5_namedclassmatch(pc, sp))
|
||||
goto Dead;
|
||||
pc++;
|
||||
sp++;
|
||||
continue;
|
||||
case Match:
|
||||
for(i=0; i<nsubp; i++)
|
||||
subp[i] = sub->sub[i];
|
||||
decref(sub);
|
||||
return 1;
|
||||
case Jmp:
|
||||
off = (signed char)*pc++;
|
||||
pc = pc + off;
|
||||
continue;
|
||||
case Split:
|
||||
if(nready >= MAX)
|
||||
re1_5_fatal("backtrack overflow");
|
||||
off = (signed char)*pc++;
|
||||
ready[nready++] = thread(pc + off, sp, incref(sub));
|
||||
// pc = pc->x; /* continue current thread */
|
||||
continue;
|
||||
case RSplit:
|
||||
if(nready >= MAX)
|
||||
re1_5_fatal("backtrack overflow");
|
||||
off = (signed char)*pc++;
|
||||
ready[nready++] = thread(pc, sp, incref(sub));
|
||||
pc = pc + off;
|
||||
continue;
|
||||
case Save:
|
||||
off = (unsigned char)*pc++;
|
||||
sub = update(sub, off, sp);
|
||||
continue;
|
||||
case Bol:
|
||||
if(sp != input->begin)
|
||||
goto Dead;
|
||||
continue;
|
||||
case Eol:
|
||||
if(sp != input->end)
|
||||
goto Dead;
|
||||
continue;
|
||||
default:
|
||||
re1_5_fatal("backtrack");
|
||||
}
|
||||
}
|
||||
Dead:
|
||||
decref(sub);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
|
@ -0,0 +1,33 @@
|
|||
#include "re1.5.h"
|
||||
|
||||
int _re1_5_classmatch(const char *pc, const char *sp)
|
||||
{
|
||||
// pc points to "cnt" byte after opcode
|
||||
int is_positive = (pc[-1] == Class);
|
||||
int cnt = *pc++;
|
||||
while (cnt--) {
|
||||
if (*sp >= *pc && *sp <= pc[1]) return is_positive;
|
||||
pc += 2;
|
||||
}
|
||||
return !is_positive;
|
||||
}
|
||||
|
||||
/*
 * Test the character at *sp against a named class (\d \D \s \S \w \W).
 *
 *   pc  points at the class letter; lower case selects the class itself,
 *       upper case its complement
 *   sp  points at the subject character to test
 *
 * Any letter other than d/D and s/S is treated as w/W.
 * Returns 1 when the character is accepted, 0 otherwise.
 */
int _re1_5_namedclassmatch(const char *pc, const char *sp)
{
    // pc points to name of class
    // Bit 5 is set for lower-case letters, i.e. for the non-negated classes
    const int plain = (*pc >> 5) & 1;
    int member;

    switch (*pc | 0x20) {
    case 'd':
        member = (*sp >= '0' && *sp <= '9');
        break;
    case 's':
        member = (*sp == ' ' || (*sp >= '\t' && *sp <= '\r'));
        break;
    default: // w
        member = ((*sp >= 'A' && *sp <= 'Z') || (*sp >= 'a' && *sp <= 'z') ||
                  (*sp >= '0' && *sp <= '9') || *sp == '_');
        break;
    }
    return member ? plain : (plain ^ 1);
}
|
|
@ -0,0 +1,39 @@
|
|||
// Copyright 2014 Paul Sokolovsky.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#include "re1.5.h"
|
||||
|
||||
void
|
||||
cleanmarks(ByteProg *prog)
|
||||
{
|
||||
char *pc = prog->insts;
|
||||
char *end = pc + prog->bytelen;
|
||||
while (pc < end) {
|
||||
*pc &= 0x7f;
|
||||
switch (*pc) {
|
||||
case Class:
|
||||
case ClassNot:
|
||||
pc += (unsigned char)pc[1] * 2;
|
||||
case NamedClass:
|
||||
case Jmp:
|
||||
case Split:
|
||||
case RSplit:
|
||||
case Save:
|
||||
case Char:
|
||||
pc++;
|
||||
break;
|
||||
#ifdef DEBUG
|
||||
case Bol:
|
||||
case Eol:
|
||||
case Any:
|
||||
case Match:
|
||||
break;
|
||||
default:
|
||||
printf("Unknown instruction 0x%02x pc %ld\n", (unsigned char)*pc, pc - prog->insts);
|
||||
re1_5_fatal("cleanmarks");
|
||||
#endif
|
||||
}
|
||||
pc++;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,179 @@
|
|||
// Copyright 2007-2009 Russ Cox. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
#ifdef DEBUG
|
||||
|
||||
#include "re1.5.h"
|
||||
|
||||
static Inst *pc;
|
||||
static int count(Regexp*);
|
||||
static void emit(Regexp*);
|
||||
|
||||
Prog*
|
||||
compile(Regexp *r)
|
||||
{
|
||||
int n;
|
||||
Prog *p;
|
||||
|
||||
n = count(r) + 1;
|
||||
p = mal(sizeof *p + n*sizeof p->start[0]);
|
||||
p->start = (Inst*)(p+1);
|
||||
pc = p->start;
|
||||
emit(r);
|
||||
pc->opcode = Match;
|
||||
pc++;
|
||||
p->len = pc - p->start;
|
||||
return p;
|
||||
}
|
||||
|
||||
// how many instructions does r need?
|
||||
static int
|
||||
count(Regexp *r)
|
||||
{
|
||||
switch(r->type) {
|
||||
default:
|
||||
re1_5_fatal("bad count");
|
||||
case Alt:
|
||||
return 2 + count(r->left) + count(r->right);
|
||||
case Cat:
|
||||
return count(r->left) + count(r->right);
|
||||
case Lit:
|
||||
case Dot:
|
||||
return 1;
|
||||
case Paren:
|
||||
return 2 + count(r->left);
|
||||
case Quest:
|
||||
return 1 + count(r->left);
|
||||
case Star:
|
||||
return 2 + count(r->left);
|
||||
case Plus:
|
||||
return 1 + count(r->left);
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
emit(Regexp *r)
|
||||
{
|
||||
Inst *p1, *p2, *t;
|
||||
|
||||
switch(r->type) {
|
||||
default:
|
||||
re1_5_fatal("bad emit");
|
||||
|
||||
case Alt:
|
||||
pc->opcode = Split;
|
||||
p1 = pc++;
|
||||
p1->x = pc;
|
||||
emit(r->left);
|
||||
pc->opcode = Jmp;
|
||||
p2 = pc++;
|
||||
p1->y = pc;
|
||||
emit(r->right);
|
||||
p2->x = pc;
|
||||
break;
|
||||
|
||||
case Cat:
|
||||
emit(r->left);
|
||||
emit(r->right);
|
||||
break;
|
||||
|
||||
case Lit:
|
||||
pc->opcode = Char;
|
||||
pc->c = r->ch;
|
||||
pc++;
|
||||
break;
|
||||
|
||||
case Dot:
|
||||
pc++->opcode = Any;
|
||||
break;
|
||||
|
||||
case Paren:
|
||||
pc->opcode = Save;
|
||||
pc->n = 2*r->n;
|
||||
pc++;
|
||||
emit(r->left);
|
||||
pc->opcode = Save;
|
||||
pc->n = 2*r->n + 1;
|
||||
pc++;
|
||||
break;
|
||||
|
||||
case Quest:
|
||||
pc->opcode = Split;
|
||||
p1 = pc++;
|
||||
p1->x = pc;
|
||||
emit(r->left);
|
||||
p1->y = pc;
|
||||
if(r->n) { // non-greedy
|
||||
t = p1->x;
|
||||
p1->x = p1->y;
|
||||
p1->y = t;
|
||||
}
|
||||
break;
|
||||
|
||||
case Star:
|
||||
pc->opcode = Split;
|
||||
p1 = pc++;
|
||||
p1->x = pc;
|
||||
emit(r->left);
|
||||
pc->opcode = Jmp;
|
||||
pc->x = p1;
|
||||
pc++;
|
||||
p1->y = pc;
|
||||
if(r->n) { // non-greedy
|
||||
t = p1->x;
|
||||
p1->x = p1->y;
|
||||
p1->y = t;
|
||||
}
|
||||
break;
|
||||
|
||||
case Plus:
|
||||
p1 = pc;
|
||||
emit(r->left);
|
||||
pc->opcode = Split;
|
||||
pc->x = p1;
|
||||
p2 = pc;
|
||||
pc++;
|
||||
p2->y = pc;
|
||||
if(r->n) { // non-greedy
|
||||
t = p2->x;
|
||||
p2->x = p2->y;
|
||||
p2->y = t;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
printprog(Prog *p)
|
||||
{
|
||||
Inst *pc, *e;
|
||||
|
||||
pc = p->start;
|
||||
e = p->start + p->len;
|
||||
|
||||
for(; pc < e; pc++) {
|
||||
switch(pc->opcode) {
|
||||
default:
|
||||
re1_5_fatal("printprog");
|
||||
case Split:
|
||||
printf("%2d. split %d, %d\n", (int)(pc-p->start), (int)(pc->x-p->start), (int)(pc->y-p->start));
|
||||
break;
|
||||
case Jmp:
|
||||
printf("%2d. jmp %d\n", (int)(pc-p->start), (int)(pc->x-p->start));
|
||||
break;
|
||||
case Char:
|
||||
printf("%2d. char %c\n", (int)(pc-p->start), pc->c);
|
||||
break;
|
||||
case Any:
|
||||
printf("%2d. any\n", (int)(pc-p->start));
|
||||
break;
|
||||
case Match:
|
||||
printf("%2d. match\n", (int)(pc-p->start));
|
||||
break;
|
||||
case Save:
|
||||
printf("%2d. save %d\n", (int)(pc-p->start), pc->n);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#endif //DEBUG
|
|
@ -0,0 +1,256 @@
|
|||
// Copyright 2014-2019 Paul Sokolovsky.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#include "re1.5.h"
|
||||
|
||||
#define INSERT_CODE(at, num, pc) \
|
||||
((code ? memmove(code + at + num, code + at, pc - at) : (void)0), pc += num)
|
||||
#define REL(at, to) (to - at - 2)
|
||||
#define EMIT(at, byte) (code ? (code[at] = byte) : (void)(at))
|
||||
#define PC (prog->bytelen)
|
||||
|
||||
/*
 * Translate regex text into bytecode, or only measure it.
 *
 *   re_loc    in/out: where to start reading. On return it points at the
 *             character where parsing stopped: the terminating NUL, an
 *             unmatched ')', or the offending character on error.
 *   prog      program being built; bytelen, len and sub are updated
 *   sizecode  non-zero: nothing is written to prog->insts, the counters are
 *             advanced only (sizing pass)
 *
 * Parsing stops at the end of the string or at a ')'. Groups are handled by
 * a recursive call. Returns RE1_5_SUCCESS, RE1_5_SYNTAX_ERROR,
 * RE1_5_UNSUPPORTED_ESCAPE or RE1_5_UNSUPPORTED_SYNTAX.
 *
 * NOTE(review): jump offsets and the class pair count are stored in single
 * bytes and are not range-checked here - confirm whether very long patterns
 * need an explicit limit.
 */
static int _compilecode(const char **re_loc, ByteProg *prog, int sizecode)
{
    const char *re = *re_loc;
    char *code = sizecode ? NULL : prog->insts;
    int start = PC;       // first byte of the current alternation
    int term = PC;        // first byte of the most recent term (repeat target)
    int alt_label = 0;    // offset byte of the pending Jmp from a previous '|'

    for (; *re && *re != ')'; re++) {
        switch (*re) {
        case '\\': {
            re++;
            if (!*re) goto syntax_error; // Trailing backslash
            char c = *re | 0x20;
            if (c == 'd' || c == 's' || c == 'w') {
                term = PC;
                EMIT(PC++, NamedClass);
                EMIT(PC++, *re);
                prog->len++;
                break;
            }
            if ((c >= '0' && c <= '9') || (c >= 'a' && c <= 'z')) {
                goto unsupported_escape;
            }
        }
        /* fallthrough: any other escaped character is a literal */
        default:
            term = PC;
            EMIT(PC++, Char);
            EMIT(PC++, *re);
            prog->len++;
            break;
        case '.':
            term = PC;
            EMIT(PC++, Any);
            prog->len++;
            break;
        case '[': {
            int cnt;
            term = PC;
            re++;
            if (*re == '^') {
                EMIT(PC++, ClassNot);
                re++;
            } else {
                EMIT(PC++, Class);
            }
            PC++; // Skip "# of pairs" byte
            prog->len++;
            for (cnt = 0; *re != ']'; re++, cnt++) {
                if (!*re) goto syntax_error;
                if (*re == '\\') {
                    re++;
                    if (!*re) goto syntax_error;
                    if (*re != '\\' && *re != ']') goto unsupported_escape;
                }
                EMIT(PC++, *re);
                if (re[1] == '-' && re[2] != ']') {
                    re += 2;
                    // Pattern ended right after the '-' (e.g. "[a-"). Stop
                    // here, otherwise the loop increment steps past the NUL.
                    if (!*re) goto syntax_error;
                }
                EMIT(PC++, *re);
            }
            EMIT(term + 1, cnt);
            break;
        }
        case '(': {
            term = PC;
            int sub;
            int capture = 1;
            re++;
            if (*re == '?') {
                re++;
                if (*re == ':') {
                    capture = 0;
                    re++;
                } else {
                    *re_loc = re;
                    return RE1_5_UNSUPPORTED_SYNTAX;
                }
            }

            if (capture) {
                sub = ++prog->sub;
                EMIT(PC++, Save);
                EMIT(PC++, 2 * sub);
                prog->len++;
            }

            int res = _compilecode(&re, prog, sizecode);
            *re_loc = re;
            if (res < 0) return res;
            if (*re != ')') return RE1_5_SYNTAX_ERROR;

            if (capture) {
                EMIT(PC++, Save);
                EMIT(PC++, 2 * sub + 1);
                prog->len++;
            }

            break;
        }
        case '{':
            *re_loc = re;
            return RE1_5_UNSUPPORTED_SYNTAX;
        case '?':
            if (PC == term) goto syntax_error; // nothing to repeat
            INSERT_CODE(term, 2, PC);
            if (re[1] == '?') {
                EMIT(term, RSplit);
                re++;
            } else {
                EMIT(term, Split);
            }
            EMIT(term + 1, REL(term, PC));
            prog->len++;
            term = PC;
            break;
        case '*':
            if (PC == term) goto syntax_error; // nothing to repeat
            INSERT_CODE(term, 2, PC);
            EMIT(PC, Jmp);
            EMIT(PC + 1, REL(PC, term));
            PC += 2;
            if (re[1] == '?') {
                EMIT(term, RSplit);
                re++;
            } else {
                EMIT(term, Split);
            }
            EMIT(term + 1, REL(term, PC));
            prog->len += 2;
            term = PC;
            break;
        case '+':
            if (PC == term) goto syntax_error; // nothing to repeat
            if (re[1] == '?') {
                EMIT(PC, Split);
                re++;
            } else {
                EMIT(PC, RSplit);
            }
            EMIT(PC + 1, REL(PC, term));
            PC += 2;
            prog->len++;
            term = PC;
            break;
        case '|':
            // Patch the Jmp left by the previous alternative, if any
            if (alt_label) {
                EMIT(alt_label, REL(alt_label, PC) + 1);
            }
            INSERT_CODE(start, 2, PC);
            EMIT(PC++, Jmp);
            alt_label = PC++;
            EMIT(start, Split);
            EMIT(start + 1, REL(start, PC));
            prog->len += 2;
            term = PC;
            break;
        case '^':
            EMIT(PC++, Bol);
            prog->len++;
            term = PC;
            break;
        case '$':
            EMIT(PC++, Eol);
            prog->len++;
            term = PC;
            break;
        }
    }

    if (alt_label) {
        EMIT(alt_label, REL(alt_label, PC) + 1);
    }

    *re_loc = re;
    return RE1_5_SUCCESS;

syntax_error:
    *re_loc = re;
    return RE1_5_SYNTAX_ERROR;

unsupported_escape:
    *re_loc = re;
    return RE1_5_UNSUPPORTED_ESCAPE;
}
|
||||
|
||||
int re1_5_sizecode(const char *re)
|
||||
{
|
||||
ByteProg dummyprog = {
|
||||
// Save 0, Save 1, Match; more bytes for "search" (vs "match") prefix code
|
||||
.bytelen = 5 + NON_ANCHORED_PREFIX
|
||||
};
|
||||
|
||||
int res = _compilecode(&re, &dummyprog, /*sizecode*/1);
|
||||
if (res < 0) return res;
|
||||
// If unparsed chars left
|
||||
if (*re) return RE1_5_SYNTAX_ERROR;
|
||||
|
||||
return dummyprog.bytelen;
|
||||
}
|
||||
|
||||
/* Append one bytecode byte to prog and advance its byte length. */
static void put_code_byte(ByteProg *prog, int byte)
{
    prog->insts[prog->bytelen++] = byte;
}

/*
 * Compile `re` into `prog`. The caller must have allocated prog with room
 * for the number of bytes reported by re1_5_sizecode().
 *
 * Program layout: a search prefix, Save 0, the compiled pattern, Save 1,
 * Match. Returns RE1_5_SUCCESS or a negative error code.
 */
int re1_5_compilecode(ByteProg *prog, const char *re)
{
    int status;

    prog->len = 0;
    prog->bytelen = 0;
    prog->sub = 0;

    // Add code to implement non-anchored operation ("search").
    // For anchored operation ("match"), this code will be just skipped.
    // TODO: Implement search in much more efficient manner
    put_code_byte(prog, RSplit);
    put_code_byte(prog, 3);
    put_code_byte(prog, Any);
    put_code_byte(prog, Jmp);
    put_code_byte(prog, -5);
    prog->len += 3;

    // Start of the whole match
    put_code_byte(prog, Save);
    put_code_byte(prog, 0);
    prog->len++;

    status = _compilecode(&re, prog, /*sizecode*/0);
    if (status < 0) {
        return status;
    }
    if (*re != '\0') {
        // Unparsed characters left over
        return RE1_5_SYNTAX_ERROR;
    }

    // End of the whole match
    put_code_byte(prog, Save);
    put_code_byte(prog, 1);
    prog->len++;

    put_code_byte(prog, Match);
    prog->len++;

    return RE1_5_SUCCESS;
}
|
||||
|
||||
#if 0
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
int pc = 0;
|
||||
ByteProg *code = re1_5_compilecode(argv[1]);
|
||||
re1_5_dumpcode(code);
|
||||
}
|
||||
#endif
|
|
@ -0,0 +1,65 @@
|
|||
// Copyright 2014 Paul Sokolovsky.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#include "re1.5.h"
|
||||
|
||||
/*
 * Print a disassembly of prog to stdout: one line per instruction with its
 * byte offset, followed by a summary of the byte and instruction counts.
 * Jump targets are shown as an absolute offset with the raw relative offset
 * in parentheses. An unknown opcode trips assert(0).
 */
void re1_5_dumpcode(ByteProg *prog)
{
    int pc = 0;
    char *code = prog->insts;
    while (pc < prog->bytelen) {
        printf("%2d: ", pc);
        switch(code[pc++]) {
        default:
            assert(0);
//            re1_5_fatal("printprog");
            /* fallthrough (only reachable when assertions are disabled) */
        case Split:
            printf("split %d (%d)\n", pc + (signed char)code[pc] + 1, (signed char)code[pc]);
            pc++;
            break;
        case RSplit:
            printf("rsplit %d (%d)\n", pc + (signed char)code[pc] + 1, (signed char)code[pc]);
            pc++;
            break;
        case Jmp:
            printf("jmp %d (%d)\n", pc + (signed char)code[pc] + 1, (signed char)code[pc]);
            pc++;
            break;
        case Char:
            printf("char %c\n", code[pc++]);
            break;
        case Any:
            printf("any\n");
            break;
        case Class:
        case ClassNot: {
            // The pair count and the range bounds are raw bytes. Read them as
            // unsigned so values of 0x80 and above are not sign-extended.
            int num = (unsigned char)code[pc];
            printf("class%s %d", (code[pc - 1] == ClassNot ? "not" : ""), num);
            pc++;
            while (num--) {
                printf(" 0x%02x-0x%02x", (unsigned char)code[pc], (unsigned char)code[pc + 1]);
                pc += 2;
            }
            printf("\n");
            break;
        }
        case NamedClass:
            printf("namedclass %c\n", code[pc++]);
            break;
        case Match:
            printf("match\n");
            break;
        case Save:
            printf("save %d\n", (unsigned char)code[pc++]);
            break;
        case Bol:
            printf("assert bol\n");
            break;
        case Eol:
            printf("assert eol\n");
            break;
        }
    }
    printf("Bytes: %d, insts: %d\n", prog->bytelen, prog->len);
}
|
|
@ -0,0 +1,13 @@
|
|||
{
|
||||
"name": "re1.5",
|
||||
"keywords": "esp32, re",
|
||||
"description": "Regex",
|
||||
"version": "0.9",
|
||||
"repository":
|
||||
{
|
||||
"type": "git",
|
||||
"url": "https://github.com/pfalcon/re1.5"
|
||||
},
|
||||
"frameworks": "*",
|
||||
"platforms": "*"
|
||||
}
|
|
@ -0,0 +1,150 @@
|
|||
// Copyright 2007-2009 Russ Cox. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#include "re1.5.h"
|
||||
|
||||
struct {
|
||||
char *name;
|
||||
int (*fn)(ByteProg*, Subject*, const char**, int, int);
|
||||
} tab[] = {
|
||||
{"recursive", re1_5_recursiveprog},
|
||||
{"recursiveloop", re1_5_recursiveloopprog},
|
||||
{"backtrack", re1_5_backtrack},
|
||||
{"thompson", re1_5_thompsonvm},
|
||||
{"pike", re1_5_pikevm},
|
||||
};
|
||||
|
||||
#ifdef DEBUG
|
||||
int debug;
|
||||
#endif
|
||||
const char *re_engine;
|
||||
|
||||
/* Write the command-line help to stderr and exit with status 2. */
void
usage(void)
{
	fputs("Usage: re [-hmd] [-e ENGINE] <regexp> <string>...\n"
		"-h: Print help message and exit\n"
		"-m: String is anchored\n"
		"-e ENGINE: Specify one of: recursive recursiveloop backtrack thompson pike\n",
		stderr);
#ifdef DEBUG
	fputs("-d: Print debug messages\n", stderr);
#endif
	exit(2);
}
|
||||
|
||||
int
|
||||
main(int argc, char **argv)
|
||||
{
|
||||
int i, j, k, l;
|
||||
int is_anchored = 0;
|
||||
|
||||
argv++;
|
||||
argc--;
|
||||
while (argc > 0 && argv[0][0] == '-') {
|
||||
char *arg;
|
||||
for (arg = &argv[0][1]; *arg; arg++) {
|
||||
switch (*arg) {
|
||||
case 'h':
|
||||
usage();
|
||||
break;
|
||||
case 'm':
|
||||
is_anchored = 1;
|
||||
break;
|
||||
#ifdef DEBUG
|
||||
case 'd':
|
||||
debug = 1;
|
||||
break;
|
||||
#endif
|
||||
case 'e':
|
||||
if (argv[1] == NULL)
|
||||
re1_5_fatal("-e: Missing Regex engine argument");
|
||||
if (re_engine)
|
||||
re1_5_fatal("-e: Regex engine already specified");
|
||||
re_engine = argv[1];
|
||||
argv++;
|
||||
argc--;
|
||||
break;
|
||||
default:
|
||||
re1_5_fatal("Unknown flag");
|
||||
}
|
||||
}
|
||||
argv++;
|
||||
argc--;
|
||||
}
|
||||
|
||||
if(argc < 2)
|
||||
usage();
|
||||
|
||||
#ifdef ODEBUG
|
||||
// Old and unmaintained code
|
||||
Regexp *re = parse(argv[0]);
|
||||
printre(re);
|
||||
printf("\n");
|
||||
|
||||
Prog *prog = compile(re);
|
||||
printprog(prog);
|
||||
printf("=============\n");
|
||||
#endif
|
||||
int sz = re1_5_sizecode(argv[0]);
|
||||
#ifdef DEBUG
|
||||
if (debug) printf("Precalculated size: %d\n", sz);
|
||||
#endif
|
||||
if (sz == -1) {
|
||||
re1_5_fatal("Error in regexp");
|
||||
}
|
||||
|
||||
ByteProg *code = malloc(sizeof(ByteProg) + sz);
|
||||
int ret = re1_5_compilecode(code, argv[0]);
|
||||
if (ret != 0) {
|
||||
re1_5_fatal("Error in regexp");
|
||||
}
|
||||
|
||||
int sub_els = (code->sub + 1) * 2;
|
||||
#ifdef DEBUG
|
||||
if (debug) re1_5_dumpcode(code);
|
||||
#endif
|
||||
const char *sub[sub_els];
|
||||
int engine_found = 0;
|
||||
for(i=1; i<argc; i++) {
|
||||
printf("#%d %s\n", i, argv[i]);
|
||||
for(j=0; j<nelem(tab); j++) {
|
||||
Subject subj = {argv[i], argv[i] + strlen(argv[i])};
|
||||
if (re_engine) {
|
||||
if (0 != strcmp(re_engine, tab[j].name))
|
||||
continue;
|
||||
engine_found = 1;
|
||||
}
|
||||
printf("%s ", tab[j].name);
|
||||
memset(sub, 0, sub_els * sizeof sub[0]);
|
||||
if(!tab[j].fn(code, &subj, sub, sub_els, is_anchored)) {
|
||||
printf("-no match-\n");
|
||||
continue;
|
||||
}
|
||||
printf("match");
|
||||
for(k=sub_els; k>0; k--)
|
||||
if(sub[k-1])
|
||||
break;
|
||||
for(l=0; l<k; l+=2) {
|
||||
printf(" (");
|
||||
if(sub[l] == nil)
|
||||
printf("?");
|
||||
else
|
||||
printf("%d", (int)(sub[l] - argv[i]));
|
||||
printf(",");
|
||||
if(sub[l+1] == nil)
|
||||
printf("?");
|
||||
else
|
||||
printf("%d", (int)(sub[l+1] - argv[i]));
|
||||
printf(")");
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
if (re_engine && !engine_found)
|
||||
re1_5_fatal("-e: Unknown engine name");
|
||||
}
|
||||
|
||||
free(code);
|
||||
return 0;
|
||||
}
|
|
@ -0,0 +1,229 @@
|
|||
// Copyright 2007-2009 Russ Cox. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
%{
|
||||
#include "re1.5.h"
|
||||
|
||||
static int yylex(void);
|
||||
static void yyerror(char*);
|
||||
static Regexp *parsed_regexp;
|
||||
static int nparen;
|
||||
|
||||
%}
|
||||
|
||||
%union {
|
||||
Regexp *re;
|
||||
int c;
|
||||
int nparen;
|
||||
}
|
||||
|
||||
%token <c> CHAR EOL
|
||||
%type <re> alt concat repeat single line
|
||||
%type <nparen> count
|
||||
|
||||
%%
|
||||
|
||||
line: alt EOL
|
||||
{
|
||||
parsed_regexp = $1;
|
||||
return 1;
|
||||
}
|
||||
|
||||
alt:
|
||||
concat
|
||||
| alt '|' concat
|
||||
{
|
||||
$$ = reg(Alt, $1, $3);
|
||||
}
|
||||
;
|
||||
|
||||
concat:
|
||||
repeat
|
||||
| concat repeat
|
||||
{
|
||||
$$ = reg(Cat, $1, $2);
|
||||
}
|
||||
;
|
||||
|
||||
repeat:
|
||||
single
|
||||
| single '*'
|
||||
{
|
||||
$$ = reg(Star, $1, nil);
|
||||
}
|
||||
| single '*' '?'
|
||||
{
|
||||
$$ = reg(Star, $1, nil);
|
||||
$$->n = 1;
|
||||
}
|
||||
| single '+'
|
||||
{
|
||||
$$ = reg(Plus, $1, nil);
|
||||
}
|
||||
| single '+' '?'
|
||||
{
|
||||
$$ = reg(Plus, $1, nil);
|
||||
$$->n = 1;
|
||||
}
|
||||
| single '?'
|
||||
{
|
||||
$$ = reg(Quest, $1, nil);
|
||||
}
|
||||
| single '?' '?'
|
||||
{
|
||||
$$ = reg(Quest, $1, nil);
|
||||
$$->n = 1;
|
||||
}
|
||||
;
|
||||
|
||||
count:
|
||||
{
|
||||
$$ = ++nparen;
|
||||
}
|
||||
;
|
||||
|
||||
single:
|
||||
'(' count alt ')'
|
||||
{
|
||||
$$ = reg(Paren, $3, nil);
|
||||
$$->n = $2;
|
||||
}
|
||||
| '(' '?' ':' alt ')'
|
||||
{
|
||||
$$ = $4;
|
||||
}
|
||||
| CHAR
|
||||
{
|
||||
$$ = reg(Lit, nil, nil);
|
||||
$$->ch = $1;
|
||||
}
|
||||
| '.'
|
||||
{
|
||||
$$ = reg(Dot, nil, nil);
|
||||
}
|
||||
;
|
||||
|
||||
%%
|
||||
|
||||
static char *input;
|
||||
static Regexp *parsed_regexp;
|
||||
static int nparen;
|
||||
int gen;
|
||||
|
||||
static int
|
||||
yylex(void)
|
||||
{
|
||||
int c;
|
||||
|
||||
if(input == NULL || *input == 0)
|
||||
return EOL;
|
||||
c = *input++;
|
||||
if(strchr("|*+?():.", c))
|
||||
return c;
|
||||
yylval.c = c;
|
||||
return CHAR;
|
||||
}
|
||||
|
||||
// Parser error hook: forwards the message text to re1_5_fatal().
static void
yyerror(char *s)
{
    re1_5_fatal(s);
}
|
||||
|
||||
|
||||
Regexp*
|
||||
parse(char *s)
|
||||
{
|
||||
Regexp *r, *dotstar;
|
||||
|
||||
input = s;
|
||||
parsed_regexp = nil;
|
||||
nparen = 0;
|
||||
if(yyparse() != 1)
|
||||
yyerror("did not parse");
|
||||
if(parsed_regexp == nil)
|
||||
yyerror("parser nil");
|
||||
|
||||
r = reg(Paren, parsed_regexp, nil); // $0 parens
|
||||
return r;
|
||||
dotstar = reg(Star, reg(Dot, nil, nil), nil);
|
||||
dotstar->n = 1; // non-greedy
|
||||
return reg(Cat, dotstar, r);
|
||||
}
|
||||
|
||||
// Allocate a Regexp node of the given type with the given children.
// The node comes from mal(), which zero-fills it, so n and ch start at 0.
Regexp*
reg(int type, Regexp *left, Regexp *right)
{
    Regexp *node = mal(sizeof *node);

    node->right = right;
    node->left = left;
    node->type = type;
    return node;
}
|
||||
|
||||
void
|
||||
printre(Regexp *r)
|
||||
{
|
||||
switch(r->type) {
|
||||
default:
|
||||
printf("???");
|
||||
break;
|
||||
|
||||
case Alt:
|
||||
printf("Alt(");
|
||||
printre(r->left);
|
||||
printf(", ");
|
||||
printre(r->right);
|
||||
printf(")");
|
||||
break;
|
||||
|
||||
case Cat:
|
||||
printf("Cat(");
|
||||
printre(r->left);
|
||||
printf(", ");
|
||||
printre(r->right);
|
||||
printf(")");
|
||||
break;
|
||||
|
||||
case Lit:
|
||||
printf("Lit(%c)", r->ch);
|
||||
break;
|
||||
|
||||
case Dot:
|
||||
printf("Dot");
|
||||
break;
|
||||
|
||||
case Paren:
|
||||
printf("Paren(%d, ", r->n);
|
||||
printre(r->left);
|
||||
printf(")");
|
||||
break;
|
||||
|
||||
case Star:
|
||||
if(r->n)
|
||||
printf("Ng");
|
||||
printf("Star(");
|
||||
printre(r->left);
|
||||
printf(")");
|
||||
break;
|
||||
|
||||
case Plus:
|
||||
if(r->n)
|
||||
printf("Ng");
|
||||
printf("Plus(");
|
||||
printre(r->left);
|
||||
printf(")");
|
||||
break;
|
||||
|
||||
case Quest:
|
||||
if(r->n)
|
||||
printf("Ng");
|
||||
printf("Quest(");
|
||||
printre(r->left);
|
||||
printf(")");
|
||||
break;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,176 @@
|
|||
// Copyright 2007-2009 Russ Cox. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#include "re1.5.h"
|
||||
|
||||
typedef struct Thread Thread;
|
||||
struct Thread
|
||||
{
|
||||
char *pc;
|
||||
Sub *sub;
|
||||
};
|
||||
|
||||
typedef struct ThreadList ThreadList;
|
||||
struct ThreadList
|
||||
{
|
||||
int n;
|
||||
Thread t[1];
|
||||
};
|
||||
|
||||
static Thread
|
||||
thread(char *pc, Sub *sub)
|
||||
{
|
||||
Thread t = {pc, sub};
|
||||
return t;
|
||||
}
|
||||
|
||||
static ThreadList*
|
||||
threadlist(int n)
|
||||
{
|
||||
return mal(sizeof(ThreadList)+n*sizeof(Thread));
|
||||
}
|
||||
|
||||
static void
|
||||
addthread(ThreadList *l, Thread t, Subject *input, const char *sp)
|
||||
{
|
||||
int off;
|
||||
if(*t.pc & 0x80) {
|
||||
decref(t.sub);
|
||||
return; // already on list
|
||||
}
|
||||
*t.pc |= 0x80;
|
||||
|
||||
switch(*t.pc & 0x7f) {
|
||||
default:
|
||||
l->t[l->n] = t;
|
||||
l->n++;
|
||||
break;
|
||||
case Jmp:
|
||||
off = (signed char)t.pc[1];
|
||||
t.pc += 2;
|
||||
addthread(l, thread(t.pc + off, t.sub), input, sp);
|
||||
break;
|
||||
case Split:
|
||||
off = (signed char)t.pc[1];
|
||||
t.pc += 2;
|
||||
addthread(l, thread(t.pc, incref(t.sub)), input, sp);
|
||||
addthread(l, thread(t.pc + off, t.sub), input, sp);
|
||||
break;
|
||||
case RSplit:
|
||||
off = (signed char)t.pc[1];
|
||||
t.pc += 2;
|
||||
addthread(l, thread(t.pc + off, incref(t.sub)), input, sp);
|
||||
addthread(l, thread(t.pc, t.sub), input, sp);
|
||||
break;
|
||||
case Save:
|
||||
off = (unsigned char)t.pc[1];
|
||||
t.pc += 2;
|
||||
addthread(l, thread(t.pc, update(t.sub, off, sp)), input, sp);
|
||||
break;
|
||||
case Bol:
|
||||
if(sp == input->begin)
|
||||
addthread(l, thread(t.pc + 1, t.sub), input, sp);
|
||||
break;
|
||||
case Eol:
|
||||
if(sp == input->end)
|
||||
addthread(l, thread(t.pc + 1, t.sub), input, sp);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
int
|
||||
re1_5_pikevm(ByteProg *prog, Subject *input, const char **subp, int nsubp, int is_anchored)
|
||||
{
|
||||
int i, len;
|
||||
ThreadList *clist, *nlist, *tmp;
|
||||
char *pc;
|
||||
const char *sp;
|
||||
Sub *sub, *matched;
|
||||
|
||||
matched = nil;
|
||||
for(i=0; i<nsubp; i++)
|
||||
subp[i] = nil;
|
||||
sub = newsub(nsubp);
|
||||
for(i=0; i<nsubp; i++)
|
||||
sub->sub[i] = nil;
|
||||
|
||||
len = prog->len;
|
||||
clist = threadlist(len);
|
||||
nlist = threadlist(len);
|
||||
|
||||
cleanmarks(prog);
|
||||
addthread(clist, thread(HANDLE_ANCHORED(prog->insts, is_anchored), sub), input, input->begin);
|
||||
matched = 0;
|
||||
for(sp=input->begin;; sp++) {
|
||||
if(clist->n == 0)
|
||||
break;
|
||||
// printf("%d(%02x).", (int)(sp - input->begin), *sp & 0xFF);
|
||||
cleanmarks(prog);
|
||||
for(i=0; i<clist->n; i++) {
|
||||
pc = clist->t[i].pc;
|
||||
sub = clist->t[i].sub;
|
||||
// printf(" %d", (int)(pc - prog->insts));
|
||||
if (inst_is_consumer(*pc & 0x7f)) {
|
||||
// If we need to match a character, but there's none left,
|
||||
// it's fail (we don't schedule current thread for continuation)
|
||||
if(sp >= input->end) {
|
||||
decref(sub);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
switch(*pc++ & 0x7f) {
|
||||
case Char:
|
||||
if(*sp != *pc++) {
|
||||
decref(sub);
|
||||
break;
|
||||
}
|
||||
case Any:
|
||||
addthread:
|
||||
addthread(nlist, thread(pc, sub), input, sp+1);
|
||||
break;
|
||||
case Class:
|
||||
case ClassNot:
|
||||
if (!_re1_5_classmatch(pc, sp)) {
|
||||
decref(sub);
|
||||
break;
|
||||
}
|
||||
pc += *(unsigned char*)pc * 2 + 1;
|
||||
goto addthread;
|
||||
case NamedClass:
|
||||
if (!_re1_5_namedclassmatch(pc, sp)) {
|
||||
decref(sub);
|
||||
break;
|
||||
}
|
||||
pc++;
|
||||
goto addthread;
|
||||
case Match:
|
||||
if(matched)
|
||||
decref(matched);
|
||||
matched = sub;
|
||||
for(i++; i < clist->n; i++)
|
||||
decref(clist->t[i].sub);
|
||||
goto BreakFor;
|
||||
// Jmp, Split, Save handled in addthread, so that
|
||||
// machine execution matches what a backtracker would do.
|
||||
// This is discussed (but not shown as code) in
|
||||
// Regular Expression Matching: the Virtual Machine Approach.
|
||||
}
|
||||
}
|
||||
BreakFor:
|
||||
// printf("\n");
|
||||
tmp = clist;
|
||||
clist = nlist;
|
||||
nlist = tmp;
|
||||
nlist->n = 0;
|
||||
//if(*sp == '\0')
|
||||
// break;
|
||||
}
|
||||
if(matched) {
|
||||
for(i=0; i<nsubp; i++)
|
||||
subp[i] = matched->sub[i];
|
||||
decref(matched);
|
||||
return 1;
|
||||
}
|
||||
return 0;
|
||||
}
|
|
@ -0,0 +1,162 @@
|
|||
// Copyright 2007-2009 Russ Cox. All Rights Reserved.
|
||||
// Copyright 2014-2019 Paul Sokolovsky.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#ifndef _RE1_5_REGEXP__H
|
||||
#define _RE1_5_REGEXP__H
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <stdarg.h>
|
||||
#include <assert.h>
|
||||
|
||||
#define nil ((void*)0)
|
||||
#define nelem(x) (sizeof(x)/sizeof((x)[0]))
|
||||
|
||||
typedef struct Regexp Regexp;
|
||||
typedef struct Prog Prog;
|
||||
typedef struct ByteProg ByteProg;
|
||||
typedef struct Inst Inst;
|
||||
typedef struct Subject Subject;
|
||||
|
||||
struct Regexp
|
||||
{
|
||||
int type;
|
||||
int n;
|
||||
int ch;
|
||||
Regexp *left;
|
||||
Regexp *right;
|
||||
};
|
||||
|
||||
enum /* Regexp.type */
|
||||
{
|
||||
Alt = 1,
|
||||
Cat,
|
||||
Lit,
|
||||
Dot,
|
||||
Paren,
|
||||
Quest,
|
||||
Star,
|
||||
Plus,
|
||||
};
|
||||
|
||||
Regexp *parse(char*);
|
||||
Regexp *reg(int type, Regexp *left, Regexp *right);
|
||||
void printre(Regexp*);
|
||||
#ifndef re1_5_fatal
|
||||
void re1_5_fatal(char*);
|
||||
#endif
|
||||
#ifndef re1_5_stack_chk
|
||||
#define re1_5_stack_chk()
|
||||
#endif
|
||||
void *mal(int);
|
||||
|
||||
struct Prog
|
||||
{
|
||||
Inst *start;
|
||||
int len;
|
||||
};
|
||||
|
||||
struct ByteProg
|
||||
{
|
||||
int bytelen;
|
||||
int len;
|
||||
int sub;
|
||||
char insts[0];
|
||||
};
|
||||
|
||||
struct Inst
|
||||
{
|
||||
int opcode;
|
||||
int c;
|
||||
int n;
|
||||
Inst *x;
|
||||
Inst *y;
|
||||
int gen; // global state, oooh!
|
||||
};
|
||||
|
||||
enum	/* Inst.opcode */
{
    // Instructions which consume input bytes (and thus fail if none left)
    CONSUMERS = 1,
    Char = CONSUMERS,
    Any,
    Class,
    ClassNot,
    NamedClass,

    // Zero-width assertions on the subject position
    ASSERTS = 0x50,
    Bol = ASSERTS,
    Eol,

    // Instructions which take relative offset as arg
    JUMPS = 0x60,
    Jmp = JUMPS,
    Split,
    RSplit,

    // Other (special) instructions
    Save = 0x7e,
    Match = 0x7f,
};

// True for opcodes below ASSERTS, i.e. the input-consuming group.
#define inst_is_consumer(inst) ((inst) < ASSERTS)
// True for opcodes whose high nibble equals JUMPS (Jmp, Split, RSplit).
// `==` binds tighter than `&`, so the mask needs its own parentheses;
// without them the test reads inst & (0x70 == JUMPS) and is always 0.
#define inst_is_jump(inst) (((inst) & 0x70) == JUMPS)
|
||||
|
||||
Prog *compile(Regexp*);
|
||||
void printprog(Prog*);
|
||||
|
||||
extern int gen;
|
||||
|
||||
enum {
|
||||
MAXSUB = 20
|
||||
};
|
||||
|
||||
typedef struct Sub Sub;
|
||||
|
||||
struct Sub
|
||||
{
|
||||
int ref;
|
||||
int nsub;
|
||||
const char *sub[MAXSUB];
|
||||
};
|
||||
|
||||
Sub *newsub(int n);
|
||||
Sub *incref(Sub*);
|
||||
Sub *copy(Sub*);
|
||||
Sub *update(Sub*, int, const char*);
|
||||
void decref(Sub*);
|
||||
|
||||
struct Subject {
|
||||
const char *begin;
|
||||
const char *end;
|
||||
};
|
||||
|
||||
|
||||
#define NON_ANCHORED_PREFIX 5
|
||||
#define HANDLE_ANCHORED(bytecode, is_anchored) ((is_anchored) ? (bytecode) + NON_ANCHORED_PREFIX : (bytecode))
|
||||
|
||||
int re1_5_backtrack(ByteProg*, Subject*, const char**, int, int);
|
||||
int re1_5_pikevm(ByteProg*, Subject*, const char**, int, int);
|
||||
int re1_5_recursiveloopprog(ByteProg*, Subject*, const char**, int, int);
|
||||
int re1_5_recursiveprog(ByteProg*, Subject*, const char**, int, int);
|
||||
int re1_5_thompsonvm(ByteProg*, Subject*, const char**, int, int);
|
||||
|
||||
// Return codes for re1_5_sizecode() and re1_5_compilecode()
|
||||
enum {
|
||||
RE1_5_SUCCESS = 0,
|
||||
RE1_5_SYNTAX_ERROR = -2,
|
||||
RE1_5_UNSUPPORTED_ESCAPE = -3,
|
||||
RE1_5_UNSUPPORTED_SYNTAX = -4,
|
||||
};
|
||||
|
||||
int re1_5_sizecode(const char *re);
|
||||
int re1_5_compilecode(ByteProg *prog, const char *re);
|
||||
void re1_5_dumpcode(ByteProg *prog);
|
||||
void cleanmarks(ByteProg *prog);
|
||||
int _re1_5_classmatch(const char *pc, const char *sp);
|
||||
int _re1_5_namedclassmatch(const char *pc, const char *sp);
|
||||
|
||||
#endif /*_RE1_5_REGEXP__H*/
|
|
@ -0,0 +1,79 @@
|
|||
// Copyright 2007-2009 Russ Cox. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#include "re1.5.h"
|
||||
|
||||
static int
|
||||
recursive(char *pc, const char *sp, Subject *input, const char **subp, int nsubp)
|
||||
{
|
||||
const char *old;
|
||||
int off;
|
||||
|
||||
if(inst_is_consumer(*pc)) {
|
||||
// If we need to match a character, but there's none left, it's fail
|
||||
if(sp >= input->end)
|
||||
return 0;
|
||||
}
|
||||
|
||||
re1_5_stack_chk();
|
||||
|
||||
switch(*pc++) {
|
||||
case Char:
|
||||
if(*sp != *pc++)
|
||||
return 0;
|
||||
case Any:
|
||||
return recursive(pc, sp+1, input, subp, nsubp);
|
||||
case Class:
|
||||
case ClassNot:
|
||||
if (!_re1_5_classmatch(pc, sp))
|
||||
return 0;
|
||||
pc += *(unsigned char*)pc * 2 + 1;
|
||||
return recursive(pc, sp+1, input, subp, nsubp);
|
||||
case NamedClass:
|
||||
if (!_re1_5_namedclassmatch(pc, sp))
|
||||
return 0;
|
||||
return recursive(pc+1, sp+1, input, subp, nsubp);
|
||||
case Match:
|
||||
return 1;
|
||||
case Jmp:
|
||||
off = (signed char)*pc++;
|
||||
return recursive(pc + off, sp, input, subp, nsubp);
|
||||
case Split:
|
||||
off = (signed char)*pc++;
|
||||
if(recursive(pc, sp, input, subp, nsubp))
|
||||
return 1;
|
||||
return recursive(pc + off, sp, input, subp, nsubp);
|
||||
case RSplit:
|
||||
off = (signed char)*pc++;
|
||||
if(recursive(pc + off, sp, input, subp, nsubp))
|
||||
return 1;
|
||||
return recursive(pc, sp, input, subp, nsubp);
|
||||
case Save:
|
||||
off = (unsigned char)*pc++;
|
||||
if(off >= nsubp)
|
||||
return recursive(pc, sp, input, subp, nsubp);
|
||||
old = subp[off];
|
||||
subp[off] = sp;
|
||||
if(recursive(pc, sp, input, subp, nsubp))
|
||||
return 1;
|
||||
subp[off] = old;
|
||||
return 0;
|
||||
case Bol:
|
||||
if(sp != input->begin)
|
||||
return 0;
|
||||
return recursive(pc, sp, input, subp, nsubp);
|
||||
case Eol:
|
||||
if(sp != input->end)
|
||||
return 0;
|
||||
return recursive(pc, sp, input, subp, nsubp);
|
||||
}
|
||||
re1_5_fatal("recursive");
|
||||
return -1;
|
||||
}
|
||||
|
||||
int
|
||||
re1_5_recursiveprog(ByteProg *prog, Subject *input, const char **subp, int nsubp, int is_anchored)
|
||||
{
|
||||
return recursive(HANDLE_ANCHORED(prog->insts, is_anchored), input->begin, input, subp, nsubp);
|
||||
}
|
|
@ -0,0 +1,86 @@
|
|||
// Copyright 2007-2009 Russ Cox. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#include "re1.5.h"
|
||||
|
||||
static int
|
||||
recursiveloop(char *pc, const char *sp, Subject *input, const char **subp, int nsubp)
|
||||
{
|
||||
const char *old;
|
||||
int off;
|
||||
|
||||
re1_5_stack_chk();
|
||||
|
||||
for(;;) {
|
||||
if(inst_is_consumer(*pc)) {
|
||||
// If we need to match a character, but there's none left, it's fail
|
||||
if(sp >= input->end)
|
||||
return 0;
|
||||
}
|
||||
switch(*pc++) {
|
||||
case Char:
|
||||
if(*sp != *pc++)
|
||||
return 0;
|
||||
case Any:
|
||||
sp++;
|
||||
continue;
|
||||
case Class:
|
||||
case ClassNot:
|
||||
if (!_re1_5_classmatch(pc, sp))
|
||||
return 0;
|
||||
pc += *(unsigned char*)pc * 2 + 1;
|
||||
sp++;
|
||||
continue;
|
||||
case NamedClass:
|
||||
if (!_re1_5_namedclassmatch(pc, sp))
|
||||
return 0;
|
||||
pc++;
|
||||
sp++;
|
||||
continue;
|
||||
case Match:
|
||||
return 1;
|
||||
case Jmp:
|
||||
off = (signed char)*pc++;
|
||||
pc = pc + off;
|
||||
continue;
|
||||
case Split:
|
||||
off = (signed char)*pc++;
|
||||
if(recursiveloop(pc, sp, input, subp, nsubp))
|
||||
return 1;
|
||||
pc = pc + off;
|
||||
continue;
|
||||
case RSplit:
|
||||
off = (signed char)*pc++;
|
||||
if(recursiveloop(pc + off, sp, input, subp, nsubp))
|
||||
return 1;
|
||||
continue;
|
||||
case Save:
|
||||
off = (unsigned char)*pc++;
|
||||
if(off >= nsubp) {
|
||||
continue;
|
||||
}
|
||||
old = subp[off];
|
||||
subp[off] = sp;
|
||||
if(recursiveloop(pc, sp, input, subp, nsubp))
|
||||
return 1;
|
||||
subp[off] = old;
|
||||
return 0;
|
||||
case Bol:
|
||||
if(sp != input->begin)
|
||||
return 0;
|
||||
continue;
|
||||
case Eol:
|
||||
if(sp != input->end)
|
||||
return 0;
|
||||
continue;
|
||||
}
|
||||
re1_5_fatal("recursiveloop");
|
||||
}
|
||||
}
|
||||
|
||||
int
|
||||
re1_5_recursiveloopprog(ByteProg *prog, Subject *input, const char **subp, int nsubp, int is_anchored)
|
||||
{
|
||||
return recursiveloop(HANDLE_ANCHORED(prog->insts, is_anchored), input->begin, input, subp, nsubp);
|
||||
}
|
|
@ -0,0 +1,164 @@
|
|||
#! /usr/bin/env python3
|
||||
|
||||
RE_EXEC = "./re"
|
||||
|
||||
test_suite = [
|
||||
# basics
|
||||
("search", r"abc", "abcdef"),
|
||||
("search", r"cde", "abcdef"),
|
||||
("search", r"abc*", "abdef"),
|
||||
("search", r"abc*", "abcccdef"),
|
||||
("search", r"abc+", "abdef"),
|
||||
("search", r"abc+", "abcccdef"),
|
||||
|
||||
# match
|
||||
("match", r"abc", "abcdef"),
|
||||
("match", r"abc*", "abdef"),
|
||||
|
||||
# search vs match distinction
|
||||
("match", r"a*", "baa"),
|
||||
("search", r"a*", "baa"),
|
||||
|
||||
# nested group matching
|
||||
("match", r"(([0-9]*)([a-z]*)[0-9]*)", "1234hello567"),
|
||||
("match", r"([0-9]*)(([a-z]*)([0-9]*))", "1234hello567"),
|
||||
|
||||
# non-capturing groups
|
||||
("match", r"(([0-9]*)(?:[a-z]*)[0-9]*)", "1234hello568"),
|
||||
("match", r"(?:[0-9]*)(([a-z]*)(?:[0-9]*))", "1234hello568"),
|
||||
("match", r"([0-9]*)(?:([a-z]*)(?:[0-9]*))", "1234hello568"),
|
||||
("match", r"(?:)", "1234hello568"),
|
||||
("match", r"1?:", "1:"),
|
||||
|
||||
# named character classes
|
||||
("match", r"\d+", "123abc456"),
|
||||
("match", r"\s+", " \t123abc456"),
|
||||
("match", r"\w+", "123abc_456 abc"),
|
||||
("match", r"(\w+)\s+(\w+)", "ABC \t123hello456 abc"),
|
||||
("match", r"(\S+)\s+(\D+)", "ABC \thello abc456 abc"),
|
||||
("match", r"(([0-9]*)([a-z]*)\d*)", "123hello456"),
|
||||
|
||||
# classes
|
||||
("match", r"[a]*", "a"),
|
||||
("search", r"([yab]*)(e*)([cd])", "xyac"),
|
||||
("search", r"([yab]*)(e*)([^y]?)$", "xyac"),
|
||||
("match", r"[-]*", "--"),
|
||||
("match", r"[-a]*", "-a-b"),
|
||||
("match", r"[-ab]*", "-a-b"),
|
||||
("match", r"[-a-c]*", "-a-b-d-"),
|
||||
("match", r"[a-]*", "-a-b"),
|
||||
("match", r"[ab-]*", "-a-b"),
|
||||
("match", r"[a-c-]*", "-a-b-d-"),
|
||||
|
||||
# escaped metacharacters
|
||||
("match", r"(\?:)", ":"),
|
||||
("match", r"\(?:", "(:"),
|
||||
|
||||
# non-greedy
|
||||
("match", r"a(b??)(b*)c", "abbc"),
|
||||
("match", r"a(b+?)(b*)c", "abbbc"),
|
||||
("match", r"a(b*?)(b*)c", "abbbbc"),
|
||||
|
||||
# greedy
|
||||
("match", r"a(b?)(b*)c", "abbc"),
|
||||
("match", r"a(b+)(b*)c", "abbbc"),
|
||||
("match", r"a(b*)(b*)c", "abbbbc"),
|
||||
|
||||
# errors
|
||||
("search", r"?", ""),
|
||||
("search", r"*", ""),
|
||||
("search", r"+", ""),
|
||||
("search", r"[", ""),
|
||||
("search", r"(", ""),
|
||||
("search", r")", ""),
|
||||
("search", "\\", ""),
|
||||
("search", "|+", ""),
|
||||
("search", "|*", ""),
|
||||
("search", "|?", ""),
|
||||
("search", "^*", ""),
|
||||
("search", "$*", ""),
|
||||
("search", "a*+", ""),
|
||||
("search", "a*?", ""),
|
||||
("search", "a**", ""),
|
||||
]
|
||||
|
||||
import re
|
||||
import sre_constants
|
||||
import subprocess
|
||||
from collections import OrderedDict
|
||||
|
||||
def parse_result(string, res):
    """Decode one result line printed by the C driver.

    `res` is a bytes line of the form b"<engine> <outcome>". Returns a pair
    (engine_name, outcome) where outcome is
      * None for b"-no match-",
      * b"REGEX ERROR" passed through unchanged,
      * otherwise a tuple with one entry per printed "(start,end)" group:
        the matching slice of `string`, or None when the driver printed "?"
        for an offset it did not set.
    """
    name, rest = res.split(b" ", 1)
    if rest == b"-no match-":
        return name, None
    if rest == b"REGEX ERROR":
        return name, rest
    assert rest.startswith(b"match ")
    matches = []
    for span in rest[6:].split():
        # Parse "(start,end)" explicitly rather than eval()-ing program
        # output; eval() also choked on the "(?,?)" form.
        start, end = span.strip(b"()").split(b",")
        if start == b"?" or end == b"?":
            matches.append(None)
        else:
            matches.append(string[int(start):int(end)])
    return name, tuple(matches)
|
||||
|
||||
def fit_str(string, width):
    """Return `string` unchanged when it fits into `width` characters,
    otherwise its first `width - 2` characters followed by ".."."""
    return string if len(string) <= width else string[:width - 2] + ".."
|
||||
|
||||
def main():
    """Run every entry of test_suite through Python's `re` and through the
    RE_EXEC binary, print a pass/FAIL flag per engine for each test and a
    per-engine summary at the end."""
    engine_stats = OrderedDict()
    for kind, regex, string in test_suite:
        # run Python re to get correct result
        try:
            if kind == "match":
                py_res = re.match(regex, string)
            else:
                py_res = re.search(regex, string)
            if py_res is not None:
                py_res = (py_res.group(0),) + py_res.groups()
        except re.error:
            # re.error is the public name of the exception class that the
            # internal sre_constants module exposes as `error`
            py_res = b"REGEX ERROR"

        # run our code
        try:
            args = (["-m"] if kind == "match" else []) + [regex, string]
            output = subprocess.check_output([RE_EXEC]+args, stderr=subprocess.STDOUT)
            engine_lines = output.split(b'\n')[1:-1] # split lines, remove first and last
        except subprocess.CalledProcessError as e:
            if e.returncode == 2 and e.output == b"fatal error: Error in regexp\n":
                engine_lines = [b"recursive REGEX ERROR", b"recursiveloop REGEX ERROR", b"backtrack REGEX ERROR", b"thompson REGEX ERROR", b"pike REGEX ERROR"]
            else:
                raise

        # check result of each engine
        for engine in engine_lines:
            engine_name, re_res = parse_result(string, engine)
            try:
                stats = engine_stats[engine_name]
            except KeyError:
                engine_stats[engine_name] = stats = [0, 0]

            # Thompson algo offers just boolean match/no match status
            py_res_cur = py_res
            re_res_cur = re_res
            if engine_name == b"thompson":
                if py_res is not None:
                    py_res_cur = True
                if re_res is not None:
                    re_res_cur = True

            if py_res_cur == re_res_cur:
                print("pass ", end="")
                stats[0] += 1
            else:
                print("FAIL ", end="")
                stats[1] += 1

        print("%s %-25s %-20s" % (kind[0], fit_str(regex, 25), fit_str(string, 20)))

    print("Ran %d tests, results:" % len(test_suite))
    for name, stats in engine_stats.items():
        print("%15s %2d pass %2d fail" % (str(name, encoding='utf8'), stats[0], stats[1]))
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
|
@ -0,0 +1,55 @@
|
|||
// Copyright 2007-2009 Russ Cox. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#include "re1.5.h"
|
||||
|
||||
Sub *freesub;
|
||||
|
||||
Sub*
|
||||
newsub(int n)
|
||||
{
|
||||
Sub *s;
|
||||
|
||||
s = freesub;
|
||||
if(s != nil)
|
||||
freesub = (Sub*)s->sub[0];
|
||||
else
|
||||
s = mal(sizeof *s);
|
||||
s->nsub = n;
|
||||
s->ref = 1;
|
||||
return s;
|
||||
}
|
||||
|
||||
// Take one more reference on s; returns s so the call can be used inline.
Sub*
incref(Sub *s)
{
    s->ref += 1;
    return s;
}
|
||||
|
||||
// Return a Sub in which slot i holds p, consuming the caller's reference on s.
// When s is shared (ref > 1) the slots are copied into a private Sub first
// and the caller's reference on the shared one is dropped; otherwise s is
// modified in place.
// A slot index outside [0, nsub) or beyond the MAXSUB capacity is ignored and
// s is returned untouched: the Save operand comes from the bytecode and may
// exceed the array, and the recursive engines likewise skip slots >= nsubp.
Sub*
update(Sub *s, int i, const char *p)
{
    Sub *s1;
    int j;

    if(i < 0 || i >= s->nsub || i >= MAXSUB)
        return s;

    if(s->ref > 1) {
        s1 = newsub(s->nsub);
        for(j=0; j<s->nsub && j<MAXSUB; j++)
            s1->sub[j] = s->sub[j];
        s->ref--;
        s = s1;
    }
    s->sub[i] = p;
    return s;
}
|
||||
|
||||
void
|
||||
decref(Sub *s)
|
||||
{
|
||||
if(--s->ref == 0) {
|
||||
s->sub[0] = (char*)freesub;
|
||||
freesub = s;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,152 @@
|
|||
// Copyright 2007-2009 Russ Cox. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#include "re1.5.h"
|
||||
|
||||
typedef struct Thread Thread;
|
||||
struct Thread
|
||||
{
|
||||
char *pc;
|
||||
};
|
||||
|
||||
typedef struct ThreadList ThreadList;
|
||||
struct ThreadList
|
||||
{
|
||||
int n;
|
||||
Thread t[1];
|
||||
};
|
||||
|
||||
static Thread
|
||||
thread(char *pc)
|
||||
{
|
||||
Thread t = {pc};
|
||||
return t;
|
||||
}
|
||||
|
||||
static ThreadList*
|
||||
threadlist(int n)
|
||||
{
|
||||
return mal(sizeof(ThreadList)+n*sizeof(Thread));
|
||||
}
|
||||
|
||||
static void
|
||||
addthread(ThreadList *l, Thread t, Subject *input, const char *sp)
|
||||
{
|
||||
int off;
|
||||
if(*t.pc & 0x80)
|
||||
return; // already on list
|
||||
|
||||
*t.pc |= 0x80;
|
||||
l->t[l->n] = t;
|
||||
l->n++;
|
||||
|
||||
switch(*t.pc & 0x7f) {
|
||||
case Jmp:
|
||||
off = (signed char)t.pc[1];
|
||||
t.pc += 2;
|
||||
addthread(l, thread(t.pc + off), input, sp);
|
||||
break;
|
||||
case Split:
|
||||
off = (signed char)t.pc[1];
|
||||
t.pc += 2;
|
||||
addthread(l, thread(t.pc), input, sp);
|
||||
addthread(l, thread(t.pc + off), input, sp);
|
||||
break;
|
||||
case RSplit:
|
||||
off = (signed char)t.pc[1];
|
||||
t.pc += 2;
|
||||
addthread(l, thread(t.pc + off), input, sp);
|
||||
addthread(l, thread(t.pc), input, sp);
|
||||
break;
|
||||
case Save:
|
||||
off = (unsigned char)t.pc[1];
|
||||
t.pc += 2;
|
||||
addthread(l, thread(t.pc), input, sp);
|
||||
break;
|
||||
case Bol:
|
||||
if(sp == input->begin)
|
||||
addthread(l, thread(t.pc + 1), input, sp);
|
||||
break;
|
||||
case Eol:
|
||||
if(sp == input->end - 1)
|
||||
addthread(l, thread(t.pc + 1), input, sp);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
int
|
||||
re1_5_thompsonvm(ByteProg *prog, Subject *input, const char **subp, int nsubp, int is_anchored)
|
||||
{
|
||||
int i, len, matched;
|
||||
ThreadList *clist, *nlist, *tmp;
|
||||
char *pc;
|
||||
const char *sp;
|
||||
|
||||
for(i=0; i<nsubp; i++)
|
||||
subp[i] = nil;
|
||||
|
||||
len = prog->len;
|
||||
clist = threadlist(len);
|
||||
nlist = threadlist(len);
|
||||
|
||||
if(nsubp >= 1)
|
||||
subp[0] = input->begin;
|
||||
cleanmarks(prog);
|
||||
addthread(clist, thread(HANDLE_ANCHORED(prog->insts, is_anchored)), input, input->begin);
|
||||
matched = 0;
|
||||
for(sp=input->begin;; sp++) {
|
||||
if(clist->n == 0)
|
||||
break;
|
||||
// printf("%d(%02x).", (int)(sp - input->begin), *sp & 0xFF);
|
||||
cleanmarks(prog);
|
||||
for(i=0; i<clist->n; i++) {
|
||||
pc = clist->t[i].pc;
|
||||
// printf(" %d", (int)(pc - prog->insts));
|
||||
if (inst_is_consumer(*pc & 0x7f)) {
|
||||
// If we need to match a character, but there's none left,
|
||||
// it's fail (we don't schedule current thread for continuation)
|
||||
if(sp >= input->end)
|
||||
continue;
|
||||
}
|
||||
switch(*pc++ & 0x7f) {
|
||||
case Char:
|
||||
if(*sp != *pc++)
|
||||
break;
|
||||
case Any:
|
||||
addthread:
|
||||
addthread(nlist, thread(pc), input, sp);
|
||||
break;
|
||||
case Class:
|
||||
case ClassNot:
|
||||
if (!_re1_5_classmatch(pc, sp))
|
||||
break;
|
||||
pc += *(unsigned char*)pc * 2 + 1;
|
||||
goto addthread;
|
||||
case NamedClass:
|
||||
if (!_re1_5_namedclassmatch(pc, sp))
|
||||
break;
|
||||
pc++;
|
||||
goto addthread;
|
||||
case Match:
|
||||
if(nsubp >= 2)
|
||||
subp[1] = sp;
|
||||
matched = 1;
|
||||
goto BreakFor;
|
||||
// Jmp, Split, Save handled in addthread, so that
|
||||
// machine execution matches what a backtracker would do.
|
||||
// This is discussed (but not shown as code) in
|
||||
// Regular Expression Matching: the Virtual Machine Approach.
|
||||
}
|
||||
}
|
||||
BreakFor:
|
||||
// printf("\n");
|
||||
tmp = clist;
|
||||
clist = nlist;
|
||||
nlist = tmp;
|
||||
nlist->n = 0;
|
||||
//if(sp >= input->end)
|
||||
// break;
|
||||
}
|
||||
return matched;
|
||||
}
|
|
@ -0,0 +1,24 @@
|
|||
// Copyright 2007-2009 Russ Cox. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#include "re1.5.h"
|
||||
|
||||
// Report msg on stderr, prefixed with "fatal error: ", and terminate the
// process with exit status 2.
void
re1_5_fatal(char *msg)
{
    fprintf(stderr, "fatal error: %s\n", msg);
    exit(2);
}
|
||||
|
||||
void*
|
||||
mal(int n)
|
||||
{
|
||||
void *v;
|
||||
|
||||
v = malloc(n);
|
||||
if(v == nil)
|
||||
re1_5_fatal("out of memory");
|
||||
memset(v, 0, n);
|
||||
return v;
|
||||
}
|
|
@ -23,6 +23,8 @@
|
|||
#include <berry.h>
|
||||
#include <LList.h>
|
||||
|
||||
#include "re1.5.h"
|
||||
|
||||
#define BERRY_CONSOLE_CMD_DELIMITER "\x01"
|
||||
|
||||
typedef LList_elt<char[0]> log_elt; // store the string after the header to avoid double allocation if we had used char*
|
||||
|
|
Loading…
Reference in New Issue