Module Name: othersrc
Committed By: agc
Date: Tue Oct 5 01:23:39 UTC 2021
Modified Files:
othersrc/external/bsd/agcre/dist: Makefile.bsd Makefile.in
Makefile.lib.in Makefile.libtool.in agcre.h libagcre.3 main.c
othersrc/external/bsd/agcre/dist/tests: 54.expected 62.expected
othersrc/external/bsd/agcre/lib: Makefile shlib_version
Added Files:
othersrc/external/bsd/agcre/dist: agcre.c
Removed Files:
othersrc/external/bsd/agcre/dist: comp.c error.c exec.c free.c
internal.h lex.c lex.h new.c set.c set.h unicode.c unicode.h
Log Message:
Update agcre to version 20211001
20211001
========
+ move to a single source file, which can just be included as a single
file
(the library method can still be used for this as well - sometimes,
it's just easier to
drop in a single file to other projects)
+ add AGCRE_EMBED_CTAGS as a primary regular expresion type (same as
basic regexps,
but where a '*' occurs as a non-column 0 element of the pattern, it is
assumed
to be a standard character, as produced by ctags(1) output; basic
regular expressions
only accept '*' as a standard character as the first element of the
expression).
Yay.
+ make the version string in the program a real string, so it can be
found using strings(1)
+ add an agcre_rev_regexec(3) function, which searches the text in an
efficient, reverse
manner, a la strrchr(3) or memrchr(3) fashion.
+ add agcre_regnsub(3) and agcre_regasub(3) functions
+ bump the shared lib version to 1.0
+ cut version 20211001
To generate a diff of this commit:
cvs rdiff -u -r1.1 -r1.2 othersrc/external/bsd/agcre/dist/Makefile.bsd \
othersrc/external/bsd/agcre/dist/Makefile.in \
othersrc/external/bsd/agcre/dist/Makefile.lib.in \
othersrc/external/bsd/agcre/dist/Makefile.libtool.in \
othersrc/external/bsd/agcre/dist/agcre.h \
othersrc/external/bsd/agcre/dist/main.c
cvs rdiff -u -r0 -r1.1 othersrc/external/bsd/agcre/dist/agcre.c
cvs rdiff -u -r1.2 -r0 othersrc/external/bsd/agcre/dist/comp.c \
othersrc/external/bsd/agcre/dist/error.c
cvs rdiff -u -r1.3 -r0 othersrc/external/bsd/agcre/dist/exec.c
cvs rdiff -u -r1.1 -r0 othersrc/external/bsd/agcre/dist/free.c \
othersrc/external/bsd/agcre/dist/internal.h \
othersrc/external/bsd/agcre/dist/lex.c \
othersrc/external/bsd/agcre/dist/lex.h \
othersrc/external/bsd/agcre/dist/new.c \
othersrc/external/bsd/agcre/dist/set.c \
othersrc/external/bsd/agcre/dist/set.h \
othersrc/external/bsd/agcre/dist/unicode.c \
othersrc/external/bsd/agcre/dist/unicode.h
cvs rdiff -u -r1.2 -r1.3 othersrc/external/bsd/agcre/dist/libagcre.3
cvs rdiff -u -r1.2 -r1.3 othersrc/external/bsd/agcre/dist/tests/54.expected
cvs rdiff -u -r1.1 -r1.2 othersrc/external/bsd/agcre/dist/tests/62.expected
cvs rdiff -u -r1.1 -r1.2 othersrc/external/bsd/agcre/lib/Makefile \
othersrc/external/bsd/agcre/lib/shlib_version
Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.
Modified files:
Index: othersrc/external/bsd/agcre/dist/Makefile.bsd
diff -u othersrc/external/bsd/agcre/dist/Makefile.bsd:1.1 othersrc/external/bsd/agcre/dist/Makefile.bsd:1.2
--- othersrc/external/bsd/agcre/dist/Makefile.bsd:1.1 Wed Aug 16 23:38:13 2017
+++ othersrc/external/bsd/agcre/dist/Makefile.bsd Tue Oct 5 01:23:39 2021
@@ -1,11 +1,5 @@
PROG= agcre
-SRCS+= comp.c
-SRCS+= error.c
-SRCS+= exec.c
-SRCS+= free.c
-SRCS+= lex.c
-SRCS+= set.c
-SRCS+= unicode.c
+SRCS+= agcre.c
SRCS+= main.c
MAN= libagcre.3
WARNS= 5
Index: othersrc/external/bsd/agcre/dist/Makefile.in
diff -u othersrc/external/bsd/agcre/dist/Makefile.in:1.1 othersrc/external/bsd/agcre/dist/Makefile.in:1.2
--- othersrc/external/bsd/agcre/dist/Makefile.in:1.1 Wed Aug 16 23:38:13 2017
+++ othersrc/external/bsd/agcre/dist/Makefile.in Tue Oct 5 01:23:39 2021
@@ -1,8 +1,8 @@
-# $NetBSD: Makefile.in,v 1.1 2017/08/16 23:38:13 agc Exp $
+# $NetBSD: Makefile.in,v 1.2 2021/10/05 01:23:39 agc Exp $
PROG= agcre
-OBJS= comp.o error.o exec.o free.o lex.o new.o set.o unicode.o main.o
+OBJS= agcre.o main.o
PREFIX=@PREFIX@
MANDIR=@MANDIR@
Index: othersrc/external/bsd/agcre/dist/Makefile.lib.in
diff -u othersrc/external/bsd/agcre/dist/Makefile.lib.in:1.1 othersrc/external/bsd/agcre/dist/Makefile.lib.in:1.2
--- othersrc/external/bsd/agcre/dist/Makefile.lib.in:1.1 Wed Aug 16 23:38:13 2017
+++ othersrc/external/bsd/agcre/dist/Makefile.lib.in Tue Oct 5 01:23:39 2021
@@ -1,8 +1,8 @@
-# $NetBSD: Makefile.lib.in,v 1.1 2017/08/16 23:38:13 agc Exp $
+# $NetBSD: Makefile.lib.in,v 1.2 2021/10/05 01:23:39 agc Exp $
LIB= libagcre.a
- OBJS= comp.o error.o exec.o free.o lex.o new.o set.o unicode.o
+OBJS= agcre.o
PREFIX=@PREFIX@
MANDIR=@MANDIR@
Index: othersrc/external/bsd/agcre/dist/Makefile.libtool.in
diff -u othersrc/external/bsd/agcre/dist/Makefile.libtool.in:1.1 othersrc/external/bsd/agcre/dist/Makefile.libtool.in:1.2
--- othersrc/external/bsd/agcre/dist/Makefile.libtool.in:1.1 Wed Aug 16 23:38:13 2017
+++ othersrc/external/bsd/agcre/dist/Makefile.libtool.in Tue Oct 5 01:23:39 2021
@@ -1,8 +1,8 @@
-# $NetBSD: Makefile.libtool.in,v 1.1 2017/08/16 23:38:13 agc Exp $
+# $NetBSD: Makefile.libtool.in,v 1.2 2021/10/05 01:23:39 agc Exp $
LIB= libagcre.a
-OBJS= comp.o error.o exec.o free.o lex.o new.o set.o unicode.o
+OBJS= agcre.o
PREFIX=@PREFIX@
MANDIR=@MANDIR@
Index: othersrc/external/bsd/agcre/dist/agcre.h
diff -u othersrc/external/bsd/agcre/dist/agcre.h:1.1 othersrc/external/bsd/agcre/dist/agcre.h:1.2
--- othersrc/external/bsd/agcre/dist/agcre.h:1.1 Wed Aug 16 23:38:13 2017
+++ othersrc/external/bsd/agcre/dist/agcre.h Tue Oct 5 01:23:39 2021
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2013,2017 Alistair Crooks. All Rights reserved.
+ * Copyright (c) 2013,2017,2020,2021 Alistair Crooks. All Rights reserved.
* Copyright (c) 2007-2009 Russ Cox, Google Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -29,7 +29,7 @@
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef AGCRE_VERSION
-#define AGCRE_VERSION 20170801
+#define AGCRE_VERSION 20211001
#include <inttypes.h>
@@ -52,12 +52,15 @@
#define REG_UTF32BE AGCRE_REG_UTF32BE
#define REG_SUCCESS AGCRE_REG_SUCCESS
#define REG_FAILURE AGCRE_REG_FAILURE
+#define REG_NOMATCH AGCRE_REG_FAILURE
#define REG_MAGIC AGCRE_MAGIC
#define REG_MAX_SUBEXPR AGCRE_MAX_SUBEXPR
#define REG_MAX_EXPR_LENGTH AGCRE_MAX_EXPR_LENGTH
+#define REG_ANCHOR AGCRE_REG_ANCHOR
+#define REG_CTAGS AGCRE_REG_CTAGS
#define regoff_t agcre_regoff_t
#define regmatch_t agcre_regmatch_t
-#define regex_t agcre_re_t
+#define regex_t agcre_regex_t
#define regcomp agcre_regcomp
#define regexec agcre_regexec
#define regfree agcre_regfree
@@ -80,6 +83,8 @@
#define AGCRE_REG_UTF16BE 0x00002000
#define AGCRE_REG_UTF32LE 0x00004000
#define AGCRE_REG_UTF32BE 0x00008000
+#define AGCRE_REG_ANCHOR 0x00010000
+#define AGCRE_REG_CTAGS 0x00020000
#define AGCRE_REG_SUCCESS 0
#define AGCRE_REG_FAILURE 1
@@ -99,12 +104,12 @@ typedef struct agcre_regmatch_t {
} agcre_regmatch_t;
/* a regexp structure */
-typedef struct agcre_re_t {
+typedef struct agcre_regex_t {
uint32_t re_magic; /* magic number */
size_t re_nsub; /* number of subexpressions */
const char *re_endp; /* pointer to end of expr */
void *re_g; /* internals */
-} agcre_re_t;
+} agcre_regex_t;
#ifndef __BEGIN_DECLS
# if defined(__cplusplus)
@@ -118,13 +123,19 @@ typedef struct agcre_re_t {
__BEGIN_DECLS
-agcre_re_t *agcre_new(void);
-int agcre_regcomp(agcre_re_t * /*re*/, const void * /*s*/, uint32_t /*flags*/);
-int agcre_regexec(agcre_re_t * /*re*/, const void * /*vs*/, size_t /*mc*/,
- agcre_regmatch_t * /*m*/, uint32_t /*flags*/);
-void agcre_regfree(agcre_re_t * /*re*/);
-size_t agcre_regerror(int /*errcode*/, const agcre_re_t * /*agcre*/,
- char * /*errbuf*/, size_t /*size*/);
+agcre_regex_t *agcre_new(void);
+int agcre_regcomp(agcre_regex_t */*re*/, const void */*s*/, uint32_t /*flags*/);
+int agcre_regexec(agcre_regex_t */*re*/, const void */*vs*/, size_t /*mc*/,
+ agcre_regmatch_t */*m*/, uint32_t /*flags*/);
+int agcre_rev_regexec(agcre_regex_t */*agcre*/, const void */*vs*/,
+ size_t /*matchc*/, agcre_regmatch_t */*m*/, uint32_t /*flags*/);
+void agcre_regfree(agcre_regex_t */*re*/);
+size_t agcre_regerror(int /*errcode*/, const agcre_regex_t */*agcre*/,
+ char */*errbuf*/, size_t /*size*/);
+ssize_t agcre_regnsub(char */*buf*/, size_t /*size*/, const char */*repl*/,
+ const agcre_regmatch_t */*rm*/, const char */*str*/);
+ssize_t agcre_regasub(char **/*buf*/, const char */*repl*/,
+ const agcre_regmatch_t */*rm*/, const char */*str*/);
__END_DECLS
Index: othersrc/external/bsd/agcre/dist/main.c
diff -u othersrc/external/bsd/agcre/dist/main.c:1.1 othersrc/external/bsd/agcre/dist/main.c:1.2
--- othersrc/external/bsd/agcre/dist/main.c:1.1 Wed Aug 16 23:38:13 2017
+++ othersrc/external/bsd/agcre/dist/main.c Tue Oct 5 01:23:39 2021
@@ -67,6 +67,10 @@ typedef struct file_t {
uint8_t binary; /* file is a "binary" */
} file_t;
+#define STRINGIFY(x) #x
+
+#define AGCRE_VERSION_STRING(x) STRINGIFY(x)
+
static inline int
isbinary(char *buf, size_t size)
{
@@ -133,7 +137,7 @@ static char to[2][10];
/* display the match */
static int
-display_match(agcre_re_t *re, agcre_regmatch_t *m, rtflags_t *rtflags, char *s,
+display_match(agcre_regex_t *re, agcre_regmatch_t *m, rtflags_t *rtflags, char *s,
agcre_regoff_t start, agcre_regoff_t len)
{
size_t i;
@@ -160,7 +164,7 @@ display_match(agcre_re_t *re, agcre_regm
/* search a line in a file */
static int
-doline(agcre_re_t *re, rtflags_t *rtflags, file_t *file, char *line, size_t len, agcre_regmatch_t *m)
+doline(agcre_regex_t *re, rtflags_t *rtflags, file_t *file, char *line, size_t len, agcre_regmatch_t *m)
{
agcre_regoff_t start;
int flags;
@@ -203,7 +207,7 @@ doline(agcre_re_t *re, rtflags_t *rtflag
/* search in a file stream */
static int
-dofp(agcre_re_t *re, FILE *fp, rtflags_t *rtflags, const char *f)
+dofp(agcre_regex_t *re, FILE *fp, rtflags_t *rtflags, const char *f)
{
agcre_regmatch_t *m;
rtflags_t localflags;
@@ -258,7 +262,7 @@ dofp(agcre_re_t *re, FILE *fp, rtflags_t
/* search in a file/dir */
static int
-dofile(agcre_re_t *re, const char *f, rtflags_t *rtflags)
+dofile(agcre_regex_t *re, const char *f, rtflags_t *rtflags)
{
struct dirent entry;
struct dirent *d;
@@ -333,7 +337,7 @@ readexpression(const char *f)
int
main(int argc, char **argv)
{
- agcre_re_t re;
+ agcre_regex_t re;
rtflags_t rtflags;
uint32_t pend;
char *expression;
@@ -351,7 +355,7 @@ main(int argc, char **argv)
file = expression = NULL;
c = 0;
pend = 0;
- while ((i = getopt(argc, argv, "DOP:S:T:Ve:f:ilm:nrtv")) != -1) {
+ while ((i = getopt(argc, argv, "DOP:S:T:Vae:f:ilm:nrtv")) != -1) {
switch(i) {
case 'D':
flags |= AGCRE_REG_DUMP;
@@ -380,8 +384,11 @@ main(int argc, char **argv)
}
break;
case 'V':
- printf("agcre %d\n", AGCRE_VERSION);
+ printf("agcre " AGCRE_VERSION_STRING(AGCRE_VERSION) "\n");
exit(EXIT_SUCCESS);
+ case 'a':
+ flags |= AGCRE_REG_ANCHOR;
+ break;
case 'e':
expression = optarg;
break;
Index: othersrc/external/bsd/agcre/dist/libagcre.3
diff -u othersrc/external/bsd/agcre/dist/libagcre.3:1.2 othersrc/external/bsd/agcre/dist/libagcre.3:1.3
--- othersrc/external/bsd/agcre/dist/libagcre.3:1.2 Thu Aug 17 11:03:21 2017
+++ othersrc/external/bsd/agcre/dist/libagcre.3 Tue Oct 5 01:23:39 2021
@@ -1,6 +1,6 @@
-.\" $NetBSD: libagcre.3,v 1.2 2017/08/17 11:03:21 wiz Exp $
+.\" $NetBSD: libagcre.3,v 1.3 2021/10/05 01:23:39 agc Exp $
.\"
-.\" Copyright (c) 2017 Alistair Crooks <[email protected]>
+.\" Copyright (c) 2017,2020,2021 Alistair Crooks <[email protected]>
.\" All rights reserved.
.\"
.\" Redistribution and use in source and binary forms, with or without
@@ -23,14 +23,14 @@
.\" (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
.\" THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.\"
-.Dd July 4, 2017
-.Dt LIBAGCRE 3
+.Dd October 1, 2021
+.Dt AGCRE-EMBED 3
.Os
.Sh NAME
-.Nm libagcre
+.Nm agcre
.Nd regular expression library
.Sh LIBRARY
-.Lb libagcre
+.Lb agcre
.Sh SYNOPSIS
.In agcre.h
.Ft "agcre_regex_t *"
@@ -39,11 +39,15 @@
.Fc
.Ft "int"
.Fo agcre_regcomp
-.Fa "agcre_regex_t *expression" "const void *pat" "int flags"
+.Fa "agcre_regex_t *expression" "const void *pat" "uint32_t flags"
.Fc
.Ft int
.Fo agcre_regexec
-.Fa "agcre_regex_t *expression" "const void *input" "size_t nmatch" "agcre_regmatch_t *matches" "int flags"
+.Fa "agcre_regex_t *expression" "const void *input" "size_t nmatch" "agcre_regmatch_t *matches" "uint32_t flags"
+.Fc
+.Ft int
+.Fo agcre_rev_regexec
+.Fa "agcre_regex_t *expression" "const void *input" "size_t nmatch" "agcre_regmatch_t *matches" "uint32_t flags"
.Fc
.Ft void
.Fo agcre_regfree
@@ -53,6 +57,14 @@
.Fo agcre_regerror
.Fa "int errnum" "const agcre_regex_t *expression" "char *buf" "size_t size"
.Fc
+.Ft ssize_t
+.Fo agcre_regnsub
+.Fa "char *buf" "size_t size" "const char *repl" "const agcre_regmatch_t *matches" "const char *str"
+.Fc
+.Ft ssize_t
+.Fo agcre_regasub
+.Fa "char **buf" "const char *repl" "const agcre_regmatch_t *matches" "const char *str"
+.Fc
.Sh DESCRIPTION
The
.Nm
@@ -119,10 +131,6 @@ expression, the individual virtual machi
will be printed on standard output.
.It Dv AGCRE_REG_TRACE
Traces execution of the regular expression.
-.It Dv AGCRE_REG_LARGE
-Is deprecated, and only recognised for legacy reasons.
-.It Dv AGCRE_REG_BACKR
-Is deprecated, and only recognised for legacy reasons.
.\".It Dv AGCRE_IN_8BITS
.\"This indicates to the
.\".Fn agcre_regcomp
@@ -186,6 +194,32 @@ and
fields provided in the zero-th element of the
.Fa matchv
argument.
+.It Dv AGCRE_REG_ANCHOR
+Normally regular expressions are not assumed to start at the first character,
+but rather to have a lazy search at their start.
+Occasionally, a search anchored to the start of the
+string is preferred.
+This cannot be simulated by the
+.Dv AGCRE_REG_NOTBOL
+flag.
+Searches where a match needs to be found in the first character
+should therefore use the
+.Dv AGCRE_REG_ANCHOR
+definition.
+Anchoring can be specified at expression compile time,
+and/or at expression execution time.
+.It Dv AGCRE_REG_ANCHOR
+This is used where a regular expression pattern was generated by the
+.Xr ctags 1
+program, since the syntax generated does not conform to a standard
+basic regular expression.
+In particular,
+.Xr ctags 1
+outputs function location patterns which contain an unescaped
+.Dq *
+character, whereas basic regular expressions will only
+recognise an asterisk as a standard character if it appears
+at the start of an expression.
.\".It Dv AGCRE_IN_8BITS
.\"This indicates to the
.\".Fn agcre_regexec
@@ -208,6 +242,20 @@ argument.
.\"integers.
.El
.Pp
+Since normal expression searches take place moving forwards within a byte stream,
+there is not normally any need to have a function which searches a byte stream or buffer
+moving in reverse.
+However, editors frequently must implement this functionality on their own.
+To make reverse searching easier, the
+.Fn agcre_rev_regexec
+function is provided, which will start at the end of the sized
+buffer, using the
+.Dv AGCRE_REG_STARTEND
+definition, and the search will continue character by character
+until either a match is found or the start of the buffer is reached.
+This search will be done in an efficient manner, so that buffer
+contents which have already been searched will not be searched again.
+.Pp
The
.Fn agcre_regcomp
and
Index: othersrc/external/bsd/agcre/dist/tests/54.expected
diff -u othersrc/external/bsd/agcre/dist/tests/54.expected:1.2 othersrc/external/bsd/agcre/dist/tests/54.expected:1.3
--- othersrc/external/bsd/agcre/dist/tests/54.expected:1.2 Sun Dec 3 21:25:56 2017
+++ othersrc/external/bsd/agcre/dist/tests/54.expected Tue Oct 5 01:23:39 2021
@@ -1,4 +1,6 @@
+/usr/include/dev/ic/nvmeio.h:136: for (i = 0; i < __arraycount(identify->lbaf); i++)
+/usr/include/dev/ic/nvmeio.h:192: for (i = 0; i < __arraycount(identify->psd); i++)
/usr/include/rump/rumpuser_port.h:260:#ifndef __arraycount
/usr/include/rump/rumpuser_port.h:261:#define __arraycount(_ar_) (sizeof(_ar_)/sizeof(_ar_[0]))
/usr/include/sys/bitops.h:324: for (__i = 0; __i < __arraycount(__v->_b); __i++) \
-/usr/include/sys/cdefs.h:573:#define __arraycount(__x) (sizeof(__x) / sizeof(__x[0]))
+/usr/include/sys/cdefs.h:636:#define __arraycount(__x) (sizeof(__x) / sizeof(__x[0]))
Index: othersrc/external/bsd/agcre/dist/tests/62.expected
diff -u othersrc/external/bsd/agcre/dist/tests/62.expected:1.1 othersrc/external/bsd/agcre/dist/tests/62.expected:1.2
--- othersrc/external/bsd/agcre/dist/tests/62.expected:1.1 Wed Aug 16 23:38:13 2017
+++ othersrc/external/bsd/agcre/dist/tests/62.expected Tue Oct 5 01:23:39 2021
@@ -1 +1 @@
-agcre 20170801
+agcre 20211001
Index: othersrc/external/bsd/agcre/lib/Makefile
diff -u othersrc/external/bsd/agcre/lib/Makefile:1.1 othersrc/external/bsd/agcre/lib/Makefile:1.2
--- othersrc/external/bsd/agcre/lib/Makefile:1.1 Wed Aug 16 23:38:13 2017
+++ othersrc/external/bsd/agcre/lib/Makefile Tue Oct 5 01:23:39 2021
@@ -1,14 +1,7 @@
-# $NetBSD: Makefile,v 1.1 2017/08/16 23:38:13 agc Exp $
+# $NetBSD: Makefile,v 1.2 2021/10/05 01:23:39 agc Exp $
LIB= agcre
-SRCS+= comp.c
-SRCS+= error.c
-SRCS+= exec.c
-SRCS+= free.c
-SRCS+= lex.c
-SRCS+= new.c
-SRCS+= set.c
-SRCS+= unicode.c
+SRCS+= agcre.c
MAN= libagcre.3 agcre_format.7
MLINKS+= libagcre.3 agcre_regcomp.3
MLINKS+= libagcre.3 agcre_regexec.3
Index: othersrc/external/bsd/agcre/lib/shlib_version
diff -u othersrc/external/bsd/agcre/lib/shlib_version:1.1 othersrc/external/bsd/agcre/lib/shlib_version:1.2
--- othersrc/external/bsd/agcre/lib/shlib_version:1.1 Wed Aug 16 23:38:13 2017
+++ othersrc/external/bsd/agcre/lib/shlib_version Tue Oct 5 01:23:39 2021
@@ -1,2 +1,2 @@
-major=0
+major=1
minor=0
Added files:
Index: othersrc/external/bsd/agcre/dist/agcre.c
diff -u /dev/null othersrc/external/bsd/agcre/dist/agcre.c:1.1
--- /dev/null Tue Oct 5 01:23:39 2021
+++ othersrc/external/bsd/agcre/dist/agcre.c Tue Oct 5 01:23:39 2021
@@ -0,0 +1,3065 @@
+/*-
+ * Copyright (c) 2013,2017 Alistair Crooks. All Rights reserved.
+ * All rights reserved.
+ *
+ * Parts of this are:
+ * Copyright (c) 2007-2009 Russ Cox, Google Inc. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <sys/types.h>
+#include <sys/param.h>
+
+#include <inttypes.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "agcre.h"
+
+/* callback struct */
+typedef struct cb_t {
+ int (*func)(uint32_t); /* function */
+ uint8_t negate; /* take the complement of the result */
+} cb_t;
+
+/* sets are implemented as combinations of ranges, individual cases, and callbacks */
+typedef struct set_t {
+ uint32_t rangec; /* # of ranges */
+ uint32_t maxrange; /* size of range array */
+ uint32_t *ranges; /* range array */
+ uint32_t casec; /* # of cases */
+ uint32_t maxcase; /* size of case array */
+ uint32_t *cases; /* case array */
+ uint32_t callbackc; /* # of callbacks */
+ uint32_t maxcallback; /* size of callback array */
+ cb_t *callbacks; /* callback array */
+ uint8_t complement; /* complement of set */
+} set_t;
+
+#define AT_LEAST 0xffffffff
+#define BOF 0xffffffffU
+
+/* a regexp token */
+typedef struct retoken_t {
+ struct retoken_t *left; /* token on left */
+ struct retoken_t *right; /* token on right */
+ uint64_t so; /* start offset */
+ uint32_t n; /* number */
+ uint32_t nparens; /* () count */
+ uint32_t lo; /* repeat low count */
+ uint32_t hi; /* repeat high count */
+ uint32_t ch; /* current char */
+ uint8_t type; /* type of token */
+ uint8_t lazy; /* use lazy matching */
+ uint8_t noncapture; /* non-capture subexpression */
+ uint8_t icase; /* icase token */
+} retoken_t;
+
+/* token types */
+#define TOK_ALT 0x01
+#define TOK_BACKREF 0x02
+#define TOK_BROKEN 0x03
+#define TOK_CAT 0x04
+#define TOK_COLLECTION 0x05
+#define TOK_DOT 0x06
+#define TOK_EOL 0x07
+#define TOK_FORGET 0x08
+#define TOK_LIT 0x09
+#define TOK_PAREN 0x0a
+#define TOK_PLUS 0x0b
+#define TOK_QUEST 0x0c
+#define TOK_REP 0x0d
+#define TOK_SET TOK_COLLECTION
+#define TOK_STAR 0x0e
+#define TOK_WIDTH 0x0f
+#define TOK_ZEROWIDTH 0x10
+
+/* input structure */
+typedef struct input_t {
+ const char *s; /* string */
+ uint64_t so; /* start offset */
+ uint64_t eo; /* end offset */
+ uint64_t c; /* current offset */
+ int msgc; /* # of chars in message buf */
+ char msg[256]; /* message buffer */
+ retoken_t parse; /* current token when parsing */
+ uint32_t parenc; /* # of parentheses */
+ struct re_t *re; /* pointer to internal re */
+ uint32_t ch; /* current char */
+ uint32_t prevch; /* previous char */
+ uint8_t bytes; /* size of current char */
+ uint8_t prevbytes; /* size of previous char */
+ uint8_t icase; /* persistent icase */
+} input_t;
+
+#define AGCRE_IN_WIDTH \
+ (AGCRE_REG_UTF16LE | AGCRE_REG_UTF16BE | AGCRE_REG_UTF32LE | AGCRE_REG_UTF32BE)
+
+/* virtual machine instruction */
+typedef struct instr_t {
+ struct instr_t *lhs; /* lhs */
+ struct instr_t *rhs; /* rhs */
+ uint32_t n; /* character */
+ uint32_t gen; /* generation number */
+ uint8_t op; /* instruction number */
+ uint8_t icase; /* instruction number */
+} instr_t;
+
+/* instruction opcodes */
+#define OP_ANY 0x01
+#define OP_BACKREF 0x02
+#define OP_CHAR 0x03
+#define OP_JUMP 0x04
+#define OP_MATCH 0x05
+#define OP_SAVE 0x06
+#define OP_SET 0x07
+#define OP_SPLIT 0x08
+#define OP_ZEROWIDTH 0x09
+
+/* a context structure */
+typedef struct context_t {
+ struct context_t *next; /* for freeing nodes */
+ uint32_t ref; /* reference count */
+ uint32_t nsub; /* # matches */
+ uint32_t c; /* chars in backref */
+ agcre_regmatch_t sub[1]; /* match structures */
+} context_t;
+
+/* a thread structure */
+typedef struct re_thread_t {
+ instr_t *pc; /* instruction */
+ context_t *ctx; /* context */
+} re_thread_t;
+
+/* list of threads */
+typedef struct threadlist_t {
+ uint32_t nthreads; /* # of threads */
+ re_thread_t t[1]; /* the threads */
+} threadlist_t;
+
+/* regular expression internals */
+typedef struct re_t {
+ instr_t *prog; /* start of instructions */
+ uint32_t instrc; /* # of instructions */
+ uint32_t gen; /* generation number */
+ uint32_t setc; /* # of sets */
+ uint32_t maxset; /* allocated # of sets */
+ set_t *sets; /* sets */
+ uint32_t flags; /* comp/exec flags */
+ context_t *ctxlist; /* list of contexts */
+ instr_t *pc; /* prog counter */
+ int msgc; /* # of chars in msg buffer */
+ char msg[256]; /* message buffer */
+} re_t;
+
+/*
+ * regular expressions have an implicit .* at the start to find the
+ * first match. for anchored searches (not that many in comparison
+ * to regular regular expressions) we need to jump over those
+ * instructions (in the regexec phase).
+ * Dump of compiled expression instructions for the pattern: "i"
+ * (without quotes)
+ *
+ * Cat(LazyStar(Dot), Paren(0, Lit(i)))
+ * 0. split 3, 1
+ * 1. any
+ * 2. jmp 0
+ * 3. save 0
+ * 4. char i
+ * 5. save 1
+ * 6. match
+ *
+ * so for anchored searches, we jump over the first 3 instructions
+ */
+#define LEADING_DOT_STAR_INSTRS 3
+
+/* all static function forward declarations */
+__BEGIN_DECLS
+
+static void set_free(set_t */*set*/);
+static int set_add_case(set_t */*set*/, uint32_t /*n*/);
+static int set_add_range(set_t */*set*/, uint32_t /*lo*/, uint32_t /*hi*/);
+static int set_add_callback(set_t */*set*/, int (*/*func*/)(uint32_t), int /*negate*/);
+static int set_isset(set_t */*set*/, uint32_t /*n*/);
+static int set_complement(set_t */*set*/);
+
+static retoken_t *parse_extended(agcre_regex_t */*re*/, input_t */*in*/);
+static retoken_t *parse_nospec(agcre_regex_t */*re*/, input_t */*in*/);
+static retoken_t *parse_basic(agcre_regex_t */*re*/, input_t */*in*/, uint32_t /*flags*/);
+
+static int get_next_token(input_t */*in*/);
+
+static int unicode_isWhite_Space(uint32_t /*ch*/);
+static int unicode_isDash(uint32_t /*ch*/);
+static int unicode_isHyphen(uint32_t /*ch*/);
+static int unicode_isQuotation_Mark(uint32_t /*ch*/);
+static int unicode_isTerminal_Punctuation(uint32_t /*ch*/);
+static int unicode_isHex_Digit(uint32_t /*ch*/);
+static int unicode_isOther_Alphabetic(uint32_t /*ch*/);
+static int unicode_isOther_Lowercase(uint32_t /*ch*/);
+static int unicode_isOther_Uppercase(uint32_t /*ch*/);
+static int unicode_isPattern_White_Space(uint32_t /*ch*/);
+static int unicode_isalnum(uint32_t /*ch*/);
+static int unicode_isalpha(uint32_t /*ch*/);
+static int unicode_isblank(uint32_t /*ch*/);
+static int unicode_iscntrl(uint32_t /*ch*/);
+static int unicode_isdigit(uint32_t /*ch*/);
+static int unicode_isgraph(uint32_t /*ch*/);
+static int unicode_islower(uint32_t /*ch*/);
+static int unicode_isprint(uint32_t /*ch*/);
+static int unicode_ispunct(uint32_t /*ch*/);
+static int unicode_isspace(uint32_t /*ch*/);
+static int unicode_isupper(uint32_t /*ch*/);
+static int unicode_isxdigit(uint32_t /*ch*/);
+static int unicode_isident(uint32_t /*ch*/);
+static uint32_t unicode_tolower(uint32_t /*ch*/);
+static uint32_t unicode_toupper(uint32_t /*ch*/);
+
+__END_DECLS
+
+#ifndef __printflike
+#define __printflike(n, m) __attribute__((format(printf,n,m)))
+#endif
+#ifndef howmany
+#define howmany(x, y) (((x)+((y)-1))/(y))
+#endif
+
+/* create a new token */
+static retoken_t *
+newtok(int type, retoken_t *left, retoken_t *right, uint8_t icase)
+{
+ retoken_t *token;
+
+ if ((token = calloc(1, sizeof(*token))) != NULL) {
+ token->type = type;
+ token->left = left;
+ token->right = right;
+ token->icase = icase;
+ }
+ return token;
+}
+
+/* how many instructions does tok need? */
+static int
+count(retoken_t *tok, int *errc)
+{
+ switch(tok->type) {
+ case TOK_ALT:
+ return 2 + count(tok->left, errc) + count(tok->right, errc);
+ case TOK_BACKREF:
+ case TOK_COLLECTION:
+ case TOK_DOT:
+ case TOK_LIT:
+ case TOK_WIDTH:
+ case TOK_ZEROWIDTH:
+ return 1;
+ case TOK_FORGET:
+ return 0;
+ case TOK_CAT:
+ return count(tok->left, errc) + count(tok->right, errc);
+ case TOK_PAREN:
+ case TOK_STAR:
+ return 2 + count(tok->left, errc);
+ case TOK_PLUS:
+ case TOK_QUEST:
+ return 1 + count(tok->left, errc);
+ default:
+ *errc = 1;
+ return 0;
+ }
+}
+
+/* make a new set */
+static inline set_t *
+newset(re_t *re)
+{
+ set_t *set;
+
+ if (re->setc == re->maxset) {
+ set = realloc(re->sets, (re->maxset + 8) * sizeof(*set));
+ if (set == NULL) {
+ //error_message(in, "can't extend sets past %u", in->re->maxset);
+ return NULL;
+ }
+ memset(&set[re->setc], 0x0, 8 * sizeof(*set));
+ re->maxset += 8;
+ re->sets = set;
+ }
+ return &re->sets[re->setc++];
+}
+
+/* take parse tokens, and emit vm instructions */
+static int
+emit(re_t *re, retoken_t *tok)
+{
+ instr_t *p1;
+ instr_t *p2;
+ instr_t *t;
+ set_t *set;
+
+ switch(tok->type) {
+ case TOK_ALT:
+ re->pc->op = OP_SPLIT;
+ p1 = re->pc++;
+ p1->lhs = re->pc;
+ emit(re, tok->left);
+ re->pc->op = OP_JUMP;
+ p2 = re->pc++;
+ p1->rhs = re->pc;
+ emit(re, tok->right);
+ p2->lhs = re->pc;
+ return 1;
+ case TOK_BACKREF:
+ re->pc->op = OP_BACKREF;
+ re->pc->n = tok->ch;
+ re->pc->icase = tok->icase;
+ re->pc++;
+ return 1;
+ case TOK_CAT:
+ emit(re, tok->left);
+ emit(re, tok->right);
+ return 1;
+ case TOK_COLLECTION:
+ re->pc->op = OP_SET;
+ re->pc->n = tok->ch;
+ re->pc->icase = tok->icase;
+ re->pc++;
+ return 1;
+ case TOK_DOT:
+ re->pc++->op = OP_ANY;
+ return 1;
+ case TOK_LIT:
+ re->pc->op = OP_CHAR;
+ re->pc->n = tok->ch;
+ re->pc->icase = tok->icase;
+ re->pc++;
+ return 1;
+ case TOK_PAREN:
+ re->pc->op = OP_SAVE;
+ re->pc->n = 2*tok->nparens;
+ re->pc++;
+ emit(re, tok->left);
+ re->pc->op = OP_SAVE;
+ re->pc->n = 2*tok->nparens + 1;
+ re->pc++;
+ return 1;
+ case TOK_PLUS:
+ p1 = re->pc;
+ emit(re, tok->left);
+ re->pc->op = OP_SPLIT;
+ re->pc->lhs = p1;
+ p2 = re->pc;
+ re->pc++;
+ p2->rhs = re->pc;
+ if (tok->lazy) {
+ t = p2->lhs;
+ p2->lhs = p2->rhs;
+ p2->rhs = t;
+ }
+ return 1;
+ case TOK_QUEST:
+ re->pc->op = OP_SPLIT;
+ p1 = re->pc++;
+ p1->lhs = re->pc;
+ emit(re, tok->left);
+ p1->rhs = re->pc;
+ if (tok->lazy) {
+ t = p1->lhs;
+ p1->lhs = p1->rhs;
+ p1->rhs = t;
+ }
+ return 1;
+ case TOK_STAR:
+ re->pc->op = OP_SPLIT;
+ p1 = re->pc++;
+ p1->lhs = re->pc;
+ emit(re, tok->left);
+ re->pc->op = OP_JUMP;
+ re->pc->lhs = p1;
+ re->pc++;
+ p1->rhs = re->pc;
+ if (tok->lazy) {
+ t = p1->lhs;
+ p1->lhs = p1->rhs;
+ p1->rhs = t;
+ }
+ return 1;
+ case TOK_WIDTH:
+ re->pc->op = OP_SET;
+ re->pc->n = re->setc;
+ re->pc->icase = tok->icase;
+ if ((set = newset(re)) == NULL) {
+ return 0;
+ }
+ set_add_callback(set,
+ (tok->ch == 'd' || tok->ch == 'D') ? unicode_isdigit :
+ (tok->ch == 'p' || tok->ch == 'P') ? unicode_isprint :
+ (tok->ch == 's' || tok->ch == 'S') ? unicode_isspace :
+ unicode_isident,
+ unicode_isupper(tok->ch));
+ re->pc++;
+ return 1;
+ case TOK_ZEROWIDTH:
+ re->pc->op = OP_ZEROWIDTH;
+ re->pc->n = tok->ch;
+ re->pc++;
+ return 1;
+ }
+ return 1;
+}
+
+/* recursive function to free a token and its children */
+static void
+freetok(retoken_t *tok)
+{
+ if (tok) {
+ freetok(tok->left);
+ freetok(tok->right);
+ free(tok);
+ }
+}
+
+/* recursive function to print a token and its children */
+static void
+print_token(retoken_t *tok)
+{
+ if (tok->icase) {
+ printf("Icase+");
+ }
+ switch(tok->type) {
+ case TOK_ALT:
+ printf("Alt(");
+ print_token(tok->left);
+ printf(", ");
+ print_token(tok->right);
+ printf(")");
+ break;
+ case TOK_BACKREF:
+ printf("Backref(%d)", tok->ch);
+ break;
+ case TOK_CAT:
+ printf("Cat(");
+ print_token(tok->left);
+ printf(", ");
+ print_token(tok->right);
+ printf(")");
+ break;
+ case TOK_COLLECTION:
+ printf("Set(%u)", tok->ch);
+ break;
+ case TOK_DOT:
+ printf("Dot");
+ break;
+ case TOK_LIT:
+ printf("Lit(%c)", tok->ch);
+ break;
+ case TOK_PAREN:
+ printf("Paren(%d, ", tok->nparens);
+ print_token(tok->left);
+ printf(")");
+ break;
+ case TOK_PLUS:
+ if (tok->lazy) {
+ printf("Lazy");
+ }
+ printf("Plus(");
+ print_token(tok->left);
+ printf(")");
+ break;
+ case TOK_QUEST:
+ if (tok->lazy) {
+ printf("Lazy");
+ }
+ printf("Quest(");
+ print_token(tok->left);
+ printf(")");
+ break;
+ case TOK_STAR:
+ if (tok->lazy) {
+ printf("Lazy");
+ }
+ printf("Star(");
+ print_token(tok->left);
+ printf(")");
+ break;
+ case TOK_WIDTH:
+ printf("Width(%c)", tok->ch);
+ break;
+ case TOK_ZEROWIDTH:
+ printf("Zerowidth(%c)", tok->ch);
+ break;
+ default:
+ printf("???");
+ break;
+ }
+}
+
+/* print all the instructions in the program */
+static int
+printprog(re_t *re)
+{
+ uint32_t pc;
+ instr_t *instr;
+
+ for (pc = 0, instr = re->prog ; pc < re->instrc ; pc++, instr++) {
+ switch(instr->op) {
+ case OP_ANY:
+ printf("%2d. any\n", pc);
+ break;
+ case OP_BACKREF:
+ printf("%2d. backref %d\n", pc, instr->n);
+ break;
+ case OP_CHAR:
+ printf("%2d. char %c\n", pc, instr->n);
+ break;
+ case OP_JUMP:
+ printf("%2d. jmp %d\n",
+ pc,
+ (int)(instr->lhs - re->prog));
+ break;
+ case OP_MATCH:
+ printf("%2d. match\n", pc);
+ break;
+ case OP_SAVE:
+ printf("%2d. save %d\n", pc, instr->n);
+ break;
+ case OP_SET:
+ printf("%2d. set %d\n", pc, instr->n);
+ break;
+ case OP_SPLIT:
+ printf("%2d. split %d, %d\n",
+ pc,
+ (int)(instr->lhs - re->prog),
+ (int)(instr->rhs - re->prog));
+ break;
+ case OP_ZEROWIDTH:
+ printf("%2d. zerowidth(%c)\n", pc, instr->n);
+ break;
+ default:
+ fprintf(stderr, "printprog - bad type %d\n", instr->op);
+ return 0;
+ }
+ }
+ return 1;
+}
+
+static retoken_t *rec_repeat(input_t *in);
+
+/* add a warning message */
+static __printflike(2,3) void
+error_message(input_t *in, const char *fmt, ...)
+{
+ va_list args;
+
+ if (in->msgc == 0) {
+ /* don't overwrite previous messages */
+ va_start(args, fmt);
+ in->msgc = vsnprintf(in->msg, sizeof(in->msg), fmt, args);
+ va_end(args);
+ }
+}
+
+/* check current character is as expected, then get next token */
+static int
+accept_char(input_t *in, uint32_t expected)
+{
+ int ret;
+
+ ret = 1;
+ if (in->parse.ch != expected) {
+ error_message(in, "expected: '%c', got '%c'",
+ expected, in->s[in->c]);
+ ret = 0;
+ }
+ get_next_token(in);
+ return ret;
+}
+
+/* clone a token */
+static retoken_t *
+clone_token(input_t *in, retoken_t *orig)
+{
+ retoken_t *token;
+
+ if ((token = newtok(orig->type, NULL, NULL, orig->icase)) == NULL) {
+ return NULL;
+ }
+ token->lazy = orig->lazy;
+ token->nparens = orig->nparens;
+ token->ch = orig->ch;
+ if (orig->left) {
+ token->left = clone_token(in, orig->left);
+ }
+ if (orig->right) {
+ token->right = clone_token(in, orig->right);
+ }
+ return token;
+}
+
+/* concat two subexpressions together */
+static retoken_t *
+rec_concat(input_t *in, retoken_t *token)
+{
+ retoken_t *rhs;
+ retoken_t *cat;
+
+ while (in->parse.type != TOK_EOL && in->parse.type != TOK_ALT && in->parse.ch != ')') {
+ if ((rhs = rec_repeat(in)) == NULL ||
+ (cat = newtok(TOK_CAT, token, rhs, 0)) == NULL ||
+ (token = rec_concat(in, cat)) == NULL) {
+ return NULL;
+ }
+ }
+ return token;
+}
+
+/* recognise an alternate */
+static retoken_t *
+rec_alt(input_t *in)
+{
+ retoken_t *repeat;
+ retoken_t *token;
+ retoken_t *cat;
+ retoken_t *alt;
+
+ if ((repeat = rec_repeat(in)) == NULL ||
+ (token = rec_concat(in, repeat)) == NULL) {
+ return NULL;
+ }
+ while (in->parse.type == TOK_ALT) {
+ if (!accept_char(in, '|')) {
+ return NULL;
+ }
+ if ((repeat = rec_repeat(in)) == NULL ||
+ (cat = rec_concat(in, repeat)) == NULL ||
+ (alt = newtok(TOK_ALT, token, cat, 0)) == NULL ||
+ (token = rec_concat(in, alt)) == NULL) {
+ return NULL;
+ }
+ }
+ return token;
+}
+
+/* recognise a single atom */
+static retoken_t *
+rec_single(input_t *in)
+{
+ retoken_t *token;
+ retoken_t *alt;
+ int parenc;
+
+ token = NULL;
+ switch(in->parse.type) {
+ case TOK_PAREN:
+ accept_char(in, '(');
+ if (in->parse.noncapture) {
+ if ((token = rec_alt(in)) == NULL) {
+ return NULL;
+ }
+ } else {
+ parenc = ++in->parenc;
+ if ((alt = rec_alt(in)) == NULL ||
+ (token = newtok(TOK_PAREN, alt, NULL, 0)) == NULL) {
+ return NULL;
+ }
+ token->nparens = parenc;
+ }
+ if (!accept_char(in, ')')) {
+ return NULL;
+ }
+ break;
+ case TOK_BACKREF:
+ if ((token = newtok(TOK_BACKREF, NULL, NULL, in->parse.icase)) == NULL) {
+ return NULL;
+ }
+ if (in->parse.ch > in->parenc) {
+ error_message(in, "backref %d out of range", in->ch);
+ return NULL;
+ }
+ accept_char(in, token->ch = in->parse.ch);
+ break;
+ case TOK_DOT:
+ case TOK_LIT:
+ case TOK_COLLECTION:
+ case TOK_WIDTH:
+ case TOK_ZEROWIDTH:
+ if ((token = newtok(in->parse.type, NULL, NULL, in->parse.icase)) == NULL) {
+ return NULL;
+ }
+ token->ch = in->parse.ch;
+ token->icase = in->parse.icase;
+ get_next_token(in);
+ break;
+ }
+ return token;
+}
+
+/* a repetition of subexpressions */
+static retoken_t *
+rec_repeat(input_t *in)
+{
+ retoken_t *cloned;
+ retoken_t *quest;
+ retoken_t *token;
+ retoken_t *orig;
+ uint32_t i;
+
+ if ((token = rec_single(in)) == NULL) {
+ return NULL;
+ }
+ orig = token;
+ switch(in->parse.type) {
+ case TOK_STAR:
+ case TOK_PLUS:
+ case TOK_QUEST:
+ if ((token = newtok(in->parse.type, token, NULL, in->parse.icase)) == NULL) {
+ return NULL;
+ }
+ token->lazy = in->parse.lazy;
+ get_next_token(in);
+ break;
+ case TOK_REP:
+ if (in->parse.lo == 0 && in->parse.hi == 0) {
+ orig->type = TOK_FORGET;
+ }
+ for (i = 1 ; i < in->parse.lo ; i++) {
+ if ((cloned = clone_token(in, orig)) == NULL ||
+ (token = newtok(TOK_CAT, token, cloned, in->parse.icase)) == NULL) {
+ return NULL;
+ }
+ }
+ if (in->parse.hi == AT_LEAST) {
+ if ((token = newtok(TOK_PLUS, token, NULL, in->parse.icase)) == NULL) {
+ return NULL;
+ }
+ } else {
+ for ( ; i < in->parse.hi ; i++) {
+ if ((cloned = clone_token(in, orig)) == NULL ||
+ (quest = newtok(TOK_QUEST, cloned, NULL, in->parse.icase)) == NULL ||
+ (token = newtok(TOK_CAT, token, quest, in->parse.icase)) == NULL) {
+ return NULL;
+ }
+ }
+ }
+ get_next_token(in);
+ break;
+ }
+ return token;
+}
+
+/* recognise literal tokens in nospec expression */
+static retoken_t *
+nospec_token(input_t *in)
+{
+ retoken_t *tok;
+
+ tok = newtok(TOK_LIT, NULL, NULL, 0);
+ tok->ch = in->s[in->c++];
+ if (in->c == in->eo) {
+ return tok;
+ }
+ return newtok(TOK_CAT, tok, nospec_token(in), 0);
+}
+
+/* either start of line, or start of nested subexpression */
+#define BASIC_IS_START(i, in) \
+ ((i) == (in) || ((size_t)(i - in) >= 2 && (i)[-2] == '\\' && (i)[-1] == '('))
+
+#define BASIC_IS_END(i, in, sz) \
+ (((size_t)((i) - (in)) == (sz) - 1) || ((i)[1] == '\\' && (i)[2] == ')'))
+
+/* convert a basic regexp into an extended one */
+static ssize_t
+regextend(const char *in, size_t insize, char *out, size_t outsize, uint32_t flags)
+{
+ const char *i;
+ char *o;
+
+ for (i = in, o = out ; (size_t)(i - in) < insize && (size_t)(o - out) < outsize ; i++) {
+ switch(*i) {
+ case '+':
+ case '|':
+ case '?':
+ case '{':
+ case '}':
+ case '(':
+ case ')':
+ /* +, |, ?, {, }, (, ) are ordinary characters */
+ *o++ = '\\';
+ *o++ = *i;
+ break;
+ case '\\':
+ switch(i[1]) {
+ case '(':
+ case ')':
+ case '{':
+ case '}':
+ /* basic bounds delimiters are \{ and \} */
+ /* basic nested subexpressions are \( and \) */
+ *o++ = *++i;
+ break;
+ default:
+ /* pass through other backslash-escaped chars */
+ *o++ = *i;
+ break;
+ }
+ break;
+ case '^':
+ case '$':
+ /* ^ is an ordinary char, except at start */
+ /* $ is an ordinary char, except at end */
+ if ((*i == '^' && !BASIC_IS_START(i, in)) ||
+ (*i == '$' && !BASIC_IS_END(i, in, insize))) {
+ *o++ = '\\';
+ }
+ *o++ = *i;
+ break;
+ case '*':
+ if (flags & AGCRE_REG_CTAGS) {
+ /* * is an ordinary character - see ctags */
+ *o++ = '\\';
+ *o++ = *i;
+ } else {
+ /* THIS IS WHAT THE RE_FORMAT PAGE SAYS FOR '*'
+ not for ctags, though */
+ if ((*i == '*' && BASIC_IS_START(i, in))) {
+ *o++ = '\\';
+ }
+ *o++ = *i;
+ }
+ break;
+ default:
+ *o++ = *i;
+ break;
+ }
+ }
+ *o = 0x0;
+ return (ssize_t)(o - out);
+}
+
+/* parse the string */
+static retoken_t *
+parse_string(input_t *in)
+{
+ retoken_t *dotstar;
+ retoken_t *token;
+ retoken_t *cat;
+ retoken_t *alt;
+
+ get_next_token(in);
+ if ((alt = rec_alt(in)) == NULL) {
+ return NULL;
+ }
+ /* make pattern ".*?(PATTERN)" */
+ if ((token = newtok(TOK_PAREN, alt, NULL, 0)) == NULL ||
+ (cat = newtok(TOK_DOT, NULL, NULL, 0)) == NULL ||
+ (dotstar = newtok(TOK_STAR, cat, NULL, 0)) == NULL) {
+ return NULL;
+ }
+ dotstar->lazy = 1;
+ return newtok(TOK_CAT, dotstar, token, 0);
+}
+
+/* parse an extended reg exp */
+static retoken_t *
+parse_extended(agcre_regex_t *agcre, input_t *in)
+{
+ retoken_t *entry;
+ re_t *re;
+
+ re = agcre->re_g;
+ if ((entry = parse_string(in)) == 0) {
+ memcpy(re->msg, in->msg, in->msgc);
+ return 0;
+ }
+ agcre->re_nsub = in->parenc + 1;
+ return entry;
+}
+
+/* parse a nospec regexp */
+static retoken_t *
+parse_nospec(agcre_regex_t *agcre, input_t *in)
+{
+ retoken_t *dotstar;
+ retoken_t *tok;
+
+ tok = nospec_token(in);
+ tok = newtok(TOK_PAREN, tok, NULL, 0);
+ dotstar = newtok(TOK_STAR, newtok(TOK_DOT, NULL, NULL, 0), NULL, 0);
+ dotstar->lazy = 1;
+ tok = newtok(TOK_CAT, dotstar, tok, 0);
+ agcre->re_nsub = in->parenc + 1;
+ return tok;
+}
+
+/* convert basic regexp to extended then parse */
+static retoken_t *
+parse_basic(agcre_regex_t *agcre, input_t *in, uint32_t flags)
+{
+ ssize_t size;
+ char s[AGCRE_MAX_EXPR_LENGTH * 4];
+
+ size = regextend(&in->s[in->so], in->eo - in->so, s, sizeof(s), flags);
+ in->s = s;
+ in->so = 0;
+ in->eo = size;
+ return parse_extended(agcre, in);
+}
+
+#define BOL 0xffffffffU
+
+/* unwrap a character, including UTF-8 chars, into 32bit char */
+static inline int
+unwrap_utf8(input_t *in)
+{
+ uint32_t n;
+ uint32_t j;
+
+ in->ch = in->s[in->c];
+ if (in->ch <= 0x7f) {
+ /* 0XXX XXXX one byte */
+ in->bytes = 1;
+ n = in->ch;
+ } else if ((in->ch & 0xe0) == 0xc0) {
+ /* 110X XXXX 2 bytes */
+ in->bytes = 2;
+ n = (in->ch & 0x1f);
+ } else if ((in->ch & 0xf0) == 0xe0) {
+ /* 1110 XXXX three bytes */
+ in->bytes = 3;
+ n = (in->ch & 0x0f);
+ } else if ((in->ch & 0xf8) == 0xf0) {
+ /* 1111 0XXX four bytes */
+ in->bytes = 4;
+ n = (in->ch & 0x07);
+ } else if ((in->ch & 0xfc) == 0xf8) {
+ /* 1111 10XX five bytes */
+ in->bytes = 5;
+ n = (in->ch & 0x03);
+ } else if ((in->ch & 0xfe) == 0xfc) {
+ /* 1111 110X six bytes */
+ in->bytes = 6;
+ n = (in->ch & 0x01);
+ } else {
+ return 0;
+ }
+ if (in->c + in->bytes - 1 > in->eo) {
+ return 0;
+ }
+ for (j = 1 ; j < in->bytes ; j++) {
+ n = (n << 6) | (in->s[in->c + j] & 0x3f);
+ }
+ in->ch = n;
+ return 1;
+}
+
+/* return 1 if host is bigendian */
+static inline int
+host_is_bigendian(void)
+{
+ int indian = 1;
+
+ return (*(char *)(void *)&indian) ? 0 : 1;
+}
+
+/* swap 32bit integer */
+static inline uint32_t
+swap32(uint32_t x)
+{
+ x = ((x & 0x0000ffff) << 16) | ((x & 0xffff0000) >> 16);
+ x = ((x & 0x00ff00ff) << 8) | ((x & 0xff00ff00) >> 8);
+ return x;
+}
+
+/* swap 16bit integer */
+static inline uint32_t
+swap16(uint16_t x)
+{
+ return ((x & 0x00ff) << 8) | ((x & 0xff00) >> 8);
+}
+
+/* higher level get-next-char function */
+static inline int
+get_next_char(input_t *in, uint32_t flags)
+{
+ uint32_t i32;
+ uint16_t i16;
+
+ in->prevch = in->ch;
+ in->prevbytes = in->bytes;
+ switch(flags & AGCRE_IN_WIDTH) {
+ case AGCRE_REG_UTF16LE:
+ in->bytes = 1;
+ memcpy(&i16, &in->s[in->c * 2], 2);
+ in->ch = (host_is_bigendian()) ? swap16(i16) : i16;
+ return 1;
+ case AGCRE_REG_UTF16BE:
+ in->bytes = 1;
+ memcpy(&i16, &in->s[in->c * 2], 2);
+ in->ch = (!host_is_bigendian()) ? swap16(i16) : i16;
+ return 1;
+ case AGCRE_REG_UTF32LE:
+ in->bytes = 1;
+ memcpy(&i32, &in->s[in->c * 4], 4);
+ in->ch = (host_is_bigendian()) ? swap32(i32) : i32;
+ return 1;
+ case AGCRE_REG_UTF32BE:
+ in->bytes = 1;
+ memcpy(&i32, &in->s[in->c * 4], 4);
+ in->ch = (!host_is_bigendian()) ? swap32(i32) : i32;
+ return 1;
+ default:
+ return unwrap_utf8(in);
+ }
+}
+
+/* return the number of bytes in a context */
+static inline uint32_t
+context_bytes(uint32_t n)
+{
+ return sizeof(context_t) + (n * sizeof(agcre_regmatch_t));
+}
+
+/* create a new context */
+static context_t *
+make_new_context(re_t *re, int n)
+{
+ context_t *ctx;
+
+ if ((ctx = re->ctxlist) != NULL) {
+ re->ctxlist = ctx->next;
+ memset(ctx, 0x0, context_bytes(n));
+ } else {
+ ctx = calloc(1, context_bytes(n));
+ }
+ ctx->nsub = n;
+ ctx->ref = 1;
+ return ctx;
+}
+
+/* increment reference count in the context */
+static inline context_t *
+incref(context_t *ctx)
+{
+ ctx->ref += 1;
+ return ctx;
+}
+
+/* save the current offset in the match context */
+static context_t *
+mark(re_t *re, context_t *ctx, int i, input_t *in, int delta)
+{
+ context_t *newctx;
+ uint32_t j;
+
+ if (ctx->ref > 1) {
+ newctx = make_new_context(re, ctx->nsub);
+ for (j = 0; j < ctx->nsub; j++) {
+ newctx->sub[j] = ctx->sub[j];
+ }
+ ctx->ref -= 1;
+ ctx = newctx;
+ }
+ if (i % 2 == 0) {
+ ctx->sub[i / 2].rm_so = in->c + delta;
+ } else {
+ ctx->sub[i / 2].rm_eo = in->c + delta;
+ }
+ return ctx;
+}
+
+/* decrement reference count in the context */
+static inline void
+decref(re_t *re, context_t *ctx)
+{
+ if (--ctx->ref == 0) {
+ ctx->next = re->ctxlist;
+ re->ctxlist = ctx;
+ }
+}
+
+/* create a new thread */
+static re_thread_t
+newthread(instr_t *pc, context_t *ctx)
+{
+ re_thread_t t = {pc, ctx};
+
+ return t;
+}
+
+/* create a new thread list */
+static threadlist_t *
+threadlist(uint32_t instrc)
+{
+ return calloc(1, sizeof(threadlist_t) + (instrc * sizeof(re_thread_t)));
+}
+
+/* add a new thread to the list */
+static int
+addthread(re_t *re, threadlist_t *list, re_thread_t t, input_t *in, int delta, int width)
+{
+ context_t *ctx;
+
+ if (width) {
+ /* we don't check generation for 0-width matches */
+ if (t.pc->gen == re->gen) {
+ decref(re, t.ctx);
+ return 1;
+ }
+ t.pc->gen = re->gen;
+ }
+ switch(t.pc->op) {
+ case OP_JUMP:
+ addthread(re, list, newthread(t.pc->lhs, t.ctx), in, delta, width);
+ break;
+ case OP_SPLIT:
+ addthread(re, list, newthread(t.pc->lhs, incref(t.ctx)), in, delta, width);
+ addthread(re, list, newthread(t.pc->rhs, t.ctx), in, delta, width);
+ break;
+ case OP_SAVE:
+ if ((ctx = mark(re, t.ctx, t.pc->n, in, delta)) == NULL) {
+ return 0;
+ }
+ addthread(re, list, newthread(t.pc+1, ctx), in, delta, width);
+ break;
+ default:
+ list->t[list->nthreads++] = t;
+ break;
+ }
+ return 1;
+}
+
+/* return 1 at beginning of line */
+static inline int
+isbegline(re_t *re, input_t *in)
+{
+ return (re->flags & AGCRE_REG_NOTBOL) ? 0 : (in->prevch == '\n' || in->prevch == BOL);
+}
+
+/* return 1 at end of line */
+static inline int
+isendline(re_t *re, input_t *in)
+{
+ return (re->flags & AGCRE_REG_NOTEOL) ? 0 : in->c == in->eo;
+}
+
+/* return 1 at beginning of words */
+static inline int
+isbegword(re_t *re, input_t *in)
+{
+ return (isbegline(re, in) || !unicode_isident(in->prevch)) &&
+ unicode_isident(in->ch);
+}
+
+/* return 1 at end of words */
+static inline int
+isendword(re_t *re, input_t *in)
+{
+ return (isendline(re, in) || (in->c > in->so && unicode_isident(in->prevch))) &&
+ (!unicode_isident(in->ch));
+}
+
+/* do the chars match? */
+static inline int
+charmatch(re_t *re, instr_t *instr, uint32_t ch, uint32_t wanted)
+{
+ if ((re->flags & AGCRE_REG_ICASE) || instr->icase) {
+ return unicode_tolower(ch) == unicode_tolower(wanted);
+ }
+ return (ch == wanted);
+}
+
+/* can we match "any" character? */
+static inline int
+anymatch(re_t *re, input_t *in)
+{
+ if ((re->flags & AGCRE_REG_NEWLINE) && in->ch == '\n') {
+ return 0;
+ }
+ return (in->c < in->eo);
+}
+
+/* are the chars in the set? */
+static inline int
+setmatch(re_t *re, set_t *set, input_t *in, instr_t *instr)
+{
+ if ((re->flags & AGCRE_REG_ICASE) || instr->icase) {
+ return set_isset(set, unicode_tolower(in->ch)) ||
+ set_isset(set, unicode_toupper(in->ch));
+ }
+ return set_isset(set, in->ch);
+}
+
+/* is this a zerowidth match? */
+static inline int
+zeromatch(re_t *re, uint32_t type, input_t *in)
+{
+ switch(type) {
+ case '^':
+ return isbegline(re, in);
+ case '$':
+ return isendline(re, in);
+ case '<':
+ return isbegword(re, in);
+ case '>':
+ return isendword(re, in);
+ case 'b':
+ return isbegword(re, in) || isendword(re, in);
+ }
+ return 0;
+}
+
+/* we take the leftmost longest match */
+static inline int
+is_better_match(context_t *ctx, context_t *prev)
+{
+ agcre_regoff_t prevlen;
+ agcre_regoff_t ctxlen;
+
+ prevlen = (prev) ? prev->sub[0].rm_eo - prev->sub[0].rm_so : -1;
+ ctxlen = ctx->sub[0].rm_eo - ctx->sub[0].rm_so;
+ if (ctxlen > prevlen) {
+ return 1;
+ }
+ return (ctxlen == prevlen) ? (ctx->sub[0].rm_so < prev->sub[0].rm_so) : 0;
+}
+
+/* get length of utf16 string in 16bit chars */
+static inline size_t
+strlen16(const uint16_t *s)
+{
+ uint32_t i;
+
+ for (i = 0 ; s[i] != 0x0000 ; i++) {
+ }
+ return i;
+}
+
+/* get length of utf32 string in 32bit chars */
+static inline size_t
+strlen32(const uint32_t *s)
+{
+ uint32_t i;
+
+ for (i = 0 ; s[i] != 0 ; i++) {
+ }
+ return i;
+}
+
+/* get length of string in terms of bytes */
+static inline uint32_t
+get_length_in_bytes(const void *vs, uint32_t flags)
+{
+ switch(flags & AGCRE_IN_WIDTH) {
+ case AGCRE_REG_UTF16LE:
+ case AGCRE_REG_UTF16BE:
+ return strlen16(vs) * 2;
+ case AGCRE_REG_UTF32LE:
+ case AGCRE_REG_UTF32BE:
+ return strlen32(vs) * 4;
+ default:
+ return strlen(vs);
+ }
+}
+
+/* free all contexts */
+static inline void
+free_contexts(context_t *ctx)
+{
+ context_t *next;
+
+ for ( ; ctx ; ctx = next) {
+ next = ctx->next;
+ free(ctx);
+ }
+}
+
+/* unwrap a character, including UTF-8 chars, into 32bit char */
+static inline int
+lex_unwrap_utf8(input_t *in)
+{
+ uint32_t n;
+ uint32_t j;
+
+ in->ch = in->s[in->c];
+ if (in->ch <= 0x7f) {
+ /* 0XXX XXXX one byte */
+ in->bytes = 1;
+ n = in->ch;
+ } else if ((in->ch & 0xe0) == 0xc0) {
+ /* 110X XXXX 2 bytes */
+ in->bytes = 2;
+ n = (in->ch & 0x1f);
+ } else if ((in->ch & 0xf0) == 0xe0) {
+ /* 1110 XXXX three bytes */
+ in->bytes = 3;
+ n = (in->ch & 0x0f);
+ } else if ((in->ch & 0xf8) == 0xf0) {
+ /* 1111 0XXX four bytes */
+ in->bytes = 4;
+ n = (in->ch & 0x07);
+ } else if ((in->ch & 0xfc) == 0xf8) {
+ /* 1111 10XX five bytes */
+ in->bytes = 5;
+ n = (in->ch & 0x03);
+ } else if ((in->ch & 0xfe) == 0xfc) {
+ /* 1111 110X six bytes */
+ in->bytes = 6;
+ n = (in->ch & 0x01);
+ } else {
+ return 0;
+ }
+ if (in->c + in->bytes - 1 > in->eo) {
+ return 0;
+ }
+ for (j = 1 ; j < in->bytes ; j++) {
+ n = (n << 6) | (in->s[in->c + j] & 0x3f);
+ }
+ in->ch = n;
+ in->c += in->bytes;
+ return 1;
+}
+
+/* return 1 if host is bigendian */
+static inline int
+lex_host_is_bigendian(void)
+{
+ int indian = 1;
+
+ return (*(char *)(void *)&indian) ? 0 : 1;
+}
+
+/* swap 32bit integer */
+static inline uint32_t
+lex_swap32(uint32_t x)
+{
+ x = ((x & 0x0000ffff) << 16) | ((x & 0xffff0000) >> 16);
+ x = ((x & 0x00ff00ff) << 8) | ((x & 0xff00ff00) >> 8);
+ return x;
+}
+
+/* swap 16bit integer */
+static inline uint32_t
+lex_swap16(uint16_t x)
+{
+ return ((x & 0x00ff) << 8) | ((x & 0xff00) >> 8);
+}
+
+/* return the next char */
+static inline int
+in_nextchar(input_t *in)
+{
+ uint32_t i32;
+ uint16_t i16;
+
+ if (in->c == in->eo) {
+ return 0;
+ }
+ in->prevch = in->ch;
+ in->prevbytes = in->bytes;
+ switch(in->re->flags & AGCRE_IN_WIDTH) {
+ case AGCRE_REG_UTF16LE:
+ in->bytes = 2;
+ memcpy(&i16, &in->s[in->c], in->bytes);
+ in->ch = (lex_host_is_bigendian()) ? lex_swap16(i16) : i16;
+ in->c += in->bytes;
+ return 1;
+ case AGCRE_REG_UTF16BE:
+ in->bytes = 2;
+ memcpy(&i16, &in->s[in->c], in->bytes);
+ in->ch = (!lex_host_is_bigendian()) ? lex_swap16(i16) : i16;
+ in->c += in->bytes;
+ return 1;
+ case AGCRE_REG_UTF32LE:
+ in->bytes = 4;
+ memcpy(&i32, &in->s[in->c], in->bytes);
+ in->ch = (lex_host_is_bigendian()) ? lex_swap32(i32) : i32;
+ in->c += in->bytes;
+ return 1;
+ case AGCRE_REG_UTF32BE:
+ in->bytes = 4;
+ memcpy(&i32, &in->s[in->c], in->bytes);
+ in->ch = (!lex_host_is_bigendian()) ? lex_swap32(i32) : i32;
+ in->c += in->bytes;
+ return 1;
+ default:
+ return lex_unwrap_utf8(in);
+ }
+}
+
+/* look ahead */
+static inline uint32_t
+in_peek(input_t *in, int32_t delta)
+{
+ return in->s[in->c + delta];
+}
+
+/* keep an error message, only if we don't have one already */
+static int
+lex_error_message(input_t *in, const char *fmt, ...)
+{
+ va_list args;
+
+ if (in->msgc == 0) {
+ va_start(args, fmt);
+ in->msgc = vsnprintf(in->msg, sizeof(in->msg), fmt, args);
+ va_end(args);
+ }
+ return TOK_BROKEN;
+}
+
+/* do we have a lazy marker? */
+static inline void
+lazy(input_t *in, retoken_t *token)
+{
+ if (in_peek(in, 0) == '?') {
+ in_nextchar(in);
+ token->lazy = 1;
+ }
+}
+
+/* get a number, up to width digits wide */
+static inline int
+get_number(input_t *in, uint32_t *num, uint32_t width, uint32_t base)
+{
+ static const char *hex = "0123456789abcdef";
+ const char *p;
+ uint32_t i;
+
+ for (*num = 0, i = 0 ; i < width ; i++) {
+ if ((p = strchr(hex, in_peek(in, i))) == NULL ||
+ (uint32_t)(p - hex) >= base) {
+ break;
+ }
+ *num = (*num * base) + (uint32_t)(p - hex);
+ }
+ if (i > width) {
+ return 0;
+ }
+ in->c += i;
+ return 1;
+}
+
+/* recognise a specific, bounded repeat */
+static inline int
+rep(input_t *in)
+{
+ retoken_t *parse;
+
+ parse = &in->parse;
+ if (!get_number(in, &parse->lo, 4, 10)) {
+ return lex_error_message(in, "number too large '%.6s", &in->s[parse->so]);
+ }
+ parse->hi = parse->lo;
+ if (in_peek(in, 0) == ',') {
+ in_nextchar(in);
+ if (!get_number(in, &parse->hi, 4, 10)) {
+ return lex_error_message(in, "number too large '%.6s", &in->s[parse->so]);
+ }
+ if (parse->hi == 0) {
+ parse->hi = AT_LEAST;
+ }
+ }
+ if (parse->lo > parse->hi) {
+ return lex_error_message(in, "low %u > hi %u", parse->lo, parse->hi);
+ }
+ in_nextchar(in);
+ if (in->ch != '}') {
+ return lex_error_message(in, "expected } at end of repeat", &in->s[parse->so]);
+ }
+ return TOK_REP;
+}
+
+/* recognise a backslash sequence */
+static int
+rec_backslash(input_t *in, retoken_t *token, int want_backrefs)
+{
+ in_nextchar(in);
+ switch(token->ch = in->ch) {
+ case 'u':
+ if (!get_number(in, &token->ch, 4, 16)) {
+ return lex_error_message(in, "bad unicode character '%.6s", &in->s[token->so]);
+ }
+ break;
+ case 'n':
+ token->ch = '\n';
+ return TOK_LIT;
+ case 'r':
+ token->ch = '\r';
+ return TOK_LIT;
+ case 't':
+ token->ch = '\t';
+ return TOK_LIT;
+ case '0': case '1': case '2': case '3': case '4':
+ case '5': case '6': case '7': case '8': case '9':
+ in->c -= in->prevbytes;
+ if (want_backrefs && get_number(in, &token->ch, 2, 10)) {
+ return TOK_BACKREF;
+ }
+ break;
+ default:
+ if (strchr("<>b", token->ch) != NULL) {
+ return TOK_ZEROWIDTH;
+ }
+ if (strchr("DPSWdpsw", token->ch) != NULL) {
+ return TOK_WIDTH;
+ }
+ break;
+ }
+ return TOK_LIT;
+}
+
+/* function to calculate a hash, used in stringswitch */
+static uint32_t
+djbhashn(const char *s, size_t len)
+{
+ uint32_t hash;
+
+ for (hash = 5381; len-- > 0 ; ) {
+ hash = hash * 33 + *s++;
+ }
+ return hash + (hash >> 5);
+}
+
+/* recognise a posix class */
+static int
+rec_posix_class(input_t *in, set_t *set)
+{
+ char *p;
+
+ if ((p = memchr(&in->s[in->c], ']', 10)) == NULL) {
+ return 0;
+ }
+ switch(djbhashn(&in->s[in->c], (size_t)(p - &in->s[in->c]) + 1)) {
+ case /* ":alnum:]" */ 0x32ef56fd:
+ set_add_callback(set, unicode_isalnum, 0);
+ in->c += 8;
+ return 1;
+ case /* ":alpha:]" */ 0x330d1aee:
+ set_add_callback(set, unicode_isalpha, 0);
+ in->c += 8;
+ return 1;
+ case /* ":blank:]" */ 0x815acf00:
+ set_add_callback(set, unicode_isblank, 0);
+ in->c += 8;
+ return 1;
+ case /* ":cntrl:]" */ 0xd6f15906:
+ set_add_callback(set, unicode_iscntrl, 0);
+ in->c += 8;
+ return 1;
+ case /* ":digit:]" */ 0x1154c1e2:
+ set_add_callback(set, unicode_isdigit, 0);
+ in->c += 8;
+ return 1;
+ case /* ":graph:]" */ 0xcb51807:
+ set_add_callback(set, unicode_isgraph, 0);
+ in->c += 8;
+ return 1;
+ case /* ":ident:]" */ 0x8a1572f1:
+ set_add_callback(set, unicode_isident, 0);
+ in->c += 8;
+ return 1;
+ case /* ":lower:]" */ 0x8bfc6af8:
+ set_add_callback(set, unicode_islower, 0);
+ in->c += 8;
+ return 1;
+ case /* ":print:]" */ 0xc7bbf92b:
+ set_add_callback(set, unicode_isprint, 0);
+ in->c += 8;
+ return 1;
+ case /* ":punct:]" */ 0xcf4a84ca:
+ set_add_callback(set, unicode_ispunct, 0);
+ in->c += 8;
+ return 1;
+ case /* ":space:]" */ 0xa876bcf2:
+ set_add_callback(set, unicode_isspace, 0);
+ in->c += 8;
+ return 1;
+ case /* ":upper:]" */ 0x40541fdb:
+ set_add_callback(set, unicode_isupper, 0);
+ in->c += 8;
+ return 1;
+ case /* ":xdigit:]" */ 0x24758e84:
+ set_add_callback(set, unicode_isxdigit, 0);
+ in->c += 9;
+ return 1;
+ default:
+ return 0;
+ }
+}
+
+/* recognise a set */
+static int
+rec_set(input_t *in)
+{
+ const int nobackrefs = 0;
+ retoken_t *token;
+ retoken_t bs;
+ uint32_t prevch;
+ set_t *set;
+ int complement;
+ int done;
+ int ch;
+
+ if (memcmp(&in->s[in->c], "[:<:]]", 6) == 0 ||
+ memcmp(&in->s[in->c], "[:>:]]", 6) == 0) {
+ in->parse.ch = in->s[in->c + 2];
+ in->c += 6;
+ return TOK_ZEROWIDTH;
+ }
+ if (in->re->setc == in->re->maxset) {
+ set = realloc(in->re->sets, (in->re->maxset + 8) * sizeof(*set));
+ if (set == NULL) {
+ return lex_error_message(in, "can't extend sets past %u", in->re->maxset);
+ }
+ memset(&set[in->re->setc], 0x0, 8 * sizeof(*set));
+ in->re->maxset += 8;
+ in->re->sets = set;
+ }
+ set = &in->re->sets[in->parse.ch = in->re->setc++];
+ token = &in->parse;
+ complement = 0;
+ if (in_peek(in, 0) == '^') {
+ complement = 1;
+ set_complement(set);
+ in_nextchar(in);
+ }
+ for (done = 0, prevch = in->prevch ; !done && in_nextchar(in) ; ) {
+ switch(ch = in->ch) {
+ case '\\':
+ memset(&bs, 0x0, sizeof(bs));
+ switch (rec_backslash(in, &bs, nobackrefs)) {
+ case TOK_BROKEN:
+ return lex_error_message(in, "bad backslash '%.10s'", &in->s[token->so]);
+ case TOK_LIT:
+ set_add_case(set, bs.ch);
+ break;
+ case TOK_WIDTH:
+ set_add_callback(set,
+ (token->ch == 'd' || token->ch == 'D') ? unicode_isdigit :
+ (token->ch == 'p' || token->ch == 'P') ? unicode_isprint :
+ (token->ch == 's' || token->ch == 'S') ? unicode_isspace :
+ unicode_isident,
+ unicode_isupper((uint8_t)token->ch));
+ break;
+ }
+ break;
+ case '[':
+ if (!rec_posix_class(in, set)) {
+ return lex_error_message(in, "bad posix class '%.30s'", &in->s[token->so]);
+ }
+ break;
+ case ']':
+ if (in->c == token->so + complement + 2) {
+ set_add_case(set, ']');
+ } else {
+ done = 1;
+ }
+ break;
+ case '-':
+ if (in->c == token->so + complement + 2) {
+ set_add_case(set, '-');
+ } else {
+ prevch = in->prevch;
+ in_nextchar(in);
+ set_add_range(set, prevch, in->ch);
+ }
+ break;
+ default:
+ set_add_case(set, ch);
+ break;
+ }
+ }
+ if (in->ch != ']') {
+ return lex_error_message(in, "expected ']' at end of set: %s", &in->s[token->so]);
+ }
+ return TOK_SET;
+}
+
+/* lexer function */
+static int
+get_next_token(input_t *in)
+{
+ const int backrefs = 1;
+ retoken_t *token = &in->parse;
+
+ memset(token, 0x0, sizeof(*token));
+ token->so = in->c;
+ if (!in_nextchar(in)) {
+ return token->type = TOK_EOL;
+ }
+ token->icase = in->icase;
+ switch(token->ch = in->ch) {
+ case '(':
+ if (in_peek(in, 0) == '?') {
+ if (in_peek(in, 1) == ':') {
+ in->c += 2;
+ token->noncapture = 1;
+ }
+ if (in_peek(in, 1) == 'i') {
+ in->c += 3;
+ in->icase = 1;
+ return get_next_token(in);
+ }
+ if (in_peek(in, 1) == '-' && in_peek(in, 2) == 'i') {
+ in->c += 4;
+ in->icase = 0;
+ return get_next_token(in);
+ }
+ }
+ return token->type = TOK_PAREN;
+ case ')':
+ return token->type = TOK_PAREN;
+ case '?':
+ lazy(in, token);
+ return token->type = TOK_QUEST;
+ case '*':
+ lazy(in, token);
+ return token->type = TOK_STAR;
+ case '+':
+ lazy(in, token);
+ return token->type = TOK_PLUS;
+ case '|':
+ return token->type = TOK_ALT;
+ case '{':
+ return token->type = rep(in);
+ case '^':
+ case '$':
+ return token->type = TOK_ZEROWIDTH;
+ case '[':
+ return token->type = rec_set(in);
+ case '.':
+ return token->type = TOK_DOT;
+ case '\\':
+ return token->type = rec_backslash(in, token, backrefs);
+ default:
+ return token->type = TOK_LIT;
+ }
+}
+
+/* free resources used by a set */
+static void
+set_free(set_t *set)
+{
+ free(set->ranges);
+ free(set->cases);
+ free(set->callbacks);
+}
+
+/* add a single 'case' to a set - a single member */
+static int
+set_add_case(set_t *set, uint32_t n)
+{
+ uint32_t *v;
+
+ if (set->casec == set->maxcase) {
+ v = realloc(set->cases, (set->maxcase + 16) * sizeof(*v));
+ if (v == NULL) {
+ fprintf(stderr, "set_add_case: failure\n");
+ return 0;
+ }
+ set->maxcase += 16;
+ set->cases = v;
+ }
+ set->cases[set->casec++] = n;
+ return 1;
+}
+
+/* add a range to a set */
+static int
+set_add_range(set_t *set, uint32_t lo, uint32_t hi)
+{
+ uint32_t *v;
+
+ if (set->rangec == set->maxrange) {
+ v = realloc(set->ranges, (set->maxrange + 16) * sizeof(*v));
+ if (v == NULL) {
+ fprintf(stderr, "set_add_range: failure\n");
+ return 0;
+ }
+ set->maxrange += 16;
+ set->ranges = v;
+ }
+ set->ranges[set->rangec++] = lo;
+ set->ranges[set->rangec++] = hi;
+ return 1;
+}
+
+/* add a callback function to a set */
+static int
+set_add_callback(set_t *set, int (*func)(uint32_t), int negate)
+{
+ cb_t *v;
+
+ if (set->callbackc == set->maxcallback) {
+ v = realloc(set->callbacks, (set->maxcallback + 16) * sizeof(*v));
+ if (v == NULL) {
+ fprintf(stderr, "set_add_callback: failure\n");
+ return 0;
+ }
+ set->maxcallback += 16;
+ set->callbacks = v;
+ }
+ set->callbacks[set->callbackc].func = func;
+ set->callbacks[set->callbackc++].negate = negate;
+ return 1;
+}
+
+/* test if a number is present in the set */
+static int
+set_isset(set_t *set, uint32_t n)
+{
+ uint32_t i;
+
+ for (i = 0 ; i < set->casec ; i++) {
+ if (set->cases[i] == n) {
+ return 1 - set->complement;
+ }
+ }
+ for (i = 0 ; i < set->rangec ; i += 2) {
+ if (n >= set->ranges[i] && n <= set->ranges[i + 1]) {
+ return 1 - set->complement;
+ }
+ }
+ for (i = 0 ; i < set->callbackc ; i += 2) {
+ if ((*set->callbacks[i].func)(n) == 1 - set->callbacks[i].negate) {
+ return 1 - set->complement;
+ }
+ }
+ return set->complement;
+}
+
+/* set the complement flag */
+static int
+set_complement(set_t *set)
+{
+ return set->complement = 1 - set->complement;
+}
+
+#define RANGE(a, b) do { \
+ if (ch >= (a) && ch <= (b)) { \
+ return 1; \
+ } \
+} while(/*CONSTCOND*/0)
+
+
+/* return 1 if ch has White Space property */
+static int
+unicode_isWhite_Space(uint32_t ch)
+{
+ switch(ch) {
+ case 0x0020:
+ case 0x0085:
+ case 0x00A0:
+ case 0x1680:
+ case 0x2028:
+ case 0x2029:
+ case 0x202F:
+ case 0x205F:
+ case 0x3000:
+ return 1;
+ }
+ RANGE(0x0009, 0x000D);
+ RANGE(0x2000, 0x200A);
+ return 0;
+}
+
+/* return 1 if ch has Dash property */
+static int
+unicode_isDash(uint32_t ch)
+{
+ switch(ch) {
+ case 0x002D:
+ case 0x058A:
+ case 0x05BE:
+ case 0x1400:
+ case 0x1806:
+ case 0x2053:
+ case 0x207B:
+ case 0x208B:
+ case 0x2212:
+ case 0x2E17:
+ case 0x2E1A:
+ case 0x2E40:
+ case 0x301C:
+ case 0x3030:
+ case 0x30A0:
+ case 0xFE58:
+ case 0xFE63:
+ case 0xFF0D:
+ return 1;
+ }
+ RANGE(0x2010, 0x2015);
+ RANGE(0x2E3A, 0x2E3B);
+ RANGE(0xFE31, 0xFE32);
+ return 0;
+}
+
+/* return 1 if ch has Hyphen property */
+static int
+unicode_isHyphen(uint32_t ch)
+{
+ switch(ch) {
+ case 0x002D:
+ case 0x00AD:
+ case 0x058A:
+ case 0x1806:
+ case 0x2E17:
+ case 0x30FB:
+ case 0xFE63:
+ case 0xFF0D:
+ case 0xFF65:
+ return 1;
+ }
+ RANGE(0x2010, 0x2011);
+ return 0;
+}
+
+/* return 1 if ch has Quotation Mark property */
+static int
+unicode_isQuotation_Mark(uint32_t ch)
+{
+ switch(ch) {
+ case 0x0022:
+ case 0x0027:
+ case 0x00AB:
+ case 0x00BB:
+ case 0x2018:
+ case 0x2019:
+ case 0x201A:
+ case 0x201D:
+ case 0x201E:
+ case 0x201F:
+ case 0x2039:
+ case 0x203A:
+ case 0x2E42:
+ case 0x300C:
+ case 0x300D:
+ case 0x300E:
+ case 0x300F:
+ case 0x301D:
+ case 0xFE41:
+ case 0xFE42:
+ case 0xFE43:
+ case 0xFE44:
+ case 0xFF02:
+ case 0xFF07:
+ case 0xFF62:
+ case 0xFF63:
+ return 1;
+ }
+ RANGE(0x201B, 0x201C);
+ RANGE(0x301E, 0x301F);
+ return 0;
+}
+
+/* return 1 if ch has Terminal Punctuation property */
+static int
+unicode_isTerminal_Punctuation(uint32_t ch)
+{
+ switch(ch) {
+ case 0x0021:
+ case 0x002C:
+ case 0x002E:
+ case 0x003F:
+ case 0x037E:
+ case 0x0387:
+ case 0x0589:
+ case 0x05C3:
+ case 0x060C:
+ case 0x061B:
+ case 0x061F:
+ case 0x06D4:
+ case 0x070C:
+ case 0x085E:
+ case 0x0F08:
+ case 0x17DA:
+ case 0x2E2E:
+ case 0x2E3C:
+ case 0x2E41:
+ case 0xA92F:
+ case 0xAADF:
+ case 0xABEB:
+ case 0xFF01:
+ case 0xFF0C:
+ case 0xFF0E:
+ case 0xFF1F:
+ case 0xFF61:
+ case 0xFF64:
+ case 0x1039F:
+ case 0x103D0:
+ case 0x10857:
+ case 0x1091F:
+ case 0x111CD:
+ case 0x112A9:
+ case 0x1145B:
+ case 0x11C71:
+ case 0x16AF5:
+ case 0x16B44:
+ case 0x1BC9F:
+ return 1;
+ }
+ RANGE(0x003A, 0x003B);
+ RANGE(0x0700, 0x070A);
+ RANGE(0x07F8, 0x07F9);
+ RANGE(0x0830, 0x083E);
+ RANGE(0x0964, 0x0965);
+ RANGE(0x0E5A, 0x0E5B);
+ RANGE(0x0F0D, 0x0F12);
+ RANGE(0x104A, 0x104B);
+ RANGE(0x1361, 0x1368);
+ RANGE(0x166D, 0x166E);
+ RANGE(0x16EB, 0x16ED);
+ RANGE(0x1735, 0x1736);
+ RANGE(0x17D4, 0x17D6);
+ RANGE(0x1802, 0x1805);
+ RANGE(0x1808, 0x1809);
+ RANGE(0x1944, 0x1945);
+ RANGE(0x1AA8, 0x1AAB);
+ RANGE(0x1B5A, 0x1B5B);
+ RANGE(0x1B5D, 0x1B5F);
+ RANGE(0x1C3B, 0x1C3F);
+ RANGE(0x1C7E, 0x1C7F);
+ RANGE(0x203C, 0x203D);
+ RANGE(0x2047, 0x2049);
+ RANGE(0x3001, 0x3002);
+ RANGE(0xA4FE, 0xA4FF);
+ RANGE(0xA60D, 0xA60F);
+ RANGE(0xA6F3, 0xA6F7);
+ RANGE(0xA876, 0xA877);
+ RANGE(0xA8CE, 0xA8CF);
+ RANGE(0xA9C7, 0xA9C9);
+ RANGE(0xAA5D, 0xAA5F);
+ RANGE(0xAAF0, 0xAAF1);
+ RANGE(0xFE50, 0xFE52);
+ RANGE(0xFE54, 0xFE57);
+ RANGE(0xFF1A, 0xFF1B);
+ RANGE(0x10A56, 0x10A57);
+ RANGE(0x10AF0, 0x10AF5);
+ RANGE(0x10B3A, 0x10B3F);
+ RANGE(0x10B99, 0x10B9C);
+ RANGE(0x11047, 0x1104D);
+ RANGE(0x110BE, 0x110C1);
+ RANGE(0x11141, 0x11143);
+ RANGE(0x111C5, 0x111C6);
+ RANGE(0x111DE, 0x111DF);
+ RANGE(0x11238, 0x1123C);
+ RANGE(0x1144B, 0x1144D);
+ RANGE(0x115C2, 0x115C5);
+ RANGE(0x115C9, 0x115D7);
+ RANGE(0x11641, 0x11642);
+ RANGE(0x1173C, 0x1173E);
+ RANGE(0x11A42, 0x11A43);
+ RANGE(0x11A9B, 0x11A9C);
+ RANGE(0x11AA1, 0x11AA2);
+ RANGE(0x11C41, 0x11C43);
+ RANGE(0x12470, 0x12474);
+ RANGE(0x16A6E, 0x16A6F);
+ RANGE(0x16B37, 0x16B39);
+ RANGE(0x1DA87, 0x1DA8A);
+ return 0;
+}
+
+/* return 1 if ch has Hex Digit property */
+static int
+unicode_isHex_Digit(uint32_t ch)
+{
+ RANGE(0x0030, 0x0039);
+ RANGE(0x0041, 0x0046);
+ RANGE(0x0061, 0x0066);
+ RANGE(0xFF10, 0xFF19);
+ RANGE(0xFF21, 0xFF26);
+ RANGE(0xFF41, 0xFF46);
+ return 0;
+}
+
+/* return 1 if ch has Other Alphabetic property */
+static int
+unicode_isOther_Alphabetic(uint32_t ch)
+{
+ switch(ch) {
+ case 0x0345:
+ case 0x05BF:
+ case 0x05C7:
+ case 0x0670:
+ case 0x06ED:
+ case 0x0711:
+ case 0x0903:
+ case 0x093A:
+ case 0x093B:
+ case 0x0981:
+ case 0x09D7:
+ case 0x0A03:
+ case 0x0A51:
+ case 0x0A75:
+ case 0x0A83:
+ case 0x0AC9:
+ case 0x0B01:
+ case 0x0B3E:
+ case 0x0B3F:
+ case 0x0B40:
+ case 0x0B56:
+ case 0x0B57:
+ case 0x0B82:
+ case 0x0BC0:
+ case 0x0BD7:
+ case 0x0C00:
+ case 0x0C81:
+ case 0x0CBE:
+ case 0x0CBF:
+ case 0x0CC6:
+ case 0x0CCC:
+ case 0x0D57:
+ case 0x0DD6:
+ case 0x0E31:
+ case 0x0E4D:
+ case 0x0EB1:
+ case 0x0ECD:
+ case 0x0F7F:
+ case 0x1031:
+ case 0x1038:
+ case 0x1062:
+ case 0x1082:
+ case 0x109C:
+ case 0x109D:
+ case 0x135F:
+ case 0x17B6:
+ case 0x17C6:
+ case 0x18A9:
+ case 0x1932:
+ case 0x1A1B:
+ case 0x1A55:
+ case 0x1A56:
+ case 0x1A57:
+ case 0x1A61:
+ case 0x1A62:
+ case 0x1B04:
+ case 0x1B35:
+ case 0x1B3B:
+ case 0x1B3C:
+ case 0x1B42:
+ case 0x1B43:
+ case 0x1B82:
+ case 0x1BA1:
+ case 0x1BE7:
+ case 0x1BED:
+ case 0x1BEE:
+ case 0xA827:
+ case 0xA8C5:
+ case 0xA952:
+ case 0xA983:
+ case 0xA9BC:
+ case 0xAA43:
+ case 0xAA4C:
+ case 0xAA4D:
+ case 0xAAB0:
+ case 0xAABE:
+ case 0xAAEB:
+ case 0xAAF5:
+ case 0xABE5:
+ case 0xABE8:
+ case 0xFB1E:
+ case 0x11000:
+ case 0x11001:
+ case 0x11002:
+ case 0x11082:
+ case 0x1112C:
+ case 0x11182:
+ case 0x111BF:
+ case 0x11234:
+ case 0x11237:
+ case 0x1123E:
+ case 0x112DF:
+ case 0x11340:
+ case 0x11357:
+ case 0x11445:
+ case 0x114B9:
+ case 0x114BA:
+ case 0x114C1:
+ case 0x115BE:
+ case 0x1163D:
+ case 0x1163E:
+ case 0x11640:
+ case 0x116AB:
+ case 0x116AC:
+ case 0x116AD:
+ case 0x11726:
+ case 0x11A39:
+ case 0x11A97:
+ case 0x11C2F:
+ case 0x11C3E:
+ case 0x11CA9:
+ case 0x11CB1:
+ case 0x11CB4:
+ case 0x11D3A:
+ case 0x11D43:
+ case 0x11D47:
+ case 0x1BC9E:
+ case 0x1E947:
+ return 1;
+ }
+ RANGE(0x05B0, 0x05BD);
+ RANGE(0x05C1, 0x05C2);
+ RANGE(0x05C4, 0x05C5);
+ RANGE(0x0610, 0x061A);
+ RANGE(0x064B, 0x0657);
+ RANGE(0x0659, 0x065F);
+ RANGE(0x06D6, 0x06DC);
+ RANGE(0x06E1, 0x06E4);
+ RANGE(0x06E7, 0x06E8);
+ RANGE(0x0730, 0x073F);
+ RANGE(0x07A6, 0x07B0);
+ RANGE(0x0816, 0x0817);
+ RANGE(0x081B, 0x0823);
+ RANGE(0x0825, 0x0827);
+ RANGE(0x0829, 0x082C);
+ RANGE(0x08D4, 0x08DF);
+ RANGE(0x08E3, 0x08E9);
+ RANGE(0x08F0, 0x0902);
+ RANGE(0x093E, 0x0940);
+ RANGE(0x0941, 0x0948);
+ RANGE(0x0949, 0x094C);
+ RANGE(0x094E, 0x094F);
+ RANGE(0x0955, 0x0957);
+ RANGE(0x0962, 0x0963);
+ RANGE(0x0982, 0x0983);
+ RANGE(0x09BE, 0x09C0);
+ RANGE(0x09C1, 0x09C4);
+ RANGE(0x09C7, 0x09C8);
+ RANGE(0x09CB, 0x09CC);
+ RANGE(0x09E2, 0x09E3);
+ RANGE(0x0A01, 0x0A02);
+ RANGE(0x0A3E, 0x0A40);
+ RANGE(0x0A41, 0x0A42);
+ RANGE(0x0A47, 0x0A48);
+ RANGE(0x0A4B, 0x0A4C);
+ RANGE(0x0A70, 0x0A71);
+ RANGE(0x0A81, 0x0A82);
+ RANGE(0x0ABE, 0x0AC0);
+ RANGE(0x0AC1, 0x0AC5);
+ RANGE(0x0AC7, 0x0AC8);
+ RANGE(0x0ACB, 0x0ACC);
+ RANGE(0x0AE2, 0x0AE3);
+ RANGE(0x0AFA, 0x0AFC);
+ RANGE(0x0B02, 0x0B03);
+ RANGE(0x0B41, 0x0B44);
+ RANGE(0x0B47, 0x0B48);
+ RANGE(0x0B4B, 0x0B4C);
+ RANGE(0x0B62, 0x0B63);
+ RANGE(0x0BBE, 0x0BBF);
+ RANGE(0x0BC1, 0x0BC2);
+ RANGE(0x0BC6, 0x0BC8);
+ RANGE(0x0BCA, 0x0BCC);
+ RANGE(0x0C01, 0x0C03);
+ RANGE(0x0C3E, 0x0C40);
+ RANGE(0x0C41, 0x0C44);
+ RANGE(0x0C46, 0x0C48);
+ RANGE(0x0C4A, 0x0C4C);
+ RANGE(0x0C55, 0x0C56);
+ RANGE(0x0C62, 0x0C63);
+ RANGE(0x0C82, 0x0C83);
+ RANGE(0x0CC0, 0x0CC4);
+ RANGE(0x0CC7, 0x0CC8);
+ RANGE(0x0CCA, 0x0CCB);
+ RANGE(0x0CD5, 0x0CD6);
+ RANGE(0x0CE2, 0x0CE3);
+ RANGE(0x0D00, 0x0D01);
+ RANGE(0x0D02, 0x0D03);
+ RANGE(0x0D3E, 0x0D40);
+ RANGE(0x0D41, 0x0D44);
+ RANGE(0x0D46, 0x0D48);
+ RANGE(0x0D4A, 0x0D4C);
+ RANGE(0x0D62, 0x0D63);
+ RANGE(0x0D82, 0x0D83);
+ RANGE(0x0DCF, 0x0DD1);
+ RANGE(0x0DD2, 0x0DD4);
+ RANGE(0x0DD8, 0x0DDF);
+ RANGE(0x0DF2, 0x0DF3);
+ RANGE(0x0E34, 0x0E3A);
+ RANGE(0x0EB4, 0x0EB9);
+ RANGE(0x0EBB, 0x0EBC);
+ RANGE(0x0F71, 0x0F7E);
+ RANGE(0x0F80, 0x0F81);
+ RANGE(0x0F8D, 0x0F97);
+ RANGE(0x0F99, 0x0FBC);
+ RANGE(0x102B, 0x102C);
+ RANGE(0x102D, 0x1030);
+ RANGE(0x1032, 0x1036);
+ RANGE(0x103B, 0x103C);
+ RANGE(0x103D, 0x103E);
+ RANGE(0x1056, 0x1057);
+ RANGE(0x1058, 0x1059);
+ RANGE(0x105E, 0x1060);
+ RANGE(0x1067, 0x1068);
+ RANGE(0x1071, 0x1074);
+ RANGE(0x1083, 0x1084);
+ RANGE(0x1085, 0x1086);
+ RANGE(0x1712, 0x1713);
+ RANGE(0x1732, 0x1733);
+ RANGE(0x1752, 0x1753);
+ RANGE(0x1772, 0x1773);
+ RANGE(0x17B7, 0x17BD);
+ RANGE(0x17BE, 0x17C5);
+ RANGE(0x17C7, 0x17C8);
+ RANGE(0x1885, 0x1886);
+ RANGE(0x1920, 0x1922);
+ RANGE(0x1923, 0x1926);
+ RANGE(0x1927, 0x1928);
+ RANGE(0x1929, 0x192B);
+ RANGE(0x1930, 0x1931);
+ RANGE(0x1933, 0x1938);
+ RANGE(0x1A17, 0x1A18);
+ RANGE(0x1A19, 0x1A1A);
+ RANGE(0x1A58, 0x1A5E);
+ RANGE(0x1A63, 0x1A64);
+ RANGE(0x1A65, 0x1A6C);
+ RANGE(0x1A6D, 0x1A72);
+ RANGE(0x1A73, 0x1A74);
+ RANGE(0x1B00, 0x1B03);
+ RANGE(0x1B36, 0x1B3A);
+ RANGE(0x1B3D, 0x1B41);
+ RANGE(0x1B80, 0x1B81);
+ RANGE(0x1BA2, 0x1BA5);
+ RANGE(0x1BA6, 0x1BA7);
+ RANGE(0x1BA8, 0x1BA9);
+ RANGE(0x1BAC, 0x1BAD);
+ RANGE(0x1BE8, 0x1BE9);
+ RANGE(0x1BEA, 0x1BEC);
+ RANGE(0x1BEF, 0x1BF1);
+ RANGE(0x1C24, 0x1C2B);
+ RANGE(0x1C2C, 0x1C33);
+ RANGE(0x1C34, 0x1C35);
+ RANGE(0x1CF2, 0x1CF3);
+ RANGE(0x1DE7, 0x1DF4);
+ RANGE(0x24B6, 0x24E9);
+ RANGE(0x2DE0, 0x2DFF);
+ RANGE(0xA674, 0xA67B);
+ RANGE(0xA69E, 0xA69F);
+ RANGE(0xA823, 0xA824);
+ RANGE(0xA825, 0xA826);
+ RANGE(0xA880, 0xA881);
+ RANGE(0xA8B4, 0xA8C3);
+ RANGE(0xA926, 0xA92A);
+ RANGE(0xA947, 0xA951);
+ RANGE(0xA980, 0xA982);
+ RANGE(0xA9B4, 0xA9B5);
+ RANGE(0xA9B6, 0xA9B9);
+ RANGE(0xA9BA, 0xA9BB);
+ RANGE(0xA9BD, 0xA9BF);
+ RANGE(0xAA29, 0xAA2E);
+ RANGE(0xAA2F, 0xAA30);
+ RANGE(0xAA31, 0xAA32);
+ RANGE(0xAA33, 0xAA34);
+ RANGE(0xAA35, 0xAA36);
+ RANGE(0xAAB2, 0xAAB4);
+ RANGE(0xAAB7, 0xAAB8);
+ RANGE(0xAAEC, 0xAAED);
+ RANGE(0xAAEE, 0xAAEF);
+ RANGE(0xABE3, 0xABE4);
+ RANGE(0xABE6, 0xABE7);
+ RANGE(0xABE9, 0xABEA);
+ RANGE(0x10376, 0x1037A);
+ RANGE(0x10A01, 0x10A03);
+ RANGE(0x10A05, 0x10A06);
+ RANGE(0x10A0C, 0x10A0F);
+ RANGE(0x11038, 0x11045);
+ RANGE(0x110B0, 0x110B2);
+ RANGE(0x110B3, 0x110B6);
+ RANGE(0x110B7, 0x110B8);
+ RANGE(0x11100, 0x11102);
+ RANGE(0x11127, 0x1112B);
+ RANGE(0x1112D, 0x11132);
+ RANGE(0x11180, 0x11181);
+ RANGE(0x111B3, 0x111B5);
+ RANGE(0x111B6, 0x111BE);
+ RANGE(0x1122C, 0x1122E);
+ RANGE(0x1122F, 0x11231);
+ RANGE(0x11232, 0x11233);
+ RANGE(0x112E0, 0x112E2);
+ RANGE(0x112E3, 0x112E8);
+ RANGE(0x11300, 0x11301);
+ RANGE(0x11302, 0x11303);
+ RANGE(0x1133E, 0x1133F);
+ RANGE(0x11341, 0x11344);
+ RANGE(0x11347, 0x11348);
+ RANGE(0x1134B, 0x1134C);
+ RANGE(0x11362, 0x11363);
+ RANGE(0x11435, 0x11437);
+ RANGE(0x11438, 0x1143F);
+ RANGE(0x11440, 0x11441);
+ RANGE(0x11443, 0x11444);
+ RANGE(0x114B0, 0x114B2);
+ RANGE(0x114B3, 0x114B8);
+ RANGE(0x114BB, 0x114BE);
+ RANGE(0x114BF, 0x114C0);
+ RANGE(0x115AF, 0x115B1);
+ RANGE(0x115B2, 0x115B5);
+ RANGE(0x115B8, 0x115BB);
+ RANGE(0x115BC, 0x115BD);
+ RANGE(0x115DC, 0x115DD);
+ RANGE(0x11630, 0x11632);
+ RANGE(0x11633, 0x1163A);
+ RANGE(0x1163B, 0x1163C);
+ RANGE(0x116AE, 0x116AF);
+ RANGE(0x116B0, 0x116B5);
+ RANGE(0x1171D, 0x1171F);
+ RANGE(0x11720, 0x11721);
+ RANGE(0x11722, 0x11725);
+ RANGE(0x11727, 0x1172A);
+ RANGE(0x11A01, 0x11A06);
+ RANGE(0x11A07, 0x11A08);
+ RANGE(0x11A09, 0x11A0A);
+ RANGE(0x11A35, 0x11A38);
+ RANGE(0x11A3B, 0x11A3E);
+ RANGE(0x11A51, 0x11A56);
+ RANGE(0x11A57, 0x11A58);
+ RANGE(0x11A59, 0x11A5B);
+ RANGE(0x11A8A, 0x11A96);
+ RANGE(0x11C30, 0x11C36);
+ RANGE(0x11C38, 0x11C3D);
+ RANGE(0x11C92, 0x11CA7);
+ RANGE(0x11CAA, 0x11CB0);
+ RANGE(0x11CB2, 0x11CB3);
+ RANGE(0x11CB5, 0x11CB6);
+ RANGE(0x11D31, 0x11D36);
+ RANGE(0x11D3C, 0x11D3D);
+ RANGE(0x11D3F, 0x11D41);
+ RANGE(0x16B30, 0x16B36);
+ RANGE(0x16F51, 0x16F7E);
+ RANGE(0x1E000, 0x1E006);
+ RANGE(0x1E008, 0x1E018);
+ RANGE(0x1E01B, 0x1E021);
+ RANGE(0x1E023, 0x1E024);
+ RANGE(0x1E026, 0x1E02A);
+ RANGE(0x1F130, 0x1F149);
+ RANGE(0x1F150, 0x1F169);
+ RANGE(0x1F170, 0x1F189);
+ return 0;
+}
+
+/* return 1 if ch has Other Lowercase property */
+static int
+unicode_isOther_Lowercase(uint32_t ch)
+{
+ switch(ch) {
+ case 0x00AA:
+ case 0x00BA:
+ case 0x0345:
+ case 0x037A:
+ case 0x1D78:
+ case 0x2071:
+ case 0x207F:
+ case 0xA770:
+ return 1;
+ }
+ RANGE(0x02B0, 0x02B8);
+ RANGE(0x02C0, 0x02C1);
+ RANGE(0x02E0, 0x02E4);
+ RANGE(0x1D2C, 0x1D6A);
+ RANGE(0x1D9B, 0x1DBF);
+ RANGE(0x2090, 0x209C);
+ RANGE(0x2170, 0x217F);
+ RANGE(0x24D0, 0x24E9);
+ RANGE(0x2C7C, 0x2C7D);
+ RANGE(0xA69C, 0xA69D);
+ RANGE(0xA7F8, 0xA7F9);
+ RANGE(0xAB5C, 0xAB5F);
+ return 0;
+}
+
+/* return 1 if ch has Other Uppercase property */
+static int
+unicode_isOther_Uppercase(uint32_t ch)
+{
+ RANGE(0x2160, 0x216F);
+ RANGE(0x24B6, 0x24CF);
+ RANGE(0x1F130, 0x1F149);
+ RANGE(0x1F150, 0x1F169);
+ RANGE(0x1F170, 0x1F189);
+ return 0;
+}
+
+/* return 1 if ch has Pattern White Space property */
+static int
+unicode_isPattern_White_Space(uint32_t ch)
+{
+ switch(ch) {
+ case 0x0020:
+ case 0x0085:
+ case 0x2028:
+ case 0x2029:
+ return 1;
+ }
+ RANGE(0x0009, 0x000D);
+ RANGE(0x200E, 0x200F);
+ return 0;
+}
+
+static int
+unicode_isalnum(uint32_t ch)
+{
+ return unicode_isalpha(ch) || unicode_isdigit(ch);
+}
+
+static int
+unicode_isalpha(uint32_t ch)
+{
+ RANGE(0x0041, 0x005a);
+ RANGE(0x0061, 0x007a);
+ return unicode_isOther_Alphabetic(ch);
+}
+
+static int
+unicode_isblank(uint32_t ch)
+{
+ switch(ch) {
+ case 0x0009:
+ return 1;
+ }
+ return unicode_isWhite_Space(ch) || unicode_isPattern_White_Space(ch);
+}
+
+static int
+unicode_iscntrl(uint32_t ch)
+{
+ switch(ch) {
+ case 0x0000:
+ case 0x007f:
+ return 1;
+ }
+ RANGE(0x0001, 0x001f);
+ return 0;
+}
+
+static int
+unicode_isdigit(uint32_t ch)
+{
+ RANGE(0x0030, 0x0039);
+ RANGE(0xFF10, 0xFF19);
+ return 0;
+}
+
+static int
+unicode_isgraph(uint32_t ch)
+{
+ RANGE(0x0021, 0x007e);
+ return 0;
+}
+
+static int
+unicode_islower(uint32_t ch)
+{
+ RANGE(0x0061, 0x007a);
+ return unicode_isOther_Lowercase(ch);
+}
+
+static int
+unicode_isprint(uint32_t ch)
+{
+ RANGE(0x0020, 0x007e);
+ /* XXX */
+ return 0;
+}
+
+static int
+unicode_ispunct(uint32_t ch)
+{
+ return unicode_isTerminal_Punctuation(ch) ||
+ unicode_isQuotation_Mark(ch) ||
+ unicode_isHyphen(ch) ||
+ unicode_isDash(ch);
+}
+
+static int
+unicode_isspace(uint32_t ch)
+{
+ return unicode_isWhite_Space(ch);
+}
+
+static int
+unicode_isupper(uint32_t ch)
+{
+ RANGE(0x0041, 0x005a);
+ return unicode_isOther_Uppercase(ch);
+}
+
+static int
+unicode_isxdigit(uint32_t ch)
+{
+ return unicode_isHex_Digit(ch);
+}
+
+static int
+unicode_isident(uint32_t ch)
+{
+ return unicode_isalnum(ch) || ch == '_';
+}
+
+static uint32_t
+unicode_tolower(uint32_t ch)
+{
+ if (unicode_isupper(ch)) {
+ ch += 0x20;
+ }
+ return ch;
+}
+
+static uint32_t
+unicode_toupper(uint32_t ch)
+{
+ if (unicode_islower(ch)) {
+ ch -= 0x20;
+ }
+ return ch;
+}
+
+/* allocate space - used in regasub */
+static int
+growspace(char **buf, size_t *size, size_t cc, size_t incr)
+{
+ size_t newsize;
+ char *newv;
+
+ if (*size < cc + incr) {
+ newsize = howmany(cc + incr, 64) * 64;
+ newv = realloc(*buf, newsize);
+ if (newv == NULL) {
+ return 0;
+ }
+ memset(&newv[*size], 0x0, newsize - *size);
+ *buf = newv;
+ *size = newsize;
+ }
+ return 1;
+}
+
+/***********************************************************/
+
+/* allocate a new structure and return it */
+agcre_regex_t *
+agcre_new(void)
+{
+ return calloc(1, sizeof(agcre_regex_t));
+}
+
+/* compile regular expression */
+int
+agcre_regcomp(agcre_regex_t *agcre, const void *vs, uint32_t flags)
+{
+ retoken_t *tok;
+ uint32_t n;
+ input_t in;
+ re_t *re;
+ int errc;
+
+ if (agcre == NULL || vs == NULL) {
+ return AGCRE_REG_FAILURE;
+ }
+ memset(&in, 0x0, sizeof(in));
+ in.s = (const char *)vs;
+ in.eo = (flags & AGCRE_REG_PEND) ?
+ (agcre_regoff_t)(agcre->re_endp - in.s) :
+ (agcre_regoff_t)strlen(in.s);
+ memset(agcre, 0x0, sizeof(*agcre));
+ agcre->re_g = re = in.re = calloc(1, sizeof(*re));
+ if (in.eo - in.so > AGCRE_MAX_EXPR_LENGTH) {
+ re->msgc = snprintf(re->msg, sizeof(re->msg),
+ "expression length %llu larger than %u",
+ (unsigned long long)(in.eo - in.so),
+ AGCRE_MAX_EXPR_LENGTH);
+ return AGCRE_REG_FAILURE;
+ }
+ if (flags & AGCRE_REG_EXTENDED) {
+ tok = parse_extended(agcre, &in);
+ } else if (flags & AGCRE_REG_BASIC) {
+ tok = parse_basic(agcre, &in, flags);
+ } else if (flags & AGCRE_REG_NOSPEC) {
+ tok = parse_nospec(agcre, &in);
+ } else {
+ re->msgc = snprintf(re->msg, sizeof(re->msg),
+ "unknown type of regexp");
+ return AGCRE_REG_FAILURE;
+ }
+ if (tok == NULL) {
+ return AGCRE_REG_FAILURE;
+ }
+ if (flags & AGCRE_REG_DUMP) {
+ print_token(tok);
+ printf("\n");
+ }
+ errc = 0;
+ n = count(tok, &errc) + 1;
+ if (errc) {
+ re->msgc = snprintf(re->msg, sizeof(re->msg), "illegal token parse");
+ return AGCRE_REG_FAILURE;
+ }
+ if ((re->pc = re->prog = calloc(1, n * sizeof(re->prog[0]))) == NULL) {
+ re->msgc = snprintf(re->msg, sizeof(re->msg), "%d instruction allocation failure", n);
+ return AGCRE_REG_FAILURE;
+ }
+ re->flags = flags;
+ emit(re, tok);
+ freetok(tok);
+ re->pc->op = OP_MATCH;
+ re->pc += 1;
+ re->instrc = re->pc - re->prog;
+ agcre->re_magic = AGCRE_MAGIC;
+ if (flags & AGCRE_REG_DUMP) {
+ printprog(re);
+ }
+ return AGCRE_REG_SUCCESS;
+}
+
+#ifndef USE_ARG
+#define USE_ARG(x) /*LINTED*/(void)&x
+#endif
+
+/* format the error nicely */
+size_t
+agcre_regerror(int errcode, const agcre_regex_t *agcre, char *errbuf, size_t size)
+{
+ re_t *re;
+
+ USE_ARG(errcode);
+ if (agcre == NULL || size == 0 || errbuf == NULL) {
+ return 0;
+ }
+ re = agcre->re_g;
+ return snprintf(errbuf, size, "%s", re->msg);
+}
+
+/* execute the regular expression */
+int
+agcre_regexec(agcre_regex_t *agcre, const void *vs, size_t matchc, agcre_regmatch_t *m, uint32_t flags)
+{
+ threadlist_t *current;
+ threadlist_t *next;
+ threadlist_t *tmp;
+ context_t *matched;
+ context_t *ctx;
+ uint32_t prevflags;
+ uint32_t instrc;
+ input_t in;
+ instr_t *instr;
+ instr_t *prog;
+ size_t i;
+ re_t *re;
+ int ret;
+
+ if (agcre == NULL || vs == NULL || (matchc > 0 && m == NULL)) {
+ return AGCRE_REG_FAILURE;
+ }
+ if ((re = agcre->re_g) == NULL) {
+ return AGCRE_REG_FAILURE;
+ }
+ if (agcre->re_magic != AGCRE_MAGIC) {
+ re->msgc = snprintf(re->msg, sizeof(re->msg),
+ "bad magic number 0x%x, not 0x%x", agcre->re_magic, AGCRE_MAGIC);
+ return AGCRE_REG_FAILURE;
+ }
+ if (matchc > AGCRE_MAX_SUBEXPR) {
+ re->msgc = snprintf(re->msg, sizeof(re->msg),
+ "matchc %zu larger than limit %u", matchc, AGCRE_MAX_SUBEXPR);
+ return AGCRE_REG_FAILURE;
+ }
+ memset(&in, 0x0, sizeof(in));
+ in.s = (const char *)vs;
+ in.so = (flags & AGCRE_REG_STARTEND) ? m[0].rm_so : 0;
+ in.eo = (flags & AGCRE_REG_STARTEND) ? m[0].rm_eo : get_length_in_bytes(vs, flags);
+ matched = NULL;
+ ctx = make_new_context(re, matchc);
+ for (i = 0; i < matchc ; i++) {
+ m[i].rm_so = m[i].rm_eo = -1;
+ ctx->sub[i].rm_so = ctx->sub[i].rm_eo = -1;
+ }
+ instrc = re->instrc;
+ prog = re->prog;
+ if ((flags | re->flags) & AGCRE_REG_ANCHOR) {
+ instrc -= LEADING_DOT_STAR_INSTRS;
+ prog += LEADING_DOT_STAR_INSTRS;
+ }
+ current = threadlist(instrc);
+ next = threadlist(instrc);
+ re->gen += 1;
+ prevflags = re->flags;
+ re->flags |= flags;
+ in.c = in.so;
+ addthread(re, current, newthread(prog, ctx), &in, 0, 1);
+ ret = AGCRE_REG_FAILURE;
+ for (in.ch = BOL ; ; in.c += in.bytes) {
+ if (current->nthreads == 0) {
+ break;
+ }
+ if (!get_next_char(&in, flags)) {
+ break;
+ }
+ if (re->flags & AGCRE_REG_TRACE) {
+ printf("\n%lld 0x%0x", (long long)in.c, in.ch);
+ }
+ for (re->gen++, i = 0; i < current->nthreads ; i++) {
+ instr = current->t[i].pc;
+ ctx = current->t[i].ctx;
+ if (re->flags & AGCRE_REG_TRACE) {
+ printf("\nthread %zu, pc %d, ctx %p", i, (int)(instr - prog), ctx);
+ }
+ switch(instr->op) {
+ case OP_CHAR:
+ if (!charmatch(re, instr, in.ch, instr->n)) {
+ decref(re, ctx);
+ break;
+ }
+ goto eol_check;
+ case OP_SET:
+ if (!setmatch(re, &re->sets[instr->n], &in, instr)) {
+ decref(re, ctx);
+ break;
+ }
+ goto eol_check;
+ case OP_ANY:
+eol_check:
+ if (!anymatch(re, &in)) {
+ decref(re, ctx);
+ break;
+ }
+ addthread(re, next, newthread(instr + 1, ctx), &in, in.bytes, 1);
+ break;
+ case OP_BACKREF:
+ if (!charmatch(re, instr, in.ch, in.s[ctx->sub[instr->n].rm_so + ctx->c])) {
+ decref(re, ctx);
+ break;
+ }
+ if (ctx->c == ctx->sub[instr->n].rm_eo - ctx->sub[instr->n].rm_so - in.bytes) {
+ addthread(re, next, newthread(instr + 1, ctx), &in, 0, 1);
+ } else {
+ ctx->c += in.bytes;
+ addthread(re, next, newthread(instr, ctx), &in, ctx->c, 1);
+ }
+ continue;
+ case OP_ZEROWIDTH:
+ if (!zeromatch(re, instr->n, &in)) {
+ decref(re, ctx);
+ break;
+ }
+ addthread(re, current, newthread(instr + 1, ctx), &in, 0, 0);
+ continue;
+ case OP_MATCH:
+ if (is_better_match(ctx, matched)) {
+ if (!matched) {
+ matched = make_new_context(re, matchc);
+ }
+ memcpy(matched, ctx, context_bytes(matchc));
+ }
+ for (i++; i < current->nthreads ; i++) {
+ decref(re, current->t[i].ctx);
+ }
+ if (ctx->sub[0].rm_eo - ctx->sub[0].rm_so == 0) {
+ addthread(re, next, newthread(prog, ctx), &in, in.bytes, 1);
+ }
+ goto break_for;
+ /* OP_JUMP, OP_SPLIT, OP_SAVE handled in addthread, so that */
+ /* machine execution matches what a backtracker would do. */
+ /* This is discussed (but not shown as code) in */
+ /* Regular Expression Matching: the Virtual Machine Approach. */
+ }
+ }
+break_for:
+ tmp = current;
+ current = next;
+ next = tmp;
+ next->nthreads = 0;
+ if (in.c > in.eo) {
+ break;
+ }
+#if 0
+ if ((flags & AGCRE_REG_ANCHOR) &&
+ ((matched == NULL && current->nthreads < 2 && in.c > in.so) ||
+ (matched && matched->sub[0].rm_so > (int64_t)in.so))) {
+ break;
+ }
+#endif
+ }
+ if (matched) {
+ if (!(flags & AGCRE_REG_NOSUB)) {
+ for (i = 0; i < matchc; i++) {
+ m[i] = matched->sub[i];
+ }
+ }
+ decref(re, matched);
+ ret = AGCRE_REG_SUCCESS;
+ }
+ free(current);
+ free(next);
+ decref(re, ctx);
+ re->flags = prevflags;
+ return ret;
+}
+
+/* execute the regular expression backwards */
+int
+agcre_rev_regexec(agcre_regex_t *agcre, const void *vs, size_t matchc, agcre_regmatch_t *m, uint32_t flags)
+{
+ int64_t from;
+ int64_t to;
+ int64_t i;
+ int found;
+ int this;
+
+ if (flags & AGCRE_REG_STARTEND) {
+ from = m[0].rm_so;
+ to = m[0].rm_eo;
+ } else {
+ from = 0;
+ to = strlen(vs);
+ }
+ flags |= (AGCRE_REG_STARTEND | AGCRE_REG_ANCHOR);
+ for (found = 0, i = to - 1 ; i >= from ; --i) {
+ m[0].rm_so = i;
+ m[0].rm_eo = to;
+ if ((this = agcre_regexec(agcre, vs, matchc, m, flags)) == 0) {
+ found += 1;
+ } else if (found > 0) {
+ m[0].rm_so = i + 1;
+ m[0].rm_eo = to;
+ return agcre_regexec(agcre, vs, matchc, m, flags);
+ }
+ }
+ return AGCRE_REG_FAILURE;
+}
+
+/* free the regular expression */
+void
+agcre_regfree(agcre_regex_t *agcre)
+{
+ uint32_t i;
+ re_t *re;
+
+ if (agcre) {
+ if ((re = agcre->re_g) != NULL) {
+ free(re->prog);
+ for (i = 0 ; i < re->setc ; i++) {
+ set_free(&re->sets[i]);
+ }
+ free(re->sets);
+ free_contexts(re->ctxlist);
+ free(re);
+ }
+ }
+}
+
+/* do regexp sed-style substitution */
+ssize_t
+agcre_regnsub(char *buf, size_t size, const char *repl, const agcre_regmatch_t *rm, const char *str)
+{
+ const char *i;
+ uint32_t len;
+ uint32_t num;
+ char *o;
+ int outofroom = 0;
+
+ if (buf == NULL || repl == NULL || rm == NULL || str == NULL || size == 0) {
+ return -1;
+ }
+ memset(buf, 0x0, size);
+ for (i = repl, o = buf ; *i ; ) {
+ if ((size_t)(o - buf) >= size - 1) {
+ /* not enough space, transition to observe */
+ outofroom = 1;
+ }
+ if (*i == '&' ||
+ (*i == '\\' && (i[1] >= '0' && i[1] <= '9'))) {
+ num = (i[0] == '&') ? 0 : (i[1] - '0');
+ if (rm[num].rm_so < 0 || rm[num].rm_eo < 0 ||
+ rm[num].rm_eo - rm[num].rm_so < 0 ||
+ rm[num].rm_eo < rm[num].rm_so ||
+ rm[num].rm_eo > rm[0].rm_eo) {
+ /* sanity check on regmatch values */
+ return -1;
+ }
+ len = rm[num].rm_eo - rm[num].rm_so;
+ if (len >= (size - (size_t)(o - buf))) {
+ /* not enough space, transition to observe */
+ outofroom = 1;
+ }
+ if (!outofroom) {
+ memcpy(o, &str[rm[num].rm_so], len);
+ }
+ i += 2;
+ o += len;
+ } else {
+ if (!outofroom) {
+ *o = *i;
+ }
+ i += 1;
+ o += 1;
+ }
+ }
+ if (!outofroom) {
+ *o = 0x0;
+ }
+ return (ssize_t)(o - buf);
+}
+
+/* do regexp sed-style substitution */
+ssize_t
+agcre_regasub(char **buf, const char *repl, const agcre_regmatch_t *rm, const char *str)
+{
+ const char *i;
+ uint32_t len;
+ uint32_t num;
+ size_t size;
+ size_t cc;
+
+ if (buf == NULL || repl == NULL || rm == NULL || str == NULL) {
+ return -1;
+ }
+ *buf = NULL;
+ for (i = repl, size = cc = 0 ; *i ; ) {
+ if (*i == '&' ||
+ (*i == '\\' && (i[1] >= '0' && i[1] <= '9'))) {
+ num = (i[0] == '&') ? 0 : (i[1] - '0');
+ if (rm[num].rm_so < 0 || rm[num].rm_eo < 0 ||
+ rm[num].rm_eo - rm[num].rm_so < 0 ||
+ rm[num].rm_eo < rm[num].rm_so ||
+ rm[num].rm_eo > rm[0].rm_eo) {
+ /* sanity check on regmatch values */
+ return -1;
+ }
+ len = rm[num].rm_eo - rm[num].rm_so;
+ if (!growspace(buf, &size, cc, len)) {
+ return -1;
+ }
+ memcpy(&(*buf)[cc], &str[rm[num].rm_so], len);
+ i += 2;
+ cc += len;
+ } else {
+ if (!growspace(buf, &size, cc, 1)) {
+ return -1;
+ }
+ (*buf)[cc++] = *i++;
+ }
+ }
+ (*buf)[cc] = 0x0;
+ return cc;
+}