Module Name: othersrc Committed By: agc Date: Tue Oct 5 01:23:39 UTC 2021
Modified Files: othersrc/external/bsd/agcre/dist: Makefile.bsd Makefile.in Makefile.lib.in Makefile.libtool.in agcre.h libagcre.3 main.c othersrc/external/bsd/agcre/dist/tests: 54.expected 62.expected othersrc/external/bsd/agcre/lib: Makefile shlib_version Added Files: othersrc/external/bsd/agcre/dist: agcre.c Removed Files: othersrc/external/bsd/agcre/dist: comp.c error.c exec.c free.c internal.h lex.c lex.h new.c set.c set.h unicode.c unicode.h Log Message: Update agcre to version 20211001 20211001 ======== + move to a single source file, which can just be included as a single file (the library method can still be used for this as well - sometimes, it's just easier to drop in a single file to other projects) + add AGCRE_EMBED_CTAGS as a primary regular expresion type (same as basic regexps, but where a '*' occurs as a non-column 0 element of the pattern, it is assumed to be a standard character, as produced by ctags(1) output; basic regular expressions only accept '*' as a standard character as the first element of the expression). Yay. + make the version string in the program a real string, so it can be found using strings(1) + add an agcre_rev_regexec(3) function, which searches the text in an efficient, reverse manner, a la strrchr(3) or memrchr(3) fashion. + add agcre_regnsub(3) and agcre_regasub(3) functions + bump the shared lib version to 1.0 + cut version 20211001 To generate a diff of this commit: cvs rdiff -u -r1.1 -r1.2 othersrc/external/bsd/agcre/dist/Makefile.bsd \ othersrc/external/bsd/agcre/dist/Makefile.in \ othersrc/external/bsd/agcre/dist/Makefile.lib.in \ othersrc/external/bsd/agcre/dist/Makefile.libtool.in \ othersrc/external/bsd/agcre/dist/agcre.h \ othersrc/external/bsd/agcre/dist/main.c cvs rdiff -u -r0 -r1.1 othersrc/external/bsd/agcre/dist/agcre.c cvs rdiff -u -r1.2 -r0 othersrc/external/bsd/agcre/dist/comp.c \ othersrc/external/bsd/agcre/dist/error.c cvs rdiff -u -r1.3 -r0 othersrc/external/bsd/agcre/dist/exec.c cvs rdiff -u -r1.1 -r0 othersrc/external/bsd/agcre/dist/free.c \ othersrc/external/bsd/agcre/dist/internal.h \ othersrc/external/bsd/agcre/dist/lex.c \ othersrc/external/bsd/agcre/dist/lex.h \ othersrc/external/bsd/agcre/dist/new.c \ othersrc/external/bsd/agcre/dist/set.c \ othersrc/external/bsd/agcre/dist/set.h \ othersrc/external/bsd/agcre/dist/unicode.c \ othersrc/external/bsd/agcre/dist/unicode.h cvs rdiff -u -r1.2 -r1.3 othersrc/external/bsd/agcre/dist/libagcre.3 cvs rdiff -u -r1.2 -r1.3 othersrc/external/bsd/agcre/dist/tests/54.expected cvs rdiff -u -r1.1 -r1.2 othersrc/external/bsd/agcre/dist/tests/62.expected cvs rdiff -u -r1.1 -r1.2 othersrc/external/bsd/agcre/lib/Makefile \ othersrc/external/bsd/agcre/lib/shlib_version Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files.
Modified files: Index: othersrc/external/bsd/agcre/dist/Makefile.bsd diff -u othersrc/external/bsd/agcre/dist/Makefile.bsd:1.1 othersrc/external/bsd/agcre/dist/Makefile.bsd:1.2 --- othersrc/external/bsd/agcre/dist/Makefile.bsd:1.1 Wed Aug 16 23:38:13 2017 +++ othersrc/external/bsd/agcre/dist/Makefile.bsd Tue Oct 5 01:23:39 2021 @@ -1,11 +1,5 @@ PROG= agcre -SRCS+= comp.c -SRCS+= error.c -SRCS+= exec.c -SRCS+= free.c -SRCS+= lex.c -SRCS+= set.c -SRCS+= unicode.c +SRCS+= agcre.c SRCS+= main.c MAN= libagcre.3 WARNS= 5 Index: othersrc/external/bsd/agcre/dist/Makefile.in diff -u othersrc/external/bsd/agcre/dist/Makefile.in:1.1 othersrc/external/bsd/agcre/dist/Makefile.in:1.2 --- othersrc/external/bsd/agcre/dist/Makefile.in:1.1 Wed Aug 16 23:38:13 2017 +++ othersrc/external/bsd/agcre/dist/Makefile.in Tue Oct 5 01:23:39 2021 @@ -1,8 +1,8 @@ -# $NetBSD: Makefile.in,v 1.1 2017/08/16 23:38:13 agc Exp $ +# $NetBSD: Makefile.in,v 1.2 2021/10/05 01:23:39 agc Exp $ PROG= agcre -OBJS= comp.o error.o exec.o free.o lex.o new.o set.o unicode.o main.o +OBJS= agcre.o main.o PREFIX=@PREFIX@ MANDIR=@MANDIR@ Index: othersrc/external/bsd/agcre/dist/Makefile.lib.in diff -u othersrc/external/bsd/agcre/dist/Makefile.lib.in:1.1 othersrc/external/bsd/agcre/dist/Makefile.lib.in:1.2 --- othersrc/external/bsd/agcre/dist/Makefile.lib.in:1.1 Wed Aug 16 23:38:13 2017 +++ othersrc/external/bsd/agcre/dist/Makefile.lib.in Tue Oct 5 01:23:39 2021 @@ -1,8 +1,8 @@ -# $NetBSD: Makefile.lib.in,v 1.1 2017/08/16 23:38:13 agc Exp $ +# $NetBSD: Makefile.lib.in,v 1.2 2021/10/05 01:23:39 agc Exp $ LIB= libagcre.a - OBJS= comp.o error.o exec.o free.o lex.o new.o set.o unicode.o +OBJS= agcre.o PREFIX=@PREFIX@ MANDIR=@MANDIR@ Index: othersrc/external/bsd/agcre/dist/Makefile.libtool.in diff -u othersrc/external/bsd/agcre/dist/Makefile.libtool.in:1.1 othersrc/external/bsd/agcre/dist/Makefile.libtool.in:1.2 --- othersrc/external/bsd/agcre/dist/Makefile.libtool.in:1.1 Wed Aug 16 23:38:13 2017 +++ othersrc/external/bsd/agcre/dist/Makefile.libtool.in Tue Oct 5 01:23:39 2021 @@ -1,8 +1,8 @@ -# $NetBSD: Makefile.libtool.in,v 1.1 2017/08/16 23:38:13 agc Exp $ +# $NetBSD: Makefile.libtool.in,v 1.2 2021/10/05 01:23:39 agc Exp $ LIB= libagcre.a -OBJS= comp.o error.o exec.o free.o lex.o new.o set.o unicode.o +OBJS= agcre.o PREFIX=@PREFIX@ MANDIR=@MANDIR@ Index: othersrc/external/bsd/agcre/dist/agcre.h diff -u othersrc/external/bsd/agcre/dist/agcre.h:1.1 othersrc/external/bsd/agcre/dist/agcre.h:1.2 --- othersrc/external/bsd/agcre/dist/agcre.h:1.1 Wed Aug 16 23:38:13 2017 +++ othersrc/external/bsd/agcre/dist/agcre.h Tue Oct 5 01:23:39 2021 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013,2017 Alistair Crooks. All Rights reserved. + * Copyright (c) 2013,2017,2020,2021 Alistair Crooks. All Rights reserved. * Copyright (c) 2007-2009 Russ Cox, Google Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -29,7 +29,7 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef AGCRE_VERSION -#define AGCRE_VERSION 20170801 +#define AGCRE_VERSION 20211001 #include <inttypes.h> @@ -52,12 +52,15 @@ #define REG_UTF32BE AGCRE_REG_UTF32BE #define REG_SUCCESS AGCRE_REG_SUCCESS #define REG_FAILURE AGCRE_REG_FAILURE +#define REG_NOMATCH AGCRE_REG_FAILURE #define REG_MAGIC AGCRE_MAGIC #define REG_MAX_SUBEXPR AGCRE_MAX_SUBEXPR #define REG_MAX_EXPR_LENGTH AGCRE_MAX_EXPR_LENGTH +#define REG_ANCHOR AGCRE_REG_ANCHOR +#define REG_CTAGS AGCRE_REG_CTAGS #define regoff_t agcre_regoff_t #define regmatch_t agcre_regmatch_t -#define regex_t agcre_re_t +#define regex_t agcre_regex_t #define regcomp agcre_regcomp #define regexec agcre_regexec #define regfree agcre_regfree @@ -80,6 +83,8 @@ #define AGCRE_REG_UTF16BE 0x00002000 #define AGCRE_REG_UTF32LE 0x00004000 #define AGCRE_REG_UTF32BE 0x00008000 +#define AGCRE_REG_ANCHOR 0x00010000 +#define AGCRE_REG_CTAGS 0x00020000 #define AGCRE_REG_SUCCESS 0 #define AGCRE_REG_FAILURE 1 @@ -99,12 +104,12 @@ typedef struct agcre_regmatch_t { } agcre_regmatch_t; /* a regexp structure */ -typedef struct agcre_re_t { +typedef struct agcre_regex_t { uint32_t re_magic; /* magic number */ size_t re_nsub; /* number of subexpressions */ const char *re_endp; /* pointer to end of expr */ void *re_g; /* internals */ -} agcre_re_t; +} agcre_regex_t; #ifndef __BEGIN_DECLS # if defined(__cplusplus) @@ -118,13 +123,19 @@ typedef struct agcre_re_t { __BEGIN_DECLS -agcre_re_t *agcre_new(void); -int agcre_regcomp(agcre_re_t * /*re*/, const void * /*s*/, uint32_t /*flags*/); -int agcre_regexec(agcre_re_t * /*re*/, const void * /*vs*/, size_t /*mc*/, - agcre_regmatch_t * /*m*/, uint32_t /*flags*/); -void agcre_regfree(agcre_re_t * /*re*/); -size_t agcre_regerror(int /*errcode*/, const agcre_re_t * /*agcre*/, - char * /*errbuf*/, size_t /*size*/); +agcre_regex_t *agcre_new(void); +int agcre_regcomp(agcre_regex_t */*re*/, const void */*s*/, uint32_t /*flags*/); +int agcre_regexec(agcre_regex_t */*re*/, const void */*vs*/, size_t /*mc*/, + agcre_regmatch_t */*m*/, uint32_t /*flags*/); +int agcre_rev_regexec(agcre_regex_t */*agcre*/, const void */*vs*/, + size_t /*matchc*/, agcre_regmatch_t */*m*/, uint32_t /*flags*/); +void agcre_regfree(agcre_regex_t */*re*/); +size_t agcre_regerror(int /*errcode*/, const agcre_regex_t */*agcre*/, + char */*errbuf*/, size_t /*size*/); +ssize_t agcre_regnsub(char */*buf*/, size_t /*size*/, const char */*repl*/, + const agcre_regmatch_t */*rm*/, const char */*str*/); +ssize_t agcre_regasub(char **/*buf*/, const char */*repl*/, + const agcre_regmatch_t */*rm*/, const char */*str*/); __END_DECLS Index: othersrc/external/bsd/agcre/dist/main.c diff -u othersrc/external/bsd/agcre/dist/main.c:1.1 othersrc/external/bsd/agcre/dist/main.c:1.2 --- othersrc/external/bsd/agcre/dist/main.c:1.1 Wed Aug 16 23:38:13 2017 +++ othersrc/external/bsd/agcre/dist/main.c Tue Oct 5 01:23:39 2021 @@ -67,6 +67,10 @@ typedef struct file_t { uint8_t binary; /* file is a "binary" */ } file_t; +#define STRINGIFY(x) #x + +#define AGCRE_VERSION_STRING(x) STRINGIFY(x) + static inline int isbinary(char *buf, size_t size) { @@ -133,7 +137,7 @@ static char to[2][10]; /* display the match */ static int -display_match(agcre_re_t *re, agcre_regmatch_t *m, rtflags_t *rtflags, char *s, +display_match(agcre_regex_t *re, agcre_regmatch_t *m, rtflags_t *rtflags, char *s, agcre_regoff_t start, agcre_regoff_t len) { size_t i; @@ -160,7 +164,7 @@ display_match(agcre_re_t *re, agcre_regm /* search a line in a file */ static int -doline(agcre_re_t *re, rtflags_t *rtflags, file_t *file, char *line, size_t len, agcre_regmatch_t *m) +doline(agcre_regex_t *re, rtflags_t *rtflags, file_t *file, char *line, size_t len, agcre_regmatch_t *m) { agcre_regoff_t start; int flags; @@ -203,7 +207,7 @@ doline(agcre_re_t *re, rtflags_t *rtflag /* search in a file stream */ static int -dofp(agcre_re_t *re, FILE *fp, rtflags_t *rtflags, const char *f) +dofp(agcre_regex_t *re, FILE *fp, rtflags_t *rtflags, const char *f) { agcre_regmatch_t *m; rtflags_t localflags; @@ -258,7 +262,7 @@ dofp(agcre_re_t *re, FILE *fp, rtflags_t /* search in a file/dir */ static int -dofile(agcre_re_t *re, const char *f, rtflags_t *rtflags) +dofile(agcre_regex_t *re, const char *f, rtflags_t *rtflags) { struct dirent entry; struct dirent *d; @@ -333,7 +337,7 @@ readexpression(const char *f) int main(int argc, char **argv) { - agcre_re_t re; + agcre_regex_t re; rtflags_t rtflags; uint32_t pend; char *expression; @@ -351,7 +355,7 @@ main(int argc, char **argv) file = expression = NULL; c = 0; pend = 0; - while ((i = getopt(argc, argv, "DOP:S:T:Ve:f:ilm:nrtv")) != -1) { + while ((i = getopt(argc, argv, "DOP:S:T:Vae:f:ilm:nrtv")) != -1) { switch(i) { case 'D': flags |= AGCRE_REG_DUMP; @@ -380,8 +384,11 @@ main(int argc, char **argv) } break; case 'V': - printf("agcre %d\n", AGCRE_VERSION); + printf("agcre " AGCRE_VERSION_STRING(AGCRE_VERSION) "\n"); exit(EXIT_SUCCESS); + case 'a': + flags |= AGCRE_REG_ANCHOR; + break; case 'e': expression = optarg; break; Index: othersrc/external/bsd/agcre/dist/libagcre.3 diff -u othersrc/external/bsd/agcre/dist/libagcre.3:1.2 othersrc/external/bsd/agcre/dist/libagcre.3:1.3 --- othersrc/external/bsd/agcre/dist/libagcre.3:1.2 Thu Aug 17 11:03:21 2017 +++ othersrc/external/bsd/agcre/dist/libagcre.3 Tue Oct 5 01:23:39 2021 @@ -1,6 +1,6 @@ -.\" $NetBSD: libagcre.3,v 1.2 2017/08/17 11:03:21 wiz Exp $ +.\" $NetBSD: libagcre.3,v 1.3 2021/10/05 01:23:39 agc Exp $ .\" -.\" Copyright (c) 2017 Alistair Crooks <a...@netbsd.org> +.\" Copyright (c) 2017,2020,2021 Alistair Crooks <a...@netbsd.org> .\" All rights reserved. .\" .\" Redistribution and use in source and binary forms, with or without @@ -23,14 +23,14 @@ .\" (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF .\" THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .\" -.Dd July 4, 2017 -.Dt LIBAGCRE 3 +.Dd October 1, 2021 +.Dt AGCRE-EMBED 3 .Os .Sh NAME -.Nm libagcre +.Nm agcre .Nd regular expression library .Sh LIBRARY -.Lb libagcre +.Lb agcre .Sh SYNOPSIS .In agcre.h .Ft "agcre_regex_t *" @@ -39,11 +39,15 @@ .Fc .Ft "int" .Fo agcre_regcomp -.Fa "agcre_regex_t *expression" "const void *pat" "int flags" +.Fa "agcre_regex_t *expression" "const void *pat" "uint32_t flags" .Fc .Ft int .Fo agcre_regexec -.Fa "agcre_regex_t *expression" "const void *input" "size_t nmatch" "agcre_regmatch_t *matches" "int flags" +.Fa "agcre_regex_t *expression" "const void *input" "size_t nmatch" "agcre_regmatch_t *matches" "uint32_t flags" +.Fc +.Ft int +.Fo agcre_rev_regexec +.Fa "agcre_regex_t *expression" "const void *input" "size_t nmatch" "agcre_regmatch_t *matches" "uint32_t flags" .Fc .Ft void .Fo agcre_regfree @@ -53,6 +57,14 @@ .Fo agcre_regerror .Fa "int errnum" "const agcre_regex_t *expression" "char *buf" "size_t size" .Fc +.Ft ssize_t +.Fo agcre_regnsub +.Fa "char *buf" "size_t size" "const char *repl" "const agcre_regmatch_t *matches" "const char *str" +.Fc +.Ft ssize_t +.Fo agcre_regasub +.Fa "char **buf" "const char *repl" "const agcre_regmatch_t *matches" "const char *str" +.Fc .Sh DESCRIPTION The .Nm @@ -119,10 +131,6 @@ expression, the individual virtual machi will be printed on standard output. .It Dv AGCRE_REG_TRACE Traces execution of the regular expression. -.It Dv AGCRE_REG_LARGE -Is deprecated, and only recognised for legacy reasons. -.It Dv AGCRE_REG_BACKR -Is deprecated, and only recognised for legacy reasons. .\".It Dv AGCRE_IN_8BITS .\"This indicates to the .\".Fn agcre_regcomp @@ -186,6 +194,32 @@ and fields provided in the zero-th element of the .Fa matchv argument. +.It Dv AGCRE_REG_ANCHOR +Normally regular expressions are not assumed to start at the first character, +but rather to have a lazy search at their start. +Occasionally, a search anchored to the start of the +string is preferred. +This cannot be simulated by the +.Dv AGCRE_REG_NOTBOL +flag. +Searches where a match needs to be found in the first character +should therefore use the +.Dv AGCRE_REG_ANCHOR +definition. +Anchoring can be specified at expression compile time, +and/or at expression execution time. +.It Dv AGCRE_REG_ANCHOR +This is used where a regular expression pattern was generated by the +.Xr ctags 1 +program, since the syntax generated does not conform to a standard +basic regular expression. +In particular, +.Xr ctags 1 +outputs function location patterns which contain an unescaped +.Dq * +character, whereas basic regular expressions will only +recognise an asterisk as a standard character if it appears +at the start of an expression. .\".It Dv AGCRE_IN_8BITS .\"This indicates to the .\".Fn agcre_regexec @@ -208,6 +242,20 @@ argument. .\"integers. .El .Pp +Since normal expression searches take place moving forwards within a byte stream, +there is not normally any need to have a function which searches a byte stream or buffer +moving in reverse. +However, editors frequently must implement this functionality on their own. +To make reverse searching easier, the +.Fn agcre_rev_regexec +function is provided, which will start at the end of the sized +buffer, using the +.Dv AGCRE_REG_STARTEND +definition, and the search will continue character by character +until either a match is found or the start of the buffer is reached. +This search will be done in an efficient manner, so that buffer +contents which have already been searched will not be searched again. +.Pp The .Fn agcre_regcomp and Index: othersrc/external/bsd/agcre/dist/tests/54.expected diff -u othersrc/external/bsd/agcre/dist/tests/54.expected:1.2 othersrc/external/bsd/agcre/dist/tests/54.expected:1.3 --- othersrc/external/bsd/agcre/dist/tests/54.expected:1.2 Sun Dec 3 21:25:56 2017 +++ othersrc/external/bsd/agcre/dist/tests/54.expected Tue Oct 5 01:23:39 2021 @@ -1,4 +1,6 @@ +/usr/include/dev/ic/nvmeio.h:136: for (i = 0; i < __arraycount(identify->lbaf); i++) +/usr/include/dev/ic/nvmeio.h:192: for (i = 0; i < __arraycount(identify->psd); i++) /usr/include/rump/rumpuser_port.h:260:#ifndef __arraycount /usr/include/rump/rumpuser_port.h:261:#define __arraycount(_ar_) (sizeof(_ar_)/sizeof(_ar_[0])) /usr/include/sys/bitops.h:324: for (__i = 0; __i < __arraycount(__v->_b); __i++) \ -/usr/include/sys/cdefs.h:573:#define __arraycount(__x) (sizeof(__x) / sizeof(__x[0])) +/usr/include/sys/cdefs.h:636:#define __arraycount(__x) (sizeof(__x) / sizeof(__x[0])) Index: othersrc/external/bsd/agcre/dist/tests/62.expected diff -u othersrc/external/bsd/agcre/dist/tests/62.expected:1.1 othersrc/external/bsd/agcre/dist/tests/62.expected:1.2 --- othersrc/external/bsd/agcre/dist/tests/62.expected:1.1 Wed Aug 16 23:38:13 2017 +++ othersrc/external/bsd/agcre/dist/tests/62.expected Tue Oct 5 01:23:39 2021 @@ -1 +1 @@ -agcre 20170801 +agcre 20211001 Index: othersrc/external/bsd/agcre/lib/Makefile diff -u othersrc/external/bsd/agcre/lib/Makefile:1.1 othersrc/external/bsd/agcre/lib/Makefile:1.2 --- othersrc/external/bsd/agcre/lib/Makefile:1.1 Wed Aug 16 23:38:13 2017 +++ othersrc/external/bsd/agcre/lib/Makefile Tue Oct 5 01:23:39 2021 @@ -1,14 +1,7 @@ -# $NetBSD: Makefile,v 1.1 2017/08/16 23:38:13 agc Exp $ +# $NetBSD: Makefile,v 1.2 2021/10/05 01:23:39 agc Exp $ LIB= agcre -SRCS+= comp.c -SRCS+= error.c -SRCS+= exec.c -SRCS+= free.c -SRCS+= lex.c -SRCS+= new.c -SRCS+= set.c -SRCS+= unicode.c +SRCS+= agcre.c MAN= libagcre.3 agcre_format.7 MLINKS+= libagcre.3 agcre_regcomp.3 MLINKS+= libagcre.3 agcre_regexec.3 Index: othersrc/external/bsd/agcre/lib/shlib_version diff -u othersrc/external/bsd/agcre/lib/shlib_version:1.1 othersrc/external/bsd/agcre/lib/shlib_version:1.2 --- othersrc/external/bsd/agcre/lib/shlib_version:1.1 Wed Aug 16 23:38:13 2017 +++ othersrc/external/bsd/agcre/lib/shlib_version Tue Oct 5 01:23:39 2021 @@ -1,2 +1,2 @@ -major=0 +major=1 minor=0 Added files: Index: othersrc/external/bsd/agcre/dist/agcre.c diff -u /dev/null othersrc/external/bsd/agcre/dist/agcre.c:1.1 --- /dev/null Tue Oct 5 01:23:39 2021 +++ othersrc/external/bsd/agcre/dist/agcre.c Tue Oct 5 01:23:39 2021 @@ -0,0 +1,3065 @@ +/*- + * Copyright (c) 2013,2017 Alistair Crooks. All Rights reserved. + * All rights reserved. + * + * Parts of this are: + * Copyright (c) 2007-2009 Russ Cox, Google Inc. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#include <sys/types.h> +#include <sys/param.h> + +#include <inttypes.h> +#include <stdarg.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +#include "agcre.h" + +/* callback struct */ +typedef struct cb_t { + int (*func)(uint32_t); /* function */ + uint8_t negate; /* take the complement of the result */ +} cb_t; + +/* sets are implemented as combinations of ranges, individual cases, and callbacks */ +typedef struct set_t { + uint32_t rangec; /* # of ranges */ + uint32_t maxrange; /* size of range array */ + uint32_t *ranges; /* range array */ + uint32_t casec; /* # of cases */ + uint32_t maxcase; /* size of case array */ + uint32_t *cases; /* case array */ + uint32_t callbackc; /* # of callbacks */ + uint32_t maxcallback; /* size of callback array */ + cb_t *callbacks; /* callback array */ + uint8_t complement; /* complement of set */ +} set_t; + +#define AT_LEAST 0xffffffff +#define BOF 0xffffffffU + +/* a regexp token */ +typedef struct retoken_t { + struct retoken_t *left; /* token on left */ + struct retoken_t *right; /* token on right */ + uint64_t so; /* start offset */ + uint32_t n; /* number */ + uint32_t nparens; /* () count */ + uint32_t lo; /* repeat low count */ + uint32_t hi; /* repeat high count */ + uint32_t ch; /* current char */ + uint8_t type; /* type of token */ + uint8_t lazy; /* use lazy matching */ + uint8_t noncapture; /* non-capture subexpression */ + uint8_t icase; /* icase token */ +} retoken_t; + +/* token types */ +#define TOK_ALT 0x01 +#define TOK_BACKREF 0x02 +#define TOK_BROKEN 0x03 +#define TOK_CAT 0x04 +#define TOK_COLLECTION 0x05 +#define TOK_DOT 0x06 +#define TOK_EOL 0x07 +#define TOK_FORGET 0x08 +#define TOK_LIT 0x09 +#define TOK_PAREN 0x0a +#define TOK_PLUS 0x0b +#define TOK_QUEST 0x0c +#define TOK_REP 0x0d +#define TOK_SET TOK_COLLECTION +#define TOK_STAR 0x0e +#define TOK_WIDTH 0x0f +#define TOK_ZEROWIDTH 0x10 + +/* input structure */ +typedef struct input_t { + const char *s; /* string */ + uint64_t so; /* start offset */ + uint64_t eo; /* end offset */ + uint64_t c; /* current offset */ + int msgc; /* # of chars in message buf */ + char msg[256]; /* message buffer */ + retoken_t parse; /* current token when parsing */ + uint32_t parenc; /* # of parentheses */ + struct re_t *re; /* pointer to internal re */ + uint32_t ch; /* current char */ + uint32_t prevch; /* previous char */ + uint8_t bytes; /* size of current char */ + uint8_t prevbytes; /* size of previous char */ + uint8_t icase; /* persistent icase */ +} input_t; + +#define AGCRE_IN_WIDTH \ + (AGCRE_REG_UTF16LE | AGCRE_REG_UTF16BE | AGCRE_REG_UTF32LE | AGCRE_REG_UTF32BE) + +/* virtual machine instruction */ +typedef struct instr_t { + struct instr_t *lhs; /* lhs */ + struct instr_t *rhs; /* rhs */ + uint32_t n; /* character */ + uint32_t gen; /* generation number */ + uint8_t op; /* instruction number */ + uint8_t icase; /* instruction number */ +} instr_t; + +/* instruction opcodes */ +#define OP_ANY 0x01 +#define OP_BACKREF 0x02 +#define OP_CHAR 0x03 +#define OP_JUMP 0x04 +#define OP_MATCH 0x05 +#define OP_SAVE 0x06 +#define OP_SET 0x07 +#define OP_SPLIT 0x08 +#define OP_ZEROWIDTH 0x09 + +/* a context structure */ +typedef struct context_t { + struct context_t *next; /* for freeing nodes */ + uint32_t ref; /* reference count */ + uint32_t nsub; /* # matches */ + uint32_t c; /* chars in backref */ + agcre_regmatch_t sub[1]; /* match structures */ +} context_t; + +/* a thread structure */ +typedef struct re_thread_t { + instr_t *pc; /* instruction */ + context_t *ctx; /* context */ +} re_thread_t; + +/* list of threads */ +typedef struct threadlist_t { + uint32_t nthreads; /* # of threads */ + re_thread_t t[1]; /* the threads */ +} threadlist_t; + +/* regular expression internals */ +typedef struct re_t { + instr_t *prog; /* start of instructions */ + uint32_t instrc; /* # of instructions */ + uint32_t gen; /* generation number */ + uint32_t setc; /* # of sets */ + uint32_t maxset; /* allocated # of sets */ + set_t *sets; /* sets */ + uint32_t flags; /* comp/exec flags */ + context_t *ctxlist; /* list of contexts */ + instr_t *pc; /* prog counter */ + int msgc; /* # of chars in msg buffer */ + char msg[256]; /* message buffer */ +} re_t; + +/* + * regular expressions have an implicit .* at the start to find the + * first match. for anchored searches (not that many in comparison + * to regular regular expressions) we need to jump over those + * instructions (in the regexec phase). + * Dump of compiled expression instructions for the pattern: "i" + * (without quotes) + * + * Cat(LazyStar(Dot), Paren(0, Lit(i))) + * 0. split 3, 1 + * 1. any + * 2. jmp 0 + * 3. save 0 + * 4. char i + * 5. save 1 + * 6. match + * + * so for anchored searches, we jump over the first 3 instructions + */ +#define LEADING_DOT_STAR_INSTRS 3 + +/* all static function forward declarations */ +__BEGIN_DECLS + +static void set_free(set_t */*set*/); +static int set_add_case(set_t */*set*/, uint32_t /*n*/); +static int set_add_range(set_t */*set*/, uint32_t /*lo*/, uint32_t /*hi*/); +static int set_add_callback(set_t */*set*/, int (*/*func*/)(uint32_t), int /*negate*/); +static int set_isset(set_t */*set*/, uint32_t /*n*/); +static int set_complement(set_t */*set*/); + +static retoken_t *parse_extended(agcre_regex_t */*re*/, input_t */*in*/); +static retoken_t *parse_nospec(agcre_regex_t */*re*/, input_t */*in*/); +static retoken_t *parse_basic(agcre_regex_t */*re*/, input_t */*in*/, uint32_t /*flags*/); + +static int get_next_token(input_t */*in*/); + +static int unicode_isWhite_Space(uint32_t /*ch*/); +static int unicode_isDash(uint32_t /*ch*/); +static int unicode_isHyphen(uint32_t /*ch*/); +static int unicode_isQuotation_Mark(uint32_t /*ch*/); +static int unicode_isTerminal_Punctuation(uint32_t /*ch*/); +static int unicode_isHex_Digit(uint32_t /*ch*/); +static int unicode_isOther_Alphabetic(uint32_t /*ch*/); +static int unicode_isOther_Lowercase(uint32_t /*ch*/); +static int unicode_isOther_Uppercase(uint32_t /*ch*/); +static int unicode_isPattern_White_Space(uint32_t /*ch*/); +static int unicode_isalnum(uint32_t /*ch*/); +static int unicode_isalpha(uint32_t /*ch*/); +static int unicode_isblank(uint32_t /*ch*/); +static int unicode_iscntrl(uint32_t /*ch*/); +static int unicode_isdigit(uint32_t /*ch*/); +static int unicode_isgraph(uint32_t /*ch*/); +static int unicode_islower(uint32_t /*ch*/); +static int unicode_isprint(uint32_t /*ch*/); +static int unicode_ispunct(uint32_t /*ch*/); +static int unicode_isspace(uint32_t /*ch*/); +static int unicode_isupper(uint32_t /*ch*/); +static int unicode_isxdigit(uint32_t /*ch*/); +static int unicode_isident(uint32_t /*ch*/); +static uint32_t unicode_tolower(uint32_t /*ch*/); +static uint32_t unicode_toupper(uint32_t /*ch*/); + +__END_DECLS + +#ifndef __printflike +#define __printflike(n, m) __attribute__((format(printf,n,m))) +#endif +#ifndef howmany +#define howmany(x, y) (((x)+((y)-1))/(y)) +#endif + +/* create a new token */ +static retoken_t * +newtok(int type, retoken_t *left, retoken_t *right, uint8_t icase) +{ + retoken_t *token; + + if ((token = calloc(1, sizeof(*token))) != NULL) { + token->type = type; + token->left = left; + token->right = right; + token->icase = icase; + } + return token; +} + +/* how many instructions does tok need? */ +static int +count(retoken_t *tok, int *errc) +{ + switch(tok->type) { + case TOK_ALT: + return 2 + count(tok->left, errc) + count(tok->right, errc); + case TOK_BACKREF: + case TOK_COLLECTION: + case TOK_DOT: + case TOK_LIT: + case TOK_WIDTH: + case TOK_ZEROWIDTH: + return 1; + case TOK_FORGET: + return 0; + case TOK_CAT: + return count(tok->left, errc) + count(tok->right, errc); + case TOK_PAREN: + case TOK_STAR: + return 2 + count(tok->left, errc); + case TOK_PLUS: + case TOK_QUEST: + return 1 + count(tok->left, errc); + default: + *errc = 1; + return 0; + } +} + +/* make a new set */ +static inline set_t * +newset(re_t *re) +{ + set_t *set; + + if (re->setc == re->maxset) { + set = realloc(re->sets, (re->maxset + 8) * sizeof(*set)); + if (set == NULL) { + //error_message(in, "can't extend sets past %u", in->re->maxset); + return NULL; + } + memset(&set[re->setc], 0x0, 8 * sizeof(*set)); + re->maxset += 8; + re->sets = set; + } + return &re->sets[re->setc++]; +} + +/* take parse tokens, and emit vm instructions */ +static int +emit(re_t *re, retoken_t *tok) +{ + instr_t *p1; + instr_t *p2; + instr_t *t; + set_t *set; + + switch(tok->type) { + case TOK_ALT: + re->pc->op = OP_SPLIT; + p1 = re->pc++; + p1->lhs = re->pc; + emit(re, tok->left); + re->pc->op = OP_JUMP; + p2 = re->pc++; + p1->rhs = re->pc; + emit(re, tok->right); + p2->lhs = re->pc; + return 1; + case TOK_BACKREF: + re->pc->op = OP_BACKREF; + re->pc->n = tok->ch; + re->pc->icase = tok->icase; + re->pc++; + return 1; + case TOK_CAT: + emit(re, tok->left); + emit(re, tok->right); + return 1; + case TOK_COLLECTION: + re->pc->op = OP_SET; + re->pc->n = tok->ch; + re->pc->icase = tok->icase; + re->pc++; + return 1; + case TOK_DOT: + re->pc++->op = OP_ANY; + return 1; + case TOK_LIT: + re->pc->op = OP_CHAR; + re->pc->n = tok->ch; + re->pc->icase = tok->icase; + re->pc++; + return 1; + case TOK_PAREN: + re->pc->op = OP_SAVE; + re->pc->n = 2*tok->nparens; + re->pc++; + emit(re, tok->left); + re->pc->op = OP_SAVE; + re->pc->n = 2*tok->nparens + 1; + re->pc++; + return 1; + case TOK_PLUS: + p1 = re->pc; + emit(re, tok->left); + re->pc->op = OP_SPLIT; + re->pc->lhs = p1; + p2 = re->pc; + re->pc++; + p2->rhs = re->pc; + if (tok->lazy) { + t = p2->lhs; + p2->lhs = p2->rhs; + p2->rhs = t; + } + return 1; + case TOK_QUEST: + re->pc->op = OP_SPLIT; + p1 = re->pc++; + p1->lhs = re->pc; + emit(re, tok->left); + p1->rhs = re->pc; + if (tok->lazy) { + t = p1->lhs; + p1->lhs = p1->rhs; + p1->rhs = t; + } + return 1; + case TOK_STAR: + re->pc->op = OP_SPLIT; + p1 = re->pc++; + p1->lhs = re->pc; + emit(re, tok->left); + re->pc->op = OP_JUMP; + re->pc->lhs = p1; + re->pc++; + p1->rhs = re->pc; + if (tok->lazy) { + t = p1->lhs; + p1->lhs = p1->rhs; + p1->rhs = t; + } + return 1; + case TOK_WIDTH: + re->pc->op = OP_SET; + re->pc->n = re->setc; + re->pc->icase = tok->icase; + if ((set = newset(re)) == NULL) { + return 0; + } + set_add_callback(set, + (tok->ch == 'd' || tok->ch == 'D') ? unicode_isdigit : + (tok->ch == 'p' || tok->ch == 'P') ? unicode_isprint : + (tok->ch == 's' || tok->ch == 'S') ? unicode_isspace : + unicode_isident, + unicode_isupper(tok->ch)); + re->pc++; + return 1; + case TOK_ZEROWIDTH: + re->pc->op = OP_ZEROWIDTH; + re->pc->n = tok->ch; + re->pc++; + return 1; + } + return 1; +} + +/* recursive function to free a token and its children */ +static void +freetok(retoken_t *tok) +{ + if (tok) { + freetok(tok->left); + freetok(tok->right); + free(tok); + } +} + +/* recursive function to print a token and its children */ +static void +print_token(retoken_t *tok) +{ + if (tok->icase) { + printf("Icase+"); + } + switch(tok->type) { + case TOK_ALT: + printf("Alt("); + print_token(tok->left); + printf(", "); + print_token(tok->right); + printf(")"); + break; + case TOK_BACKREF: + printf("Backref(%d)", tok->ch); + break; + case TOK_CAT: + printf("Cat("); + print_token(tok->left); + printf(", "); + print_token(tok->right); + printf(")"); + break; + case TOK_COLLECTION: + printf("Set(%u)", tok->ch); + break; + case TOK_DOT: + printf("Dot"); + break; + case TOK_LIT: + printf("Lit(%c)", tok->ch); + break; + case TOK_PAREN: + printf("Paren(%d, ", tok->nparens); + print_token(tok->left); + printf(")"); + break; + case TOK_PLUS: + if (tok->lazy) { + printf("Lazy"); + } + printf("Plus("); + print_token(tok->left); + printf(")"); + break; + case TOK_QUEST: + if (tok->lazy) { + printf("Lazy"); + } + printf("Quest("); + print_token(tok->left); + printf(")"); + break; + case TOK_STAR: + if (tok->lazy) { + printf("Lazy"); + } + printf("Star("); + print_token(tok->left); + printf(")"); + break; + case TOK_WIDTH: + printf("Width(%c)", tok->ch); + break; + case TOK_ZEROWIDTH: + printf("Zerowidth(%c)", tok->ch); + break; + default: + printf("???"); + break; + } +} + +/* print all the instructions in the program */ +static int +printprog(re_t *re) +{ + uint32_t pc; + instr_t *instr; + + for (pc = 0, instr = re->prog ; pc < re->instrc ; pc++, instr++) { + switch(instr->op) { + case OP_ANY: + printf("%2d. any\n", pc); + break; + case OP_BACKREF: + printf("%2d. backref %d\n", pc, instr->n); + break; + case OP_CHAR: + printf("%2d. char %c\n", pc, instr->n); + break; + case OP_JUMP: + printf("%2d. jmp %d\n", + pc, + (int)(instr->lhs - re->prog)); + break; + case OP_MATCH: + printf("%2d. match\n", pc); + break; + case OP_SAVE: + printf("%2d. save %d\n", pc, instr->n); + break; + case OP_SET: + printf("%2d. set %d\n", pc, instr->n); + break; + case OP_SPLIT: + printf("%2d. split %d, %d\n", + pc, + (int)(instr->lhs - re->prog), + (int)(instr->rhs - re->prog)); + break; + case OP_ZEROWIDTH: + printf("%2d. zerowidth(%c)\n", pc, instr->n); + break; + default: + fprintf(stderr, "printprog - bad type %d\n", instr->op); + return 0; + } + } + return 1; +} + +static retoken_t *rec_repeat(input_t *in); + +/* add a warning message */ +static __printflike(2,3) void +error_message(input_t *in, const char *fmt, ...) +{ + va_list args; + + if (in->msgc == 0) { + /* don't overwrite previous messages */ + va_start(args, fmt); + in->msgc = vsnprintf(in->msg, sizeof(in->msg), fmt, args); + va_end(args); + } +} + +/* check current character is as expected, then get next token */ +static int +accept_char(input_t *in, uint32_t expected) +{ + int ret; + + ret = 1; + if (in->parse.ch != expected) { + error_message(in, "expected: '%c', got '%c'", + expected, in->s[in->c]); + ret = 0; + } + get_next_token(in); + return ret; +} + +/* clone a token */ +static retoken_t * +clone_token(input_t *in, retoken_t *orig) +{ + retoken_t *token; + + if ((token = newtok(orig->type, NULL, NULL, orig->icase)) == NULL) { + return NULL; + } + token->lazy = orig->lazy; + token->nparens = orig->nparens; + token->ch = orig->ch; + if (orig->left) { + token->left = clone_token(in, orig->left); + } + if (orig->right) { + token->right = clone_token(in, orig->right); + } + return token; +} + +/* concat two subexpressions together */ +static retoken_t * +rec_concat(input_t *in, retoken_t *token) +{ + retoken_t *rhs; + retoken_t *cat; + + while (in->parse.type != TOK_EOL && in->parse.type != TOK_ALT && in->parse.ch != ')') { + if ((rhs = rec_repeat(in)) == NULL || + (cat = newtok(TOK_CAT, token, rhs, 0)) == NULL || + (token = rec_concat(in, cat)) == NULL) { + return NULL; + } + } + return token; +} + +/* recognise an alternate */ +static retoken_t * +rec_alt(input_t *in) +{ + retoken_t *repeat; + retoken_t *token; + retoken_t *cat; + retoken_t *alt; + + if ((repeat = rec_repeat(in)) == NULL || + (token = rec_concat(in, repeat)) == NULL) { + return NULL; + } + while (in->parse.type == TOK_ALT) { + if (!accept_char(in, '|')) { + return NULL; + } + if ((repeat = rec_repeat(in)) == NULL || + (cat = rec_concat(in, repeat)) == NULL || + (alt = newtok(TOK_ALT, token, cat, 0)) == NULL || + (token = rec_concat(in, alt)) == NULL) { + return NULL; + } + } + return token; +} + +/* recognise a single atom */ +static retoken_t * +rec_single(input_t *in) +{ + retoken_t *token; + retoken_t *alt; + int parenc; + + token = NULL; + switch(in->parse.type) { + case TOK_PAREN: + accept_char(in, '('); + if (in->parse.noncapture) { + if ((token = rec_alt(in)) == NULL) { + return NULL; + } + } else { + parenc = ++in->parenc; + if ((alt = rec_alt(in)) == NULL || + (token = newtok(TOK_PAREN, alt, NULL, 0)) == NULL) { + return NULL; + } + token->nparens = parenc; + } + if (!accept_char(in, ')')) { + return NULL; + } + break; + case TOK_BACKREF: + if ((token = newtok(TOK_BACKREF, NULL, NULL, in->parse.icase)) == NULL) { + return NULL; + } + if (in->parse.ch > in->parenc) { + error_message(in, "backref %d out of range", in->ch); + return NULL; + } + accept_char(in, token->ch = in->parse.ch); + break; + case TOK_DOT: + case TOK_LIT: + case TOK_COLLECTION: + case TOK_WIDTH: + case TOK_ZEROWIDTH: + if ((token = newtok(in->parse.type, NULL, NULL, in->parse.icase)) == NULL) { + return NULL; + } + token->ch = in->parse.ch; + token->icase = in->parse.icase; + get_next_token(in); + break; + } + return token; +} + +/* a repetition of subexpressions */ +static retoken_t * +rec_repeat(input_t *in) +{ + retoken_t *cloned; + retoken_t *quest; + retoken_t *token; + retoken_t *orig; + uint32_t i; + + if ((token = rec_single(in)) == NULL) { + return NULL; + } + orig = token; + switch(in->parse.type) { + case TOK_STAR: + case TOK_PLUS: + case TOK_QUEST: + if ((token = newtok(in->parse.type, token, NULL, in->parse.icase)) == NULL) { + return NULL; + } + token->lazy = in->parse.lazy; + get_next_token(in); + break; + case TOK_REP: + if (in->parse.lo == 0 && in->parse.hi == 0) { + orig->type = TOK_FORGET; + } + for (i = 1 ; i < in->parse.lo ; i++) { + if ((cloned = clone_token(in, orig)) == NULL || + (token = newtok(TOK_CAT, token, cloned, in->parse.icase)) == NULL) { + return NULL; + } + } + if (in->parse.hi == AT_LEAST) { + if ((token = newtok(TOK_PLUS, token, NULL, in->parse.icase)) == NULL) { + return NULL; + } + } else { + for ( ; i < in->parse.hi ; i++) { + if ((cloned = clone_token(in, orig)) == NULL || + (quest = newtok(TOK_QUEST, cloned, NULL, in->parse.icase)) == NULL || + (token = newtok(TOK_CAT, token, quest, in->parse.icase)) == NULL) { + return NULL; + } + } + } + get_next_token(in); + break; + } + return token; +} + +/* recognise literal tokens in nospec expression */ +static retoken_t * +nospec_token(input_t *in) +{ + retoken_t *tok; + + tok = newtok(TOK_LIT, NULL, NULL, 0); + tok->ch = in->s[in->c++]; + if (in->c == in->eo) { + return tok; + } + return newtok(TOK_CAT, tok, nospec_token(in), 0); +} + +/* either start of line, or start of nested subexpression */ +#define BASIC_IS_START(i, in) \ + ((i) == (in) || ((size_t)(i - in) >= 2 && (i)[-2] == '\\' && (i)[-1] == '(')) + +#define BASIC_IS_END(i, in, sz) \ + (((size_t)((i) - (in)) == (sz) - 1) || ((i)[1] == '\\' && (i)[2] == ')')) + +/* convert a basic regexp into an extended one */ +static ssize_t +regextend(const char *in, size_t insize, char *out, size_t outsize, uint32_t flags) +{ + const char *i; + char *o; + + for (i = in, o = out ; (size_t)(i - in) < insize && (size_t)(o - out) < outsize ; i++) { + switch(*i) { + case '+': + case '|': + case '?': + case '{': + case '}': + case '(': + case ')': + /* +, |, ?, {, }, (, ) are ordinary characters */ + *o++ = '\\'; + *o++ = *i; + break; + case '\\': + switch(i[1]) { + case '(': + case ')': + case '{': + case '}': + /* basic bounds delimiters are \{ and \} */ + /* basic nested subexpressions are \( and \) */ + *o++ = *++i; + break; + default: + /* pass through other backslash-escaped chars */ + *o++ = *i; + break; + } + break; + case '^': + case '$': + /* ^ is an ordinary char, except at start */ + /* $ is an ordinary char, except at end */ + if ((*i == '^' && !BASIC_IS_START(i, in)) || + (*i == '$' && !BASIC_IS_END(i, in, insize))) { + *o++ = '\\'; + } + *o++ = *i; + break; + case '*': + if (flags & AGCRE_REG_CTAGS) { + /* * is an ordinary character - see ctags */ + *o++ = '\\'; + *o++ = *i; + } else { + /* THIS IS WHAT THE RE_FORMAT PAGE SAYS FOR '*' + not for ctags, though */ + if ((*i == '*' && BASIC_IS_START(i, in))) { + *o++ = '\\'; + } + *o++ = *i; + } + break; + default: + *o++ = *i; + break; + } + } + *o = 0x0; + return (ssize_t)(o - out); +} + +/* parse the string */ +static retoken_t * +parse_string(input_t *in) +{ + retoken_t *dotstar; + retoken_t *token; + retoken_t *cat; + retoken_t *alt; + + get_next_token(in); + if ((alt = rec_alt(in)) == NULL) { + return NULL; + } + /* make pattern ".*?(PATTERN)" */ + if ((token = newtok(TOK_PAREN, alt, NULL, 0)) == NULL || + (cat = newtok(TOK_DOT, NULL, NULL, 0)) == NULL || + (dotstar = newtok(TOK_STAR, cat, NULL, 0)) == NULL) { + return NULL; + } + dotstar->lazy = 1; + return newtok(TOK_CAT, dotstar, token, 0); +} + +/* parse an extended reg exp */ +static retoken_t * +parse_extended(agcre_regex_t *agcre, input_t *in) +{ + retoken_t *entry; + re_t *re; + + re = agcre->re_g; + if ((entry = parse_string(in)) == 0) { + memcpy(re->msg, in->msg, in->msgc); + return 0; + } + agcre->re_nsub = in->parenc + 1; + return entry; +} + +/* parse a nospec regexp */ +static retoken_t * +parse_nospec(agcre_regex_t *agcre, input_t *in) +{ + retoken_t *dotstar; + retoken_t *tok; + + tok = nospec_token(in); + tok = newtok(TOK_PAREN, tok, NULL, 0); + dotstar = newtok(TOK_STAR, newtok(TOK_DOT, NULL, NULL, 0), NULL, 0); + dotstar->lazy = 1; + tok = newtok(TOK_CAT, dotstar, tok, 0); + agcre->re_nsub = in->parenc + 1; + return tok; +} + +/* convert basic regexp to extended then parse */ +static retoken_t * +parse_basic(agcre_regex_t *agcre, input_t *in, uint32_t flags) +{ + ssize_t size; + char s[AGCRE_MAX_EXPR_LENGTH * 4]; + + size = regextend(&in->s[in->so], in->eo - in->so, s, sizeof(s), flags); + in->s = s; + in->so = 0; + in->eo = size; + return parse_extended(agcre, in); +} + +#define BOL 0xffffffffU + +/* unwrap a character, including UTF-8 chars, into 32bit char */ +static inline int +unwrap_utf8(input_t *in) +{ + uint32_t n; + uint32_t j; + + in->ch = in->s[in->c]; + if (in->ch <= 0x7f) { + /* 0XXX XXXX one byte */ + in->bytes = 1; + n = in->ch; + } else if ((in->ch & 0xe0) == 0xc0) { + /* 110X XXXX 2 bytes */ + in->bytes = 2; + n = (in->ch & 0x1f); + } else if ((in->ch & 0xf0) == 0xe0) { + /* 1110 XXXX three bytes */ + in->bytes = 3; + n = (in->ch & 0x0f); + } else if ((in->ch & 0xf8) == 0xf0) { + /* 1111 0XXX four bytes */ + in->bytes = 4; + n = (in->ch & 0x07); + } else if ((in->ch & 0xfc) == 0xf8) { + /* 1111 10XX five bytes */ + in->bytes = 5; + n = (in->ch & 0x03); + } else if ((in->ch & 0xfe) == 0xfc) { + /* 1111 110X six bytes */ + in->bytes = 6; + n = (in->ch & 0x01); + } else { + return 0; + } + if (in->c + in->bytes - 1 > in->eo) { + return 0; + } + for (j = 1 ; j < in->bytes ; j++) { + n = (n << 6) | (in->s[in->c + j] & 0x3f); + } + in->ch = n; + return 1; +} + +/* return 1 if host is bigendian */ +static inline int +host_is_bigendian(void) +{ + int indian = 1; + + return (*(char *)(void *)&indian) ? 0 : 1; +} + +/* swap 32bit integer */ +static inline uint32_t +swap32(uint32_t x) +{ + x = ((x & 0x0000ffff) << 16) | ((x & 0xffff0000) >> 16); + x = ((x & 0x00ff00ff) << 8) | ((x & 0xff00ff00) >> 8); + return x; +} + +/* swap 16bit integer */ +static inline uint32_t +swap16(uint16_t x) +{ + return ((x & 0x00ff) << 8) | ((x & 0xff00) >> 8); +} + +/* higher level get-next-char function */ +static inline int +get_next_char(input_t *in, uint32_t flags) +{ + uint32_t i32; + uint16_t i16; + + in->prevch = in->ch; + in->prevbytes = in->bytes; + switch(flags & AGCRE_IN_WIDTH) { + case AGCRE_REG_UTF16LE: + in->bytes = 1; + memcpy(&i16, &in->s[in->c * 2], 2); + in->ch = (host_is_bigendian()) ? swap16(i16) : i16; + return 1; + case AGCRE_REG_UTF16BE: + in->bytes = 1; + memcpy(&i16, &in->s[in->c * 2], 2); + in->ch = (!host_is_bigendian()) ? swap16(i16) : i16; + return 1; + case AGCRE_REG_UTF32LE: + in->bytes = 1; + memcpy(&i32, &in->s[in->c * 4], 4); + in->ch = (host_is_bigendian()) ? swap32(i32) : i32; + return 1; + case AGCRE_REG_UTF32BE: + in->bytes = 1; + memcpy(&i32, &in->s[in->c * 4], 4); + in->ch = (!host_is_bigendian()) ? swap32(i32) : i32; + return 1; + default: + return unwrap_utf8(in); + } +} + +/* return the number of bytes in a context */ +static inline uint32_t +context_bytes(uint32_t n) +{ + return sizeof(context_t) + (n * sizeof(agcre_regmatch_t)); +} + +/* create a new context */ +static context_t * +make_new_context(re_t *re, int n) +{ + context_t *ctx; + + if ((ctx = re->ctxlist) != NULL) { + re->ctxlist = ctx->next; + memset(ctx, 0x0, context_bytes(n)); + } else { + ctx = calloc(1, context_bytes(n)); + } + ctx->nsub = n; + ctx->ref = 1; + return ctx; +} + +/* increment reference count in the context */ +static inline context_t * +incref(context_t *ctx) +{ + ctx->ref += 1; + return ctx; +} + +/* save the current offset in the match context */ +static context_t * +mark(re_t *re, context_t *ctx, int i, input_t *in, int delta) +{ + context_t *newctx; + uint32_t j; + + if (ctx->ref > 1) { + newctx = make_new_context(re, ctx->nsub); + for (j = 0; j < ctx->nsub; j++) { + newctx->sub[j] = ctx->sub[j]; + } + ctx->ref -= 1; + ctx = newctx; + } + if (i % 2 == 0) { + ctx->sub[i / 2].rm_so = in->c + delta; + } else { + ctx->sub[i / 2].rm_eo = in->c + delta; + } + return ctx; +} + +/* decrement reference count in the context */ +static inline void +decref(re_t *re, context_t *ctx) +{ + if (--ctx->ref == 0) { + ctx->next = re->ctxlist; + re->ctxlist = ctx; + } +} + +/* create a new thread */ +static re_thread_t +newthread(instr_t *pc, context_t *ctx) +{ + re_thread_t t = {pc, ctx}; + + return t; +} + +/* create a new thread list */ +static threadlist_t * +threadlist(uint32_t instrc) +{ + return calloc(1, sizeof(threadlist_t) + (instrc * sizeof(re_thread_t))); +} + +/* add a new thread to the list */ +static int +addthread(re_t *re, threadlist_t *list, re_thread_t t, input_t *in, int delta, int width) +{ + context_t *ctx; + + if (width) { + /* we don't check generation for 0-width matches */ + if (t.pc->gen == re->gen) { + decref(re, t.ctx); + return 1; + } + t.pc->gen = re->gen; + } + switch(t.pc->op) { + case OP_JUMP: + addthread(re, list, newthread(t.pc->lhs, t.ctx), in, delta, width); + break; + case OP_SPLIT: + addthread(re, list, newthread(t.pc->lhs, incref(t.ctx)), in, delta, width); + addthread(re, list, newthread(t.pc->rhs, t.ctx), in, delta, width); + break; + case OP_SAVE: + if ((ctx = mark(re, t.ctx, t.pc->n, in, delta)) == NULL) { + return 0; + } + addthread(re, list, newthread(t.pc+1, ctx), in, delta, width); + break; + default: + list->t[list->nthreads++] = t; + break; + } + return 1; +} + +/* return 1 at beginning of line */ +static inline int +isbegline(re_t *re, input_t *in) +{ + return (re->flags & AGCRE_REG_NOTBOL) ? 0 : (in->prevch == '\n' || in->prevch == BOL); +} + +/* return 1 at end of line */ +static inline int +isendline(re_t *re, input_t *in) +{ + return (re->flags & AGCRE_REG_NOTEOL) ? 0 : in->c == in->eo; +} + +/* return 1 at beginning of words */ +static inline int +isbegword(re_t *re, input_t *in) +{ + return (isbegline(re, in) || !unicode_isident(in->prevch)) && + unicode_isident(in->ch); +} + +/* return 1 at end of words */ +static inline int +isendword(re_t *re, input_t *in) +{ + return (isendline(re, in) || (in->c > in->so && unicode_isident(in->prevch))) && + (!unicode_isident(in->ch)); +} + +/* do the chars match? */ +static inline int +charmatch(re_t *re, instr_t *instr, uint32_t ch, uint32_t wanted) +{ + if ((re->flags & AGCRE_REG_ICASE) || instr->icase) { + return unicode_tolower(ch) == unicode_tolower(wanted); + } + return (ch == wanted); +} + +/* can we match "any" character? */ +static inline int +anymatch(re_t *re, input_t *in) +{ + if ((re->flags & AGCRE_REG_NEWLINE) && in->ch == '\n') { + return 0; + } + return (in->c < in->eo); +} + +/* are the chars in the set? */ +static inline int +setmatch(re_t *re, set_t *set, input_t *in, instr_t *instr) +{ + if ((re->flags & AGCRE_REG_ICASE) || instr->icase) { + return set_isset(set, unicode_tolower(in->ch)) || + set_isset(set, unicode_toupper(in->ch)); + } + return set_isset(set, in->ch); +} + +/* is this a zerowidth match? */ +static inline int +zeromatch(re_t *re, uint32_t type, input_t *in) +{ + switch(type) { + case '^': + return isbegline(re, in); + case '$': + return isendline(re, in); + case '<': + return isbegword(re, in); + case '>': + return isendword(re, in); + case 'b': + return isbegword(re, in) || isendword(re, in); + } + return 0; +} + +/* we take the leftmost longest match */ +static inline int +is_better_match(context_t *ctx, context_t *prev) +{ + agcre_regoff_t prevlen; + agcre_regoff_t ctxlen; + + prevlen = (prev) ? prev->sub[0].rm_eo - prev->sub[0].rm_so : -1; + ctxlen = ctx->sub[0].rm_eo - ctx->sub[0].rm_so; + if (ctxlen > prevlen) { + return 1; + } + return (ctxlen == prevlen) ? (ctx->sub[0].rm_so < prev->sub[0].rm_so) : 0; +} + +/* get length of utf16 string in 16bit chars */ +static inline size_t +strlen16(const uint16_t *s) +{ + uint32_t i; + + for (i = 0 ; s[i] != 0x0000 ; i++) { + } + return i; +} + +/* get length of utf32 string in 32bit chars */ +static inline size_t +strlen32(const uint32_t *s) +{ + uint32_t i; + + for (i = 0 ; s[i] != 0 ; i++) { + } + return i; +} + +/* get length of string in terms of bytes */ +static inline uint32_t +get_length_in_bytes(const void *vs, uint32_t flags) +{ + switch(flags & AGCRE_IN_WIDTH) { + case AGCRE_REG_UTF16LE: + case AGCRE_REG_UTF16BE: + return strlen16(vs) * 2; + case AGCRE_REG_UTF32LE: + case AGCRE_REG_UTF32BE: + return strlen32(vs) * 4; + default: + return strlen(vs); + } +} + +/* free all contexts */ +static inline void +free_contexts(context_t *ctx) +{ + context_t *next; + + for ( ; ctx ; ctx = next) { + next = ctx->next; + free(ctx); + } +} + +/* unwrap a character, including UTF-8 chars, into 32bit char */ +static inline int +lex_unwrap_utf8(input_t *in) +{ + uint32_t n; + uint32_t j; + + in->ch = in->s[in->c]; + if (in->ch <= 0x7f) { + /* 0XXX XXXX one byte */ + in->bytes = 1; + n = in->ch; + } else if ((in->ch & 0xe0) == 0xc0) { + /* 110X XXXX 2 bytes */ + in->bytes = 2; + n = (in->ch & 0x1f); + } else if ((in->ch & 0xf0) == 0xe0) { + /* 1110 XXXX three bytes */ + in->bytes = 3; + n = (in->ch & 0x0f); + } else if ((in->ch & 0xf8) == 0xf0) { + /* 1111 0XXX four bytes */ + in->bytes = 4; + n = (in->ch & 0x07); + } else if ((in->ch & 0xfc) == 0xf8) { + /* 1111 10XX five bytes */ + in->bytes = 5; + n = (in->ch & 0x03); + } else if ((in->ch & 0xfe) == 0xfc) { + /* 1111 110X six bytes */ + in->bytes = 6; + n = (in->ch & 0x01); + } else { + return 0; + } + if (in->c + in->bytes - 1 > in->eo) { + return 0; + } + for (j = 1 ; j < in->bytes ; j++) { + n = (n << 6) | (in->s[in->c + j] & 0x3f); + } + in->ch = n; + in->c += in->bytes; + return 1; +} + +/* return 1 if host is bigendian */ +static inline int +lex_host_is_bigendian(void) +{ + int indian = 1; + + return (*(char *)(void *)&indian) ? 0 : 1; +} + +/* swap 32bit integer */ +static inline uint32_t +lex_swap32(uint32_t x) +{ + x = ((x & 0x0000ffff) << 16) | ((x & 0xffff0000) >> 16); + x = ((x & 0x00ff00ff) << 8) | ((x & 0xff00ff00) >> 8); + return x; +} + +/* swap 16bit integer */ +static inline uint32_t +lex_swap16(uint16_t x) +{ + return ((x & 0x00ff) << 8) | ((x & 0xff00) >> 8); +} + +/* return the next char */ +static inline int +in_nextchar(input_t *in) +{ + uint32_t i32; + uint16_t i16; + + if (in->c == in->eo) { + return 0; + } + in->prevch = in->ch; + in->prevbytes = in->bytes; + switch(in->re->flags & AGCRE_IN_WIDTH) { + case AGCRE_REG_UTF16LE: + in->bytes = 2; + memcpy(&i16, &in->s[in->c], in->bytes); + in->ch = (lex_host_is_bigendian()) ? lex_swap16(i16) : i16; + in->c += in->bytes; + return 1; + case AGCRE_REG_UTF16BE: + in->bytes = 2; + memcpy(&i16, &in->s[in->c], in->bytes); + in->ch = (!lex_host_is_bigendian()) ? lex_swap16(i16) : i16; + in->c += in->bytes; + return 1; + case AGCRE_REG_UTF32LE: + in->bytes = 4; + memcpy(&i32, &in->s[in->c], in->bytes); + in->ch = (lex_host_is_bigendian()) ? lex_swap32(i32) : i32; + in->c += in->bytes; + return 1; + case AGCRE_REG_UTF32BE: + in->bytes = 4; + memcpy(&i32, &in->s[in->c], in->bytes); + in->ch = (!lex_host_is_bigendian()) ? lex_swap32(i32) : i32; + in->c += in->bytes; + return 1; + default: + return lex_unwrap_utf8(in); + } +} + +/* look ahead */ +static inline uint32_t +in_peek(input_t *in, int32_t delta) +{ + return in->s[in->c + delta]; +} + +/* keep an error message, only if we don't have one already */ +static int +lex_error_message(input_t *in, const char *fmt, ...) +{ + va_list args; + + if (in->msgc == 0) { + va_start(args, fmt); + in->msgc = vsnprintf(in->msg, sizeof(in->msg), fmt, args); + va_end(args); + } + return TOK_BROKEN; +} + +/* do we have a lazy marker? */ +static inline void +lazy(input_t *in, retoken_t *token) +{ + if (in_peek(in, 0) == '?') { + in_nextchar(in); + token->lazy = 1; + } +} + +/* get a number, up to width digits wide */ +static inline int +get_number(input_t *in, uint32_t *num, uint32_t width, uint32_t base) +{ + static const char *hex = "0123456789abcdef"; + const char *p; + uint32_t i; + + for (*num = 0, i = 0 ; i < width ; i++) { + if ((p = strchr(hex, in_peek(in, i))) == NULL || + (uint32_t)(p - hex) >= base) { + break; + } + *num = (*num * base) + (uint32_t)(p - hex); + } + if (i > width) { + return 0; + } + in->c += i; + return 1; +} + +/* recognise a specific, bounded repeat */ +static inline int +rep(input_t *in) +{ + retoken_t *parse; + + parse = &in->parse; + if (!get_number(in, &parse->lo, 4, 10)) { + return lex_error_message(in, "number too large '%.6s", &in->s[parse->so]); + } + parse->hi = parse->lo; + if (in_peek(in, 0) == ',') { + in_nextchar(in); + if (!get_number(in, &parse->hi, 4, 10)) { + return lex_error_message(in, "number too large '%.6s", &in->s[parse->so]); + } + if (parse->hi == 0) { + parse->hi = AT_LEAST; + } + } + if (parse->lo > parse->hi) { + return lex_error_message(in, "low %u > hi %u", parse->lo, parse->hi); + } + in_nextchar(in); + if (in->ch != '}') { + return lex_error_message(in, "expected } at end of repeat", &in->s[parse->so]); + } + return TOK_REP; +} + +/* recognise a backslash sequence */ +static int +rec_backslash(input_t *in, retoken_t *token, int want_backrefs) +{ + in_nextchar(in); + switch(token->ch = in->ch) { + case 'u': + if (!get_number(in, &token->ch, 4, 16)) { + return lex_error_message(in, "bad unicode character '%.6s", &in->s[token->so]); + } + break; + case 'n': + token->ch = '\n'; + return TOK_LIT; + case 'r': + token->ch = '\r'; + return TOK_LIT; + case 't': + token->ch = '\t'; + return TOK_LIT; + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + in->c -= in->prevbytes; + if (want_backrefs && get_number(in, &token->ch, 2, 10)) { + return TOK_BACKREF; + } + break; + default: + if (strchr("<>b", token->ch) != NULL) { + return TOK_ZEROWIDTH; + } + if (strchr("DPSWdpsw", token->ch) != NULL) { + return TOK_WIDTH; + } + break; + } + return TOK_LIT; +} + +/* function to calculate a hash, used in stringswitch */ +static uint32_t +djbhashn(const char *s, size_t len) +{ + uint32_t hash; + + for (hash = 5381; len-- > 0 ; ) { + hash = hash * 33 + *s++; + } + return hash + (hash >> 5); +} + +/* recognise a posix class */ +static int +rec_posix_class(input_t *in, set_t *set) +{ + char *p; + + if ((p = memchr(&in->s[in->c], ']', 10)) == NULL) { + return 0; + } + switch(djbhashn(&in->s[in->c], (size_t)(p - &in->s[in->c]) + 1)) { + case /* ":alnum:]" */ 0x32ef56fd: + set_add_callback(set, unicode_isalnum, 0); + in->c += 8; + return 1; + case /* ":alpha:]" */ 0x330d1aee: + set_add_callback(set, unicode_isalpha, 0); + in->c += 8; + return 1; + case /* ":blank:]" */ 0x815acf00: + set_add_callback(set, unicode_isblank, 0); + in->c += 8; + return 1; + case /* ":cntrl:]" */ 0xd6f15906: + set_add_callback(set, unicode_iscntrl, 0); + in->c += 8; + return 1; + case /* ":digit:]" */ 0x1154c1e2: + set_add_callback(set, unicode_isdigit, 0); + in->c += 8; + return 1; + case /* ":graph:]" */ 0xcb51807: + set_add_callback(set, unicode_isgraph, 0); + in->c += 8; + return 1; + case /* ":ident:]" */ 0x8a1572f1: + set_add_callback(set, unicode_isident, 0); + in->c += 8; + return 1; + case /* ":lower:]" */ 0x8bfc6af8: + set_add_callback(set, unicode_islower, 0); + in->c += 8; + return 1; + case /* ":print:]" */ 0xc7bbf92b: + set_add_callback(set, unicode_isprint, 0); + in->c += 8; + return 1; + case /* ":punct:]" */ 0xcf4a84ca: + set_add_callback(set, unicode_ispunct, 0); + in->c += 8; + return 1; + case /* ":space:]" */ 0xa876bcf2: + set_add_callback(set, unicode_isspace, 0); + in->c += 8; + return 1; + case /* ":upper:]" */ 0x40541fdb: + set_add_callback(set, unicode_isupper, 0); + in->c += 8; + return 1; + case /* ":xdigit:]" */ 0x24758e84: + set_add_callback(set, unicode_isxdigit, 0); + in->c += 9; + return 1; + default: + return 0; + } +} + +/* recognise a set */ +static int +rec_set(input_t *in) +{ + const int nobackrefs = 0; + retoken_t *token; + retoken_t bs; + uint32_t prevch; + set_t *set; + int complement; + int done; + int ch; + + if (memcmp(&in->s[in->c], "[:<:]]", 6) == 0 || + memcmp(&in->s[in->c], "[:>:]]", 6) == 0) { + in->parse.ch = in->s[in->c + 2]; + in->c += 6; + return TOK_ZEROWIDTH; + } + if (in->re->setc == in->re->maxset) { + set = realloc(in->re->sets, (in->re->maxset + 8) * sizeof(*set)); + if (set == NULL) { + return lex_error_message(in, "can't extend sets past %u", in->re->maxset); + } + memset(&set[in->re->setc], 0x0, 8 * sizeof(*set)); + in->re->maxset += 8; + in->re->sets = set; + } + set = &in->re->sets[in->parse.ch = in->re->setc++]; + token = &in->parse; + complement = 0; + if (in_peek(in, 0) == '^') { + complement = 1; + set_complement(set); + in_nextchar(in); + } + for (done = 0, prevch = in->prevch ; !done && in_nextchar(in) ; ) { + switch(ch = in->ch) { + case '\\': + memset(&bs, 0x0, sizeof(bs)); + switch (rec_backslash(in, &bs, nobackrefs)) { + case TOK_BROKEN: + return lex_error_message(in, "bad backslash '%.10s'", &in->s[token->so]); + case TOK_LIT: + set_add_case(set, bs.ch); + break; + case TOK_WIDTH: + set_add_callback(set, + (token->ch == 'd' || token->ch == 'D') ? unicode_isdigit : + (token->ch == 'p' || token->ch == 'P') ? unicode_isprint : + (token->ch == 's' || token->ch == 'S') ? unicode_isspace : + unicode_isident, + unicode_isupper((uint8_t)token->ch)); + break; + } + break; + case '[': + if (!rec_posix_class(in, set)) { + return lex_error_message(in, "bad posix class '%.30s'", &in->s[token->so]); + } + break; + case ']': + if (in->c == token->so + complement + 2) { + set_add_case(set, ']'); + } else { + done = 1; + } + break; + case '-': + if (in->c == token->so + complement + 2) { + set_add_case(set, '-'); + } else { + prevch = in->prevch; + in_nextchar(in); + set_add_range(set, prevch, in->ch); + } + break; + default: + set_add_case(set, ch); + break; + } + } + if (in->ch != ']') { + return lex_error_message(in, "expected ']' at end of set: %s", &in->s[token->so]); + } + return TOK_SET; +} + +/* lexer function */ +static int +get_next_token(input_t *in) +{ + const int backrefs = 1; + retoken_t *token = &in->parse; + + memset(token, 0x0, sizeof(*token)); + token->so = in->c; + if (!in_nextchar(in)) { + return token->type = TOK_EOL; + } + token->icase = in->icase; + switch(token->ch = in->ch) { + case '(': + if (in_peek(in, 0) == '?') { + if (in_peek(in, 1) == ':') { + in->c += 2; + token->noncapture = 1; + } + if (in_peek(in, 1) == 'i') { + in->c += 3; + in->icase = 1; + return get_next_token(in); + } + if (in_peek(in, 1) == '-' && in_peek(in, 2) == 'i') { + in->c += 4; + in->icase = 0; + return get_next_token(in); + } + } + return token->type = TOK_PAREN; + case ')': + return token->type = TOK_PAREN; + case '?': + lazy(in, token); + return token->type = TOK_QUEST; + case '*': + lazy(in, token); + return token->type = TOK_STAR; + case '+': + lazy(in, token); + return token->type = TOK_PLUS; + case '|': + return token->type = TOK_ALT; + case '{': + return token->type = rep(in); + case '^': + case '$': + return token->type = TOK_ZEROWIDTH; + case '[': + return token->type = rec_set(in); + case '.': + return token->type = TOK_DOT; + case '\\': + return token->type = rec_backslash(in, token, backrefs); + default: + return token->type = TOK_LIT; + } +} + +/* free resources used by a set */ +static void +set_free(set_t *set) +{ + free(set->ranges); + free(set->cases); + free(set->callbacks); +} + +/* add a single 'case' to a set - a single member */ +static int +set_add_case(set_t *set, uint32_t n) +{ + uint32_t *v; + + if (set->casec == set->maxcase) { + v = realloc(set->cases, (set->maxcase + 16) * sizeof(*v)); + if (v == NULL) { + fprintf(stderr, "set_add_case: failure\n"); + return 0; + } + set->maxcase += 16; + set->cases = v; + } + set->cases[set->casec++] = n; + return 1; +} + +/* add a range to a set */ +static int +set_add_range(set_t *set, uint32_t lo, uint32_t hi) +{ + uint32_t *v; + + if (set->rangec == set->maxrange) { + v = realloc(set->ranges, (set->maxrange + 16) * sizeof(*v)); + if (v == NULL) { + fprintf(stderr, "set_add_range: failure\n"); + return 0; + } + set->maxrange += 16; + set->ranges = v; + } + set->ranges[set->rangec++] = lo; + set->ranges[set->rangec++] = hi; + return 1; +} + +/* add a callback function to a set */ +static int +set_add_callback(set_t *set, int (*func)(uint32_t), int negate) +{ + cb_t *v; + + if (set->callbackc == set->maxcallback) { + v = realloc(set->callbacks, (set->maxcallback + 16) * sizeof(*v)); + if (v == NULL) { + fprintf(stderr, "set_add_callback: failure\n"); + return 0; + } + set->maxcallback += 16; + set->callbacks = v; + } + set->callbacks[set->callbackc].func = func; + set->callbacks[set->callbackc++].negate = negate; + return 1; +} + +/* test if a number is present in the set */ +static int +set_isset(set_t *set, uint32_t n) +{ + uint32_t i; + + for (i = 0 ; i < set->casec ; i++) { + if (set->cases[i] == n) { + return 1 - set->complement; + } + } + for (i = 0 ; i < set->rangec ; i += 2) { + if (n >= set->ranges[i] && n <= set->ranges[i + 1]) { + return 1 - set->complement; + } + } + for (i = 0 ; i < set->callbackc ; i += 2) { + if ((*set->callbacks[i].func)(n) == 1 - set->callbacks[i].negate) { + return 1 - set->complement; + } + } + return set->complement; +} + +/* set the complement flag */ +static int +set_complement(set_t *set) +{ + return set->complement = 1 - set->complement; +} + +#define RANGE(a, b) do { \ + if (ch >= (a) && ch <= (b)) { \ + return 1; \ + } \ +} while(/*CONSTCOND*/0) + + +/* return 1 if ch has White Space property */ +static int +unicode_isWhite_Space(uint32_t ch) +{ + switch(ch) { + case 0x0020: + case 0x0085: + case 0x00A0: + case 0x1680: + case 0x2028: + case 0x2029: + case 0x202F: + case 0x205F: + case 0x3000: + return 1; + } + RANGE(0x0009, 0x000D); + RANGE(0x2000, 0x200A); + return 0; +} + +/* return 1 if ch has Dash property */ +static int +unicode_isDash(uint32_t ch) +{ + switch(ch) { + case 0x002D: + case 0x058A: + case 0x05BE: + case 0x1400: + case 0x1806: + case 0x2053: + case 0x207B: + case 0x208B: + case 0x2212: + case 0x2E17: + case 0x2E1A: + case 0x2E40: + case 0x301C: + case 0x3030: + case 0x30A0: + case 0xFE58: + case 0xFE63: + case 0xFF0D: + return 1; + } + RANGE(0x2010, 0x2015); + RANGE(0x2E3A, 0x2E3B); + RANGE(0xFE31, 0xFE32); + return 0; +} + +/* return 1 if ch has Hyphen property */ +static int +unicode_isHyphen(uint32_t ch) +{ + switch(ch) { + case 0x002D: + case 0x00AD: + case 0x058A: + case 0x1806: + case 0x2E17: + case 0x30FB: + case 0xFE63: + case 0xFF0D: + case 0xFF65: + return 1; + } + RANGE(0x2010, 0x2011); + return 0; +} + +/* return 1 if ch has Quotation Mark property */ +static int +unicode_isQuotation_Mark(uint32_t ch) +{ + switch(ch) { + case 0x0022: + case 0x0027: + case 0x00AB: + case 0x00BB: + case 0x2018: + case 0x2019: + case 0x201A: + case 0x201D: + case 0x201E: + case 0x201F: + case 0x2039: + case 0x203A: + case 0x2E42: + case 0x300C: + case 0x300D: + case 0x300E: + case 0x300F: + case 0x301D: + case 0xFE41: + case 0xFE42: + case 0xFE43: + case 0xFE44: + case 0xFF02: + case 0xFF07: + case 0xFF62: + case 0xFF63: + return 1; + } + RANGE(0x201B, 0x201C); + RANGE(0x301E, 0x301F); + return 0; +} + +/* return 1 if ch has Terminal Punctuation property */ +static int +unicode_isTerminal_Punctuation(uint32_t ch) +{ + switch(ch) { + case 0x0021: + case 0x002C: + case 0x002E: + case 0x003F: + case 0x037E: + case 0x0387: + case 0x0589: + case 0x05C3: + case 0x060C: + case 0x061B: + case 0x061F: + case 0x06D4: + case 0x070C: + case 0x085E: + case 0x0F08: + case 0x17DA: + case 0x2E2E: + case 0x2E3C: + case 0x2E41: + case 0xA92F: + case 0xAADF: + case 0xABEB: + case 0xFF01: + case 0xFF0C: + case 0xFF0E: + case 0xFF1F: + case 0xFF61: + case 0xFF64: + case 0x1039F: + case 0x103D0: + case 0x10857: + case 0x1091F: + case 0x111CD: + case 0x112A9: + case 0x1145B: + case 0x11C71: + case 0x16AF5: + case 0x16B44: + case 0x1BC9F: + return 1; + } + RANGE(0x003A, 0x003B); + RANGE(0x0700, 0x070A); + RANGE(0x07F8, 0x07F9); + RANGE(0x0830, 0x083E); + RANGE(0x0964, 0x0965); + RANGE(0x0E5A, 0x0E5B); + RANGE(0x0F0D, 0x0F12); + RANGE(0x104A, 0x104B); + RANGE(0x1361, 0x1368); + RANGE(0x166D, 0x166E); + RANGE(0x16EB, 0x16ED); + RANGE(0x1735, 0x1736); + RANGE(0x17D4, 0x17D6); + RANGE(0x1802, 0x1805); + RANGE(0x1808, 0x1809); + RANGE(0x1944, 0x1945); + RANGE(0x1AA8, 0x1AAB); + RANGE(0x1B5A, 0x1B5B); + RANGE(0x1B5D, 0x1B5F); + RANGE(0x1C3B, 0x1C3F); + RANGE(0x1C7E, 0x1C7F); + RANGE(0x203C, 0x203D); + RANGE(0x2047, 0x2049); + RANGE(0x3001, 0x3002); + RANGE(0xA4FE, 0xA4FF); + RANGE(0xA60D, 0xA60F); + RANGE(0xA6F3, 0xA6F7); + RANGE(0xA876, 0xA877); + RANGE(0xA8CE, 0xA8CF); + RANGE(0xA9C7, 0xA9C9); + RANGE(0xAA5D, 0xAA5F); + RANGE(0xAAF0, 0xAAF1); + RANGE(0xFE50, 0xFE52); + RANGE(0xFE54, 0xFE57); + RANGE(0xFF1A, 0xFF1B); + RANGE(0x10A56, 0x10A57); + RANGE(0x10AF0, 0x10AF5); + RANGE(0x10B3A, 0x10B3F); + RANGE(0x10B99, 0x10B9C); + RANGE(0x11047, 0x1104D); + RANGE(0x110BE, 0x110C1); + RANGE(0x11141, 0x11143); + RANGE(0x111C5, 0x111C6); + RANGE(0x111DE, 0x111DF); + RANGE(0x11238, 0x1123C); + RANGE(0x1144B, 0x1144D); + RANGE(0x115C2, 0x115C5); + RANGE(0x115C9, 0x115D7); + RANGE(0x11641, 0x11642); + RANGE(0x1173C, 0x1173E); + RANGE(0x11A42, 0x11A43); + RANGE(0x11A9B, 0x11A9C); + RANGE(0x11AA1, 0x11AA2); + RANGE(0x11C41, 0x11C43); + RANGE(0x12470, 0x12474); + RANGE(0x16A6E, 0x16A6F); + RANGE(0x16B37, 0x16B39); + RANGE(0x1DA87, 0x1DA8A); + return 0; +} + +/* return 1 if ch has Hex Digit property */ +static int +unicode_isHex_Digit(uint32_t ch) +{ + RANGE(0x0030, 0x0039); + RANGE(0x0041, 0x0046); + RANGE(0x0061, 0x0066); + RANGE(0xFF10, 0xFF19); + RANGE(0xFF21, 0xFF26); + RANGE(0xFF41, 0xFF46); + return 0; +} + +/* return 1 if ch has Other Alphabetic property */ +static int +unicode_isOther_Alphabetic(uint32_t ch) +{ + switch(ch) { + case 0x0345: + case 0x05BF: + case 0x05C7: + case 0x0670: + case 0x06ED: + case 0x0711: + case 0x0903: + case 0x093A: + case 0x093B: + case 0x0981: + case 0x09D7: + case 0x0A03: + case 0x0A51: + case 0x0A75: + case 0x0A83: + case 0x0AC9: + case 0x0B01: + case 0x0B3E: + case 0x0B3F: + case 0x0B40: + case 0x0B56: + case 0x0B57: + case 0x0B82: + case 0x0BC0: + case 0x0BD7: + case 0x0C00: + case 0x0C81: + case 0x0CBE: + case 0x0CBF: + case 0x0CC6: + case 0x0CCC: + case 0x0D57: + case 0x0DD6: + case 0x0E31: + case 0x0E4D: + case 0x0EB1: + case 0x0ECD: + case 0x0F7F: + case 0x1031: + case 0x1038: + case 0x1062: + case 0x1082: + case 0x109C: + case 0x109D: + case 0x135F: + case 0x17B6: + case 0x17C6: + case 0x18A9: + case 0x1932: + case 0x1A1B: + case 0x1A55: + case 0x1A56: + case 0x1A57: + case 0x1A61: + case 0x1A62: + case 0x1B04: + case 0x1B35: + case 0x1B3B: + case 0x1B3C: + case 0x1B42: + case 0x1B43: + case 0x1B82: + case 0x1BA1: + case 0x1BE7: + case 0x1BED: + case 0x1BEE: + case 0xA827: + case 0xA8C5: + case 0xA952: + case 0xA983: + case 0xA9BC: + case 0xAA43: + case 0xAA4C: + case 0xAA4D: + case 0xAAB0: + case 0xAABE: + case 0xAAEB: + case 0xAAF5: + case 0xABE5: + case 0xABE8: + case 0xFB1E: + case 0x11000: + case 0x11001: + case 0x11002: + case 0x11082: + case 0x1112C: + case 0x11182: + case 0x111BF: + case 0x11234: + case 0x11237: + case 0x1123E: + case 0x112DF: + case 0x11340: + case 0x11357: + case 0x11445: + case 0x114B9: + case 0x114BA: + case 0x114C1: + case 0x115BE: + case 0x1163D: + case 0x1163E: + case 0x11640: + case 0x116AB: + case 0x116AC: + case 0x116AD: + case 0x11726: + case 0x11A39: + case 0x11A97: + case 0x11C2F: + case 0x11C3E: + case 0x11CA9: + case 0x11CB1: + case 0x11CB4: + case 0x11D3A: + case 0x11D43: + case 0x11D47: + case 0x1BC9E: + case 0x1E947: + return 1; + } + RANGE(0x05B0, 0x05BD); + RANGE(0x05C1, 0x05C2); + RANGE(0x05C4, 0x05C5); + RANGE(0x0610, 0x061A); + RANGE(0x064B, 0x0657); + RANGE(0x0659, 0x065F); + RANGE(0x06D6, 0x06DC); + RANGE(0x06E1, 0x06E4); + RANGE(0x06E7, 0x06E8); + RANGE(0x0730, 0x073F); + RANGE(0x07A6, 0x07B0); + RANGE(0x0816, 0x0817); + RANGE(0x081B, 0x0823); + RANGE(0x0825, 0x0827); + RANGE(0x0829, 0x082C); + RANGE(0x08D4, 0x08DF); + RANGE(0x08E3, 0x08E9); + RANGE(0x08F0, 0x0902); + RANGE(0x093E, 0x0940); + RANGE(0x0941, 0x0948); + RANGE(0x0949, 0x094C); + RANGE(0x094E, 0x094F); + RANGE(0x0955, 0x0957); + RANGE(0x0962, 0x0963); + RANGE(0x0982, 0x0983); + RANGE(0x09BE, 0x09C0); + RANGE(0x09C1, 0x09C4); + RANGE(0x09C7, 0x09C8); + RANGE(0x09CB, 0x09CC); + RANGE(0x09E2, 0x09E3); + RANGE(0x0A01, 0x0A02); + RANGE(0x0A3E, 0x0A40); + RANGE(0x0A41, 0x0A42); + RANGE(0x0A47, 0x0A48); + RANGE(0x0A4B, 0x0A4C); + RANGE(0x0A70, 0x0A71); + RANGE(0x0A81, 0x0A82); + RANGE(0x0ABE, 0x0AC0); + RANGE(0x0AC1, 0x0AC5); + RANGE(0x0AC7, 0x0AC8); + RANGE(0x0ACB, 0x0ACC); + RANGE(0x0AE2, 0x0AE3); + RANGE(0x0AFA, 0x0AFC); + RANGE(0x0B02, 0x0B03); + RANGE(0x0B41, 0x0B44); + RANGE(0x0B47, 0x0B48); + RANGE(0x0B4B, 0x0B4C); + RANGE(0x0B62, 0x0B63); + RANGE(0x0BBE, 0x0BBF); + RANGE(0x0BC1, 0x0BC2); + RANGE(0x0BC6, 0x0BC8); + RANGE(0x0BCA, 0x0BCC); + RANGE(0x0C01, 0x0C03); + RANGE(0x0C3E, 0x0C40); + RANGE(0x0C41, 0x0C44); + RANGE(0x0C46, 0x0C48); + RANGE(0x0C4A, 0x0C4C); + RANGE(0x0C55, 0x0C56); + RANGE(0x0C62, 0x0C63); + RANGE(0x0C82, 0x0C83); + RANGE(0x0CC0, 0x0CC4); + RANGE(0x0CC7, 0x0CC8); + RANGE(0x0CCA, 0x0CCB); + RANGE(0x0CD5, 0x0CD6); + RANGE(0x0CE2, 0x0CE3); + RANGE(0x0D00, 0x0D01); + RANGE(0x0D02, 0x0D03); + RANGE(0x0D3E, 0x0D40); + RANGE(0x0D41, 0x0D44); + RANGE(0x0D46, 0x0D48); + RANGE(0x0D4A, 0x0D4C); + RANGE(0x0D62, 0x0D63); + RANGE(0x0D82, 0x0D83); + RANGE(0x0DCF, 0x0DD1); + RANGE(0x0DD2, 0x0DD4); + RANGE(0x0DD8, 0x0DDF); + RANGE(0x0DF2, 0x0DF3); + RANGE(0x0E34, 0x0E3A); + RANGE(0x0EB4, 0x0EB9); + RANGE(0x0EBB, 0x0EBC); + RANGE(0x0F71, 0x0F7E); + RANGE(0x0F80, 0x0F81); + RANGE(0x0F8D, 0x0F97); + RANGE(0x0F99, 0x0FBC); + RANGE(0x102B, 0x102C); + RANGE(0x102D, 0x1030); + RANGE(0x1032, 0x1036); + RANGE(0x103B, 0x103C); + RANGE(0x103D, 0x103E); + RANGE(0x1056, 0x1057); + RANGE(0x1058, 0x1059); + RANGE(0x105E, 0x1060); + RANGE(0x1067, 0x1068); + RANGE(0x1071, 0x1074); + RANGE(0x1083, 0x1084); + RANGE(0x1085, 0x1086); + RANGE(0x1712, 0x1713); + RANGE(0x1732, 0x1733); + RANGE(0x1752, 0x1753); + RANGE(0x1772, 0x1773); + RANGE(0x17B7, 0x17BD); + RANGE(0x17BE, 0x17C5); + RANGE(0x17C7, 0x17C8); + RANGE(0x1885, 0x1886); + RANGE(0x1920, 0x1922); + RANGE(0x1923, 0x1926); + RANGE(0x1927, 0x1928); + RANGE(0x1929, 0x192B); + RANGE(0x1930, 0x1931); + RANGE(0x1933, 0x1938); + RANGE(0x1A17, 0x1A18); + RANGE(0x1A19, 0x1A1A); + RANGE(0x1A58, 0x1A5E); + RANGE(0x1A63, 0x1A64); + RANGE(0x1A65, 0x1A6C); + RANGE(0x1A6D, 0x1A72); + RANGE(0x1A73, 0x1A74); + RANGE(0x1B00, 0x1B03); + RANGE(0x1B36, 0x1B3A); + RANGE(0x1B3D, 0x1B41); + RANGE(0x1B80, 0x1B81); + RANGE(0x1BA2, 0x1BA5); + RANGE(0x1BA6, 0x1BA7); + RANGE(0x1BA8, 0x1BA9); + RANGE(0x1BAC, 0x1BAD); + RANGE(0x1BE8, 0x1BE9); + RANGE(0x1BEA, 0x1BEC); + RANGE(0x1BEF, 0x1BF1); + RANGE(0x1C24, 0x1C2B); + RANGE(0x1C2C, 0x1C33); + RANGE(0x1C34, 0x1C35); + RANGE(0x1CF2, 0x1CF3); + RANGE(0x1DE7, 0x1DF4); + RANGE(0x24B6, 0x24E9); + RANGE(0x2DE0, 0x2DFF); + RANGE(0xA674, 0xA67B); + RANGE(0xA69E, 0xA69F); + RANGE(0xA823, 0xA824); + RANGE(0xA825, 0xA826); + RANGE(0xA880, 0xA881); + RANGE(0xA8B4, 0xA8C3); + RANGE(0xA926, 0xA92A); + RANGE(0xA947, 0xA951); + RANGE(0xA980, 0xA982); + RANGE(0xA9B4, 0xA9B5); + RANGE(0xA9B6, 0xA9B9); + RANGE(0xA9BA, 0xA9BB); + RANGE(0xA9BD, 0xA9BF); + RANGE(0xAA29, 0xAA2E); + RANGE(0xAA2F, 0xAA30); + RANGE(0xAA31, 0xAA32); + RANGE(0xAA33, 0xAA34); + RANGE(0xAA35, 0xAA36); + RANGE(0xAAB2, 0xAAB4); + RANGE(0xAAB7, 0xAAB8); + RANGE(0xAAEC, 0xAAED); + RANGE(0xAAEE, 0xAAEF); + RANGE(0xABE3, 0xABE4); + RANGE(0xABE6, 0xABE7); + RANGE(0xABE9, 0xABEA); + RANGE(0x10376, 0x1037A); + RANGE(0x10A01, 0x10A03); + RANGE(0x10A05, 0x10A06); + RANGE(0x10A0C, 0x10A0F); + RANGE(0x11038, 0x11045); + RANGE(0x110B0, 0x110B2); + RANGE(0x110B3, 0x110B6); + RANGE(0x110B7, 0x110B8); + RANGE(0x11100, 0x11102); + RANGE(0x11127, 0x1112B); + RANGE(0x1112D, 0x11132); + RANGE(0x11180, 0x11181); + RANGE(0x111B3, 0x111B5); + RANGE(0x111B6, 0x111BE); + RANGE(0x1122C, 0x1122E); + RANGE(0x1122F, 0x11231); + RANGE(0x11232, 0x11233); + RANGE(0x112E0, 0x112E2); + RANGE(0x112E3, 0x112E8); + RANGE(0x11300, 0x11301); + RANGE(0x11302, 0x11303); + RANGE(0x1133E, 0x1133F); + RANGE(0x11341, 0x11344); + RANGE(0x11347, 0x11348); + RANGE(0x1134B, 0x1134C); + RANGE(0x11362, 0x11363); + RANGE(0x11435, 0x11437); + RANGE(0x11438, 0x1143F); + RANGE(0x11440, 0x11441); + RANGE(0x11443, 0x11444); + RANGE(0x114B0, 0x114B2); + RANGE(0x114B3, 0x114B8); + RANGE(0x114BB, 0x114BE); + RANGE(0x114BF, 0x114C0); + RANGE(0x115AF, 0x115B1); + RANGE(0x115B2, 0x115B5); + RANGE(0x115B8, 0x115BB); + RANGE(0x115BC, 0x115BD); + RANGE(0x115DC, 0x115DD); + RANGE(0x11630, 0x11632); + RANGE(0x11633, 0x1163A); + RANGE(0x1163B, 0x1163C); + RANGE(0x116AE, 0x116AF); + RANGE(0x116B0, 0x116B5); + RANGE(0x1171D, 0x1171F); + RANGE(0x11720, 0x11721); + RANGE(0x11722, 0x11725); + RANGE(0x11727, 0x1172A); + RANGE(0x11A01, 0x11A06); + RANGE(0x11A07, 0x11A08); + RANGE(0x11A09, 0x11A0A); + RANGE(0x11A35, 0x11A38); + RANGE(0x11A3B, 0x11A3E); + RANGE(0x11A51, 0x11A56); + RANGE(0x11A57, 0x11A58); + RANGE(0x11A59, 0x11A5B); + RANGE(0x11A8A, 0x11A96); + RANGE(0x11C30, 0x11C36); + RANGE(0x11C38, 0x11C3D); + RANGE(0x11C92, 0x11CA7); + RANGE(0x11CAA, 0x11CB0); + RANGE(0x11CB2, 0x11CB3); + RANGE(0x11CB5, 0x11CB6); + RANGE(0x11D31, 0x11D36); + RANGE(0x11D3C, 0x11D3D); + RANGE(0x11D3F, 0x11D41); + RANGE(0x16B30, 0x16B36); + RANGE(0x16F51, 0x16F7E); + RANGE(0x1E000, 0x1E006); + RANGE(0x1E008, 0x1E018); + RANGE(0x1E01B, 0x1E021); + RANGE(0x1E023, 0x1E024); + RANGE(0x1E026, 0x1E02A); + RANGE(0x1F130, 0x1F149); + RANGE(0x1F150, 0x1F169); + RANGE(0x1F170, 0x1F189); + return 0; +} + +/* return 1 if ch has Other Lowercase property */ +static int +unicode_isOther_Lowercase(uint32_t ch) +{ + switch(ch) { + case 0x00AA: + case 0x00BA: + case 0x0345: + case 0x037A: + case 0x1D78: + case 0x2071: + case 0x207F: + case 0xA770: + return 1; + } + RANGE(0x02B0, 0x02B8); + RANGE(0x02C0, 0x02C1); + RANGE(0x02E0, 0x02E4); + RANGE(0x1D2C, 0x1D6A); + RANGE(0x1D9B, 0x1DBF); + RANGE(0x2090, 0x209C); + RANGE(0x2170, 0x217F); + RANGE(0x24D0, 0x24E9); + RANGE(0x2C7C, 0x2C7D); + RANGE(0xA69C, 0xA69D); + RANGE(0xA7F8, 0xA7F9); + RANGE(0xAB5C, 0xAB5F); + return 0; +} + +/* return 1 if ch has Other Uppercase property */ +static int +unicode_isOther_Uppercase(uint32_t ch) +{ + RANGE(0x2160, 0x216F); + RANGE(0x24B6, 0x24CF); + RANGE(0x1F130, 0x1F149); + RANGE(0x1F150, 0x1F169); + RANGE(0x1F170, 0x1F189); + return 0; +} + +/* return 1 if ch has Pattern White Space property */ +static int +unicode_isPattern_White_Space(uint32_t ch) +{ + switch(ch) { + case 0x0020: + case 0x0085: + case 0x2028: + case 0x2029: + return 1; + } + RANGE(0x0009, 0x000D); + RANGE(0x200E, 0x200F); + return 0; +} + +static int +unicode_isalnum(uint32_t ch) +{ + return unicode_isalpha(ch) || unicode_isdigit(ch); +} + +static int +unicode_isalpha(uint32_t ch) +{ + RANGE(0x0041, 0x005a); + RANGE(0x0061, 0x007a); + return unicode_isOther_Alphabetic(ch); +} + +static int +unicode_isblank(uint32_t ch) +{ + switch(ch) { + case 0x0009: + return 1; + } + return unicode_isWhite_Space(ch) || unicode_isPattern_White_Space(ch); +} + +static int +unicode_iscntrl(uint32_t ch) +{ + switch(ch) { + case 0x0000: + case 0x007f: + return 1; + } + RANGE(0x0001, 0x001f); + return 0; +} + +static int +unicode_isdigit(uint32_t ch) +{ + RANGE(0x0030, 0x0039); + RANGE(0xFF10, 0xFF19); + return 0; +} + +static int +unicode_isgraph(uint32_t ch) +{ + RANGE(0x0021, 0x007e); + return 0; +} + +static int +unicode_islower(uint32_t ch) +{ + RANGE(0x0061, 0x007a); + return unicode_isOther_Lowercase(ch); +} + +static int +unicode_isprint(uint32_t ch) +{ + RANGE(0x0020, 0x007e); + /* XXX */ + return 0; +} + +static int +unicode_ispunct(uint32_t ch) +{ + return unicode_isTerminal_Punctuation(ch) || + unicode_isQuotation_Mark(ch) || + unicode_isHyphen(ch) || + unicode_isDash(ch); +} + +static int +unicode_isspace(uint32_t ch) +{ + return unicode_isWhite_Space(ch); +} + +static int +unicode_isupper(uint32_t ch) +{ + RANGE(0x0041, 0x005a); + return unicode_isOther_Uppercase(ch); +} + +static int +unicode_isxdigit(uint32_t ch) +{ + return unicode_isHex_Digit(ch); +} + +static int +unicode_isident(uint32_t ch) +{ + return unicode_isalnum(ch) || ch == '_'; +} + +static uint32_t +unicode_tolower(uint32_t ch) +{ + if (unicode_isupper(ch)) { + ch += 0x20; + } + return ch; +} + +static uint32_t +unicode_toupper(uint32_t ch) +{ + if (unicode_islower(ch)) { + ch -= 0x20; + } + return ch; +} + +/* allocate space - used in regasub */ +static int +growspace(char **buf, size_t *size, size_t cc, size_t incr) +{ + size_t newsize; + char *newv; + + if (*size < cc + incr) { + newsize = howmany(cc + incr, 64) * 64; + newv = realloc(*buf, newsize); + if (newv == NULL) { + return 0; + } + memset(&newv[*size], 0x0, newsize - *size); + *buf = newv; + *size = newsize; + } + return 1; +} + +/***********************************************************/ + +/* allocate a new structure and return it */ +agcre_regex_t * +agcre_new(void) +{ + return calloc(1, sizeof(agcre_regex_t)); +} + +/* compile regular expression */ +int +agcre_regcomp(agcre_regex_t *agcre, const void *vs, uint32_t flags) +{ + retoken_t *tok; + uint32_t n; + input_t in; + re_t *re; + int errc; + + if (agcre == NULL || vs == NULL) { + return AGCRE_REG_FAILURE; + } + memset(&in, 0x0, sizeof(in)); + in.s = (const char *)vs; + in.eo = (flags & AGCRE_REG_PEND) ? + (agcre_regoff_t)(agcre->re_endp - in.s) : + (agcre_regoff_t)strlen(in.s); + memset(agcre, 0x0, sizeof(*agcre)); + agcre->re_g = re = in.re = calloc(1, sizeof(*re)); + if (in.eo - in.so > AGCRE_MAX_EXPR_LENGTH) { + re->msgc = snprintf(re->msg, sizeof(re->msg), + "expression length %llu larger than %u", + (unsigned long long)(in.eo - in.so), + AGCRE_MAX_EXPR_LENGTH); + return AGCRE_REG_FAILURE; + } + if (flags & AGCRE_REG_EXTENDED) { + tok = parse_extended(agcre, &in); + } else if (flags & AGCRE_REG_BASIC) { + tok = parse_basic(agcre, &in, flags); + } else if (flags & AGCRE_REG_NOSPEC) { + tok = parse_nospec(agcre, &in); + } else { + re->msgc = snprintf(re->msg, sizeof(re->msg), + "unknown type of regexp"); + return AGCRE_REG_FAILURE; + } + if (tok == NULL) { + return AGCRE_REG_FAILURE; + } + if (flags & AGCRE_REG_DUMP) { + print_token(tok); + printf("\n"); + } + errc = 0; + n = count(tok, &errc) + 1; + if (errc) { + re->msgc = snprintf(re->msg, sizeof(re->msg), "illegal token parse"); + return AGCRE_REG_FAILURE; + } + if ((re->pc = re->prog = calloc(1, n * sizeof(re->prog[0]))) == NULL) { + re->msgc = snprintf(re->msg, sizeof(re->msg), "%d instruction allocation failure", n); + return AGCRE_REG_FAILURE; + } + re->flags = flags; + emit(re, tok); + freetok(tok); + re->pc->op = OP_MATCH; + re->pc += 1; + re->instrc = re->pc - re->prog; + agcre->re_magic = AGCRE_MAGIC; + if (flags & AGCRE_REG_DUMP) { + printprog(re); + } + return AGCRE_REG_SUCCESS; +} + +#ifndef USE_ARG +#define USE_ARG(x) /*LINTED*/(void)&x +#endif + +/* format the error nicely */ +size_t +agcre_regerror(int errcode, const agcre_regex_t *agcre, char *errbuf, size_t size) +{ + re_t *re; + + USE_ARG(errcode); + if (agcre == NULL || size == 0 || errbuf == NULL) { + return 0; + } + re = agcre->re_g; + return snprintf(errbuf, size, "%s", re->msg); +} + +/* execute the regular expression */ +int +agcre_regexec(agcre_regex_t *agcre, const void *vs, size_t matchc, agcre_regmatch_t *m, uint32_t flags) +{ + threadlist_t *current; + threadlist_t *next; + threadlist_t *tmp; + context_t *matched; + context_t *ctx; + uint32_t prevflags; + uint32_t instrc; + input_t in; + instr_t *instr; + instr_t *prog; + size_t i; + re_t *re; + int ret; + + if (agcre == NULL || vs == NULL || (matchc > 0 && m == NULL)) { + return AGCRE_REG_FAILURE; + } + if ((re = agcre->re_g) == NULL) { + return AGCRE_REG_FAILURE; + } + if (agcre->re_magic != AGCRE_MAGIC) { + re->msgc = snprintf(re->msg, sizeof(re->msg), + "bad magic number 0x%x, not 0x%x", agcre->re_magic, AGCRE_MAGIC); + return AGCRE_REG_FAILURE; + } + if (matchc > AGCRE_MAX_SUBEXPR) { + re->msgc = snprintf(re->msg, sizeof(re->msg), + "matchc %zu larger than limit %u", matchc, AGCRE_MAX_SUBEXPR); + return AGCRE_REG_FAILURE; + } + memset(&in, 0x0, sizeof(in)); + in.s = (const char *)vs; + in.so = (flags & AGCRE_REG_STARTEND) ? m[0].rm_so : 0; + in.eo = (flags & AGCRE_REG_STARTEND) ? m[0].rm_eo : get_length_in_bytes(vs, flags); + matched = NULL; + ctx = make_new_context(re, matchc); + for (i = 0; i < matchc ; i++) { + m[i].rm_so = m[i].rm_eo = -1; + ctx->sub[i].rm_so = ctx->sub[i].rm_eo = -1; + } + instrc = re->instrc; + prog = re->prog; + if ((flags | re->flags) & AGCRE_REG_ANCHOR) { + instrc -= LEADING_DOT_STAR_INSTRS; + prog += LEADING_DOT_STAR_INSTRS; + } + current = threadlist(instrc); + next = threadlist(instrc); + re->gen += 1; + prevflags = re->flags; + re->flags |= flags; + in.c = in.so; + addthread(re, current, newthread(prog, ctx), &in, 0, 1); + ret = AGCRE_REG_FAILURE; + for (in.ch = BOL ; ; in.c += in.bytes) { + if (current->nthreads == 0) { + break; + } + if (!get_next_char(&in, flags)) { + break; + } + if (re->flags & AGCRE_REG_TRACE) { + printf("\n%lld 0x%0x", (long long)in.c, in.ch); + } + for (re->gen++, i = 0; i < current->nthreads ; i++) { + instr = current->t[i].pc; + ctx = current->t[i].ctx; + if (re->flags & AGCRE_REG_TRACE) { + printf("\nthread %zu, pc %d, ctx %p", i, (int)(instr - prog), ctx); + } + switch(instr->op) { + case OP_CHAR: + if (!charmatch(re, instr, in.ch, instr->n)) { + decref(re, ctx); + break; + } + goto eol_check; + case OP_SET: + if (!setmatch(re, &re->sets[instr->n], &in, instr)) { + decref(re, ctx); + break; + } + goto eol_check; + case OP_ANY: +eol_check: + if (!anymatch(re, &in)) { + decref(re, ctx); + break; + } + addthread(re, next, newthread(instr + 1, ctx), &in, in.bytes, 1); + break; + case OP_BACKREF: + if (!charmatch(re, instr, in.ch, in.s[ctx->sub[instr->n].rm_so + ctx->c])) { + decref(re, ctx); + break; + } + if (ctx->c == ctx->sub[instr->n].rm_eo - ctx->sub[instr->n].rm_so - in.bytes) { + addthread(re, next, newthread(instr + 1, ctx), &in, 0, 1); + } else { + ctx->c += in.bytes; + addthread(re, next, newthread(instr, ctx), &in, ctx->c, 1); + } + continue; + case OP_ZEROWIDTH: + if (!zeromatch(re, instr->n, &in)) { + decref(re, ctx); + break; + } + addthread(re, current, newthread(instr + 1, ctx), &in, 0, 0); + continue; + case OP_MATCH: + if (is_better_match(ctx, matched)) { + if (!matched) { + matched = make_new_context(re, matchc); + } + memcpy(matched, ctx, context_bytes(matchc)); + } + for (i++; i < current->nthreads ; i++) { + decref(re, current->t[i].ctx); + } + if (ctx->sub[0].rm_eo - ctx->sub[0].rm_so == 0) { + addthread(re, next, newthread(prog, ctx), &in, in.bytes, 1); + } + goto break_for; + /* OP_JUMP, OP_SPLIT, OP_SAVE handled in addthread, so that */ + /* machine execution matches what a backtracker would do. */ + /* This is discussed (but not shown as code) in */ + /* Regular Expression Matching: the Virtual Machine Approach. */ + } + } +break_for: + tmp = current; + current = next; + next = tmp; + next->nthreads = 0; + if (in.c > in.eo) { + break; + } +#if 0 + if ((flags & AGCRE_REG_ANCHOR) && + ((matched == NULL && current->nthreads < 2 && in.c > in.so) || + (matched && matched->sub[0].rm_so > (int64_t)in.so))) { + break; + } +#endif + } + if (matched) { + if (!(flags & AGCRE_REG_NOSUB)) { + for (i = 0; i < matchc; i++) { + m[i] = matched->sub[i]; + } + } + decref(re, matched); + ret = AGCRE_REG_SUCCESS; + } + free(current); + free(next); + decref(re, ctx); + re->flags = prevflags; + return ret; +} + +/* execute the regular expression backwards */ +int +agcre_rev_regexec(agcre_regex_t *agcre, const void *vs, size_t matchc, agcre_regmatch_t *m, uint32_t flags) +{ + int64_t from; + int64_t to; + int64_t i; + int found; + int this; + + if (flags & AGCRE_REG_STARTEND) { + from = m[0].rm_so; + to = m[0].rm_eo; + } else { + from = 0; + to = strlen(vs); + } + flags |= (AGCRE_REG_STARTEND | AGCRE_REG_ANCHOR); + for (found = 0, i = to - 1 ; i >= from ; --i) { + m[0].rm_so = i; + m[0].rm_eo = to; + if ((this = agcre_regexec(agcre, vs, matchc, m, flags)) == 0) { + found += 1; + } else if (found > 0) { + m[0].rm_so = i + 1; + m[0].rm_eo = to; + return agcre_regexec(agcre, vs, matchc, m, flags); + } + } + return AGCRE_REG_FAILURE; +} + +/* free the regular expression */ +void +agcre_regfree(agcre_regex_t *agcre) +{ + uint32_t i; + re_t *re; + + if (agcre) { + if ((re = agcre->re_g) != NULL) { + free(re->prog); + for (i = 0 ; i < re->setc ; i++) { + set_free(&re->sets[i]); + } + free(re->sets); + free_contexts(re->ctxlist); + free(re); + } + } +} + +/* do regexp sed-style substitution */ +ssize_t +agcre_regnsub(char *buf, size_t size, const char *repl, const agcre_regmatch_t *rm, const char *str) +{ + const char *i; + uint32_t len; + uint32_t num; + char *o; + int outofroom = 0; + + if (buf == NULL || repl == NULL || rm == NULL || str == NULL || size == 0) { + return -1; + } + memset(buf, 0x0, size); + for (i = repl, o = buf ; *i ; ) { + if ((size_t)(o - buf) >= size - 1) { + /* not enough space, transition to observe */ + outofroom = 1; + } + if (*i == '&' || + (*i == '\\' && (i[1] >= '0' && i[1] <= '9'))) { + num = (i[0] == '&') ? 0 : (i[1] - '0'); + if (rm[num].rm_so < 0 || rm[num].rm_eo < 0 || + rm[num].rm_eo - rm[num].rm_so < 0 || + rm[num].rm_eo < rm[num].rm_so || + rm[num].rm_eo > rm[0].rm_eo) { + /* sanity check on regmatch values */ + return -1; + } + len = rm[num].rm_eo - rm[num].rm_so; + if (len >= (size - (size_t)(o - buf))) { + /* not enough space, transition to observe */ + outofroom = 1; + } + if (!outofroom) { + memcpy(o, &str[rm[num].rm_so], len); + } + i += 2; + o += len; + } else { + if (!outofroom) { + *o = *i; + } + i += 1; + o += 1; + } + } + if (!outofroom) { + *o = 0x0; + } + return (ssize_t)(o - buf); +} + +/* do regexp sed-style substitution */ +ssize_t +agcre_regasub(char **buf, const char *repl, const agcre_regmatch_t *rm, const char *str) +{ + const char *i; + uint32_t len; + uint32_t num; + size_t size; + size_t cc; + + if (buf == NULL || repl == NULL || rm == NULL || str == NULL) { + return -1; + } + *buf = NULL; + for (i = repl, size = cc = 0 ; *i ; ) { + if (*i == '&' || + (*i == '\\' && (i[1] >= '0' && i[1] <= '9'))) { + num = (i[0] == '&') ? 0 : (i[1] - '0'); + if (rm[num].rm_so < 0 || rm[num].rm_eo < 0 || + rm[num].rm_eo - rm[num].rm_so < 0 || + rm[num].rm_eo < rm[num].rm_so || + rm[num].rm_eo > rm[0].rm_eo) { + /* sanity check on regmatch values */ + return -1; + } + len = rm[num].rm_eo - rm[num].rm_so; + if (!growspace(buf, &size, cc, len)) { + return -1; + } + memcpy(&(*buf)[cc], &str[rm[num].rm_so], len); + i += 2; + cc += len; + } else { + if (!growspace(buf, &size, cc, 1)) { + return -1; + } + (*buf)[cc++] = *i++; + } + } + (*buf)[cc] = 0x0; + return cc; +}