Module Name:    othersrc
Committed By:   agc
Date:           Wed Aug 16 23:38:35 UTC 2017

Added Files:
        othersrc/external/bsd/agcre/dist: internal.h

Log Message:
Just what this world needs - another regexp library. However, for
something I was doing, I needed a regexp library in C, BSD-licensed,
and able to be exposed to a wide range of expressions, some better
controlled than others.

The resulting library is libagcre, which implements regular expression
compilation and execution. It uses the Pike Virtual Machine approach,
and features:

+ standard POSIX features where sane
+ some/most Perl escapes
+ lazy matching via '?'
+ non-capture parenthese (?:...)
+ in-expression case-insensitive directives are supported (?i)...(?-i)
+ all case-insensitivity is actioned at expression exec time.
Case-insensitivity can be specified at expression compile-time,
and, if so, it will be remembered.  But the expression itself, once
compiled, can be used to match in both a case-sensitive and insensitive
manner
+ utf8 is supported both for expressions and for input text when
matching
+ unicode escapes (in the Java format of \uABCD) are supported
+ exact multiple repetition specifiers {N}, and {N,M} are supported
+ backreferences are supported
+ utf16 (LE and BE) and utf32 (LE and BE) are supported, both for the
expression and for the input being searched
+ at the most basic level, individual 32bit unicode characters are
matched
+ an egrep/grep implementation for matching unicode regexps
is included

A simple implementation of sets is used to provide inclusion and
exclusion information for unicode characters, which is taken directly
from unicode.org. No bitmasks are used - ranges are specified by
using an upper and a lower bound for the codepoints. Callbacks can
also be added to these sets, to provide functionality similar to
the ctype macros across the whole unicode character set.

The standard regular expression basic3 torture test is passed with
4 known (and, I'd argue, incorrect) results flagged.  As expected,
the expression '(a?){9999}aaaaaaaaaaaaaaaaaaaaaaaaaaaaa' matches
in linear time, as does the expression
'((((((((((((((((((((((((((((((x))))))))))))))))))))))))))))))'

        % time agcre '(a?){9999}aaaaaaaaaaaaaaaaaaaaaaaaaaaaa' dist/tests/2.in
        aaaaaaaaaaaaaaaaaaaaaaaaaaaaa
        0.063u 0.000s 0:00.06 100.0%    0+0k 0+0io 0pf+0w
        % time egrep '(a?){9999}aaaaaaaaaaaaaaaaaaaaaaaaaaaaa' dist/tests/2.in
        ^C88.462u 0.730s 1:29.21 99.9%  0+0k 0+0io 0pf+0w
        %

The library and agcre utility have been run through valgrind to
confirm no memory leaks.

In general, the emphasis is on a modern, predictable, VM-style,
well-featured regexp library, in C, with a BSD license. In
particular, sljit has not been used to speed up on certain platforms,
most Perl regexp features are supported, as are back references,
and UTF-8, UTF-16 and UTF32.

Once again, I wouldn't expect anyone to use this as the main engine
in egrep. But I am always amazed at the uses for some of the things
that I write.

For more information about the Pike VM, and comparison to other
regexp implementations, please see:

        https://swtch.com/~rsc/regexp/regexp2.html

Alistair Crooks
Tue Aug 15 07:43:34 PDT 2017


To generate a diff of this commit:
cvs rdiff -u -r0 -r1.1 othersrc/external/bsd/agcre/dist/internal.h

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Added files:

Index: othersrc/external/bsd/agcre/dist/internal.h
diff -u /dev/null othersrc/external/bsd/agcre/dist/internal.h:1.1
--- /dev/null	Wed Aug 16 23:38:35 2017
+++ othersrc/external/bsd/agcre/dist/internal.h	Wed Aug 16 23:38:35 2017
@@ -0,0 +1,165 @@
+/*-
+ * Copyright (c) 2017 Alistair Crooks <a...@netbsd.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef INTERNAL_H_
+#define INTERNAL_H_	20170801
+
+#include <inttypes.h>
+
+#include "agcre.h"
+#include "set.h"
+
+/* a regexp token */
+typedef struct retoken_t {
+	struct retoken_t	*left;		/* token on left */
+	struct retoken_t	*right;		/* token on right */
+	uint64_t	 	 so;		/* start offset */
+	uint32_t		 n;		/* number */
+	uint32_t	 	 nparens;	/* () count */
+	uint32_t	 	 lo;		/* repeat low count */
+	uint32_t	 	 hi;		/* repeat high count */
+	uint32_t	 	 ch;		/* current char */
+	uint8_t			 type;		/* type of token */
+	uint8_t			 lazy;		/* use lazy matching */
+	uint8_t		 	 noncapture;	/* non-capture subexpression */
+	uint8_t		 	 icase;		/* icase token */
+} retoken_t;
+
+/* token types */
+#define TOK_ALT			 0x01
+#define TOK_BACKREF		 0x02
+#define TOK_BROKEN		 0x03
+#define TOK_CAT			 0x04
+#define TOK_COLLECTION		 0x05
+#define TOK_DOT			 0x06
+#define TOK_EOL			 0x07
+#define TOK_FORGET		 0x08
+#define TOK_LIT			 0x09
+#define TOK_PAREN		 0x0a
+#define TOK_PLUS		 0x0b
+#define TOK_QUEST		 0x0c
+#define TOK_REP			 0x0d
+#define TOK_SET			 TOK_COLLECTION
+#define TOK_STAR		 0x0e
+#define TOK_WIDTH		 0x0f
+#define TOK_ZEROWIDTH		 0x10
+
+/* input structure */
+typedef struct input_t {
+	const char	*s;		/* string */
+	uint64_t	 so;		/* start offset */
+	uint64_t	 eo;		/* end offset */
+	uint64_t	 c;		/* current offset */
+	int		 msgc;		/* # of chars in message buf */
+	char		 msg[256];	/* message buffer */
+	retoken_t	 parse;		/* current token when parsing */
+	uint32_t	 parenc;	/* # of parentheses */
+	struct re_t	*re;		/* pointer to internal re */
+	uint32_t	 ch;		/* current char */
+	uint32_t	 prevch;	/* previous char */
+	uint8_t		 bytes;		/* size of current char */
+	uint8_t		 prevbytes;	/* size of previous char */
+	uint8_t		 icase;		/* persistent icase */
+} input_t;
+
+#define AGCRE_IN_WIDTH	\
+	(AGCRE_REG_UTF16LE | AGCRE_REG_UTF16BE | AGCRE_REG_UTF32LE | AGCRE_REG_UTF32BE)
+
+/* virtual machine instruction */
+typedef struct instr_t {
+	struct instr_t	*lhs;	/* lhs */
+	struct instr_t	*rhs;	/* rhs */
+	uint32_t	 n;	/* character */
+	uint32_t	 gen;	/* generation number */
+	uint8_t		 op;	/* instruction number */
+	uint8_t		 icase;	/* instruction number */
+} instr_t;
+
+/* instruction opcodes */
+#define OP_ANY		0x01
+#define OP_BACKREF	0x02
+#define OP_CHAR		0x03
+#define OP_JUMP		0x04
+#define OP_MATCH	0x05
+#define OP_SAVE		0x06
+#define OP_SET		0x07
+#define OP_SPLIT	0x08
+#define OP_ZEROWIDTH	0x09
+
+/* a context structure */
+typedef struct context_t {
+	struct context_t	*next;		/* for freeing nodes */
+	uint32_t 		 ref;		/* reference count */
+	uint32_t 		 nsub;		/* # matches */
+	uint32_t 		 c;		/* chars in backref */
+	agcre_regmatch_t	 sub[1];	/* match structures */
+} context_t;
+
+/* a thread structure */
+typedef struct re_thread_t {
+	instr_t		*pc;			/* instruction */
+	context_t	*ctx;			/* context */
+} re_thread_t;
+
+/* list of threads */
+typedef struct threadlist_t {
+	uint32_t	nthreads;		/* # of threads */
+	re_thread_t	t[1];			/* the threads */
+} threadlist_t;
+
+/* regular expression internals */
+typedef struct re_t {
+	instr_t		*prog;		/* start of instructions */
+	uint32_t	 instrc;	/* # of instructions */
+	uint32_t	 gen;		/* generation number */
+	uint32_t	 setc;		/* # of sets */
+	uint32_t	 maxset;	/* allocated # of sets */
+	agcre_set_t	*sets;		/* sets */
+	uint32_t	 flags;		/* comp/exec flags */
+	context_t	*ctxlist;	/* list of contexts */
+	instr_t		*pc;		/* prog counter */
+	int		 msgc;		/* # of chars in msg buffer */
+	char		 msg[256];	/* message buffer */
+} re_t;
+
+#ifndef __BEGIN_DECLS
+#  if defined(__cplusplus)
+#  define __BEGIN_DECLS           extern "C" {
+#  define __END_DECLS             }
+#  else
+#  define __BEGIN_DECLS
+#  define __END_DECLS
+#  endif
+#endif
+
+__BEGIN_DECLS
+
+retoken_t *agcre_parse_extended(agcre_re_t */*re*/, input_t */*in*/);
+retoken_t *agcre_parse_nospec(agcre_re_t */*re*/, input_t */*in*/);
+retoken_t *agcre_parse_basic(agcre_re_t */*re*/, input_t */*in*/);
+int agcre_get_next_char(input_t *in, uint32_t flags);
+
+__END_DECLS
+
+#endif

Reply via email to