Hi all, Caolan McMahon reported on IRC that (irregex-match "[a-z]{0,42}" "testing@") would take about 2 minutes to find that there was no match. This was due to the translation of "a{3,6}" to aaaa?a?a?a?a?a?, which would result in excessive backtracking caused by a combinatorial explosion: It would try to match all permutations of the As in the pattern. Even though the first a is identical to, say, the fourth, it would still match them as if they were not.
The fix is simple (though it took me a while to come up with a patch): just try to match the As in sequence: if one A fails, the next will fail as well. In case you're wondering: the cset-contains? performance improvement I posted in another thread should still be applied: it's still relevant and I haven't come up with a sane way to do this portably. I've also added a second patch to make the diff between upstream and our own version smaller: I removed the %irregex-error custom procedure, which was probably a remnant from a very old version of Irregex. It was overridden as a compiler macro to expand to "error" anyway, which is also shipped with upstream irregex as a simple helper procedure for Schemes that do not support it. The definition is compatible with CHICKEN's built-in version, so it's fine to just use it. The diff still includes a change of using symbol names instead of strings whereever the procedure name is passed as a first argument to error, because it's converted to a 'location property on the 'exn condition. I've kept this because it's a bit more user-friendly, even though it looks like a gratuitous change when you're looking at the diff. Cheers, Peter
From 667d9e4442208e020d0158b18284705ce0a6fbe9 Mon Sep 17 00:00:00 2001 From: Peter Bex <pe...@more-magic.net> Date: Thu, 17 Dec 2015 21:44:14 +0100 Subject: [PATCH 1/2] Update irregex to upstream version 0.9.4 This fixes a pathological performance problem with {n,m} patterns. --- LICENSE | 2 +- NEWS | 2 + irregex-core.scm | 129 ++++++++++++++++++++++++++++++++++++----------------- tests/re-tests.txt | 22 +++++++++ 4 files changed, 113 insertions(+), 42 deletions(-) diff --git a/LICENSE b/LICENSE index 71624fd..9149627 100644 --- a/LICENSE +++ b/LICENSE @@ -60,7 +60,7 @@ synrules.scm: irregex.scm: - Copyright (c) 2005-2011, Alex Shinn + Copyright (c) 2005-2015, Alex Shinn All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/NEWS b/NEWS index 027e7e5..3e4e8bb 100644 --- a/NEWS +++ b/NEWS @@ -23,6 +23,8 @@ - Core libraries - SRFI-18: thread-join! no longer gives an error when passed a thread in the "sleeping" state (thanks to Joerg Wittenberger) + - Irregex has been updated to 0.9.4, which fixes severe performance + problems with {n,m} repeating patterns (thanks to Caolan McMahon). - Unit "posix": The following posix procedures now work on port objects: file-stat, file-size, file-owner, file-permissions, diff --git a/irregex-core.scm b/irregex-core.scm index 9d09a48..c871369 100644 --- a/irregex-core.scm +++ b/irregex-core.scm @@ -1,6 +1,6 @@ ;;;; irregex.scm -- IrRegular Expressions ;; -;; Copyright (c) 2005-2011 Alex Shinn. All rights reserved. +;; Copyright (c) 2005-2015 Alex Shinn. All rights reserved. ;; BSD-style license: http://synthcode.com/license.txt ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -27,12 +27,11 @@ ;; performance tuning, but you can only go so far while staying ;; portable. AND-LET*, SRFI-9 records and custom macros would've been ;; nice. -;; -;; Version 1.0 will be released as a portable R7RS library. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;; History -;; +;; 0.9.4: 2015/12/14 - performance improvement for {n,m} matches +;; 0.9.3: 2014/07/01 - R7RS library ;; 0.9.2: 2012/11/29 - fixed a bug in -fold on conditional bos patterns ;; 0.9.1: 2012/11/27 - various accumulated bugfixes ;; 0.9.0: 2012/06/03 - Using tags for match extraction from Peter Bex. @@ -511,8 +510,8 @@ (define (find-tail pred ls) (let lp ((ls ls)) (cond ((null? ls) #f) - ((pred (car ls)) ls) - (else (lp (cdr ls)))))) + ((pred (car ls)) ls) + (else (lp (cdr ls)))))) (define (last ls) (if (not (pair? ls)) @@ -562,7 +561,7 @@ (define (bit-shl n i) (* n (expt 2 i))) - + (define (bit-not n) (- #xFFFF n)) (define (bit-ior a b) @@ -2881,19 +2880,26 @@ (cons (list dfa-state finalizer dfa-trans) marked-states) (+ dfa-size 1))) (let* ((closure (nfa-epsilon-closure nfa (cdar trans))) - (reordered (find-reorder-commands nfa closure marked-states)) + (reordered + (find-reorder-commands nfa closure marked-states)) (copy-cmds (if reordered (cdr reordered) '())) ;; Laurikari doesn't mention what "k" is, but it seems it ;; must be the mappings of the state's reach - (set-cmds (tag-set-commands-for-closure nfa (cdar trans) closure copy-cmds)) + (set-cmds (tag-set-commands-for-closure + nfa (cdar trans) closure copy-cmds)) (trans-closure (if reordered (car reordered) closure))) (lp2 (cdr trans) (if reordered unmarked-states (cons trans-closure unmarked-states)) - (cons `(,trans-closure ,(caar trans) ,copy-cmds . ,set-cmds) + (cons `(,trans-closure + ,(caar trans) ,copy-cmds . ,set-cmds) dfa-trans))))))))))) +;; When the conversion is complete we renumber the DFA sets-of-states +;; in order and convert the result to a vector for fast lookup. +;; Charsets containing single characters are converted to those characters +;; for quick matching of the literal parts in a regex. (define (dfa-renumber states) (let ((indexes (let lp ((i 0) (states states) (indexes '())) (if (null? states) @@ -2917,7 +2923,6 @@ ;; Extract all distinct ranges and the potential states they can transition ;; to from a given set of states. Any ranges that would overlap with ;; distinct characters are split accordingly. - ;; This function is like "reach" in Laurikari's papers, but for each ;; possible distinct range of characters rather than per character. (define (get-distinct-transitions nfa annotated-states) @@ -2940,7 +2945,8 @@ ;; but takes longer to compile. (cons (cons cs (nfa-state->mst nfa state mappings)) res)) - ((cset=? cs (caar ls)) ; Add state to existing set for this charset + ((cset=? cs (caar ls)) + ;; Add state to existing set for this charset (mst-add! nfa (cdar ls) state mappings) (append ls res)) ((csets-intersect? cs (caar ls)) => @@ -2948,8 +2954,9 @@ (let* ((only-in-new (cset-difference cs (caar ls))) (only-in-old (cset-difference (caar ls) cs)) (states-in-both (cdar ls)) - (states-for-old (and (not (cset-empty? only-in-old)) - (mst-copy states-in-both))) + (states-for-old + (and (not (cset-empty? only-in-old)) + (mst-copy states-in-both))) (res (if states-for-old (cons (cons only-in-old states-for-old) res) res))) @@ -2995,15 +3002,16 @@ ((cdar trans) => ; tagged transition? (lambda (tag) (let* ((index (next-index-for-tag! nfa tag closure)) - (new-mappings (mst-add-tagged! - nfa closure state mappings tag index))) - (lp2 (cdr trans) (cons (cons state new-mappings) stack))))) + (new-mappings + (mst-add-tagged! + nfa closure state mappings tag index))) + (lp2 (cdr trans) + (cons (cons state new-mappings) stack))))) (else (mst-add/fast! nfa closure state mappings) (lp2 (cdr trans) (cons (cons state mappings) stack))))) (else (lp2 (cdr trans) stack)))))))))) - (define (nfa-epsilon-closure nfa states) (or (nfa-get-closure nfa states) (let ((res (nfa-epsilon-closure-internal nfa states))) @@ -3084,7 +3092,6 @@ (nfa-set-reorder-commands! nfa closure res) res))) - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;; Closure Compilation ;; @@ -3104,7 +3111,7 @@ (irregex-match-start-index-set! matches 0 (cdr init)) (irregex-match-end-chunk-set! matches 0 src) (irregex-match-end-index-set! matches 0 i) - (%irregex-match-fail-set! matches fail) + (%irregex-match-fail-set! matches fail) matches))) ;; XXXX this should be inlined (define (rec sre) (lp sre n flags next)) @@ -3214,7 +3221,7 @@ (rec `(** ,(cadr sre) ,(cadr sre) ,@(cddr sre)))) ((>=) (rec `(** ,(cadr sre) #f ,@(cddr sre)))) - ((** **?) + ((**) (cond ((or (and (number? (cadr sre)) (number? (caddr sre)) @@ -3222,27 +3229,67 @@ (and (not (cadr sre)) (caddr sre))) (lambda (cnk init src str i end matches fail) (fail))) (else - (let* ((from (cadr sre)) - (to (caddr sre)) - (? (if (eq? '** (car sre)) '? '??)) - (* (if (eq? '** (car sre)) '* '*?)) - (sre (sre-sequence (cdddr sre))) - (x-sre (sre-strip-submatches sre)) - (next (if to - (if (= from to) - next - (fold (lambda (x next) - (lp `(,? ,sre) n flags next)) - next - (zero-to (- to from)))) - (rec `(,* ,sre))))) - (if (zero? from) + (letrec + ((from (cadr sre)) + (to (caddr sre)) + (body-contents (sre-sequence (cdddr sre))) + (body + (lambda (count) + (lp body-contents + n + flags + (lambda (cnk init src str i end matches fail) + (if (and to (= count to)) + (next cnk init src str i end matches fail) + ((body (+ 1 count)) + cnk init src str i end matches + (lambda () + (if (>= count from) + (next cnk init src str i end matches fail) + (fail)))))))))) + (if (and (zero? from) to (zero? to)) next - (lp `(seq ,@(map (lambda (x) x-sre) (zero-to (- from 1))) - ,sre) - n - flags - next)))))) + (lambda (cnk init src str i end matches fail) + ((body 1) cnk init src str i end matches + (lambda () + (if (zero? from) + (next cnk init src str i end matches fail) + (fail)))))))))) + ((**?) + (cond + ((or (and (number? (cadr sre)) + (number? (caddr sre)) + (> (cadr sre) (caddr sre))) + (and (not (cadr sre)) (caddr sre))) + (lambda (cnk init src str i end matches fail) (fail))) + (else + (letrec + ((from (cadr sre)) + (to (caddr sre)) + (body-contents (sre-sequence (cdddr sre))) + (body + (lambda (count) + (lp body-contents + n + flags + (lambda (cnk init src str i end matches fail) + (if (< count from) + ((body (+ 1 count)) cnk init + src str i end matches fail) + (next cnk init src str i end matches + (lambda () + (if (and to (= count to)) + (fail) + ((body (+ 1 count)) cnk init + src str i end matches fail)))))))))) + (if (and (zero? from) to (zero? to)) + next + (lambda (cnk init src str i end matches fail) + (if (zero? from) + (next cnk init src str i end matches + (lambda () + ((body 1) cnk init src str i end matches fail))) + ((body 1) cnk init src str i end matches fail)))))))) ((word) (rec `(seq bow ,@(cdr sre) eow))) ((word+) diff --git a/tests/re-tests.txt b/tests/re-tests.txt index 37e951a..7a56edb 100644 --- a/tests/re-tests.txt +++ b/tests/re-tests.txt @@ -73,6 +73,27 @@ a** - c - - (a+|b)* ab y &-\1 ab-b (a+|b)+ ab y &-\1 ab-b (a+|b)? ab y &-\1 a-a +(a+|b){0,0} ab y &-\1 - +(a+|b){0,2} ab y &-\1 ab-b +(a+|b){1,2} ab y &-\1 ab-b +^(a+|b){0,0}$ a n - - +^(a+|b){1,2}$ ab y &-\1 ab-b +^(a+|b){1,2}$ abc n - - +^(a+|b){0,1}$ ab n - - +(a+|b){0,2}b ab y &-\1 ab-a +(a+|b){0,2}b aab y &-\1 aab-aa +(a+|b){0,2}b abb y &-\1 abb-b +(a+|b){0,2}?b ab y &-\1 ab-a +(a+|b){0,2}?b aab y &-\1 aab-aa +(a+|b){0,2}?b abb y &-\1 ab-a +^(a+|b){0,2}?b$ abb y &-\1 abb-b +^(a+|b){0,2}?$ aab y &-\1 aab-b +^((a+)|(b)){0,2}?$ aaab y &-\1-\2-\3 aaab-b-aaa-b +^(a+|b){0,0}?$ a n - - +(a+|b){0,0}? ab y &-\1 - +(a+|b){1,2}?b b n - - +(a+|b){0,2}?ab ab y &-\1 ab- +(a+|b){2,3}?b ab n - - [^ab]* cde y & cde (^)* - c - - (ab|)* - c - - @@ -149,3 +170,4 @@ multiple words multiple words, yeah y & multiple words (we|wee|week)(knights|night) weeknights y &-\1-\2 weeknights-wee-knights (a([^a])*)* abcaBC y &-\1-\2 abcaBC-aBC-C ([Aa]b).*\1 abxyzab y &-\1 abxyzab-ab +a([\/\\]*)b a//\\b y &-\1 a//\\b-//\\ -- 2.1.4
From 5cc73de564d788f61535358d3f2ef8f10cef28b0 Mon Sep 17 00:00:00 2001 From: Peter Bex <pe...@more-magic.net> Date: Thu, 17 Dec 2015 21:59:11 +0100 Subject: [PATCH 2/2] Reduce difference with upstream irregex. Instead of using a custom "%irregex-error", which is redefined to just "error" in a compiler macro, we simply use "error" directly. Upstream also does this, which means the diff between upstream's irregex.scm and our irregex-core.scm is smaller, which makes maintenance less of a hassle. --- irregex-core.scm | 180 ++++++++++++++++++++++++++----------------------------- irregex.scm | 6 -- 2 files changed, 85 insertions(+), 101 deletions(-) diff --git a/irregex-core.scm b/irregex-core.scm index c871369..71939af 100644 --- a/irregex-core.scm +++ b/irregex-core.scm @@ -71,16 +71,6 @@ ;; 0.2: 2005/09/27 - adding irregex-opt (like elisp's regexp-opt) utility ;; 0.1: 2005/08/18 - simple NFA interpreter over abstract chunked strings - -(define (%irregex-error arg1 . args) - (apply - error - (if (symbol? arg1) - (cons (string-append (symbol->string arg1) ": " (car args)) - (cdr args)) - args))) - - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;; Data Structures @@ -269,24 +259,24 @@ (define (irregex-match-numeric-index location m opt) (cond ((not (irregex-match-data? m)) - (%irregex-error location "not match data" m)) + (error location "not match data" m)) ((not (pair? opt)) 0) ((pair? (cdr opt)) - (apply %irregex-error location "too many arguments" m opt)) + (apply error location "too many arguments" m opt)) (else (let ((n (car opt))) (if (number? n) (if (and (integer? n) (exact? n)) (if (irregex-match-valid-numeric-index? m n) (and (irregex-match-matched-numeric-index? m n) n) - (%irregex-error location "not a valid index" m n)) - (%irregex-error location "not an exact integer" n)) + (error location "not a valid index" m n)) + (error location "not an exact integer" n)) (let lp ((ls (irregex-match-names m)) (unknown? #t)) (cond ((null? ls) (and unknown? - (%irregex-error location "unknown match name" n))) + (error location "unknown match name" n))) ((eq? n (caar ls)) (if (%irregex-match-start-chunk m (cdar ls)) (cdar ls) @@ -295,10 +285,10 @@ (define (irregex-match-valid-index? m n) (if (not (irregex-match-data? m)) - (%irregex-error 'irregex-match-valid-index? "not match data" m)) + (error 'irregex-match-valid-index? "not match data" m)) (if (integer? n) (if (not (exact? n)) - (%irregex-error 'irregex-match-valid-index? "not an exact integer" n) + (error 'irregex-match-valid-index? "not an exact integer" n) (irregex-match-valid-numeric-index? m n)) (irregex-match-valid-named-index? m n))) @@ -317,7 +307,7 @@ (cnk (irregex-match-chunker m)) (get-subchunk (chunker-get-subchunk cnk))) (if (not get-subchunk) - (%irregex-error "this chunk type does not support match subchunks" m n) + (error "this chunk type does not support match subchunks" m n) (and n (get-subchunk (%irregex-match-start-chunk m n) (%irregex-match-start-index m n) @@ -356,7 +346,7 @@ (get-subchunk (and (pair? o) (car o)))) (if (not (and (procedure? get-next) (procedure? get-str) (procedure? get-start) (procedure? get-substr))) - (%irregex-error 'make-irregex-chunker "expected a procdure")) + (error 'make-irregex-chunker "expected a procdure")) (vector get-next get-str get-start get-end get-substr get-subchunk))) (define (chunker-get-next cnk) (vector-ref cnk 0)) @@ -515,7 +505,7 @@ (define (last ls) (if (not (pair? ls)) - (%irregex-error "can't take last of empty list") + (error "can't take last of empty list") (let lp ((ls ls)) (if (pair? (cdr ls)) (lp (cdr ls)) @@ -622,7 +612,7 @@ (define end (string-length str)) (define (read i k) (cond - ((>= i end) (%irregex-error "unterminated embedded SRE" str)) + ((>= i end) (error "unterminated embedded SRE" str)) (else (case (string-ref str i) ((#\() @@ -635,11 +625,11 @@ (k (reverse ls) j)) ((eq? x dot-token) (if (null? ls) - (%irregex-error "bad dotted form" str) + (error "bad dotted form" str) (read j (lambda (y j2) (read j2 (lambda (z j3) (if (not (eq? z close-token)) - (%irregex-error "bad dotted form" str) + (error "bad dotted form" str) (k (append (reverse (cdr ls)) (cons (car ls) y)) j3)))))))) @@ -667,7 +657,7 @@ (define (collect) (if (= from i) res (cons (substring str from i) res))) (if (>= i end) - (%irregex-error "unterminated string in embedded SRE" str) + (error "unterminated string in embedded SRE" str) (case (string-ref str i) ((#\") (k (string-cat-reverse (collect)) (+ i 1))) ((#\\) (scan (+ i 1) (+ i 2) (collect))) @@ -690,7 +680,7 @@ ((#\t #\f) (k (eqv? #\t (string-ref str (+ i 1))) (+ i 2))) (else - (%irregex-error "bad # syntax in simplified SRE" i)))) + (error "bad # syntax in simplified SRE" i)))) (else (cond ((char-whitespace? (string-ref str i)) @@ -707,7 +697,7 @@ (else (scan (+ j 1)))))))))))) (read i (lambda (res j) (if (eq? res 'close-token) - (%irregex-error "unexpected ')' in SRE" str j) + (error "unexpected ')' in SRE" str j) (proc res j))))) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -739,7 +729,7 @@ (if (string? obj) (string->sre obj) obj)) (define (string->sre str . o) - (if (not (string? str)) (%irregex-error 'string->sre "expected a string" str)) + (if (not (string? str)) (error 'string->sre "expected a string" str)) (let ((end (string-length str)) (flags (symbol-list->flags o))) @@ -834,7 +824,7 @@ ;; main parsing (if (>= i end) (if (pair? st) - (%irregex-error "unterminated parenthesis in regexp" str) + (error "unterminated parenthesis in regexp" str) (collect/terms)) (let ((c (string-ref str i))) (case c @@ -846,7 +836,7 @@ ((#\?) (let ((res (collect/single))) (if (null? res) - (%irregex-error "? can't follow empty pattern" str res) + (error "? can't follow empty pattern" str res) (let ((x (car res))) (lp (+ i 1) (+ i 1) @@ -870,9 +860,9 @@ (op (string->symbol (string c)))) (cond ((sre-repeater? x) - (%irregex-error "duplicate repetition (e.g. **) in pattern" str res)) + (error "duplicate repetition (e.g. **) in pattern" str res)) ((sre-empty? x) - (%irregex-error "can't repeat empty pattern (e.g. ()*)" str res)) + (error "can't repeat empty pattern (e.g. ()*)" str res)) (else (lp (+ i 1) (+ i 1) flags (cons (list op x) (cdr res)) @@ -880,19 +870,19 @@ ((#\() (cond ((>= (+ i 1) end) - (%irregex-error "unterminated parenthesis in regexp" str)) + (error "unterminated parenthesis in regexp" str)) ((not (memv (string-ref str (+ i 1)) '(#\? #\*))) ; normal case (lp (+ i 1) (+ i 1) (flag-join flags ~save?) '() (save))) ((>= (+ i 2) end) - (%irregex-error "unterminated parenthesis in regexp" str)) + (error "unterminated parenthesis in regexp" str)) ((eqv? (string-ref str (+ i 1)) #\*) (if (eqv? #\' (string-ref str (+ i 2))) (with-read-from-string str (+ i 3) (lambda (sre j) (if (or (>= j end) (not (eqv? #\) (string-ref str j)))) - (%irregex-error "unterminated (*'...) SRE escape" str) + (error "unterminated (*'...) SRE escape" str) (lp (+ j 1) (+ j 1) flags (cons sre (collect)) st)))) - (%irregex-error "bad regexp syntax: (*FOO) not supported" str))) + (error "bad regexp syntax: (*FOO) not supported" str))) (else ;; (?...) case (case (string-ref str (+ i 2)) ((#\#) @@ -909,7 +899,7 @@ ((#\<) (cond ((>= (+ i 3) end) - (%irregex-error "unterminated parenthesis in regexp" str)) + (error "unterminated parenthesis in regexp" str)) (else (case (string-ref str (+ i 3)) ((#\=) @@ -927,7 +917,7 @@ `(,(string->symbol (substring str (+ i 3) j)) submatch-named) (save)) - (%irregex-error "invalid (?< sequence" str)))))))) + (error "invalid (?< sequence" str)))))))) ((#\>) (lp (+ i 3) (+ i 3) (flag-clear flags ~save?) '(atomic) (save))) @@ -938,12 +928,12 @@ ((#\() (cond ((>= (+ i 3) end) - (%irregex-error "unterminated parenthesis in regexp" str)) + (error "unterminated parenthesis in regexp" str)) ((char-numeric? (string-ref str (+ i 3))) (let* ((j (string-scan-char str #\) (+ i 3))) (n (string->number (substring str (+ i 3) j)))) (if (not n) - (%irregex-error "invalid conditional reference" str) + (error "invalid conditional reference" str) (lp (+ j 1) (+ j 1) (flag-clear flags ~save?) `(,n if) (save))))) ((char-alphabetic? (string-ref str (+ i 3))) @@ -955,7 +945,7 @@ (lp (+ i 2) (+ i 2) (flag-clear flags ~save?) '(if) (save))))) ((#\{) - (%irregex-error "unsupported Perl-style cluster" str)) + (error "unsupported Perl-style cluster" str)) (else (let ((old-flags flags)) (let lp2 ((j (+ i 2)) (flags flags) (invert? #f)) @@ -969,7 +959,7 @@ (cons (if after 'w/utf8 'w/noutf8) res)))) (cond ((>= j end) - (%irregex-error "incomplete cluster" str i)) + (error "incomplete cluster" str i)) (else (case (string-ref str j) ((#\i) @@ -991,11 +981,11 @@ (lp (+ j 1) (+ j 1) flags (new-res '()) (cons (cons old-flags (collect)) st))) (else - (%irregex-error "unknown regex cluster modifier" str) + (error "unknown regex cluster modifier" str) ))))))))))) ((#\)) (if (null? st) - (%irregex-error "too many )'s in regexp" str) + (error "too many )'s in regexp" str) (lp (+ i 1) (+ i 1) (caar st) @@ -1016,7 +1006,7 @@ (let ((res (collect/single))) (cond ((null? res) - (%irregex-error "{ can't follow empty pattern")) + (error "{ can't follow empty pattern")) (else (let* ((x (car res)) (tail (cdr res)) @@ -1031,7 +1021,7 @@ (and (pair? (cdr s2)) (not (equal? "" (cadr s2))) (not m))) - (%irregex-error "invalid {n} repetition syntax" s2)) + (error "invalid {n} repetition syntax" s2)) ((null? (cdr s2)) (lp (+ j 1) (+ j 1) flags `((= ,n ,x) ,@tail) st)) (m @@ -1042,7 +1032,7 @@ ((#\\) (cond ((>= (+ i 1) end) - (%irregex-error "incomplete escape sequence" str)) + (error "incomplete escape sequence" str)) (else (let ((c (string-ref str (+ i 1)))) (case c @@ -1089,7 +1079,7 @@ ((#\k) (let ((c (string-ref str (+ i 2)))) (if (not (memv c '(#\< #\{ #\'))) - (%irregex-error "bad \\k usage, expected \\k<...>" str) + (error "bad \\k usage, expected \\k<...>" str) (let* ((terminal (char-mirror c)) (j (string-scan-char str terminal (+ i 2))) (s (and j (substring str (+ i 3) j))) @@ -1098,7 +1088,7 @@ 'backref-ci 'backref))) (if (not j) - (%irregex-error "unterminated named backref" str) + (error "unterminated named backref" str) (lp (+ j 1) (+ j 1) flags `((,backref ,(string->symbol s)) ,@(collect)) @@ -1149,7 +1139,7 @@ (if cell (lp (+ i 2) (+ i 2) flags (cons (cdr cell) (collect)) st) - (%irregex-error "unknown escape sequence" str c)))) + (error "unknown escape sequence" str c)))) (else (lp (+ i 2) (+ i 1) flags (collect) st))))))))) ((#\|) @@ -1191,24 +1181,24 @@ (define (string-parse-hex-escape str i end) (cond ((>= i end) - (%irregex-error "incomplete hex escape" str i)) + (error "incomplete hex escape" str i)) ((eqv? #\{ (string-ref str i)) (let ((j (string-scan-char-escape str #\} (+ i 1)))) (if (not j) - (%irregex-error "incomplete hex brace escape" str i) + (error "incomplete hex brace escape" str i) (let* ((s (substring str (+ i 1) j)) (n (string->number s 16))) (if n (list (integer->char n) j) - (%irregex-error "bad hex brace escape" s)))))) + (error "bad hex brace escape" s)))))) ((>= (+ i 1) end) - (%irregex-error "incomplete hex escape" str i)) + (error "incomplete hex escape" str i)) (else (let* ((s (substring str i (+ i 2))) (n (string->number s 16))) (if n (list (integer->char n) (+ i 2)) - (%irregex-error "bad hex escape" s)))))) + (error "bad hex escape" s)))))) (define (string-parse-cset str start flags) (let* ((end (string-length str)) @@ -1216,7 +1206,7 @@ (utf8? (flag-set? flags ~utf8?))) (define (go i prev-char cset) (if (>= i end) - (%irregex-error "incomplete char set" str i end) + (error "incomplete char set" str i end) (let ((c (string-ref str i))) (case c ((#\]) @@ -1234,7 +1224,7 @@ (eqv? #\] (string-ref str (+ i 1)))) (go (+ i 1) c (cset-adjoin cset c))) ((not prev-char) - (%irregex-error "bad char-set")) + (error "bad char-set")) (else (let ((char (string-ref str (+ i 1)))) (apply @@ -1260,14 +1250,14 @@ ((#\:) (let ((j (string-scan-char str #\: (+ i2 1)))) (if (or (not j) (not (eqv? #\] (string-ref str (+ j 1))))) - (%irregex-error "incomplete character class" str) + (error "incomplete character class" str) (let* ((class (sre->cset (string->symbol (substring str (+ i2 1) j)))) (class (if inv? (cset-complement class) class))) (go (+ j 2) #f (cset-union cset class)))))) ((#\= #\.) - (%irregex-error "collating sequences not supported" str)) + (error "collating sequences not supported" str)) (else (go (+ i 1) #\[ (cset-adjoin cset #\[)))))) ((#\\) @@ -1367,7 +1357,7 @@ (bit-shl (bit-and (byte (+ i 2)) #b00111111) 6) (bit-and (byte (+ i 3)) #b00111111)))) (else - (%irregex-error "invalid utf8 length" str len i)))) + (error "invalid utf8 length" str len i)))) (define (utf8-backup-to-initial-char str i) (let lp ((i i)) @@ -1381,12 +1371,12 @@ (define (utf8-lowest-digit-of-length len) (case len ((1) 0) ((2) #xC0) ((3) #xE0) ((4) #xF0) - (else (%irregex-error "invalid utf8 length" len)))) + (else (error "invalid utf8 length" len)))) (define (utf8-highest-digit-of-length len) (case len ((1) #x7F) ((2) #xDF) ((3) #xEF) ((4) #xF7) - (else (%irregex-error "invalid utf8 length" len)))) + (else (error "invalid utf8 length" len)))) (define (char->utf8-list c) (let ((i (char->integer c))) @@ -1404,7 +1394,7 @@ (bit-ior #b10000000 (bit-and (bit-shr i 12) #b111111)) (bit-ior #b10000000 (bit-and (bit-shr i 6) #b111111)) (bit-ior #b10000000 (bit-and i #b111111)))) - (else (%irregex-error "unicode codepoint out of range:" i))))) + (else (error "unicode codepoint out of range:" i))))) (define (unicode-range->utf8-pattern lo hi) (let ((lo-ls (char->utf8-list lo)) @@ -1779,13 +1769,13 @@ (let ((n (cond ((number? (cadr sre)) (cadr sre)) ((assq (cadr sre) names) => cdr) - (else (%irregex-error "unknown backreference" (cadr sre)))))) + (else (error "unknown backreference" (cadr sre)))))) (cond ((or (not (integer? n)) (not (< 0 n (vector-length sublens)))) - (%irregex-error 'sre-length "invalid backreference" sre)) + (error 'sre-length "invalid backreference" sre)) ((not (vector-ref sublens n)) - (%irregex-error 'sre-length "invalid forward backreference" sre)) + (error 'sre-length "invalid forward backreference" sre)) (else (let ((lo2 (car (vector-ref sublens n))) (hi2 (cdr (vector-ref sublens n)))) @@ -1830,7 +1820,7 @@ => (lambda (cell) (lp (apply (cdr cell) (cdr sre)) n lo hi return))) (else - (%irregex-error 'sre-length-ranges "unknown sre operator" sre))))))) + (error 'sre-length-ranges "unknown sre operator" sre))))))) ((char? sre) (grow 1)) ((string? sre) @@ -1844,7 +1834,7 @@ (if cell (lp (if (procedure? (cdr cell)) ((cdr cell)) (cdr cell)) n lo hi return) - (%irregex-error 'sre-length-ranges "unknown sre" sre))))))) + (error 'sre-length-ranges "unknown sre" sre))))))) sublens)) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -1947,13 +1937,13 @@ (substring (car src1) i j)))) (define (irregex-search x str . o) - (if (not (string? str)) (%irregex-error 'irregex-search "not a string" str)) + (if (not (string? str)) (error 'irregex-search "not a string" str)) (let ((start (or (and (pair? o) (car o)) 0)) (end (or (and (pair? o) (pair? (cdr o)) (cadr o)) (string-length str)))) (if (not (and (integer? start) (exact? start))) - (%irregex-error 'irregex-search "not an exact integer" start)) + (error 'irregex-search "not an exact integer" start)) (if (not (and (integer? end) (exact? end))) - (%irregex-error 'irregex-search "not an exact integer" end)) + (error 'irregex-search "not an exact integer" end)) (irregex-search/chunked x irregex-basic-string-chunker (list str start end) @@ -1963,7 +1953,7 @@ (let* ((irx (irregex x)) (matches (irregex-new-matches irx)) (i (if (pair? o) (car o) ((chunker-get-start cnk) src)))) - (if (not (integer? i)) (%irregex-error 'irregex-search "not an integer" i)) + (if (not (integer? i)) (error 'irregex-search "not an integer" i)) (irregex-match-chunker-set! matches cnk) (irregex-search/matches irx cnk (cons src i) src i matches))) @@ -2035,13 +2025,13 @@ #f)))))))) (define (irregex-match irx str . o) - (if (not (string? str)) (%irregex-error 'irregex-match "not a string" str)) + (if (not (string? str)) (error 'irregex-match "not a string" str)) (let ((start (or (and (pair? o) (car o)) 0)) (end (or (and (pair? o) (pair? (cdr o)) (cadr o)) (string-length str)))) (if (not (and (integer? start) (exact? start))) - (%irregex-error 'irregex-match "not an exact integer" start)) + (error 'irregex-match "not an exact integer" start)) (if (not (and (integer? end) (exact? end))) - (%irregex-error 'irregex-match "not an exact integer" end)) + (error 'irregex-match "not an exact integer" end)) (irregex-match/chunked irx irregex-basic-string-chunker (list str start end)))) @@ -2635,7 +2625,7 @@ (if (procedure? (cdr cell)) (lp (cons (apply (cdr cell) (cdar ls)) (cdr ls)) n flags next) - (%irregex-error "non-procedure in op position" (caar ls))))) + (error "non-procedure in op position" (caar ls))))) (else #f))))))) (else #f)))) @@ -3177,7 +3167,7 @@ ((*) (cond ((sre-empty? (sre-sequence (cdr sre))) - (%irregex-error "invalid sre: empty *" sre)) + (error "invalid sre: empty *" sre)) (else (letrec ((body @@ -3196,7 +3186,7 @@ ((*?) (cond ((sre-empty? (sre-sequence (cdr sre))) - (%irregex-error "invalid sre: empty *?" sre)) + (error "invalid sre: empty *?" sre)) (else (letrec ((body @@ -3363,7 +3353,7 @@ (cond ((assq (cadr sre) names) => cdr) (else - (%irregex-error "unknown named backref in SRE IF" sre))) + (error "unknown named backref in SRE IF" sre))) (cadr sre)))) (lambda (cnk init src str i end matches fail2) (if (%irregex-match-end-chunk matches index) @@ -3378,7 +3368,7 @@ ((backref backref-ci) (let ((n (cond ((number? (cadr sre)) (cadr sre)) ((assq (cadr sre) names) => cdr) - (else (%irregex-error "unknown backreference" (cadr sre))))) + (else (error "unknown backreference" (cadr sre))))) (compare (if (or (eq? (car sre) 'backref-ci) (flag-set? flags ~case-insensitive?)) string-ci=? @@ -3453,7 +3443,7 @@ ((=> submatch-named) (rec `(submatch ,@(cddr sre)))) (else - (%irregex-error "unknown regexp operator" sre))))) + (error "unknown regexp operator" sre))))) ((symbol? sre) (case sre ((any) @@ -3559,7 +3549,7 @@ (let ((cell (assq sre sre-named-definitions))) (if cell (rec (cdr cell)) - (%irregex-error "unknown regexp" sre)))))) + (error "unknown regexp" sre)))))) ((char? sre) (if (flag-set? flags ~case-insensitive?) ;; case-insensitive @@ -3612,7 +3602,7 @@ ;; (fail))))) ) (else - (%irregex-error "unknown regexp" sre))))) + (error "unknown regexp" sre))))) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;; Character Sets @@ -3710,7 +3700,7 @@ ((w/nocase) (lp (sre-alternate (cdr sre)) #t)) (else - (%irregex-error "not a valid sre char-set operator" sre))))) + (error "not a valid sre char-set operator" sre))))) ((char? sre) (if ci? (cset-case-insensitive (range->cset sre sre)) (range->cset sre sre))) @@ -3719,7 +3709,7 @@ (let ((cell (assq sre sre-named-definitions))) (if cell (rec (cdr cell)) - (%irregex-error "not a valid sre char-set" sre))))))) + (error "not a valid sre char-set" sre))))))) (define (cset->sre cset) (cons '/ @@ -3859,7 +3849,7 @@ ;;;; Match and Replace Utilities (define (irregex-fold/fast irx kons knil str . o) - (if (not (string? str)) (%irregex-error 'irregex-fold "not a string" str)) + (if (not (string? str)) (error 'irregex-fold "not a string" str)) (let* ((irx (irregex irx)) (matches (irregex-new-matches irx)) (finish (or (and (pair? o) (car o)) (lambda (i acc) acc))) @@ -3870,9 +3860,9 @@ (init-src (list str start end)) (init (cons init-src start))) (if (not (and (integer? start) (exact? start))) - (%irregex-error 'irregex-fold "not an exact integer" start)) + (error 'irregex-fold "not an exact integer" start)) (if (not (and (integer? end) (exact? end))) - (%irregex-error 'irregex-fold "not an exact integer" end)) + (error 'irregex-fold "not an exact integer" end)) (irregex-match-chunker-set! matches irregex-basic-string-chunker) (let lp ((src init-src) (i start) (acc knil)) (if (>= i end) @@ -3900,7 +3890,7 @@ (lp (list str j end) j acc))))))))))) (define (irregex-fold irx kons . args) - (if (not (procedure? kons)) (%irregex-error 'irregex-fold "not a procedure" kons)) + (if (not (procedure? kons)) (error 'irregex-fold "not a procedure" kons)) (let ((kons2 (lambda (i m acc) (kons i (irregex-copy-matches m) acc)))) (apply irregex-fold/fast irx kons2 args))) @@ -3912,7 +3902,7 @@ (cadr o) ((chunker-get-start cnk) start))) (init (cons start i))) - (if (not (integer? i)) (%irregex-error 'irregex-fold/chunked "not an integer" i)) + (if (not (integer? i)) (error 'irregex-fold/chunked "not an integer" i)) (irregex-match-chunker-set! matches cnk) (let lp ((start start) (i i) (acc knil)) (if (not start) @@ -3937,12 +3927,12 @@ (lp end-src end-index acc))))))))))) (define (irregex-fold/chunked irx kons . args) - (if (not (procedure? kons)) (%irregex-error 'irregex-fold/chunked "not a procedure" kons)) + (if (not (procedure? kons)) (error 'irregex-fold/chunked "not a procedure" kons)) (let ((kons2 (lambda (s i m acc) (kons s i (irregex-copy-matches m) acc)))) (apply irregex-fold/chunked/fast irx kons2 args))) (define (irregex-replace irx str . o) - (if (not (string? str)) (%irregex-error 'irregex-replace "not a string" str)) + (if (not (string? str)) (error 'irregex-replace "not a string" str)) (let ((m (irregex-search irx str))) (if m (string-cat-reverse @@ -3953,7 +3943,7 @@ str))) (define (irregex-replace/all irx str . o) - (if (not (string? str)) (%irregex-error 'irregex-replace/all "not a string" str)) + (if (not (string? str)) (error 'irregex-replace/all "not a string" str)) (irregex-fold/fast irx (lambda (i m acc) @@ -4000,12 +3990,12 @@ ((assq (car ls) (irregex-match-names m)) => (lambda (x) (lp (cons (cdr x) (cdr ls)) res))) (else - (%irregex-error "unknown match replacement" (car ls))))))) + (error "unknown match replacement" (car ls))))))) (else (lp (cdr ls) (cons (car ls) res))))))) (define (irregex-extract irx str . o) - (if (not (string? str)) (%irregex-error 'irregex-extract "not a string" str)) + (if (not (string? str)) (error 'irregex-extract "not a string" str)) (apply irregex-fold/fast irx (lambda (i m a) (cons (irregex-match-substring m) a)) @@ -4015,7 +4005,7 @@ o)) (define (irregex-split irx str . o) - (if (not (string? str)) (%irregex-error 'irregex-split "not a string" str)) + (if (not (string? str)) (error 'irregex-split "not a string" str)) (let ((start (if (pair? o) (car o) 0)) (end (if (and (pair? o) (pair? (cdr o))) (cadr o) (string-length str)))) (irregex-fold/fast diff --git a/irregex.scm b/irregex.scm index 7990d30..5a2b6d2 100644 --- a/irregex.scm +++ b/irregex.scm @@ -236,12 +236,6 @@ (vector-set! (##sys#slot m 1) (+ 4 (* t 2)) chunk) (vector-set! (##sys#slot m 1) (+ 5 (* t 2)) index))))) -(declare (unused %irregex-error)) -(define-compiler-syntax %irregex-error - (syntax-rules () - ((_ args ...) - (error args ...)))) - (include "irregex-core.scm") (include "irregex-utils.scm") -- 2.1.4
signature.asc
Description: Digital signature
_______________________________________________ Chicken-hackers mailing list Chicken-hackers@nongnu.org https://lists.nongnu.org/mailman/listinfo/chicken-hackers