Hi all,

Here's a patch to bring irregex in line with upstream 0.9.11 (plus one
more commit that fixes a bug in sre->string).  It would be nice to have
that for 5.4.0.

Not that much changed in this release, but a lot of code was
restructured, so applying this patch will make it easier to port
future changes.

Also, there are a few fixes related to utf8-mode, so maybe these help
with the UTF-8 branch.

Cheers,
Peter
From 411dcdd39fde172f414e29a216eaaaad413a4758 Mon Sep 17 00:00:00 2001
From: Peter Bex <pe...@more-magic.net>
Date: Tue, 23 Apr 2024 08:26:07 +0200
Subject: [PATCH] Bump irregex to upstream commit 923cfc39, which is 0.9.11
 plus a bugfix

---
 NEWS                   |  5 +++++
 irregex-core.scm       | 19 +++++++++++++------
 irregex-utils.scm      |  2 +-
 tests/test-irregex.scm | 15 +++++++++++++++
 4 files changed, 34 insertions(+), 7 deletions(-)

diff --git a/NEWS b/NEWS
index 6b09db47..b1bf9e1c 100644
--- a/NEWS
+++ b/NEWS
@@ -37,6 +37,11 @@
     an `errno' property.
   - Deprecated "chicken-home" and added "include-path" in the
     chicken.platform module.
+  - Irregex has been updated to upstream 0.9.11 plus an additional fix
+    for sre->string.  The 0.9.11 release fixes a few problems related to
+    utf-8 handling (which should not affect CHICKEN) and expands the
+    definition for the 'whitespace character set to include vertical tab,
+    carriage return and form feed.
 
 - Tools
   - The -R option for csi and csc now accepts list-notation like
diff --git a/irregex-core.scm b/irregex-core.scm
index 55e9a6c0..5550ace8 100644
--- a/irregex-core.scm
+++ b/irregex-core.scm
@@ -1,6 +1,6 @@
 ;;;; irregex.scm -- IrRegular Expressions
 ;;
-;; Copyright (c) 2005-2021 Alex Shinn.  All rights reserved.
+;; Copyright (c) 2005-2024 Alex Shinn.  All rights reserved.
 ;; BSD-style license: http://synthcode.com/license.txt
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -30,6 +30,7 @@
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;;; History
+;; 0.9.11: 2024/02/23 - Guile test and packaging support from Tomas Volf.
 ;; 0.9.10: 2021/07/06 - fixes for submatches under kleene star, empty seqs
 ;;                     in alternations, and bol in folds for backtracking
 ;;                     matcher (thanks John Clements and snan for reporting
@@ -425,7 +426,12 @@
 ;; (define *all-chars* `(/ ,(integer->char (- (char->integer #\space) 32)) ,(integer->char (+ (char->integer #\space) 223))))
 
 ;; set to #f to ignore even an explicit request for utf8 handling
-(define *allow-utf8-mode?* #t)
+;; The utf8-mode is undesired on any implementation with native unicode support.
+;; It is a workaround for those that treat strings as a raw byte sequences, and
+;; does not work well otherwise.  So disable it on implementations known to
+;; handle unicode natively.
+(define *allow-utf8-mode?* (cond-expand ((and chicken (not full-unicode)) #t)
+                                        (else #f)))
 
 ;; (define *named-char-properties* '())
 
@@ -1568,8 +1574,8 @@
          (cons (car sre) (map rec (cdr sre))))))
      (else
       (case sre
-        ((any) 'utf8-any)
-        ((nonl) 'utf8-nonl)
+        ((any) (if utf8? 'utf8-any 'any))
+        ((nonl) (if utf8? 'utf8-nonl 'nonl))
         (else
          (if (and utf8? (char? sre) (high-char? sre))
              (sre-sequence (map integer->char (char->utf8-list sre)))
@@ -2292,10 +2298,11 @@
      . (or alphanumeric punctuation #\$ #\+ #\< #\= #\> #\^ #\` #\| #\~))
     (graph . graphic)
     (blank . (or #\space ,(integer->char (- (char->integer #\space) 23))))
-    (whitespace . (or blank #\newline))
+    ;; 0B - vertical tab, 0C - form feed
+    (whitespace . (or blank #\newline #\x0C #\return #\x0B))
     (space . whitespace)
     (white . whitespace)
-    (printing or graphic whitespace)
+    (printing . (or graphic whitespace))
     (print . printing)
 
     ;; XXXX we assume a (possibly shifted) ASCII-based ordering
diff --git a/irregex-utils.scm b/irregex-utils.scm
index 291b03ea..37313666 100644
--- a/irregex-utils.scm
+++ b/irregex-utils.scm
@@ -104,7 +104,7 @@
            (display ")" out))
           ((* + ? *? ??)
            (cond
-            ((pair? (cddr x))
+            ((or (pair? (cddr x)) (and (string? (cadr x)) (not (= 1 (string-length (cadr x))))))
              (display "(?:" out) (for-each lp (cdr x)) (display ")" out))
             (else (lp (cadr x))))
            (display (car x) out))
diff --git a/tests/test-irregex.scm b/tests/test-irregex.scm
index 0888f09b..8c0464ad 100644
--- a/tests/test-irregex.scm
+++ b/tests/test-irregex.scm
@@ -419,6 +419,12 @@
   (test-equal "***x***"
       (irregex-replace/all
        (irregex '(: #\space) 'dfa) "   x   " "*"))
+  (test-equal "A:42"
+      (irregex-replace/all "^" "42" "A:"))
+  (test-equal "A:42"
+      (irregex-replace/all 'bos "42" "A:"))
+  (test-equal "A:42"
+      (irregex-replace/all 'bol "42" "A:"))
   (test-equal "xaac"
       (irregex-replace/all
        (irregex '(or (seq bos "a") (seq bos "b")) 'backtrack) "aaac" "x"))
@@ -458,6 +464,15 @@
   )
 
 
+(test-group "parsing"
+  (test-equal "c+" (sre->string '(+ "c")))
+  (test-equal "(?:abc)+" (sre->string '(+ "abc")))
+  (test-equal "(?:abc|def)+" (sre->string '(+ (or "abc" "def"))))
+  (test-equal '(+ #\c) (string->sre "c+"))
+  (test-equal '(+ "abc") (string->sre "(?:abc)+"))
+  (test-equal '(+ (or "abc" "def")) (string->sre "(?:abc|def)+"))
+  )
+
 (define (extract name irx str)
   (irregex-match-substring (irregex-match irx str) name))
 (define (valid? name irx str)
-- 
2.42.0

Reply via email to