lloda pushed a commit to branch main
in repository guile.
commit 04799ab95ae8d845854cd7a0bbc4609ad30bc17d
Author: Rob Browning <[email protected]>
AuthorDate: Fri Apr 11 15:04:51 2025 -0500
Read srfi-207 bytestrings when (read-enable 'bytestrings)
Move the bytestring parser (read-bytestring-content) to the reader to
support reading #u8"..." bytestrings, and share it with (srfi srfi-207).
Add a %boot-9-shared-internal-state hash table and use it to share
"bindings" between read.scm, (ice-9 exceptions), and srfi-207 (e.g. the
bytestring reader) without additional clutter in (guile).
* am/bootstrap.am: Remove parse.scm from srfi-207 deps.
* doc/ref/api-evaluation.texi: add bytestrings read option.
* doc/ref/srfi-modules.texi: mention bytestrings read option.
* libguile/private-options.h: Add SCM_PRINT_BYTESTRINGS_P.
* libguile/read.c (scm_read_opts): add bytestrings read option.
* module/ice-9/boot-9.scm (%boot-9-shared-internal-state): Add hash
table; populate with &message, &irritants, &bytestring-error, and
bytestring-error so read.scm, (ice-9 exceptions), and srfi-207 can share
them.
* module/ice-9/exceptions.scm: Get &message and &irritants from
%boot-9-shared-internal-state.
* module/ice-9/read.scm: Move read-bytestring-content here and use it to
read bytestrings when they're enabled; get &bytestring-error from
%boot-9-shared-internal-state and store read-bytestring-contents there.
* module/srfi/srfi-207.scm: Get &bytestring-error, bytestring-error, and
read-bytestring-contents from %boot-9-shared-internal-state, and move
read-textual-bytestring here from srfi-207/upstream/parse.scm.
* module/srfi/srfi-207/upstream/parse.scm: Drop in favor of read.scm
bytestring reader.
* test-suite/tests/srfi-207.test: Add bytestring read tests.
---
doc/ref/api-evaluation.texi | 1 +
doc/ref/srfi-modules.texi | 2 +
libguile/private-options.h | 4 +-
libguile/read.c | 3 +
module/ice-9/boot-9.scm | 29 +++++++++
module/ice-9/exceptions.scm | 8 ++-
module/ice-9/read.scm | 138 ++++++++++++++++++++++++++++++++++++++---
module/srfi/srfi-207.scm | 124 ++++--------------------------------
test-suite/tests/srfi-207.test | 45 +++++++++++++-
9 files changed, 228 insertions(+), 126 deletions(-)
diff --git a/doc/ref/api-evaluation.texi b/doc/ref/api-evaluation.texi
index 68bf38e54..1b764ba53 100644
--- a/doc/ref/api-evaluation.texi
+++ b/doc/ref/api-evaluation.texi
@@ -346,6 +346,7 @@ hungry-eol-escapes no In strings, consume leading
whitespace after an
escaped end-of-line.
curly-infix no Support SRFI-105 curly infix expressions.
r7rs-symbols no Support R7RS |...| symbol notation.
+bytestrings no Support SRFI-207 #u8"\xce;\xbb; calculus" bytestrings
@end smalllisp
Note that Guile also includes a preliminary mechanism for setting read
diff --git a/doc/ref/srfi-modules.texi b/doc/ref/srfi-modules.texi
index e003edbc0..6a07510bf 100644
--- a/doc/ref/srfi-modules.texi
+++ b/doc/ref/srfi-modules.texi
@@ -7426,6 +7426,8 @@ bytestrings and bytevectors are exactly the same type.
@cindex bytestring notation
The basic form of a string-notated bytevector is @code{#u8"CONTENT"}.
+The Scheme reader will read them if bytestrings are enabled via
+@code{(read-enable 'bytestrings)}.
To avoid character encoding issues within string-notated bytevectors,
only printable ASCII characters (that is, Unicode codepoints in the
diff --git a/libguile/private-options.h b/libguile/private-options.h
index 31f4c0ee4..9018532c0 100644
--- a/libguile/private-options.h
+++ b/libguile/private-options.h
@@ -64,7 +64,7 @@ SCM_INTERNAL scm_t_option scm_read_opts[];
#define SCM_HUNGRY_EOL_ESCAPES_P scm_read_opts[5].val
#define SCM_CURLY_INFIX_P scm_read_opts[6].val
#define SCM_R7RS_SYMBOLS_P scm_read_opts[7].val
-
-#define SCM_N_READ_OPTIONS 8
+#define SCM_READ_BYTESTRINGS_P scm_read_opts[8].val
+#define SCM_N_READ_OPTIONS 9
#endif /* PRIVATE_OPTIONS */
diff --git a/libguile/read.c b/libguile/read.c
index 3030b27ed..accd347a3 100644
--- a/libguile/read.c
+++ b/libguile/read.c
@@ -90,6 +90,9 @@ scm_t_option scm_read_opts[] =
"Support SRFI-105 curly infix expressions."},
{ SCM_OPTION_BOOLEAN, "r7rs-symbols", 0,
"Support R7RS |...| symbol notation."},
+ { SCM_OPTION_BOOLEAN, "bytestrings", 0,
+ "Read bytestrings (SRFI 207), "
+ "e.g. #u8\"\\xe2;\\x88;\\x9e; Improbability\"." },
{ 0, },
};
diff --git a/module/ice-9/boot-9.scm b/module/ice-9/boot-9.scm
index aaa998702..d6195866a 100644
--- a/module/ice-9/boot-9.scm
+++ b/module/ice-9/boot-9.scm
@@ -46,6 +46,15 @@
+;;; {Shared internal state}
+;;;
+;;; Avoids namespace clutter for things that currently can't go in a
+;;; module.
+
+(define %boot-9-shared-internal-state (make-hash-table))
+
+
+
;;; {Language primitives}
;;;
@@ -1559,6 +1568,26 @@ exception that is an instance of @var{rtd}."
(define &non-continuable
(make-exception-type '&non-continuable &programming-error '()))
+ ;; These need to be shared by read.scm, (ice-9 exceptions), and
+ ;; srfi-207, and for now, for example, we can't load exceptions here.
+ (let* ((&message (make-exception-type '&message &exception '(message)))
+ (&irritants (make-exception-type '&irritants &exception '(irritants)))
+ (&bytestring-error (make-exception-type '&bytestring-error &error
'()))
+ (make-bytestring (record-constructor &bytestring-error))
+ (make-message (record-constructor &message))
+ (make-irritants (record-constructor &irritants)))
+ (define (bytestring-error message . irritants)
+ (raise-exception (make-exception (make-bytestring)
+ (make-message message)
+ (make-irritants irritants))))
+ ;; Needed by (ice-9 exceptions)
+ (hashq-set! %boot-9-shared-internal-state '&message &message)
+ (hashq-set! %boot-9-shared-internal-state '&irritants &irritants)
+ ;; Needed by srfi-207
+ (hashq-set! %boot-9-shared-internal-state '&bytestring-error
&bytestring-error)
+ ;; Needed by read.scm and srfi-207
+ (hashq-set! %boot-9-shared-internal-state 'bytestring-error
bytestring-error))
+
;; Boot definition; overridden later.
(define-values* (make-exception-from-throw)
(define make-exception-with-kind-and-args
diff --git a/module/ice-9/exceptions.scm b/module/ice-9/exceptions.scm
index 143e7aa3e..12113eb1f 100644
--- a/module/ice-9/exceptions.scm
+++ b/module/ice-9/exceptions.scm
@@ -136,8 +136,9 @@
(define-exception-type &assertion-failure &programming-error
make-assertion-failure assertion-failure?)
-(define-exception-type &message &exception
- make-exception-with-message exception-with-message?
+(define &message (hashq-ref %boot-9-shared-internal-state '&message))
+(define-exception-type-procedures &message &exception
+ make-exception-with-message exception-with-message?
(message exception-message))
(define-exception-type &warning &exception
@@ -146,7 +147,8 @@
(define-exception-type &external-error &error
make-external-error external-error?)
-(define-exception-type &irritants &exception
+(define &irritants (hashq-ref %boot-9-shared-internal-state '&irritants))
+(define-exception-type-procedures &irritants &exception
make-exception-with-irritants exception-with-irritants?
(irritants exception-irritants))
diff --git a/module/ice-9/read.scm b/module/ice-9/read.scm
index 283933064..d0b2309b7 100644
--- a/module/ice-9/read.scm
+++ b/module/ice-9/read.scm
@@ -54,7 +54,8 @@
(define bitfield:hungry-eol-escapes? 10)
(define bitfield:curly-infix? 12)
(define bitfield:r7rs-symbols? 14)
-(define read-option-bits 16)
+(define bitfield:bytestrings? 16)
+(define read-option-bits 18)
(define read-option-mask #b11)
(define read-option-inherit #b11)
@@ -87,7 +88,8 @@
(bool 'square-brackets bitfield:square-brackets?)
(bool 'hungry-eol-escapes bitfield:hungry-eol-escapes?)
(bool 'curly-infix bitfield:curly-infix?)
- (bool 'r7rs-symbols bitfield:r7rs-symbols?))))
+ (bool 'r7rs-symbols bitfield:r7rs-symbols?)
+ (bool 'bytestrings bitfield:bytestrings?))))
(define (set-option options field new)
(logior (ash new field) (logand options (lognot (ash #b11 field)))))
@@ -114,6 +116,7 @@
(define (hungry-eol-escapes?) (enabled? bitfield:hungry-eol-escapes?))
(define (curly-infix?) (enabled? bitfield:curly-infix?))
(define (r7rs-symbols?) (enabled? bitfield:r7rs-symbols?))
+ (define (bytestrings?) (enabled? bitfield:bytestrings?))
(define neoteric 0)
(define (next) (read-char port))
(define (peek) (peek-char port))
@@ -199,9 +202,9 @@
(strip-annotation (car tail)))
(cons* op x (cdr tail))))))))))
(cond
- ((not (eqv? rdelim #\})) ret) ; Only on {...} lists.
- ((not (pair? ret)) ret) ; {} => (); {.x} => x
- ((null? (cdr ret)) (car ret)); {x} => x
+ ((not (eqv? rdelim #\})) ret) ; Only on {...} lists.
+ ((not (pair? ret)) ret) ; {} => (); {.x} => x
+ ((null? (cdr ret)) (car ret)) ; {x} => x
((and (pair? (cdr ret)) (null? (cddr ret))) ret) ; {x y} => (x y)
((extract-infix-list ret)) ; {x + y + ... + z} => (+ x y ... z)
(else (cons '$nfx$ ret)))) ; {x y . z} => ($nfx$ x y . z)
@@ -403,6 +406,18 @@
(define (read-srfi-4-vector ch)
(read-array ch))
+ (define (read-srfi-4-vector-or-bytestring)
+ (cond
+ ((not (bytestrings?)) (read-array #\u))
+ ((not (eqv? (peek) #\8)) (read-array #\u))
+ (else
+ (next)
+ (if (eqv? (peek) #\")
+ (read-bytestring-content port)
+ (begin
+ (unread-char #\8 port)
+ (read-array #\u))))))
+
(define (maybe-read-boolean-tail tail)
(let ((len (string-length tail)))
(let lp ((i 0))
@@ -426,6 +441,112 @@
(maybe-read-boolean-tail "alse")
#f))))
+ (define bytestring-error
+ (hashq-ref %boot-9-shared-internal-state 'bytestring-error))
+
+ (define (read-bytestring-content port)
+ ;; Must use port, not (peek)/(next).
+ (let ((ch (read-char port)))
+ (when (eof-object? ch)
+ (bytestring-error "end of input instead of bytestring opening #\\\""))
+ (unless (eqv? ch #\")
+ (bytestring-error "expected bytestring opening #\\\"" ch)))
+ (let lp ((out '()))
+ (let ((ch (read-char port)))
+ (cond
+ ((eof-object? ch)
+ (bytestring-error "unexpected end of input while reading
bytestring"))
+ ((eqv? ch #\")
+ (list->typed-array 'vu8 1 (reverse! out)))
+ ((eqv? ch #\\)
+ (let* ((ch (read-char port)))
+ (when (eof-object? ch)
+ (bytestring-error "unexpected end of input within escape
sequence"))
+ (case ch
+ ((#\a) (lp (cons 7 out)))
+ ((#\b) (lp (cons 8 out)))
+ ((#\t) (lp (cons 9 out)))
+ ((#\n) (lp (cons 10 out)))
+ ((#\r) (lp (cons 13 out)))
+ ((#\") (lp (cons 34 out)))
+ ((#\\) (lp (cons 92 out)))
+ ((#\|) (lp (cons 124 out)))
+ ((#\x)
+ (define (skip-prefix-zeros)
+ ;; Leave one zero before a ; to handle \x0;
+ (let ((ch (peek-char port)))
+ (cond
+ ((eof-object? ch) ch)
+ ((char=? ch #\0)
+ (let ((zero (read-char port)))
+ (if (char=? (peek-char port) #\;)
+ (unread-char zero port)
+ (skip-prefix-zeros)))))))
+ (define (read-hex which)
+ (let* ((h (read-char port)))
+ (when (eof-object? h)
+ (bytestring-error
+ (format #f "end of input at ~s bytestring hex escape
char" which)))
+ (case h
+ ((#\;) h)
+ ((#\0) 0)
+ ((#\1) 1)
+ ((#\2) 2)
+ ((#\3) 3)
+ ((#\4) 4)
+ ((#\5) 5)
+ ((#\6) 6)
+ ((#\7) 7)
+ ((#\8) 8)
+ ((#\9) 9)
+ ((#\a #\A) 10)
+ ((#\b #\B) 11)
+ ((#\c #\C) 12)
+ ((#\d #\D) 13)
+ ((#\e #\E) 14)
+ ((#\f #\F) 15)
+ (else
+ (bytestring-error
+ (format #f "non-hex ~a character in bytestring hex
escape" which)
+ h)))))
+ (skip-prefix-zeros)
+ (let* ((h1 (read-hex "first"))
+ (h2 (read-hex "second")))
+ (if (eqv? h2 #\;)
+ (lp (cons h1 out))
+ (let ((term (read-char port)))
+
+ (unless (char=? term #\;)
+ (bytestring-error "not bytestring hex escape
semicolon" term))
+ (lp (cons (+ (* 16 h1) h2) out))))))
+ (else ;; newline surrounded by optional interline blanks
+ (define (intraline? ch)
+ (and (char-whitespace? ch) (not (char=? ch #\newline))))
+ (define (skip-intraline)
+ (let ((ch (peek-char port)))
+ (when (and (not (eof-object? ch)) (intraline? ch))
+ (read-char port)
+ (skip-intraline))))
+ (cond
+ ((char=? ch #\newline) (skip-intraline) (lp out))
+ ((char-whitespace? ch)
+ (skip-intraline)
+ (unless (char=? (read-char port) #\newline)
+ (bytestring-error "expected newline after backslash and
optional spaces" ch))
+ (skip-intraline)
+ (lp out))
+ (else
+ (bytestring-error "unexpected character after bytesstring
backslash" ch)))))))
+ (else
+ (let ((i (char->integer ch)))
+ (unless (<= 20 i 127)
+ (bytestring-error "bytestring char not in valid ASCII range" ch))
+ (lp (cons i out))))))))
+
+ ;; For srfi-207
+ (hashq-set! %boot-9-shared-internal-state
+ 'read-bytestring-content read-bytestring-content)
+
(define (read-bytevector)
(define (expect ch)
(unless (eqv? (next) ch)
@@ -512,8 +633,8 @@
(error "unexpected end of input while reading array"))
(values ch
(if len
- (list lbnd (+ lbnd (1- len)))
- lbnd))))
+ (list lbnd (+ lbnd (1- len)))
+ lbnd))))
(define (read-shape ch alt)
(if (memv ch '(#\@ #\:))
(let*-values (((ch head) (read-dimension ch))
@@ -602,7 +723,8 @@
(case ch
((#\\) (read-character))
((#\() (read-vector))
- ((#\s #\u #\c) (read-srfi-4-vector ch))
+ ((#\u) (read-srfi-4-vector-or-bytestring))
+ ((#\s #\c) (read-srfi-4-vector ch))
((#\f) (read-false-or-srfi-4-vector))
((#\v) (read-bytevector))
((#\*) (read-bitvector))
diff --git a/module/srfi/srfi-207.scm b/module/srfi/srfi-207.scm
index 091915ded..a4c8ae104 100644
--- a/module/srfi/srfi-207.scm
+++ b/module/srfi/srfi-207.scm
@@ -23,11 +23,6 @@
;;; Code:
(define-module (srfi srfi-207)
- #:use-module ((ice-9 exceptions)
- #:select (&error
- define-exception-type
- make-exception-with-message
- make-exception-with-irritants))
#:use-module ((rnrs arithmetic bitwise) #:select (bitwise-and bitwise-ior))
#:use-module ((rnrs bytevectors)
#:select (bytevector->u8-list string->utf8
u8-list->bytevector))
@@ -87,20 +82,24 @@
(cond-expand-provide (current-module) '(srfi-207))
+;; This awkwardness is because read.scm (not a module, included via
+;; boot-9) also needs to be able to read bytestrings.
+(define &bytestring-error
+ (hashq-ref %boot-9-shared-internal-state '&bytestring-error))
+(define bytestring-error
+ (hashq-ref %boot-9-shared-internal-state 'bytestring-error))
+(define read-bytestring-content
+ (hashq-ref %boot-9-shared-internal-state 'read-bytestring-content))
+
+(define bytestring-error? (exception-predicate &bytestring-error))
+
;; From the upstream 207.sld library definition
(define-syntax assume
(syntax-rules ()
((_ pred) (unless pred (error "invalid assumption:" (quote pred))))
((_ pred msg ...) (unless pred (error msg ...)))))
-(define-exception-type &bytestring-error &error
- make-bytestring-error bytestring-error?)
-
-(define (bytestring-error message . irritants)
- (raise-exception (make-exception (make-bytestring-error)
- (make-exception-with-message message)
- (make-exception-with-irritants irritants))))
-
+(include-from-path "ice-9/read/bytestring.scm")
(include-from-path "srfi/srfi-207/upstream/base64.scm")
(include-from-path "srfi/srfi-207/upstream/bytestrings-impl.scm")
@@ -143,105 +142,6 @@
(make-bytestring! result 0 parts)
result))
-(define (read-bytestring-content port)
- ;; Must use port, not (peek)/(next).
- (let ((ch (read-char port)))
- (when (eof-object? ch)
- (bytestring-error "end of input instead of bytestring opening #\\\""))
- (unless (eqv? ch #\")
- (bytestring-error "expected bytestring opening #\\\"" ch)))
- (let lp ((out '()))
- (let ((ch (read-char port)))
- (cond
- ((eof-object? ch)
- (bytestring-error "unexpected end of input while reading bytestring"))
- ((eqv? ch #\")
- (list->typed-array 'vu8 1 (reverse! out)))
- ((eqv? ch #\\)
- (let* ((ch (read-char port)))
- (when (eof-object? ch)
- (bytestring-error "unexpected end of input within escape
sequence"))
- (case ch
- ((#\a) (lp (cons 7 out)))
- ((#\b) (lp (cons 8 out)))
- ((#\t) (lp (cons 9 out)))
- ((#\n) (lp (cons 10 out)))
- ((#\r) (lp (cons 13 out)))
- ((#\") (lp (cons 34 out)))
- ((#\\) (lp (cons 92 out)))
- ((#\|) (lp (cons 124 out)))
- ((#\x)
- (define (skip-prefix-zeros)
- ;; Leave one zero before a ; to handle \x0;
- (let ((ch (peek-char port)))
- (cond
- ((eof-object? ch) ch)
- ((char=? ch #\0)
- (let ((zero (read-char port)))
- (if (char=? (peek-char port) #\;)
- (unread-char zero port)
- (skip-prefix-zeros)))))))
- (define (read-hex which)
- (let* ((h (read-char port)))
- (when (eof-object? h)
- (bytestring-error
- (format #f "end of input at ~s bytestring hex escape char"
which)))
- (case h
- ((#\;) h)
- ((#\0) 0)
- ((#\1) 1)
- ((#\2) 2)
- ((#\3) 3)
- ((#\4) 4)
- ((#\5) 5)
- ((#\6) 6)
- ((#\7) 7)
- ((#\8) 8)
- ((#\9) 9)
- ((#\a #\A) 10)
- ((#\b #\B) 11)
- ((#\c #\C) 12)
- ((#\d #\D) 13)
- ((#\e #\E) 14)
- ((#\f #\F) 15)
- (else
- (bytestring-error
- (format #f "non-hex ~a character in bytestring hex
escape" which)
- h)))))
- (skip-prefix-zeros)
- (let* ((h1 (read-hex "first"))
- (h2 (read-hex "second")))
- (if (eqv? h2 #\;)
- (lp (cons h1 out))
- (let ((term (read-char port)))
-
- (unless (char=? term #\;)
- (bytestring-error "not bytestring hex escape semicolon"
term))
- (lp (cons (+ (* 16 h1) h2) out))))))
- (else ;; newline surrounded by optional interline blanks
- (define (intraline? ch)
- (and (char-whitespace? ch) (not (char=? ch #\newline))))
- (define (skip-intraline)
- (let ((ch (peek-char port)))
- (when (and (not (eof-object? ch)) (intraline? ch))
- (read-char port)
- (skip-intraline))))
- (cond
- ((char=? ch #\newline) (skip-intraline) (lp out))
- ((char-whitespace? ch)
- (skip-intraline)
- (unless (char=? (read-char port) #\newline)
- (bytestring-error "expected newline after backslash and
optional spaces" ch))
- (skip-intraline)
- (lp out))
- (else
- (bytestring-error "unexpected character after bytesstring
backslash" ch)))))))
- (else
- (let ((i (char->integer ch)))
- (unless (<= 20 i 127)
- (bytestring-error "bytestring char not in valid ASCII range" ch))
- (lp (cons i out))))))))
-
(define read-textual-bytestring
(case-lambda
((prefix) (read-textual-bytestring prefix (current-input-port)))
diff --git a/test-suite/tests/srfi-207.test b/test-suite/tests/srfi-207.test
index 8f21b1d73..735e92cd7 100644
--- a/test-suite/tests/srfi-207.test
+++ b/test-suite/tests/srfi-207.test
@@ -23,6 +23,7 @@
;;; SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
(define-module (srfi-207-test)
+ #:use-module ((rnrs bytevectors) #:select (string->utf8 u8-list->bytevector))
#:use-module ((srfi srfi-1) #:select (every list-tabulate))
#:use-module ((srfi srfi-207)
#:select (base64->bytevector
@@ -56,7 +57,10 @@
write-textual-bytestring))
#:use-module ((srfi srfi-34) #:select (guard))
#:use-module ((test-suite lib)
- #:select (pass-if-equal pass-if-exception with-test-prefix))
+ #:select (exception:read-error
+ pass-if-equal
+ pass-if-exception
+ with-test-prefix))
#:use-module ((scheme base)
#:select (bytevector
bytevector-length
@@ -434,6 +438,7 @@
test-bstrings)
=> #t)))
+;; All the reference implementation tests
(define (check-all)
(check-constructor)
(check-conversion)
@@ -445,3 +450,41 @@
(check-io))
(check-all)
+
+(let ((all-decoded (u8-list->bytevector (iota 256))))
+
+ (define (hex-esc i) (format #f "\\x~x;" i))
+ (define (hex-escs start end)
+ (string-concatenate (map hex-esc (iota (1+ (- end start)) start))))
+ (define (ascii start end)
+ (list->string (map integer->char (iota (1+ (- end start)) start))))
+ (define all-encoded
+ (string-append "#u8\""
+ (hex-escs 0 6)
+ "\\a\\b\\t\\n\\xb;\\xc;\\r"
+ (hex-escs 14 31)
+ " !"
+ "\\\""
+ (ascii 35 91)
+ "\\\\"
+ (ascii 93 123)
+ "\\|"
+ (ascii 125 126)
+ (hex-escs 127 255)
+ "\""))
+ (define (read-with-bytestrings-enabled s)
+ (call-with-input-string s
+ (λ (port)
+ (let ((keep (memq 'bytestrings (read-options))))
+ (dynamic-wind (λ () (read-enable 'bytestrings))
+ (λ () (read port))
+ (λ () (unless keep (read-disable 'bytestrings))))))))
+
+ (pass-if-exception "reading when not enabled" exception:read-error
+ (call-with-input-string "#u8\"\\xe2;\\x88;\\x9e; Improbability\"" read))
+ (pass-if-equal "reading when enabled"
+ (string->utf8 "∞ Improbability")
+ (read-with-bytestrings-enabled "#u8\"\\xe2;\\x88;\\x9e; Improbability\""))
+ (pass-if-equal "reading all encodings"
+ all-decoded
+ (read-with-bytestrings-enabled all-encoded)))