[groff] 20/28: [libgroff]: Diagnose invalid Unicode escape seqs.

G. Branden Robinson Sun, 25 Aug 2024 20:04:15 -0700

gbranden pushed a commit to branch master
in repository groff.

commit d29abf70a02ca516ee692bff818b884864da2f31
Author: G. Branden Robinson <[email protected]>
AuthorDate: Sat Aug 24 22:55:55 2024 -0500


    [libgroff]: Diagnose invalid Unicode escape seqs.
    
    Modify the `valid_unicode_code_sequence()` function to take an
    optional second parameter, a pointer to a character buffer in which
    an error message is stored if the character sequence in the first
    argument is invalid.  Declare a new constant, `ERRBUFSZ`, to help
    callers allocate sufficient memory to hold any such generated
    message.
    
    * src/include/unicode.h:
    * src/libs/libgroff/unicode.cpp (valid_unicode_code_sequence): Do it.
      Also squawk about use of lowercase hexadecimal digits in Unicode
      special character identifiers, as these are invalid in groff.
---
 ChangeLog                     | 15 ++++++++++++
 src/include/unicode.h         |  9 ++++++-
 src/libs/libgroff/unicode.cpp | 56 ++++++++++++++++++++++++++++++++++---------
 3 files changed, 68 insertions(+), 12 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index 08692a71e..c48a9ddf0 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,18 @@
+2024-08-24  G. Branden Robinson <[email protected]>
+
+       [libgroff]: Modify `valid_unicode_code_sequence()` function to
+       take optional second parameter, a pointer to a character buffer
+       in which an error message is stored if the character sequence in
+       the first argument is invalid.  Declare new constant `ERRBUFSZ`
+       to help any caller allocate sufficient memory to hold any such
+       generated message.
+
+       * src/include/unicode.h:
+       * src/libs/libgroff/unicode.cpp (valid_unicode_code_sequence):
+       Do it.  Also squawk about use of lowercase hexadecimal digits in
+       Unicode special character identifiers, as these are invalid in
+       groff.
+
 2024-08-25  G. Branden Robinson <[email protected]>
 
        * src/roff/troff/input.cpp (encode_char_for_device_output)
diff --git a/src/include/unicode.h b/src/include/unicode.h
index cea39c233..04138b588 100644
--- a/src/include/unicode.h
+++ b/src/include/unicode.h
@@ -65,7 +65,14 @@ const char *decompose_unicode(const char *);
 //
 // Return a pointer to the second character in the string (skipping the
 // leading 'u') if successful, and a null pointer otherwise.
-const char *valid_unicode_code_sequence(const char *);
+//
+// If given a second argument, store a diagnostic message there if the
+// above rules are not satisfied.
+const char *valid_unicode_code_sequence(const char *,
+  char * /* errbuf */ = 0 /* nullptr */);
+
+// valid_unicode_code_sequence() writes to an error message buffer.
+const size_t ERRBUFSZ = 256;
 
 // Local Variables:
 // fill-column: 72
diff --git a/src/libs/libgroff/unicode.cpp b/src/libs/libgroff/unicode.cpp
index 351e0294c..675709a6b 100644
--- a/src/libs/libgroff/unicode.cpp
+++ b/src/libs/libgroff/unicode.cpp
@@ -1,4 +1,4 @@
-/* Copyright (C) 2002-2020 Free Software Foundation, Inc.
+/* Copyright (C) 2002-2024 Free Software Foundation, Inc.
      Written by Werner Lemberg <[email protected]>
 
 This file is part of groff.
@@ -22,40 +22,74 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>. */
 
 #include "unicode.h"
 
-const char *valid_unicode_code_sequence(const char *u)
+const char *valid_unicode_code_sequence(const char *u, char *errbuf)
 {
-  if (*u != 'u')
+  if (errbuf != 0 /* nullptr */)
+    (void) memset(errbuf, '\0', ERRBUFSZ);
+  if (*u != 'u') {
+    if (errbuf != 0 /* nullptr */)
+      snprintf(errbuf, ERRBUFSZ, "Unicode special character sequence"
+              " lacks 'u' as first character\n");
     return 0 /* nullptr */;
+  }
   const char *p = ++u;
   for (;;) {
     int val = 0;
     const char *start = p;
     for (;;) {
       // only uppercase hex digits allowed
-      if (!csxdigit(*p))
+      if (!csxdigit(*p)) {
+       if (errbuf != 0 /* nullptr */)
+         snprintf(errbuf, ERRBUFSZ, "Unicode special character"
+                  " sequence has non-hexadecimal digit '%c'\n", *p);
        return 0 /* nullptr */;
+      }
       if (csdigit(*p))
        val = val*0x10 + (*p-'0');
       else if (csupper(*p))
        val = val*0x10 + (*p-'A'+10);
-      else
+      else if ((*p >= 'a') && (*p <= 'f')) {
+       if (errbuf != 0 /* nullptr */)
+         snprintf(errbuf, ERRBUFSZ, "Unicode special character"
+               " sequence must use uppercase hexadecimal digit, not"
+               " '%c'\n", *p);
        return 0 /* nullptr */;
+      }
+      else {
+       assert(0 == "unhandled hexadecimal digit character");
+      }
       // biggest Unicode value is U+10FFFF
-      if (val > 0x10FFFF)
+      if (val > 0x10FFFF) {
+       if (errbuf != 0 /* nullptr */)
+         snprintf(errbuf, ERRBUFSZ, "Unicode special character code"
+                  " point %04X is out of range (0000..10FFFF)\n", val);
        return 0 /* nullptr */;
+      }
       p++;
       if (*p == '\0' || *p == '_')
        break;
     }
     // surrogates not allowed
-    if ((val >= 0xD800 && val <= 0xDBFF) || (val >= 0xDC00 && val <= 0xDFFF))
+    if ((val >= 0xD800 && val <= 0xDBFF)
+       || (val >= 0xDC00 && val <= 0xDFFF)) {
+      if (errbuf != 0 /* nullptr */)
+       snprintf(errbuf, ERRBUFSZ, "Unicode special character code"
+                " point %04X is a surrogate\n", val);
       return 0 /* nullptr */;
-    if (val > 0xFFFF) {
-      if (*start == '0')       // no leading zeros allowed if > 0xFFFF
-       return 0 /* nullptr */;
     }
-    else if (p - start != 4)   // otherwise, check for exactly 4 hex digits
+    const ptrdiff_t width = p - start;
+    if (width < 4) {
+      if (errbuf != 0 /* nullptr */)
+       snprintf(errbuf, ERRBUFSZ, "Unicode special character sequence"
+                " must be exactly 4 to 6 digits\n");
       return 0 /* nullptr */;
+    }
+    else if ((width > 4) && ('0' == *u)) {
+      if (errbuf != 0 /* nullptr */)
+       snprintf(errbuf, ERRBUFSZ, "Unicode special character sequence"
+                " %s has invalid leading zero(es)\n", u);
+      return 0 /* nullptr */;
+    }
     if (*p == '\0')
       break;
     p++;

_______________________________________________
Groff-commit mailing list
[email protected]
https://lists.gnu.org/mailman/listinfo/groff-commit

[groff] 20/28: [libgroff]: Diagnose invalid Unicode escape seqs.

Reply via email to