gbranden pushed a commit to branch master
in repository groff.
commit d29abf70a02ca516ee692bff818b884864da2f31
Author: G. Branden Robinson <[email protected]>
AuthorDate: Sat Aug 24 22:55:55 2024 -0500
[libgroff]: Diagnose invalid Unicode escape sequences.
Modify the `valid_unicode_code_sequence()` function to take an optional
second parameter, a pointer to a character buffer in which an error
message is stored if the character sequence in the first argument is
invalid.
Declare a new constant, `ERRBUFSZ`, to help any caller allocate
sufficient memory to hold any such generated message.
* src/include/unicode.h:
* src/libs/libgroff/unicode.cpp (valid_unicode_code_sequence): Do it.
Also squawk about use of lowercase hexadecimal digits in Unicode
special character identifiers, as these are invalid in groff.
---
ChangeLog | 15 ++++++++++++
src/include/unicode.h | 9 ++++++-
src/libs/libgroff/unicode.cpp | 56 ++++++++++++++++++++++++++++++++++---------
3 files changed, 68 insertions(+), 12 deletions(-)
diff --git a/ChangeLog b/ChangeLog
index 08692a71e..c48a9ddf0 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,18 @@
+2024-08-24 G. Branden Robinson <[email protected]>
+
+ [libgroff]: Modify `valid_unicode_code_sequence()` function to
+ take optional second parameter, a pointer to a character buffer
+ in which an error message is stored if the character sequence in
+ the first argument is invalid. Declare new constant `ERRBUFSZ`
+ to help any caller allocate sufficient memory to hold any such
+ generated message.
+
+ * src/include/unicode.h:
+ * src/libs/libgroff/unicode.cpp (valid_unicode_code_sequence):
+ Do it. Also squawk about use of lowercase hexadecimal digits in
+ Unicode special character identifiers, as these are invalid in
+ groff.
+
2024-08-25 G. Branden Robinson <[email protected]>
* src/roff/troff/input.cpp (encode_char_for_device_output)
diff --git a/src/include/unicode.h b/src/include/unicode.h
index cea39c233..04138b588 100644
--- a/src/include/unicode.h
+++ b/src/include/unicode.h
@@ -65,7 +65,14 @@ const char *decompose_unicode(const char *);
//
// Return a pointer to the second character in the string (skipping the
// leading 'u') if successful, and a null pointer otherwise.
-const char *valid_unicode_code_sequence(const char *);
+//
+// If given a second argument, store a diagnostic message there if the
+// above rules are not satisfied.
+const char *valid_unicode_code_sequence(const char *,
+ char * /* errbuf */ = 0 /* nullptr */);
+
+// valid_unicode_code_sequence() writes to an error message buffer.
+const size_t ERRBUFSZ = 256;
// Local Variables:
// fill-column: 72
diff --git a/src/libs/libgroff/unicode.cpp b/src/libs/libgroff/unicode.cpp
index 351e0294c..675709a6b 100644
--- a/src/libs/libgroff/unicode.cpp
+++ b/src/libs/libgroff/unicode.cpp
@@ -1,4 +1,4 @@
-/* Copyright (C) 2002-2020 Free Software Foundation, Inc.
+/* Copyright (C) 2002-2024 Free Software Foundation, Inc.
Written by Werner Lemberg <[email protected]>
This file is part of groff.
@@ -22,40 +22,74 @@ along with this program. If not, see
<http://www.gnu.org/licenses/>. */
#include "unicode.h"
-const char *valid_unicode_code_sequence(const char *u)
+const char *valid_unicode_code_sequence(const char *u, char *errbuf)
{
- if (*u != 'u')
+ if (errbuf != 0 /* nullptr */)
+ (void) memset(errbuf, '\0', ERRBUFSZ);
+ if (*u != 'u') {
+ if (errbuf != 0 /* nullptr */)
+ snprintf(errbuf, ERRBUFSZ, "Unicode special character sequence"
+ " lacks 'u' as first character\n");
return 0 /* nullptr */;
+ }
const char *p = ++u;
for (;;) {
int val = 0;
const char *start = p;
for (;;) {
// only uppercase hex digits allowed
- if (!csxdigit(*p))
+ if (!csxdigit(*p)) {
+ if (errbuf != 0 /* nullptr */)
+ snprintf(errbuf, ERRBUFSZ, "Unicode special character"
+ " sequence has non-hexadecimal digit '%c'\n", *p);
return 0 /* nullptr */;
+ }
if (csdigit(*p))
val = val*0x10 + (*p-'0');
else if (csupper(*p))
val = val*0x10 + (*p-'A'+10);
- else
+ else if ((*p >= 'a') && (*p <= 'f')) {
+ if (errbuf != 0 /* nullptr */)
+ snprintf(errbuf, ERRBUFSZ, "Unicode special character"
+ " sequence must use uppercase hexadecimal digit, not"
+ " '%c'\n", *p);
return 0 /* nullptr */;
+ }
+ else {
+ assert(0 == "unhandled hexadecimal digit character");
+ }
// biggest Unicode value is U+10FFFF
- if (val > 0x10FFFF)
+ if (val > 0x10FFFF) {
+ if (errbuf != 0 /* nullptr */)
+ snprintf(errbuf, ERRBUFSZ, "Unicode special character code"
+ " point %04X is out of range (0000..10FFFF)\n", val);
return 0 /* nullptr */;
+ }
p++;
if (*p == '\0' || *p == '_')
break;
}
// surrogates not allowed
- if ((val >= 0xD800 && val <= 0xDBFF) || (val >= 0xDC00 && val <= 0xDFFF))
+ if ((val >= 0xD800 && val <= 0xDBFF)
+ || (val >= 0xDC00 && val <= 0xDFFF)) {
+ if (errbuf != 0 /* nullptr */)
+ snprintf(errbuf, ERRBUFSZ, "Unicode special character code"
+ " point %04X is a surrogate\n", val);
return 0 /* nullptr */;
- if (val > 0xFFFF) {
- if (*start == '0') // no leading zeros allowed if > 0xFFFF
- return 0 /* nullptr */;
}
- else if (p - start != 4) // otherwise, check for exactly 4 hex digits
+ const ptrdiff_t width = p - start;
+ if (width < 4) {
+ if (errbuf != 0 /* nullptr */)
+ snprintf(errbuf, ERRBUFSZ, "Unicode special character sequence"
+ " must be exactly 4 to 6 digits\n");
return 0 /* nullptr */;
+ }
+ else if ((width > 4) && ('0' == *u)) {
+ if (errbuf != 0 /* nullptr */)
+ snprintf(errbuf, ERRBUFSZ, "Unicode special character sequence"
+ " %s has invalid leading zero(es)\n", u);
+ return 0 /* nullptr */;
+ }
if (*p == '\0')
break;
p++;
_______________________________________________
Groff-commit mailing list
[email protected]
https://lists.gnu.org/mailman/listinfo/groff-commit