native UTF-8 and ISO-8859-1 input support for mandoc(1)

Ingo Schwarze Sun, 26 Oct 2014 12:56:37 -0700

Hi,

you probably know that mandoc(1) has been providing a -Tutf8 *output*
mode for more than three years now.  To *input* non-ASCII characters,
however, encoding them as \[uXXXX] roff(7) escape sequences, also
documented in mandoc_char(7), is required.


In ports land, many manual pages contain occasional non-ASCII
characters - even though i don't consider that a particularly smart
idea, but let's face it, those characters *are* out there.  There
are even some manual pages in ports completely written in non-latin
scripts; for these, using explicit \[uXXXX] escapes for almost every
letter would be rather impractical, so i can't really blame the
authors or translators for not doing that.  The way to read such
pages was to install the preconv(1) utility (contained in textproc/groff
and in portable mandoc) and do stunts like

  preconv -eutf8 utf8_manual_file | mandoc -Tutf8 | less

I doubt many people did that.

The patch below integrates the preconv(1) code, written by kristaps@
in 2011, into mandoc(1), hooking it into the input reading module,
doing the necessary UTF-8 to \[uXXXX] encoding on the fly when
encountering non-ASCII characters.  It also does some simple encoding
autodetection such that you will hopefully almost never need the -K
command line option borrowed from groff(1) to specify the input
encoding manually.

There are three reasons for doing all that:

 * For the average user using the default configuration, that is,
   LC_ALL=C, show reasonable ASCII approximations of the
   occasional UTF-8 and ISO-8859-1 characters showing up in
   ports manuals instead of "??".

 * For users of LC_CTYPE=foo_BAR.UTF-8, in the above situation,
   show non-ASCII glyphs when available, again instead of "??".

 * Make life slightly easier for users reading manuals in
   languages like Russian, Japanese, Chinese, or Greek.
   Try, for example,

    $ mandoc -aTutf8 /usr/local/man/ru/man6/wesnoth.6

Nothing changes for manuals containing ASCII characters only,
in particular for base manuals.

Since this is a somewhat bigger and user-visible change, i'm
asking whether there are any concerns or comments before committing.

Thanks,
  Ingo


Index: Makefile
===================================================================
RCS file: /cvs/src/usr.bin/mandoc/Makefile,v
retrieving revision 1.82
diff -u -p -r1.82 Makefile
--- Makefile    27 Aug 2014 00:06:08 -0000      1.82
+++ Makefile    26 Oct 2014 19:05:12 -0000
@@ -7,7 +7,7 @@ CFLAGS  += -W -Wall -Wstrict-prototypes 
 DPADD += ${LIBUTIL}
 LDADD  += -lsqlite3 -lutil
 
-SRCS=  mandoc.c mandoc_aux.c read.c \
+SRCS=  mandoc.c mandoc_aux.c preconv.c read.c \
        roff.c tbl.c tbl_opts.c tbl_layout.c tbl_data.c eqn.c
 SRCS+= mdoc_macro.c mdoc.c mdoc_hash.c \
        mdoc_argv.c mdoc_validate.c lib.c att.c \
Index: apropos.1
===================================================================
RCS file: /cvs/src/usr.bin/mandoc/apropos.1,v
retrieving revision 1.27
diff -u -p -r1.27 apropos.1
--- apropos.1   3 Sep 2014 05:17:08 -0000       1.27
+++ apropos.1   26 Oct 2014 19:05:12 -0000
@@ -79,7 +79,7 @@ to paginate them.
 In
 .Fl a
 mode, the options
-.Fl IOTW
+.Fl IKOTW
 described in the
 .Xr mandoc 1
 manual are also available.
Index: libmandoc.h
===================================================================
RCS file: /cvs/src/usr.bin/mandoc/libmandoc.h,v
retrieving revision 1.30
diff -u -p -r1.30 libmandoc.h
--- libmandoc.h 16 Oct 2014 01:10:06 -0000      1.30
+++ libmandoc.h 26 Oct 2014 19:05:12 -0000
@@ -30,6 +30,12 @@ enum rofferr {
        ROFF_ERR /* badness: puke and stop */
 };
 
+struct buf {
+       char    *buf;
+       size_t   sz;
+       size_t   offs;
+};
+
 __BEGIN_DECLS
 
 struct roff;
@@ -62,6 +68,9 @@ int            man_parseln(struct man *, int, cha
 int             man_endparse(struct man *);
 int             man_addspan(struct man *, const struct tbl_span *);
 int             man_addeqn(struct man *, const struct eqn *);
+
+int             preconv_cue(const struct buf *);
+int             preconv_encode(struct buf *, struct buf *, int *);
 
 void            roff_free(struct roff *);
 struct roff    *roff_alloc(struct mparse *, int);
Index: main.c
===================================================================
RCS file: /cvs/src/usr.bin/mandoc/main.c,v
retrieving revision 1.101
diff -u -p -r1.101 main.c
--- main.c      18 Oct 2014 15:46:16 -0000      1.101
+++ main.c      26 Oct 2014 19:05:12 -0000
@@ -75,6 +75,7 @@ struct        curparse {
        char              outopts[BUFSIZ]; /* buf of output opts */
 };
 
+static int               koptions(int *, char *);
 int                      mandocdb(int, char**);
 static int               moptions(int *, char *);
 static void              mmsg(enum mandocerr, enum mandoclevel,
@@ -145,14 +146,15 @@ main(int argc, char *argv[])
        memset(&curp, 0, sizeof(struct curparse));
        curp.outtype = OUTT_ASCII;
        curp.wlevel  = MANDOCLEVEL_FATAL;
-       options = MPARSE_SO;
+       options = MPARSE_SO | MPARSE_UTF8 | MPARSE_LATIN1;
        defos = NULL;
 
        use_pager = 1;
        show_usage = 0;
        outmode = OUTMODE_DEF;
 
-       while (-1 != (c = getopt(argc, argv, "aC:cfhI:iklM:m:O:S:s:T:VW:w"))) {
+       while (-1 != (c = getopt(argc, argv,
+                       "aC:cfhI:iK:klM:m:O:S:s:T:VW:w"))) {
                switch (c) {
                case 'a':
                        outmode = OUTMODE_ALL;
@@ -188,6 +190,10 @@ main(int argc, char *argv[])
                case 'i':
                        outmode = OUTMODE_INT;
                        break;
+               case 'K':
+                       if ( ! koptions(&options, optarg))
+                               return((int)MANDOCLEVEL_BADARG);
+                       break;
                case 'k':
                        search.argmode = ARG_EXPR;
                        break;
@@ -579,6 +585,26 @@ fail:
        fprintf(stderr, "%s: %s: SYSERR: %s: %s",
            progname, file, syscall, strerror(errno));
        return(MANDOCLEVEL_SYSERR);
+}
+
+static int
+koptions(int *options, char *arg)
+{
+
+       if ( ! strcmp(arg, "utf-8")) {
+               *options |=  MPARSE_UTF8;
+               *options &= ~MPARSE_LATIN1;
+       } else if ( ! strcmp(arg, "iso-8859-1")) {
+               *options |=  MPARSE_LATIN1;
+               *options &= ~MPARSE_UTF8;
+       } else if ( ! strcmp(arg, "us-ascii")) {
+               *options &= ~(MPARSE_UTF8 | MPARSE_LATIN1);
+       } else {
+               fprintf(stderr, "%s: -K%s: Bad argument\n",
+                   progname, arg);
+               return(0);
+       }
+       return(1);
 }
 
 static int
Index: man.1
===================================================================
RCS file: /cvs/src/usr.bin/mandoc/man.1,v
retrieving revision 1.3
diff -u -p -r1.3 man.1
--- man.1       3 Sep 2014 05:17:08 -0000       1.3
+++ man.1       26 Oct 2014 19:05:12 -0000
@@ -255,7 +255,7 @@ combination.
 The
 .Nm
 utility also supports the options
-.Fl IOTW
+.Fl IKOTW
 described in the
 .Xr mandoc 1
 manual.
Index: mandoc.1
===================================================================
RCS file: /cvs/src/usr.bin/mandoc/mandoc.1,v
retrieving revision 1.63
diff -u -p -r1.63 mandoc.1
--- mandoc.1    7 Oct 2014 18:17:05 -0000       1.63
+++ mandoc.1    26 Oct 2014 19:05:12 -0000
@@ -27,6 +27,7 @@
 .Sm off
 .Op Fl I Cm os Li = Ar name
 .Sm on
+.Op Fl K Ns Ar encoding
 .Op Fl m Ns Ar format
 .Op Fl O Ns Ar option
 .Op Fl T Ns Ar output
@@ -89,6 +90,31 @@ macro.
 Display only the SYNOPSIS lines.
 Implies
 .Fl a .
+.It Fl K Ns Ar encoding
+Specify the input encoding.
+The supported
+.Ar encoding
+arguments are
+.Cm us-ascii ,
+.Cm iso-8859-1 ,
+and
+.Cm utf-8 .
+If not specified, autodetection uses the first match:
+.Bl -tag -width iso-8859-1
+.It Cm utf-8
+if the first three bytes of the input file
+are the UTF-8 byte order mark (BOM, 0xefbbbf)
+.It Ar encoding
+if the first or second line of the input file matches the
+.Sy emacs
+mode line format
+.Pp
+.D1 .\e" -*- Oo ...; Oc coding: Ar encoding ; No -*-
+.It Cm utf-8
+if the first non-ASCII byte in the file introduces a valid UTF-8 sequence
+.It Cm iso-8859-1
+otherwise
+.El
 .It Fl k
 A synonym for
 .Xr apropos 1 .
Index: mandoc.h
===================================================================
RCS file: /cvs/src/usr.bin/mandoc/mandoc.h,v
retrieving revision 1.108
diff -u -p -r1.108 mandoc.h
--- mandoc.h    26 Oct 2014 18:06:28 -0000      1.108
+++ mandoc.h    26 Oct 2014 19:05:12 -0000
@@ -393,6 +393,8 @@ struct      eqn {
 #define        MPARSE_MAN      2  /* assume -man */
 #define        MPARSE_SO       4  /* honour .so requests */
 #define        MPARSE_QUICK    8  /* abort the parse early */
+#define        MPARSE_UTF8     16 /* accept UTF-8 input */
+#define        MPARSE_LATIN1   32 /* accept ISO-LATIN-1 input */
 
 enum   mandoc_esc {
        ESCAPE_ERROR = 0, /* bail! unparsable escape */
Index: preconv.c
===================================================================
RCS file: preconv.c
diff -N preconv.c
--- /dev/null   1 Jan 1970 00:00:00 -0000
+++ preconv.c   26 Oct 2014 19:05:12 -0000
@@ -0,0 +1,211 @@
+/*     $OpenBSD$ */
+/*
+ * Copyright (c) 2011 Kristaps Dzonsons <[email protected]>
+ * Copyright (c) 2014 Ingo Schwarze <[email protected]>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <sys/types.h>
+
+#include <stdio.h>
+#include <string.h>
+#include "mandoc.h"
+#include "libmandoc.h"
+
+int
+preconv_encode(struct buf *ib, struct buf *ob, int *filenc)
+{
+       size_t           i;
+       const long       one = 1L;
+       int              state, be;
+       unsigned int     accum;
+       unsigned char    cu;
+
+       if ( ! (*filenc & MPARSE_UTF8))
+               goto latin;
+
+       state = 0;
+       accum = 0U;
+       be = 0;
+
+       /* Quick test for big-endian value. */
+
+       if ( ! (*((const char *)(&one))))
+               be = 1;
+
+       for (i = ib->offs; i < ib->sz; i++) {
+               cu = ib->buf[i];
+               if (state) {
+                       if ( ! (cu & 128) || (cu & 64)) {
+                               /* Bad sequence header. */
+                               break;
+                       }
+
+                       /* Accept only legitimate bit patterns. */
+
+                       if (cu > 191 || cu < 128) {
+                               /* Bad in-sequence bits. */
+                               break;
+                       }
+
+                       accum |= (cu & 63) << --state * 6;
+
+                       if (state)
+                               continue;
+
+                       /*
+                        * Accum is held in little-endian order as
+                        * stipulated by the UTF-8 sequence coding.  We
+                        * need to convert to a native big-endian if our
+                        * architecture requires it.
+                        */
+
+                       if (be)
+                               accum = (accum >> 24) | 
+                                       ((accum << 8) & 0x00FF0000) |
+                                       ((accum >> 8) & 0x0000FF00) |
+                                       (accum << 24);
+
+                       if (accum < 0x80)
+                               ob->buf[ob->offs++] = accum;
+                       else
+                               ob->offs += snprintf(ob->buf + ob->offs,
+                                   11, "\\[u%.4X]", accum);
+                       ib->offs = i + 1;
+                       *filenc &= ~MPARSE_LATIN1;
+                       return(1);
+               } else {
+                       /*
+                        * Entering a UTF-8 state:  if we encounter a
+                        * UTF-8 bitmask, calculate the expected UTF-8
+                        * state from it.
+                        */
+                       for (state = 0; state < 7; state++) 
+                               if ( ! (cu & (1 << (7 - state))))
+                                       break;
+
+                       /* Accept only legitimate bit patterns. */
+
+                       switch (state--) {
+                       case (4):
+                               if (cu <= 244 && cu >= 240) {
+                                       accum = (cu & 7) << 18;
+                                       continue;
+                               }
+                               /* Bad 4-sequence start bits. */
+                               break;
+                       case (3):
+                               if (cu <= 239 && cu >= 224) {
+                                       accum = (cu & 15) << 12;
+                                       continue;
+                               }
+                               /* Bad 3-sequence start bits. */
+                               break;
+                       case (2):
+                               if (cu <= 223 && cu >= 194) {
+                                       accum = (cu & 31) << 6;
+                                       continue;
+                               }
+                               /* Bad 2-sequence start bits. */
+                               break;
+                       default:
+                               /* Bad sequence bit mask. */
+                               break;
+                       }
+                       break;
+               }
+       }
+
+       /* FALLTHROUGH: Invalid or incomplete UTF-8 sequence. */
+
+latin:
+       if ( ! (*filenc & MPARSE_LATIN1))
+               return(0);
+
+       ob->offs += snprintf(ob->buf + ob->offs, 11,
+           "\\[u%.4X]", (unsigned char)ib->buf[ib->offs++]);
+
+       *filenc &= ~MPARSE_UTF8;
+       return(1);
+}
+
+int
+preconv_cue(const struct buf *b)
+{
+       const char      *ln, *eoln, *eoph;
+       size_t           sz, phsz;
+
+       ln = b->buf + b->offs;
+       sz = b->sz - b->offs;
+
+       /* Look for the end-of-line. */
+
+       if (NULL == (eoln = memchr(ln, '\n', sz)))
+               eoln = ln + sz;
+
+       /* Check if we have the correct header/trailer. */
+
+       if ((sz = (size_t)(eoln - ln)) < 10 || 
+           memcmp(ln, ".\\\" -*-", 7) || memcmp(eoln - 3, "-*-", 3))
+               return(MPARSE_UTF8 | MPARSE_LATIN1);
+
+       /* Move after the header and adjust for the trailer. */
+
+       ln += 7;
+       sz -= 10;
+
+       while (sz > 0) {
+               while (sz > 0 && ' ' == *ln) {
+                       ln++;
+                       sz--;
+               }
+               if (0 == sz)
+                       break;
+
+               /* Find the end-of-phrase marker (or eoln). */
+
+               if (NULL == (eoph = memchr(ln, ';', sz)))
+                       eoph = eoln - 3;
+               else
+                       eoph++;
+
+               /* Only account for the "coding" phrase. */
+
+               if ((phsz = eoph - ln) < 7 ||
+                   strncasecmp(ln, "coding:", 7)) {
+                       sz -= phsz;
+                       ln += phsz;
+                       continue;
+               } 
+
+               sz -= 7;
+               ln += 7;
+
+               while (sz > 0 && ' ' == *ln) {
+                       ln++;
+                       sz--;
+               }
+               if (0 == sz)
+                       return(0);
+
+               /* Check us against known encodings. */
+
+               if (phsz > 4 && !strncasecmp(ln, "utf-8", 5))
+                       return(MPARSE_UTF8);
+               if (phsz > 10 && !strncasecmp(ln, "iso-latin-1", 11))
+                       return(MPARSE_LATIN1);
+               return(0);
+       }
+       return(MPARSE_UTF8 | MPARSE_LATIN1);
+}
Index: read.c
===================================================================
RCS file: /cvs/src/usr.bin/mandoc/read.c,v
retrieving revision 1.68
diff -u -p -r1.68 read.c
--- read.c      20 Oct 2014 19:21:31 -0000      1.68
+++ read.c      26 Oct 2014 19:05:12 -0000
@@ -39,11 +39,6 @@
 
 #define        REPARSE_LIMIT   1000
 
-struct buf {
-       char             *buf; /* binary input buffer */
-       size_t            sz; /* size of binary buffer */
-};
-
 struct mparse {
        struct man       *pman; /* persistent man parser */
        struct mdoc      *pmdoc; /* persistent mdoc parser */
@@ -59,6 +54,7 @@ struct        mparse {
        enum mandoclevel  file_status; /* status of current parse */
        enum mandoclevel  wlevel; /* ignore messages below this */
        int               options; /* parser options */
+       int               filenc; /* encoding of the current file */
        int               reparse_count; /* finite interp. stack */
        int               line; /* line number in the file */
 };
@@ -320,13 +316,20 @@ mparse_buf_r(struct mparse *curp, struct
        lnn = curp->line;
        pos = 0;
 
-       for (i = 0; i < (int)blk.sz; ) {
+       for (i = blk.offs; i < (int)blk.sz; ) {
                if (0 == pos && '\0' == blk.buf[i])
                        break;
 
                if (start) {
                        curp->line = lnn;
                        curp->reparse_count = 0;
+
+                       if (lnn < 3 &&
+                           curp->filenc & MPARSE_UTF8 &&
+                           curp->filenc & MPARSE_LATIN1) {
+                               blk.offs = i;
+                               curp->filenc = preconv_cue(&blk);
+                       }
                }
 
                while (i < (int)blk.sz && (start || '\0' != blk.buf[i])) {
@@ -347,27 +350,40 @@ mparse_buf_r(struct mparse *curp, struct
                        }
 
                        /*
-                        * Make sure we have space for at least
-                        * one backslash and one other character
-                        * and the trailing NUL byte.
+                        * Make sure we have space for the worst
+                        * case of 11 bytes: "\\[u10ffff]\0"
                         */
 
-                       if (pos + 2 >= (int)ln.sz)
+                       if (pos + 11 > (int)ln.sz)
                                resize_buf(&ln, 256);
 
                        /*
-                        * Warn about bogus characters.  If you're using
-                        * non-ASCII encoding, you're screwing your
-                        * readers.  Since I'd rather this not happen,
-                        * I'll be helpful and replace these characters
-                        * with "?", so we don't display gibberish.
-                        * Note to manual writers: use special characters.
+                        * Encode 8-bit input.
                         */
 
-                       c = (unsigned char) blk.buf[i];
+                       c = blk.buf[i];
+                       if (c & 0x80) {
+                               blk.offs = i;
+                               ln.offs = pos;
+                               if (curp->filenc && preconv_encode(
+                                   &blk, &ln, &curp->filenc)) {
+                                       pos = ln.offs;
+                                       i = blk.offs;
+                               } else {
+                                       mandoc_vmsg(MANDOCERR_BADCHAR,
+                                           curp, curp->line, pos,
+                                           "0x%x", c);
+                                       ln.buf[pos++] = '?';
+                                       i++;
+                               }
+                               continue;
+                       }
+
+                       /*
+                        * Exclude control characters.
+                        */
 
-                       if ( ! (isascii(c) &&
-                           (isgraph(c) || isblank(c)))) {
+                       if (c == 0x7f || (c < 0x20 && c != 0x09)) {
                                mandoc_vmsg(MANDOCERR_BADCHAR, curp,
                                    curp->line, pos, "0x%x", c);
                                i++;
@@ -626,6 +642,7 @@ read_whole_file(struct mparse *curp, con
                        return(0);
                }
                *with_mmap = 1;
+               fb->offs = 0;
                fb->sz = (size_t)st.st_size;
                fb->buf = mmap(NULL, fb->sz, PROT_READ, MAP_SHARED, fd, 0);
                if (fb->buf != MAP_FAILED)
@@ -656,6 +673,7 @@ read_whole_file(struct mparse *curp, con
                ssz = read(fd, fb->buf + (int)off, fb->sz - off);
                if (ssz == 0) {
                        fb->sz = off;
+                       fb->offs = 0;
                        return(1);
                }
                if (ssz == -1) {
@@ -727,6 +745,15 @@ mparse_parse_buffer(struct mparse *curp,
        curp->line = 1;
        recursion_depth++;
 
+       /* Skip an UTF-8 byte order mark. */
+       if (curp->filenc & MPARSE_UTF8 && blk.sz > 2 &&
+           (unsigned char)blk.buf[0] == 0xef &&
+           (unsigned char)blk.buf[1] == 0xbb &&
+           (unsigned char)blk.buf[2] == 0xbf) {
+               blk.offs = 3;
+               curp->filenc &= ~MPARSE_LATIN1;
+       }
+
        mparse_buf_r(curp, blk, 1);
 
        if (0 == --recursion_depth && MANDOCLEVEL_FATAL > curp->file_status)
@@ -741,6 +768,7 @@ mparse_readfd(struct mparse *curp, int f
 {
        struct buf       blk;
        int              with_mmap;
+       int              save_filenc;
 
        if (-1 == fd && -1 == (fd = open(file, O_RDONLY, 0))) {
                curp->file_status = MANDOCLEVEL_SYSERR;
@@ -759,7 +787,11 @@ mparse_readfd(struct mparse *curp, int f
         */
 
        if (read_whole_file(curp, file, fd, &blk, &with_mmap)) {
+               save_filenc = curp->filenc;
+               curp->filenc = curp->options &
+                   (MPARSE_UTF8 | MPARSE_LATIN1);
                mparse_parse_buffer(curp, blk, file);
+               curp->filenc = save_filenc;
                if (with_mmap)
                        munmap(blk.buf, blk.sz);
                else

native UTF-8 and ISO-8859-1 *input* support for mandoc(1)

Reply via email to

native UTF-8 and ISO-8859-1 input support for mandoc(1)