console-vc: add UTF-8 input decoding with CP437 rendering

Daniel P . Berrangé Tue, 24 Mar 2026 08:43:57 -0700

On Tue, Mar 24, 2026 at 06:17:37PM +0400, Marc-André Lureau wrote:
> Hi
> 
> On Tue, Mar 24, 2026 at 6:08 PM Daniel P. Berrangé <[email protected]> 
> wrote:
> >
> > On Tue, Mar 17, 2026 at 12:50:25PM +0400, Marc-André Lureau wrote:
> > > The text console receives bytes that may be UTF-8 encoded (e.g. from
> > > a guest running a modern distro), but currently treats each byte as a
> > > raw character index into the VGA/CP437 font, producing garbled output
> > > for any multi-byte sequence.
> > >
> > > Add a proper UTF-8 decoder using Bjoern Hoehrmann's DFA.
> > > The DFA inherently rejects overlong encodings, surrogates, and
> > > codepoints above U+10FFFF.  Completed codepoints are then mapped to
> > > CP437, unmappable characters are displayed as '?'.
> >
> > I'm surprised we can't do a charset conversion using GLib APIs ?
> >
> > Do the g_convert family of  APIs (which IIUC wrap the distro iconv)
> > not do what we would want ? If not, would direct use of iconv not
> > be an alternative ?
> >
> 
> I tried to use GIconv but ran into a number of issues, as it doesn't
> operate on character level, but strings. And it uses allocation etc. I
> didn't manage with iconv either.


Looking again, the g_utf8_validate function is /almost/ what we
want, but its API design collapses both "invalid utf8" and
"incomplete character" into the same error return value, so we
can't distinguish them to decide whether to wait for more bytes
or reset the state :-(

So yeah, I can see why this is needed now.

> 
> > It feels pretty wrong to need to embed UTF8 decoding code in
> > QEMU
> 
> Yes, but on a standalone qemu-vnc server, is it more acceptable?

IIUC, this will be linked into regular QEMU too, right?

> > > Signed-off-by: Marc-André Lureau <[email protected]>
> > > ---
> > >  ui/cp437.h      |  13 ++++
> > >  ui/console-vc.c |  62 +++++++++++++++++
> > >  ui/cp437.c      | 205 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
> > >  ui/meson.build  |   2 +-
> > >  4 files changed, 281 insertions(+), 1 deletion(-)
> > >
> > > diff --git a/ui/cp437.h b/ui/cp437.h
> > > new file mode 100644
> > > index 00000000000..81ace8317c7
> > > --- /dev/null
> > > +++ b/ui/cp437.h
> > > @@ -0,0 +1,13 @@
> > > +/*
> > > + * SPDX-License-Identifier: GPL-2.0-or-later
> > > + *
> > > + * Copyright (c) QEMU contributors
> > > + */
> > > +#ifndef QEMU_CP437_H
> > > +#define QEMU_CP437_H
> > > +
> > > +#include <stdint.h>

This include shouldn't be required, since <stdint.h> is already pulled in
by osdep.h.

> > > +
> > > +int unicode_to_cp437(uint32_t codepoint);

Perhaps this would be better named qemu_unicode_to_cp437?

> > > +
> > > +#endif /* QEMU_CP437_H */


> > > diff --git a/ui/console-vc.c b/ui/console-vc.c
> > > index 8dee1f9bd01..7bbd65dea27 100644
> > > --- a/ui/console-vc.c
> > > +++ b/ui/console-vc.c
> > > @@ -9,6 +9,7 @@
> > >  #include "qemu/fifo8.h"
> > >  #include "qemu/option.h"
> > >  #include "ui/console.h"
> > > +#include "ui/cp437.h"
> > >
> > >  #include "trace.h"
> > >  #include "console-priv.h"
> > > @@ -89,6 +90,8 @@ struct VCChardev {
> > >      enum TTYState state;
> > >      int esc_params[MAX_ESC_PARAMS];
> > >      int nb_esc_params;
> > > +    uint32_t utf8_state;     /* UTF-8 DFA decoder state */
> > > +    uint32_t utf8_codepoint; /* accumulated UTF-8 code point */
> > >      TextAttributes t_attrib; /* currently active text attributes */
> > >      TextAttributes t_attrib_saved;
> > >      int x_saved, y_saved;
> > > @@ -598,6 +601,47 @@ static void vc_clear_xy(VCChardev *vc, int x, int y)
> > >      vc_update_xy(vc, x, y);
> > >  }
> > >
> > > +/*
> > > + * UTF-8 DFA decoder by Bjoern Hoehrmann.
> > > + * Copyright (c) 2008-2010 Bjoern Hoehrmann <[email protected]>
> > > + * See https://github.com/polijan/utf8_decode for details.
> > > + *
> > > + * SPDX-License-Identifier: MIT
> > > + */
> > > +#define UTF8_ACCEPT 0
> > > +#define UTF8_REJECT 12

This is an awfully generic define name. Could we use something with a
QEMU_ prefix, to avoid the risk of clashes with any external headers
we import?

> > > +
> > > +static const uint8_t utf8d[] = {
> > > +    /* character class lookup */
> > > +    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
> > > +    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
> > > +    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
> > > +    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
> > > +    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
> > > +    7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
> > > +    8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
> > > +   10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
> > > +
> > > +    /* state transition lookup */
> > > +     0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
> > > +    12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
> > > +    12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
> > > +    12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
> > > +    12,36,12,12,12,12,12,12,12,12,12,12,
> > > +};
> > > +
> > > +static uint32_t utf8_decode(uint32_t *state, uint32_t *codep, uint32_t byte)
> > > +{
> > > +    uint32_t type = utf8d[byte];
> > > +
> > > +    *codep = (*state != UTF8_ACCEPT) ?
> > > +        (byte & 0x3fu) | (*codep << 6) :
> > > +        (0xffu >> type) & (byte);
> > > +
> > > +    *state = utf8d[256 + *state + type];
> > > +    return *state;
> > > +}
> > > +
> > >  static void vc_put_one(VCChardev *vc, int ch)
> > >  {
> > >      QemuTextConsole *s = vc->console;
> > > @@ -761,6 +805,24 @@ static void vc_putchar(VCChardev *vc, int ch)
> > >
> > >      switch(vc->state) {
> > >      case TTY_STATE_NORM:
> > > +        /* Feed byte through the UTF-8 DFA decoder */
> > > +        if (ch >= 0x80) {
> > > +            switch (utf8_decode(&vc->utf8_state, &vc->utf8_codepoint, ch)) {
> > > +            case UTF8_ACCEPT:
> > > +                vc_put_one(vc, unicode_to_cp437(vc->utf8_codepoint));
> > > +                break;
> > > +            case UTF8_REJECT:
> > > +                /* Reset state so the decoder can resync */
> > > +                vc->utf8_state = UTF8_ACCEPT;
> > > +                break;
> > > +            default:
> > > +                /* Need more bytes */
> > > +                break;
> > > +            }
> > > +            break;
> > > +        }
> > > +        /* ASCII byte: abort any pending UTF-8 sequence */
> > > +        vc->utf8_state = UTF8_ACCEPT;
> > >          switch(ch) {
> > >          case '\r':  /* carriage return */
> > >              s->x = 0;

With regards,
Daniel
-- 
|: https://berrange.com       ~~        https://hachyderm.io/@berrange :|
|: https://libvirt.org          ~~          https://entangle-photo.org :|
|: https://pixelfed.art/berrange   ~~    https://fstop138.berrange.com :|

Re: [PATCH 11/60] ui/console-vc: add UTF-8 input decoding with CP437 rendering

Reply via email to