> From: <jo...@armadilloaerospace.com>
> Date: Fri, 26 Jun 2020 07:42:50 -0700
> 
> Optimized 32 bit character rendering with unrolled rows and pairwise
> foreground / background pixel rendering.
> 
> If it weren't for the 5x8 font, I would have just assumed everything
> was an even width and made the fallback path also pairwise.
> 
> In isolation, the 16x32 character case got 2x faster, but that wasn't
> a huge real world speedup where the space rendering that was already
> at memory bandwidth limits accounted for most of the character
> rendering time.  However, in combination with the previous fast
> conditional console scrolling that removes most of the space rendering,
> it becomes significant.
> 
> I also found that at least the efi and intel framebuffers are not
> currently mapped write combining, which makes this much slower than
> it should be.

Hi John,

The framebuffer should be mapped write-combining.  In OpenBSD this is
requested by specifying the BUS_SPACE_MAP_PREFETCHABLE flag to
bus_space_map(9) when mapping the framebuffer.

I'm fairly confident about this, since until last January the initial
mapping of the framebuffer that we used wasn't write-combining, and
things were really, really slow.

Cheers,

Mark

> Index: rasops32.c
> ===================================================================
> RCS file: /cvs/src/sys/dev/rasops/rasops32.c,v
> retrieving revision 1.10
> diff -u -p -r1.10 rasops32.c
> --- rasops32.c        25 May 2020 09:55:49 -0000      1.10
> +++ rasops32.c        26 Jun 2020 14:34:06 -0000
> @@ -65,9 +65,14 @@ rasops32_init(struct rasops_info *ri)
>  int
>  rasops32_putchar(void *cookie, int row, int col, u_int uc, uint32_t
> attr)
>  {
> -     int width, height, cnt, fs, fb, clr[2];
> +     int width, height, step, cnt, fs, b, f;
> +     uint32_t fb, clr[2];
>       struct rasops_info *ri;
> -     int32_t *dp, *rp;
> +     int64_t *rp, q;
> +     union {
> +             int64_t q[4];
> +             int32_t d[4][2];
> +     } u;
>       u_char *fr;
>  
>       ri = (struct rasops_info *)cookie;
> @@ -81,48 +86,128 @@ rasops32_putchar(void *cookie, int row, 
>               return 0;
>  #endif
>  
> -     rp = (int32_t *)(ri->ri_bits + row*ri->ri_yscale + col*ri->ri_xscale);
> +     rp = (int64_t *)(ri->ri_bits + row*ri->ri_yscale + col*ri->ri_xscale);
>  
>       height = ri->ri_font->fontheight;
>       width = ri->ri_font->fontwidth;
> +     step = ri->ri_stride >> 3;
>  
> -     clr[0] = ri->ri_devcmap[(attr >> 16) & 0xf];
> -     clr[1] = ri->ri_devcmap[(attr >> 24) & 0xf];
> +     b = ri->ri_devcmap[(attr >> 16) & 0xf];
> +     f = ri->ri_devcmap[(attr >> 24) & 0xf];
> +     u.d[0][0] = b; u.d[0][1] = b;
> +     u.d[1][0] = b; u.d[1][1] = f;
> +     u.d[2][0] = f; u.d[2][1] = b;
> +     u.d[3][0] = f; u.d[3][1] = f;
>  
>       if (uc == ' ') {
> +             q = u.q[0];
>               while (height--) {
> -                     dp = rp;
> -                     DELTA(rp, ri->ri_stride, int32_t *);
> -
> -                     for (cnt = width; cnt; cnt--)
> -                             *dp++ = clr[0];
> +                     /* the general, pixel-at-a-time case is fast enough */
> +                     for (cnt = 0; cnt < width; cnt++)
> +                             ((int *)rp)[cnt] = b;
> +                     rp += step;
>               }
>       } else {
>               uc -= ri->ri_font->firstchar;
>               fr = (u_char *)ri->ri_font->data + uc * ri->ri_fontscale;
>               fs = ri->ri_font->stride;
> -
> -             while (height--) {
> -                     dp = rp;
> -                     fb = fr[3] | (fr[2] << 8) | (fr[1] << 16) |
> -                         (fr[0] << 24);
> -                     fr += fs;
> -                     DELTA(rp, ri->ri_stride, int32_t *);
> -
> -                     for (cnt = width; cnt; cnt--) {
> -                             *dp++ = clr[(fb >> 31) & 1];
> -                             fb <<= 1;
> -                     }
> +             /* double-pixel special cases for the common widths */
> +             switch (width) {
> +                     case 8:
> +                             while (height--) {
> +                                     fb = fr[0];
> +                                     rp[0] = u.q[fb >> 6];
> +                                     rp[1] = u.q[(fb >> 4) & 3];
> +                                     rp[2] = u.q[(fb >> 2) & 3];
> +                                     rp[3] = u.q[fb & 3];
> +                                     rp += step;
> +                                     fr += 1;
> +                             }
> +                             break;
> +     
> +                     case 12:
> +                             while (height--) {
> +                                     fb = fr[0];
> +                                     rp[0] = u.q[fb >> 6];
> +                                     rp[1] = u.q[(fb >> 4) & 3];
> +                                     rp[2] = u.q[(fb >> 2) & 3];
> +                                     rp[3] = u.q[fb & 3];
> +                                     fb = fr[1];
> +                                     rp[4] = u.q[fb >> 6];
> +                                     rp[5] = u.q[(fb >> 4) & 3];
> +                                     rp += step;
> +                                     fr += 2;
> +                             }
> +                             break;
> +                             
> +                     case 16:
> +                             while (height--) {
> +                                     fb = fr[0];
> +                                     rp[0] = u.q[fb >> 6];
> +                                     rp[1] = u.q[(fb >> 4) & 3];
> +                                     rp[2] = u.q[(fb >> 2) & 3];
> +                                     rp[3] = u.q[fb & 3];
> +                                     fb = fr[1];
> +                                     rp[4] = u.q[fb >> 6];
> +                                     rp[5] = u.q[(fb >> 4) & 3];
> +                                     rp[6] = u.q[(fb >> 2) & 3];
> +                                     rp[7] = u.q[fb & 3];
> +                                     rp += step;
> +                                     fr += 2;
> +                             }
> +                             break;  
> +                     case 32:
> +                             while (height--) {
> +                                     fb = fr[0];
> +                                     rp[0] = u.q[fb >> 6];
> +                                     rp[1] = u.q[(fb >> 4) & 3];
> +                                     rp[2] = u.q[(fb >> 2) & 3];
> +                                     rp[3] = u.q[fb & 3];
> +                                     fb = fr[1];
> +                                     rp[4] = u.q[fb >> 6];
> +                                     rp[5] = u.q[(fb >> 4) & 3];
> +                                     rp[6] = u.q[(fb >> 2) & 3];
> +                                     rp[7] = u.q[fb & 3];
> +                                     fb = fr[2];
> +                                     rp[8] = u.q[fb >> 6];
> +                                     rp[9] = u.q[(fb >> 4) & 3];
> +                                     rp[10] = u.q[(fb >> 2) & 3];
> +                                     rp[11] = u.q[fb & 3];
> +                                     fb = fr[3];
> +                                     rp[12] = u.q[fb >> 6];
> +                                     rp[13] = u.q[(fb >> 4) & 3];
> +                                     rp[14] = u.q[(fb >> 2) & 3];
> +                                     rp[15] = u.q[fb & 3];
> +                                     rp += step;
> +                                     fr += 4;
> +                             }
> +                             break;  
> +
> +
> +                     default: /* there is a 5x8 font, so fall back to 
> per-pixel */
> +                             clr[0] = b;
> +                             clr[1] = f;
> +                             while (height--) {
> +                                     fb = fr[3] | (fr[2] << 8) | (fr[1] << 
> 16) |
> +                                         (fr[0] << 24);
> +                                     fr += fs;
> +                                     for (cnt = 0; cnt < width; cnt++) {
> +                                             ((int *)rp)[cnt] = clr[fb >> 
> 31];
> +                                             fb <<= 1;
> +                                     }
> +                                     rp += step;
> +                             }
> +                             break;
>               }
>       }
>  
> -     /* Do underline */
> +     /* Do underline a pixel at a time */
>       if ((attr & 1) != 0) {
> -             DELTA(rp, -(ri->ri_stride << 1), int32_t *);
> -
> -             while (width--)
> -                     *rp++ = clr[1];
> +             rp -= step;
> +             for (cnt = 0; cnt < width ; cnt++)
> +                     ((int *)rp)[cnt] = f;
>       }
>  
>       return 0;
>  }
> +
> 
> 

Reply via email to