I believe it is mapped as normally cached right now, rather than uncached or write combining.
Reads aren't ultra-slow, and the timings of 48 byte writes appear to involve a cacheline read. 128 byte writes are actually slower than 64 byte writes, which I guessed might be because of automatic prefetching kicking in and reading the following cacheline. -------- Original Message -------- Subject: Re: [PATCH} Optimized rasops32 putchar From: Mark Kettenis <mark.kette...@xs4all.nl> Date: Sat, June 27, 2020 7:56 am To: <jo...@armadilloaerospace.com> Cc: tech@openbsd.org > From: <jo...@armadilloaerospace.com> > Date: Fri, 26 Jun 2020 07:42:50 -0700 > > Optimized 32 bit character rendering with unrolled rows and pairwise > foreground / background pixel rendering. > > If it weren't for the 5x8 font, I would have just assumed everything > was an even width and made the fallback path also pairwise. > > In isolation, the 16x32 character case got 2x faster, but that wasn't > a huge real world speedup where the space rendering that was already > at memory bandwidth limits accounted for most of the character > rendering time. However, in combination with the previous fast > conditional console scrolling that removes most of the space rendering, > it becomes significant. > > I also found that at least the efi and intel framebuffers are not > currently mapped write combining, which makes this much slower than > it should be. Hi John, The framebuffer should be mapped write-combining. In OpenBSD this is requested by specifying the BUS_SPACE_MAP_PREFETCHABLE flag to bus_space_map(9) when mapping the framebuffer. I'm fairly confident since until last January the initial mapping of the framebuffer that we used wasn't write-combining. And things were really, really slow. 
Cheers, Mark > Index: rasops32.c > =================================================================== > RCS file: /cvs/src/sys/dev/rasops/rasops32.c,v > retrieving revision 1.10 > diff -u -p -r1.10 rasops32.c > --- rasops32.c 25 May 2020 09:55:49 -0000 1.10 > +++ rasops32.c 26 Jun 2020 14:34:06 -0000 > @@ -65,9 +65,14 @@ rasops32_init(struct rasops_info *ri) > int > rasops32_putchar(void *cookie, int row, int col, u_int uc, uint32_t > attr) > { > - int width, height, cnt, fs, fb, clr[2]; > + int width, height, step, cnt, fs, b, f; > + uint32_t fb, clr[2]; > struct rasops_info *ri; > - int32_t *dp, *rp; > + int64_t *rp, q; > + union { > + int64_t q[4]; > + int32_t d[4][2]; > + } u; > u_char *fr; > > ri = (struct rasops_info *)cookie; > @@ -81,48 +86,128 @@ rasops32_putchar(void *cookie, int row, > return 0; > #endif > > - rp = (int32_t *)(ri->ri_bits + row*ri->ri_yscale + col*ri->ri_xscale); > + rp = (int64_t *)(ri->ri_bits + row*ri->ri_yscale + col*ri->ri_xscale); > > height = ri->ri_font->fontheight; > width = ri->ri_font->fontwidth; > + step = ri->ri_stride >> 3; > > - clr[0] = ri->ri_devcmap[(attr >> 16) & 0xf]; > - clr[1] = ri->ri_devcmap[(attr >> 24) & 0xf]; > + b = ri->ri_devcmap[(attr >> 16) & 0xf]; > + f = ri->ri_devcmap[(attr >> 24) & 0xf]; > + u.d[0][0] = b; u.d[0][1] = b; > + u.d[1][0] = b; u.d[1][1] = f; > + u.d[2][0] = f; u.d[2][1] = b; > + u.d[3][0] = f; u.d[3][1] = f; > > if (uc == ' ') { > + q = u.q[0]; > while (height--) { > - dp = rp; > - DELTA(rp, ri->ri_stride, int32_t *); > - > - for (cnt = width; cnt; cnt--) > - *dp++ = clr[0]; > + /* the general, pixel-at-a-time case is fast enough */ > + for (cnt = 0; cnt < width; cnt++) > + ((int *)rp)[cnt] = b; > + rp += step; > } > } else { > uc -= ri->ri_font->firstchar; > fr = (u_char *)ri->ri_font->data + uc * ri->ri_fontscale; > fs = ri->ri_font->stride; > - > - while (height--) { > - dp = rp; > - fb = fr[3] | (fr[2] << 8) | (fr[1] << 16) | > - (fr[0] << 24); > - fr += fs; > - DELTA(rp, 
ri->ri_stride, int32_t *); > - > - for (cnt = width; cnt; cnt--) { > - *dp++ = clr[(fb >> 31) & 1]; > - fb <<= 1; > - } > + /* double-pixel special cases for the common widths */ > + switch (width) { > + case 8: > + while (height--) { > + fb = fr[0]; > + rp[0] = u.q[fb >> 6]; > + rp[1] = u.q[(fb >> 4) & 3]; > + rp[2] = u.q[(fb >> 2) & 3]; > + rp[3] = u.q[fb & 3]; > + rp += step; > + fr += 1; > + } > + break; > + > + case 12: > + while (height--) { > + fb = fr[0]; > + rp[0] = u.q[fb >> 6]; > + rp[1] = u.q[(fb >> 4) & 3]; > + rp[2] = u.q[(fb >> 2) & 3]; > + rp[3] = u.q[fb & 3]; > + fb = fr[1]; > + rp[4] = u.q[fb >> 6]; > + rp[5] = u.q[(fb >> 4) & 3]; > + rp += step; > + fr += 2; > + } > + break; > + > + case 16: > + while (height--) { > + fb = fr[0]; > + rp[0] = u.q[fb >> 6]; > + rp[1] = u.q[(fb >> 4) & 3]; > + rp[2] = u.q[(fb >> 2) & 3]; > + rp[3] = u.q[fb & 3]; > + fb = fr[1]; > + rp[4] = u.q[fb >> 6]; > + rp[5] = u.q[(fb >> 4) & 3]; > + rp[6] = u.q[(fb >> 2) & 3]; > + rp[7] = u.q[fb & 3]; > + rp += step; > + fr += 2; > + } > + break; > + case 32: > + while (height--) { > + fb = fr[0]; > + rp[0] = u.q[fb >> 6]; > + rp[1] = u.q[(fb >> 4) & 3]; > + rp[2] = u.q[(fb >> 2) & 3]; > + rp[3] = u.q[fb & 3]; > + fb = fr[1]; > + rp[4] = u.q[fb >> 6]; > + rp[5] = u.q[(fb >> 4) & 3]; > + rp[6] = u.q[(fb >> 2) & 3]; > + rp[7] = u.q[fb & 3]; > + fb = fr[2]; > + rp[8] = u.q[fb >> 6]; > + rp[9] = u.q[(fb >> 4) & 3]; > + rp[10] = u.q[(fb >> 2) & 3]; > + rp[11] = u.q[fb & 3]; > + fb = fr[3]; > + rp[12] = u.q[fb >> 6]; > + rp[13] = u.q[(fb >> 4) & 3]; > + rp[14] = u.q[(fb >> 2) & 3]; > + rp[15] = u.q[fb & 3]; > + rp += step; > + fr += 4; > + } > + break; > + > + > + default: /* there is a 5x8 font, so fall back to per-pixel */ > + clr[0] = b; > + clr[1] = f; > + while (height--) { > + fb = fr[3] | (fr[2] << 8) | (fr[1] << 16) | > + (fr[0] << 24); > + fr += fs; > + for (cnt = 0; cnt < width; cnt++) { > + ((int *)rp)[cnt] = clr[fb >> 31]; > + fb <<= 1; > + } > + rp += step; > + } > + 
break; > } > } > > - /* Do underline */ > + /* Do underline a pixel at a time */ > if ((attr & 1) != 0) { > - DELTA(rp, -(ri->ri_stride << 1), int32_t *); > - > - while (width--) > - *rp++ = clr[1]; > + rp -= step; > + for (cnt = 0; cnt < width ; cnt++) > + ((int *)rp)[cnt] = f; > } > > return 0; > } > + > >