I have a Sharp Zaurus C3100, where X normally runs rotated 90 degrees,
using a shadow framebuffer.  I've been hacking on the code that blits
the rotated shadow onto the display to make it faster, and came up
with the included patch.

Blitting in rotated mode is about 4x the previous speed.  Non-rotated
copies are about the same speed; maybe up to 10% slower for small
rectangles (on the Zaurus).

The idea is to copy the area in blocks of 32x32 pixels, to reduce the
number of cache misses, which are unavoidable when walking either the
source or the destination bitmap across the scanlines.  16x16, 24x24,
and 32x32 yield about the same result, so I chose 32x32 since it
seems best for the non-rotated modes.

Any comments on this patch?

I have a question myself about the original code: This is the function
call to get the address in the destination frame buffer to write to:

          win = (FbBits *) (*pBuf->window) (pScreen,
                                            scr_y,
                                            scr_x << 2,
                                            SHADOW_WINDOW_WRITE,
                                            &winSize,
                                            pBuf->closure);

The "scr_x << 2" part seems, to me, to assume that 
sizeof(FbBits) == 4.  Am I missing something, or is this really 
correct?  Anyway, my patch does not make this problem either better
or worse, but this is a chance to fix it if it is a bug...

Staffan


Index: programs/Xserver/miext/shadow/shrotate.c
===================================================================
RCS file: /scratch/openbsd/cvs/XF4/xc/programs/Xserver/miext/shadow/shrotate.c,v
retrieving revision 1.2
diff -u -r1.2 shrotate.c
--- programs/Xserver/miext/shadow/shrotate.c    3 Nov 2004 00:09:54 -0000       1.2
+++ programs/Xserver/miext/shadow/shrotate.c    20 Sep 2005 23:07:58 -0000
@@ -45,6 +45,106 @@
 #define TOP_TO_BOTTOM  2
 #define BOTTOM_TO_TOP  -2
 
+
+static void 
+shadowUpdateRotatePackedSubRectangle(shadowBufPtr pBuf,
+                                    FbBits *shaLine, int shaFirstShift, 
+                                    int shaStepOverX, int shaStepOverY,
+                                    int shaStepDownX, int shaStepDownY,
+                                    int shaBpp, FbBits shaMask,
+                                    ScreenPtr pScreen,
+                                    int scr_x1, int scr_y, 
+                                    int scr_h, int scr_w,
+                                    int pixelsPerBits)
+{
+    FbBits *sha;
+    int shaShift;
+    int scr_x;
+    int w;
+
+    /*
+     * Copy the bits, always write across the physical frame buffer
+     * to take advantage of write combining.
+     */
+    while (scr_h--)
+    {
+       int         p;
+       FbBits  bits;
+       FbBits  *win;
+       int         i;
+       CARD32  winSize;
+       
+       sha = shaLine;
+       shaShift = shaFirstShift;
+       w = scr_w;
+       scr_x = scr_x1 * shaBpp >> FB_SHIFT;
+       
+       while (w)
+       {
+         /*
+          * Map some of this line
+          */
+         win = (FbBits *) (*pBuf->window) (pScreen,
+                                           scr_y,
+                                           scr_x << 2,
+                                           SHADOW_WINDOW_WRITE,
+                                           &winSize,
+                                           pBuf->closure);
+           i = (winSize >> 2);
+           if (i > w)
+               i = w;
+           w -= i;
+           scr_x += i;
+           /*
+            * Copy the portion of the line mapped
+            */
+           while (i--)
+           {
+               bits = 0;
+               p = pixelsPerBits;
+               /*
+                * Build one word of output from multiple inputs
+                */
+               while (p--)
+               {
+                   bits = FbScrLeft(bits, shaBpp);
+                   bits |= FbScrRight (*sha, shaShift) & shaMask;
+                   
+                   shaShift -= shaStepOverX;
+                   if (shaShift >= FB_UNIT)
+                   {
+                       shaShift -= FB_UNIT;
+                           sha--;
+                   }
+                   else if (shaShift < 0)
+                   {
+                       shaShift += FB_UNIT;
+                       sha++;
+                   }
+                   sha += shaStepOverY;
+               }
+               *win++ = bits;
+           }
+       }
+       scr_y++;
+       shaFirstShift -= shaStepDownX;
+       if (shaFirstShift >= FB_UNIT)
+       {
+           shaFirstShift -= FB_UNIT;
+           shaLine--;
+       }
+       else if (shaFirstShift < 0)
+       {
+           shaFirstShift += FB_UNIT;
+           shaLine++;
+       }
+       shaLine += shaStepDownY;
+    }
+}
+
+#define BLOCKSIZE_HEIGHT 32
+#define BLOCKSIZE_WIDTH 32
+
 void
 shadowUpdateRotatePacked (ScreenPtr    pScreen,
                          shadowBufPtr  pBuf)
@@ -61,7 +161,6 @@
     int                sha_x1 = 0, sha_y1 = 0;
     int                scr_x1 = 0, scr_x2 = 0, scr_y1 = 0, scr_y2 = 0, scr_w, scr_h;
     int                scr_x, scr_y;
-    int                w;
     int                pixelsPerBits;
     int                pixelsMask;
     FbStride   shaStepOverY = 0, shaStepDownY = 0;
@@ -221,86 +320,46 @@
                   ((sha_x1 * shaBpp) >> FB_SHIFT));
 
        /*
-        * Copy the bits, always write across the physical frame buffer
-        * to take advantage of write combining.
+        * Copy in blocks of size BLOCKSIZE_WIDTH x BLOCKSIZE_HEIGHT
+        * to reduce the number of cache misses when rotating 90 or
+        * 270 degrees.
         */
-       while (scr_h--)
+       for (scr_y = scr_y1; scr_y < scr_y2; scr_y += BLOCKSIZE_HEIGHT)
        {
-           int     p;
-           FbBits  bits;
-           FbBits  *win;
-           int     i;
-           CARD32  winSize;
-           
            sha = shaLine;
            shaShift = shaFirstShift;
-           w = scr_w;
-           scr_x = scr_x1 * shaBpp >> FB_SHIFT;
 
-           while (w)
+           for (scr_x = scr_x1; scr_x < scr_x2; scr_x += BLOCKSIZE_WIDTH)
            {
-               /*
-                * Map some of this line
-                */
-               win = (FbBits *) (*pBuf->window) (pScreen,
-                                                 scr_y,
-                                                 scr_x << 2,
-                                                 SHADOW_WINDOW_WRITE,
-                                                 &winSize,
-                                                 pBuf->closure);
-               i = (winSize >> 2);
-               if (i > w)
-                   i = w;
-               w -= i;
-               scr_x += i;
-               /*
-                * Copy the portion of the line mapped
-                */
-               while (i--)
-               {
-                   bits = 0;
-                   p = pixelsPerBits;
-                   /*
-                    * Build one word of output from multiple inputs
-                    * 
-                    * Note that for 90/270 rotations, this will walk
-                    * down the shadow hitting each scanline once.
-                    * This is probably not very efficient.
-                    */
-                   while (p--)
-                   {
-                       bits = FbScrLeft(bits, shaBpp);
-                       bits |= FbScrRight (*sha, shaShift) & shaMask;
+               int h = BLOCKSIZE_HEIGHT;
+               int w = BLOCKSIZE_WIDTH;
 
-                       shaShift -= shaStepOverX;
-                       if (shaShift >= FB_UNIT)
-                       {
-                           shaShift -= FB_UNIT;
-                           sha--;
-                       }
-                       else if (shaShift < 0)
-                       {
-                           shaShift += FB_UNIT;
-                           sha++;
-                       }
-                       sha += shaStepOverY;
-                   }
-                   *win++ = bits;
-               }
-           }
-           scr_y++;
-           shaFirstShift -= shaStepDownX;
-           if (shaFirstShift >= FB_UNIT)
-           {
-               shaFirstShift -= FB_UNIT;
-               shaLine--;
-           }
-           else if (shaFirstShift < 0)
-           {
-               shaFirstShift += FB_UNIT;
-               shaLine++;
+               if (scr_y + h > scr_y2)
+                   h = scr_y2 - scr_y;
+               if (scr_x + w > scr_x2)
+                   w = scr_x2 - scr_x;
+               w = (w * shaBpp) >> FB_SHIFT;
+
+               shadowUpdateRotatePackedSubRectangle
+                 (pBuf,
+                  sha, shaShift,
+                  shaStepOverX, shaStepOverY,
+                  shaStepDownX, shaStepDownY,
+                  shaBpp, shaMask,
+                  pScreen,
+                  scr_x, scr_y,
+                  h, w,
+                  pixelsPerBits);
+               
+               shaShift -= BLOCKSIZE_WIDTH * shaStepOverX;
+               sha += BLOCKSIZE_WIDTH * shaStepOverY;
+               sha -= (shaShift >> FB_SHIFT); 
+               shaShift &= FB_MASK; 
            }
-           shaLine += shaStepDownY;
+           shaFirstShift -= BLOCKSIZE_HEIGHT * shaStepDownX;
+           shaLine += BLOCKSIZE_HEIGHT * shaStepDownY;
+           shaLine -= (shaFirstShift >> FB_SHIFT); 
+           shaFirstShift &= FB_MASK; 
        }
     }
 }
_______________________________________________
Devel mailing list
Devel@XFree86.Org
http://XFree86.Org/mailman/listinfo/devel

Reply via email to