[Mesa3d-dev] Dispatch generation for x86-64

Ian Romanick Mon, 03 Jul 2006 19:01:32 -0700

-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1

Here is my first pass at run-time dispatch generation code for x86-64.
It's not particularly well tested yet (or I'd just commit it now).  I
wrote a test program that called fill_in_entrypoint with a few different
parameter signature strings, and I verified that the generated code
looked correct.


After some discussion on IRC, I've been thinking about how we might make
run-time dispatch generation work on "hardened" systems.  Those systems
don't allow a memory region to be both writable and executable.  We can
use mprotect to adjust the protections.  There are a couple of subtle
problems that need to be overcome to make this really work:

1. mprotect works at page granularity.

2. Once a page is marked (PROT_READ|PROT_EXEC) we can *never* make it
PROT_WRITE again.  Multithreaded applications are the problem here.
Imagine one thread jumping to a dispatch function right when another
thread makes the page containing that dispatch function PROT_WRITE.

3. Since glXGetProcAddress returns a pointer to the dispatch stub,
memory must be allocated at that time.

4. On x86-64 and PowerPC, the dispatch functions cannot be fully created
until the driver asks for them to be created.  x86 has the same problem
currently, but that will be changed soon.

Normally when an application calls glXGetProcAddress a dummy stub is
created.  In the current implementation, a dispatch offset is not
assigned at this time.  The existing code expects that the driver will
later ask for the function to be added and will provide a dispatch
offset.  This is actually a bug, and it prevents drivers that support,
for example, APPLE_vertex_array_object from working with versions of
libGL that don't.  Once I commit a fix for that, libGL could assign a
dispatch offset when glXGetProcAddress is called.

This would allow libGL to create a fully functional dispatch stub on
x86.  In fact, libGL could create an entire page of dispatch stubs the
first time glXGetProcAddress is called.  This can be done on x86 because
the dispatch function is independent of the parameter signature of the
function being dispatched.

x86-64 and PowerPC do not share this feature.  Since these platforms
pass all parameters in registers, the dispatch function for glBegin is
different from the dispatch function for glTexImage2D.  When
glXGetProcAddress is called it is impossible to know what the parameter
signature, and thus the contents of the dispatch stub, should be.  This
prevents those platforms from being able to create a page of dispatch
stubs at a time.

I guess we could do a single dispatch function per page, but since the
dispatch functions are on the order of 128 bytes, that seems awfully
wasteful.

Thoughts?
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.4.2.2 (GNU/Linux)

iD8DBQFEqct8X1gOwKyEAw8RAm77AJ4rDv8+fzALG6YRyxGyuhc6K1X1gQCdHPu+
Xr3m6AcBqqsFWumRbrq0by0=
=m2Dd
-----END PGP SIGNATURE-----

 gl_x86-64_asm.py |    9 +-
 glapi.c          |  183 +++++++++++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 183 insertions(+), 9 deletions(-)

? src/mesa/glapi/glX_API.xml
? src/mesa/glapi/glX_server_table.py
? src/mesa/glapi/gl_x86_asm.nasm.py
Index: src/mesa/glapi/gl_x86-64_asm.py
===================================================================
RCS file: /cvs/mesa/Mesa/src/mesa/glapi/gl_x86-64_asm.py,v
retrieving revision 1.7
diff -u -d -u -d -r1.7 gl_x86-64_asm.py
--- src/mesa/glapi/gl_x86-64_asm.py     17 Apr 2006 18:58:24 -0000      1.7
+++ src/mesa/glapi/gl_x86-64_asm.py     3 Jul 2006 22:50:34 -0000
@@ -144,12 +144,9 @@
                print ''
                print '#ifdef GLX_USE_TLS'
                print ''
-               print '\t.globl _x86_64_get_get_dispatch; 
HIDDEN(_x86_64_get_get_dispatch)'
-               print '_x86_64_get_get_dispatch:'
-               print '\tlea\t_x86_64_get_dispatch(%rip), %rax'
-               print '\tret'
-               print ''
                print '\t.p2align\t4,,15'
+               print '\t.globl\t_x86_64_get_dispatch ; 
HIDDEN(_x86_64_get_dispatch)'
+               print '\t.type\t_x86_64_get_dispatch, @function'
                print '_x86_64_get_dispatch:'
                print '[EMAIL PROTECTED](%rip), %rax'
                print '\tmovq\t%fs:(%rax), %rax'
@@ -163,6 +160,8 @@
                print '\t.extern\tpthread_getspecific'
                print ''
                print '\t.p2align\t4,,15'
+               print '\t.globl\t_x86_64_get_dispatch ; 
HIDDEN(_x86_64_get_dispatch)'
+               print '\t.type\t_x86_64_get_dispatch, @function'
                print '_x86_64_get_dispatch:'
                print '\tmovq\t_gl_DispatchTSD(%rip), %rdi'
                print '[EMAIL PROTECTED]'
Index: src/mesa/glapi/glapi.c
===================================================================
RCS file: /cvs/mesa/Mesa/src/mesa/glapi/glapi.c,v
retrieving revision 1.103
diff -u -d -u -d -r1.103 glapi.c
--- src/mesa/glapi/glapi.c      16 Jun 2006 14:50:05 -0000      1.103
+++ src/mesa/glapi/glapi.c      3 Jul 2006 22:50:35 -0000
@@ -65,7 +65,8 @@
 #endif
 
 static _glapi_proc generate_entrypoint(GLuint functionOffset);
-static void fill_in_entrypoint_offset(_glapi_proc entrypoint, GLuint offset);
+static void fill_in_entrypoint(_glapi_proc entrypoint, GLuint offset,
+    const char *parameter_signature);
 
 /*
  * Enable/disable printing of warning messages.
@@ -530,6 +531,20 @@
 extern void __glapi_sparc_icache_flush(unsigned int *);
 #endif
 
+/* Size of the maximum register save and restore code, plus the size of the
+ * code to get the dispatch pointer, plus the size of the code to call the
+ * dispatch function.
+ */
+#if defined(USE_X86_64_ASM)
+#define X86_64_MAX_DISPATCH_FUNCTION_SIZE \
+   ((2 * (8 + 51)) + 10 + 12)
+
+#if defined(GLX_USE_TLS) || defined(PTHREADS)
+extern void * _x86_64_get_dispatch(void);
+#endif /* defined(GLX_USE_TLS) || defined(PTHREADS) */
+#endif
+
+
 /**
  * Generate a dispatch function (entrypoint) which jumps through
  * the given slot number (offset) in the current dispatch table.
@@ -550,10 +565,24 @@
 
    if ( code != NULL ) {
       (void) memcpy( code, template_func, X86_DISPATCH_FUNCTION_SIZE );
-      fill_in_entrypoint_offset( (_glapi_proc) code, functionOffset );
+      fill_in_entrypoint((_glapi_proc) code, functionOffset, NULL);
+   }
+
+   return (_glapi_proc) code;
+#elif defined(USE_X86_64_ASM)
+   GLubyte * const code = (GLubyte *) 
malloc(X86_64_MAX_DISPATCH_FUNCTION_SIZE);
+
+   /* On x86-64 we can't generate the dispatch function *at all* until
+    * the function's parameter signature is known.  So, in the mean time
+    * we just emit a single RETQ instruction and call it good.
+    */
+
+   if (code != NULL) {
+      code[0] = 0xc3;
    }
 
    return (_glapi_proc) code;
+
 #elif defined(USE_SPARC_ASM)
 
 #ifdef __arch64__
@@ -615,11 +644,14 @@
  * stub that was generated with the preceeding function.
  */
 static void
-fill_in_entrypoint_offset(_glapi_proc entrypoint, GLuint offset)
+fill_in_entrypoint(_glapi_proc entrypoint, GLuint offset,
+                  const char *parameter_signature)
 {
 #if defined(USE_X86_ASM)
    GLubyte * const code = (GLubyte *) entrypoint;
 
+   (void) parameter_signature;
+
 #if X86_DISPATCH_FUNCTION_SIZE == 32
    *((unsigned int *)(code + 11)) = 4 * offset;
    *((unsigned int *)(code + 22)) = 4 * offset;
@@ -631,10 +663,151 @@
 # error Invalid X86_DISPATCH_FUNCTION_SIZE!
 #endif
 
+#elif defined(USE_X86_64_ASM)
+
+   static const GLubyte push_size[7] = {
+      0, 0, 1, 2, 3, 4, 6,
+   };
+
+   static const GLubyte push_int[8] = {
+      0x57,          /* pushq %rdi */
+      0x56,          /* pushq %rsi */
+      0x52,          /* pushq %rdx */
+      0x51,          /* pushq %rcx */
+      0x41, 0x50,    /* pushq %r8 */
+      0x41, 0x51     /* pushq %r9 */
+   };
+
+   static const GLubyte pop_offset[7] = {
+      8, 7, 6, 5, 4, 2, 0
+   };
+
+   static const GLubyte pop_int[8] = {
+      0x41, 0x59,    /* popq %r9 */
+      0x41, 0x58,    /* popq %r8 */
+      0x59,          /* popq %rcx */
+      0x5a,          /* popq %rdx */
+      0x5e,          /* popq %rsi */
+      0x5f,          /* popq %rdi */
+   };
+   
+   static const GLubyte push_float[] = {
+      0x48, 0x83, 0xec, 0x40,               /* subq   $0x40,%rsp */
+      0x66, 0x0f, 0xd6, 0x04, 0x24,         /* movq   %xmm0,(%rsp) */
+      0x66, 0x0f, 0xd6, 0x4c, 0x24, 0x08,   /* movq   %xmm1,0x8(%rsp) */
+      0x66, 0x0f, 0xd6, 0x54, 0x24, 0x10,   /* movq   %xmm2,0x10(%rsp) */
+      0x66, 0x0f, 0xd6, 0x5c, 0x24, 0x18,   /* movq   %xmm3,0x18(%rsp) */
+      0x66, 0x0f, 0xd6, 0x64, 0x24, 0x20,   /* movq   %xmm4,0x20(%rsp) */
+      0x66, 0x0f, 0xd6, 0x6c, 0x24, 0x28,   /* movq   %xmm5,0x28(%rsp) */
+      0x66, 0x0f, 0xd6, 0x74, 0x24, 0x30,   /* movq   %xmm6,0x30(%rsp) */
+      0x66, 0x0f, 0xd6, 0x7c, 0x24, 0x38,   /* movq   %xmm7,0x38(%rsp) */
+   };
+   
+   static const GLubyte pop_float[] = {
+      0xf3, 0x0f, 0x7e, 0x7c, 0x24, 0x38,   /* movq   0x38(%rsp),%xmm7 */
+      0xf3, 0x0f, 0x7e, 0x74, 0x24, 0x30,   /* movq   0x30(%rsp),%xmm6 */
+      0xf3, 0x0f, 0x7e, 0x6c, 0x24, 0x28,   /* movq   0x28(%rsp),%xmm5 */
+      0xf3, 0x0f, 0x7e, 0x64, 0x24, 0x24,   /* movq   0x20(%rsp),%xmm4 */
+      0xf3, 0x0f, 0x7e, 0x5c, 0x24, 0x18,   /* movq   0x18(%rsp),%xmm3 */
+      0xf3, 0x0f, 0x7e, 0x54, 0x24, 0x10,   /* movq   0x10(%rsp),%xmm2 */
+      0xf3, 0x0f, 0x7e, 0x4c, 0x24, 0x08,   /* movq   0x8(%rsp),%xmm1 */
+      0xf3, 0x0f, 0x7e, 0x04, 0x24,         /* movq   (%rsp),%xmm0 */
+      0x48, 0x83, 0xc4, 0x40,               /* addq   $0x40,%rsp */
+   };
+
+   static const GLubyte do_dispatch[10] = {
+      0x4c, 0x8b, 0x98, 0x00, 0x00, 0x00, 0x00, /* mov 0x00000000(%rax), %r11 
*/
+      0x41, 0xff, 0xe3     /* jmp (%r11) */
+   };
+   
+   unsigned int_params = 0;
+   unsigned float_params = 0;
+   unsigned i;
+   GLubyte *instr = (GLubyte *) entrypoint;
+   const int32_t true_offset = offset;
+   unsigned float_save_size;
+   const void * disp =
+#if defined(GLX_USE_TLS) || defined(PTHREADS)
+     _x86_64_get_dispatch
+#else
+     _glapi_get_dispatch
+#endif
+     ;
+   
+
+   for (i = 0 ; parameter_signature[i] != '\0' ; i++) {
+      switch (parameter_signature[i]) {
+      case 'i':
+      case 'p':
+        int_params++;
+        break;
+      case 'f':
+      case 'd':
+        float_params++;
+        break;
+      }
+   }
+
+   float_save_size = (6 * float_params) + 3;
+
+
+   /* x86-64 passes floating point values in SSE registers.  Unfortunately,
+    * there is no way to push or pop SSE registers.  This means that when
+    * there are floating point parameters to a function, the stack pointer is
+    * adjusted, and values are saved using MOVQ instructions.
+    *
+    * The order of the generated code is:
+    *     - push all the integer parameter registers
+    *     - update the stack pointer
+    *     - save all the float parameter registers on the satck
+    *     - get the dispatch pointer
+    *     - restore all the float parameter registers from the stack
+    *     - restore the stack pointer
+    *     - pop all the integer parameter registers
+    *     - call the disptach function
+    */
+
+   (void) memcpy(instr, push_int, push_size[int_params]);
+   instr += push_size[int_params];
+
+   if (float_params != 0) {
+      (void) memcpy(instr, push_float, float_save_size);
+      instr += float_save_size;
+   }
+
+   /* MOVQ $_x86_64_get_dispatch, %rax */
+   *instr = 0x48;
+   instr++;
+   *instr = 0xb8;
+   instr++;
+   (void) memcpy(instr, & disp, 8);
+   instr += 8;
+
+   /* CALLQ *%rax */
+   *instr = 0xff;
+   instr++;
+   *instr = 0xd0;
+   instr++;
+
+   if (float_params != 0) {
+      (void) memcpy(instr, pop_float + (6 * (8 - float_params)),
+                   float_save_size);
+      instr += float_save_size;
+   }
+
+   (void) memcpy(instr, pop_int + pop_offset[int_params],
+                push_size[int_params]);
+   instr += push_size[int_params];
+
+   (void) memcpy(instr, do_dispatch, sizeof(do_dispatch));
+   (void) memcpy(instr + 3, & true_offset, 4);
+
 #elif defined(USE_SPARC_ASM)
 
    /* XXX this hasn't been tested! */
    unsigned int *code = (unsigned int *) entrypoint;
+
+   (void) parameter_signature;
 #ifdef __arch64__
    code[6] = 0x05000000;  /* sethi     %hi(8 * glapioffset), %g2       */
    code[7] = 0x8410a000;  /* or                %g2, %lo(8 * glapioffset), %g2  
*/
@@ -652,6 +825,7 @@
    /* an unimplemented architecture */
    (void) entrypoint;
    (void) offset;
+   (void) parameter_signature;
 
 #endif /* USE_*_ASM */
 }
@@ -826,7 +1000,8 @@
         }
 
         entry[i]->parameter_signature = str_dup(real_sig);
-        fill_in_entrypoint_offset(entry[i]->dispatch_stub, offset);
+        fill_in_entrypoint(entry[i]->dispatch_stub, offset,
+                           entry[i]->parameter_signature);
         entry[i]->dispatch_offset = offset;
       }
    }

Using Tomcat but need to do more? Need to support web services, security?
Get stuff done quickly with pre-integrated technology to make your job easier
Download IBM WebSphere Application Server v.1.0.1 based on Apache Geronimo
http://sel.as-us.falkag.net/sel?cmd=lnk&kid=120709&bid=263057&dat=121642

_______________________________________________
Mesa3d-dev mailing list
Mesa3d-dev@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/mesa3d-dev

[Mesa3d-dev] Dispatch generation for x86-64

Reply via email to