On Tue, 9 Oct 2001, Dan Sugalski wrote:

> At 01:20 PM 10/9/2001 +0200, Paolo Molaro wrote:
> >On 10/07/01 Bryan C. Warnock wrote:
> > > while (*pc) {
> > >     switch (*pc) {
> > >     }
> > > }
> If anyone wants an official ruling...
>
> DO_OP can contain any code you like as long as it:
>
> *) Some version ultimately compiles everywhere
> *) Allows opcodes above a certain point (probably 256 or 512) to be
> overridden at runtime
>
> Computed gotos, switches, function table dispatches, generated machine
> code, or some other form of coding madness are all OK. I don't care which
> way a particular platform goes as long as it doesn't constrain another
> platform's ability to go another way.

Hehe.  Ok, I delved into the particulars of gcc and found:

void
runloop(int**code) {
static void* array[] = { &&op_end, &&op1, &&op2 };

start:
  goto *array[*code];

op1:
  foo; goto start;

op2:
  foo; goto start;

op_end:
  return;
} // end code

The key was &&lbl which returns the physical address of the a jumpable
label.  This is highly gcc-specific, but given that we can use
#ifdef's I don't see a problem with it. :)

I even wrote an ugly version that avoided that array indirection.
What it did was "convert" the code-base to change the op-codes into
physical addresses.  This obviously has some negative ramifications,
but is definately doable.  By removing that extra indirection I got a
couple percent extra improvement (12%).  I can't imagine you'll get much
faster than that (except for an inlining jit or perl6->C compiler).
The only problem was that the &&lbl's didn't want to be non-local, so
I had to perform some evil magic.

The unpolished code is attached with a cheesy instruction set which
was just designed to prevent the optimizer from optimizing away my
op-codes.

Here are the benchmarks on a 466MHZ dual proc celeron:


w/ code=500,000,000 and loop_cnt = 10
(cache-bound)

gcc speeds.c
-----
switcher time delta=45
caller time delta=47
jumper time delta=43
code compile time delta=4
fast jumper time delta=34

gcc -O speeds.c
-----
switcher time delta=30
caller time delta=41
jumper time delta=28
code compile time delta=3
fast jumper time delta=26

gcc -O2 speeds.c
-------
switcher time delta=30
caller time delta=41
jumper time delta=28
code compile time delta=3
fast jumper time delta=25

gcc -O3 speeds.c
--------
switcher time delta=29
caller time delta=41
jumper time delta=28
code compile time delta=2
fast jumper time delta=26

==========
==========

 w/ code=1000, loop_cnt = 5,000,000
gcc -O3 speeds.c
------
switcher time delta=195
caller time delta=268
jumper time delta=185
code compile time delta=0
fast jumper time delta=154

# Note a marginal improvement from switch -> jump, yet an even bigger
improvement in jump to fast-jump.

It's unlikely to have a 50M (x4B+) code-base. But it's possible to
have an inner loop that has 500,000 iterations.

-Michael
#include<time.h>
#include<stdio.h>
#include<stdlib.h>
#include <unistd.h>
#include <sys/mman.h>
#include <fcntl.h>  
#include <sys/stat.h> 
#include <sys/types.h>


//#define SIZE 50000000
#define SIZE 1000
int codep[SIZE];

void
dump_mappings()
{
  char strbuf[1024]; 
  int pid = getpid();
  sprintf( strbuf, "cat /proc/%i/maps", pid  );
  system( strbuf );

  sprintf( strbuf, "ls -l /proc/%i/fd" );
  system( strbuf );

  printf("press enter to continue");
  scanf("%c",strbuf);
 
}

int x,y,z;

void
switcher( int*code)
{
  while(1)
    switch(*code) {
    case 0: goto DONE;
      break;
    case 1: x++;
      code += 1;
      break;
    case 2: y++;
      code += 1;
      break;
    case 3: z++;
      code += 1;
      break;
    case 4: x = y + z;
      code += 1;
      break;
    }
 DONE:
} // end switcher



int* end(int*code) { return 0; }
int* o1(int*code) { x++; return code+1; }
int* o2(int*code) { y++; return code+1; }
int* o3(int*code) { z++; return code+1; }
int* o4(int*code) { x = y + z; return code+1; }


typedef int* (*op_handler_t)(int*code);

op_handler_t op_codes[] = {
  end, o1, o2, o3, o4
};

void
caller( int*code)
{
  while(*code)
    code = op_codes[ *code ](code);
} // end caller

void
gotoer( int*code)
{
  static int* array[] = { &&lend, &&lo1, &&lo2, &&lo3, &&lo4 };


 start:
  /*
  if ( !code ) {
    puts("null code");
    exit(-1);
  }
  if ( *code > 3 ) {
    puts("Invalid idx");
    exit(-1);
  }
  */
  goto *array[ *code ];

  lo1:
    x++;
    code += 1;
    goto start;
  lo2:
    y++;
    code += 1;
    goto start;
  lo3:
    z++;
    code += 1;
    goto start;
  lo4:
    z++;
    code += 1;
    goto start;

 lend:
    return;
} // end gotoer

int* 
convert( int size, int*code, int*map )
{
  int* retval, *base;
  int idx =0;
  //dump_mappings();
  retval = base = (int*)malloc(size * sizeof(int));
  if ( !retval ) {
    puts("out of memory");
    exit(-1);
  }
  //dump_mappings();
  
  //printf("retval=%p, codep=%p\n", retval, codep );
  /*
  if (!retval) {
    puts("couldn't allocate retval");
    exit(-1);
  }
  */
  code[size - 1] = 0; // to make sure

  while( *code ) {

    *retval++ = 
      map[ *code++ ];
    //idx++;
  }
  //printf("size=%i, idx=%i\n", size, idx );
  //exit(0);
  *retval++ = map[ 0 ];

  return base;
} // end convert



int**
fgotoer(  int* code, int f_fake )
{
  static int* array[] = { &&fend, &&fo1, &&fo2, &&fo3, &&fo4 };
  if ( f_fake ) {

    return (int**)array;
  }

  fstart:

  goto **code;
  fo1:
    x++;
    code += 1;
    //puts("o1");
    goto fstart;
  fo2:
    y++;
    code += 1;
    //puts("o2");
    goto fstart;
  fo3:
    z++;
    code += 1;
    //puts("o3");
    goto fstart;
  fo4:
    z++;
    code += 1;
    //puts("o4");
    goto fstart;

 fend:
    //puts("end");
    return;
} // end fgotoer


int
main()
{
  time_t start_time, stop_time;
  int* code_ref = codep;
  int i;
  int** compiled_code;
  //int cnt = 10;
  int cnt = 500000;

  for( i = 0; i < SIZE ; i++ )
    *code_ref++ = ( i % 3 ) + 1;
  codep[ SIZE - 1 ] = 0;

  time(&start_time);

  for( i = 0; i < cnt; i++ )
    switcher( codep );

  time(&stop_time);
  printf("switcher time delta=%i\n", stop_time - start_time );
  start_time = stop_time;

  for( i = 0; i < cnt; i++ )
  caller( codep );
  time(&stop_time);
  printf("caller time delta=%i\n", stop_time - start_time );
  start_time = stop_time;

  for( i = 0; i < cnt; i++ )
  gotoer( codep );
  time(&stop_time);
  printf("jumper time delta=%i\n", stop_time - start_time );
  start_time = stop_time;

  
  compiled_code = (int**)convert( SIZE, codep, (int*)fgotoer(0, 1) );
  time(&stop_time);
  printf("code compile time delta=%i\n", stop_time - start_time );
  start_time = stop_time;
  
  for( i = 0; i < cnt; i++ )
    fgotoer( (int*)compiled_code, 0 );
  time(&stop_time);
  printf("code jumper time delta=%i\n", stop_time - start_time );
  start_time = stop_time;

  return 0;
}

Reply via email to