#include <stdlib.h>
#include <stdio.h>

typedef char           gchar;
typedef unsigned char  guchar;
typedef long           glong;
typedef unsigned int   guint32;

#if defined(__GNUC__) && (__GNUC__ > 2) && defined(__OPTIMIZE__)
#define _G_BOOLEAN_EXPR(expr)                   \
 __extension__ ({                               \
   int _g_boolean_var_;                         \
   if (expr)                                    \
      _g_boolean_var_ = 1;                      \
   else                                         \
      _g_boolean_var_ = 0;                      \
   _g_boolean_var_;                             \
})
#define G_LIKELY(expr) (__builtin_expect (_G_BOOLEAN_EXPR(expr), 1))
#define G_UNLIKELY(expr) (__builtin_expect (_G_BOOLEAN_EXPR(expr), 0))
#else
#define G_LIKELY(expr) (expr)
#define G_UNLIKELY(expr) (expr)
#endif


/***************************************************************************/
/***************************************************************************/
/*****                                                                 *****/
/*****                   U T F - 8   T E X T                           *****/
/*****                                                                 *****/
/***************************************************************************/
/***************************************************************************/

static gchar  *utf8_test_text;
static int     utf8_test_text_len;    /* numbers of bytes      */
static int     utf8_test_text_count;  /* numbers of characters */


/***************************************************************************/
/***************************************************************************/
/*****                                                                 *****/
/*****        B E N C H M A R K I N G   S U P P O R T                  *****/
/*****                                                                 *****/
/***************************************************************************/
/***************************************************************************/

/* we really ripped some funcs from FreeType benchmarks
 */

#include <time.h>
#ifdef UNIX
#include <sys/time.h>
#endif

static double  bench_time = 3;   /* in seconds */

double
get_time(void)
{
#ifdef UNIX
  struct timeval tv;

  gettimeofday(&tv, NULL);
  return (double)tv.tv_sec + (double)tv.tv_usec / 1E6;
#else
  /* clock() has an awful precision (~10ms) under Linux 2.4 + glibc 2.2 */
  return (double)clock() / (double)CLOCKS_PER_SEC;
#endif
}


typedef char*
(*bench_func_t)( const char*  text,
                 long         offset );

#if 1

#define  BENCH(_func,_title)                                    \
{                                                               \
  int          i, n, done;                                      \
  double       t0, delta;                                       \
  const char*  text  = utf8_test_text;                          \
  const char*  end   = text + utf8_test_text_len;               \
  int          count = utf8_test_text_count;                    \
  int          check = 1;                                       \
                                                                \
  printf("%-30s : ", _title );                                  \
  fflush(stdout);                                               \
                                                                \
  {                                                             \
    int  nn;     /* check for buggy implementation */           \
                                                                \
    for ( nn = 0; nn <= utf8_test_text_count; nn++ )            \
    {                                                           \
      gchar*  s1 = (_func)(text,nn);                            \
      gchar*  s2 = g_utf8_offset_to_pointer( text, nn );        \
                                                                \
      if ( s1 != s2 )                                           \
      {                                                         \
        printf( "BUGGY at offset %d (%p instead of %p)!!\n",    \
                nn, s1, s2 );                                   \
        check = 0;                                              \
        break;                                                  \
      }                                                         \
    }                                                           \
  }                                                             \
                                                                \
  if ( check )                                                    \
  {                                                               \
    n    = 0;                                                     \
    done = 0;                                                     \
    t0   = get_time();                                            \
    do                                                            \
    {                                                             \
      const char*  result = NULL;                                 \
                                                                  \
      result = (_func)( text, utf8_test_text_count );             \
                                                                  \
      if ( result == end )                                        \
        done++;                                                   \
                                                                  \
      n++;                                                        \
      delta = get_time() - t0;                                    \
    }                                                             \
    while ( delta < bench_time);                                  \
                                                                  \
    printf("%5.3f us/op\n", delta * 1E6 / (double)done);          \
  }                                                               \
}

#else

#define  BENCH(_func,_title)                                    \
{                                                               \
  int          i, n, done;                                      \
  double       t0, delta;                                       \
  const char*  text  = utf8_test_text;                          \
  const char*  end   = text + utf8_test_text_len;               \
  int          count = utf8_test_text_count;                    \
                                                                \
  printf("%-30s : ", _title );                                  \
  fflush(stdout);                                               \
                                                                \
  n    = 0;                                                     \
  done = 0;                                                     \
  t0   = get_time();                                            \
  do                                                            \
  {                                                             \
    int          nn;                                            \
    const char*  result = NULL;                                 \
                                                                \
    for ( nn = 0; nn <= count; nn++ )                           \
      result = (_func)( text, nn );                             \
                                                                \
    if ( result == end )                                        \
      done++;                                                   \
                                                                \
    n++;                                                        \
    delta = get_time() - t0;                                    \
  }                                                             \
  while ( delta < bench_time);                                  \
                                                                \
  printf("%5.3f ms/op\n", delta * 1E3 / (double)done);          \
}

#endif

/***************************************************************************/
/***************************************************************************/
/*****                                                                 *****/
/*****                   U T F - 8   S K I P P I N G                   *****/
/*****                                                                 *****/
/***************************************************************************/
/***************************************************************************/


static const gchar utf8_skip_data[256] = {
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
  3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1
};

const gchar * const g_utf8_skip = utf8_skip_data;

#define g_utf8_next_char(p) (char *)((p) + g_utf8_skip[*(guchar *)(p)])


gchar *
g_utf8_offset_to_pointer (const gchar *str,
                          glong        offset)
{
  const gchar *s = str;
  while (offset--)
    s = g_utf8_next_char (s);

  return (gchar *)s;
}


gchar *
g_utf8_offset_to_pointer_sven (const gchar *str,
                               glong        offset)
{
  static const unsigned char * const sven_utf8_skip = utf8_skip_data + 0xC0;

#define sven_utf8_next_char(p) ( (p) + (*(p) < 0xc0 ? 1 : sven_utf8_skip[*(p) & 0x3F]) )
  
  const guchar *s = (const guchar*)str;

  while (offset--)
    s = sven_utf8_next_char (s);

  return (gchar *)s;
}


gchar *
g_utf8_offset_to_pointer_test191 (const gchar *str,
                                  glong        offset)
{
  const guchar *s = (const guchar*)str;

  while (offset--)
    if (*s < 191) s++;
    else s = g_utf8_next_char (s);

  return (gchar *)s;
}

/* note that all variants expect proper UTF-8, restricted to 0 .. 0x10FFFF !!
 */

gchar *
g_utf8_offset_to_pointer_hibits (const gchar *str,
                                 glong        offset)
{
  static const int  steps[16] =
  {
    1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 2, 2, 3, 4
  };

  const gchar *s = str;
  while (offset--)
    s += steps[*(guchar*)s >> 4];

  return (gchar *)s;
}


#if 0  /* BUGGY !! */
gchar*
g_utf8_offset_to_pointer_testC0( const gchar  *str,
                                 glong         offset )
{
  while (offset)
  {
    if ((*++str & 0xC0) != 0x80)
     --offset ;
  }
  return (gchar *)str;
}


gchar*
g_utf8_offset_to_pointer_shifttest( const gchar  *str,
                                    glong         offset )
{
  while (offset)
  {
    if ( (((guchar)*++str) >> 6) != 2 )
     --offset ;
  }
  return (gchar *)str;
}
#endif /* BUGGY !! */

gchar *
g_utf8_offset_to_pointer_ifelse(const gchar *str,
                                glong        offset)
{
  const gchar *s = str;

  while (offset--)
  {
    int  c = *(guchar*)(s)++;

    if ( c >= 0x80 )
    {
      if ( c >= 0xF0 )
        s += 3;
      else if ( c >= 0xE0 )
        s += 2;
      else if ( c >= 0xC0 )
        s += 1;
    }
  }
  return (gchar*)s;
}


gchar *
g_utf8_offset_to_pointer_cond(const gchar *str,
                              glong        offset)
{
  const gchar *s = str;

  while (offset--)
  {
    int  c = *(guchar*)(s);
    int  z;

    s += 1 + (c >= 0xF0) + (c >= 0xE0) + (c >= 0xC0);
  }
  return (gchar*)s;
}


gchar *
g_utf8_offset_to_pointer_bittricks(const gchar *str,
                                   glong        offset)
{
  const gchar *s = str;

  while (offset--)
  {
    int  c = *(guchar*)(s);
    int  z;

    s += 1;
    z  = (0xEF-c) >> 31; s -= z;
    z  = (0xDF-c) >> 31; s -= z;
    z  = (0xBF-c) >> 31; s -= z;
  }
  return (gchar*)s;
}

gchar *
g_utf8_offset_to_pointer_words(const gchar *str,
                               glong        offset)
{
  union
  {
    const guint32  *p32;
    const gchar    *p8;
  }
  pt;

  pt.p8 = str;

  while (offset >= 4)
    {
      guint32 seg    = *pt.p32;
      guint32 seg_hi = seg & 0x80808080;

      if (!seg_hi)
        {
          pt.p32++;
          offset -= 4;
          continue;
        }
      else if (seg_hi == 0x80808080)
        {
          if ((seg & 0x80e080e0) == 0x80c080c0)
	    {
              pt.p32++;
              offset -= 2;
              continue;
            }

          if ((seg & 0xf08080f0) == 0xe08080e0)
            {
              pt.p8 += 6;
              offset -= 2;
              continue;
            }
        }

      pt.p8 += g_utf8_skip [(guchar) seg];
      offset--;
    }

  for ( ; offset; offset--)
    pt.p8 = g_utf8_next_char (pt.p8);

  return (gchar *) pt.p8;
}


gchar *
g_utf8_offset_to_pointer_words_glikely(const gchar *str,
                                       glong        offset)
{
  union
  {
    const guint32  *p32;
    const gchar    *p8;
  }
  pt;

  pt.p8 = str;

  while (offset >= 4)
    {
      guint32 seg    = *pt.p32;
      guint32 seg_hi = seg & 0x80808080;

      if (!seg_hi)
        {
          pt.p32++;
          offset -= 4;
          continue;
        }
      else if G_LIKELY (seg_hi == 0x80808080)
        {
          if ((seg & 0x80e080e0) == 0x80c080c0)
	    {
              pt.p32++;
              offset -= 2;
              continue;
            }

          if ((seg & 0xf08080f0) == 0xe08080e0)
            {
              pt.p8 += 6;
              offset -= 2;
              continue;
            }
        }

      pt.p8 += g_utf8_skip [(guchar) seg];
      offset--;
    }

  for ( ; offset; offset--)
    pt.p8 = g_utf8_next_char (pt.p8);

  return (gchar *) pt.p8;
}


/***************************************************************************/
/***************************************************************************/
/*****                                                                 *****/
/*****                   M A I N - D O H !                             *****/
/*****                                                                 *****/
/***************************************************************************/
/***************************************************************************/

static void
usage( const char*  progname )
{
  fprintf( stderr, "usage: %s utf8file\n", progname );
  exit(2);
}

int main( int  argc, const char* const*  argv )
{
  const char*  progname = argv[0];
  FILE*        file;

  if ( argc != 2 )
    usage( progname );

  file = fopen( argv[1], "rb" );
  if ( file == NULL )
  {
    fprintf( stderr, "could not open '%s'\n", argv[1] );
    exit(3);
  }

  fseek( file, 0, SEEK_END );
  utf8_test_text_len = ftell( file );
  fseek( file, 0, SEEK_SET );

  utf8_test_text = malloc( utf8_test_text_len );
  if ( utf8_test_text == NULL )
  {
    fprintf( stderr, "not enough memory to allocate %ld bytes\n",
                     utf8_test_text_len );
    exit(4);
  }

  fread( utf8_test_text, utf8_test_text_len, 1, file );
  fclose( file );


  /* crummy init */
  {
    gchar  *s = utf8_test_text;

    utf8_test_text_count = 0;

    while ( s < utf8_test_text + utf8_test_text_len && *s )
    {
      utf8_test_text_count++;
      s = g_utf8_next_char(s);
    }

    utf8_test_text_len = s - utf8_test_text;
  }

  /* now, do the work */
  BENCH( g_utf8_offset_to_pointer,            "original" );
  BENCH( g_utf8_offset_to_pointer_sven,       "sven neumann's" );
  BENCH( g_utf8_offset_to_pointer_test191,    "test < 0xC0" );
  BENCH( g_utf8_offset_to_pointer_words,      "32-bits at a time" );
  BENCH( g_utf8_offset_to_pointer_words,      "32-bits at a time + G_LIKELY" );
  BENCH( g_utf8_offset_to_pointer_hibits,     "hi-bits only" );
  BENCH( g_utf8_offset_to_pointer_ifelse,     "with if..else" );
  BENCH( g_utf8_offset_to_pointer_cond,       "with conditionals" );
  BENCH( g_utf8_offset_to_pointer_bittricks,  "with bit tricks" );

#if 0  /* BUGGY */
  BENCH( g_utf8_offset_to_pointer_testC0,     "testing 0xC0" );
  BENCH( g_utf8_offset_to_pointer_shifttest,  "testing with >> 6" );
#endif

  return 0;
}

