Re: horrible utf-8 performance in wc

Bruno Haible Thu, 08 May 2008 07:40:18 -0700

> $ time ./wc -m long_lines.txt
> 13357046 long_lines.txt
> real    0m1.860s


It processes text at a rate of about 7 million characters per second. I would
not call this "horrible performance".

> However wc calls mbrtowc() for each multibyte character.

Yes. One could use mbstowcs (or mbsnrtowcs, but that exists in glibc only).
Alternatively, one can avoid the calls to mbrtowc() when the character is in
the "basic POSIX character set" (i.e. most of ASCII). This trick comes from
Paul Eggert and is already implemented in gnulib's mbiter.h and mbswidth.c.
Applied here, it hardly changes the code but speeds it up by a factor of 3.

Timing with original coreutils-6.11:
$ time wc -w < SuSE-9.0-DVD-ARCHIVES 
6999399

real    2m26.211s
user    2m8.553s
sys     0m1.046s
$ time wc -m < SuSE-9.0-DVD-ARCHIVES 
120602576

real    2m17.754s
user    2m8.164s
sys     0m0.919s

Timing with this patch:
$ time /build/coreutils-6.11/src/wc -w < SuSE-9.0-DVD-ARCHIVES 
6999399

real    0m42.101s
user    0m40.179s
sys     0m0.875s
$ time /build/coreutils-6.11/src/wc -m < SuSE-9.0-DVD-ARCHIVES 
120602576

real    0m41.609s
user    0m40.171s
sys     0m0.908s

So the resulting counts are the same, and the user CPU time to process a
120 MB file is reduced from 128 sec to 40 sec, i.e. the speed increases from
0.94 MB/sec to 3.0 MB/sec.


2008-05-08  Bruno Haible  <[EMAIL PROTECTED]>

        Speed up "wc -m" and "wc -w" in multibyte case.
        * src/wc.c: Include mbchar.h.
        (wc): New variable in_shift. Use it to avoid calling mbrtowc for most
        ASCII characters.

*** coreutils-6.11/src/wc.c.bak 2008-04-19 23:34:23.000000000 +0200
--- coreutils-6.11/src/wc.c     2008-05-08 16:18:25.000000000 +0200
***************
*** 1,5 ****
  /* wc - print the number of lines, words, and bytes in files
!    Copyright (C) 85, 91, 1995-2007 Free Software Foundation, Inc.
  
     This program is free software: you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
--- 1,5 ----
  /* wc - print the number of lines, words, and bytes in files
!    Copyright (C) 85, 91, 1995-2008 Free Software Foundation, Inc.
  
     This program is free software: you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
***************
*** 28,33 ****
--- 28,34 ----
  #include "system.h"
  #include "error.h"
  #include "inttostr.h"
+ #include "mbchar.h"
  #include "quote.h"
  #include "readtokens0.h"
  #include "safe-read.h"
***************
*** 274,279 ****
--- 275,281 ----
        bool in_word = false;
        uintmax_t linepos = 0;
        mbstate_t state = { 0, };
+       bool in_shift = false;
  # if SUPPORT_OLD_MBRTOWC
        /* Back-up the state before each multibyte character conversion and
         move the last incomplete character of the buffer to the front
***************
*** 308,377 ****
              wchar_t wide_char;
              size_t n;
  
! # if SUPPORT_OLD_MBRTOWC
!             backup_state = state;
! # endif
!             n = mbrtowc (&wide_char, p, bytes_read, &state);
!             if (n == (size_t) -2)
                {
! # if SUPPORT_OLD_MBRTOWC
!                 state = backup_state;
! # endif
!                 break;
!               }
!             if (n == (size_t) -1)
!               {
!                 /* Remember that we read a byte, but don't complain
!                    about the error.  Because of the decoding error,
!                    this is a considered to be byte but not a
!                    character (that is, chars is not incremented).  */
!                 p++;
!                 bytes_read--;
                }
              else
                {
                  if (n == 0)
                    {
                      wide_char = 0;
                      n = 1;
                    }
!                 p += n;
!                 bytes_read -= n;
!                 chars++;
!                 switch (wide_char)
                    {
!                   case '\n':
!                     lines++;
!                     /* Fall through. */
!                   case '\r':
!                   case '\f':
!                     if (linepos > linelength)
!                       linelength = linepos;
!                     linepos = 0;
!                     goto mb_word_separator;
!                   case '\t':
!                     linepos += 8 - (linepos % 8);
!                     goto mb_word_separator;
!                   case ' ':
!                     linepos++;
!                     /* Fall through. */
!                   case '\v':
!                   mb_word_separator:
!                     words += in_word;
!                     in_word = false;
!                     break;
!                   default:
!                     if (iswprint (wide_char))
!                       {
!                         int width = wcwidth (wide_char);
!                         if (width > 0)
!                           linepos += width;
!                         if (iswspace (wide_char))
!                           goto mb_word_separator;
!                         in_word = true;
!                       }
!                     break;
                    }
                }
            }
          while (bytes_read > 0);
--- 310,390 ----
              wchar_t wide_char;
              size_t n;
  
!             if (!in_shift && is_basic (*p))
                {
!                 /* Handle most ASCII characters quickly, without calling
!                    mbrtowc().  */
!                 n = 1;
!                 wide_char = *p;
                }
              else
                {
+                 in_shift = true;
+ # if SUPPORT_OLD_MBRTOWC
+                 backup_state = state;
+ # endif
+                 n = mbrtowc (&wide_char, p, bytes_read, &state);
+                 if (n == (size_t) -2)
+                   {
+ # if SUPPORT_OLD_MBRTOWC
+                     state = backup_state;
+ # endif
+                     break;
+                   }
+                 if (n == (size_t) -1)
+                   {
+                     /* Remember that we read a byte, but don't complain
+                        about the error.  Because of the decoding error,
+                        this is a considered to be byte but not a
+                        character (that is, chars is not incremented).  */
+                     p++;
+                     bytes_read--;
+                     continue;
+                   }
+                 if (mbsinit (&state))
+                   in_shift = false;
                  if (n == 0)
                    {
                      wide_char = 0;
                      n = 1;
                    }
!               }
!             p += n;
!             bytes_read -= n;
!             chars++;
!             switch (wide_char)
!               {
!               case '\n':
!                 lines++;
!                 /* Fall through. */
!               case '\r':
!               case '\f':
!                 if (linepos > linelength)
!                   linelength = linepos;
!                 linepos = 0;
!                 goto mb_word_separator;
!               case '\t':
!                 linepos += 8 - (linepos % 8);
!                 goto mb_word_separator;
!               case ' ':
!                 linepos++;
!                 /* Fall through. */
!               case '\v':
!               mb_word_separator:
!                 words += in_word;
!                 in_word = false;
!                 break;
!               default:
!                 if (iswprint (wide_char))
                    {
!                     int width = wcwidth (wide_char);
!                     if (width > 0)
!                       linepos += width;
!                     if (iswspace (wide_char))
!                       goto mb_word_separator;
!                     in_word = true;
                    }
+                 break;
                }
            }
          while (bytes_read > 0);



_______________________________________________
Bug-coreutils mailing list
Bug-coreutils@gnu.org
http://lists.gnu.org/mailman/listinfo/bug-coreutils

Re: horrible utf-8 performance in wc

Reply via email to