> $ time ./wc -m long_lines.txt > 13357046 long_lines.txt > real 0m1.860s
It processes at a speed of 7 million characters per second. I would not call this a "horrible performance". > However wc calls mbrtowc() for each multibyte character. Yes. One could use mbstowcs (or mbsnrtowcs, but that exists in glibc only). Or one can avoid the calls to mbrtowc() when the character is in the "basic POSIX character set" (i.e. most of ASCII). This trick comes from Paul Eggert and is already implemented in gnulib's mbiter.h and mbswidth.c. Applied here, it hardly changes the code but speeds it up by a factor of 3. Timing with original coreutils-6.11: $ time wc -w < SuSE-9.0-DVD-ARCHIVES 6999399 real 2m26.211s user 2m8.553s sys 0m1.046s $ time wc -m < SuSE-9.0-DVD-ARCHIVES 120602576 real 2m17.754s user 2m8.164s sys 0m0.919s Timing with this patch: $ time /build/coreutils-6.11/src/wc -w < SuSE-9.0-DVD-ARCHIVES 6999399 real 0m42.101s user 0m40.179s sys 0m0.875s $ time /build/coreutils-6.11/src/wc -m < SuSE-9.0-DVD-ARCHIVES 120602576 real 0m41.609s user 0m40.171s sys 0m0.908s So the resulting counts are the same, and the user CPU time to process a 120 MB file is reduced from 128 sec to 40 sec, i.e. the speed increases from 0.94 MB/sec to 3.0 MB/sec. 2008-05-08 Bruno Haible <[EMAIL PROTECTED]> Speed up "wc -m" and "wc -w" in multibyte case. * src/wc.c: Include mbchar.h. (wc): New variable in_shift. Use it to avoid calling mbrtowc for most ASCII characters. *** coreutils-6.11/src/wc.c.bak 2008-04-19 23:34:23.000000000 +0200 --- coreutils-6.11/src/wc.c 2008-05-08 16:18:25.000000000 +0200 *************** *** 1,5 **** /* wc - print the number of lines, words, and bytes in files ! Copyright (C) 85, 91, 1995-2007 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by --- 1,5 ---- /* wc - print the number of lines, words, and bytes in files ! Copyright (C) 85, 91, 1995-2008 Free Software Foundation, Inc. 
This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by *************** *** 28,33 **** --- 28,34 ---- #include "system.h" #include "error.h" #include "inttostr.h" + #include "mbchar.h" #include "quote.h" #include "readtokens0.h" #include "safe-read.h" *************** *** 274,279 **** --- 275,281 ---- bool in_word = false; uintmax_t linepos = 0; mbstate_t state = { 0, }; + bool in_shift = false; # if SUPPORT_OLD_MBRTOWC /* Back-up the state before each multibyte character conversion and move the last incomplete character of the buffer to the front *************** *** 308,377 **** wchar_t wide_char; size_t n; ! # if SUPPORT_OLD_MBRTOWC ! backup_state = state; ! # endif ! n = mbrtowc (&wide_char, p, bytes_read, &state); ! if (n == (size_t) -2) { ! # if SUPPORT_OLD_MBRTOWC ! state = backup_state; ! # endif ! break; ! } ! if (n == (size_t) -1) ! { ! /* Remember that we read a byte, but don't complain ! about the error. Because of the decoding error, ! this is a considered to be byte but not a ! character (that is, chars is not incremented). */ ! p++; ! bytes_read--; } else { if (n == 0) { wide_char = 0; n = 1; } ! p += n; ! bytes_read -= n; ! chars++; ! switch (wide_char) { ! case '\n': ! lines++; ! /* Fall through. */ ! case '\r': ! case '\f': ! if (linepos > linelength) ! linelength = linepos; ! linepos = 0; ! goto mb_word_separator; ! case '\t': ! linepos += 8 - (linepos % 8); ! goto mb_word_separator; ! case ' ': ! linepos++; ! /* Fall through. */ ! case '\v': ! mb_word_separator: ! words += in_word; ! in_word = false; ! break; ! default: ! if (iswprint (wide_char)) ! { ! int width = wcwidth (wide_char); ! if (width > 0) ! linepos += width; ! if (iswspace (wide_char)) ! goto mb_word_separator; ! in_word = true; ! } ! break; } } } while (bytes_read > 0); --- 310,390 ---- wchar_t wide_char; size_t n; ! if (!in_shift && is_basic (*p)) { ! 
/* Handle most ASCII characters quickly, without calling ! mbrtowc(). */ ! n = 1; ! wide_char = *p; } else { + in_shift = true; + # if SUPPORT_OLD_MBRTOWC + backup_state = state; + # endif + n = mbrtowc (&wide_char, p, bytes_read, &state); + if (n == (size_t) -2) + { + # if SUPPORT_OLD_MBRTOWC + state = backup_state; + # endif + break; + } + if (n == (size_t) -1) + { + /* Remember that we read a byte, but don't complain + about the error. Because of the decoding error, + this is a considered to be byte but not a + character (that is, chars is not incremented). */ + p++; + bytes_read--; + continue; + } + if (mbsinit (&state)) + in_shift = false; if (n == 0) { wide_char = 0; n = 1; } ! } ! p += n; ! bytes_read -= n; ! chars++; ! switch (wide_char) ! { ! case '\n': ! lines++; ! /* Fall through. */ ! case '\r': ! case '\f': ! if (linepos > linelength) ! linelength = linepos; ! linepos = 0; ! goto mb_word_separator; ! case '\t': ! linepos += 8 - (linepos % 8); ! goto mb_word_separator; ! case ' ': ! linepos++; ! /* Fall through. */ ! case '\v': ! mb_word_separator: ! words += in_word; ! in_word = false; ! break; ! default: ! if (iswprint (wide_char)) { ! int width = wcwidth (wide_char); ! if (width > 0) ! linepos += width; ! if (iswspace (wide_char)) ! goto mb_word_separator; ! in_word = true; } + break; } } while (bytes_read > 0); _______________________________________________ Bug-coreutils mailing list Bug-coreutils@gnu.org http://lists.gnu.org/mailman/listinfo/bug-coreutils