Pádraig Brady wrote: > seconds to sort 1 million ints: > ----------------------------------- > sort option time difference > ----------------------------------- > -n 2.75 > -h (ret 0) 3.10 +13% > -h 3.96 +44%
I removed a redundant to_uchar() and got it from 44% to 40% compared to -n. Note our existing -n implementation is mega fast, so at least we're not interfering with its implementation. Attached is the full patch, which hopefully we can push soon. cheers, Pádraig.
>From fef4d790423ff840d0217b6107db85b396728cb4 Mon Sep 17 00:00:00 2001 From: Michael Speer <knome...@gmail.com> Date: Mon, 27 Apr 2009 14:51:29 +0100 Subject: [PATCH] sort: new --human-numeric-sort option to sort KiB MB etc. * NEWS: document the new option * doc/coreutils.texi: ditto * src/sort.c (main): handle the new -human-numeric-sort option (-h). (human_numcompare): A new function to compare SI and IEC suffixes before falling back to the standard --numeric comparision. (find_unit_order): A new helper function to find the order of magnitude of a number string as determined by its suffix. (check_mixed_SI_IEC): A new helper function to exit with error if both SI and IEC suffixes are presented. * tests/misc/sort: Add 5 tests to test the new functionality. * THANKS: Update --- NEWS | 5 ++ THANKS | 1 + doc/coreutils.texi | 14 ++++++ src/sort.c | 115 ++++++++++++++++++++++++++++++++++++++++++++++++---- tests/misc/sort | 13 ++++++ 5 files changed, 140 insertions(+), 8 deletions(-) diff --git a/NEWS b/NEWS index 8cb17cc..bfebe1d 100644 --- a/NEWS +++ b/NEWS @@ -2,6 +2,11 @@ GNU coreutils NEWS -*- outline -*- * Noteworthy changes in release 7.3 (????-??-??) [?] +** New features + + sort accepts a new option, --human-numeric-sort (-h): sort numbers + while honouring human readable suffixes like KiB and MB etc. + ** Bug fixes sort -m no longer segfaults when its output file is also an input file. diff --git a/THANKS b/THANKS index 876a6b6..da48d6d 100644 --- a/THANKS +++ b/THANKS @@ -395,6 +395,7 @@ Michael J. Croghan mcrog...@usatoday.com Michael McFarland sid...@yahoo.com Michael McLagan mmcla...@invlogic.com Michael Piefel pie...@informatik.hu-berlin.de +Michael Speer knome...@gmail.com Michael Steffens michael.steff...@s.netic.de Michael Stone mst...@debian.org Michael Stutz st...@dsl.org diff --git a/doc/coreutils.texi b/doc/coreutils.texi index 918f44e..8f73419 100644 --- a/doc/coreutils.texi +++ b/doc/coreutils.texi @@ -3785,6 +3785,20 @@ Use this option only if there is no alternative; it is much slower than @option{--numeric-sort} (@option{-n}) and it can lose information when converting to floating point. +...@item -h +...@itemx --human-numeric-sort +...@itemx --sort=human-numeric +...@opindex -h +...@opindex --human-numeric-sort +...@opindex --sort +...@cindex human numeric sort +...@vindex LC_NUMERIC +Sort numerically, as per the @option{--numeric-sort} option, +and in addition handle IEC or SI suffixes like MiB, MB etc. +Note a mixture of these suffixes is not supported and will +be flagged as an error. Also the numbers must be abbreviated unambiguously. +I.E. 5000K and 6M will be sorted incorrectly for example. + @item -i @itemx --ignore-nonprinting @opindex -i diff --git a/src/sort.c b/src/sort.c index f48d727..4c1bec5 100644 --- a/src/sort.c +++ b/src/sort.c @@ -176,6 +176,8 @@ struct keyfield bool random; /* Sort by random hash of key. */ bool general_numeric; /* Flag for general, numeric comparison. Handle numbers in exponential notation. */ + bool human_numeric; /* Flag for sorting by human readable + units with either SI xor IEC prefixes. */ bool month; /* Flag for comparison by month name. */ bool reverse; /* Reverse the sense of comparison. */ bool version; /* sort by version number */ @@ -336,6 +338,9 @@ Ordering options:\n\ -i, --ignore-nonprinting consider only printable characters\n\ -M, --month-sort compare (unknown) < `JAN' < ... < `DEC'\n\ "), stdout); + fputs(_("\ + -h, --human-numeric-sort compare human readable numbers (e.g., 2K 1G)\n\ +"), stdout); fputs (_("\ -n, --numeric-sort compare according to string numerical value\n\ -R, --random-sort sort by random hash of keys\n\ @@ -344,8 +349,8 @@ Ordering options:\n\ "), stdout); fputs (_("\ --sort=WORD sort according to WORD:\n\ - general-numeric -g, month -M, numeric -n,\n\ - random -R, version -V\n\ + general-numeric -g, human-numeric -h, month -M,\n\ + numeric -n, random -R, version -V\n\ -V, --version-sort natural sort of (version) numbers within text\n\ \n\ "), stdout); @@ -426,7 +431,7 @@ enum SORT_OPTION }; -static char const short_options[] = "-bcCdfgik:mMno:rRsS:t:T:uVy:z"; +static char const short_options[] = "-bcCdfghik:mMno:rRsS:t:T:uVy:z"; static struct option const long_options[] = { @@ -442,6 +447,7 @@ static struct option const long_options[] = {"merge", no_argument, NULL, 'm'}, {"month-sort", no_argument, NULL, 'M'}, {"numeric-sort", no_argument, NULL, 'n'}, + {"human-numeric-sort", no_argument, NULL, 'h'}, {"version-sort", no_argument, NULL, 'V'}, {"random-sort", no_argument, NULL, 'R'}, {"random-source", required_argument, NULL, RANDOM_SOURCE_OPTION}, @@ -480,6 +486,7 @@ static char const check_types[] = #define SORT_TABLE \ _st_("general-numeric", 'g') \ + _st_("human-numeric", 'h') \ _st_("month", 'M') \ _st_("numeric", 'n') \ _st_("random", 'R') \ @@ -1673,6 +1680,87 @@ numcompare (const char *a, const char *b) return strnumcmp (a, b, decimal_point, thousands_sep); } +/* Exit with an error if a mixture of SI and IEC units detected. */ + +static void +check_mixed_SI_IEC (char prefix) +{ + static int seen_si = -1; + bool si_present = prefix == 'i'; + if (seen_si != -1 && seen_si != si_present) + error (SORT_FAILURE, 0, _("both SI and IEC prefixes present on units")); + seen_si = si_present; +} + +/* Return an integer which represents the order of magnitude of + the unit following the number. NUMBER can contain thousands separators + or a decimal point, but not have preceeding blanks. + Negative numbers return a negative unit order. */ + +static int +find_unit_order (const char *number) +{ + static const char orders [UCHAR_LIM] = { + ['K']=1, ['M']=2, ['G']=3, ['T']=4, ['P']=5, ['E']=6, ['Z']=7, ['Y']=8, + ['k']=1, + }; + + const unsigned char *p = number; + + int sign = 1; + + if (*p == '-') + { + sign = -1; + p++; + } + + /* Scan to end of number. + Decimals or separators not followed by digits stop the scan. + Numbers ending in decimals or separators are thus considered + to be lacking in units. + FIXME: add support for multibyte thousands_sep and decimal_point. */ + + while (ISDIGIT (*p)) + { + p++; + + if (*p == decimal_point && ISDIGIT (*(p+1))) + p += 2; + else if (*p == thousands_sep && ISDIGIT (*(p+1))) + p += 2; + } + + int order = orders[*p]; + + /* For valid units check for MiB vs MB etc. */ + if (order) + check_mixed_SI_IEC (*(p+1)); + + return sign * order; +} + +/* Compare numbers ending in units with SI xor IEC prefixes + <none/unknown> < K/k < M < G < T < P < E < Z < Y + Assume that numbers are properly abbreviated. + i.e. input will never have both 5000K and 6M. */ + +static int +human_numcompare (const char *a, const char *b) +{ + while (blanks[to_uchar (*a)]) + a++; + while (blanks[to_uchar (*b)]) + b++; + + int order_a = find_unit_order (a); + int order_b = find_unit_order (b); + + return (order_a > order_b ? 1 + : order_a < order_b ? -1 + : strnumcmp (a , b , decimal_point , thousands_sep)); +} + static int general_numcompare (const char *sa, const char *sb) { @@ -1917,13 +2005,14 @@ keycompare (const struct line *a, const struct line *b) if (key->random) diff = compare_random (texta, lena, textb, lenb); - else if (key->numeric | key->general_numeric) + else if (key->numeric | key->general_numeric | key->human_numeric) { char savea = *lima, saveb = *limb; *lima = *limb = '\0'; - diff = ((key->numeric ? numcompare : general_numcompare) - (texta, textb)); + diff = ((key->numeric ? numcompare + : key->general_numeric ? general_numcompare + : human_numcompare) (texta, textb)); *lima = savea, *limb = saveb; } else if (key->version) @@ -2889,7 +2978,7 @@ check_ordering_compatibility (void) for (key = keylist; key; key = key->next) if ((1 < (key->random + key->numeric + key->general_numeric + key->month - + key->version + !!key->ignore)) + + key->version + (!!key->ignore) + key->human_numeric)) || (key->random && key->translate)) { /* The following is too big, but guaranteed to be "big enough". */ @@ -2901,6 +2990,8 @@ check_ordering_compatibility (void) *p++ = 'f'; if (key->general_numeric) *p++ = 'g'; + if (key->human_numeric) + *p++ = 'h'; if (key->ignore == nonprinting) *p++ = 'i'; if (key->month) @@ -2992,6 +3083,9 @@ set_ordering (const char *s, struct keyfield *key, enum blanktype blanktype) case 'g': key->general_numeric = true; break; + case 'h': + key->human_numeric = true; + break; case 'i': /* Option order should not matter, so don't let -i override -d. -d implies -i, but -i does not imply -d. */ @@ -3140,7 +3234,8 @@ main (int argc, char **argv) gkey.sword = gkey.eword = SIZE_MAX; gkey.ignore = NULL; gkey.translate = NULL; - gkey.numeric = gkey.general_numeric = gkey.random = gkey.version = false; + gkey.numeric = gkey.general_numeric = gkey.human_numeric = false; + gkey.random = gkey.version = false; gkey.month = gkey.reverse = false; gkey.skipsblanks = gkey.skipeblanks = false; @@ -3219,6 +3314,7 @@ main (int argc, char **argv) case 'd': case 'f': case 'g': + case 'h': case 'i': case 'M': case 'n': @@ -3471,6 +3567,7 @@ main (int argc, char **argv) | key->numeric | key->version | key->general_numeric + | key->human_numeric | key->random))) { key->ignore = gkey.ignore; @@ -3480,6 +3577,7 @@ main (int argc, char **argv) key->month = gkey.month; key->numeric = gkey.numeric; key->general_numeric = gkey.general_numeric; + key->human_numeric = gkey.human_numeric; key->random = gkey.random; key->reverse = gkey.reverse; key->version = gkey.version; @@ -3495,6 +3593,7 @@ main (int argc, char **argv) | gkey.month | gkey.numeric | gkey.general_numeric + | gkey.human_numeric | gkey.random | gkey.version))) { diff --git a/tests/misc/sort b/tests/misc/sort index c5a2d74..76b271d 100755 --- a/tests/misc/sort +++ b/tests/misc/sort @@ -54,6 +54,19 @@ my @Tests = ["n11a", '-s -n -k1,1', {IN=>".01a\n.010\n"}, {OUT=>".01a\n.010\n"}], ["n11b", '-s -n -k1,1', {IN=>".010\n.01a\n"}, {OUT=>".010\n.01a\n"}], +#human readable suffixes +["h1", '-h', {IN=>"Y\nZ\nE\nP\nT\nG\nM\nK\n"}, + {OUT=>"K\nM\nG\nT\nP\nE\nZ\nY\n"}], +["h2", '-h', {IN=>"1M\n-2G\n-3K"}, {OUT=>"-2G\n-3K\n1M\n"}], +["h3", '-h', {IN=>"1Mi\n1M\n"}, {OUT=>""}, {EXIT=>2}, + {ERR=>"$prog: both SI and IEC prefixes present on units\n"}], +# decimal at end => ignore suffix +["h4", '-h', {IN=>"1.E\n2.M\n"}, {OUT=>"1.E\n2.M\n"}], +# double decimal => ignore suffix +["h5", '-h', {IN=>"1..2E\n2..2M\n"}, {OUT=>"1..2E\n2..2M\n"}], +# illustrate misordering of ambiguous abbreviations +["h6", '-h', {IN=>"1GiB\n1030MiB\n"}, {OUT=>"1030MiB\n1GiB\n"}], + ["01a", '', {IN=>"A\nB\nC\n"}, {OUT=>"A\nB\nC\n"}], # ["02a", '-c', {IN=>"A\nB\nC\n"}, {OUT=>''}], -- 1.5.3.6
_______________________________________________ Bug-coreutils mailing list Bug-coreutils@gnu.org http://lists.gnu.org/mailman/listinfo/bug-coreutils