Re: Human readable sort
Pádraig Brady wrote: > I was thinking that the mixed IEC/SI check > should be applied to each key separately rather > than globally. What do you think? Patch attached. > >>From ef06a30d122fc9ccac51a682a3abf6868d8832d6 Mon Sep 17 00:00:00 2001 ... > -check_mixed_SI_IEC (char prefix) > +check_mixed_SI_IEC (char prefix, struct keyfield *key) > { > - static int seen_si = -1; > - bool si_present = prefix == 'i'; > - if (seen_si != -1 && seen_si != si_present) > + int si_present = prefix == 'i'; > + if (key->si_present != -1 && si_present != key->si_present) > error (SORT_FAILURE, 0, _("both SI and IEC prefixes present on units")); Good idea. Not part of this change, I know, but that diagnostic should include a file name. Imagine sorting many files, with many key specifiers (hence many columns of data) and very many lines, yet only a few offenders. With a file_name:line number (and byte/char-count?) and maybe even a sample of the offending data, it'd be easier to spot and correct the problem. ___ Bug-coreutils mailing list Bug-coreutils@gnu.org http://lists.gnu.org/mailman/listinfo/bug-coreutils
Re: Human readable sort
I was thinking that the mixed IEC/SI check should be applied to each key separately rather than globally. What do you think? Patch attached. cheers, Pádraig. >From ef06a30d122fc9ccac51a682a3abf6868d8832d6 Mon Sep 17 00:00:00 2001 From: =?utf-8?q?P=C3=A1draig=20Brady?= Date: Tue, 30 Jun 2009 00:52:43 +0100 Subject: [PATCH] sort: allow SI and IEC units on separate human sort fields * src/sort.c: Store the si_present state per key rather than globally * tests/misc/sort: Add a corresponding check that previously failed --- src/sort.c | 30 -- tests/misc/sort |2 ++ 2 files changed, 18 insertions(+), 14 deletions(-) diff --git a/src/sort.c b/src/sort.c index 6acec07..62ddd49 100644 --- a/src/sort.c +++ b/src/sort.c @@ -178,6 +178,7 @@ struct keyfield Handle numbers in exponential notation. */ bool human_numeric; /* Flag for sorting by human readable units with either SI xor IEC prefixes. */ + int si_present; /* Flag for checking for mixed SI and IEC. */ bool month; /* Flag for comparison by month name. */ bool reverse; /* Reverse the sense of comparison. */ bool version; /* sort by version number */ @@ -1684,13 +1685,12 @@ numcompare (const char *a, const char *b) /* Exit with an error if a mixture of SI and IEC units detected. */ static void -check_mixed_SI_IEC (char prefix) +check_mixed_SI_IEC (char prefix, struct keyfield *key) { - static int seen_si = -1; - bool si_present = prefix == 'i'; - if (seen_si != -1 && seen_si != si_present) + int si_present = prefix == 'i'; + if (key->si_present != -1 && si_present != key->si_present) error (SORT_FAILURE, 0, _("both SI and IEC prefixes present on units")); - seen_si = si_present; + key->si_present = si_present; } /* Return an integer which represents the order of magnitude of @@ -1699,7 +1699,7 @@ check_mixed_SI_IEC (char prefix) Negative numbers return a negative unit order. 
*/ static int -find_unit_order (const char *number) +find_unit_order (const char *number, struct keyfield *key) { static const char orders [UCHAR_LIM] = { ['K']=1, ['M']=2, ['G']=3, ['T']=4, ['P']=5, ['E']=6, ['Z']=7, ['Y']=8, @@ -1736,7 +1736,7 @@ find_unit_order (const char *number) /* For valid units check for MiB vs MB etc. */ if (order) -check_mixed_SI_IEC (*(p + 1)); +check_mixed_SI_IEC (*(p + 1), key); return sign * order; } @@ -1747,15 +1747,15 @@ find_unit_order (const char *number) i.e. input will never have both 6000K and 5M. */ static int -human_numcompare (const char *a, const char *b) +human_numcompare (const char *a, const char *b, struct keyfield *key) { while (blanks[to_uchar (*a)]) a++; while (blanks[to_uchar (*b)]) b++; - int order_a = find_unit_order (a); - int order_b = find_unit_order (b); + int order_a = find_unit_order (a, key); + int order_b = find_unit_order (b, key); return (order_a > order_b ? 1 : order_a < order_b ? -1 @@ -1982,7 +1982,7 @@ compare_version (char *restrict texta, size_t lena, static int keycompare (const struct line *a, const struct line *b) { - struct keyfield const *key = keylist; + struct keyfield *key = keylist; /* For the first iteration only, the key positions have been precomputed for us. */ @@ -2015,9 +2015,9 @@ keycompare (const struct line *a, const struct line *b) char savea = *lima, saveb = *limb; *lima = *limb = '\0'; - diff = ((key->numeric ? numcompare - : key->general_numeric ? general_numcompare - : human_numcompare) (texta, textb)); + diff = (key->numeric ? numcompare (texta, textb) + : key->general_numeric ? 
general_numcompare (texta, textb) + : human_numcompare (texta, textb, key)); *lima = savea, *limb = saveb; } else if (key->version) @@ -3125,6 +3125,7 @@ key_init (struct keyfield *key) { memset (key, 0, sizeof *key); key->eword = SIZE_MAX; + key->si_present = -1; return key; } @@ -3240,6 +3241,7 @@ main (int argc, char **argv) gkey.ignore = NULL; gkey.translate = NULL; gkey.numeric = gkey.general_numeric = gkey.human_numeric = false; + gkey.si_present = -1; gkey.random = gkey.version = false; gkey.month = gkey.reverse = false; gkey.skipsblanks = gkey.skipeblanks = false; diff --git a/tests/misc/sort b/tests/misc/sort index 21e7af8..1340500 100755 --- a/tests/misc/sort +++ b/tests/misc/sort @@ -71,6 +71,8 @@ my @Tests = {ERR=>"$prog: options `-hn' are incompatible\n"}], # check key processing ["h8", '-n -k2,2h', {IN=>"1 1E\n2 2M\n"}, {OUT=>"2 2M\n1 1E\n"}], +# SI and IEC prefixes on separate keys allowed +["h9", '-h -k1,1 -k2,2', {IN=>"1M 1Mi\n1M 1Mi\n"}, {OUT=>"1M 1Mi\n1M 1Mi\n"}], ["01a", '', {IN=>"A\nB\nC\n"}, {OUT=>"A\nB\nC\n"}], # -- 1.6.2.5 ___ Bug-coreutils mailing list Bug-coreutils@gnu.org http://lists.gnu.org/mailman/listinfo/bug-coreutils
Re: Human readable sort
Pádraig Brady wrote: > Latest version attached. ... > * NEWS: Document the new option > * doc/coreutils.texi (sort invocation): ditto > * src/sort.c (main): handle the new --human-numeric-sort option (-h). > (human_numcompare): A new function to compare SI and IEC suffixes > before falling back to the standard --numeric comparison. > (find_unit_order): A new helper function to find the order > of magnitude of a number string as determined by its suffix. > (check_mixed_SI_IEC): A new helper function to exit with error > if both SI and IEC suffixes are presented. Looks fine. Thank you, Pádraig and Michael. ___ Bug-coreutils mailing list Bug-coreutils@gnu.org http://lists.gnu.org/mailman/listinfo/bug-coreutils
Re: Human readable sort
Jim Meyering wrote: Maybe s/unambiguously/consistently/ or /uniformly/ and mention that it's the inconsistent precision that causes trouble. J. Random Bystander (i.e. me) prefers "consistently". ("Consistent", as in, using the same rules. "Uniformly" to me suggests maybe some other condition(s) as well.) Thanks for adjusting the example. Also, thanks for finally pushing this forward, it's been wanted for quite some time :-). -- Matthew Please do not quote my e-mail address unobfuscated in message bodies. -- Sorry, not a winner. Please try again. ___ Bug-coreutils mailing list Bug-coreutils@gnu.org http://lists.gnu.org/mailman/listinfo/bug-coreutils
Re: Human readable sort
Jim Meyering wrote: > > Please make one small change to that log message: > > s/ -human/ --human/ > > And in the documentation, > > +Sort numerically, as per the @option{--numeric-sort} option, > +and in addition handle IEC or SI suffixes like MiB, MB etc. > +Note a mixture of these suffixes is not supported and will > +be flagged as an error. Also the numbers must be abbreviated unambiguously. > +I.E. 5000K and 6M will be sorted incorrectly for example. > > Eventually, it'd be nice to explain in detail why those > would cause trouble. > > Maybe s/unambiguously/consistently/ > or /uniformly/ > and mention that it's the inconsistent precision that causes trouble. > > Hmm actually those two *are* sorted properly for me: > > $ printf '%s\n' 5000K 6M| src/sort --human > 5000K > 6M > > However, these two are not: > > $ printf '%s\n' 7000K 6M| src/sort --human > 7000K > 6M > Latest version attached. cheers, Pádraig. >From 159faba1376ffd5a46fe4bbc780d85dd3e502cea Mon Sep 17 00:00:00 2001 From: Michael Speer Date: Mon, 27 Apr 2009 14:51:29 +0100 Subject: [PATCH] sort: new --human-numeric-sort option to sort KiB MB etc. * NEWS: Document the new option * doc/coreutils.texi (sort invocation): ditto * src/sort.c (main): handle the new --human-numeric-sort option (-h). (human_numcompare): A new function to compare SI and IEC suffixes before falling back to the standard --numeric comparison. (find_unit_order): A new helper function to find the order of magnitude of a number string as determined by its suffix. (check_mixed_SI_IEC): A new helper function to exit with error if both SI and IEC suffixes are presented. * tests/misc/sort: Add 8 tests to test the new functionality. 
* THANKS: Update --- NEWS |3 + THANKS |1 + doc/coreutils.texi | 15 +++ src/sort.c | 115 tests/misc/sort| 18 5 files changed, 144 insertions(+), 8 deletions(-) diff --git a/NEWS b/NEWS index 3af06e4..29b09a0 100644 --- a/NEWS +++ b/NEWS @@ -11,6 +11,9 @@ GNU coreutils NEWS-*- outline -*- chroot now accepts the options --userspec and --groups. + sort accepts a new option, --human-numeric-sort (-h): sort numbers + while honoring human readable suffixes like KiB and MB etc. + * Noteworthy changes in release 7.4 (2009-05-07) [stable] diff --git a/THANKS b/THANKS index cf801c5..4392f04 100644 --- a/THANKS +++ b/THANKS @@ -396,6 +396,7 @@ Michael J. Croghan mcrog...@usatoday.com Michael McFarland sid...@yahoo.com Michael McLagan mmcla...@invlogic.com Michael Piefel pie...@informatik.hu-berlin.de +Michael Speer knome...@gmail.com Michael Steffensmichael.steff...@s.netic.de Michael Stone mst...@debian.org Michael Stutz st...@dsl.org diff --git a/doc/coreutils.texi b/doc/coreutils.texi index 97ea830..834bd46 100644 --- a/doc/coreutils.texi +++ b/doc/coreutils.texi @@ -3785,6 +3785,21 @@ Use this option only if there is no alternative; it is much slower than @option{--numeric-sort} (@option{-n}) and it can lose information when converting to floating point. +...@item -h +...@itemx --human-numeric-sort +...@itemx --sort=human-numeric +...@opindex -h +...@opindex --human-numeric-sort +...@opindex --sort +...@cindex human numeric sort +...@vindex LC_NUMERIC +Sort numerically, as per the @option{--numeric-sort} option below, and in +addition handle IEC or SI suffixes like MiB, MB etc (@ref{Block size}). +Note a mixture of IEC and SI suffixes is not supported and will +be flagged as an error. Also the numbers must be abbreviated uniformly. +I.E. values with different precisions like 6000K and 5M will be sorted +incorrectly. 
+ @item -i @itemx --ignore-nonprinting @opindex -i diff --git a/src/sort.c b/src/sort.c index 6dea2ff..8438c05 100644 --- a/src/sort.c +++ b/src/sort.c @@ -176,6 +176,8 @@ struct keyfield bool random; /* Sort by random hash of key. */ bool general_numeric; /* Flag for general, numeric comparison. Handle numbers in exponential notation. */ + bool human_numeric; /* Flag for sorting by human readable + units with either SI xor IEC prefixes. */ bool month; /* Flag for comparison by month name. */ bool reverse; /* Reverse the sense of comparison. */ bool version; /* sort by version number */ @@ -337,6 +339,9 @@ Ordering options:\n\ -M, --month-sortcompare (unknown) < `JAN' < ... < `DEC'\n\ "), stdout); fputs (_("\ + -h, --human-numeric-sortcompare human readable numbers (e.g., 2K 1G)\n\ +"), stdout); + fputs (_("\ -n, --numeric-sort compare according to string numerical value\n\ -R, --random-sort sort by random hash of keys\n\ --r
Re: Human readable sort
Pádraig Brady wrote: >>From 75bb07bb620d37d26467ab86ffcf73d47479b358 Mon Sep 17 00:00:00 2001 > From: Michael Speer > Date: Mon, 27 Apr 2009 14:51:29 +0100 > Subject: [PATCH] sort: new --human-numeric-sort option to sort KiB MB etc. > > * NEWS: Document the new option > * doc/coreutils.texi (sort invocation): ditto > * src/sort.c (main): handle the new -human-numeric-sort option (-h). Please make one small change to that log message: s/ -human/ --human/ And in the documentation, +Sort numerically, as per the @option{--numeric-sort} option, +and in addition handle IEC or SI suffixes like MiB, MB etc. +Note a mixture of these suffixes is not supported and will +be flagged as an error. Also the numbers must be abbreviated unambiguously. +I.E. 5000K and 6M will be sorted incorrectly for example. Eventually, it'd be nice to explain in detail why those would cause trouble. Maybe s/unambiguously/consistently/ or /uniformly/ and mention that it's the inconsistent precision that causes trouble. Hmm actually those two *are* sorted properly for me: $ printf '%s\n' 5000K 6M| src/sort --human 5000K 6M However, these two are not: $ printf '%s\n' 7000K 6M| src/sort --human 7000K 6M ___ Bug-coreutils mailing list Bug-coreutils@gnu.org http://lists.gnu.org/mailman/listinfo/bug-coreutils
Re: Human readable sort
Jim Meyering wrote: > Pádraig Brady wrote: >> Eric Blake wrote: >>> Pádraig Brady draigBrady.com> writes: > ... >>> +static int >>> +find_unit_order (const char *number) >>> +{ >>> + static const char orders [UCHAR_LIM] = { >>> +['K']=1, ['M']=2, ['G']=3, ['T']=4, ['P']=5, ['E']=6, ['Z']=7, ['Y']=8, >>> +['k']=1, >>> + }; >>> >>> This assumes more of C99 than we have previously required. Are we sure that >>> all compilers out there will support this syntax? >> Designated Initializers were a GNU C C89 extension. >> So I thought they were both elegant and not too new. >> I've not got access to older machines to test unfortunately. > > Since we've been requiring declaration-after-statement support > for some time now, using a feature like the above should be safe. > I think it is worthwhile, too. I did a lot off googling last night to confirm that designated initializers are very widely supported. In fact we've been using them since coreutils 7.1 (0889381c) Attached is an updated version with 2 new tests. cheers, Pádraig. >From 75bb07bb620d37d26467ab86ffcf73d47479b358 Mon Sep 17 00:00:00 2001 From: Michael Speer Date: Mon, 27 Apr 2009 14:51:29 +0100 Subject: [PATCH] sort: new --human-numeric-sort option to sort KiB MB etc. * NEWS: Document the new option * doc/coreutils.texi (sort invocation): ditto * src/sort.c (main): handle the new -human-numeric-sort option (-h). (human_numcompare): A new function to compare SI and IEC suffixes before falling back to the standard --numeric comparison. (find_unit_order): A new helper function to find the order of magnitude of a number string as determined by its suffix. (check_mixed_SI_IEC): A new helper function to exit with error if both SI and IEC suffixes are presented. * tests/misc/sort: Add 8 tests to test the new functionality. 
* THANKS: Update --- NEWS |5 ++ THANKS |1 + doc/coreutils.texi | 14 ++ src/sort.c | 115 tests/misc/sort| 18 5 files changed, 145 insertions(+), 8 deletions(-) diff --git a/NEWS b/NEWS index 31f1b1a..f28097d 100644 --- a/NEWS +++ b/NEWS @@ -2,6 +2,11 @@ GNU coreutils NEWS-*- outline -*- * Noteworthy changes in release ?.? (-??-??) [?] +** New features + + sort accepts a new option, --human-numeric-sort (-h): sort numbers + while honoring human readable suffixes like KiB and MB etc. + ** Bug fixes truncate -s failed to skip all whitespace in the option argument in diff --git a/THANKS b/THANKS index cf801c5..4392f04 100644 --- a/THANKS +++ b/THANKS @@ -396,6 +396,7 @@ Michael J. Croghan mcrog...@usatoday.com Michael McFarland sid...@yahoo.com Michael McLagan mmcla...@invlogic.com Michael Piefel pie...@informatik.hu-berlin.de +Michael Speer knome...@gmail.com Michael Steffensmichael.steff...@s.netic.de Michael Stone mst...@debian.org Michael Stutz st...@dsl.org diff --git a/doc/coreutils.texi b/doc/coreutils.texi index 1a3075f..ae5c577 100644 --- a/doc/coreutils.texi +++ b/doc/coreutils.texi @@ -3785,6 +3785,20 @@ Use this option only if there is no alternative; it is much slower than @option{--numeric-sort} (@option{-n}) and it can lose information when converting to floating point. +...@item -h +...@itemx --human-numeric-sort +...@itemx --sort=human-numeric +...@opindex -h +...@opindex --human-numeric-sort +...@opindex --sort +...@cindex human numeric sort +...@vindex LC_NUMERIC +Sort numerically, as per the @option{--numeric-sort} option, +and in addition handle IEC or SI suffixes like MiB, MB etc. +Note a mixture of these suffixes is not supported and will +be flagged as an error. Also the numbers must be abbreviated unambiguously. +I.E. 5000K and 6M will be sorted incorrectly for example. 
+ @item -i @itemx --ignore-nonprinting @opindex -i diff --git a/src/sort.c b/src/sort.c index 6dea2ff..32cd200 100644 --- a/src/sort.c +++ b/src/sort.c @@ -176,6 +176,8 @@ struct keyfield bool random; /* Sort by random hash of key. */ bool general_numeric; /* Flag for general, numeric comparison. Handle numbers in exponential notation. */ + bool human_numeric; /* Flag for sorting by human readable + units with either SI xor IEC prefixes. */ bool month; /* Flag for comparison by month name. */ bool reverse; /* Reverse the sense of comparison. */ bool version; /* sort by version number */ @@ -337,6 +339,9 @@ Ordering options:\n\ -M, --month-sortcompare (unknown) < `JAN' < ... < `DEC'\n\ "), stdout); fputs (_("\ + -h, --human-numeric-sortcompare human readable numbers (e.g., 2K 1G)\n\ +"), stdout); + fputs (_("\ -n, --numeric-sort compare according to string numerical value\n\ -R, --random-sort sort
Re: Human readable sort
Pádraig Brady wrote: > Eric Blake wrote: >> Pádraig Brady draigBrady.com> writes: ... >> +static int >> +find_unit_order (const char *number) >> +{ >> + static const char orders [UCHAR_LIM] = { >> +['K']=1, ['M']=2, ['G']=3, ['T']=4, ['P']=5, ['E']=6, ['Z']=7, ['Y']=8, >> +['k']=1, >> + }; >> >> This assumes more of C99 than we have previously required. Are we sure that >> all compilers out there will support this syntax? > > Designated Initializers were a GNU C C89 extension. > So I thought they were both elegant and not too new. > I've not got access to older machines to test unfortunately. Since we've been requiring declaration-after-statement support for some time now, using a feature like the above should be safe. I think it is worthwhile, too. ___ Bug-coreutils mailing list Bug-coreutils@gnu.org http://lists.gnu.org/mailman/listinfo/bug-coreutils
Re: Re: Human readable sort
On May 21, 2009 5:07pm, Giuseppe Scrivano wrote: I think this is a too strong assumption. I wouldn't be surprised to find, for example, both 1M and 1500K in a data set. I initiated this to patch sort primarily to support the data generated by df, du and ls. The human readable options these offer are often frustrating once a user realizes there is no complementary sort option for them. These, of course, do produce properly reduced data. Does anyone know of a tool which produces mixed data of this sort that would need normalized unit comparison? Are there problems to normalize values using this pseudo-code? while (abs (a) > 1000) //or 1024 { order_a += signum (a); a /= 1000; //or 1024 } Yes. The current implementation does not convert to a numeric representation but compares the numbers character by character instead. The patch rides on top of this functionality, just adding a check to scan ahead for units and assuming that a difference of unit is sufficient for determining sort order. Anything more complex will probably have to extend the number comparison code found in "strnumcmp-in.h". - Michael Speer ___ Bug-coreutils mailing list Bug-coreutils@gnu.org http://lists.gnu.org/mailman/listinfo/bug-coreutils
Re: Human readable sort
Pádraig Brady writes: > + Assume that numbers are properly abbreviated. > + i.e. input will never have both 5000K and 6M. */ I think this is too strong an assumption. I wouldn't be surprised to find, for example, both 1M and 1500K in a data set. Are there problems with normalizing values using this pseudo-code? while (abs (a) > 1000) //or 1024 { order_a += signum (a); a /= 1000; //or 1024 } Do the same with b, and only then compare them. Regards, Giuseppe ___ Bug-coreutils mailing list Bug-coreutils@gnu.org http://lists.gnu.org/mailman/listinfo/bug-coreutils
Re: Human readable sort
Eric Blake wrote: > Pádraig Brady draigBrady.com> writes: > >> Looks like your copyright assignment papers went through. >> Attached is the latest patch rebased against master >> and with a couple of extra whitespace fixups. >> > > +static int > +find_unit_order (const char *number) > +{ > + static const char orders [UCHAR_LIM] = { > +['K']=1, ['M']=2, ['G']=3, ['T']=4, ['P']=5, ['E']=6, ['Z']=7, ['Y']=8, > +['k']=1, > + }; > > This assumes more of C99 than we have previously required. Are we sure that > all compilers out there will support this syntax? Designated Initializers were a GNU C C89 extension. So I thought they were both elegant and not too new. I've not got access to older machines to test unfortunately. > > Also, your tests only cover 'sort -h'; what about covering 'sort -k1,1h'? > OK I'll flesh out the tests a bit. cheers, Pádraig. ___ Bug-coreutils mailing list Bug-coreutils@gnu.org http://lists.gnu.org/mailman/listinfo/bug-coreutils
Re: Human readable sort
Pádraig Brady draigBrady.com> writes: > Looks like your copyright assignment papers went through. > Attached is the latest patch rebased against master > and with a couple of extra whitespace fixups. > +static int +find_unit_order (const char *number) +{ + static const char orders [UCHAR_LIM] = { +['K']=1, ['M']=2, ['G']=3, ['T']=4, ['P']=5, ['E']=6, ['Z']=7, ['Y']=8, +['k']=1, + }; This assumes more of C99 than we have previously required. Are we sure that all compilers out there will support this syntax? Also, your tests only cover 'sort -h'; what about covering 'sort -k1,1h'? -- Eric Blake ___ Bug-coreutils mailing list Bug-coreutils@gnu.org http://lists.gnu.org/mailman/listinfo/bug-coreutils
Re: Human readable sort
knome@gmail.com wrote: > On Apr 27, 2009 11:27am, Pádraig Brady wrote: >> >> It seems like you'll need to go through the copyright assignment >> > > I will start this process as soon as possible. > > The -h seems to run faster than -n on my data, but only because I gave > an even distribution of unit prefixes, so it wasn't having to do the > number compares much of the time. I imagine most prefixes will be > missing, K and M in real world usage, with most comparisons routing > through the fallback to -n. Certainly my usage will be. > > I'll look at your updated patch this evening. Looks like your copyright assignment papers went through. Attached is the latest patch rebased against master and with a couple of extra whitespace fixups. Testing this with actual `du -h` output gives a 6% slow down compared to `sort -n`. That's acceptable I think given the extra processing being done. cheers, Pádraig. p.s. I've used the knome...@gmail.com address. I realize gmail is "." agnostic, but for future indexing you should try and use a single format. >From 0c522f683b592f7ecb527cfe68e79c0397efa6d5 Mon Sep 17 00:00:00 2001 From: Michael Speer Date: Mon, 27 Apr 2009 14:51:29 +0100 Subject: [PATCH] sort: new --human-numeric-sort option to sort KiB MB etc. * NEWS: Document the new option * doc/coreutils.texi (sort invocation): ditto * src/sort.c (main): handle the new -human-numeric-sort option (-h). (human_numcompare): A new function to compare SI and IEC suffixes before falling back to the standard --numeric comparison. (find_unit_order): A new helper function to find the order of magnitude of a number string as determined by its suffix. (check_mixed_SI_IEC): A new helper function to exit with error if both SI and IEC suffixes are presented. * tests/misc/sort: Add 6 tests to test the new functionality. 
* THANKS: Update --- NEWS |5 ++ THANKS |1 + doc/coreutils.texi | 14 ++ src/sort.c | 115 tests/misc/sort| 13 ++ 5 files changed, 140 insertions(+), 8 deletions(-) diff --git a/NEWS b/NEWS index 31f1b1a..f28097d 100644 --- a/NEWS +++ b/NEWS @@ -2,6 +2,11 @@ GNU coreutils NEWS-*- outline -*- * Noteworthy changes in release ?.? (-??-??) [?] +** New features + + sort accepts a new option, --human-numeric-sort (-h): sort numbers + while honoring human readable suffixes like KiB and MB etc. + ** Bug fixes truncate -s failed to skip all whitespace in the option argument in diff --git a/THANKS b/THANKS index cf801c5..4392f04 100644 --- a/THANKS +++ b/THANKS @@ -396,6 +396,7 @@ Michael J. Croghan mcrog...@usatoday.com Michael McFarland sid...@yahoo.com Michael McLagan mmcla...@invlogic.com Michael Piefel pie...@informatik.hu-berlin.de +Michael Speer knome...@gmail.com Michael Steffensmichael.steff...@s.netic.de Michael Stone mst...@debian.org Michael Stutz st...@dsl.org diff --git a/doc/coreutils.texi b/doc/coreutils.texi index 1a3075f..ae5c577 100644 --- a/doc/coreutils.texi +++ b/doc/coreutils.texi @@ -3785,6 +3785,20 @@ Use this option only if there is no alternative; it is much slower than @option{--numeric-sort} (@option{-n}) and it can lose information when converting to floating point. +...@item -h +...@itemx --human-numeric-sort +...@itemx --sort=human-numeric +...@opindex -h +...@opindex --human-numeric-sort +...@opindex --sort +...@cindex human numeric sort +...@vindex LC_NUMERIC +Sort numerically, as per the @option{--numeric-sort} option, +and in addition handle IEC or SI suffixes like MiB, MB etc. +Note a mixture of these suffixes is not supported and will +be flagged as an error. Also the numbers must be abbreviated unambiguously. +I.E. 5000K and 6M will be sorted incorrectly for example. 
+ @item -i @itemx --ignore-nonprinting @opindex -i diff --git a/src/sort.c b/src/sort.c index 6dea2ff..32cd200 100644 --- a/src/sort.c +++ b/src/sort.c @@ -176,6 +176,8 @@ struct keyfield bool random; /* Sort by random hash of key. */ bool general_numeric; /* Flag for general, numeric comparison. Handle numbers in exponential notation. */ + bool human_numeric; /* Flag for sorting by human readable + units with either SI xor IEC prefixes. */ bool month; /* Flag for comparison by month name. */ bool reverse; /* Reverse the sense of comparison. */ bool version; /* sort by version number */ @@ -337,6 +339,9 @@ Ordering options:\n\ -M, --month-sortcompare (unknown) < `JAN' < ... < `DEC'\n\ "), stdout); fputs (_("\ + -h, --human-numeric-sortcompare human readable numbers (e.g., 2K 1G)\n\ +"), stdout); + fputs (_("\ -n, --numeric-sort compare according to string numerical value\n\ -R, --random-sort
Re: Re: Human readable sort
On Apr 27, 2009 11:41am, Ondřej Vašík wrote: Pádraig Brady wrote: > Pádraig Brady wrote: > Attached is the full patch, which hopefully we can push soon. I'm not objecting anything relevant in that patch, just the tab/spaces mixing looks inconsistent with the rest of the code in added lines in sort.c . Only cosmetic thing ... but is this intentional? Greetings, Ondřej Vašík That is my fault. My home and work Emacs are configured with tabs disabled for interaction with my office and home repos. I'll clean up the patch this afternoon when I can look at it. -Michael Speer ___ Bug-coreutils mailing list Bug-coreutils@gnu.org http://lists.gnu.org/mailman/listinfo/bug-coreutils
Re: Human readable sort
knome@gmail.com wrote: > On Apr 27, 2009 11:41am, Ondřej Vašík wrote: >> Pádraig Brady wrote: >> >> > Pádraig Brady wrote: >> >> > Attached is the full patch, which hopefully we can push soon. >> >> >> >> I'm not objecting anything relevant in that patch, just the tab/spaces >> >> mixing looks inconsistent with the rest of the code in added lines in >> >> sort.c . Only cosmetic thing ... but is this intentional? >> >> >> >> Greetings, >> >> Ondřej Vašík >> > > That is my fault. My home and work emacs are configured with tabs > disabled for interaction with my office and home repos. I'll do cleanup > against the patch this afternoon when I can look at it. Thanks Ondřej. This is fixed up in attached I think. cheers, Pádraig. p.s. Micheal, which email address do you want in THANKS? >From 2e1ee46e97858717535210729f5e0181c08bf6d1 Mon Sep 17 00:00:00 2001 From: Michael Speer Date: Mon, 27 Apr 2009 14:51:29 +0100 Subject: [PATCH] sort: new --human-numeric-sort option to sort KiB MB etc. * NEWS: document the new option * doc/coreutils.texi: ditto * src/sort.c (main): handle the new -human-numeric-sort option (-h). (human_numcompare): A new function to compare SI and IEC suffixes before falling back to the standard --numeric comparision. (find_unit_order): A new helper function to find the order of magnitude of a number string as determined by its suffix. (check_mixed_SI_IEC): A new helper function to exit with error if both SI and IEC suffixes are presented. * tests/misc/sort: Add 6 tests to test the new functionality. * THANKS: Update --- NEWS |5 ++ THANKS |1 + doc/coreutils.texi | 14 ++ src/sort.c | 115 tests/misc/sort| 13 ++ 5 files changed, 140 insertions(+), 8 deletions(-) diff --git a/NEWS b/NEWS index 8cb17cc..bfebe1d 100644 --- a/NEWS +++ b/NEWS @@ -2,6 +2,11 @@ GNU coreutils NEWS-*- outline -*- * Noteworthy changes in release 7.3 (-??-??) [?] 
+** New features + + sort accepts a new option, --human-numeric-sort (-h): sort numbers + while honouring human readable suffixes like KiB and MB etc. + ** Bug fixes sort -m no longer segfaults when its output file is also an input file. diff --git a/THANKS b/THANKS index 876a6b6..da48d6d 100644 --- a/THANKS +++ b/THANKS @@ -395,6 +395,7 @@ Michael J. Croghan mcrog...@usatoday.com Michael McFarland sid...@yahoo.com Michael McLagan mmcla...@invlogic.com Michael Piefel pie...@informatik.hu-berlin.de +Michael Speer knome...@gmail.com Michael Steffensmichael.steff...@s.netic.de Michael Stone mst...@debian.org Michael Stutz st...@dsl.org diff --git a/doc/coreutils.texi b/doc/coreutils.texi index 918f44e..8f73419 100644 --- a/doc/coreutils.texi +++ b/doc/coreutils.texi @@ -3785,6 +3785,20 @@ Use this option only if there is no alternative; it is much slower than @option{--numeric-sort} (@option{-n}) and it can lose information when converting to floating point. +...@item -h +...@itemx --human-numeric-sort +...@itemx --sort=human-numeric +...@opindex -h +...@opindex --human-numeric-sort +...@opindex --sort +...@cindex human numeric sort +...@vindex LC_NUMERIC +Sort numerically, as per the @option{--numeric-sort} option, +and in addition handle IEC or SI suffixes like MiB, MB etc. +Note a mixture of these suffixes is not supported and will +be flagged as an error. Also the numbers must be abbreviated unambiguously. +I.E. 5000K and 6M will be sorted incorrectly for example. + @item -i @itemx --ignore-nonprinting @opindex -i diff --git a/src/sort.c b/src/sort.c index f48d727..8a85d54 100644 --- a/src/sort.c +++ b/src/sort.c @@ -176,6 +176,8 @@ struct keyfield bool random; /* Sort by random hash of key. */ bool general_numeric; /* Flag for general, numeric comparison. Handle numbers in exponential notation. */ + bool human_numeric; /* Flag for sorting by human readable + units with either SI xor IEC prefixes. */ bool month; /* Flag for comparison by month name. 
*/ bool reverse; /* Reverse the sense of comparison. */ bool version; /* sort by version number */ @@ -337,6 +339,9 @@ Ordering options:\n\ -M, --month-sortcompare (unknown) < `JAN' < ... < `DEC'\n\ "), stdout); fputs (_("\ + -h, --human-numeric-sortcompare human readable numbers (e.g., 2K 1G)\n\ +"), stdout); + fputs (_("\ -n, --numeric-sort compare according to string numerical value\n\ -R, --random-sort sort by random hash of keys\n\ --random-source=FILEget random bytes from FILE\n\ @@ -344,8 +349,8 @@ Ordering options:\n\ "), stdout); fputs (_("\ --sort=WORD sort according to WORD:\n\ -genera
Re: Human readable sort
Pádraig Brady wrote: > Pádraig Brady wrote: > Attached is the full patch, which hopefully we can push soon. I'm not objecting to anything relevant in that patch; it's just that the tab/space mixing in the lines added to sort.c looks inconsistent with the rest of the code. Only a cosmetic thing ... but is this intentional? Greetings, Ondřej Vašík signature.asc Description: Toto je digitálně podepsaná část zprávy ___ Bug-coreutils mailing list Bug-coreutils@gnu.org http://lists.gnu.org/mailman/listinfo/bug-coreutils
Re: Human readable sort
Pádraig Brady wrote: > seconds to sort 1 million ints: > --- > sort option time difference > --- > -n 2.75 > -h (ret 0) 3.10 +13% > -h 3.96 +44% I removed a redundant to_uchar() and got it from 44% to 40% compared to -n. Note our existing -n implementation is mega fast, so at least we're not interfering with its implementation. Attached is the full patch, which hopefully we can push soon. cheers, Pádraig. >From fef4d790423ff840d0217b6107db85b396728cb4 Mon Sep 17 00:00:00 2001 From: Michael Speer Date: Mon, 27 Apr 2009 14:51:29 +0100 Subject: [PATCH] sort: new --human-numeric-sort option to sort KiB MB etc. * NEWS: document the new option * doc/coreutils.texi: ditto * src/sort.c (main): handle the new --human-numeric-sort option (-h). (human_numcompare): A new function to compare SI and IEC suffixes before falling back to the standard --numeric comparison. (find_unit_order): A new helper function to find the order of magnitude of a number string as determined by its suffix. (check_mixed_SI_IEC): A new helper function to exit with error if both SI and IEC suffixes are presented. * tests/misc/sort: Add 5 tests to test the new functionality. * THANKS: Update --- NEWS |5 ++ THANKS |1 + doc/coreutils.texi | 14 ++ src/sort.c | 115 tests/misc/sort| 13 ++ 5 files changed, 140 insertions(+), 8 deletions(-) diff --git a/NEWS b/NEWS index 8cb17cc..bfebe1d 100644 --- a/NEWS +++ b/NEWS @@ -2,6 +2,11 @@ GNU coreutils NEWS-*- outline -*- * Noteworthy changes in release 7.3 (-??-??) [?] +** New features + + sort accepts a new option, --human-numeric-sort (-h): sort numbers + while honouring human readable suffixes like KiB and MB etc. + ** Bug fixes sort -m no longer segfaults when its output file is also an input file. diff --git a/THANKS b/THANKS index 876a6b6..da48d6d 100644 --- a/THANKS +++ b/THANKS @@ -395,6 +395,7 @@ Michael J. 
Croghan mcrog...@usatoday.com Michael McFarland sid...@yahoo.com Michael McLagan mmcla...@invlogic.com Michael Piefel pie...@informatik.hu-berlin.de +Michael Speer knome...@gmail.com Michael Steffensmichael.steff...@s.netic.de Michael Stone mst...@debian.org Michael Stutz st...@dsl.org diff --git a/doc/coreutils.texi b/doc/coreutils.texi index 918f44e..8f73419 100644 --- a/doc/coreutils.texi +++ b/doc/coreutils.texi @@ -3785,6 +3785,20 @@ Use this option only if there is no alternative; it is much slower than @option{--numeric-sort} (@option{-n}) and it can lose information when converting to floating point. +...@item -h +...@itemx --human-numeric-sort +...@itemx --sort=human-numeric +...@opindex -h +...@opindex --human-numeric-sort +...@opindex --sort +...@cindex human numeric sort +...@vindex LC_NUMERIC +Sort numerically, as per the @option{--numeric-sort} option, +and in addition handle IEC or SI suffixes like MiB, MB etc. +Note a mixture of these suffixes is not supported and will +be flagged as an error. Also the numbers must be abbreviated unambiguously. +I.E. 5000K and 6M will be sorted incorrectly for example. + @item -i @itemx --ignore-nonprinting @opindex -i diff --git a/src/sort.c b/src/sort.c index f48d727..4c1bec5 100644 --- a/src/sort.c +++ b/src/sort.c @@ -176,6 +176,8 @@ struct keyfield bool random; /* Sort by random hash of key. */ bool general_numeric; /* Flag for general, numeric comparison. Handle numbers in exponential notation. */ + bool human_numeric; /* Flag for sorting by human readable + units with either SI xor IEC prefixes. */ bool month; /* Flag for comparison by month name. */ bool reverse; /* Reverse the sense of comparison. */ bool version; /* sort by version number */ @@ -336,6 +338,9 @@ Ordering options:\n\ -i, --ignore-nonprintingconsider only printable characters\n\ -M, --month-sortcompare (unknown) < `JAN' < ... 
< `DEC'\n\ "), stdout); + fputs(_("\ + -h, --human-numeric-sortcompare human readable numbers (e.g., 2K 1G)\n\ +"), stdout); fputs (_("\ -n, --numeric-sort compare according to string numerical value\n\ -R, --random-sort sort by random hash of keys\n\ @@ -344,8 +349,8 @@ Ordering options:\n\ "), stdout); fputs (_("\ --sort=WORD sort according to WORD:\n\ -general-numeric -g, month -M, numeric -n,\n\ -random -R, version -V\n\ +general-numeric -g, human-numeric -h, month -M,\n\ +numeric -n, random
Re: Human readable sort
Michael Speer wrote: > 2009/4/25 Pádraig Brady : >> I've further modified your latest in the attached. >> I refactored the suffix finding a bit and also added >> support for --sort=human-numeric. > > I refactored it again to handle some potential problems with how > separators and decimals points were handled. It will still let you > write something silly like "1,3,4.5.6", but I've stopped scanning on > "4..4" or "3,,2" or even "5.M". I'm not sure if that last one is used > meaningfully anywhere. This needs another cycle I think. BTW earlier in this thread I pasted the wrong link to the previous attempt to include this feature. This is the right one: http://lists.gnu.org/archive/html/bug-coreutils/2009-01/threads.html#6 Anyway attached an updated version which supports negative numbers as it was pretty trivial to add. I also removed the explicit check for thousands_sep==-1 as I changed to using unsigned char. Some performance measurements of this version are (where "ret 0" is just returning 0 at the top of find_unit_order() to show the function call overheads.) seconds to sort 1 million ints: --- sort optiontime difference --- -n 2.75 -h (ret 0) 3.10 +13% -h 3.96 +44% seconds to sort 1 million sizes (max len = 4): --- sort optiontime difference --- -n 2.54 -h (ret 0) 2.70 +6% -h 3.50 +38% I haven't really looked at optimizing it yet. cheers, Pádraig. diff --git a/src/sort.c b/src/sort.c index f48d727..640cf1c 100644 --- a/src/sort.c +++ b/src/sort.c @@ -176,6 +176,8 @@ struct keyfield bool random; /* Sort by random hash of key. */ bool general_numeric; /* Flag for general, numeric comparison. Handle numbers in exponential notation. */ + bool human_numeric; /* Flag for sorting by human readable + units with either SI xor IEC prefixes. */ bool month; /* Flag for comparison by month name. */ bool reverse; /* Reverse the sense of comparison. 
*/ bool version; /* sort by version number */ @@ -336,6 +338,9 @@ Ordering options:\n\ -i, --ignore-nonprintingconsider only printable characters\n\ -M, --month-sortcompare (unknown) < `JAN' < ... < `DEC'\n\ "), stdout); + fputs(_("\ + -h, --human-numeric-sortcompare human readable numbers (e.g., 2K 1G)\n\ +"), stdout); fputs (_("\ -n, --numeric-sort compare according to string numerical value\n\ -R, --random-sort sort by random hash of keys\n\ @@ -344,8 +349,8 @@ Ordering options:\n\ "), stdout); fputs (_("\ --sort=WORD sort according to WORD:\n\ -general-numeric -g, month -M, numeric -n,\n\ -random -R, version -V\n\ +general-numeric -g, human-numeric -h, month -M,\n\ +numeric -n, random -R, version -V\n\ -V, --version-sort natural sort of (version) numbers within text\n\ \n\ "), stdout); @@ -426,7 +431,7 @@ enum SORT_OPTION }; -static char const short_options[] = "-bcCdfgik:mMno:rRsS:t:T:uVy:z"; +static char const short_options[] = "-bcCdfghik:mMno:rRsS:t:T:uVy:z"; static struct option const long_options[] = { @@ -442,6 +447,7 @@ static struct option const long_options[] = {"merge", no_argument, NULL, 'm'}, {"month-sort", no_argument, NULL, 'M'}, {"numeric-sort", no_argument, NULL, 'n'}, + {"human-numeric-sort", no_argument, NULL, 'h'}, {"version-sort", no_argument, NULL, 'V'}, {"random-sort", no_argument, NULL, 'R'}, {"random-source", required_argument, NULL, RANDOM_SOURCE_OPTION}, @@ -480,6 +486,7 @@ static char const check_types[] = #define SORT_TABLE \ _st_("general-numeric", 'g') \ + _st_("human-numeric", 'h') \ _st_("month", 'M') \ _st_("numeric", 'n') \ _st_("random", 'R') \ @@ -1673,6 +1680,87 @@ numcompare (const char *a, const char *b) return strnumcmp (a, b, decimal_point, thousands_sep); } +/* Exit with an error if a mixture of SI and IEC units detected. 
*/ + +static void +check_mixed_SI_IEC (char prefix) +{ + static int seen_si = -1; + bool si_present = prefix == 'i'; + if (seen_si != -1 && seen_si != si_present) +error (SORT_FAILURE, 0, _("both SI and IEC prefixes present on units")); + seen_si = si_present; +} + +/* Return an integer which represents the order of magnitude of + the unit following the number. NUMBER can contain thousands separators + or a decimal point, but not have preceeding blanks. + Negative numbers return a negative unit order. */ + +static int +find_unit_order (const char* number) +{ + static const char orders [UCHAR_LIM] = { +['K']=1, ['M']=2, ['G']=3, ['T']=4, ['P']=5, ['E']=6,
Re: Human readable sort
Michael Speer wrote: > 2009/4/25 Pádraig Brady : >> I've further modified your latest in the attached. >> I refactored the suffix finding a bit and also added >> support for --sort=human-numeric. > > I refactored it again to handle some potential problems with how > separators and decimals points were handled. find_unit_order() is a better refactoring, thanks. > It will still let you > write something silly like "1,3,4.5.6", but I've stopped scanning on > "4..4" or "3,,2" or even "5.M". I'm not sure if that last one is used > meaningfully anywhere. That's an improvement. I'll have to do some benchmarking to see if these extra checks have any significant effect. > I did this partly to avoid breaking locales > where space is the separator. Supporting spaces in numbers is problematic anyway due to the way fields are handled. Good to have the space as thousands separator handled anyway. > I poked around a bit to see if any locales used space. > Apparently, the Hungarian locale does. I stopped looking there. Doesn't have spaces here at least: LANG=hu_HU.utf8 printf "%'d\n" 1234 >> I'm wondering whether "numeric" is superfluous? >> I.E. are --sort=human and --human-sort sufficient. >> > > I started with just human, but thought it better to add the numeric > since sort is by default for strings, and both current switches that > enable numeric sorts have it in their name. I would not fight a > reversion on this if no one thought it would look confusing or too > inconsistent to end users. Well I was worried about it being too verbose, but it is more consistent with other numeric options, so we'll leave it as is I think. OK, I'll do up docs and tests tomorrow for this. If you would prefer to do that please shout now. cheers, Pádraig. ___ Bug-coreutils mailing list Bug-coreutils@gnu.org http://lists.gnu.org/mailman/listinfo/bug-coreutils
Re: Human readable sort
2009/4/25 Pádraig Brady : > > I've further modified your latest in the attached. > I refactored the suffix finding a bit and also added > support for --sort=human-numeric. I refactored it again to handle some potential problems with how separators and decimals points were handled. It will still let you write something silly like "1,3,4.5.6", but I've stopped scanning on "4..4" or "3,,2" or even "5.M". I'm not sure if that last one is used meaningfully anywhere. I did this partly to avoid breaking locales where space is the separator. `du --h --apparent-size` output like this : >> 4TO-DO >> 5Million-dollar-idea >> 3K whatever would have triggered the mixed prefix error spuriously due to the greedy consumption of space in the second line. I am not concerned with making it parse intelligently for all the various locales, but only to make sure it doesn't do anything particularly stupid. http://en.wikipedia.org/wiki/ISO_31-0#Numbers It appears ISO suggests the space for separator. I poked around a bit to see if any locales used space. Apparently, the Hungarian locale does. I stopped looking there. > I'm wondering whether "numeric" is superfluous? > I.E. are --sort=human and --human-sort sufficient. > I started with just human, but thought it better to add the numeric since sort is by default for strings, and both current switches that enable numeric sorts have it in their name. I would not fight a reversion on this if no one thought it would look confusing or too inconsistent to end users. -Michael Speer --- orig/coreutils-7.2/src/sort.c 2009-03-29 13:44:10.0 -0400 +++ coreutils-7.2/src/sort.c 2009-04-26 00:46:42.0 -0400 @@ -176,6 +176,8 @@ bool random; /* Sort by random hash of key. */ bool general_numeric; /* Flag for general, numeric comparison. Handle numbers in exponential notation. */ + bool human_numeric; /* Flag for sorting by human readable + units with either SI xor IEC prefixes. */ bool month; /* Flag for comparison by month name. 
*/ bool reverse; /* Reverse the sense of comparison. */ bool version; /* sort by version number */ @@ -336,6 +338,9 @@ -i, --ignore-nonprintingconsider only printable characters\n\ -M, --month-sortcompare (unknown) < `JAN' < ... < `DEC'\n\ "), stdout); + fputs(_("\ + -h, --human-numeric-sortcompare human readable numbers (e.g., 2K 1G)\n\ +"), stdout); fputs (_("\ -n, --numeric-sort compare according to string numerical value\n\ -R, --random-sort sort by random hash of keys\n\ @@ -344,8 +349,8 @@ "), stdout); fputs (_("\ --sort=WORD sort according to WORD:\n\ -general-numeric -g, month -M, numeric -n,\n\ -random -R, version -V\n\ +general-numeric -g, human-numeric -h, month -M,\n\ +numeric -n, random -R, version -V\n\ -V, --version-sort sort by numeric version\n\ \n\ "), stdout); @@ -426,7 +431,7 @@ SORT_OPTION }; -static char const short_options[] = "-bcCdfgik:mMno:rRsS:t:T:uVy:z"; +static char const short_options[] = "-bcCdfghik:mMno:rRsS:t:T:uVy:z"; static struct option const long_options[] = { @@ -442,6 +447,7 @@ {"merge", no_argument, NULL, 'm'}, {"month-sort", no_argument, NULL, 'M'}, {"numeric-sort", no_argument, NULL, 'n'}, + {"human-numeric-sort", no_argument, NULL, 'h'}, {"version-sort", no_argument, NULL, 'V'}, {"random-sort", no_argument, NULL, 'R'}, {"random-source", required_argument, NULL, RANDOM_SOURCE_OPTION}, @@ -480,6 +486,7 @@ #define SORT_TABLE \ _st_("general-numeric", 'g') \ + _st_("human-numeric", 'h') \ _st_("month", 'M') \ _st_("numeric", 'n') \ _st_("random", 'R') \ @@ -1673,6 +1680,85 @@ return strnumcmp (a, b, decimal_point, thousands_sep); } +/* Exit with an error if a mixture of SI and IEC units detected. 
*/ + +static void +check_mixed_SI_IEC (char prefix) +{ + static int seen_si = -1; + bool si_present = prefix == 'i'; + if (seen_si != -1 && seen_si != si_present) +error (SORT_FAILURE, 0, _("both SI and IEC prefixes present on units")); + seen_si = si_present; +} + +/* return an integer which represents the order of magnitude of + the unit following the number +*/ +unsigned int +find_unit_order (const char* number) +{ + /* FIXME : if sort is fixed for multibyte + * separators this will need to be fixed too + */ + + static const char weights [UCHAR_LIM] = { +['K']=1, ['M']=2, ['G']=3, ['T']=4, ['P']=5, ['E']=6, ['Z']=7, ['Y']=8, +['k']=1, + }; + + const char *p = number; + + /* scan to end of number + * decimals or separators not followed by digits + * stop the scan + * numbers ending in decimals or separators are + * are thus considered to be lacking in units + */ +
Re: Human readable sort
Michael Speer wrote: > That's much more readable. I tacked in a size. Good catch. The size is required or otherwise one could get undefined results for some chars. > The standards do not > reference the lowercase letters you commented out, so I just deleted > them outright. Fair enough. >> Something else to consider is to flag when >> a mixture of SI and IEC units are used, as >> this not being supported might not be obvious >> to users and could cause difficult to debug issues for users. >> I.E. flag an error if the following input is presented. >> 999MB >> 998MiB >> I added a very quick hack for that to the patch for illustration. >> > > While du only outputs the first letter, this makes the change better > for more general use. I added a bounds check, but do not see anything > else beyond your illustration would be needed. Oops, yes the bounds check is also needed. I've further modified your latest in the attached. I refactored the suffix finding a bit and also added support for --sort=human-numeric. I'm wondering whether "numeric" is superfluous? I.E. are --sort=human and --human-sort sufficient. cheers, Pádraig. diff --git a/src/sort.c b/src/sort.c index f48d727..9d7d659 100644 --- a/src/sort.c +++ b/src/sort.c @@ -176,6 +176,8 @@ struct keyfield bool random; /* Sort by random hash of key. */ bool general_numeric; /* Flag for general, numeric comparison. Handle numbers in exponential notation. */ + bool human_numeric; /* Flag for sorting by human readable + units with either SI xor IEC prefixes. */ bool month; /* Flag for comparison by month name. */ bool reverse; /* Reverse the sense of comparison. */ bool version; /* sort by version number */ @@ -336,6 +338,9 @@ Ordering options:\n\ -i, --ignore-nonprintingconsider only printable characters\n\ -M, --month-sortcompare (unknown) < `JAN' < ... 
< `DEC'\n\ "), stdout); + fputs(_("\ + -h, --human-numeric-sortcompare human readable numbers (e.g., 2K 1G)\n\ +"), stdout); fputs (_("\ -n, --numeric-sort compare according to string numerical value\n\ -R, --random-sort sort by random hash of keys\n\ @@ -344,8 +349,8 @@ Ordering options:\n\ "), stdout); fputs (_("\ --sort=WORD sort according to WORD:\n\ -general-numeric -g, month -M, numeric -n,\n\ -random -R, version -V\n\ +general-numeric -g, human-numeric -h, month -M,\n\ +numeric -n, random -R, version -V\n\ -V, --version-sort natural sort of (version) numbers within text\n\ \n\ "), stdout); @@ -426,7 +431,7 @@ enum SORT_OPTION }; -static char const short_options[] = "-bcCdfgik:mMno:rRsS:t:T:uVy:z"; +static char const short_options[] = "-bcCdfghik:mMno:rRsS:t:T:uVy:z"; static struct option const long_options[] = { @@ -442,6 +447,7 @@ static struct option const long_options[] = {"merge", no_argument, NULL, 'm'}, {"month-sort", no_argument, NULL, 'M'}, {"numeric-sort", no_argument, NULL, 'n'}, + {"human-numeric-sort", no_argument, NULL, 'h'}, {"version-sort", no_argument, NULL, 'V'}, {"random-sort", no_argument, NULL, 'R'}, {"random-source", required_argument, NULL, RANDOM_SOURCE_OPTION}, @@ -480,6 +486,7 @@ static char const check_types[] = #define SORT_TABLE \ _st_("general-numeric", 'g') \ + _st_("human-numeric", 'h') \ _st_("month", 'M') \ _st_("numeric", 'n') \ _st_("random", 'R') \ @@ -1673,6 +1680,60 @@ numcompare (const char *a, const char *b) return strnumcmp (a, b, decimal_point, thousands_sep); } +/* Exit with an error if a mixture of SI and IEC units detected. 
*/ + +static void +check_mixed_SI_IEC (char prefix) +{ + static int seen_si = -1; + bool si_present = prefix == 'i'; + if (seen_si != -1 && seen_si != si_present) +error (SORT_FAILURE, 0, _("both SI and IEC prefixes present on units")); + seen_si = si_present; +} + +/* Return the address of the number suffix or NUL if not present */ + +static const char* +find_suffix (const char* number) +{ + const char *p = number; + + while (ISDIGIT (*p) || *p == decimal_point || *p == thousands_sep) +p++; + + if (*p) +check_mixed_SI_IEC (*(p+1)); + + return p; +} + +/* Compare numbers ending in units with SI xor IEC prefixes + < K < M < G < T < P < E < Z < Y + Assume that numbers are properly abbreviated. + i.e. input will never have 5000K instead of 5M. */ + +static int +human_numcompare (const char *a, const char *b) +{ + static const char weights [UCHAR_LIM] = { +['K']=1, ['M']=2, ['G']=3, ['T']=4, ['P']=5, ['E']=6, ['Z']=7, ['Y']=8, +['k']=1, + }; + + while (blanks[to_uchar (*a)]) +a++; + while (blanks[to_uchar (*b)]) +b++; + + int aw = weights[to_uchar (*find_suffix (a))]; + int bw = weights[to_uch
Re: Human readable sort
2009/4/24 Pádraig Brady : > Michael Speer wrote: >> I wrote the following patch to the 7.2 branch of coreutils to allow >> `sort` to sort by human readable byte sizes. I looked around a bit to >> see what the status of previous attempts to integrate this >> functionality were, but didn't see any very recent activity. This is >> my first interaction with coreutils, so if I missed something obvious, >> please point me towards it. >> >> Is the last potential patch ( >> http://www.mail-archive.com/bug-coreutils@gnu.org/msg14080.html ) >> moving through? If not, if I cleaned this up ( tabs, documentation, >> and test cases ) and applied it to the current HEAD on savannah is >> there a chance of getting this functionality into sort? > > Thanks for reviving this again. > There was a more recent attempt that petered out unfortunately: > http://www.mail-archive.com/bug-coreutils@gnu.org/msg14080.html > >> >> Patch assumptions : >> * that numbers will use the best representation ( never uses 1024b >> instead of 1k, etc ) >> * that the sizes will be specified via suffixes of b, K, M, G, T, P, >> E, Z, Y or their alternately cased variants >> >> The first assumption results in checking only the suffix when they differ. >> This enables it to match the output of `du -h / du --si`, but possibly >> not other tools that do not conform to these assumptions. > > The consensus was that these assumptions are appropriate and useful. > > We assume C99 support now for coreutils so I tweaked your patch, > the main change being to greatly shrink the lookup table initialisation. > Note I commented out the lower case letters (except 'k') as I don't > think any coreutils generate those and they could preclude supporting > other suffixes in future. I'm not sure about doing that but I think it's > better to err on the side of too few suffixes than too many? > That's much more readable. I tacked in a size. 
The standards do not reference the lowercase letters you commented out, so I just deleted them outright. > Something else to consider is to flag when > a mixture of SI and IEC units are used, as > this not being supported might not be obvious > to users and could cause difficult to debug issues for users. > I.E. flag an error if the following input is presented. > 999MB > 998MiB > I added a very quick hack for that to the patch for illustration. > While du only outputs the first letter, this makes the change better for more general use. I added a bounds check, but do not see anything else beyond your illustration would be needed. > I also noticed that you didn't terminate the fields before > processing as was done for the other numeric sorts? > So I changed that also in the attached patch but didn't > analyze it TBH. > Your change was entirely appropriate. I should have done that originally. > > p.s. obviously docs and help and tests need to be written, > but we can do that after we get the implementation done. > I've attached the updated diff. Thanks for taking an interest in this. Michael Speer --- orig/coreutils-7.2/src/sort.c 2009-03-29 13:44:10.0 -0400 +++ coreutils-7.2/src/sort.c 2009-04-25 04:46:06.0 -0400 @@ -176,6 +176,8 @@ bool random; /* Sort by random hash of key. */ bool general_numeric; /* Flag for general, numeric comparison. Handle numbers in exponential notation. */ + bool human_numeric; /* Flag for sorting by human readable + units with either SI or IEC prefixes */ bool month; /* Flag for comparison by month name. */ bool reverse; /* Reverse the sense of comparison. */ bool version; /* sort by version number */ @@ -336,6 +338,10 @@ -i, --ignore-nonprintingconsider only printable characters\n\ -M, --month-sortcompare (unknown) < `JAN' < ... 
< `DEC'\n\ "), stdout); + fputs(_("\ + -h, --human-numeric-sortcompare string numerical values ending in units\n\ + prefixed with either SI xor IEC prefixes\n\ +"), stdout); fputs (_("\ -n, --numeric-sort compare according to string numerical value\n\ -R, --random-sort sort by random hash of keys\n\ @@ -426,7 +432,7 @@ SORT_OPTION }; -static char const short_options[] = "-bcCdfgik:mMno:rRsS:t:T:uVy:z"; +static char const short_options[] = "-bcCdfghik:mMno:rRsS:t:T:uVy:z"; static struct option const long_options[] = { @@ -442,6 +448,7 @@ {"merge", no_argument, NULL, 'm'}, {"month-sort", no_argument, NULL, 'M'}, {"numeric-sort", no_argument, NULL, 'n'}, + {"human-numeric-sort", no_argument, NULL, 'h'}, {"version-sort", no_argument, NULL, 'V'}, {"random-sort", no_argument, NULL, 'R'}, {"random-source", required_argument, NULL, RANDOM_SOURCE_OPTION}, @@ -1673,6 +1680,57 @@ return strnumcmp (a, b, decimal_point, thousands_sep); } +/* error if a mixture of SI and IEC units used. */ +static void +check_mixed_SI_IEC (char prefix) +{ + static int s
Re: Human readable sort
Michael Speer wrote: > I wrote the following patch to the 7.2 branch of coreutils to allow > `sort` to sort by human readable byte sizes. I looked around a bit to > see what the status of previous attempts to integrate this > functionality were, but didn't see any very recent activity. This is > my first interaction with coreutils, so if I missed something obvious, > please point me towards it. > > Is the last potential patch ( > http://www.mail-archive.com/bug-coreutils@gnu.org/msg14080.html ) > moving through? If not, if I cleaned this up ( tabs, documentation, > and test cases ) and applied it to the current HEAD on savannah is > there a chance of getting this functionality into sort? Thanks for reviving this again. There was a more recent attempt that petered out unfortunately: http://www.mail-archive.com/bug-coreutils@gnu.org/msg14080.html > > Patch assumptions : > * that numbers will use the best representation ( never uses 1024b > instead of 1k, etc ) > * that the sizes will be specified via suffixes of b, K, M, G, T, P, > E, Z, Y or their alternately cased variants > > The first assumption results in checking only the suffix when they differ. > This enables it to match the output of `du -h / du --si`, but possibly > not other tools that do not conform to these assumptions. The consensus was that these assumptions are appropriate and useful. We assume C99 support now for coreutils so I tweaked your patch, the main change being to greatly shrink the lookup table initialisation. Note I commented out the lower case letters (except 'k') as I don't think any coreutils generate those and they could preclude supporting other suffixes in future. I'm not sure about doing that but I think it's better to err on the side of too few suffixes than too many? Something else to consider is to flag when a mixture of SI and IEC units are used, as this not being supported might not be obvious to users and could cause difficult to debug issues for users. I.E. 
flag an error if the following input is presented. 999MB 998MiB I added a very quick hack for that to the patch for illustration. I also noticed that you didn't terminate the fields before processing as was done for the other numeric sorts? So I changed that also in the attached patch but didn't analyze it TBH. cheers, Pádraig. p.s. obviously docs and help and tests need to be written, but we can do that after we get the implementation done. diff --git a/src/sort.c b/src/sort.c index f48d727..a2ed015 100644 --- a/src/sort.c +++ b/src/sort.c @@ -176,6 +176,7 @@ struct keyfield bool random; /* Sort by random hash of key. */ bool general_numeric; /* Flag for general, numeric comparison. Handle numbers in exponential notation. */ + bool human_numeric; /* Flag for sorting by common suffixes. */ bool month; /* Flag for comparison by month name. */ bool reverse; /* Reverse the sense of comparison. */ bool version; /* sort by version number */ @@ -426,7 +427,7 @@ enum SORT_OPTION }; -static char const short_options[] = "-bcCdfgik:mMno:rRsS:t:T:uVy:z"; +static char const short_options[] = "-bcCdfghik:mMno:rRsS:t:T:uVy:z"; static struct option const long_options[] = { @@ -442,6 +443,7 @@ static struct option const long_options[] = {"merge", no_argument, NULL, 'm'}, {"month-sort", no_argument, NULL, 'M'}, {"numeric-sort", no_argument, NULL, 'n'}, + {"human-sort", no_argument, NULL, 'h'}, {"version-sort", no_argument, NULL, 'V'}, {"random-sort", no_argument, NULL, 'R'}, {"random-source", required_argument, NULL, RANDOM_SOURCE_OPTION}, @@ -1673,6 +1675,54 @@ numcompare (const char *a, const char *b) return strnumcmp (a, b, decimal_point, thousands_sep); } +/* error if a mixture of SI and IEC units used. 
*/ +static void +check_mixed_SI_IEC (char suffix) +{ + static int seen_si = -1; + bool si_present = suffix == 'i'; + if (seen_si != -1 && seen_si != si_present) +error (SORT_FAILURE, 0, _("Both SI and IEC suffixes present")); + seen_si = si_present; +} + +/* Compare numeric entities ending in human readable size specifiers + b < K < M < G < T < P < E < Z < Y + We assume that numbers are properly abbreviated. + For example, you will never see 500,000,000b, instead of 5M. */ + +static int +human_compare(const char *a, const char *b) +{ + static const char weights [] = { +['K']=1, ['M']=2, ['G']=3, ['T']=4, ['P']=5, ['E']=6, ['Z']=7, ['Y']=8, +['k']=1, /*['m']=2, ['g']=3, ['t']=4, ['p']=5, ['e']=6, ['z']=7, ['y']=8,*/ + }; + + while (blanks[to_uchar (*a)]) +a++; + while (blanks[to_uchar (*b)]) +b++; + + const char *ar = a; + const char *br = b; + + while( ISDIGIT (*ar) || (*ar) == decimal_point || (*ar) == thousands_sep ) +ar++; + while( ISDIGIT (*br) || (*br) == decimal_point || (*br) == thousands_sep ) +br++; + + check_mixed_SI_IEC (*(ar+1)); + check_mixed_SI_IEC (*(br+1)); + + int aw = weights[to_uchar (*ar)]; + int bw =