Andrew D Jewell wrote: >> I like the idea, but I object to using up yet another single-letter >> option for this (they're not renewable - the ASCII character set has a >> fixed size). I suggest that we use a long option only. > > I agree halfway. As a separate command line argument, using up another > single character might (possibly) be too aggressive. > > However, we do need to allocate a single character for it as part of > the -k field specification; otherwise, as you said, how can you sort > some fields this way and some fields that way. I'd recommend h, to > match the -h of ls and df. Replaced '-B' with '-h'. I initially used uppercase 'B' as I felt that lowercase chars were indeed too valuable for a convenience feature with little importance. > As for the implementation, unless I'm confused, it only deals with K,M > and G. It should at least support T if not P, E, Z and Y. Added all these, up to yottabytes. > I don't know if this is important, but I also don't see anything that > would normalize numbers, so comparing 1000000 to 1M wouldn't do the > right thing. To fix that would open another can of worms, in that > you'd need to know if your M was 1000*1000 or 1024*1024. My initial "unpublished" implementation used multiplication for normalization -- which is less effective and prone to the power of 1000/1024 problem. The current implementation is 1000/1024-problem agnostic at a cost -- it works correctly only if input is scaled and suffixed appropriately. If it's not, one will get unexpected results as you pointed out -- both 10000000 and 10000K will compare less than 1M.
However, I personally don't see this as a problem, as all GNU utilities provide consistent, properly K,M,G-scaled output and I believe we should prefer short, effective code to attempts to deal with obscure corner cases. The feature and its limitations should be properly documented in the manual and the general normalization-based solution (by multiplication with a power of 1000 or 1024) should be considered only if somebody presents a proven need for it. Let me remind you that one can always resort to numeric, suffix-less input and sorting -- after all, this is just a convenience feature that makes the life of sysadmins 'du -hs'-ing around their filesystems a little bit easier (yes, the feature was driven by practical needs, not by the coolness factor of contributing to The Divine Coreutils :) ). Anyway, feel free to disagree. The updated patch also fixes suffixed/non-suffixed number comparison. Regards, Mart Sõmermaa
--- sort.c.orig 2005-10-07 21:48:28.000000000 +0300 +++ sort.c 2006-04-06 20:00:51.000000000 +0300 @@ -26,6 +26,7 @@ #include <getopt.h> #include <sys/types.h> #include <signal.h> +#include <ctype.h> #include "system.h" #include "error.h" #include "hard-locale.h" @@ -149,6 +150,9 @@ struct keyfield point, but no exponential notation. */ bool general_numeric; /* Flag for general, numeric comparison. Handle numbers in exponential notation. */ + bool size_in_bytes; /* Flag for human-readable bytesize comparison. + Handle numbers suffixed with K for kilo-, + M for mega- and G for gigabytes. */ bool month; /* Flag for comparison by month name. */ bool reverse; /* Reverse the sense of comparison. */ struct keyfield *next; /* Next keyfield to try. */ @@ -295,6 +299,7 @@ Ordering options:\n\ "), stdout); fputs (_("\ -b, --ignore-leading-blanks ignore leading blanks\n\ + -B, --size-in-bytes compare bytesizes (numbers suffixed with K, M, G)\n\ -d, --dictionary-order consider only blanks and alphanumeric characters\n\ -f, --ignore-case fold lower case to upper case characters\n\ "), stdout); @@ -353,7 +358,7 @@ native byte values.\n\ exit (status); } -static char const short_options[] = "-bcdfgik:mMno:rsS:t:T:uy:z"; +static char const short_options[] = "-bcdfghik:mMno:rsS:t:T:uy:z"; static struct option const long_options[] = { @@ -362,6 +367,7 @@ static struct option const long_options[ {"dictionary-order", no_argument, NULL, 'd'}, {"ignore-case", no_argument, NULL, 'f'}, {"general-numeric-sort", no_argument, NULL, 'g'}, + {"size-in-bytes", no_argument, NULL, 'h'}, {"ignore-nonprinting", no_argument, NULL, 'i'}, {"key", required_argument, NULL, 'k'}, {"merge", no_argument, NULL, 'm'}, @@ -1077,8 +1083,14 @@ numcompare (const char *a, const char *b return strnumcmp (a, b, decimal_point, thousands_sep); } -static int -general_numcompare (const char *sa, const char *sb) +/* If size_in_bytes is true, compare strings A and B as human-readable + * positive byte counts (as returned 
e.g. by df -h) suffixed with + * either 'K', 'M', 'G', 'T', 'P', 'E', 'Z', 'Y' for kilobytes, + * megabytes, gigabytes, terabytes, petabytes, exabytes, zettabytes, + * yottabytes. + */ +static int general_numcompare (const char *sa, const char *sb, + bool size_in_bytes) { /* FIXME: add option to warn about failed conversions. */ /* FIXME: maybe add option to try expensive FP conversion @@ -1095,6 +1107,49 @@ general_numcompare (const char *sa, cons if (sb == eb) return 1; + if (size_in_bytes && ea && eb) + { + char ca, cb; + + while (isblank(*ea)) + ea++; + while (isblank(*eb)) + eb++; + + ca = (char) tolower(*ea); + cb = (char) tolower(*eb); + + /* 1) We don't require both operands to have a known suffix. + * 2) If both suffixes are unknown or equal, compare as usual */ + if (! ( (ca == 'k' || ca == 'm' || ca == 'g' || ca == 't' + || ca == 'p' || ca == 'e' || ca == 'z' || ca == 'y') + || + (cb == 'k' || cb == 'm' || cb == 'g' || cb == 't' + || cb == 'p' || cb == 'e' || cb == 'z' || cb == 'y') ) + || ca == cb) + goto compare_as_usual; + + /* As ca != cb, if ca in YB => ca bigger, cb in YB => cb bigger, + * if neither of these, if ca in ZB => ca bigger etc */ + return (ca == 'y' ? 1 + : cb == 'y' ? -1 + : ca == 'z' ? 1 + : cb == 'z' ? -1 + : ca == 'e' ? 1 + : cb == 'e' ? -1 + : ca == 'p' ? 1 + : cb == 'p' ? -1 + : ca == 't' ? 1 + : cb == 't' ? -1 + : ca == 'g' ? 1 + : cb == 'g' ? -1 + : ca == 'm' ? 1 + : cb == 'm' ? -1 + : ca == 'k' ? 1 /* ca in KB and cb without a known suffix */ + : -1); /* cb in KB and ca without a known suffix */ + } + +compare_as_usual: /* Sort numbers in the usual way, where -0 == +0. Put NaNs after conversion errors but before numbers; sort them by internal bit-pattern, for lack of a more portable alternative. */ @@ -1179,13 +1234,14 @@ keycompare (const struct line *a, const size_t lenb = limb <= textb ? 0 : limb - textb; /* Actually compare the fields. 
*/ - if (key->numeric | key->general_numeric) + if (key->numeric | key->general_numeric | key->size_in_bytes) { char savea = *lima, saveb = *limb; *lima = *limb = '\0'; - diff = ((key->numeric ? numcompare : general_numcompare) - (texta, textb)); + diff = (key->numeric ? + numcompare(texta, textb) : + general_numcompare(texta, textb, key->size_in_bytes)); *lima = savea, *limb = saveb; } else if (key->month) @@ -2069,6 +2125,9 @@ set_ordering (const char *s, struct keyf case 'g': key->general_numeric = true; break; + case 'h': + key->size_in_bytes = true; + break; case 'i': /* Option order should not matter, so don't let -i override -d. -d implies -i, but -i does not imply -d. */ @@ -2187,7 +2246,8 @@ main (int argc, char **argv) gkey.sword = gkey.eword = SIZE_MAX; gkey.ignore = NULL; gkey.translate = NULL; - gkey.numeric = gkey.general_numeric = gkey.month = gkey.reverse = false; + gkey.numeric = gkey.general_numeric = gkey.size_in_bytes = false; + gkey.month = gkey.reverse = false; gkey.skipsblanks = gkey.skipeblanks = false; files = xnmalloc (argc, sizeof *files); @@ -2259,6 +2319,7 @@ main (int argc, char **argv) case 'd': case 'f': case 'g': + case 'h': case 'i': case 'M': case 'n': @@ -2418,7 +2479,7 @@ main (int argc, char **argv) if (! (key->ignore || key->translate || (key->skipsblanks | key->reverse | key->skipeblanks | key->month | key->numeric - | key->general_numeric))) + | key->general_numeric | key->size_in_bytes))) { key->ignore = gkey.ignore; key->translate = gkey.translate; @@ -2427,12 +2488,14 @@ main (int argc, char **argv) key->month = gkey.month; key->numeric = gkey.numeric; key->general_numeric = gkey.general_numeric; + key->size_in_bytes = gkey.size_in_bytes; key->reverse = gkey.reverse; } if (!keylist && (gkey.ignore || gkey.translate || (gkey.skipsblanks | gkey.skipeblanks | gkey.month - | gkey.numeric | gkey.general_numeric))) + | gkey.numeric | gkey.general_numeric + | gkey.size_in_bytes ))) insertkey (&gkey); reverse = gkey.reverse;
_______________________________________________ Bug-coreutils mailing list Bug-coreutils@gnu.org http://lists.gnu.org/mailman/listinfo/bug-coreutils