2009/4/25 Pádraig Brady <p...@draigbrady.com>: > > I've further modified your latest in the attached. > I refactored the suffix finding a bit and also added > support for --sort=human-numeric.
I refactored it again to handle some potential problems with how separators and decimal points were handled. It will still let you write something silly like "1,3,4.5.6", but it now stops scanning on "4..4" or "3,,2" or even "5.M". I'm not sure if that last one is used meaningfully anywhere. I did this partly to avoid breaking locales where space is the separator. `du --h --apparent-size` output like this: >> 4 TO-DO >> 5 Million-dollar-idea >> 3K whatever would have triggered the mixed prefix error spuriously due to the greedy consumption of the space in the second line. I am not concerned with making it parse intelligently for all the various locales, but only with making sure it doesn't do anything particularly stupid. http://en.wikipedia.org/wiki/ISO_31-0#Numbers It appears ISO suggests a space as the separator. I poked around a bit to see if any locales used a space. Apparently, the Hungarian locale does. I stopped looking there. > I'm wondering whether "numeric" is superfluous? > I.E. are --sort=human and --human-sort sufficient. > I started with just human, but thought it better to add the numeric since sort compares strings by default, and both current switches that enable numeric sorts have it in their names. I would not fight a reversion on this if no one thought it would look confusing or too inconsistent to end users. -Michael Speer
--- orig/coreutils-7.2/src/sort.c 2009-03-29 13:44:10.000000000 -0400 +++ coreutils-7.2/src/sort.c 2009-04-26 00:46:42.000000000 -0400 @@ -176,6 +176,8 @@ bool random; /* Sort by random hash of key. */ bool general_numeric; /* Flag for general, numeric comparison. Handle numbers in exponential notation. */ + bool human_numeric; /* Flag for sorting by human readable + units with either SI xor IEC prefixes. */ bool month; /* Flag for comparison by month name. */ bool reverse; /* Reverse the sense of comparison. */ bool version; /* sort by version number */ @@ -336,6 +338,9 @@ -i, --ignore-nonprinting consider only printable characters\n\ -M, --month-sort compare (unknown) < `JAN' < ... < `DEC'\n\ "), stdout); + fputs(_("\ + -h, --human-numeric-sort compare human readable numbers (e.g., 2K 1G)\n\ +"), stdout); fputs (_("\ -n, --numeric-sort compare according to string numerical value\n\ -R, --random-sort sort by random hash of keys\n\ @@ -344,8 +349,8 @@ "), stdout); fputs (_("\ --sort=WORD sort according to WORD:\n\ - general-numeric -g, month -M, numeric -n,\n\ - random -R, version -V\n\ + general-numeric -g, human-numeric -h, month -M,\n\ + numeric -n, random -R, version -V\n\ -V, --version-sort sort by numeric version\n\ \n\ "), stdout); @@ -426,7 +431,7 @@ SORT_OPTION }; -static char const short_options[] = "-bcCdfgik:mMno:rRsS:t:T:uVy:z"; +static char const short_options[] = "-bcCdfghik:mMno:rRsS:t:T:uVy:z"; static struct option const long_options[] = { @@ -442,6 +447,7 @@ {"merge", no_argument, NULL, 'm'}, {"month-sort", no_argument, NULL, 'M'}, {"numeric-sort", no_argument, NULL, 'n'}, + {"human-numeric-sort", no_argument, NULL, 'h'}, {"version-sort", no_argument, NULL, 'V'}, {"random-sort", no_argument, NULL, 'R'}, {"random-source", required_argument, NULL, RANDOM_SOURCE_OPTION}, @@ -480,6 +486,7 @@ #define SORT_TABLE \ _st_("general-numeric", 'g') \ + _st_("human-numeric", 'h') \ _st_("month", 'M') \ _st_("numeric", 'n') \ _st_("random", 'R') \ @@ -1673,6 
+1680,85 @@ return strnumcmp (a, b, decimal_point, thousands_sep); } +/* Exit with an error if a mixture of SI and IEC units detected. */ + +static void +check_mixed_SI_IEC (char prefix) +{ + static int seen_si = -1; + bool si_present = prefix == 'i'; + if (seen_si != -1 && seen_si != si_present) + error (SORT_FAILURE, 0, _("both SI and IEC prefixes present on units")); + seen_si = si_present; +} + +/* return an integer which represents the order of magnitude of + the unit following the number +*/ +unsigned int +find_unit_order (const char* number) +{ + /* FIXME : if sort is fixed for multibyte + * separators this will need to be fixed too + */ + + static const char weights [UCHAR_LIM] = { + ['K']=1, ['M']=2, ['G']=3, ['T']=4, ['P']=5, ['E']=6, ['Z']=7, ['Y']=8, + ['k']=1, + }; + + const char *p = number; + + /* scan to end of number + * decimals or separators not followed by digits + * stop the scan + * numbers ending in decimals or separators are + * are thus considered to be lacking in units + */ + while ( ISDIGIT (*p) ) + { + p++ ; + + if ( *p == decimal_point && ISDIGIT( *(p+1) ) ) + p++ ; + else if ( thousands_sep != -1 ) + if ( *p == thousands_sep && ISDIGIT( *(p+1) ) ) + p++ ; + } + + /* only check for *ibi vs *ilo when actually on a + prefix that could be one of them */ + + int weight = weights[to_uchar ( *p )] ; + + if ( weight ) + check_mixed_SI_IEC (*(p+1)); + + return weight ; + +} + +/* Compare numbers ending in units with SI xor IEC prefixes + <none/unknown> < K < M < G < T < P < E < Z < Y + Assume that numbers are properly abbreviated. + i.e. input will never have 5000K instead of 5M. */ + +static int +human_numcompare (const char *a, const char *b) +{ + while (blanks[to_uchar (*a)]) + a++; + while (blanks[to_uchar (*b)]) + b++; + + int aw = find_unit_order ( a ); + int bw = find_unit_order ( b ); + + return (aw > bw ? 1 + : aw < bw ? 
-1 + : strnumcmp (a , b , decimal_point , thousands_sep)); +} + static int general_numcompare (const char *sa, const char *sb) { @@ -1917,13 +2003,14 @@ if (key->random) diff = compare_random (texta, lena, textb, lenb); - else if (key->numeric | key->general_numeric) + else if (key->numeric | key->general_numeric | key->human_numeric) { char savea = *lima, saveb = *limb; *lima = *limb = '\0'; - diff = ((key->numeric ? numcompare : general_numcompare) - (texta, textb)); + diff = ((key->numeric ? numcompare + : key->general_numeric ? general_numcompare + : human_numcompare) (texta, textb)); *lima = savea, *limb = saveb; } else if (key->version) @@ -2887,7 +2974,7 @@ for (key = keylist; key; key = key->next) if ((1 < (key->random + key->numeric + key->general_numeric + key->month - + key->version + !!key->ignore)) + + key->version + (!!key->ignore) + key->human_numeric)) || (key->random && key->translate)) { /* The following is too big, but guaranteed to be "big enough". */ @@ -2899,6 +2986,8 @@ *p++ = 'f'; if (key->general_numeric) *p++ = 'g'; + if (key->human_numeric) + *p++ = 'h'; if (key->ignore == nonprinting) *p++ = 'i'; if (key->month) @@ -2990,6 +3079,9 @@ case 'g': key->general_numeric = true; break; + case 'h': + key->human_numeric = true; + break; case 'i': /* Option order should not matter, so don't let -i override -d. -d implies -i, but -i does not imply -d. 
*/ @@ -3138,7 +3230,8 @@ gkey.sword = gkey.eword = SIZE_MAX; gkey.ignore = NULL; gkey.translate = NULL; - gkey.numeric = gkey.general_numeric = gkey.random = gkey.version = false; + gkey.numeric = gkey.general_numeric = gkey.human_numeric = false; + gkey.random = gkey.version = false; gkey.month = gkey.reverse = false; gkey.skipsblanks = gkey.skipeblanks = false; @@ -3217,6 +3310,7 @@ case 'd': case 'f': case 'g': + case 'h': case 'i': case 'M': case 'n': @@ -3469,6 +3563,7 @@ | key->numeric | key->version | key->general_numeric + | key->human_numeric | key->random))) { key->ignore = gkey.ignore; @@ -3478,6 +3573,7 @@ key->month = gkey.month; key->numeric = gkey.numeric; key->general_numeric = gkey.general_numeric; + key->human_numeric = gkey.human_numeric; key->random = gkey.random; key->reverse = gkey.reverse; key->version = gkey.version; @@ -3493,6 +3589,7 @@ | gkey.month | gkey.numeric | gkey.general_numeric + | gkey.human_numeric | gkey.random | gkey.version))) {
_______________________________________________ Bug-coreutils mailing list Bug-coreutils@gnu.org http://lists.gnu.org/mailman/listinfo/bug-coreutils