I wrote the following patch against the 7.2 branch of coreutils to allow `sort` to sort by human-readable byte sizes. I looked around a bit to see what the status of previous attempts to integrate this functionality was, but didn't see any very recent activity. This is my first interaction with coreutils, so if I missed something obvious, please point me towards it.
Is the last potential patch ( http://www.mail-archive.com/bug-coreutils@gnu.org/msg14080.html ) moving through? If not, if I cleaned this up ( tabs, documentation, and test cases ) and applied it to the current HEAD on savannah is there a chance of getting this functionality into sort? Patch assumptions : * that numbers will use the best representation ( never uses 1024b instead of 1k, etc ) * that the sizes will be specified via suffixes of b, K, M, G, T, P, E, Z, Y or their alternately cased variants The first assumption results in checking only the suffix when they differ. This enables it to match the output of `du -h / du --si`, but possibly not other tools that do not conform to these assumptions. --------- --- orig/coreutils-7.2/src/sort.c 2009-03-29 13:44:10.000000000 -0400 +++ coreutils-7.2/src/sort.c 2009-04-24 14:03:47.000000000 -0400 @@ -176,6 +176,8 @@ bool random; /* Sort by random hash of key. */ bool general_numeric; /* Flag for general, numeric comparison. Handle numbers in exponential notation. */ + bool human_numeric; /* Flag for sorting by size specified + data */ bool month; /* Flag for comparison by month name. */ bool reverse; /* Reverse the sense of comparison. 
*/ bool version; /* sort by version number */ @@ -426,7 +428,7 @@ SORT_OPTION }; -static char const short_options[] = "-bcCdfgik:mMno:rRsS:t:T:uVy:z"; +static char const short_options[] = "-bcCdfghik:mMno:rRsS:t:T:uVy:z"; static struct option const long_options[] = { @@ -442,6 +444,7 @@ {"merge", no_argument, NULL, 'm'}, {"month-sort", no_argument, NULL, 'M'}, {"numeric-sort", no_argument, NULL, 'n'}, + {"human-sort", no_argument, NULL, 'h'}, {"version-sort", no_argument, NULL, 'V'}, {"random-sort", no_argument, NULL, 'R'}, {"random-source", required_argument, NULL, RANDOM_SOURCE_OPTION}, @@ -1673,6 +1676,57 @@ return strnumcmp (a, b, decimal_point, thousands_sep); } +/* assumes UCHAR_MAX of 255 */ +/* Y/y:8 -> K/k:1 , otherwise ( including b ) : 0 */ +const char weights [] = + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 6, 0, 3, 0, 0, 0, 1, 0, 2, 0, 0, + 5, 0, 0, 0, 4, 0, 0, 0, 0, 8, 7, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 6, 0, 3, 0, 0, 0, 1, 0, 2, 0, 0, + 5, 0, 0, 0, 4, 0, 0, 0, 0, 8, 7, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } ; + +static int +human_compare(const char *a, const char *b) +{ + /* this tests numeric entities ending in human readable size specifiers + b < K < M < G < T < P < E < Z < Y + we (rudely I admit) assume that numbers are properly abbreviated. 
+ for example, you will never see 500,000,000b instead of 500M + */ + + const char * ar, * br ; /* riders */ + int aw, bw ; + + while(blanks[to_uchar (*a)]) + a++; + while(blanks[to_uchar (*b)]) + b++; + + ar = a ; + br = b ; + + while( ISDIGIT(*ar) || (*ar) == decimal_point || (*ar) == thousands_sep ) + ar++ ; + while( ISDIGIT(*br) || (*br) == decimal_point || (*br) == thousands_sep ) + br++ ; + + aw = weights[to_uchar (*ar)] ; + bw = weights[to_uchar (*br)] ; + + return aw > bw ? 1 : aw < bw ? -1 : strnumcmp( a , b , decimal_point , thousands_sep) ; +} + static int general_numcompare (const char *sa, const char *sb) { @@ -1917,6 +1971,10 @@ if (key->random) diff = compare_random (texta, lena, textb, lenb); + else if (key->human_numeric) + { + diff = human_compare(texta, textb); + } else if (key->numeric | key->general_numeric) { char savea = *lima, saveb = *limb; @@ -2887,7 +2945,7 @@ for (key = keylist; key; key = key->next) if ((1 < (key->random + key->numeric + key->general_numeric + key->month - + key->version + !!key->ignore)) + + key->version + (!!key->ignore) + key->human_numeric)) || (key->random && key->translate)) { /* The following is too big, but guaranteed to be "big enough". */ @@ -2899,6 +2957,8 @@ *p++ = 'f'; if (key->general_numeric) *p++ = 'g'; + if (key->human_numeric) + *p++ = 'h'; if (key->ignore == nonprinting) *p++ = 'i'; if (key->month) @@ -2990,6 +3050,9 @@ case 'g': key->general_numeric = true; break; + case 'h': + key->human_numeric = true; + break; case 'i': /* Option order should not matter, so don't let -i override -d. -d implies -i, but -i does not imply -d. 
*/ @@ -3138,7 +3201,7 @@ gkey.sword = gkey.eword = SIZE_MAX; gkey.ignore = NULL; gkey.translate = NULL; - gkey.numeric = gkey.general_numeric = gkey.random = gkey.version = false; + gkey.numeric = gkey.general_numeric = gkey.random = gkey.version = gkey.human_numeric = false; gkey.month = gkey.reverse = false; gkey.skipsblanks = gkey.skipeblanks = false; @@ -3217,6 +3280,7 @@ case 'd': case 'f': case 'g': + case 'h': case 'i': case 'M': case 'n': @@ -3469,6 +3533,7 @@ | key->numeric | key->version | key->general_numeric + | key->human_numeric | key->random))) { key->ignore = gkey.ignore; @@ -3478,6 +3543,7 @@ key->month = gkey.month; key->numeric = gkey.numeric; key->general_numeric = gkey.general_numeric; + key->human_numeric = gkey.human_numeric; key->random = gkey.random; key->reverse = gkey.reverse; key->version = gkey.version; @@ -3493,6 +3559,7 @@ | gkey.month | gkey.numeric | gkey.general_numeric + | gkey.human_numeric | gkey.random | gkey.version))) { _______________________________________________ Bug-coreutils mailing list Bug-coreutils@gnu.org http://lists.gnu.org/mailman/listinfo/bug-coreutils