Re: Support bytesize comparison in sort

Mart Somermaa Thu, 06 Apr 2006 10:41:33 -0700

Andrew D Jewell wrote:
>> I like the idea, but I object to using up yet another single-letter
>> option for this (they're not renewable - the ASCII character set has a
>> fixed size).  I suggest that we use a long option only.
>
> I agree halfway. As a separate command line argument, using up another
> single character might (possibly) be too aggressive.
>
> However, we do need to allocate a single character for it as part of
> the -k field specification; otherwise, as you said, how can you sort
> some fields this way and some fields that way. I'd recommend h, to
> match the -h of ls and df.
Replaced '-B' with '-h'. I initially used uppercase 'B' as I felt that
lowercase chars were indeed too valuable for a convenience feature of
little importance.
> As for the implementation, unless I'm confused, it only deals with K,M
> and G. It should at least support T if not P, E, Z and Y.
Added all these, up to yottabytes.
> I don't know if this is important, but I also don't see anything that
> would normalize numbers, so comparing 1000000 to 1M wouldn't do the
> right thing. To fix that would open another can of worms, in that
> you'd need to know if your M was 1000*1000 or 1024*1024.
My initial "unpublished" implementation used multiplication for
normalization -- which is less efficient and prone to the 1000-versus-1024
multiplier problem. The current implementation is 1000/1024-problem
agnostic at a cost -- it works correctly only if the input is scaled and
suffixed appropriately. If it's not, one will get unexpected results, as
you pointed out -- both 10000000 and 10000K will compare less than 1M.


However, I personally don't see this as a problem, as all GNU utilities
provide consistent, properly K,M,G-scaled output and I believe we should
prefer short, effective code to attempts to deal with obscure corner
cases. The feature and its limitations should be properly documented in
the manual, and the general normalization-based solution (multiplication
by a power of 1000 or 1024) should be considered only if somebody
presents a proven need for it.

Let me remind you that one can always fall back on numeric, suffix-less
input and sorting -- after all, this is just a convenience feature that
makes the life of sysadmins 'du -hs'-ing around their filesystems a little
bit easier (yes, the feature was driven by practical needs, not by the
coolness factor of contributing to The Divine Coreutils :) ).

Anyway, feel free to disagree.

The updated patch also fixes suffixed/non-suffixed number comparison.

Regards,
Mart Sõmermaa

--- sort.c.orig 2005-10-07 21:48:28.000000000 +0300
+++ sort.c      2006-04-06 20:00:51.000000000 +0300
@@ -26,6 +26,7 @@
 #include <getopt.h>
 #include <sys/types.h>
 #include <signal.h>
+#include <ctype.h>
 #include "system.h"
 #include "error.h"
 #include "hard-locale.h"
@@ -149,6 +150,9 @@ struct keyfield
                                   point, but no exponential notation. */
   bool general_numeric;                /* Flag for general, numeric comparison.
                                   Handle numbers in exponential notation. */
+  bool size_in_bytes;          /* Flag for human-readable bytesize comparison.
+                                  Handle numbers suffixed with K for kilo-,
+                                  M for mega- and G for gigabytes. */
   bool month;                  /* Flag for comparison by month name. */
   bool reverse;                        /* Reverse the sense of comparison. */
   struct keyfield *next;       /* Next keyfield to try. */
@@ -295,6 +299,7 @@ Ordering options:\n\
 "), stdout);
       fputs (_("\
   -b, --ignore-leading-blanks  ignore leading blanks\n\
+  -B, --size-in-bytes         compare bytesizes (numbers suffixed with K, M, 
G)\n\
   -d, --dictionary-order      consider only blanks and alphanumeric 
characters\n\
   -f, --ignore-case           fold lower case to upper case characters\n\
 "), stdout);
@@ -353,7 +358,7 @@ native byte values.\n\
   exit (status);
 }
 
-static char const short_options[] = "-bcdfgik:mMno:rsS:t:T:uy:z";
+static char const short_options[] = "-bcdfghik:mMno:rsS:t:T:uy:z";
 
 static struct option const long_options[] =
 {
@@ -362,6 +367,7 @@ static struct option const long_options[
   {"dictionary-order", no_argument, NULL, 'd'},
   {"ignore-case", no_argument, NULL, 'f'},
   {"general-numeric-sort", no_argument, NULL, 'g'},
+  {"size-in-bytes", no_argument, NULL, 'h'},
   {"ignore-nonprinting", no_argument, NULL, 'i'},
   {"key", required_argument, NULL, 'k'},
   {"merge", no_argument, NULL, 'm'},
@@ -1077,8 +1083,14 @@ numcompare (const char *a, const char *b
   return strnumcmp (a, b, decimal_point, thousands_sep);
 }
 
-static int
-general_numcompare (const char *sa, const char *sb)
+/* If size_in_bytes is true, compare strings A and B as human-readable
+ * positive byte counts (as returned e.g. by df -h) suffixed with
+ * either 'K', 'M', 'G', 'T', 'P', 'E', 'Z', 'Y' for kilobytes,
+ * megabytes, gigabytes, terabytes, petabytes, exabytes, zettabytes,
+ * yottabytes.  
+ */
+static int general_numcompare (const char *sa, const char *sb,
+               bool size_in_bytes)
 {
   /* FIXME: add option to warn about failed conversions.  */
   /* FIXME: maybe add option to try expensive FP conversion
@@ -1095,6 +1107,49 @@ general_numcompare (const char *sa, cons
   if (sb == eb)
     return 1;
 
+  if (size_in_bytes && ea && eb)
+    {
+      char ca, cb;
+
+      while (isblank(*ea))
+             ea++;
+      while (isblank(*eb))
+             eb++;
+
+      ca = (char) tolower(*ea);
+      cb = (char) tolower(*eb);
+
+      /* 1) We don't require both operands to have a known suffix. 
+       * 2) If both suffixes are unknown or equal, compare as usual */
+      if (! ( (ca == 'k' || ca == 'm' || ca == 'g' || ca == 't' 
+                     || ca == 'p' || ca == 'e' || ca == 'z' || ca == 'y')
+             ||
+             (cb == 'k' || cb == 'm' || cb == 'g' || cb == 't' 
+                     || cb == 'p' || cb == 'e' || cb == 'z' || cb == 'y') )
+         || ca == cb)
+       goto compare_as_usual;
+
+      /* As ca != cb, if ca in YB => ca bigger, cb in YB => cb bigger,
+       * if neither of these, if ca in ZB => ca bigger etc */
+      return (ca == 'y' ? 1
+             : cb == 'y' ? -1
+             : ca == 'z' ? 1
+             : cb == 'z' ? -1
+             : ca == 'e' ? 1
+             : cb == 'e' ? -1
+             : ca == 'p' ? 1
+             : cb == 'p' ? -1
+             : ca == 't' ? 1
+             : cb == 't' ? -1
+             : ca == 'g' ? 1
+             : cb == 'g' ? -1
+             : ca == 'm' ? 1
+             : cb == 'm' ? -1
+             : ca == 'k' ? 1 /* ca in KB and cb without a known suffix */
+             : -1); /* cb in KB and ca without a known suffix */
+    }
+
+compare_as_usual:
   /* Sort numbers in the usual way, where -0 == +0.  Put NaNs after
      conversion errors but before numbers; sort them by internal
      bit-pattern, for lack of a more portable alternative.  */
@@ -1179,13 +1234,14 @@ keycompare (const struct line *a, const 
       size_t lenb = limb <= textb ? 0 : limb - textb;
 
       /* Actually compare the fields. */
-      if (key->numeric | key->general_numeric)
+      if (key->numeric | key->general_numeric | key->size_in_bytes)
        {
          char savea = *lima, saveb = *limb;
 
          *lima = *limb = '\0';
-         diff = ((key->numeric ? numcompare : general_numcompare)
-                 (texta, textb));
+         diff = (key->numeric ?
+                   numcompare(texta, textb) :
+                   general_numcompare(texta, textb, key->size_in_bytes));
          *lima = savea, *limb = saveb;
        }
       else if (key->month)
@@ -2069,6 +2125,9 @@ set_ordering (const char *s, struct keyf
        case 'g':
          key->general_numeric = true;
          break;
+       case 'h':
+         key->size_in_bytes = true;
+         break;
        case 'i':
          /* Option order should not matter, so don't let -i override
             -d.  -d implies -i, but -i does not imply -d.  */
@@ -2187,7 +2246,8 @@ main (int argc, char **argv)
   gkey.sword = gkey.eword = SIZE_MAX;
   gkey.ignore = NULL;
   gkey.translate = NULL;
-  gkey.numeric = gkey.general_numeric = gkey.month = gkey.reverse = false;
+  gkey.numeric = gkey.general_numeric = gkey.size_in_bytes = false;
+  gkey.month = gkey.reverse = false;
   gkey.skipsblanks = gkey.skipeblanks = false;
 
   files = xnmalloc (argc, sizeof *files);
@@ -2259,6 +2319,7 @@ main (int argc, char **argv)
        case 'd':
        case 'f':
        case 'g':
+       case 'h':
        case 'i':
        case 'M':
        case 'n':
@@ -2418,7 +2479,7 @@ main (int argc, char **argv)
     if (! (key->ignore || key->translate
           || (key->skipsblanks | key->reverse
               | key->skipeblanks | key->month | key->numeric
-              | key->general_numeric)))
+              | key->general_numeric | key->size_in_bytes)))
       {
        key->ignore = gkey.ignore;
        key->translate = gkey.translate;
@@ -2427,12 +2488,14 @@ main (int argc, char **argv)
        key->month = gkey.month;
        key->numeric = gkey.numeric;
        key->general_numeric = gkey.general_numeric;
+       key->size_in_bytes = gkey.size_in_bytes;
        key->reverse = gkey.reverse;
       }
 
   if (!keylist && (gkey.ignore || gkey.translate
                   || (gkey.skipsblanks | gkey.skipeblanks | gkey.month
-                      | gkey.numeric | gkey.general_numeric)))
+                      | gkey.numeric | gkey.general_numeric
+                      | gkey.size_in_bytes )))
     insertkey (&gkey);
   reverse = gkey.reverse;

_______________________________________________
Bug-coreutils mailing list
Bug-coreutils@gnu.org
http://lists.gnu.org/mailman/listinfo/bug-coreutils

Re: Support bytesize comparison in sort

Reply via email to