2009/4/25 Pádraig Brady <p...@draigbrady.com>:
>
> I've further modified your latest in the attached.
> I refactored the suffix finding a bit and also added
> support for --sort=human-numeric.

I refactored it again to handle some potential problems with how
separators and decimal points were handled.  It will still let you
write something silly like "1,3,4.5.6", but I've stopped scanning on
"4..4" or "3,,2" or even "5.M".  I'm not sure if that last one is used
meaningfully anywhere.  I did this partly to avoid breaking locales
where space is the separator.  `du -h --apparent-size` output like
this:

>> 4    TO-DO
>> 5    Million-dollar-idea
>> 3K  whatever

would have triggered the mixed prefix error spuriously due to the
greedy consumption of space in the second line.  I am not concerned
with making it parse intelligently for all the various locales, but
only to make sure it doesn't do anything particularly stupid.

http://en.wikipedia.org/wiki/ISO_31-0#Numbers

It appears ISO suggests a space as the separator.  I poked around a bit
to see if any locales use a space.  Apparently, the Hungarian locale
does.  I stopped looking there.

> I'm wondering whether "numeric" is superfluous?
> I.E. are --sort=human and --human-sort sufficient.
>

I started with just human, but thought it better to add the numeric
since sort is by default for strings, and both current switches that
enable numeric sorts have it in their name.  I would not fight a
reversion on this if no one thought it would look confusing or too
inconsistent to end users.

-Michael Speer
--- orig/coreutils-7.2/src/sort.c	2009-03-29 13:44:10.000000000 -0400
+++ coreutils-7.2/src/sort.c	2009-04-26 00:46:42.000000000 -0400
@@ -176,6 +176,8 @@
   bool random;			/* Sort by random hash of key.  */
   bool general_numeric;		/* Flag for general, numeric comparison.
 				   Handle numbers in exponential notation. */
+  bool human_numeric;           /* Flag for sorting by human readable
+                                   units with either SI xor IEC prefixes. */
   bool month;			/* Flag for comparison by month name. */
   bool reverse;			/* Reverse the sense of comparison. */
   bool version;			/* sort by version number */
@@ -336,6 +338,9 @@
   -i, --ignore-nonprinting    consider only printable characters\n\
   -M, --month-sort            compare (unknown) < `JAN' < ... < `DEC'\n\
 "), stdout);
+      fputs(_("\
+  -h, --human-numeric-sort    compare human readable numbers (e.g., 2K 1G)\n\
+"), stdout);
       fputs (_("\
   -n, --numeric-sort          compare according to string numerical value\n\
   -R, --random-sort           sort by random hash of keys\n\
@@ -344,8 +349,8 @@
 "), stdout);
       fputs (_("\
       --sort=WORD             sort according to WORD:\n\
-                                general-numeric -g, month -M, numeric -n,\n\
-                                random -R, version -V\n\
+                                general-numeric -g, human-numeric -h, month -M,\n\
+                                numeric -n, random -R, version -V\n\
   -V, --version-sort          sort by numeric version\n\
 \n\
 "), stdout);
@@ -426,7 +431,7 @@
   SORT_OPTION
 };
 
-static char const short_options[] = "-bcCdfgik:mMno:rRsS:t:T:uVy:z";
+static char const short_options[] = "-bcCdfghik:mMno:rRsS:t:T:uVy:z";
 
 static struct option const long_options[] =
 {
@@ -442,6 +447,7 @@
   {"merge", no_argument, NULL, 'm'},
   {"month-sort", no_argument, NULL, 'M'},
   {"numeric-sort", no_argument, NULL, 'n'},
+  {"human-numeric-sort", no_argument, NULL, 'h'},
   {"version-sort", no_argument, NULL, 'V'},
   {"random-sort", no_argument, NULL, 'R'},
   {"random-source", required_argument, NULL, RANDOM_SOURCE_OPTION},
@@ -480,6 +486,7 @@
 
 #define SORT_TABLE \
   _st_("general-numeric", 'g') \
+  _st_("human-numeric",   'h') \
   _st_("month",           'M') \
   _st_("numeric",         'n') \
   _st_("random",          'R') \
@@ -1673,6 +1680,85 @@
   return strnumcmp (a, b, decimal_point, thousands_sep);
 }
 
+/* Exit with an error if calls disagree on whether PREFIX is 'i' (Ki vs K).  */
+
+static void
+check_mixed_SI_IEC (char prefix)
+{
+  static int seen_si = -1;  /* -1 until the first call, then 0 or 1.  */
+  bool si_present = prefix == 'i';  /* NOTE(review): true for an 'i' suffix, which looks like the IEC form despite the name -- confirm.  */
+  if (seen_si != -1 && seen_si != si_present)
+    error (SORT_FAILURE, 0, _("both SI and IEC prefixes present on units"));
+  seen_si = si_present;  /* Static, so the value persists across calls.  */
+}
+
+/* Return the rank (0 for none/unknown, 1 for K or k, ... 8 for Y) of the
+   unit letter that immediately follows the number at the start of NUMBER.
+   NOTE(review): a leading '-' is not skipped, so "-5K" yields 0 -- confirm.  */
+unsigned int
+find_unit_order (const char* number)
+{
+  /* FIXME: this scans one byte at a time; if sort is ever fixed to handle
+     multibyte separators, this function will need to be fixed too.
+   */
+  
+  static const char weights [UCHAR_LIM] = {
+    ['K']=1, ['M']=2, ['G']=3, ['T']=4, ['P']=5, ['E']=6, ['Z']=7, ['Y']=8,
+    ['k']=1,  /* Lower-case k ranks the same as K.  */
+  };
+  
+  const char *p = number;
+  
+  /* Advance P past the leading number.  A decimal point or thousands
+     separator is consumed only when a digit follows it; otherwise the
+     scan stops there.  A number that ends in a decimal point or
+     separator is therefore treated as having no unit, because P is
+     left on that character rather than on a unit letter.
+   */
+  while ( ISDIGIT (*p) )
+    {
+      p++ ;
+      
+      if ( *p == decimal_point && ISDIGIT( *(p+1) ) )
+          p++ ;
+      else if ( thousands_sep != -1 )  /* NOTE(review): -1 presumably means "no separator" -- confirm.  */
+        if ( *p == thousands_sep && ISDIGIT( *(p+1) ) )
+            p++ ;
+    }
+  
+  /* Only run the "Ki" versus "K" consistency check when P is on a
+     recognised unit letter, i.e. when its table entry is nonzero.  */
+  
+  int weight = weights[to_uchar ( *p )] ;
+  
+  if ( weight )
+    check_mixed_SI_IEC (*(p+1));
+  
+  return weight ;  /* The int is converted to the unsigned int return type.  */
+  
+}
+
+/* Compare A and B as numbers with optional unit suffixes, ranking by unit
+   first: <none/unknown> < K < M < G < T < P < E < Z < Y.  Equal units
+   fall back to strnumcmp.  Leading blanks are skipped.  This assumes
+   properly abbreviated input, i.e. never 5000K in place of 5M.  */
+
+static int
+human_numcompare (const char *a, const char *b)
+{
+  while (blanks[to_uchar (*a)])
+    a++;
+  while (blanks[to_uchar (*b)])
+    b++;
+  
+  int aw = find_unit_order ( a );  /* Unit rank of A; 0 when no unit is found.  */
+  int bw = find_unit_order ( b );  /* Unit rank of B; 0 when no unit is found.  */
+  
+  return (aw > bw ? 1
+          : aw < bw ? -1
+          : strnumcmp (a , b , decimal_point , thousands_sep));
+}
+
 static int
 general_numcompare (const char *sa, const char *sb)
 {
@@ -1917,13 +2003,14 @@
 
       if (key->random)
 	diff = compare_random (texta, lena, textb, lenb);
-      else if (key->numeric | key->general_numeric)
+      else if (key->numeric | key->general_numeric | key->human_numeric)
 	{
 	  char savea = *lima, saveb = *limb;
 
 	  *lima = *limb = '\0';
-	  diff = ((key->numeric ? numcompare : general_numcompare)
-		  (texta, textb));
+	  diff = ((key->numeric ? numcompare
+		   : key->general_numeric ? general_numcompare
+		   : human_numcompare) (texta, textb));
 	  *lima = savea, *limb = saveb;
 	}
       else if (key->version)
@@ -2887,7 +2974,7 @@
 
   for (key = keylist; key; key = key->next)
     if ((1 < (key->random + key->numeric + key->general_numeric + key->month
-	      + key->version + !!key->ignore))
+	      + key->version + (!!key->ignore) + key->human_numeric))
 	|| (key->random && key->translate))
       {
 	/* The following is too big, but guaranteed to be "big enough". */
@@ -2899,6 +2986,8 @@
 	  *p++ = 'f';
 	if (key->general_numeric)
 	  *p++ = 'g';
+        if (key->human_numeric)
+          *p++ = 'h';
 	if (key->ignore == nonprinting)
 	  *p++ = 'i';
 	if (key->month)
@@ -2990,6 +3079,9 @@
 	case 'g':
 	  key->general_numeric = true;
 	  break;
+        case 'h':
+          key->human_numeric = true;
+          break;
 	case 'i':
 	  /* Option order should not matter, so don't let -i override
 	     -d.  -d implies -i, but -i does not imply -d.  */
@@ -3138,7 +3230,8 @@
   gkey.sword = gkey.eword = SIZE_MAX;
   gkey.ignore = NULL;
   gkey.translate = NULL;
-  gkey.numeric = gkey.general_numeric = gkey.random = gkey.version = false;
+  gkey.numeric = gkey.general_numeric = gkey.human_numeric = false;
+  gkey.random = gkey.version = false;
   gkey.month = gkey.reverse = false;
   gkey.skipsblanks = gkey.skipeblanks = false;
 
@@ -3217,6 +3310,7 @@
 	case 'd':
 	case 'f':
 	case 'g':
+        case 'h':
 	case 'i':
 	case 'M':
 	case 'n':
@@ -3469,6 +3563,7 @@
 		 | key->numeric
 		 | key->version
 		 | key->general_numeric
+                 | key->human_numeric
 		 | key->random)))
         {
           key->ignore = gkey.ignore;
@@ -3478,6 +3573,7 @@
           key->month = gkey.month;
           key->numeric = gkey.numeric;
           key->general_numeric = gkey.general_numeric;
+          key->human_numeric = gkey.human_numeric;
           key->random = gkey.random;
           key->reverse = gkey.reverse;
           key->version = gkey.version;
@@ -3493,6 +3589,7 @@
 		       | gkey.month
 		       | gkey.numeric
 		       | gkey.general_numeric
+                       | gkey.human_numeric
 		       | gkey.random
 		       | gkey.version)))
     {
_______________________________________________
Bug-coreutils mailing list
Bug-coreutils@gnu.org
http://lists.gnu.org/mailman/listinfo/bug-coreutils

Reply via email to