Re: Human readable sort

2009-07-02 Thread Jim Meyering
Pádraig Brady wrote:
 I was thinking that the mixed IEC/SI check
 should be applied to each key separately rather
 than globally. What do you think? Patch attached.

From ef06a30d122fc9ccac51a682a3abf6868d8832d6 Mon Sep 17 00:00:00 2001
...
 -check_mixed_SI_IEC (char prefix)
 +check_mixed_SI_IEC (char prefix, struct keyfield *key)
  {
 -  static int seen_si = -1;
 -  bool si_present = prefix == 'i';
 -  if (seen_si != -1  seen_si != si_present)
 +  int si_present = prefix == 'i';
 +  if (key-si_present != -1  si_present != key-si_present)
  error (SORT_FAILURE, 0, _(both SI and IEC prefixes present on units));

Good idea.

Not part of this change, I know, but that diagnostic should include a
file name.  Imagine sorting many files, with many key specifiers (hence
many columns of data) and very many lines, yet only a few offenders.

With a file_name:line number (and byte/char-count?) and maybe even a
sample of the offending data, it'd be easier to spot and correct the
problem.


___
Bug-coreutils mailing list
Bug-coreutils@gnu.org
http://lists.gnu.org/mailman/listinfo/bug-coreutils


Re: Human readable sort

2009-06-30 Thread Pádraig Brady
I was thinking that the mixed IEC/SI check
should be applied to each key separately rather
than globally. What do you think? Patch attached.

cheers,
Pádraig.
From ef06a30d122fc9ccac51a682a3abf6868d8832d6 Mon Sep 17 00:00:00 2001
From: =?utf-8?q?P=C3=A1draig=20Brady?= p...@draigbrady.com
Date: Tue, 30 Jun 2009 00:52:43 +0100
Subject: [PATCH] sort: allow SI and IEC units on separate human sort fields

* src/sort.c: Store the si_present state per key rather than globally
* tests/misc/sort: Add a corresponding check that previously failed
---
 src/sort.c  |   30 --
 tests/misc/sort |2 ++
 2 files changed, 18 insertions(+), 14 deletions(-)

diff --git a/src/sort.c b/src/sort.c
index 6acec07..62ddd49 100644
--- a/src/sort.c
+++ b/src/sort.c
@@ -178,6 +178,7 @@ struct keyfield
    Handle numbers in exponential notation. */
   bool human_numeric;		/* Flag for sorting by human readable
    units with either SI xor IEC prefixes. */
+  int si_present;		/* Flag for checking for mixed SI and IEC. */
   bool month;			/* Flag for comparison by month name. */
   bool reverse;			/* Reverse the sense of comparison. */
   bool version;			/* sort by version number */
@@ -1684,13 +1685,12 @@ numcompare (const char *a, const char *b)
 /* Exit with an error if a mixture of SI and IEC units detected.  */
 
 static void
-check_mixed_SI_IEC (char prefix)
+check_mixed_SI_IEC (char prefix, struct keyfield *key)
 {
-  static int seen_si = -1;
-  bool si_present = prefix == 'i';
-  if (seen_si != -1  seen_si != si_present)
+  int si_present = prefix == 'i';
+  if (key-si_present != -1  si_present != key-si_present)
 error (SORT_FAILURE, 0, _(both SI and IEC prefixes present on units));
-  seen_si = si_present;
+  key-si_present = si_present;
 }
 
 /* Return an integer which represents the order of magnitude of
@@ -1699,7 +1699,7 @@ check_mixed_SI_IEC (char prefix)
Negative numbers return a negative unit order.  */
 
 static int
-find_unit_order (const char *number)
+find_unit_order (const char *number, struct keyfield *key)
 {
   static const char orders [UCHAR_LIM] = {
 ['K']=1, ['M']=2, ['G']=3, ['T']=4, ['P']=5, ['E']=6, ['Z']=7, ['Y']=8,
@@ -1736,7 +1736,7 @@ find_unit_order (const char *number)
 
   /* For valid units check for MiB vs MB etc.  */
   if (order)
-check_mixed_SI_IEC (*(p + 1));
+check_mixed_SI_IEC (*(p + 1), key);
 
   return sign * order;
 }
@@ -1747,15 +1747,15 @@ find_unit_order (const char *number)
i.e. input will never have both 6000K and 5M.  */
 
 static int
-human_numcompare (const char *a, const char *b)
+human_numcompare (const char *a, const char *b, struct keyfield *key)
 {
   while (blanks[to_uchar (*a)])
 a++;
   while (blanks[to_uchar (*b)])
 b++;
 
-  int order_a = find_unit_order (a);
-  int order_b = find_unit_order (b);
+  int order_a = find_unit_order (a, key);
+  int order_b = find_unit_order (b, key);
 
   return (order_a  order_b ? 1
 	  : order_a  order_b ? -1
@@ -1982,7 +1982,7 @@ compare_version (char *restrict texta, size_t lena,
 static int
 keycompare (const struct line *a, const struct line *b)
 {
-  struct keyfield const *key = keylist;
+  struct keyfield *key = keylist;
 
   /* For the first iteration only, the key positions have been
  precomputed for us. */
@@ -2015,9 +2015,9 @@ keycompare (const struct line *a, const struct line *b)
 	  char savea = *lima, saveb = *limb;
 
 	  *lima = *limb = '\0';
-	  diff = ((key-numeric ? numcompare
-		   : key-general_numeric ? general_numcompare
-		   : human_numcompare) (texta, textb));
+	  diff = (key-numeric ? numcompare (texta, textb)
+		  : key-general_numeric ? general_numcompare (texta, textb)
+		  : human_numcompare (texta, textb, key));
 	  *lima = savea, *limb = saveb;
 	}
   else if (key-version)
@@ -3125,6 +3125,7 @@ key_init (struct keyfield *key)
 {
   memset (key, 0, sizeof *key);
   key-eword = SIZE_MAX;
+  key-si_present = -1;
   return key;
 }
 
@@ -3240,6 +3241,7 @@ main (int argc, char **argv)
   gkey.ignore = NULL;
   gkey.translate = NULL;
   gkey.numeric = gkey.general_numeric = gkey.human_numeric = false;
+  gkey.si_present = -1;
   gkey.random = gkey.version = false;
   gkey.month = gkey.reverse = false;
   gkey.skipsblanks = gkey.skipeblanks = false;
diff --git a/tests/misc/sort b/tests/misc/sort
index 21e7af8..1340500 100755
--- a/tests/misc/sort
+++ b/tests/misc/sort
@@ -71,6 +71,8 @@ my @Tests =
  {ERR=$prog: options `-hn' are incompatible\n}],
 # check key processing
 [h8, '-n -k2,2h', {IN=1 1E\n2 2M\n}, {OUT=2 2M\n1 1E\n}],
+# SI and IEC prefixes on separate keys allowed
+[h9, '-h -k1,1 -k2,2', {IN=1M 1Mi\n1M 1Mi\n}, {OUT=1M 1Mi\n1M 1Mi\n}],
 
 [01a, '', {IN=A\nB\nC\n}, {OUT=A\nB\nC\n}],
 #
-- 
1.6.2.5

___
Bug-coreutils mailing list
Bug-coreutils@gnu.org
http://lists.gnu.org/mailman/listinfo/bug-coreutils


Re: Human readable sort

2009-05-27 Thread Jim Meyering
Pádraig Brady wrote:
 Latest version attached.
...
 * NEWS: Document the new option
 * doc/coreutils.texi (sort invocation): ditto
 * src/sort.c (main): handle the new --human-numeric-sort option (-h).
 (human_numcompare): A new function to compare SI and IEC suffixes
 before falling back to the standard --numeric comparison.
 (find_unit_order): A new helper function to find the order
 of magnitude of a number string as determined by its suffix.
 (check_mixed_SI_IEC): A new helper function to exit with error
 if both SI and IEC suffixes are presented.

Looks fine.
Thank you, Pádraig and Michael.


___
Bug-coreutils mailing list
Bug-coreutils@gnu.org
http://lists.gnu.org/mailman/listinfo/bug-coreutils


Re: Human readable sort

2009-05-26 Thread Pádraig Brady
Jim Meyering wrote:
 
 Please make one small change to that log message:
 
 s/ -human/ --human/
 
 And in the documentation,
 
   +Sort numerically, as per the @option{--numeric-sort} option,
   +and in addition handle IEC or SI suffixes like MiB, MB etc.
   +Note a mixture of these suffixes is not supported and will
   +be flagged as an error. Also the numbers must be abbreviated unambiguously.
   +I.E. 5000K and 6M will be sorted incorrectly for example.
 
 Eventually, it'd be nice to explain in detail why those
 would cause trouble.
 
 Maybe s/unambiguously/consistently/
 or   /uniformly/
 and mention that it's the inconsistent precision that causes trouble.
 
 Hmm actually those two *are* sorted properly for me:
 
 $ printf '%s\n' 5000K 6M| src/sort --human
 5000K
 6M
 
 However, these two are not:
 
 $ printf '%s\n' 7000K 6M| src/sort --human
 7000K
 6M
 

Latest version attached.

cheers,
Pádraig.
From 159faba1376ffd5a46fe4bbc780d85dd3e502cea Mon Sep 17 00:00:00 2001
From: Michael Speer knome...@gmail.com
Date: Mon, 27 Apr 2009 14:51:29 +0100
Subject: [PATCH] sort: new --human-numeric-sort option to sort KiB MB etc.

* NEWS: Document the new option
* doc/coreutils.texi (sort invocation): ditto
* src/sort.c (main): handle the new --human-numeric-sort option (-h).
(human_numcompare): A new function to compare SI and IEC suffixes
before falling back to the standard --numeric comparison.
(find_unit_order): A new helper function to find the order
of magnitude of a number string as determined by its suffix.
(check_mixed_SI_IEC): A new helper function to exit with error
if both SI and IEC suffixes are presented.
* tests/misc/sort: Add 8 tests to test the new functionality.
* THANKS: Update
---
 NEWS   |3 +
 THANKS |1 +
 doc/coreutils.texi |   15 +++
 src/sort.c |  115 
 tests/misc/sort|   18 
 5 files changed, 144 insertions(+), 8 deletions(-)

diff --git a/NEWS b/NEWS
index 3af06e4..29b09a0 100644
--- a/NEWS
+++ b/NEWS
@@ -11,6 +11,9 @@ GNU coreutils NEWS-*- outline -*-
 
   chroot now accepts the options --userspec and --groups.
 
+  sort accepts a new option, --human-numeric-sort (-h): sort numbers
+  while honoring human readable suffixes like KiB and MB etc.
+
 
 * Noteworthy changes in release 7.4 (2009-05-07) [stable]
 
diff --git a/THANKS b/THANKS
index cf801c5..4392f04 100644
--- a/THANKS
+++ b/THANKS
@@ -396,6 +396,7 @@ Michael J. Croghan  mcrog...@usatoday.com
 Michael McFarland   sid...@yahoo.com
 Michael McLagan mmcla...@invlogic.com
 Michael Piefel  pie...@informatik.hu-berlin.de
+Michael Speer   knome...@gmail.com
 Michael Steffensmichael.steff...@s.netic.de
 Michael Stone   mst...@debian.org
 Michael Stutz   st...@dsl.org
diff --git a/doc/coreutils.texi b/doc/coreutils.texi
index 97ea830..834bd46 100644
--- a/doc/coreutils.texi
+++ b/doc/coreutils.texi
@@ -3785,6 +3785,21 @@ Use this option only if there is no alternative; it is much slower than
 @option{--numeric-sort} (@option{-n}) and it can lose information when
 converting to floating point.
 
+...@item -h
+...@itemx --human-numeric-sort
+...@itemx --sort=human-numeric
+...@opindex -h
+...@opindex --human-numeric-sort
+...@opindex --sort
+...@cindex human numeric sort
+...@vindex LC_NUMERIC
+Sort numerically, as per the @option{--numeric-sort} option below, and in
+addition handle IEC or SI suffixes like MiB, MB etc (@ref{Block size}).
+Note a mixture of IEC and SI suffixes is not supported and will
+be flagged as an error.  Also the numbers must be abbreviated uniformly.
+I.E. values with different precisions like 6000K and 5M will be sorted
+incorrectly.
+
 @item -i
 @itemx --ignore-nonprinting
 @opindex -i
diff --git a/src/sort.c b/src/sort.c
index 6dea2ff..8438c05 100644
--- a/src/sort.c
+++ b/src/sort.c
@@ -176,6 +176,8 @@ struct keyfield
   bool random;			/* Sort by random hash of key.  */
   bool general_numeric;		/* Flag for general, numeric comparison.
    Handle numbers in exponential notation. */
+  bool human_numeric;		/* Flag for sorting by human readable
+   units with either SI xor IEC prefixes. */
   bool month;			/* Flag for comparison by month name. */
   bool reverse;			/* Reverse the sense of comparison. */
   bool version;			/* sort by version number */
@@ -337,6 +339,9 @@ Ordering options:\n\
   -M, --month-sortcompare (unknown)  `JAN'  ...  `DEC'\n\
 ), stdout);
   fputs (_(\
+  -h, --human-numeric-sortcompare human readable numbers (e.g., 2K 1G)\n\
+), stdout);
+  fputs (_(\
   -n, --numeric-sort  compare according to string numerical value\n\
   -R, --random-sort   sort by random hash of keys\n\
   --random-source=FILE

Re: Human readable sort

2009-05-26 Thread Matthew Woehlke

Jim Meyering wrote:

Maybe s/unambiguously/consistently/
or   /uniformly/
and mention that it's the inconsistent precision that causes trouble.


J. Random Bystander (i.e. me) prefers "consistently". ("Consistent", as 
in, using the same rules. "Uniformly" to me suggests maybe some other 
condition(s) as well.)


Thanks for adjusting the example. Also, thanks for finally pushing this 
forward, it's been wanted for quite some time :-).


--
Matthew
Please do not quote my e-mail address unobfuscated in message bodies.
--
Sorry, not a winner. Please try again.



___
Bug-coreutils mailing list
Bug-coreutils@gnu.org
http://lists.gnu.org/mailman/listinfo/bug-coreutils


Re: Human readable sort

2009-05-22 Thread Jim Meyering
Pádraig Brady wrote:
 Eric Blake wrote:
 Pádraig Brady P at draigBrady.com writes:
...
 +static int
 +find_unit_order (const char *number)
 +{
 +  static const char orders [UCHAR_LIM] = {
 +['K']=1, ['M']=2, ['G']=3, ['T']=4, ['P']=5, ['E']=6, ['Z']=7, ['Y']=8,
 +['k']=1,
 +  };

 This assumes more of C99 than we have previously required.  Are we sure that
 all compilers out there will support this syntax?

 Designated Initializers were a GNU C C89 extension.
 So I thought they were both elegant and not too new.
 I've not got access to older machines to test unfortunately.

Since we've been requiring declaration-after-statement support
for some time now, using a feature like the above should be safe.
I think it is worthwhile, too.


___
Bug-coreutils mailing list
Bug-coreutils@gnu.org
http://lists.gnu.org/mailman/listinfo/bug-coreutils


Re: Human readable sort

2009-05-22 Thread Pádraig Brady
Jim Meyering wrote:
 Pádraig Brady wrote:
 Eric Blake wrote:
 Pádraig Brady P at draigBrady.com writes:
 ...
 +static int
 +find_unit_order (const char *number)
 +{
 +  static const char orders [UCHAR_LIM] = {
 +['K']=1, ['M']=2, ['G']=3, ['T']=4, ['P']=5, ['E']=6, ['Z']=7, ['Y']=8,
 +['k']=1,
 +  };

 This assumes more of C99 than we have previously required.  Are we sure that
 all compilers out there will support this syntax?
 Designated Initializers were a GNU C C89 extension.
 So I thought they were both elegant and not too new.
 I've not got access to older machines to test unfortunately.
 
 Since we've been requiring declaration-after-statement support
 for some time now, using a feature like the above should be safe.
 I think it is worthwhile, too.

I did a lot of googling last night to confirm
that designated initializers are very widely supported.
In fact we've been using them since coreutils 7.1 (0889381c)

Attached is an updated version with 2 new tests.

cheers,
Pádraig.
From 75bb07bb620d37d26467ab86ffcf73d47479b358 Mon Sep 17 00:00:00 2001
From: Michael Speer knome...@gmail.com
Date: Mon, 27 Apr 2009 14:51:29 +0100
Subject: [PATCH] sort: new --human-numeric-sort option to sort KiB MB etc.

* NEWS: Document the new option
* doc/coreutils.texi (sort invocation): ditto
* src/sort.c (main): handle the new -human-numeric-sort option (-h).
(human_numcompare): A new function to compare SI and IEC suffixes
before falling back to the standard --numeric comparison.
(find_unit_order): A new helper function to find the order
of magnitude of a number string as determined by its suffix.
(check_mixed_SI_IEC): A new helper function to exit with error
if both SI and IEC suffixes are presented.
* tests/misc/sort: Add 8 tests to test the new functionality.
* THANKS: Update
---
 NEWS   |5 ++
 THANKS |1 +
 doc/coreutils.texi |   14 ++
 src/sort.c |  115 
 tests/misc/sort|   18 
 5 files changed, 145 insertions(+), 8 deletions(-)

diff --git a/NEWS b/NEWS
index 31f1b1a..f28097d 100644
--- a/NEWS
+++ b/NEWS
@@ -2,6 +2,11 @@ GNU coreutils NEWS-*- outline -*-
 
 * Noteworthy changes in release ?.? (-??-??) [?]
 
+** New features
+
+  sort accepts a new option, --human-numeric-sort (-h): sort numbers
+  while honoring human readable suffixes like KiB and MB etc.
+
 ** Bug fixes
 
   truncate -s failed to skip all whitespace in the option argument in
diff --git a/THANKS b/THANKS
index cf801c5..4392f04 100644
--- a/THANKS
+++ b/THANKS
@@ -396,6 +396,7 @@ Michael J. Croghan  mcrog...@usatoday.com
 Michael McFarland   sid...@yahoo.com
 Michael McLagan mmcla...@invlogic.com
 Michael Piefel  pie...@informatik.hu-berlin.de
+Michael Speer   knome...@gmail.com
 Michael Steffensmichael.steff...@s.netic.de
 Michael Stone   mst...@debian.org
 Michael Stutz   st...@dsl.org
diff --git a/doc/coreutils.texi b/doc/coreutils.texi
index 1a3075f..ae5c577 100644
--- a/doc/coreutils.texi
+++ b/doc/coreutils.texi
@@ -3785,6 +3785,20 @@ Use this option only if there is no alternative; it is much slower than
 @option{--numeric-sort} (@option{-n}) and it can lose information when
 converting to floating point.
 
+...@item -h
+...@itemx --human-numeric-sort
+...@itemx --sort=human-numeric
+...@opindex -h
+...@opindex --human-numeric-sort
+...@opindex --sort
+...@cindex human numeric sort
+...@vindex LC_NUMERIC
+Sort numerically, as per the @option{--numeric-sort} option,
+and in addition handle IEC or SI suffixes like MiB, MB etc.
+Note a mixture of these suffixes is not supported and will
+be flagged as an error. Also the numbers must be abbreviated unambiguously.
+I.E. 5000K and 6M will be sorted incorrectly for example.
+
 @item -i
 @itemx --ignore-nonprinting
 @opindex -i
diff --git a/src/sort.c b/src/sort.c
index 6dea2ff..32cd200 100644
--- a/src/sort.c
+++ b/src/sort.c
@@ -176,6 +176,8 @@ struct keyfield
   bool random;			/* Sort by random hash of key.  */
   bool general_numeric;		/* Flag for general, numeric comparison.
    Handle numbers in exponential notation. */
+  bool human_numeric;		/* Flag for sorting by human readable
+   units with either SI xor IEC prefixes. */
   bool month;			/* Flag for comparison by month name. */
   bool reverse;			/* Reverse the sense of comparison. */
   bool version;			/* sort by version number */
@@ -337,6 +339,9 @@ Ordering options:\n\
   -M, --month-sortcompare (unknown)  `JAN'  ...  `DEC'\n\
 ), stdout);
   fputs (_(\
+  -h, --human-numeric-sortcompare human readable numbers (e.g., 2K 1G)\n\
+), stdout);
+  fputs (_(\
   -n, --numeric-sort  compare according to string numerical value\n\
   -R, --random-sort   sort by random hash of keys\n\
   

Re: Human readable sort

2009-05-22 Thread Jim Meyering
Pádraig Brady wrote:
From 75bb07bb620d37d26467ab86ffcf73d47479b358 Mon Sep 17 00:00:00 2001
 From: Michael Speer knome...@gmail.com
 Date: Mon, 27 Apr 2009 14:51:29 +0100
 Subject: [PATCH] sort: new --human-numeric-sort option to sort KiB MB etc.

 * NEWS: Document the new option
 * doc/coreutils.texi (sort invocation): ditto
 * src/sort.c (main): handle the new -human-numeric-sort option (-h).

Please make one small change to that log message:

s/ -human/ --human/

And in the documentation,

  +Sort numerically, as per the @option{--numeric-sort} option,
  +and in addition handle IEC or SI suffixes like MiB, MB etc.
  +Note a mixture of these suffixes is not supported and will
  +be flagged as an error. Also the numbers must be abbreviated unambiguously.
  +I.E. 5000K and 6M will be sorted incorrectly for example.

Eventually, it'd be nice to explain in detail why those
would cause trouble.

Maybe s/unambiguously/consistently/
or   /uniformly/
and mention that it's the inconsistent precision that causes trouble.

Hmm actually those two *are* sorted properly for me:

$ printf '%s\n' 5000K 6M| src/sort --human
5000K
6M

However, these two are not:

$ printf '%s\n' 7000K 6M| src/sort --human
7000K
6M


___
Bug-coreutils mailing list
Bug-coreutils@gnu.org
http://lists.gnu.org/mailman/listinfo/bug-coreutils


Re: Human readable sort

2009-05-21 Thread Pádraig Brady
knome@gmail.com wrote:
 On Apr 27, 2009 11:27am, Pádraig Brady p...@draigbrady.com wrote:

 It seems like you'll need to go through the copyright assignment

 
 I will start this process as soon as possible.
 
 The -h seems to run faster than -n on my data, but only because I gave
 an even distribution of unit prefixes, so it wasn't having to do the
 number compares much of the time. I imagine most prefixes will be
 missing, K and M in real world usage, with most comparisons routing
 through the fallback to -n. Certainly my usage will be.
 
 I'll look at your updated patch this evening.

Looks like your copyright assignment papers went through.
Attached is the latest patch rebased against master
and with a couple of extra whitespace fixups.

Testing this with actual `du -h` output gives
a 6% slow down compared to `sort -n`.
That's acceptable I think given the extra processing being done.

cheers,
Pádraig.

p.s. I've used the knome...@gmail.com address.
I realize gmail is "."-agnostic, but for
future indexing you should try to use a single format.
From 0c522f683b592f7ecb527cfe68e79c0397efa6d5 Mon Sep 17 00:00:00 2001
From: Michael Speer knome...@gmail.com
Date: Mon, 27 Apr 2009 14:51:29 +0100
Subject: [PATCH] sort: new --human-numeric-sort option to sort KiB MB etc.

* NEWS: Document the new option
* doc/coreutils.texi (sort invocation): ditto
* src/sort.c (main): handle the new -human-numeric-sort option (-h).
(human_numcompare): A new function to compare SI and IEC suffixes
before falling back to the standard --numeric comparison.
(find_unit_order): A new helper function to find the order
of magnitude of a number string as determined by its suffix.
(check_mixed_SI_IEC): A new helper function to exit with error
if both SI and IEC suffixes are presented.
* tests/misc/sort: Add 6 tests to test the new functionality.
* THANKS: Update
---
 NEWS   |5 ++
 THANKS |1 +
 doc/coreutils.texi |   14 ++
 src/sort.c |  115 
 tests/misc/sort|   13 ++
 5 files changed, 140 insertions(+), 8 deletions(-)

diff --git a/NEWS b/NEWS
index 31f1b1a..f28097d 100644
--- a/NEWS
+++ b/NEWS
@@ -2,6 +2,11 @@ GNU coreutils NEWS-*- outline -*-
 
 * Noteworthy changes in release ?.? (-??-??) [?]
 
+** New features
+
+  sort accepts a new option, --human-numeric-sort (-h): sort numbers
+  while honoring human readable suffixes like KiB and MB etc.
+
 ** Bug fixes
 
   truncate -s failed to skip all whitespace in the option argument in
diff --git a/THANKS b/THANKS
index cf801c5..4392f04 100644
--- a/THANKS
+++ b/THANKS
@@ -396,6 +396,7 @@ Michael J. Croghan  mcrog...@usatoday.com
 Michael McFarland   sid...@yahoo.com
 Michael McLagan mmcla...@invlogic.com
 Michael Piefel  pie...@informatik.hu-berlin.de
+Michael Speer   knome...@gmail.com
 Michael Steffensmichael.steff...@s.netic.de
 Michael Stone   mst...@debian.org
 Michael Stutz   st...@dsl.org
diff --git a/doc/coreutils.texi b/doc/coreutils.texi
index 1a3075f..ae5c577 100644
--- a/doc/coreutils.texi
+++ b/doc/coreutils.texi
@@ -3785,6 +3785,20 @@ Use this option only if there is no alternative; it is much slower than
 @option{--numeric-sort} (@option{-n}) and it can lose information when
 converting to floating point.
 
+...@item -h
+...@itemx --human-numeric-sort
+...@itemx --sort=human-numeric
+...@opindex -h
+...@opindex --human-numeric-sort
+...@opindex --sort
+...@cindex human numeric sort
+...@vindex LC_NUMERIC
+Sort numerically, as per the @option{--numeric-sort} option,
+and in addition handle IEC or SI suffixes like MiB, MB etc.
+Note a mixture of these suffixes is not supported and will
+be flagged as an error. Also the numbers must be abbreviated unambiguously.
+I.E. 5000K and 6M will be sorted incorrectly for example.
+
 @item -i
 @itemx --ignore-nonprinting
 @opindex -i
diff --git a/src/sort.c b/src/sort.c
index 6dea2ff..32cd200 100644
--- a/src/sort.c
+++ b/src/sort.c
@@ -176,6 +176,8 @@ struct keyfield
   bool random;			/* Sort by random hash of key.  */
   bool general_numeric;		/* Flag for general, numeric comparison.
    Handle numbers in exponential notation. */
+  bool human_numeric;		/* Flag for sorting by human readable
+   units with either SI xor IEC prefixes. */
   bool month;			/* Flag for comparison by month name. */
   bool reverse;			/* Reverse the sense of comparison. */
   bool version;			/* sort by version number */
@@ -337,6 +339,9 @@ Ordering options:\n\
   -M, --month-sortcompare (unknown)  `JAN'  ...  `DEC'\n\
 ), stdout);
   fputs (_(\
+  -h, --human-numeric-sortcompare human readable numbers (e.g., 2K 1G)\n\
+), stdout);
+  fputs (_(\
   -n, --numeric-sort  compare according to string numerical value\n\
   -R, 

Re: Human readable sort

2009-05-21 Thread Eric Blake
Pádraig Brady P at draigBrady.com writes:

 Looks like your copyright assignment papers went through.
 Attached is the latest patch rebased against master
 and with a couple of extra whitespace fixups.
 

+static int
+find_unit_order (const char *number)
+{
+  static const char orders [UCHAR_LIM] = {
+['K']=1, ['M']=2, ['G']=3, ['T']=4, ['P']=5, ['E']=6, ['Z']=7, ['Y']=8,
+['k']=1,
+  }; 

This assumes more of C99 than we have previously required.  Are we sure that 
all compilers out there will support this syntax?

Also, your tests only cover 'sort -h'; what about covering 'sort -k1,1h'?

-- 
Eric Blake




___
Bug-coreutils mailing list
Bug-coreutils@gnu.org
http://lists.gnu.org/mailman/listinfo/bug-coreutils


Re: Human readable sort

2009-05-21 Thread Pádraig Brady
Eric Blake wrote:
 Pádraig Brady P at draigBrady.com writes:
 
 Looks like your copyright assignment papers went through.
 Attached is the latest patch rebased against master
 and with a couple of extra whitespace fixups.

 
 +static int
 +find_unit_order (const char *number)
 +{
 +  static const char orders [UCHAR_LIM] = {
 +['K']=1, ['M']=2, ['G']=3, ['T']=4, ['P']=5, ['E']=6, ['Z']=7, ['Y']=8,
 +['k']=1,
 +  }; 
 
 This assumes more of C99 than we have previously required.  Are we sure that 
 all compilers out there will support this syntax?

Designated Initializers were a GNU C C89 extension.
So I thought they were both elegant and not too new.
I've not got access to older machines to test unfortunately.

 
 Also, your tests only cover 'sort -h'; what about covering 'sort -k1,1h'?
 

OK I'll flesh out the tests a bit.

cheers,
Pádraig.


___
Bug-coreutils mailing list
Bug-coreutils@gnu.org
http://lists.gnu.org/mailman/listinfo/bug-coreutils


Re: Human readable sort

2009-05-21 Thread Giuseppe Scrivano
Pádraig Brady p...@draigbrady.com writes:

 +   Assume that numbers are properly abbreviated.
 +   i.e. input will never have both 5000K and 6M.  */

I think this is too strong an assumption.  I wouldn't be surprised to
find, for example, both 1M and 1500K in a data set.

Are there any problems with normalizing values using this pseudo-code?

while (abs (a) > 1000) //or 1024
  {
order_a += signum (a);
a /= 1000; //or 1024
  }

do the same with b and only after compare them.

Regards,
Giuseppe


___
Bug-coreutils mailing list
Bug-coreutils@gnu.org
http://lists.gnu.org/mailman/listinfo/bug-coreutils


Re: Re: Human readable sort

2009-05-21 Thread knome . net

On May 21, 2009 5:07pm, Giuseppe Scrivano gscriv...@gnu.org wrote:


I think this is a too strong assumption. I wouldn't be surprised to



find, for example, both 1M and 1500K in a data set.



I initiated this to patch sort primarily to support the data generated by  
df, du and ls. The human readable options these offer are often frustrating  
once a user realizes there is not a complementary sort option to them.  
These, of course, do produce properly reduced data.



Does anyone know of a tool which produces mixed data of this sort that  
would need normalized unit comparison?




Are there problems to normalize values using this pseudo-code?



while (abs (a) > 1000) //or 1024



{



order_a += signum (a);



a /= 1000; //or 1024



}



Yes. The current implementation does not convert to a numeric  
representation but compares the numbers character by character instead.


The patch rides on top of this functionality, just adding a check to scan  
ahead for units and assuming that difference of unit is sufficient for  
determining sort order.


Anything more complex will probably have to extend the number comparison  
code found in strnumcmp-in.h.


- Michael Speer
___
Bug-coreutils mailing list
Bug-coreutils@gnu.org
http://lists.gnu.org/mailman/listinfo/bug-coreutils


Re: Human readable sort

2009-04-27 Thread Pádraig Brady
Michael Speer wrote:
 2009/4/25 Pádraig Brady p...@draigbrady.com:
 I've further modified your latest in the attached.
 I refactored the suffix finding a bit and also added
 support for --sort=human-numeric.
 
 I refactored it again to handle some potential problems with how
 separators and decimals points were handled.  It will still let you
 write something silly like 1,3,4.5.6, but I've stopped scanning on
 4..4 or 3,,2 or even 5.M.  I'm not sure if that last one is used
 meaningfully anywhere.

This needs another cycle I think.
BTW earlier in this thread I pasted the wrong link to the
previous attempt to include this feature. This is the right one:
http://lists.gnu.org/archive/html/bug-coreutils/2009-01/threads.html#6

Anyway attached an updated version which supports negative
numbers as it was pretty trivial to add. I also removed the
explicit check for thousands_sep==-1 as I changed to using
unsigned char.

Some performance measurements of this version are
(where ret 0 is just returning 0 at the top of
find_unit_order() to show the function call overheads.)

seconds to sort 1 million ints:
---
sort optiontime difference
---
-n 2.75
-h (ret 0) 3.10 +13%
-h 3.96 +44%

seconds to sort 1 million sizes (max len = 4):
---
sort optiontime difference
---
-n 2.54
-h (ret 0) 2.70 +6%
-h 3.50 +38%

I haven't really looked at optimizing it yet.

cheers,
Pádraig.
diff --git a/src/sort.c b/src/sort.c
index f48d727..640cf1c 100644
--- a/src/sort.c
+++ b/src/sort.c
@@ -176,6 +176,8 @@ struct keyfield
   bool random;			/* Sort by random hash of key.  */
   bool general_numeric;		/* Flag for general, numeric comparison.
    Handle numbers in exponential notation. */
+  bool human_numeric;   /* Flag for sorting by human readable
+   units with either SI xor IEC prefixes. */
   bool month;			/* Flag for comparison by month name. */
   bool reverse;			/* Reverse the sense of comparison. */
   bool version;			/* sort by version number */
@@ -336,6 +338,9 @@ Ordering options:\n\
   -i, --ignore-nonprintingconsider only printable characters\n\
   -M, --month-sortcompare (unknown)  `JAN'  ...  `DEC'\n\
 ), stdout);
+  fputs(_(\
+  -h, --human-numeric-sortcompare human readable numbers (e.g., 2K 1G)\n\
+), stdout);
   fputs (_(\
   -n, --numeric-sort  compare according to string numerical value\n\
   -R, --random-sort   sort by random hash of keys\n\
@@ -344,8 +349,8 @@ Ordering options:\n\
 ), stdout);
   fputs (_(\
   --sort=WORD sort according to WORD:\n\
-general-numeric -g, month -M, numeric -n,\n\
-random -R, version -V\n\
+general-numeric -g, human-numeric -h, month -M,\n\
+numeric -n, random -R, version -V\n\
   -V, --version-sort  natural sort of (version) numbers within text\n\
 \n\
 ), stdout);
@@ -426,7 +431,7 @@ enum
   SORT_OPTION
 };
 
-static char const short_options[] = -bcCdfgik:mMno:rRsS:t:T:uVy:z;
+static char const short_options[] = -bcCdfghik:mMno:rRsS:t:T:uVy:z;
 
 static struct option const long_options[] =
 {
@@ -442,6 +447,7 @@ static struct option const long_options[] =
   {merge, no_argument, NULL, 'm'},
   {month-sort, no_argument, NULL, 'M'},
   {numeric-sort, no_argument, NULL, 'n'},
+  {human-numeric-sort, no_argument, NULL, 'h'},
   {version-sort, no_argument, NULL, 'V'},
   {random-sort, no_argument, NULL, 'R'},
   {random-source, required_argument, NULL, RANDOM_SOURCE_OPTION},
@@ -480,6 +486,7 @@ static char const check_types[] =
 
 #define SORT_TABLE \
   _st_(general-numeric, 'g') \
+  _st_(human-numeric,   'h') \
   _st_(month,   'M') \
   _st_(numeric, 'n') \
   _st_(random,  'R') \
@@ -1673,6 +1680,87 @@ numcompare (const char *a, const char *b)
   return strnumcmp (a, b, decimal_point, thousands_sep);
 }
 
+/* Exit with an error if a mixture of SI and IEC units detected.  */
+
+static void
+check_mixed_SI_IEC (char prefix)
+{
+  static int seen_si = -1;
+  bool si_present = prefix == 'i';
+  if (seen_si != -1  seen_si != si_present)
+error (SORT_FAILURE, 0, _(both SI and IEC prefixes present on units));
+  seen_si = si_present;
+}
+
+/* Return an integer which represents the order of magnitude of
+   the unit following the number.  NUMBER can contain thousands separators
+   or a decimal point, but must not have preceding blanks.
+   Negative numbers return a negative unit order.  */
+
+static int
+find_unit_order (const char* number)
+{
+  static const char orders [UCHAR_LIM] = {
+['K']=1, ['M']=2, ['G']=3, ['T']=4, ['P']=5, ['E']=6, ['Z']=7, ['Y']=8,
+['k']=1,
+  };
+
+  

Re: Human readable sort

2009-04-27 Thread Pádraig Brady
Pádraig Brady wrote:
 seconds to sort 1 million ints:
 ---
 sort optiontime difference
 ---
 -n 2.75
 -h (ret 0) 3.10 +13%
 -h 3.96 +44%

I removed a redundant to_uchar() and got it from 44% to 40%
compared to -n. Note our existing -n implementation is mega fast,
so at least we're not interfering with its implementation.

Attached is the full patch, which hopefully we can push soon.

cheers,
Pádraig.
From fef4d790423ff840d0217b6107db85b396728cb4 Mon Sep 17 00:00:00 2001
From: Michael Speer knome...@gmail.com
Date: Mon, 27 Apr 2009 14:51:29 +0100
Subject: [PATCH] sort: new --human-numeric-sort option to sort KiB MB etc.

* NEWS: document the new option
* doc/coreutils.texi: ditto
* src/sort.c (main): handle the new -human-numeric-sort option (-h).
(human_numcompare): A new function to compare SI and IEC suffixes
before falling back to the standard --numeric comparison.
(find_unit_order): A new helper function to find the order
of magnitude of a number string as determined by its suffix.
(check_mixed_SI_IEC): A new helper function to exit with error
if both SI and IEC suffixes are presented.
* tests/misc/sort: Add 5 tests to test the new functionality.
* THANKS: Update
---
 NEWS   |5 ++
 THANKS |1 +
 doc/coreutils.texi |   14 ++
 src/sort.c |  115 
 tests/misc/sort|   13 ++
 5 files changed, 140 insertions(+), 8 deletions(-)

diff --git a/NEWS b/NEWS
index 8cb17cc..bfebe1d 100644
--- a/NEWS
+++ b/NEWS
@@ -2,6 +2,11 @@ GNU coreutils NEWS-*- outline -*-
 
 * Noteworthy changes in release 7.3 (-??-??) [?]
 
+** New features
+
+  sort accepts a new option, --human-numeric-sort (-h): sort numbers
+  while honouring human readable suffixes like KiB and MB etc.
+
 ** Bug fixes
 
   sort -m no longer segfaults when its output file is also an input file.
diff --git a/THANKS b/THANKS
index 876a6b6..da48d6d 100644
--- a/THANKS
+++ b/THANKS
@@ -395,6 +395,7 @@ Michael J. Croghan  mcrog...@usatoday.com
 Michael McFarland   sid...@yahoo.com
 Michael McLagan mmcla...@invlogic.com
 Michael Piefel  pie...@informatik.hu-berlin.de
+Michael Speer   knome...@gmail.com
 Michael Steffensmichael.steff...@s.netic.de
 Michael Stone   mst...@debian.org
 Michael Stutz   st...@dsl.org
diff --git a/doc/coreutils.texi b/doc/coreutils.texi
index 918f44e..8f73419 100644
--- a/doc/coreutils.texi
+++ b/doc/coreutils.texi
@@ -3785,6 +3785,20 @@ Use this option only if there is no alternative; it is much slower than
 @option{--numeric-sort} (@option{-n}) and it can lose information when
 converting to floating point.
 
+@item -h
+@itemx --human-numeric-sort
+@itemx --sort=human-numeric
+@opindex -h
+@opindex --human-numeric-sort
+@opindex --sort
+@cindex human numeric sort
+@vindex LC_NUMERIC
+Sort numerically, as per the @option{--numeric-sort} option,
+and in addition handle IEC or SI suffixes like MiB, MB etc.
+Note a mixture of these suffixes is not supported and will
+be flagged as an error. Also the numbers must be abbreviated unambiguously.
+I.E. 5000K and 6M will be sorted incorrectly for example.
+
 @item -i
 @itemx --ignore-nonprinting
 @opindex -i
diff --git a/src/sort.c b/src/sort.c
index f48d727..4c1bec5 100644
--- a/src/sort.c
+++ b/src/sort.c
@@ -176,6 +176,8 @@ struct keyfield
   bool random;			/* Sort by random hash of key.  */
   bool general_numeric;		/* Flag for general, numeric comparison.
    Handle numbers in exponential notation. */
+  bool human_numeric;   /* Flag for sorting by human readable
+   units with either SI xor IEC prefixes. */
   bool month;			/* Flag for comparison by month name. */
   bool reverse;			/* Reverse the sense of comparison. */
   bool version;			/* sort by version number */
@@ -336,6 +338,9 @@ Ordering options:\n\
   -i, --ignore-nonprintingconsider only printable characters\n\
   -M, --month-sortcompare (unknown)  `JAN'  ...  `DEC'\n\
 ), stdout);
+  fputs(_(\
+  -h, --human-numeric-sortcompare human readable numbers (e.g., 2K 1G)\n\
+), stdout);
   fputs (_(\
   -n, --numeric-sort  compare according to string numerical value\n\
   -R, --random-sort   sort by random hash of keys\n\
@@ -344,8 +349,8 @@ Ordering options:\n\
 ), stdout);
   fputs (_(\
   --sort=WORD sort according to WORD:\n\
-general-numeric -g, month -M, numeric -n,\n\
-random -R, version -V\n\
+general-numeric -g, human-numeric -h, month -M,\n\
+numeric -n, 

Re: Human readable sort

2009-04-27 Thread Ondřej Vašík
Pádraig Brady wrote:
 Pádraig Brady wrote:
 Attached is the full patch, which hopefully we can push soon.

I'm not objecting anything relevant in that patch, just the tab/spaces
mixing looks inconsistent with the rest of the code in added lines in
sort.c . Only cosmetic thing ... but is this intentional?

Greetings,
 Ondřej Vašík


signature.asc
Description: Toto je digitálně	 podepsaná část	 zprávy
___
Bug-coreutils mailing list
Bug-coreutils@gnu.org
http://lists.gnu.org/mailman/listinfo/bug-coreutils


Re: Human readable sort

2009-04-27 Thread Pádraig Brady
knome@gmail.com wrote:
 On Apr 27, 2009 11:41am, Ondřej Vašík ova...@redhat.com wrote:
 Pádraig Brady wrote:

  Pádraig Brady wrote:

  Attached is the full patch, which hopefully we can push soon.



 I'm not objecting anything relevant in that patch, just the tab/spaces

 mixing looks inconsistent with the rest of the code in added lines in

 sort.c . Only cosmetic thing ... but is this intentional?



 Greetings,

 Ondřej Vašík

 
 That is my fault. My home and work emacs are configured with tabs
 disabled for interaction with my office and home repos. I'll do cleanup
 against the patch this afternoon when I can look at it.

Thanks Ondřej.
This is fixed up in attached I think.

cheers,
Pádraig.

p.s. Michael, which email address do you want in THANKS?
From 2e1ee46e97858717535210729f5e0181c08bf6d1 Mon Sep 17 00:00:00 2001
From: Michael Speer knome...@gmail.com
Date: Mon, 27 Apr 2009 14:51:29 +0100
Subject: [PATCH] sort: new --human-numeric-sort option to sort KiB MB etc.

* NEWS: document the new option
* doc/coreutils.texi: ditto
* src/sort.c (main): handle the new -human-numeric-sort option (-h).
(human_numcompare): A new function to compare SI and IEC suffixes
before falling back to the standard --numeric comparison.
(find_unit_order): A new helper function to find the order
of magnitude of a number string as determined by its suffix.
(check_mixed_SI_IEC): A new helper function to exit with error
if both SI and IEC suffixes are presented.
* tests/misc/sort: Add 6 tests to test the new functionality.
* THANKS: Update
---
 NEWS   |5 ++
 THANKS |1 +
 doc/coreutils.texi |   14 ++
 src/sort.c |  115 
 tests/misc/sort|   13 ++
 5 files changed, 140 insertions(+), 8 deletions(-)

diff --git a/NEWS b/NEWS
index 8cb17cc..bfebe1d 100644
--- a/NEWS
+++ b/NEWS
@@ -2,6 +2,11 @@ GNU coreutils NEWS-*- outline -*-
 
 * Noteworthy changes in release 7.3 (-??-??) [?]
 
+** New features
+
+  sort accepts a new option, --human-numeric-sort (-h): sort numbers
+  while honouring human readable suffixes like KiB and MB etc.
+
 ** Bug fixes
 
   sort -m no longer segfaults when its output file is also an input file.
diff --git a/THANKS b/THANKS
index 876a6b6..da48d6d 100644
--- a/THANKS
+++ b/THANKS
@@ -395,6 +395,7 @@ Michael J. Croghan  mcrog...@usatoday.com
 Michael McFarland   sid...@yahoo.com
 Michael McLagan mmcla...@invlogic.com
 Michael Piefel  pie...@informatik.hu-berlin.de
+Michael Speer   knome...@gmail.com
 Michael Steffensmichael.steff...@s.netic.de
 Michael Stone   mst...@debian.org
 Michael Stutz   st...@dsl.org
diff --git a/doc/coreutils.texi b/doc/coreutils.texi
index 918f44e..8f73419 100644
--- a/doc/coreutils.texi
+++ b/doc/coreutils.texi
@@ -3785,6 +3785,20 @@ Use this option only if there is no alternative; it is much slower than
 @option{--numeric-sort} (@option{-n}) and it can lose information when
 converting to floating point.
 
+@item -h
+@itemx --human-numeric-sort
+@itemx --sort=human-numeric
+@opindex -h
+@opindex --human-numeric-sort
+@opindex --sort
+@cindex human numeric sort
+@vindex LC_NUMERIC
+Sort numerically, as per the @option{--numeric-sort} option,
+and in addition handle IEC or SI suffixes like MiB, MB etc.
+Note a mixture of these suffixes is not supported and will
+be flagged as an error. Also the numbers must be abbreviated unambiguously.
+I.E. 5000K and 6M will be sorted incorrectly for example.
+
 @item -i
 @itemx --ignore-nonprinting
 @opindex -i
diff --git a/src/sort.c b/src/sort.c
index f48d727..8a85d54 100644
--- a/src/sort.c
+++ b/src/sort.c
@@ -176,6 +176,8 @@ struct keyfield
   bool random;			/* Sort by random hash of key.  */
   bool general_numeric;		/* Flag for general, numeric comparison.
    Handle numbers in exponential notation. */
+  bool human_numeric;		/* Flag for sorting by human readable
+   units with either SI xor IEC prefixes. */
   bool month;			/* Flag for comparison by month name. */
   bool reverse;			/* Reverse the sense of comparison. */
   bool version;			/* sort by version number */
@@ -337,6 +339,9 @@ Ordering options:\n\
   -M, --month-sortcompare (unknown)  `JAN'  ...  `DEC'\n\
 ), stdout);
   fputs (_(\
+  -h, --human-numeric-sortcompare human readable numbers (e.g., 2K 1G)\n\
+), stdout);
+  fputs (_(\
   -n, --numeric-sort  compare according to string numerical value\n\
   -R, --random-sort   sort by random hash of keys\n\
   --random-source=FILEget random bytes from FILE\n\
@@ -344,8 +349,8 @@ Ordering options:\n\
 ), stdout);
   fputs (_(\
   --sort=WORD sort according to WORD:\n\
-general-numeric -g, month 

Re: Re: Human readable sort

2009-04-27 Thread knome . net

On Apr 27, 2009 11:41am, Ondřej Vašík ova...@redhat.com wrote:

Pádraig Brady wrote:



 Pádraig Brady wrote:



 Attached is the full patch, which hopefully we can push soon.





I'm not objecting anything relevant in that patch, just the tab/spaces



mixing looks inconsistent with the rest of the code in added lines in



sort.c . Only cosmetic thing ... but is this intentional?





Greetings,



Ondřej Vašík



That is my fault. My home and work emacs are configured with tabs disabled  
for interaction with my office and home repos. I'll do cleanup against the  
patch this afternoon when I can look at it.


-Michael speer
___
Bug-coreutils mailing list
Bug-coreutils@gnu.org
http://lists.gnu.org/mailman/listinfo/bug-coreutils


Re: Human readable sort

2009-04-26 Thread Pádraig Brady
Michael Speer wrote:
 2009/4/25 Pádraig Brady p...@draigbrady.com:
 I've further modified your latest in the attached.
 I refactored the suffix finding a bit and also added
 support for --sort=human-numeric.
 
 I refactored it again to handle some potential problems with how
 separators and decimals points were handled.

find_unit_order() is a better refactoring, thanks.

 It will still let you
 write something silly like 1,3,4.5.6, but I've stopped scanning on
 4..4 or 3,,2 or even 5.M.  I'm not sure if that last one is used
 meaningfully anywhere.

That's an improvement.
I'll have to do some benchmarking to see
if these extra checks have any significant effect.

 I did this partly to avoid breaking locales
 where space is the separator.

Supporting spaces in numbers is problematic anyway
due to the way fields are handled. Good to have
the space as thousands separator handled anyway.

 I poked around a bit to see if any locales used space.  
 Apparently, the Hungarian locale does.  I stopped looking there.

Doesn't have spaces here at least:
LANG=hu_HU.utf8 printf "%'d\n" 1234

 I'm wondering whether numeric is superfluous?
 I.E. are --sort=human and --human-sort sufficient.

 
 I started with just human, but thought it better to add the numeric
 since sort is by default for strings, and both current switches that
 enable numeric sorts have it in their name.  I would not fight a
 reversion on this if no one thought it would look confusing or too
 inconsistent to end users.

Well I was worried about it being too verbose, but it is more
consistent with other numeric options, so we'll leave it as is I think.

OK, I'll do up docs and tests tomorrow for this.
If you would prefer to do that please shout now.

cheers,
Pádraig.


___
Bug-coreutils mailing list
Bug-coreutils@gnu.org
http://lists.gnu.org/mailman/listinfo/bug-coreutils


Re: Human readable sort

2009-04-25 Thread Michael Speer
2009/4/24 Pádraig Brady p...@draigbrady.com:
 Michael Speer wrote:
 I wrote the following patch to the 7.2 branch of coreutils to allow
 `sort` to sort by human readable byte sizes.  I looked around a bit to
 see what the status of previous attempts to integrate this
 functionality were, but didn't see any very recent activity.  This is
 my first interaction with coreutils, so if I missed something obvious,
 please point me towards it.

 Is the last potential patch (
 http://www.mail-archive.com/bug-coreutils@gnu.org/msg14080.html )
 moving through?  If not, if I cleaned this up ( tabs, documentation,
 and test cases ) and applied it to the current HEAD on savannah is
 there a chance of getting this functionality into sort?

 Thanks for reviving this again.
 There was a more recent attempt that petered out unfortunately:
 http://www.mail-archive.com/bug-coreutils@gnu.org/msg14080.html


 Patch assumptions :
   * that numbers will use the best representation ( never uses 1024b
 instead of 1k, etc )
   * that the sizes will be specified via suffixes of b, K, M, G, T, P,
 E, Z, Y or their alternately cased variants

 The first assumption results in checking only the suffix when they differ.
 This enables it to match the output of `du -h / du --si`, but possibly
 not other tools that do not conform to these assumptions.

 The consensus was that these assumptions are appropriate and useful.

 We assume C99 support now for coreutils so I tweaked your patch,
 the main change being to greatly shrink the lookup table initialisation.
 Note I commented out the lower case letters (except 'k') as I don't
 think any coreutils generate those and they could preclude supporting
 other suffixes in future. I'm not sure about doing that but I think it's
 better to err on the side of too few suffixes than too many?


That's much more readable.  I tacked in a size.  The standards do not
reference the lowercase letters you commented out, so I just deleted
them outright.

 Something else to consider is to flag when
 a mixture of SI and IEC units are used, as
 this not being supported might not be obvious
 to users and could cause difficult to debug issues for users.
 I.E. flag an error if the following input is presented.
  999MB
  998MiB
 I added a very quick hack for that to the patch for illustration.


While du only outputs the first letter, this makes the change better
for more general use.  I added a bounds check, but do not see anything
else beyond your illustration would be needed.

 I also noticed that you didn't terminate the fields before
 processing as was done for the other numeric sorts?
 So I changed that also in the attached patch but didn't
 analyze it TBH.


Your change was entirely appropriate.  I should have done that originally.


 p.s. obviously docs and help and tests need to be written,
 but we can do that after we get the implementation done.


I've attached the updated diff.

Thanks for taking an interest in this.

Michael Speer
--- orig/coreutils-7.2/src/sort.c	2009-03-29 13:44:10.0 -0400
+++ coreutils-7.2/src/sort.c	2009-04-25 04:46:06.0 -0400
@@ -176,6 +176,8 @@
   bool random;			/* Sort by random hash of key.  */
   bool general_numeric;		/* Flag for general, numeric comparison.
    Handle numbers in exponential notation. */
+  bool human_numeric;   /* Flag for sorting by human readable 
+   units with either SI or IEC prefixes */
   bool month;			/* Flag for comparison by month name. */
   bool reverse;			/* Reverse the sense of comparison. */
   bool version;			/* sort by version number */
@@ -336,6 +338,10 @@
   -i, --ignore-nonprintingconsider only printable characters\n\
   -M, --month-sortcompare (unknown)  `JAN'  ...  `DEC'\n\
 ), stdout);
+  fputs(_(\
+  -h, --human-numeric-sortcompare string numerical values ending in units\n\
+  prefixed with either SI xor IEC prefixes\n\
+), stdout);
   fputs (_(\
   -n, --numeric-sort  compare according to string numerical value\n\
   -R, --random-sort   sort by random hash of keys\n\
@@ -426,7 +432,7 @@
   SORT_OPTION
 };
 
-static char const short_options[] = -bcCdfgik:mMno:rRsS:t:T:uVy:z;
+static char const short_options[] = -bcCdfghik:mMno:rRsS:t:T:uVy:z;
 
 static struct option const long_options[] =
 {
@@ -442,6 +448,7 @@
   {merge, no_argument, NULL, 'm'},
   {month-sort, no_argument, NULL, 'M'},
   {numeric-sort, no_argument, NULL, 'n'},
+  {human-numeric-sort, no_argument, NULL, 'h'},
   {version-sort, no_argument, NULL, 'V'},
   {random-sort, no_argument, NULL, 'R'},
   {random-source, required_argument, NULL, RANDOM_SOURCE_OPTION},
@@ -1673,6 +1680,57 @@
   return strnumcmp (a, b, decimal_point, thousands_sep);
 }
 
+/* error if a mixture of SI and IEC units used.  */
+static void
+check_mixed_SI_IEC (char prefix)
+{
+  static int seen_si = -1;
+  bool si_present = prefix == 'i';
+  if (seen_si != -1  seen_si != 

Re: Human readable sort

2009-04-25 Thread Pádraig Brady
Michael Speer wrote:
 That's much more readable.  I tacked in a size.

Good catch. The size is required or otherwise
one could get undefined results for some chars.

 The standards do not
 reference the lowercase letters you commented out, so I just deleted
 them outright.

Fair enough.

 Something else to consider is to flag when
 a mixture of SI and IEC units are used, as
 this not being supported might not be obvious
 to users and could cause difficult to debug issues for users.
 I.E. flag an error if the following input is presented.
  999MB
  998MiB
 I added a very quick hack for that to the patch for illustration.

 
 While du only outputs the first letter, this makes the change better
 for more general use.  I added a bounds check, but do not see anything
 else beyond your illustration would be needed.

Oops, yes the bounds check is also needed.

I've further modified your latest in the attached.
I refactored the suffix finding a bit and also added
support for --sort=human-numeric.
I'm wondering whether numeric is superfluous?
I.E. are --sort=human and --human-sort sufficient.

cheers,
Pádraig.
diff --git a/src/sort.c b/src/sort.c
index f48d727..9d7d659 100644
--- a/src/sort.c
+++ b/src/sort.c
@@ -176,6 +176,8 @@ struct keyfield
   bool random;			/* Sort by random hash of key.  */
   bool general_numeric;		/* Flag for general, numeric comparison.
    Handle numbers in exponential notation. */
+  bool human_numeric;   /* Flag for sorting by human readable
+   units with either SI xor IEC prefixes. */
   bool month;			/* Flag for comparison by month name. */
   bool reverse;			/* Reverse the sense of comparison. */
   bool version;			/* sort by version number */
@@ -336,6 +338,9 @@ Ordering options:\n\
   -i, --ignore-nonprintingconsider only printable characters\n\
   -M, --month-sortcompare (unknown)  `JAN'  ...  `DEC'\n\
 ), stdout);
+  fputs(_(\
+  -h, --human-numeric-sortcompare human readable numbers (e.g., 2K 1G)\n\
+), stdout);
   fputs (_(\
   -n, --numeric-sort  compare according to string numerical value\n\
   -R, --random-sort   sort by random hash of keys\n\
@@ -344,8 +349,8 @@ Ordering options:\n\
 ), stdout);
   fputs (_(\
   --sort=WORD sort according to WORD:\n\
-general-numeric -g, month -M, numeric -n,\n\
-random -R, version -V\n\
+general-numeric -g, human-numeric -h, month -M,\n\
+numeric -n, random -R, version -V\n\
   -V, --version-sort  natural sort of (version) numbers within text\n\
 \n\
 ), stdout);
@@ -426,7 +431,7 @@ enum
   SORT_OPTION
 };
 
-static char const short_options[] = -bcCdfgik:mMno:rRsS:t:T:uVy:z;
+static char const short_options[] = -bcCdfghik:mMno:rRsS:t:T:uVy:z;
 
 static struct option const long_options[] =
 {
@@ -442,6 +447,7 @@ static struct option const long_options[] =
   {merge, no_argument, NULL, 'm'},
   {month-sort, no_argument, NULL, 'M'},
   {numeric-sort, no_argument, NULL, 'n'},
+  {human-numeric-sort, no_argument, NULL, 'h'},
   {version-sort, no_argument, NULL, 'V'},
   {random-sort, no_argument, NULL, 'R'},
   {random-source, required_argument, NULL, RANDOM_SOURCE_OPTION},
@@ -480,6 +486,7 @@ static char const check_types[] =
 
 #define SORT_TABLE \
   _st_(general-numeric, 'g') \
+  _st_(human-numeric,   'h') \
   _st_(month,   'M') \
   _st_(numeric, 'n') \
   _st_(random,  'R') \
@@ -1673,6 +1680,60 @@ numcompare (const char *a, const char *b)
   return strnumcmp (a, b, decimal_point, thousands_sep);
 }
 
+/* Exit with an error if a mixture of SI and IEC units detected.  */
+
+static void
+check_mixed_SI_IEC (char prefix)
+{
+  static int seen_si = -1;
+  bool si_present = prefix == 'i';
+  if (seen_si != -1  seen_si != si_present)
+error (SORT_FAILURE, 0, _(both SI and IEC prefixes present on units));
+  seen_si = si_present;
+}
+
+/* Return the address of the number suffix or NUL if not present */
+
+static const char*
+find_suffix (const char* number)
+{
+  const char *p = number;
+
+  while (ISDIGIT (*p) || *p == decimal_point || *p == thousands_sep)
+p++;
+
+  if (*p)
+check_mixed_SI_IEC (*(p+1));
+
+  return p;
+}
+
+/* Compare numbers ending in units with SI xor IEC prefixes
+  none/unknown  K  M  G  T  P  E  Z  Y
+   Assume that numbers are properly abbreviated.
+   i.e. input will never have 5000K instead of 5M.  */
+
+static int
+human_numcompare (const char *a, const char *b)
+{
+  static const char weights [UCHAR_LIM] = {
+['K']=1, ['M']=2, ['G']=3, ['T']=4, ['P']=5, ['E']=6, ['Z']=7, ['Y']=8,
+['k']=1,
+  };
+
+  while (blanks[to_uchar (*a)])
+a++;
+  while (blanks[to_uchar (*b)])
+b++;
+
+  int aw = weights[to_uchar (*find_suffix (a))];
+  int bw = weights[to_uchar (*find_suffix (b))];
+
+  return (aw  bw ? 1
+  : aw  

Re: Human readable sort

2009-04-25 Thread Michael Speer
2009/4/25 Pádraig Brady p...@draigbrady.com:

 I've further modified your latest in the attached.
 I refactored the suffix finding a bit and also added
 support for --sort=human-numeric.

I refactored it again to handle some potential problems with how
separators and decimals points were handled.  It will still let you
write something silly like 1,3,4.5.6, but I've stopped scanning on
4..4 or 3,,2 or even 5.M.  I'm not sure if that last one is used
meaningfully anywhere.  I did this partly to avoid breaking locales
where space is the separator.  `du --h --apparent-size` output like
this :

 4TO-DO
 5Million-dollar-idea
 3K  whatever

would have triggered the mixed prefix error spuriously due to the
greedy consumption of space in the second line.  I am not concerned
with making it parse intelligently for all the various locales, but
only to make sure it doesn't do anything particularly stupid.

http://en.wikipedia.org/wiki/ISO_31-0#Numbers

It appears ISO suggests the space for separator.  I poked around a bit
to see if any locales used space.  Apparently, the Hungarian locale
does.  I stopped looking there.

 I'm wondering whether numeric is superfluous?
 I.E. are --sort=human and --human-sort sufficient.


I started with just human, but thought it better to add the numeric
since sort is by default for strings, and both current switches that
enable numeric sorts have it in their name.  I would not fight a
reversion on this if no one thought it would look confusing or too
inconsistent to end users.

-Michael Speer
--- orig/coreutils-7.2/src/sort.c	2009-03-29 13:44:10.0 -0400
+++ coreutils-7.2/src/sort.c	2009-04-26 00:46:42.0 -0400
@@ -176,6 +176,8 @@
   bool random;			/* Sort by random hash of key.  */
   bool general_numeric;		/* Flag for general, numeric comparison.
    Handle numbers in exponential notation. */
+  bool human_numeric;   /* Flag for sorting by human readable
+   units with either SI xor IEC prefixes. */
   bool month;			/* Flag for comparison by month name. */
   bool reverse;			/* Reverse the sense of comparison. */
   bool version;			/* sort by version number */
@@ -336,6 +338,9 @@
   -i, --ignore-nonprintingconsider only printable characters\n\
   -M, --month-sortcompare (unknown)  `JAN'  ...  `DEC'\n\
 ), stdout);
+  fputs(_(\
+  -h, --human-numeric-sortcompare human readable numbers (e.g., 2K 1G)\n\
+), stdout);
   fputs (_(\
   -n, --numeric-sort  compare according to string numerical value\n\
   -R, --random-sort   sort by random hash of keys\n\
@@ -344,8 +349,8 @@
 ), stdout);
   fputs (_(\
   --sort=WORD sort according to WORD:\n\
-general-numeric -g, month -M, numeric -n,\n\
-random -R, version -V\n\
+general-numeric -g, human-numeric -h, month -M,\n\
+numeric -n, random -R, version -V\n\
   -V, --version-sort  sort by numeric version\n\
 \n\
 ), stdout);
@@ -426,7 +431,7 @@
   SORT_OPTION
 };
 
-static char const short_options[] = -bcCdfgik:mMno:rRsS:t:T:uVy:z;
+static char const short_options[] = -bcCdfghik:mMno:rRsS:t:T:uVy:z;
 
 static struct option const long_options[] =
 {
@@ -442,6 +447,7 @@
   {merge, no_argument, NULL, 'm'},
   {month-sort, no_argument, NULL, 'M'},
   {numeric-sort, no_argument, NULL, 'n'},
+  {human-numeric-sort, no_argument, NULL, 'h'},
   {version-sort, no_argument, NULL, 'V'},
   {random-sort, no_argument, NULL, 'R'},
   {random-source, required_argument, NULL, RANDOM_SOURCE_OPTION},
@@ -480,6 +486,7 @@
 
 #define SORT_TABLE \
   _st_(general-numeric, 'g') \
+  _st_(human-numeric,   'h') \
   _st_(month,   'M') \
   _st_(numeric, 'n') \
   _st_(random,  'R') \
@@ -1673,6 +1680,85 @@
   return strnumcmp (a, b, decimal_point, thousands_sep);
 }
 
+/* Exit with an error if a mixture of SI and IEC units detected.  */
+
+static void
+check_mixed_SI_IEC (char prefix)
+{
+  static int seen_si = -1;
+  bool si_present = prefix == 'i';
+  if (seen_si != -1  seen_si != si_present)
+error (SORT_FAILURE, 0, _(both SI and IEC prefixes present on units));
+  seen_si = si_present;
+}
+
+/* return an integer which represents the order of magnitude of 
+   the unit following the number
+*/
+unsigned int
+find_unit_order (const char* number)
+{
+  /* FIXME : if sort is fixed for multibyte 
+   *   separators this will need to be fixed too 
+   */
+  
+  static const char weights [UCHAR_LIM] = {
+['K']=1, ['M']=2, ['G']=3, ['T']=4, ['P']=5, ['E']=6, ['Z']=7, ['Y']=8,
+['k']=1,
+  };
+  
+  const char *p = number;
+  
+  /* scan to end of number
+   * decimals or separators not followed by digits
+   *   stop the scan
+   * numbers ending in decimals or separators are
+   *   thus considered to be lacking in units
+   */
+  while ( ISDIGIT (*p) )
+{
+  p++ ;

Human readable sort

2009-04-24 Thread Michael Speer
I wrote the following patch to the 7.2 branch of coreutils to allow
`sort` to sort by human readable byte sizes.  I looked around a bit to
see what the status of previous attempts to integrate this
functionality were, but didn't see any very recent activity.  This is
my first interaction with coreutils, so if I missed something obvious,
please point me towards it.

Is the last potential patch (
http://www.mail-archive.com/bug-coreutils@gnu.org/msg14080.html )
moving through?  If not, if I cleaned this up ( tabs, documentation,
and test cases ) and applied it to the current HEAD on savannah is
there a chance of getting this functionality into sort?

Patch assumptions :
  * that numbers will use the best representation ( never uses 1024b
instead of 1k, etc )
  * that the sizes will be specified via suffixes of b, K, M, G, T, P,
E, Z, Y or their alternately cased variants

The first assumption results in checking only the suffix when they differ.
This enables it to match the output of `du -h / du --si`, but possibly
not other tools that do not conform to these assumptions.

-

--- orig/coreutils-7.2/src/sort.c   2009-03-29 13:44:10.0 -0400
+++ coreutils-7.2/src/sort.c2009-04-24 14:03:47.0 -0400
@@ -176,6 +176,8 @@
   bool random; /* Sort by random hash of key.  */
   bool general_numeric;/* Flag for general, numeric comparison.
   Handle numbers in exponential notation. */
+  bool human_numeric;   /* Flag for sorting by size specified
+   data */
   bool month;  /* Flag for comparison by month name. */
   bool reverse;/* Reverse the sense of comparison. */
   bool version;/* sort by version number */
@@ -426,7 +428,7 @@
   SORT_OPTION
 };

-static char const short_options[] = -bcCdfgik:mMno:rRsS:t:T:uVy:z;
+static char const short_options[] = -bcCdfghik:mMno:rRsS:t:T:uVy:z;

 static struct option const long_options[] =
 {
@@ -442,6 +444,7 @@
   {merge, no_argument, NULL, 'm'},
   {month-sort, no_argument, NULL, 'M'},
   {numeric-sort, no_argument, NULL, 'n'},
+  {human-sort, no_argument, NULL, 'h'},
   {version-sort, no_argument, NULL, 'V'},
   {random-sort, no_argument, NULL, 'R'},
   {random-source, required_argument, NULL, RANDOM_SOURCE_OPTION},
@@ -1673,6 +1676,57 @@
   return strnumcmp (a, b, decimal_point, thousands_sep);
 }

+/* assumes UCHAR_MAX of 255 */
+/* Y/y:8 - K/k:1 , otherwise ( including b ) : 0 */
+const char weights [] =
+  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 6, 0, 3, 0, 0, 0, 1, 0, 2, 0, 0,
+5, 0, 0, 0, 4, 0, 0, 0, 0, 8, 7, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 6, 0, 3, 0, 0, 0, 1, 0, 2, 0, 0,
+5, 0, 0, 0, 4, 0, 0, 0, 0, 8, 7, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } ;
+
+static int
+human_compare(const char *a, const char *b)
+{
+  /* this tests numeric entities ending in human readable size specifiers
+ b  K  M  G  T  P  E  Z  Y
+ we (rudely I admit) assume that numbers are properly abbreviated.
+ for example, you will never see 500,000,000b, instead of 5M
+  */
+
+  const char * ar, * br ; /* riders */
+  int aw, bw ;
+
+  while(blanks[to_uchar (*a)])
+a++;
+  while(blanks[to_uchar (*b)])
+b++;
+
+  ar = a ;
+  br = b ;
+
+  while( ISDIGIT(*ar) || (*ar) == decimal_point || (*ar) == thousands_sep )
+ar++ ;
+  while( ISDIGIT(*br) || (*br) == decimal_point || (*br) == thousands_sep )
+br++ ;
+
+  aw = weights[to_uchar (*ar)] ;
+  bw = weights[to_uchar (*br)] ;
+
+  return aw  bw ? 1 : aw  bw ? -1 : strnumcmp( a , b ,
decimal_point , thousands_sep) ;
+}
+
 static int
 general_numcompare (const char *sa, const char *sb)
 {
@@ -1917,6 +1971,10 @@

   if (key-random)
diff = compare_random (texta, lena, textb, lenb);
+  else if (key-human_numeric)
+{
+  diff = human_compare(texta, textb);
+}
   else if (key-numeric | key-general_numeric)
{
  char savea = *lima, saveb = *limb;
@@ -2887,7 +2945,7 @@

   for (key = keylist; key; key = key-next)
 if ((1  (key-random + key-numeric + key-general_numeric + key-month
- + key-version + !!key-ignore))
+ + key-version + (!!key-ignore) + key-human_numeric))
|| (key-random  key-translate))
   {
/* The following is too big, but guaranteed to 

Re: Human readable sort

2009-04-24 Thread Pádraig Brady
Michael Speer wrote:
 I wrote the following patch to the 7.2 branch of coreutils to allow
 `sort` to sort by human readable byte sizes.  I looked around a bit to
 see what the status of previous attempts to integrate this
 functionality was, but didn't see any very recent activity.  This is
 my first interaction with coreutils, so if I missed something obvious,
 please point me towards it.
 
 Is the last potential patch (
 http://www.mail-archive.com/bug-coreutils@gnu.org/msg14080.html )
 moving through?  If not, if I cleaned this up ( tabs, documentation,
 and test cases ) and applied it to the current HEAD on savannah is
 there a chance of getting this functionality into sort?

Thanks for reviving this again.
There was a more recent attempt that petered out unfortunately:
http://www.mail-archive.com/bug-coreutils@gnu.org/msg14080.html

 
 Patch assumptions :
   * that numbers will use the best representation ( never uses 1024b
 instead of 1k, etc )
   * that the sizes will be specified via suffixes of b, K, M, G, T, P,
 E, Z, Y or their alternately cased variants
 
 The first assumption results in checking only the suffix when they differ.
 This enables it to match the output of `du -h / du --si`, but possibly
 not other tools that do not conform to these assumptions.

The consensus was that these assumptions are appropriate and useful.

We now assume C99 support for coreutils, so I tweaked your patch,
the main change being to greatly shrink the lookup table initialisation.
Note that I commented out the lower case letters (except 'k'), as I don't
think any coreutils programs generate those, and accepting them could
preclude supporting other suffixes in future. I'm not sure about doing that,
but I think it's better to err on the side of too few suffixes than too many.

Something else to consider is flagging when
a mixture of SI and IEC units is used, since
the lack of support for this might not be obvious
and could cause difficult-to-debug issues for users.
I.e., flag an error if the following input is presented:
  999MB
  998MiB
I added a very quick hack for that to the patch for illustration.

I also noticed that you didn't terminate the fields before
processing, as is done for the other numeric sorts.
So I changed that as well in the attached patch, but to be honest
I haven't analyzed that change.

cheers,
Pádraig.

P.S. Obviously the docs, help text and tests still need to be written,
but we can do that after we get the implementation done.
diff --git a/src/sort.c b/src/sort.c
index f48d727..a2ed015 100644
--- a/src/sort.c
+++ b/src/sort.c
@@ -176,6 +176,7 @@ struct keyfield
   bool random;			/* Sort by random hash of key.  */
   bool general_numeric;		/* Flag for general, numeric comparison.
    Handle numbers in exponential notation. */
+  bool human_numeric;   /* Flag for sorting by common suffixes. */
   bool month;			/* Flag for comparison by month name. */
   bool reverse;			/* Reverse the sense of comparison. */
   bool version;			/* sort by version number */
@@ -426,7 +427,7 @@ enum
   SORT_OPTION
 };
 
-static char const short_options[] = -bcCdfgik:mMno:rRsS:t:T:uVy:z;
+static char const short_options[] = -bcCdfghik:mMno:rRsS:t:T:uVy:z;
 
 static struct option const long_options[] =
 {
@@ -442,6 +443,7 @@ static struct option const long_options[] =
   {merge, no_argument, NULL, 'm'},
   {month-sort, no_argument, NULL, 'M'},
   {numeric-sort, no_argument, NULL, 'n'},
+  {human-sort, no_argument, NULL, 'h'},
   {version-sort, no_argument, NULL, 'V'},
   {random-sort, no_argument, NULL, 'R'},
   {random-source, required_argument, NULL, RANDOM_SOURCE_OPTION},
@@ -1673,6 +1675,54 @@ numcompare (const char *a, const char *b)
   return strnumcmp (a, b, decimal_point, thousands_sep);
 }
 
+/* error if a mixture of SI and IEC units used.  */
+static void
+check_mixed_SI_IEC (char suffix)
+{
+  static int seen_si = -1;
+  bool si_present = suffix == 'i';
+  if (seen_si != -1  seen_si != si_present)
+error (SORT_FAILURE, 0, _(Both SI and IEC suffixes present));
+  seen_si = si_present;
+}
+
+/* Compare numeric entities ending in human readable size specifiers
+  b  K  M  G  T  P  E  Z  Y
+   We assume that numbers are properly abbreviated.
+   For example, you will never see 500,000,000b, instead of 5M.  */
+
+static int
+human_compare(const char *a, const char *b)
+{
+  static const char weights [] = {
+['K']=1, ['M']=2, ['G']=3, ['T']=4, ['P']=5, ['E']=6, ['Z']=7, ['Y']=8,
+['k']=1, /*['m']=2, ['g']=3, ['t']=4, ['p']=5, ['e']=6, ['z']=7, ['y']=8,*/
+  };
+
+  while (blanks[to_uchar (*a)])
+a++;
+  while (blanks[to_uchar (*b)])
+b++;
+
+  const char *ar = a;
+  const char *br = b;
+
+  while( ISDIGIT (*ar) || (*ar) == decimal_point || (*ar) == thousands_sep )
+ar++;
+  while( ISDIGIT (*br) || (*br) == decimal_point || (*br) == thousands_sep )
+br++;
+
+  check_mixed_SI_IEC (*(ar+1));
+  check_mixed_SI_IEC (*(br+1));
+
+  int aw = weights[to_uchar (*ar)];
+  int bw = weights[to_uchar (*br)];
+
+  return (aw  bw ? 1
+