[[email protected]: Re: sort -N patch: sort in natural order]

Martin Sarfy Fri, 05 Jun 2009 08:27:31 -0700

Hello,

attached there is a patch against coreutils-6.10/sort.c implementing
"Natural order sorting" (e.g. "A1" < "A2" < "A10"), using "-N" parameter.


Please, consider it's inclusion in upstream :-)

Signing papers with copyright is not a problem.

Best regards,
Martin Sarfy

----- Forwarded message from Paul Eggert <[email protected]> -----

To: Martin Sarfy <[email protected]>
Subject: Re: sort -N patch: sort in natural order
From: Paul Eggert <[email protected]>
Date: Thu, 04 Jun 2009 12:10:04 -0700

Could you please send this directly to [email protected]?
Also, assuming it meets favor there, will you and your employer be willing
to sign papers assigning copyright to the FSF?


----- End forwarded message -----

-- 
Best regards
Martin Sarfy

--- coreutils-6.10-sort.c	2009-06-04 17:27:35.000000000 +0200
+++ coreutils-6.10-sort-natural.c	2009-06-04 17:27:44.000000000 +0200
@@ -170,6 +170,7 @@
   bool random;			/* Sort by random hash of key.  */
   bool general_numeric;		/* Flag for general, numeric comparison.
 				   Handle numbers in exponential notation. */
+  bool natural;			/* Flag for natural order comparison. */
   bool month;			/* Flag for comparison by month name. */
   bool reverse;			/* Reverse the sense of comparison. */
   struct keyfield *next;	/* Next keyfield to try. */
@@ -327,6 +328,7 @@
   -i, --ignore-nonprinting    consider only printable characters\n\
   -M, --month-sort            compare (unknown) < `JAN' < ... < `DEC'\n\
   -n, --numeric-sort          compare according to string numerical value\n\
+  -N, --natural-order-sort    sort strings containing numbers in natural order\n\
   -R, --random-sort           sort by random hash of keys\n\
       --random-source=FILE    get random bytes from FILE (default /dev/urandom)\n\
   -r, --reverse               reverse the result of comparisons\n\
@@ -394,7 +396,7 @@
   RANDOM_SOURCE_OPTION
 };
 
-static char const short_options[] = "-bcCdfgik:mMno:rRsS:t:T:uy:z";
+static char const short_options[] = "-bcCdfgik:mMnNo:rRsS:t:T:uy:z";
 
 static struct option const long_options[] =
 {
@@ -409,6 +411,7 @@
   {"merge", no_argument, NULL, 'm'},
   {"month-sort", no_argument, NULL, 'M'},
   {"numeric-sort", no_argument, NULL, 'n'},
+  {"natural-order-sort", no_argument, NULL, 'N'},
   {"random-sort", no_argument, NULL, 'R'},
   {"random-source", required_argument, NULL, RANDOM_SOURCE_OPTION},
   {"output", required_argument, NULL, 'o'},
@@ -1514,6 +1517,139 @@
   return strnumcmp (a, b, decimal_point, thousands_sep);
 }
 
+/* Perform 'natural order' comparisons of strings ("A1" < "A2" < "A10")
+   Adoption for "sort" from strnatcmp.c by Martin Sarfy <[email protected]>
+   Original author (C) 2000, 2004 by Martin Pool <mbp sourcefrog net>  */
+
+static inline int
+nat_isdigit(char a)
+{
+     return isdigit((unsigned char) a);
+}
+
+
+static inline int
+nat_isspace(char a)
+{
+     return isspace((unsigned char) a);
+}
+
+
+static inline char
+nat_toupper(char a)
+{
+     return toupper((unsigned char) a);
+}
+
+static int
+nat_compare_right(char const *a, char const *b)
+{
+     int bias = 0;
+
+     /* The longest run of digits wins.  That aside, the greatest
+    value wins, but we can't know that it will until we've scanned
+    both numbers to know that they have the same magnitude, so we
+    remember it in BIAS. */
+     for (;; a++, b++) {
+      if (!nat_isdigit(*a)  &&  !nat_isdigit(*b))
+           return bias;
+      else if (!nat_isdigit(*a))
+           return -1;
+      else if (!nat_isdigit(*b))
+           return +1;
+      else if (*a < *b) {
+           if (!bias)
+            bias = -1;
+      } else if (*a > *b) {
+           if (!bias)
+            bias = +1;
+      } else if (!*a  &&  !*b)
+           return bias;
+     }
+
+     return 0;
+}
+
+
+static int
+nat_compare_left(char const *a, char const *b)
+{
+     /* Compare two left-aligned numbers: the first to have a
+        different value wins. */
+     for (;; a++, b++) {
+      if (!nat_isdigit(*a)  &&  !nat_isdigit(*b))
+           return 0;
+      else if (!nat_isdigit(*a))
+           return -1;
+      else if (!nat_isdigit(*b))
+           return +1;
+      else if (*a < *b)
+           return -1;
+      else if (*a > *b)
+           return +1;
+     }
+
+     return 0;
+}
+
+
+static int strnatcmp0(char const *a, char const *b, int fold_case)
+{
+     int ai, bi;
+     char ca, cb;
+     int fractional, result;
+
+     /* assert(a && b); */
+     ai = bi = 0;
+     while (1) {
+      ca = a[ai]; cb = b[bi];
+
+      /* skip over leading spaces or zeros */
+      while (nat_isspace(ca))
+           ca = a[++ai];
+
+      while (nat_isspace(cb))
+           cb = b[++bi];
+
+      /* process run of digits */
+      if (nat_isdigit(ca)  &&  nat_isdigit(cb)) {
+           fractional = (ca == '0' || cb == '0');
+
+           if (fractional) {
+            if ((result = nat_compare_left(a+ai, b+bi)) != 0)
+             return result;
+           } else {
+            if ((result = nat_compare_right(a+ai, b+bi)) != 0)
+             return result;
+           }
+      }
+
+      if (!ca && !cb) {
+           /* The strings compare the same.  Perhaps the caller
+                  will want to call strcmp to break the tie. */
+           return 0;
+      }
+
+      if (fold_case) {
+           ca = nat_toupper(ca);
+           cb = nat_toupper(cb);
+      }
+
+      if (ca < cb)
+           return -1;
+      else if (ca > cb)
+           return +1;
+
+      ++ai; ++bi;
+     }
+}
+
+static int
+natural_compare (const char *sa, const char *sb)
+{
+  return strnatcmp0(sa,sb,0);
+}
+
 static int
 general_numcompare (const char *sa, const char *sb)
 {
@@ -1702,6 +1838,34 @@
   return diff;
 }
 
+static void
+translate_chars(const char* texta, size_t lena, const char* textb, size_t lenb, 
+		char *copy_a, size_t *p_new_len_a, char* copy_b, size_t *p_new_len_b, 
+		bool const *ignore, char const *translate)
+{
+  int i;
+  /* Ignore and/or translate chars before comparing.  */
+  for (*p_new_len_a = *p_new_len_b = i = 0; i < MAX (lena, lenb); i++)
+	{
+	  if (i < lena)
+	    {
+	      copy_a[*p_new_len_a] = (translate
+				   ? translate[to_uchar (texta[i])]
+				   : texta[i]);
+	      if (!ignore || !ignore[to_uchar (texta[i])])
+		++(*p_new_len_a);
+	    }
+	  if (i < lenb)
+	    {
+	      copy_b[*p_new_len_b] = (translate
+				   ? translate[to_uchar (textb[i])]
+				   : textb [i]);
+	      if (!ignore || !ignore[to_uchar (textb[i])])
+		++(*p_new_len_b);
+	    }
+	}
+}
+
 /* Compare two lines A and B trying every key in sequence until there
    are no more keys or a difference is found. */
 
@@ -1741,6 +1905,22 @@
 		  (texta, textb));
 	  *lima = savea, *limb = saveb;
 	}
+      else if (key->natural) 
+	{
+	      char buf[4000];
+	      size_t size = lena + 1 + lenb + 1;
+	      char *copy_a = (size <= sizeof buf ? buf : xmalloc (size));
+	      char *copy_b = copy_a + lena + 1;
+	      size_t new_len_a, new_len_b;
+
+              translate_chars(texta, lena, textb, lenb, copy_a, &new_len_a, 
+			      copy_b, &new_len_b, ignore, translate);
+
+	      diff = natural_compare (copy_a, copy_b );
+
+	      if (sizeof buf < size)
+		free (copy_a);
+	}
       else if (key->month)
 	diff = getmonth (texta, lena) - getmonth (textb, lenb);
       /* Sorting like this may become slow, so in a simple locale the user
@@ -1753,28 +1933,10 @@
 	      size_t size = lena + 1 + lenb + 1;
 	      char *copy_a = (size <= sizeof buf ? buf : xmalloc (size));
 	      char *copy_b = copy_a + lena + 1;
-	      size_t new_len_a, new_len_b, i;
+	      size_t new_len_a, new_len_b;
 
-	      /* Ignore and/or translate chars before comparing.  */
-	      for (new_len_a = new_len_b = i = 0; i < MAX (lena, lenb); i++)
-		{
-		  if (i < lena)
-		    {
-		      copy_a[new_len_a] = (translate
-					   ? translate[to_uchar (texta[i])]
-					   : texta[i]);
-		      if (!ignore || !ignore[to_uchar (texta[i])])
-			++new_len_a;
-		    }
-		  if (i < lenb)
-		    {
-		      copy_b[new_len_b] = (translate
-					   ? translate[to_uchar (textb[i])]
-					   : textb [i]);
-		      if (!ignore || !ignore[to_uchar (textb[i])])
-			++new_len_b;
-		    }
-		}
+              translate_chars(texta, lena, textb, lenb, copy_a, &new_len_a, 
+			      copy_b, &new_len_b, ignore, translate);
 
 	      diff = xmemcoll (copy_a, new_len_a, copy_b, new_len_b);
 
@@ -2587,7 +2749,7 @@
 
   for (key = keylist; key; key = key->next)
     if ((1 < (key->random + key->numeric + key->general_numeric + key->month
-	      + !!key->ignore))
+	      + key->natural + !!key->ignore))
 	|| (key->random && key->translate))
       {
 	char opts[7];
@@ -2604,6 +2766,8 @@
 	  *p++ = 'M';
 	if (key->numeric)
 	  *p++ = 'n';
+	if (key->natural)
+	  *p++ = 'N';
 	if (key->random)
 	  *p++ = 'R';
 	*p = '\0';
@@ -2696,6 +2860,9 @@
 	case 'M':
 	  key->month = true;
 	  break;
+	case 'N':
+	  key->natural = true;
+	  break;
 	case 'n':
 	  key->numeric = true;
 	  break;
@@ -2830,7 +2997,7 @@
   gkey.sword = gkey.eword = SIZE_MAX;
   gkey.ignore = NULL;
   gkey.translate = NULL;
-  gkey.numeric = gkey.general_numeric = gkey.random = false;
+  gkey.numeric = gkey.general_numeric = gkey.natural = gkey.random = false;
   gkey.month = gkey.reverse = false;
   gkey.skipsblanks = gkey.skipeblanks = false;
 
@@ -2909,6 +3076,7 @@
 	case 'i':
 	case 'M':
 	case 'n':
+	case 'N':
 	case 'r':
 	case 'R':
 	  {
@@ -3087,7 +3255,7 @@
       if (! (key->ignore || key->translate
              || (key->skipsblanks | key->reverse
                  | key->skipeblanks | key->month | key->numeric
-                 | key->general_numeric
+                 | key->general_numeric | key->natural
                  | key->random)))
         {
           key->ignore = gkey.ignore;
@@ -3097,6 +3265,7 @@
           key->month = gkey.month;
           key->numeric = gkey.numeric;
           key->general_numeric = gkey.general_numeric;
+          key->natural = gkey.natural;
           key->random = gkey.random;
           key->reverse = gkey.reverse;
         }
@@ -3106,7 +3275,7 @@
 
   if (!keylist && (gkey.ignore || gkey.translate
 		   || (gkey.skipsblanks | gkey.skipeblanks | gkey.month
-		       | gkey.numeric | gkey.general_numeric
+		       | gkey.numeric | gkey.general_numeric | gkey.natural
                        | gkey.random)))
     {
       insertkey (&gkey);

_______________________________________________
Bug-coreutils mailing list
[email protected]
http://lists.gnu.org/mailman/listinfo/bug-coreutils

[[email protected]: Re: sort -N patch: sort in natural order]

Reply via email to