[PATCH] join numerically sorted files

Herbert Xu Sun, 08 Apr 2001 04:29:36 -0700

The following patch implements the -n option for join, i.e., joining two
files sorted using sort -n.  It is probably better to split out all the
comparison functions from sort and have join support each and every one of
them.
-- 
Debian GNU/Linux 2.2 is out! ( http://www.debian.org/ )
Email:  Herbert Xu 许志壬 <[EMAIL PROTECTED]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

Index: doc/stamp-vti
===================================================================
RCS file: /home/gondolin/herbert/src/CVS/debian/textutils/doc/stamp-vti,v
retrieving revision 1.4
diff -u -r1.4 stamp-vti
--- doc/stamp-vti       2001/01/28 04:40:19     1.4
+++ doc/stamp-vti       2001/04/08 11:42:02
@@ -1,3 +1,3 @@
-@set UPDATED 28 January 2001
+@set UPDATED 8 April 2001
 @set EDITION 2.0
 @set VERSION 2.0
Index: doc/textutils.info
===================================================================
RCS file: /home/gondolin/herbert/src/CVS/debian/textutils/doc/textutils.info,v
retrieving revision 1.5
diff -u -r1.5 textutils.info
--- doc/textutils.info  2001/01/28 04:40:19     1.5
+++ doc/textutils.info  2001/04/08 11:42:02
@@ -2621,6 +2621,10 @@
 `-j FIELD'
      Equivalent to `-1 FIELD -2 FIELD'.
 
+`-n'
+     Use numerical order when joining FILE1 and FILE2.  They must be
+     sorted numerically beforehand.
+
 `-o FIELD-LIST...'
      Construct each output line according to the format in FIELD-LIST.
      Each element in FIELD-LIST is either the single character `0' or
@@ -3773,6 +3777,7 @@
 * -M:                                    sort invocation.
 * -m <1>:                                sort invocation.
 * -m:                                    pr invocation.
+* -n <1>:                                join invocation.
 * -n:                                    cut invocation.
 * -N:                                    uniq invocation.
 * -n <1>:                                sort invocation.
@@ -4069,23 +4074,23 @@
 Node: cut invocation98383
 Node: paste invocation100298
 Node: join invocation101152
-Node: Operating on characters104536
-Node: tr invocation104982
-Node: Character sets106099
-Node: Translating109690
-Node: Squeezing111487
-Node: Warnings in tr113394
-Node: expand invocation114527
-Node: unexpand invocation115844
-Node: Opening the software toolbox117279
-Node: Toolbox introduction117967
-Node: I/O redirection120689
-Node: The who command123525
-Node: The cut command124413
-Node: The sort command125288
-Node: The uniq command125992
-Node: Putting the tools together126721
-Ref: Putting the tools together-Footnote-1138549
-Node: Index138711
+Node: Operating on characters104647
+Node: tr invocation105093
+Node: Character sets106210
+Node: Translating109801
+Node: Squeezing111598
+Node: Warnings in tr113505
+Node: expand invocation114638
+Node: unexpand invocation115955
+Node: Opening the software toolbox117390
+Node: Toolbox introduction118078
+Node: I/O redirection120800
+Node: The who command123636
+Node: The cut command124524
+Node: The sort command125399
+Node: The uniq command126103
+Node: Putting the tools together126832
+Ref: Putting the tools together-Footnote-1138660
+Node: Index138822
 
 End Tag Table
Index: doc/textutils.texi
===================================================================
RCS file: /home/gondolin/herbert/src/CVS/debian/textutils/doc/textutils.texi,v
retrieving revision 1.5
diff -u -r1.5 textutils.texi
--- doc/textutils.texi  2001/01/28 04:40:19     1.5
+++ doc/textutils.texi  2001/04/08 11:41:33
@@ -3297,6 +3297,11 @@
 @item -j @var{field}
 Equivalent to @samp{-1 @var{field} -2 @var{field}}.
 
+@item -n
+@opindex -n
+Use numerical order when joining @var{file1} and @var{file2}.  They must be
+sorted numerically beforehand.
+
 @item -o @var{field-list}@dots{}
 Construct each output line according to the format in @var{field-list}.
 Each element in @var{field-list} is either the single character @samp{0} or
Index: doc/version.texi
===================================================================
RCS file: /home/gondolin/herbert/src/CVS/debian/textutils/doc/version.texi,v
retrieving revision 1.4
diff -u -r1.4 version.texi
--- doc/version.texi    2001/01/28 04:40:19     1.4
+++ doc/version.texi    2001/04/08 11:42:02
@@ -1,3 +1,3 @@
-@set UPDATED 28 January 2001
+@set UPDATED 8 April 2001
 @set EDITION 2.0
 @set VERSION 2.0
Index: man/join.1
===================================================================
RCS file: /home/gondolin/herbert/src/CVS/debian/textutils/man/join.1,v
retrieving revision 1.1.1.1
diff -u -r1.1.1.1 join.1
--- man/join.1  1999/08/06 19:24:08     1.1.1.1
+++ man/join.1  2001/04/08 11:37:19
@@ -1,5 +1,5 @@
 .\" DO NOT MODIFY THIS FILE!  It was generated by help2man 1.012.
-.TH JOIN "1" "August 1999" "GNU textutils 2.0" FSF
+.TH JOIN "1" "April 2001" "GNU textutils 2.0" FSF
 .SH NAME
 join \- join lines of two files on a common field
 .SH SYNOPSIS
@@ -23,6 +23,7 @@
 \fB\-j\fR FIELD          (obsolescent) equivalent to `-1 FIELD \fB\-2\fR FIELD'
 \fB\-j1\fR FIELD         (obsolescent) equivalent to `-1 FIELD'
 \fB\-j2\fR FIELD         (obsolescent) equivalent to `-2 FIELD'
+\fB\-n\fR                input files are sorted numerically
 \fB\-o\fR FORMAT         obey FORMAT while constructing output line
 \fB\-t\fR CHAR           use CHAR as input and output field separator
 \fB\-v\fR SIDE           like \fB\-a\fR SIDE, but suppress joined output lines
Index: po/cat-id-tbl.c
===================================================================
RCS file: /home/gondolin/herbert/src/CVS/debian/textutils/po/cat-id-tbl.c,v
retrieving revision 1.2
diff -u -r1.2 cat-id-tbl.c
--- po/cat-id-tbl.c     2000/06/28 11:20:30     1.2
+++ po/cat-id-tbl.c     2001/04/08 11:37:19
@@ -210,6 +210,7 @@
   -j FIELD          (obsolescent) equivalent to `-1 FIELD -2 FIELD'\n\
   -j1 FIELD         (obsolescent) equivalent to `-1 FIELD'\n\
   -j2 FIELD         (obsolescent) equivalent to `-2 FIELD'\n\
+  -n                input files are sorted numerically\n\
   -o FORMAT         obey FORMAT while constructing output line\n\
   -t CHAR           use CHAR as input and output field separator\n\
   -v SIDE           like -a SIDE, but suppress joined output lines\n\
Index: po/textutils.pot
===================================================================
RCS file: /home/gondolin/herbert/src/CVS/debian/textutils/po/textutils.pot,v
retrieving revision 1.2
diff -u -r1.2 textutils.pot
--- po/textutils.pot    2000/06/28 11:20:30     1.2
+++ po/textutils.pot    2001/04/08 11:37:19
@@ -6,7 +6,7 @@
 msgid ""
 msgstr ""
 "Project-Id-Version: PACKAGE VERSION\n"
-"POT-Creation-Date: 2000-06-28 21:17+1000\n"
+"POT-Creation-Date: 2001-04-08 21:37+1000\n"
 "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
 "Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
 "Language-Team: LANGUAGE <[EMAIL PROTECTED]>\n"
@@ -15,7 +15,7 @@
 "Content-Transfer-Encoding: ENCODING\n"
 
 #: src/cat.c:84 src/cksum.c:265 src/comm.c:70 src/csplit.c:1502 src/cut.c:193
-#: src/expand.c:104 src/fmt.c:268 src/fold.c:61 src/head.c:79 src/join.c:141
+#: src/expand.c:104 src/fmt.c:268 src/fold.c:61 src/head.c:79 src/join.c:161
 #: src/md5sum.c:100 src/nl.c:171 src/od.c:262 src/paste.c:405 src/pr.c:2772
 #: src/ptx.c:1854 src/sort.c:251 src/split.c:83 src/sum.c:57 src/tac.c:124
 #: src/tail.c:208 src/tr.c:321 src/tsort.c:88 src/unexpand.c:357
@@ -58,7 +58,7 @@
 msgstr ""
 
 #: src/cat.c:117 src/cksum.c:279 src/comm.c:87 src/csplit.c:1533 src/cut.c:225
-#: src/expand.c:124 src/fmt.c:289 src/fold.c:79 src/head.c:103 src/join.c:175
+#: src/expand.c:124 src/fmt.c:289 src/fold.c:79 src/head.c:103 src/join.c:196
 #: src/md5sum.c:126 src/nl.c:213 src/od.c:327 src/paste.c:424 src/pr.c:2859
 #: src/sort.c:298 src/split.c:106 src/sum.c:75 src/tac.c:142 src/tail.c:261
 #: src/tr.c:383 src/unexpand.c:377 src/uniq.c:136 src/wc.c:100
@@ -70,7 +70,7 @@
 #: src/cat.c:177 src/cat.c:259 src/cat.c:312 src/cat.c:840 src/comm.c:220
 #: src/csplit.c:1493 src/cut.c:799 src/expand.c:392 src/fmt.c:416
 #: src/fold.c:225 src/fold.c:306 src/head.c:139 src/head.c:169 src/head.c:387
-#: src/join.c:871 src/md5sum.c:629 src/nl.c:608 src/od.c:1940 src/paste.c:485
+#: src/join.c:1131 src/md5sum.c:629 src/nl.c:608 src/od.c:1940 src/paste.c:485
 #: src/pr.c:1158 src/tac.c:715 src/tail.c:285 src/tail.c:1518 src/tr.c:1670
 #: src/tr.c:1916 src/tr.c:2024 src/tr.c:2031 src/tsort.c:485
 #: src/unexpand.c:454
@@ -452,12 +452,12 @@
 msgid "unrecognized option `-%c'"
 msgstr ""
 
-#: src/join.c:145
+#: src/join.c:165
 #, c-format
 msgid "Usage: %s [OPTION]... FILE1 FILE2\n"
 msgstr ""
 
-#: src/join.c:149
+#: src/join.c:169
 msgid ""
 "For each pair of input lines with identical join fields, write a line to\n"
 "standard output.  The default join field is the first, delimited\n"
@@ -469,6 +469,7 @@
 "  -j FIELD          (obsolescent) equivalent to `-1 FIELD -2 FIELD'\n"
 "  -j1 FIELD         (obsolescent) equivalent to `-1 FIELD'\n"
 "  -j2 FIELD         (obsolescent) equivalent to `-2 FIELD'\n"
+"  -n                input files are sorted numerically\n"
 "  -o FORMAT         obey FORMAT while constructing output line\n"
 "  -t CHAR           use CHAR as input and output field separator\n"
 "  -v SIDE           like -a SIDE, but suppress joined output lines\n"
@@ -486,40 +487,40 @@
 msgstr ""
 
 #. `0' must be all alone -- no `.FIELD'.
-#: src/join.c:640
+#: src/join.c:879
 #, c-format
 msgid "invalid field specifier: `%s'"
 msgstr ""
 
-#: src/join.c:654 src/join.c:767 src/join.c:803
+#: src/join.c:893 src/join.c:1023 src/join.c:1063
 #, c-format
 msgid "invalid field number: `%s'"
 msgstr ""
 
-#: src/join.c:667
+#: src/join.c:906
 #, c-format
 msgid "invalid file number in field spec: `%s'"
 msgstr ""
 
-#: src/join.c:787
+#: src/join.c:1047
 #, c-format
 msgid "invalid field number for file 1: `%s'"
 msgstr ""
 
-#: src/join.c:796
+#: src/join.c:1056
 #, c-format
 msgid "invalid field number for file 2: `%s'"
 msgstr ""
 
-#: src/join.c:828
+#: src/join.c:1088
 msgid "too many non-option arguments"
 msgstr ""
 
-#: src/join.c:850
+#: src/join.c:1110
 msgid "too few non-option arguments"
 msgstr ""
 
-#: src/join.c:861
+#: src/join.c:1121
 msgid "both files cannot be standard input"
 msgstr ""
 
Index: src/join.c
===================================================================
RCS file: /home/gondolin/herbert/src/CVS/debian/textutils/src/join.c,v
retrieving revision 1.1.1.1
diff -u -r1.1.1.1 join.c
--- src/join.c  1999/07/04 10:38:02     1.1.1.1
+++ src/join.c  2001/04/08 11:35:34
@@ -62,7 +62,7 @@
 /* A field of a line.  */
 struct field
   {
-    const unsigned char *beg;  /* First character in field.  */
+    unsigned char *beg;                /* First character in field.  */
     size_t len;                        /* The length of the field.  */
   };
 
@@ -87,9 +87,25 @@
 /* The name this program was run with.  */
 char *program_name;
 
+#define C_DECIMAL_POINT '.'
+#define NEGATION_SIGN   '-'
+#define NUMERIC_ZERO    '0'
+
 #ifdef ENABLE_NLS
+
+static char decimal_point;
+static int th_sep; /* if CHAR_MAX + 1, then there is no thousands separator */
+
 /* Nonzero if the LC_COLLATE locale is hard.  */
 static int hard_LC_COLLATE;
+
+# define IS_THOUSANDS_SEP(x) ((x) == th_sep)
+
+#else
+
+# define decimal_point C_DECIMAL_POINT
+# define IS_THOUSANDS_SEP(x) 0
+
 #endif
 
 /* If nonzero, print unpairable lines in file 1 or 2.  */
@@ -123,6 +139,7 @@
   {"j", required_argument, NULL, 'j'},
   {"j1", required_argument, NULL, '1'},
   {"j2", required_argument, NULL, '2'},
+  {"n", no_argument, NULL, 'n'},      /* -n is a plain flag: it takes no argument */
   {GETOPT_HELP_OPTION_DECL},
   {GETOPT_VERSION_OPTION_DECL},
   {NULL, 0, NULL, 0}
@@ -134,6 +151,9 @@
 /* If nonzero, ignore case when comparing join fields.  */
 static int ignore_case;
 
+/* If nonzero, do numeric comparison.  */
+static int numeric;
+
 void
 usage (int status)
 {
@@ -157,6 +177,7 @@
   -j FIELD          (obsolescent) equivalent to `-1 FIELD -2 FIELD'\n\
   -j1 FIELD         (obsolescent) equivalent to `-1 FIELD'\n\
   -j2 FIELD         (obsolescent) equivalent to `-2 FIELD'\n\
+  -n                input files are sorted numerically\n\
   -o FORMAT         obey FORMAT while constructing output line\n\
   -t CHAR           use CHAR as input and output field separator\n\
   -v SIDE           like -a SIDE, but suppress joined output lines\n\
@@ -178,7 +199,7 @@
 }
 
 static void
-ADD_FIELD (struct line *line, const unsigned char *field, size_t len)
+ADD_FIELD (struct line *line, unsigned char *field, size_t len)
 {
   if (line->nfields >= line->nfields_allocated)
     {
@@ -314,6 +335,213 @@
   free ((char *) seq->lines);
 }
 
+/* Compare strings A and B containing decimal fractions < 1.  Each string
+   should begin with a decimal point followed immediately by the digits
+   of the fraction.  Strings not of this form are considered to be zero. */
+
+/* The goal here is to take two numbers a and b and compare them in
+   parallel, instead of converting each and then comparing the
+   outcome; the comparison will most likely stop before a conversion
+   would have completed.  The algorithm used, in the old sort:
+
+   Algorithm: fraccompare
+   Action   : compare two decimal fractions
+   accepts  : char *a, char *b
+   returns  : <0 if a<b, 0 if a=b, >0 if a>b.
+   implement:
+
+   if *a == decimal_point AND *b == decimal_point
+     find first character different in a and b.
+     if both are digits, return the difference *a - *b.
+     if *a is a digit
+       skip past zeros
+       if digit return 1, else 0
+     if *b is a digit
+       skip past zeros
+       if digit return -1, else 0
+   if *a is a decimal_point
+     skip past decimal_point and zeros
+     if digit return 1, else 0
+   if *b is a decimal_point
+     skip past decimal_point and zeros
+     if digit return -1, else 0
+   return 0 */
+
+static int
+fraccompare (register const unsigned char *a, register const unsigned char *b)
+{
+  if (*a == decimal_point && *b == decimal_point)      /* both have a fraction part */
+    {
+      while (*++a == *++b)     /* walk over the common prefix */
+       if (! ISDIGIT (*a))
+         return 0;             /* identical up to a non-digit: equal */
+      if (ISDIGIT (*a) && ISDIGIT (*b))
+       return *a - *b;         /* first differing digit decides */
+      if (ISDIGIT (*a))
+       goto a_trailing_nonzero;        /* only A has digits left */
+      if (ISDIGIT (*b))
+       goto b_trailing_nonzero;        /* only B has digits left */
+      return 0;
+    }
+  else if (*a++ == decimal_point)      /* only A has a decimal point */
+    {
+    a_trailing_nonzero:
+      while (*a == NUMERIC_ZERO)       /* zeros do not change the value */
+       a++;
+      return ISDIGIT (*a);     /* a digit after the zeros: A is greater */
+    }
+  else if (*b++ == decimal_point)      /* only B has a decimal point */
+    {
+    b_trailing_nonzero:
+      while (*b == NUMERIC_ZERO)       /* zeros do not change the value */
+       b++;
+      return - ISDIGIT (*b);   /* a digit after the zeros: A is smaller */
+    }
+  return 0;
+}
+
+/* Compare strings A and B as numbers without explicitly converting them to
+   machine numbers.  Comparatively slow for short strings, but asymptotically
+   hideously fast. */
+
+static int
+numcompare (register const unsigned char *a, register const unsigned char *b)
+{
+  register int tmpa, tmpb, loga, logb, tmp;    /* tmpa/tmpb: current chars; loga/logb: remaining integer digit counts */
+
+  tmpa = *a;
+  tmpb = *b;
+
+  while (ISBLANK (tmpa))       /* skip leading blanks in A */
+    tmpa = *++a;
+  while (ISBLANK (tmpb))       /* skip leading blanks in B */
+    tmpb = *++b;
+
+  if (tmpa == NEGATION_SIGN)   /* A has a minus sign */
+    {
+      do
+       tmpa = *++a;
+      while (tmpa == NUMERIC_ZERO || IS_THOUSANDS_SEP (tmpa));  /* skip sign, leading zeros, separators */
+      if (tmpb != NEGATION_SIGN)       /* B has no minus sign */
+       {
+         if (tmpa == decimal_point)
+           do
+             tmpa = *++a;
+           while (tmpa == NUMERIC_ZERO);
+         if (ISDIGIT (tmpa))
+           return -1;          /* A is a nonzero negative number */
+         while (tmpb == NUMERIC_ZERO || IS_THOUSANDS_SEP (tmpb))
+           tmpb = *++b;
+         if (tmpb == decimal_point)
+           do
+             tmpb = *++b;
+           while (tmpb == NUMERIC_ZERO);
+         if (ISDIGIT (tmpb))
+           return -1;          /* A is zero, B is nonzero and unsigned */
+         return 0;             /* both are zero */
+       }
+      do
+       tmpb = *++b;
+      while (tmpb == NUMERIC_ZERO || IS_THOUSANDS_SEP (tmpb));  /* skip sign, leading zeros, separators */
+
+      while (tmpa == tmpb && ISDIGIT (tmpa))   /* skip the common leading digits */
+       {
+         do
+           tmpa = *++a;
+         while (IS_THOUSANDS_SEP (tmpa));
+         do
+           tmpb = *++b;
+         while (IS_THOUSANDS_SEP (tmpb));
+       }
+
+      if ((tmpa == decimal_point && !ISDIGIT (tmpb))
+         || (tmpb == decimal_point && !ISDIGIT (tmpa)))
+       return -fraccompare (a, b);     /* integer parts equal; negate because both are negative */
+
+      tmp = tmpb - tmpa;       /* first differing chars, operands swapped for negatives */
+
+      for (loga = 0; ISDIGIT (tmpa); ++loga)   /* count A's remaining integer digits */
+       do
+         tmpa = *++a;
+       while (IS_THOUSANDS_SEP (tmpa));
+
+      for (logb = 0; ISDIGIT (tmpb); ++logb)   /* count B's remaining integer digits */
+       do
+         tmpb = *++b;
+       while (IS_THOUSANDS_SEP (tmpb));
+
+      if (logb - loga != 0)
+       return logb - loga;     /* different digit counts decide */
+
+      if (!loga)
+       return 0;               /* neither has integer digits left: equal */
+
+      return tmp;              /* same digit count: first difference decides */
+    }
+  else if (tmpb == NEGATION_SIGN)      /* only B has a minus sign */
+    {
+      do
+       tmpb = *++b;
+      while (tmpb == NUMERIC_ZERO || IS_THOUSANDS_SEP (tmpb));  /* skip sign, leading zeros, separators */
+      if (tmpb == decimal_point)
+       do
+         tmpb = *++b;
+       while (tmpb == NUMERIC_ZERO);
+      if (ISDIGIT (tmpb))
+       return 1;               /* B is a nonzero negative number */
+      while (tmpa == NUMERIC_ZERO || IS_THOUSANDS_SEP (tmpa))
+       tmpa = *++a;
+      if (tmpa == decimal_point)
+       do
+         tmpa = *++a;
+       while (tmpa == NUMERIC_ZERO);
+      if (ISDIGIT (tmpa))
+       return 1;               /* B is zero, A is nonzero and unsigned */
+      return 0;                        /* both are zero */
+    }
+  else                         /* neither has a minus sign */
+    {
+      while (tmpa == NUMERIC_ZERO || IS_THOUSANDS_SEP (tmpa))   /* skip leading zeros and separators */
+       tmpa = *++a;
+      while (tmpb == NUMERIC_ZERO || IS_THOUSANDS_SEP (tmpb))
+       tmpb = *++b;
+
+      while (tmpa == tmpb && ISDIGIT (tmpa))   /* skip the common leading digits */
+       {
+         do
+           tmpa = *++a;
+         while (IS_THOUSANDS_SEP (tmpa));
+         do
+           tmpb = *++b;
+         while (IS_THOUSANDS_SEP (tmpb));
+       }
+
+      if ((tmpa == decimal_point && !ISDIGIT (tmpb))
+         || (tmpb == decimal_point && !ISDIGIT (tmpa)))
+       return fraccompare (a, b);      /* integer parts equal; compare the fractions */
+
+      tmp = tmpa - tmpb;       /* difference of the first differing chars */
+
+      for (loga = 0; ISDIGIT (tmpa); ++loga)   /* count A's remaining integer digits */
+       do
+         tmpa = *++a;
+       while (IS_THOUSANDS_SEP (tmpa));
+
+      for (logb = 0; ISDIGIT (tmpb); ++logb)   /* count B's remaining integer digits */
+       do
+         tmpb = *++b;
+       while (IS_THOUSANDS_SEP (tmpb));
+
+      if (loga - logb != 0)
+       return loga - logb;     /* different digit counts decide */
+
+      if (!loga)
+       return 0;               /* neither has integer digits left: equal */
+
+      return tmp;              /* same digit count: first difference decides */
+    }
+}
+
 /* Return <0 if the join field in LINE1 compares less than the one in LINE2;
    >0 if it compares greater; 0 if it compares equal.  */
 
@@ -321,7 +549,7 @@
 keycmp (struct line *line1, struct line *line2)
 {
   /* Start of field to compare in each file.  */
-  const unsigned char *beg1, *beg2;
+  unsigned char *beg1, *beg2;
 
   int len1, len2;              /* Length of fields to compare.  */
   int diff;
@@ -356,8 +584,19 @@
   /* Use an if-statement here rather than a function variable to
      avoid portability hassles of getting a non-conflicting declaration
      of memcmp.  */
-  if (ignore_case)
+  if (numeric)
     {
+      unsigned char save1, save2;
+
+      save1 = beg1[len1];
+      save2 = beg2[len2];
+      beg1[len1] = beg2[len2] = '\0';
+      diff = numcompare(beg1, beg2);
+      beg1[len1] = save1;
+      beg2[len2] = save2;
+    }
+  else if (ignore_case)
+    {
       /* FIXME: ignore_case does not work with NLS (in particular,
          with multibyte chars).  */
       diff = memcasecmp (beg1, beg2, min (len1, len2));
@@ -738,6 +977,23 @@
 
 #ifdef ENABLE_NLS
   hard_LC_COLLATE = hard_locale (LC_COLLATE);
+
+  /* Let's get locale's representation of the decimal point */
+  {
+    struct lconv *lconvp = localeconv ();
+
+    /* If the locale doesn't define a decimal point, or if the decimal
+       point is multibyte, use the C decimal point.  We don't support
+       multibyte decimal points yet.  */
+    decimal_point = *lconvp->decimal_point;
+    if (! decimal_point || lconvp->decimal_point[1])
+      decimal_point = C_DECIMAL_POINT;
+
+    /* We don't support multibyte thousands separators yet.  */
+    th_sep = *lconvp->thousands_sep;
+    if (! th_sep || lconvp->thousands_sep[1])
+      th_sep = CHAR_MAX + 1;
+  }
 #endif
 
   /* Initialize this before parsing options.  In parsing options,
@@ -747,7 +1003,7 @@
   nfiles = 0;
   print_pairables = 1;
 
-  while ((optc = getopt_long_only (argc, argv, "-a:e:i1:2:o:t:v:", longopts,
+  while ((optc = getopt_long_only (argc, argv, "-a:e:in1:2:o:t:v:", longopts,
                                   NULL)) != -1)
     {
       long int val;
@@ -777,6 +1033,10 @@
 
        case 'i':
          ignore_case = 1;
+         break;
+
+       case 'n':
+         numeric = 1;
          break;
 
        case '1':

[PATCH] join numerically sorted files

Reply via email to