The foolowing patch implements the -n option for join, i.e., joining two
files sorted using sort -n. It is probably better to split out all the
comparison functions from sort and have join support each and every one of
them.
--
Debian GNU/Linux 2.2 is out! ( http://www.debian.org/ )
Email: Herbert Xu ~{PmV>HI~} <[EMAIL PROTECTED]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
Index: doc/stamp-vti
===================================================================
RCS file: /home/gondolin/herbert/src/CVS/debian/textutils/doc/stamp-vti,v
retrieving revision 1.4
diff -u -r1.4 stamp-vti
--- doc/stamp-vti 2001/01/28 04:40:19 1.4
+++ doc/stamp-vti 2001/04/08 11:42:02
@@ -1,3 +1,3 @@
-@set UPDATED 28 January 2001
+@set UPDATED 8 April 2001
@set EDITION 2.0
@set VERSION 2.0
Index: doc/textutils.info
===================================================================
RCS file: /home/gondolin/herbert/src/CVS/debian/textutils/doc/textutils.info,v
retrieving revision 1.5
diff -u -r1.5 textutils.info
--- doc/textutils.info 2001/01/28 04:40:19 1.5
+++ doc/textutils.info 2001/04/08 11:42:02
@@ -2621,6 +2621,10 @@
`-j FIELD'
Equivalent to `-1 FIELD -2 FIELD'.
+`-n'
+ Use numerical order when joining FILE1 and FILE2. They must be
+ sorted numerically beforehand.
+
`-o FIELD-LIST...'
Construct each output line according to the format in FIELD-LIST.
Each element in FIELD-LIST is either the single character `0' or
@@ -3773,6 +3777,7 @@
* -M: sort invocation.
* -m <1>: sort invocation.
* -m: pr invocation.
+* -n <1>: join invocation.
* -n: cut invocation.
* -N: uniq invocation.
* -n <1>: sort invocation.
@@ -4069,23 +4074,23 @@
Node: cut invocation98383
Node: paste invocation100298
Node: join invocation101152
-Node: Operating on characters104536
-Node: tr invocation104982
-Node: Character sets106099
-Node: Translating109690
-Node: Squeezing111487
-Node: Warnings in tr113394
-Node: expand invocation114527
-Node: unexpand invocation115844
-Node: Opening the software toolbox117279
-Node: Toolbox introduction117967
-Node: I/O redirection120689
-Node: The who command123525
-Node: The cut command124413
-Node: The sort command125288
-Node: The uniq command125992
-Node: Putting the tools together126721
-Ref: Putting the tools together-Footnote-1138549
-Node: Index138711
+Node: Operating on characters104647
+Node: tr invocation105093
+Node: Character sets106210
+Node: Translating109801
+Node: Squeezing111598
+Node: Warnings in tr113505
+Node: expand invocation114638
+Node: unexpand invocation115955
+Node: Opening the software toolbox117390
+Node: Toolbox introduction118078
+Node: I/O redirection120800
+Node: The who command123636
+Node: The cut command124524
+Node: The sort command125399
+Node: The uniq command126103
+Node: Putting the tools together126832
+Ref: Putting the tools together-Footnote-1138660
+Node: Index138822
End Tag Table
Index: doc/textutils.texi
===================================================================
RCS file: /home/gondolin/herbert/src/CVS/debian/textutils/doc/textutils.texi,v
retrieving revision 1.5
diff -u -r1.5 textutils.texi
--- doc/textutils.texi 2001/01/28 04:40:19 1.5
+++ doc/textutils.texi 2001/04/08 11:41:33
@@ -3297,6 +3297,11 @@
@item -j @var{field}
Equivalent to @samp{-1 @var{field} -2 @var{field}}.
+@item -n
+@opindex -n
+Use numerical order when joining @var{file1} and @var{file2}. They must be
+sorted numerically beforehand.
+
@item -o @var{field-list}@dots{}
Construct each output line according to the format in @var{field-list}.
Each element in @var{field-list} is either the single character @samp{0} or
Index: doc/version.texi
===================================================================
RCS file: /home/gondolin/herbert/src/CVS/debian/textutils/doc/version.texi,v
retrieving revision 1.4
diff -u -r1.4 version.texi
--- doc/version.texi 2001/01/28 04:40:19 1.4
+++ doc/version.texi 2001/04/08 11:42:02
@@ -1,3 +1,3 @@
-@set UPDATED 28 January 2001
+@set UPDATED 8 April 2001
@set EDITION 2.0
@set VERSION 2.0
Index: man/join.1
===================================================================
RCS file: /home/gondolin/herbert/src/CVS/debian/textutils/man/join.1,v
retrieving revision 1.1.1.1
diff -u -r1.1.1.1 join.1
--- man/join.1 1999/08/06 19:24:08 1.1.1.1
+++ man/join.1 2001/04/08 11:37:19
@@ -1,5 +1,5 @@
.\" DO NOT MODIFY THIS FILE! It was generated by help2man 1.012.
-.TH JOIN "1" "August 1999" "GNU textutils 2.0" FSF
+.TH JOIN "1" "April 2001" "GNU textutils 2.0" FSF
.SH NAME
join \- join lines of two files on a common field
.SH SYNOPSIS
@@ -23,6 +23,7 @@
\fB\-j\fR FIELD (obsolescent) equivalent to `-1 FIELD \fB\-2\fR FIELD'
\fB\-j1\fR FIELD (obsolescent) equivalent to `-1 FIELD'
\fB\-j2\fR FIELD (obsolescent) equivalent to `-2 FIELD'
+\fB\-n\fR input files are sorted numerically
\fB\-o\fR FORMAT obey FORMAT while constructing output line
\fB\-t\fR CHAR use CHAR as input and output field separator
\fB\-v\fR SIDE like \fB\-a\fR SIDE, but suppress joined output lines
Index: po/cat-id-tbl.c
===================================================================
RCS file: /home/gondolin/herbert/src/CVS/debian/textutils/po/cat-id-tbl.c,v
retrieving revision 1.2
diff -u -r1.2 cat-id-tbl.c
--- po/cat-id-tbl.c 2000/06/28 11:20:30 1.2
+++ po/cat-id-tbl.c 2001/04/08 11:37:19
@@ -210,6 +210,7 @@
-j FIELD (obsolescent) equivalent to `-1 FIELD -2 FIELD'\n\
-j1 FIELD (obsolescent) equivalent to `-1 FIELD'\n\
-j2 FIELD (obsolescent) equivalent to `-2 FIELD'\n\
+ -n input files are sorted numerically\n\
-o FORMAT obey FORMAT while constructing output line\n\
-t CHAR use CHAR as input and output field separator\n\
-v SIDE like -a SIDE, but suppress joined output lines\n\
Index: po/textutils.pot
===================================================================
RCS file: /home/gondolin/herbert/src/CVS/debian/textutils/po/textutils.pot,v
retrieving revision 1.2
diff -u -r1.2 textutils.pot
--- po/textutils.pot 2000/06/28 11:20:30 1.2
+++ po/textutils.pot 2001/04/08 11:37:19
@@ -6,7 +6,7 @@
msgid ""
msgstr ""
"Project-Id-Version: PACKAGE VERSION\n"
-"POT-Creation-Date: 2000-06-28 21:17+1000\n"
+"POT-Creation-Date: 2001-04-08 21:37+1000\n"
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
"Language-Team: LANGUAGE <[EMAIL PROTECTED]>\n"
@@ -15,7 +15,7 @@
"Content-Transfer-Encoding: ENCODING\n"
#: src/cat.c:84 src/cksum.c:265 src/comm.c:70 src/csplit.c:1502 src/cut.c:193
-#: src/expand.c:104 src/fmt.c:268 src/fold.c:61 src/head.c:79 src/join.c:141
+#: src/expand.c:104 src/fmt.c:268 src/fold.c:61 src/head.c:79 src/join.c:161
#: src/md5sum.c:100 src/nl.c:171 src/od.c:262 src/paste.c:405 src/pr.c:2772
#: src/ptx.c:1854 src/sort.c:251 src/split.c:83 src/sum.c:57 src/tac.c:124
#: src/tail.c:208 src/tr.c:321 src/tsort.c:88 src/unexpand.c:357
@@ -58,7 +58,7 @@
msgstr ""
#: src/cat.c:117 src/cksum.c:279 src/comm.c:87 src/csplit.c:1533 src/cut.c:225
-#: src/expand.c:124 src/fmt.c:289 src/fold.c:79 src/head.c:103 src/join.c:175
+#: src/expand.c:124 src/fmt.c:289 src/fold.c:79 src/head.c:103 src/join.c:196
#: src/md5sum.c:126 src/nl.c:213 src/od.c:327 src/paste.c:424 src/pr.c:2859
#: src/sort.c:298 src/split.c:106 src/sum.c:75 src/tac.c:142 src/tail.c:261
#: src/tr.c:383 src/unexpand.c:377 src/uniq.c:136 src/wc.c:100
@@ -70,7 +70,7 @@
#: src/cat.c:177 src/cat.c:259 src/cat.c:312 src/cat.c:840 src/comm.c:220
#: src/csplit.c:1493 src/cut.c:799 src/expand.c:392 src/fmt.c:416
#: src/fold.c:225 src/fold.c:306 src/head.c:139 src/head.c:169 src/head.c:387
-#: src/join.c:871 src/md5sum.c:629 src/nl.c:608 src/od.c:1940 src/paste.c:485
+#: src/join.c:1131 src/md5sum.c:629 src/nl.c:608 src/od.c:1940 src/paste.c:485
#: src/pr.c:1158 src/tac.c:715 src/tail.c:285 src/tail.c:1518 src/tr.c:1670
#: src/tr.c:1916 src/tr.c:2024 src/tr.c:2031 src/tsort.c:485
#: src/unexpand.c:454
@@ -452,12 +452,12 @@
msgid "unrecognized option `-%c'"
msgstr ""
-#: src/join.c:145
+#: src/join.c:165
#, c-format
msgid "Usage: %s [OPTION]... FILE1 FILE2\n"
msgstr ""
-#: src/join.c:149
+#: src/join.c:169
msgid ""
"For each pair of input lines with identical join fields, write a line to\n"
"standard output. The default join field is the first, delimited\n"
@@ -469,6 +469,7 @@
" -j FIELD (obsolescent) equivalent to `-1 FIELD -2 FIELD'\n"
" -j1 FIELD (obsolescent) equivalent to `-1 FIELD'\n"
" -j2 FIELD (obsolescent) equivalent to `-2 FIELD'\n"
+" -n input files are sorted numerically\n"
" -o FORMAT obey FORMAT while constructing output line\n"
" -t CHAR use CHAR as input and output field separator\n"
" -v SIDE like -a SIDE, but suppress joined output lines\n"
@@ -486,40 +487,40 @@
msgstr ""
#. `0' must be all alone -- no `.FIELD'.
-#: src/join.c:640
+#: src/join.c:879
#, c-format
msgid "invalid field specifier: `%s'"
msgstr ""
-#: src/join.c:654 src/join.c:767 src/join.c:803
+#: src/join.c:893 src/join.c:1023 src/join.c:1063
#, c-format
msgid "invalid field number: `%s'"
msgstr ""
-#: src/join.c:667
+#: src/join.c:906
#, c-format
msgid "invalid file number in field spec: `%s'"
msgstr ""
-#: src/join.c:787
+#: src/join.c:1047
#, c-format
msgid "invalid field number for file 1: `%s'"
msgstr ""
-#: src/join.c:796
+#: src/join.c:1056
#, c-format
msgid "invalid field number for file 2: `%s'"
msgstr ""
-#: src/join.c:828
+#: src/join.c:1088
msgid "too many non-option arguments"
msgstr ""
-#: src/join.c:850
+#: src/join.c:1110
msgid "too few non-option arguments"
msgstr ""
-#: src/join.c:861
+#: src/join.c:1121
msgid "both files cannot be standard input"
msgstr ""
Index: src/join.c
===================================================================
RCS file: /home/gondolin/herbert/src/CVS/debian/textutils/src/join.c,v
retrieving revision 1.1.1.1
diff -u -r1.1.1.1 join.c
--- src/join.c 1999/07/04 10:38:02 1.1.1.1
+++ src/join.c 2001/04/08 11:35:34
@@ -62,7 +62,7 @@
/* A field of a line. */
struct field
{
- const unsigned char *beg; /* First character in field. */
+ unsigned char *beg; /* First character in field. */
size_t len; /* The length of the field. */
};
@@ -87,9 +87,25 @@
/* The name this program was run with. */
char *program_name;
+#define C_DECIMAL_POINT '.'
+#define NEGATION_SIGN '-'
+#define NUMERIC_ZERO '0'
+
#ifdef ENABLE_NLS
+
+static char decimal_point;
+static int th_sep; /* if CHAR_MAX + 1, then there is no thousands separator */
+
/* Nonzero if the LC_COLLATE locale is hard. */
static int hard_LC_COLLATE;
+
+# define IS_THOUSANDS_SEP(x) ((x) == th_sep)
+
+#else
+
+# define decimal_point C_DECIMAL_POINT
+# define IS_THOUSANDS_SEP(x) 0
+
#endif
/* If nonzero, print unpairable lines in file 1 or 2. */
@@ -123,6 +139,7 @@
{"j", required_argument, NULL, 'j'},
{"j1", required_argument, NULL, '1'},
{"j2", required_argument, NULL, '2'},
+ {"n", required_argument, NULL, 'n'},
{GETOPT_HELP_OPTION_DECL},
{GETOPT_VERSION_OPTION_DECL},
{NULL, 0, NULL, 0}
@@ -134,6 +151,9 @@
/* If nonzero, ignore case when comparing join fields. */
static int ignore_case;
+/* If nonzero, do numeric comparison. */
+static int numeric;
+
void
usage (int status)
{
@@ -157,6 +177,7 @@
-j FIELD (obsolescent) equivalent to `-1 FIELD -2 FIELD'\n\
-j1 FIELD (obsolescent) equivalent to `-1 FIELD'\n\
-j2 FIELD (obsolescent) equivalent to `-2 FIELD'\n\
+ -n input files are sorted numerically\n\
-o FORMAT obey FORMAT while constructing output line\n\
-t CHAR use CHAR as input and output field separator\n\
-v SIDE like -a SIDE, but suppress joined output lines\n\
@@ -178,7 +199,7 @@
}
static void
-ADD_FIELD (struct line *line, const unsigned char *field, size_t len)
+ADD_FIELD (struct line *line, unsigned char *field, size_t len)
{
if (line->nfields >= line->nfields_allocated)
{
@@ -314,6 +335,213 @@
free ((char *) seq->lines);
}
+/* Compare strings A and B containing decimal fractions < 1. Each string
+ should begin with a decimal point followed immediately by the digits
+ of the fraction. Strings not of this form are considered to be zero. */
+
+/* The goal here, is to take two numbers a and b... compare these
+ in parallel. Instead of converting each, and then comparing the
+ outcome. Most likely stopping the comparison before the conversion
+ is complete. The algorithm used, in the old sort:
+
+ Algorithm: fraccompare
+ Action : compare two decimal fractions
+ accepts : char *a, char *b
+ returns : -1 if a<b, 0 if a=b, 1 if a>b.
+ implement:
+
+ if *a == decimal_point AND *b == decimal_point
+ find first character different in a and b.
+ if both are digits, return the difference *a - *b.
+ if *a is a digit
+ skip past zeros
+ if digit return 1, else 0
+ if *b is a digit
+ skip past zeros
+ if digit return -1, else 0
+ if *a is a decimal_point
+ skip past decimal_point and zeros
+ if digit return 1, else 0
+ if *b is a decimal_point
+ skip past decimal_point and zeros
+ if digit return -1, else 0
+ return 0 */
+
+static int
+fraccompare (register const unsigned char *a, register const unsigned char *b)
+{
+ if (*a == decimal_point && *b == decimal_point)
+ {
+ while (*++a == *++b)
+ if (! ISDIGIT (*a))
+ return 0;
+ if (ISDIGIT (*a) && ISDIGIT (*b))
+ return *a - *b;
+ if (ISDIGIT (*a))
+ goto a_trailing_nonzero;
+ if (ISDIGIT (*b))
+ goto b_trailing_nonzero;
+ return 0;
+ }
+ else if (*a++ == decimal_point)
+ {
+ a_trailing_nonzero:
+ while (*a == NUMERIC_ZERO)
+ a++;
+ return ISDIGIT (*a);
+ }
+ else if (*b++ == decimal_point)
+ {
+ b_trailing_nonzero:
+ while (*b == NUMERIC_ZERO)
+ b++;
+ return - ISDIGIT (*b);
+ }
+ return 0;
+}
+
+/* Compare strings A and B as numbers without explicitly converting them to
+ machine numbers. Comparatively slow for short strings, but asymptotically
+ hideously fast. */
+
+static int
+numcompare (register const unsigned char *a, register const unsigned char *b)
+{
+ register int tmpa, tmpb, loga, logb, tmp;
+
+ tmpa = *a;
+ tmpb = *b;
+
+ while (ISBLANK (tmpa))
+ tmpa = *++a;
+ while (ISBLANK (tmpb))
+ tmpb = *++b;
+
+ if (tmpa == NEGATION_SIGN)
+ {
+ do
+ tmpa = *++a;
+ while (tmpa == NUMERIC_ZERO || IS_THOUSANDS_SEP (tmpa));
+ if (tmpb != NEGATION_SIGN)
+ {
+ if (tmpa == decimal_point)
+ do
+ tmpa = *++a;
+ while (tmpa == NUMERIC_ZERO);
+ if (ISDIGIT (tmpa))
+ return -1;
+ while (tmpb == NUMERIC_ZERO || IS_THOUSANDS_SEP (tmpb))
+ tmpb = *++b;
+ if (tmpb == decimal_point)
+ do
+ tmpb = *++b;
+ while (tmpb == NUMERIC_ZERO);
+ if (ISDIGIT (tmpb))
+ return -1;
+ return 0;
+ }
+ do
+ tmpb = *++b;
+ while (tmpb == NUMERIC_ZERO || IS_THOUSANDS_SEP (tmpb));
+
+ while (tmpa == tmpb && ISDIGIT (tmpa))
+ {
+ do
+ tmpa = *++a;
+ while (IS_THOUSANDS_SEP (tmpa));
+ do
+ tmpb = *++b;
+ while (IS_THOUSANDS_SEP (tmpb));
+ }
+
+ if ((tmpa == decimal_point && !ISDIGIT (tmpb))
+ || (tmpb == decimal_point && !ISDIGIT (tmpa)))
+ return -fraccompare (a, b);
+
+ tmp = tmpb - tmpa;
+
+ for (loga = 0; ISDIGIT (tmpa); ++loga)
+ do
+ tmpa = *++a;
+ while (IS_THOUSANDS_SEP (tmpa));
+
+ for (logb = 0; ISDIGIT (tmpb); ++logb)
+ do
+ tmpb = *++b;
+ while (IS_THOUSANDS_SEP (tmpb));
+
+ if (logb - loga != 0)
+ return logb - loga;
+
+ if (!loga)
+ return 0;
+
+ return tmp;
+ }
+ else if (tmpb == NEGATION_SIGN)
+ {
+ do
+ tmpb = *++b;
+ while (tmpb == NUMERIC_ZERO || IS_THOUSANDS_SEP (tmpb));
+ if (tmpb == decimal_point)
+ do
+ tmpb = *++b;
+ while (tmpb == NUMERIC_ZERO);
+ if (ISDIGIT (tmpb))
+ return 1;
+ while (tmpa == NUMERIC_ZERO || IS_THOUSANDS_SEP (tmpa))
+ tmpa = *++a;
+ if (tmpa == decimal_point)
+ do
+ tmpa = *++a;
+ while (tmpa == NUMERIC_ZERO);
+ if (ISDIGIT (tmpa))
+ return 1;
+ return 0;
+ }
+ else
+ {
+ while (tmpa == NUMERIC_ZERO || IS_THOUSANDS_SEP (tmpa))
+ tmpa = *++a;
+ while (tmpb == NUMERIC_ZERO || IS_THOUSANDS_SEP (tmpb))
+ tmpb = *++b;
+
+ while (tmpa == tmpb && ISDIGIT (tmpa))
+ {
+ do
+ tmpa = *++a;
+ while (IS_THOUSANDS_SEP (tmpa));
+ do
+ tmpb = *++b;
+ while (IS_THOUSANDS_SEP (tmpb));
+ }
+
+ if ((tmpa == decimal_point && !ISDIGIT (tmpb))
+ || (tmpb == decimal_point && !ISDIGIT (tmpa)))
+ return fraccompare (a, b);
+
+ tmp = tmpa - tmpb;
+
+ for (loga = 0; ISDIGIT (tmpa); ++loga)
+ do
+ tmpa = *++a;
+ while (IS_THOUSANDS_SEP (tmpa));
+
+ for (logb = 0; ISDIGIT (tmpb); ++logb)
+ do
+ tmpb = *++b;
+ while (IS_THOUSANDS_SEP (tmpb));
+
+ if (loga - logb != 0)
+ return loga - logb;
+
+ if (!loga)
+ return 0;
+
+ return tmp;
+ }
+}
+
/* Return <0 if the join field in LINE1 compares less than the one in LINE2;
>0 if it compares greater; 0 if it compares equal. */
@@ -321,7 +549,7 @@
keycmp (struct line *line1, struct line *line2)
{
/* Start of field to compare in each file. */
- const unsigned char *beg1, *beg2;
+ unsigned char *beg1, *beg2;
int len1, len2; /* Length of fields to compare. */
int diff;
@@ -356,8 +584,19 @@
/* Use an if-statement here rather than a function variable to
avoid portability hassles of getting a non-conflicting declaration
of memcmp. */
- if (ignore_case)
+ if (numeric)
{
+ unsigned char save1, save2;
+
+ save1 = beg1[len1];
+ save2 = beg2[len2];
+ beg1[len1] = beg2[len2] = '\0';
+ diff = numcompare(beg1, beg2);
+ beg1[len1] = save1;
+ beg2[len2] = save2;
+ }
+ else if (ignore_case)
+ {
/* FIXME: ignore_case does not work with NLS (in particular,
with multibyte chars). */
diff = memcasecmp (beg1, beg2, min (len1, len2));
@@ -738,6 +977,23 @@
#ifdef ENABLE_NLS
hard_LC_COLLATE = hard_locale (LC_COLLATE);
+
+ /* Let's get locale's representation of the decimal point */
+ {
+ struct lconv *lconvp = localeconv ();
+
+ /* If the locale doesn't define a decimal point, or if the decimal
+ point is multibyte, use the C decimal point. We don't support
+ multibyte decimal points yet. */
+ decimal_point = *lconvp->decimal_point;
+ if (! decimal_point || lconvp->decimal_point[1])
+ decimal_point = C_DECIMAL_POINT;
+
+ /* We don't support multibyte thousands separators yet. */
+ th_sep = *lconvp->thousands_sep;
+ if (! th_sep || lconvp->thousands_sep[1])
+ th_sep = CHAR_MAX + 1;
+ }
#endif
/* Initialize this before parsing options. In parsing options,
@@ -747,7 +1003,7 @@
nfiles = 0;
print_pairables = 1;
- while ((optc = getopt_long_only (argc, argv, "-a:e:i1:2:o:t:v:", longopts,
+ while ((optc = getopt_long_only (argc, argv, "-a:e:in1:2:o:t:v:", longopts,
NULL)) != -1)
{
long int val;
@@ -777,6 +1033,10 @@
case 'i':
ignore_case = 1;
+ break;
+
+ case 'n':
+ numeric = 1;
break;
case '1':