sort

David Laight Thu, 10 Sep 2009 15:02:56 -0700

Module Name:    src
Committed By:   dsl
Date:           Thu Sep 10 22:02:40 UTC 2009


Modified Files:
        src/usr.bin/sort: Makefile append.c fields.c files.c fsort.c init.c
            msort.c radix_sort.c sort.c sort.h

Log Message:
Save length of key instead of relying of the weight of the record sep.
This frees a byte value to use for 'end of key' (to correctly sort
short keys) while still having a weight assigned to the field sep.
(Unless -t is given, the field sep is in the field data.)
Do reverse sorts by writing the output file in reverse order (rather
than reversing the sort - apart from merges).
All key compares are now unweighted.
For 'sort -u' mark duplicates keys during the sort and don't write
to the output.
Use -S to mean a posix sort - where equal keys are sorted using the
raw record (rather than being kept in the original order).
For 'sort -f' (no keys) generate a key of the folded data (as for -n
-i and -d), simplifies the code and allows a 'posix' sort.


To generate a diff of this commit:
cvs rdiff -u -r1.7 -r1.8 src/usr.bin/sort/Makefile
cvs rdiff -u -r1.21 -r1.22 src/usr.bin/sort/append.c
cvs rdiff -u -r1.27 -r1.28 src/usr.bin/sort/fields.c src/usr.bin/sort/sort.h
cvs rdiff -u -r1.36 -r1.37 src/usr.bin/sort/files.c
cvs rdiff -u -r1.40 -r1.41 src/usr.bin/sort/fsort.c
cvs rdiff -u -r1.22 -r1.23 src/usr.bin/sort/init.c
cvs rdiff -u -r1.25 -r1.26 src/usr.bin/sort/msort.c
cvs rdiff -u -r1.2 -r1.3 src/usr.bin/sort/radix_sort.c
cvs rdiff -u -r1.54 -r1.55 src/usr.bin/sort/sort.c

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/usr.bin/sort/Makefile
diff -u src/usr.bin/sort/Makefile:1.7 src/usr.bin/sort/Makefile:1.8
--- src/usr.bin/sort/Makefile:1.7	Sat Sep  5 09:16:18 2009
+++ src/usr.bin/sort/Makefile	Thu Sep 10 22:02:40 2009
@@ -1,8 +1,11 @@
-#	$NetBSD: Makefile,v 1.7 2009/09/05 09:16:18 dsl Exp $
+#	$NetBSD: Makefile,v 1.8 2009/09/10 22:02:40 dsl Exp $
 #	from: @(#)Makefile	8.1 (Berkeley) 6/6/93
 
 PROG=	sort
 SRCS=	append.c fields.c files.c fsort.c init.c msort.c sort.c tmp.c
 SRCS+=	radix_sort.c
 
+LDADD+=-lutil
+DPADD+=${LIBUTIL}
+
 .include <bsd.prog.mk>

Index: src/usr.bin/sort/append.c
diff -u src/usr.bin/sort/append.c:1.21 src/usr.bin/sort/append.c:1.22
--- src/usr.bin/sort/append.c:1.21	Sat Sep  5 12:00:25 2009
+++ src/usr.bin/sort/append.c	Thu Sep 10 22:02:40 2009
@@ -1,4 +1,4 @@
-/*	$NetBSD: append.c,v 1.21 2009/09/05 12:00:25 dsl Exp $	*/
+/*	$NetBSD: append.c,v 1.22 2009/09/10 22:02:40 dsl Exp $	*/
 
 /*-
  * Copyright (c) 2000-2003 The NetBSD Foundation, Inc.
@@ -64,82 +64,34 @@
 #include "sort.h"
 
 #ifndef lint
-__RCSID("$NetBSD: append.c,v 1.21 2009/09/05 12:00:25 dsl Exp $");
+__RCSID("$NetBSD: append.c,v 1.22 2009/09/10 22:02:40 dsl Exp $");
 __SCCSID("@(#)append.c	8.1 (Berkeley) 6/6/93");
 #endif /* not lint */
 
 #include <stdlib.h>
-#include <string.h>
-
-static int
-wt_cmp(const u_char *a, const u_char *b, size_t len, u_char *wts)
-{
-    size_t i;
-
-    for (i = 0; i < len; i++) {
-	    if (wts[*a++] != wts[*b++])
-		return 1;
-    }
-
-    return 0;
-}
 
 /*
- * copy sorted lines to output; check for uniqueness
+ * copy sorted lines to output
+ * Ignore duplicates (marked with -ve keylen)
  */
 void
-append(const RECHEADER **keylist, int nelem, FILE *fp, put_func_t put, u_char *wts)
+append(RECHEADER **keylist, int nelem, FILE *fp, put_func_t put)
 {
-	const RECHEADER **cpos, **lastkey;
-	const RECHEADER *crec, *prec;
-	size_t plen;
+	RECHEADER **cpos, **lastkey;
+	RECHEADER *crec;
 
 	lastkey = keylist + nelem;
-	if (!UNIQUE || wts == NULL) {
-		for (cpos = keylist; cpos < lastkey; cpos++)
-			put(*cpos, fp);
-		return;
-	}
-
-	if (nelem == 0)
-		return;
-
-	cpos = keylist;
-	prec = *cpos;
-
-	if (!SINGL_FLD) {
-		/* Key for each line is already in adjacent bytes */
-		plen = prec->offset;
-		for (cpos = &keylist[1]; cpos < lastkey; cpos++) {
+	if (REVERSE) {
+		for (cpos = lastkey; cpos-- > keylist;) {
 			crec = *cpos;
-			if (crec->offset == plen
-			    && memcmp(crec->data, prec->data, plen) == 0) {
-				/* Duplicate key */
-				continue;
-			}
-			put(prec, fp);
-			prec = crec;
-			plen = prec->offset;
+			if (crec->keylen >= 0)
+				put(crec, fp);
 		}
-		put(prec, fp);
-		return;
-	}
-
-	/* We have to compare the raw data - which means applying weight */
-
-	/* Key for each line is already in adjacent bytes */
-	plen = prec->length;
-	for (cpos = &keylist[1]; cpos < lastkey; cpos++) {
-		crec = *cpos;
-		if (crec->length == plen
-		    && wt_cmp(crec->data, prec->data, plen, wts) == 0) {
-			/* Duplicate key */
-			continue;
+	} else {
+		for (cpos = keylist; cpos < lastkey; cpos++) {
+			crec = *cpos;
+			if (crec->keylen >= 0)
+				put(crec, fp);
 		}
-		put(prec, fp);
-		prec = crec;
-		plen = prec->length;
 	}
-	put(prec, fp);
-	return;
 }

Index: src/usr.bin/sort/fields.c
diff -u src/usr.bin/sort/fields.c:1.27 src/usr.bin/sort/fields.c:1.28
--- src/usr.bin/sort/fields.c:1.27	Sat Aug 22 21:28:55 2009
+++ src/usr.bin/sort/fields.c	Thu Sep 10 22:02:40 2009
@@ -1,4 +1,4 @@
-/*	$NetBSD: fields.c,v 1.27 2009/08/22 21:28:55 dsl Exp $	*/
+/*	$NetBSD: fields.c,v 1.28 2009/09/10 22:02:40 dsl Exp $	*/
 
 /*-
  * Copyright (c) 2000-2003 The NetBSD Foundation, Inc.
@@ -66,7 +66,7 @@
 #include "sort.h"
 
 #ifndef lint
-__RCSID("$NetBSD: fields.c,v 1.27 2009/08/22 21:28:55 dsl Exp $");
+__RCSID("$NetBSD: fields.c,v 1.28 2009/09/10 22:02:40 dsl Exp $");
 __SCCSID("@(#)fields.c	8.1 (Berkeley) 6/6/93");
 #endif /* not lint */
 
@@ -152,11 +152,19 @@
 		    fieldtable->flags)) == NULL)
 			return (1);
 	}
-	*keypos++ = 0;
 
 	keybuf->offset = keypos - keybuf->data;
 	keybuf->length = keybuf->offset + line_size;
 
+	/*
+	 * Posix requires that equal keys be further sorted by the
+	 * entire original record.
+	 * NetBSD has (at least for some time) kept equal keys in
+	 * their original order.
+	 * For 'sort -u' posix_sort is unset.
+	 */
+	keybuf->keylen = posix_sort ? keybuf->length : keybuf->offset;
+
 	memcpy(keypos, line_data, line_size);
 	return (0);
 }
@@ -209,33 +217,31 @@
 			*tablepos++ = lweight[*start];
 		}
 	}
-	/* Add extra byte to sort short keys correctly */
-	*tablepos++ = flags & R ? 255 : 1;
+	/* Add extra byte (absent from lweight) to sort short keys correctly */
+	*tablepos++ = lweight[REC_D];
 	return tablepos;
 }
 
 /*
  * Numbers are converted to a floating point format (exponent & mantissa)
  * so that they compare correctly as sequence of unsigned bytes.
- * The output cannot contain a 0x00 byte (the record separator).
- * Bytes 0x01 and 0xff are used to terminate positive and negative numbers
+ * Bytes 0x00 and 0xff are used to terminate positive and negative numbers
  * to ensure that 0.123 sorts after 0.12 and -0.123 sorts before -0.12.
  *
  * The first byte contain the overall sign, exponent sign and some of the
  * exponent. These have to be ordered (-ve value, decreasing exponent),
  * zero, (+ve value, increasing exponent).
- * After excluding 0, 1, 0xff and 0x80 (used for zero) there are 61
- * exponent values available, this isn't quite enough and the highest
- * values are used to encode large exponents in multiple bytes.
  *
- * An exponent of zero has value 0xc0 for +ve numbers and 0x40 for -ves.
+ * The first byte is 0x80 for zero, 0x40 for -ve with exponent 0 and
+ * 0xc0 for +ve with exponent zero.
+ * This only leaves 62 byte values for +ve exponents - which isn't enough.
+ * The largest 5 exponent values are used to hold a byte count of the
+ * number of following bytes that contain 7 exponent bits per byte,
  *
  * The mantissa is stored 2 digits per byte offset by 0x40, for negative
  * numbers the order must be reversed (they are subtracted from 0x100).
  *
  * Reverse sorts are done by inverting the sign of the number.
- *
- * We don't have to worry about REC_D, the key is terminated by 0x00.
  */
 
 #define SIGNED(reverse, value) ((reverse) ? 0x100 - (value) : (value))
@@ -298,16 +304,14 @@
 
 	/* Maybe here we should allow for e+12 (etc) */
 
-	/* exponent 0 is 0xc0 for +ve numbers and 0x40 for -ve ones */
-	exponent += 0xc0;
-
-	if (exponent > 0x80 + MAX_EXP_ENC && exponent < 0x100 - MAX_EXP_ENC) {
+	if (exponent < 0x3f - MAX_EXP_ENC && -exponent < 0x3f - MAX_EXP_ENC) {
 		/* Value ok for simple encoding */
+		/* exponent 0 is 0xc0 for +ve numbers and 0x40 for -ve ones */
+		exponent += 0xc0;
 		*pos++ = SIGNED(reverse, exponent);
 	} else {
 		/* Out or range for a single byte */
 		int c, t;
-		exponent -= 0xc0;
 		t = exponent > 0 ? exponent : -exponent;
 		/* Count how many 7-bit blocks are needed */
 		for (c = 0; ; c++) {
Index: src/usr.bin/sort/sort.h
diff -u src/usr.bin/sort/sort.h:1.27 src/usr.bin/sort/sort.h:1.28
--- src/usr.bin/sort/sort.h:1.27	Sat Sep  5 12:00:25 2009
+++ src/usr.bin/sort/sort.h	Thu Sep 10 22:02:40 2009
@@ -1,4 +1,4 @@
-/*	$NetBSD: sort.h,v 1.27 2009/09/05 12:00:25 dsl Exp $	*/
+/*	$NetBSD: sort.h,v 1.28 2009/09/10 22:02:40 dsl Exp $	*/
 
 /*-
  * Copyright (c) 2000-2003 The NetBSD Foundation, Inc.
@@ -104,16 +104,20 @@
 		 err(2, NULL);						\
 }
 
-/* length of record is currently limited to maximum string length (size_t) */
-typedef size_t length_t;
+/* Records are limited to MAXBUFSIZE (8MB) and less if you want to sort
+ * in a sane way.
+ * Anyone who wants to sort data records longer than 2GB definitely needs a
+ * different program! */
+typedef unsigned int length_t;
 
 /* A record is a key/line pair starting at rec.data. It has a total length
  * and an offset to the start of the line half of the pair.
  */
 typedef struct recheader {
-	length_t length;
-	length_t offset;
-	u_char data[1];
+	length_t length;	/* total length of key and line */
+	length_t offset;	/* to line */
+	int      keylen;	/* length of key */
+	u_char   data[];	/* key then line */
 } RECHEADER;
 
 /* This is the column as seen by struct field.  It is used by enterfield.
@@ -161,18 +165,17 @@
 extern u_char ascii[NBINS], Rascii[NBINS], Ftable[NBINS], RFtable[NBINS];
 extern u_char *const weight_tables[4];   /* ascii, Rascii, Ftable, RFtable */
 extern u_char d_mask[NBINS];
-extern int SINGL_FLD, SEP_FLAG, UNIQUE;
+extern int SINGL_FLD, SEP_FLAG, UNIQUE, REVERSE;
+extern int posix_sort;
 extern int REC_D;
 extern const char *tmpdir;
-extern u_char unweighted[NBINS];
 extern struct coldesc *clist;
 extern int ncols;
 
 #define DEBUG(ch) (debug_flags & (1 << ((ch) & 31)))
 extern unsigned int debug_flags;
 
-void	 append(const RECHEADER **, int, FILE *,
-	    void (*)(const RECHEADER *, FILE *), u_char *);
+void	 append(RECHEADER **, int, FILE *, void (*)(const RECHEADER *, FILE *));
 void	 concat(FILE *, FILE *);
 length_t enterkey(RECHEADER *, const u_char *, u_char *, size_t, struct field *);
 void	 fixit(int *, char **);
@@ -193,6 +196,6 @@
 void	 putrec(const RECHEADER *, FILE *);
 void	 putkeydump(const RECHEADER *, FILE *);
 void	 rd_append(int, int, int, FILE *, u_char *, u_char *);
-int	 radix_sort(const RECHEADER **, const RECHEADER **, int, const u_char *, u_int);
+void	 radix_sort(RECHEADER **, RECHEADER **, int);
 int	 setfield(const char *, struct field *, int);
 void	 settables(void);

Index: src/usr.bin/sort/files.c
diff -u src/usr.bin/sort/files.c:1.36 src/usr.bin/sort/files.c:1.37
--- src/usr.bin/sort/files.c:1.36	Sat Sep  5 12:00:25 2009
+++ src/usr.bin/sort/files.c	Thu Sep 10 22:02:40 2009
@@ -1,4 +1,4 @@
-/*	$NetBSD: files.c,v 1.36 2009/09/05 12:00:25 dsl Exp $	*/
+/*	$NetBSD: files.c,v 1.37 2009/09/10 22:02:40 dsl Exp $	*/
 
 /*-
  * Copyright (c) 2000-2003 The NetBSD Foundation, Inc.
@@ -65,7 +65,7 @@
 #include "fsort.h"
 
 #ifndef lint
-__RCSID("$NetBSD: files.c,v 1.36 2009/09/05 12:00:25 dsl Exp $");
+__RCSID("$NetBSD: files.c,v 1.37 2009/09/10 22:02:40 dsl Exp $");
 __SCCSID("@(#)files.c	8.1 (Berkeley) 6/6/93");
 #endif /* not lint */
 
@@ -123,6 +123,7 @@
 			if (c == REC_D) {
 				recbuf->offset = 0;
 				recbuf->length = pos - recbuf->data;
+				recbuf->keylen = recbuf->length - 1;
 				return (0);
 			}
 		}
@@ -138,6 +139,7 @@
 				*pos++ = REC_D;
 				recbuf->offset = 0;
 				recbuf->length = pos - recbuf->data;
+				recbuf->keylen = recbuf->length - 1;
 				return (0);
 			}
 			FCLOSE(fp);
@@ -154,6 +156,7 @@
 
 			recbuf->offset = 0;
 			recbuf->length = 0;
+			recbuf->keylen = 0;
 
 			return (BUFFEND);
 		}

Index: src/usr.bin/sort/fsort.c
diff -u src/usr.bin/sort/fsort.c:1.40 src/usr.bin/sort/fsort.c:1.41
--- src/usr.bin/sort/fsort.c:1.40	Sat Sep  5 12:00:25 2009
+++ src/usr.bin/sort/fsort.c	Thu Sep 10 22:02:40 2009
@@ -1,4 +1,4 @@
-/*	$NetBSD: fsort.c,v 1.40 2009/09/05 12:00:25 dsl Exp $	*/
+/*	$NetBSD: fsort.c,v 1.41 2009/09/10 22:02:40 dsl Exp $	*/
 
 /*-
  * Copyright (c) 2000-2003 The NetBSD Foundation, Inc.
@@ -72,7 +72,7 @@
 #include "fsort.h"
 
 #ifndef lint
-__RCSID("$NetBSD: fsort.c,v 1.40 2009/09/05 12:00:25 dsl Exp $");
+__RCSID("$NetBSD: fsort.c,v 1.41 2009/09/10 22:02:40 dsl Exp $");
 __SCCSID("@(#)fsort.c	8.1 (Berkeley) 6/6/93");
 #endif /* not lint */
 
@@ -86,8 +86,8 @@
 void
 fsort(struct filelist *filelist, int nfiles, FILE *outfp, struct field *ftbl)
 {
-	const RECHEADER **keylist;
-	const RECHEADER **keypos, **keyp;
+	RECHEADER **keylist;
+	RECHEADER **keypos, **keyp;
 	RECHEADER *buffer;
 	size_t bufsize = DEFBUFSIZE;
 	u_char *bufend;
@@ -158,25 +158,19 @@
 		}
 
 		/* Sort this set of records */
-		if (SINGL_FLD) {
-			nelem = radix_sort(keylist, keylist + MAXNUM, nelem,
-			    ftbl[0].weights, REC_D);
-		} else {
-			nelem = radix_sort(keylist, keylist + MAXNUM, nelem,
-			    unweighted, 0);
-		}
+		radix_sort(keylist, keylist + MAXNUM, nelem);
 
 		if (c == EOF && mfct == 0) {
 			/* all the data is (sorted) in the buffer */
 			append(keylist, nelem, outfp,
-			    DEBUG('k') ? putkeydump : putline, ftbl->weights);
+			    DEBUG('k') ? putkeydump : putline);
 			break;
 		}
 
 		/* Save current data to a temporary file for a later merge */
 		fp = ftmp();
 		fstack[mfct].fp = fp;
-		append(keylist, nelem, fp, putrec, NULL);
+		append(keylist, nelem, fp, putrec);
 		mfct++;
 
 		if (c == EOF) {

Index: src/usr.bin/sort/init.c
diff -u src/usr.bin/sort/init.c:1.22 src/usr.bin/sort/init.c:1.23
--- src/usr.bin/sort/init.c:1.22	Sat Sep  5 09:16:18 2009
+++ src/usr.bin/sort/init.c	Thu Sep 10 22:02:40 2009
@@ -1,4 +1,4 @@
-/*	$NetBSD: init.c,v 1.22 2009/09/05 09:16:18 dsl Exp $	*/
+/*	$NetBSD: init.c,v 1.23 2009/09/10 22:02:40 dsl Exp $	*/
 
 /*-
  * Copyright (c) 2000-2003 The NetBSD Foundation, Inc.
@@ -64,7 +64,7 @@
 #include "sort.h"
 
 #ifndef lint
-__RCSID("$NetBSD: init.c,v 1.22 2009/09/05 09:16:18 dsl Exp $");
+__RCSID("$NetBSD: init.c,v 1.23 2009/09/10 22:02:40 dsl Exp $");
 __SCCSID("@(#)init.c	8.1 (Berkeley) 6/6/93");
 #endif /* not lint */
 
@@ -72,7 +72,7 @@
 #include <string.h>
 
 static void insertcol(struct field *);
-static const char *setcolumn(const char *, struct field *, int);
+static const char *setcolumn(const char *, struct field *);
 
 /*
  * masks of ignored characters.
@@ -152,7 +152,7 @@
  * interprets a column in a -k field
  */
 static const char *
-setcolumn(const char *pos, struct field *cur_fld, int gflag)
+setcolumn(const char *pos, struct field *cur_fld)
 {
 	struct column *col;
 	char *npos;
@@ -183,27 +183,27 @@
 int
 setfield(const char *pos, struct field *cur_fld, int gflag)
 {
-	int tmp;
-
 	cur_fld->mask = NULL;
 
-	pos = setcolumn(pos, cur_fld, gflag);
+	pos = setcolumn(pos, cur_fld);
 	if (*pos == '\0')			/* key extends to EOL. */
 		cur_fld->tcol.num = 0;
 	else {
 		if (*pos != ',')
 			errx(2, "illegal field descriptor");
-		setcolumn((++pos), cur_fld, gflag);
+		setcolumn((++pos), cur_fld);
 	}
 	if (!cur_fld->flags)
 		cur_fld->flags = gflag;
-	tmp = cur_fld->flags;
+	if (REVERSE)
+		/* A local 'r' doesn't invert the global one */
+		cur_fld->flags &= ~R;
 
 	/* Assign appropriate mask table and weight table. */
-	cur_fld->weights = weight_tables[tmp & (R | F)];
-	if (tmp & I)
+	cur_fld->weights = weight_tables[cur_fld->flags & (R | F)];
+	if (cur_fld->flags & I)
 		cur_fld->mask = itable;
-	else if (tmp & D)
+	else if (cur_fld->flags & D)
 		cur_fld->mask = dtable;
 
 	cur_fld->flags |= (gflag & (BI | BT));
@@ -217,6 +217,7 @@
 		    && cur_fld->tcol.indent != 0
 		    && cur_fld->tcol.indent < cur_fld->icol.indent))
 		errx(2, "fields out of order");
+
 	insertcol(cur_fld);
 	return (cur_fld->tcol.num);
 }
@@ -319,18 +320,27 @@
  * Convert 'ascii' characters into their sort order.
  * The 'F' variants fold lower case to upper equivalent
  * The 'R' variants are for reverse sorting.
- * The record separator (REC_D) always maps to 0.
- * One is reserved for field separators (added when key is generated)
- * The field separator (from -t<ch>) map to 1 or 255 (unless SINGL_FLD)
+ *
+ * The record separator (REC_D) never needs a weight, this frees one
+ * byte value as an 'end of key' marker. This must be 0 for normal
+ * weight tables, and 0xff for reverse weight tables - and is used
+ * to terminate keys so that short keys sort before (after if reverse)
+ * longer keys.
+ *
+ * The field separator has a normal weight - although it cannot occur
+ * within a key unless it is the default (space+tab).
+ *
  * All other bytes map to the appropriate value for the sort order.
  * Numeric sorts don't need any tables, they are reversed by negation.
  *
+ * Global reverse sorts are done by writing the sorted keys in reverse
+ * order - the sort itself is stil forwards.
+ * This means that weights are only ever used when generating keys, any
+ * sort of the original data bytes is always forwards and unweighted.
+ *
  * Note: this is only good for ASCII sorting.  For different LC 's,
  * all bets are off.
  *
- * If SINGL_FLD then the weights have to be applied during the actual sort.
- * Otherwise they are applied when the key bytes are built.
- *
  * itable[] and dtable[] are the masks for -i (ignore non-printables)
  * and -d (only sort blank and alphanumerics).
  */
@@ -338,21 +348,17 @@
 settables(void)
 {
 	int i;
-	int next_weight = SINGL_FLD ? 1 : 2;
-	int rev_weight = SINGL_FLD ? 255 : 254;
-	int had_field_sep = SINGL_FLD;
+	int next_weight = 1;
+	int rev_weight = 254;
+
+	ascii[REC_D] = 0;
+	Rascii[REC_D] = 255;
+	Ftable[REC_D] = 0;
+	RFtable[REC_D] = 255;
 
 	for (i = 0; i < 256; i++) {
-		unweighted[i] = i;
-		if (d_mask[i] & REC_D_F)
+		if (i == REC_D)
 			continue;
-		if (!had_field_sep && d_mask[i] & FLD_D) {
-			/* First/only separator sorts before any data */
-			ascii[i] = 1;
-			Rascii[i] = 255;
-			had_field_sep = 1;
-			continue;
-		}
 		ascii[i] = next_weight;
 		Rascii[i] = rev_weight;
 		if (Ftable[i] == 0) {

Index: src/usr.bin/sort/msort.c
diff -u src/usr.bin/sort/msort.c:1.25 src/usr.bin/sort/msort.c:1.26
--- src/usr.bin/sort/msort.c:1.25	Sat Sep  5 12:00:25 2009
+++ src/usr.bin/sort/msort.c	Thu Sep 10 22:02:40 2009
@@ -1,4 +1,4 @@
-/*	$NetBSD: msort.c,v 1.25 2009/09/05 12:00:25 dsl Exp $	*/
+/*	$NetBSD: msort.c,v 1.26 2009/09/10 22:02:40 dsl Exp $	*/
 
 /*-
  * Copyright (c) 2000-2003 The NetBSD Foundation, Inc.
@@ -65,7 +65,7 @@
 #include "fsort.h"
 
 #ifndef lint
-__RCSID("$NetBSD: msort.c,v 1.25 2009/09/05 12:00:25 dsl Exp $");
+__RCSID("$NetBSD: msort.c,v 1.26 2009/09/10 22:02:40 dsl Exp $");
 __SCCSID("@(#)msort.c	8.1 (Berkeley) 6/6/93");
 #endif /* not lint */
 
@@ -78,12 +78,10 @@
 
 typedef struct mfile {
 	u_char *end;
-	short flno;
+	int flno;
 	RECHEADER rec[1];
 } MFILE;
 
-static u_char *wts;
-
 static int cmp(RECHEADER *, RECHEADER *);
 static int insert(struct mfile **, struct mfile **, int, int);
 static void merge(int, int, get_func_t, FILE *, put_func_t, struct field *);
@@ -96,8 +94,6 @@
 	int i, j, last;
 	put_func_t put;
 
-	wts = ftbl->weights;
-
 	while (nfiles) {
 		put = putrec;
 		for (j = 0; j < nfiles; j += MERGE_FNUM) {
@@ -278,6 +274,8 @@
 			 * comes here).
 			 */
 			cmpv = tmprec->flno - flist[mid]->flno;
+			if (REVERSE)
+				cmpv = -cmpv;
 		}
 		if (cmpv < 0)
 			top = mid;
@@ -332,7 +330,6 @@
 	crec_end = crec->data + DEFLLEN;
 	prec = malloc(offsetof(RECHEADER, data[DEFLLEN]));
 	prec_end = prec->data + DEFLLEN;
-	wts = ftbl->weights;
 
 	/* XXX this does exit(0) for overlong lines */
 	if (get(-1, 0, filelist, 1, prec, prec_end, ftbl) != 0)
@@ -365,27 +362,15 @@
 static int
 cmp(RECHEADER *rec1, RECHEADER *rec2)
 {
+	int len;
 	int r;
-	size_t len, i;
-	u_char *pos1, *pos2;
-	u_char *cwts;
-
-	if (!SINGL_FLD)
-		/* key is weights, and is 0x00 terminated */
-		return memcmp(rec1->data, rec2->data, rec1->offset);
-
-	/* We have to apply the weights ourselves */
-	cwts = wts;
-
-	pos1 = rec1->data;
-	pos2 = rec2->data;
-	len = rec1->length;
-
-	for (i = 0; i < len; i++) {
-		r = cwts[pos1[i]] - cwts[pos2[i]];
-		if (r)
-			return r;
-	}
 
-	return (0);
+	/* key is weights */
+	len = min(rec1->keylen, rec2->keylen);
+	r = memcmp(rec1->data, rec2->data, len);
+	if (r == 0)
+		r = rec1->keylen - rec2->keylen;
+	if (REVERSE)
+		r = -r;
+	return r;
 }

Index: src/usr.bin/sort/radix_sort.c
diff -u src/usr.bin/sort/radix_sort.c:1.2 src/usr.bin/sort/radix_sort.c:1.3
--- src/usr.bin/sort/radix_sort.c:1.2	Sat Sep  5 12:00:25 2009
+++ src/usr.bin/sort/radix_sort.c	Thu Sep 10 22:02:40 2009
@@ -1,4 +1,4 @@
-/*	$NetBSD: radix_sort.c,v 1.2 2009/09/05 12:00:25 dsl Exp $	*/
+/*	$NetBSD: radix_sort.c,v 1.3 2009/09/10 22:02:40 dsl Exp $	*/
 
 /*-
  * Copyright (c) 1990, 1993
@@ -37,7 +37,7 @@
 #if 0
 static char sccsid[] = "@(#)radixsort.c	8.2 (Berkeley) 4/28/95";
 #else
-__RCSID("$NetBSD: radix_sort.c,v 1.2 2009/09/05 12:00:25 dsl Exp $");
+__RCSID("$NetBSD: radix_sort.c,v 1.3 2009/09/10 22:02:40 dsl Exp $");
 #endif
 #endif /* LIBC_SCCS and not lint */
 
@@ -49,82 +49,96 @@
 
 #include <assert.h>
 #include <errno.h>
+#include <util.h>
 #include "sort.h"
 
 typedef struct {
-	const RECHEADER **sa;	/* Base of saved area */
-	int sn;				/* Number of entries */
-	int si;				/* index into data for compare */
+	RECHEADER **sa;		/* Base of saved area */
+	int sn;			/* Number of entries */
+	int si;			/* index into data for compare */
 } stack;
 
-static inline int simplesort(const RECHEADER **, int, int, const u_char *, u_int);
-static int r_sort_b(const RECHEADER **,
-	    const RECHEADER **, int, int, const u_char *, u_int);
+static void simplesort(RECHEADER **, int, int);
 
 #define	THRESHOLD	20		/* Divert to simplesort(). */
-#define	SIZE		512		/* Default stack size. */
 
 #define empty(s)	(s >= sp)
 #define pop(a, n, i)	a = (--sp)->sa, n = sp->sn, i = sp->si
 #define push(a, n, i)	sp->sa = a, sp->sn = n, (sp++)->si = i
 #define swap(a, b, t)	t = a, a = b, b = t
 
-int
-radix_sort(const RECHEADER **a, const RECHEADER **ta, int n, const u_char *tab, u_int endch)
+void
+radix_sort(RECHEADER **a, RECHEADER **ta, int n)
 {
-	endch = tab[endch];
+	u_int count[256], nc, bmin;
+	u_int c;
+	RECHEADER **ak, **tai, **lim;
+	RECHEADER *hdr;
+	int stack_size = 512;
+	stack *s, *sp, *sp0, *sp1, temp;
+	RECHEADER **top[256];
+	u_int *cp, bigc;
+	int data_index = 0;
+
 	if (n < THRESHOLD && !DEBUG('r')) {
-		return simplesort(a, n, 0, tab, endch);
+		simplesort(a, n, 0);
+		return;
 	}
-	return r_sort_b(a, ta, n, 0, tab, endch);
-}
 
-static int
-r_sort_b(const RECHEADER **a, const RECHEADER **ta, int n, int i, const u_char *tr, u_int endch)
-{
-	static u_int count[256], nc, bmin;
-	u_int c;
-	const RECHEADER **ak, **ai;
-	stack s[512], *sp, *sp0, *sp1, temp;
-	const RECHEADER **top[256];
-	u_int *cp, bigc;
-	int nrec = n;
+	s = emalloc(stack_size * sizeof *s);
+	memset(&count, 0, sizeof count);
+	/* Technically 'top' doesn't need zeroing */
+	memset(&top, 0, sizeof top);
 
 	sp = s;
-	push(a, n, i);
+	push(a, n, data_index);
 	while (!empty(s)) {
-		pop(a, n, i);
+		pop(a, n, data_index);
 		if (n < THRESHOLD && !DEBUG('r')) {
-			simplesort(a, n, i, tr, endch);
+			simplesort(a, n, data_index);
 			continue;
 		}
 
-		if (nc == 0) {
-			bmin = 255;
-			for (ak = a + n; --ak >= a;) {
-				c = tr[(*ak)->data[i]];
-				if (++count[c] == 1 && c != endch) {
-					if (c < bmin)
-						bmin = c;
-					nc++;
-				}
-			}
-			if (sp + nc > s + SIZE) {
-				r_sort_b(a, ta, n, i, tr, endch);
+		/* Count number of times each 'next byte' occurs */
+		nc = 0;
+		bmin = 255;
+		lim = a + n;
+		for (ak = a, tai = ta; ak < lim; ak++) {
+			hdr = *ak;
+			if (data_index >= hdr->keylen) {
+				/* Short key, copy to start of output */
+				if (UNIQUE && a != sp->sa)
+					/* Stop duplicate being written out */
+					hdr->keylen = -1;
+				*a++ = hdr;
+				n--;
 				continue;
 			}
+			/* Save in temp buffer for distribute */
+			*tai++ = hdr;
+			c = hdr->data[data_index];
+			if (++count[c] == 1) {
+				if (c < bmin)
+					bmin = c;
+				nc++;
+			}
+		}
+		/*
+		 * We need save the bounds for each 'next byte' that
+		 * occurs more so we can sort each block.
+		 */
+		if (sp + nc > s + stack_size) {
+			stack_size *= 2;
+			sp1 = erealloc(s, stack_size * sizeof *s);
+			sp = sp1 + (sp - s);
+			s = sp1;
 		}
 
+		/* Minor optimisation to do the largest set last */
 		sp0 = sp1 = sp;
 		bigc = 2;
-		if (endch == 0) {
-			top[0] = ak = a + count[0];
-			count[0] = 0;
-		} else {
-			ak = a;
-			top[255] = a + n;
-			count[255] = 0;
-		}
+		/* Convert 'counts' positions, saving bounds for later sorts */
+		ak = a;
 		for (cp = count + bmin; nc > 0; cp++) {
 			while (*cp == 0)
 				cp++;
@@ -133,48 +147,69 @@
 					bigc = c;
 					sp1 = sp;
 				}
-				push(ak, c, i+1);
+				push(ak, c, data_index+1);
 			}
-			top[cp-count] = ak += c;
+			ak += c;
+			top[cp-count] = ak;
 			*cp = 0;			/* Reset count[]. */
 			nc--;
 		}
 		swap(*sp0, *sp1, temp);
 
-		for (ak = ta + n, ai = a+n; ak > ta;)	/* Copy to temp. */
-			*--ak = *--ai;
 		for (ak = ta+n; --ak >= ta;)		/* Deal to piles. */
-			*--top[tr[(*ak)->data[i]]] = *ak;
+			*--top[(*ak)->data[data_index]] = *ak;
 	}
 
-	return nrec;
+	free(s);
 }
 
-/* insertion sort */
-static inline int
-simplesort(const RECHEADER **a, int n, int b, const u_char *tr, u_int endch)
+/* insertion sort, short records are sorted before long ones */
+static void
+simplesort(RECHEADER **a, int n, int data_index)
 {
-	const RECHEADER **ak, **ai, *tmp;
-	const RECHEADER **lim = a + n;
+	RECHEADER **ak, **ai;
+	RECHEADER *akh;
+	RECHEADER **lim = a + n;
 	const u_char *s, *t;
-	u_char ch;
+	int s_len, t_len;
+	int i;
+	int r;
 
 	if (n <= 1)
-		return n;
+		return;
 
 	for (ak = a+1; ak < lim; ak++) {
-		for (ai = ak; ai > a; ai--) {
-			for (s = ai[0]->data + b, t = ai[-1]->data + b;
-			    (ch = tr[*s]) != endch; s++, t++)
-				if (ch != tr[*t])
+		akh = *ak;
+		s = akh->data;
+		s_len = akh->keylen;
+		for (ai = ak; ;) {
+			ai--;
+			t = (*ai)->data;
+			t_len = (*ai)->keylen;
+			for (i = data_index; ; i++) {
+				if (i >= s_len || i >= t_len) {
+					r = s_len - t_len;
 					break;
-			if (ch >= tr[*t]) {
-
+				}
+				r =  s[i]  - t[i];
+				if (r != 0)
+					break;
+			}
+			if (r >= 0) {
+				if (r == 0 && UNIQUE) {
+					/* Put record below existing */
+					ai[1] = ai[0];
+					/* Mark so ignored by output() */
+					akh->keylen = -1;
+				} else {
+					ai++;
+				}
 				break;
 			}
-			swap(ai[0], ai[-1], tmp);
+			ai[1] = ai[0];
+			if (ai == a)
+				break;
 		}
+		ai[0] = akh;
 	}
-
-	return n;
 }

Index: src/usr.bin/sort/sort.c
diff -u src/usr.bin/sort/sort.c:1.54 src/usr.bin/sort/sort.c:1.55
--- src/usr.bin/sort/sort.c:1.54	Sat Sep  5 09:16:18 2009
+++ src/usr.bin/sort/sort.c	Thu Sep 10 22:02:40 2009
@@ -1,4 +1,4 @@
-/*	$NetBSD: sort.c,v 1.54 2009/09/05 09:16:18 dsl Exp $	*/
+/*	$NetBSD: sort.c,v 1.55 2009/09/10 22:02:40 dsl Exp $	*/
 
 /*-
  * Copyright (c) 2000-2003 The NetBSD Foundation, Inc.
@@ -66,6 +66,7 @@
  * a choice of merge sort and radix sort for external sorting.
  */
 
+#include <util.h>
 #include "sort.h"
 #include "fsort.h"
 #include "pathnames.h"
@@ -76,7 +77,7 @@
 #endif /* not lint */
 
 #ifndef lint
-__RCSID("$NetBSD: sort.c,v 1.54 2009/09/05 09:16:18 dsl Exp $");
+__RCSID("$NetBSD: sort.c,v 1.55 2009/09/10 22:02:40 dsl Exp $");
 __SCCSID("@(#)sort.c	8.1 (Berkeley) 6/6/93");
 #endif /* not lint */
 
@@ -100,8 +101,10 @@
  */
 u_char *const weight_tables[4] = { ascii, Rascii, Ftable, RFtable };
 u_char ascii[NBINS], Rascii[NBINS], RFtable[NBINS], Ftable[NBINS];
-u_char unweighted[NBINS];
+
 int SINGL_FLD = 0, SEP_FLAG = 0, UNIQUE = 0;
+int REVERSE = 0;
+int posix_sort;
 
 unsigned int debug_flags = 0;
 
@@ -122,7 +125,7 @@
 	int ch, i, stdinflag = 0;
 	char cflag = 0, mflag = 0;
 	char *outfile, *outpath = 0;
-	struct field *fldtab, *p;
+	struct field *fldtab;
 	size_t fldtab_sz, fld_cnt;
 	struct filelist filelist;
 	int num_input_files;
@@ -145,7 +148,7 @@
 	/* fldtab[0] is the global options. */
 	fldtab_sz = 3;
 	fld_cnt = 0;
-	fldtab = malloc(fldtab_sz * sizeof(*fldtab));
+	fldtab = emalloc(fldtab_sz * sizeof(*fldtab));
 	memset(fldtab, 0, fldtab_sz * sizeof(*fldtab));
 
 	/* Convert "+field" args to -f format */
@@ -166,7 +169,7 @@
 			for (i = 0; optarg[i]; i++)
 			    debug_flags |= 1 << (optarg[i] & 31);
 			break;
-		case 'd': case 'f': case 'i': case 'n': case 'r':
+		case 'd': case 'f': case 'i': case 'n':
 			fldtab[0].flags |= optval(ch, 0);
 			break;
 		case 'H':
@@ -174,10 +177,7 @@
 			/* That is now the default. */
 			break;
 		case 'k':
-			p = realloc(fldtab, (fldtab_sz + 1) * sizeof(*fldtab));
-			if (!p)
-				err(1, "realloc");
-			fldtab = p;
+			fldtab = erealloc(fldtab, (fldtab_sz + 1) * sizeof(*fldtab));
 			memset(&fldtab[fldtab_sz], 0, sizeof(fldtab[0]));
 			fldtab_sz++;
 
@@ -189,12 +189,16 @@
 		case 'o':
 			outpath = optarg;
 			break;
+		case 'r':
+			REVERSE = 1;
+			break;
 		case 's':
 			/*
 			 * Nominally 'stable sort', keep lines with equal keys
 			 * in input file order. (Default for NetBSD)
 			 * (-s for GNU sort compatibility.)
 			 */
+			posix_sort = 0;
 			break;
 		case 'S':
 			/*
@@ -205,6 +209,7 @@
 			 * (using libc radixsort() v sradixsort() doesn't
 			 * have the desired effect.)
 			 */
+			posix_sort = 1;
 			break;
 		case 't':
 			if (SEP_FLAG)
@@ -249,6 +254,10 @@
 		}
 	}
 
+	if (UNIQUE)
+		/* Don't sort on raw record if keys match */
+		posix_sort = 0;
+
 	if (cflag && argc > optind+1)
 		errx(2, "too many input files for -c option");
 	if (argc - 2 > optind && !strcmp(argv[argc-2], "-o")) {
@@ -277,17 +286,22 @@
 			err(2, "%s", argv[i]);
 	}
 
-	if (!(fldtab[0].flags & (I|D|N) || fldtab[1].icol.num)) {
-		SINGL_FLD = 1;
-		fldtab[0].icol.num = 1;
-		fldtab[0].weights = weight_tables[fldtab->flags & (R | F)];
-	} else {
-		if (!fldtab[1].icol.num) {
+	if (fldtab[1].icol.num == 0) {
+		/* No sort key specified */
+		if (fldtab[0].flags & (I|D|F|N)) {
+			/* Modified - generate a key that covers the line */
 			fldtab[0].flags &= ~(BI|BT);
 			setfield("1", &fldtab[++fld_cnt], fldtab->flags);
+			fldreset(fldtab);
+		} else {
+			/* Unmodified, just compare the line */
+			SINGL_FLD = 1;
+			fldtab[0].icol.num = 1;
 		}
+	} else {
 		fldreset(fldtab);
 	}
+
 	settables();
 
 	if (optind == argc) {

CVS commit: src/usr.bin/sort

Reply via email to