Module Name: src
Committed By: dsl
Date: Thu Sep 10 22:02:40 UTC 2009
Modified Files:
src/usr.bin/sort: Makefile append.c fields.c files.c fsort.c init.c
msort.c radix_sort.c sort.c sort.h
Log Message:
Save length of key instead of relying of the weight of the record sep.
This frees a byte value to use for 'end of key' (to correctly sort
short keys) while still having a weight assigned to the field sep.
(Unless -t is given, the field sep is in the field data.)
Do reverse sorts by writing the output file in reverse order (rather
than reversing the sort - apart from merges).
All key compares are now unweighted.
For 'sort -u' mark duplicates keys during the sort and don't write
to the output.
Use -S to mean a posix sort - where equal keys are sorted using the
raw record (rather than being kept in the original order).
For 'sort -f' (no keys) generate a key of the folded data (as for -n
-i and -d), simplifies the code and allows a 'posix' sort.
To generate a diff of this commit:
cvs rdiff -u -r1.7 -r1.8 src/usr.bin/sort/Makefile
cvs rdiff -u -r1.21 -r1.22 src/usr.bin/sort/append.c
cvs rdiff -u -r1.27 -r1.28 src/usr.bin/sort/fields.c src/usr.bin/sort/sort.h
cvs rdiff -u -r1.36 -r1.37 src/usr.bin/sort/files.c
cvs rdiff -u -r1.40 -r1.41 src/usr.bin/sort/fsort.c
cvs rdiff -u -r1.22 -r1.23 src/usr.bin/sort/init.c
cvs rdiff -u -r1.25 -r1.26 src/usr.bin/sort/msort.c
cvs rdiff -u -r1.2 -r1.3 src/usr.bin/sort/radix_sort.c
cvs rdiff -u -r1.54 -r1.55 src/usr.bin/sort/sort.c
Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.
Modified files:
Index: src/usr.bin/sort/Makefile
diff -u src/usr.bin/sort/Makefile:1.7 src/usr.bin/sort/Makefile:1.8
--- src/usr.bin/sort/Makefile:1.7 Sat Sep 5 09:16:18 2009
+++ src/usr.bin/sort/Makefile Thu Sep 10 22:02:40 2009
@@ -1,8 +1,11 @@
-# $NetBSD: Makefile,v 1.7 2009/09/05 09:16:18 dsl Exp $
+# $NetBSD: Makefile,v 1.8 2009/09/10 22:02:40 dsl Exp $
# from: @(#)Makefile 8.1 (Berkeley) 6/6/93
PROG= sort
SRCS= append.c fields.c files.c fsort.c init.c msort.c sort.c tmp.c
SRCS+= radix_sort.c
+LDADD+=-lutil
+DPADD+=${LIBUTIL}
+
.include <bsd.prog.mk>
Index: src/usr.bin/sort/append.c
diff -u src/usr.bin/sort/append.c:1.21 src/usr.bin/sort/append.c:1.22
--- src/usr.bin/sort/append.c:1.21 Sat Sep 5 12:00:25 2009
+++ src/usr.bin/sort/append.c Thu Sep 10 22:02:40 2009
@@ -1,4 +1,4 @@
-/* $NetBSD: append.c,v 1.21 2009/09/05 12:00:25 dsl Exp $ */
+/* $NetBSD: append.c,v 1.22 2009/09/10 22:02:40 dsl Exp $ */
/*-
* Copyright (c) 2000-2003 The NetBSD Foundation, Inc.
@@ -64,82 +64,34 @@
#include "sort.h"
#ifndef lint
-__RCSID("$NetBSD: append.c,v 1.21 2009/09/05 12:00:25 dsl Exp $");
+__RCSID("$NetBSD: append.c,v 1.22 2009/09/10 22:02:40 dsl Exp $");
__SCCSID("@(#)append.c 8.1 (Berkeley) 6/6/93");
#endif /* not lint */
#include <stdlib.h>
-#include <string.h>
-
-static int
-wt_cmp(const u_char *a, const u_char *b, size_t len, u_char *wts)
-{
- size_t i;
-
- for (i = 0; i < len; i++) {
- if (wts[*a++] != wts[*b++])
- return 1;
- }
-
- return 0;
-}
/*
- * copy sorted lines to output; check for uniqueness
+ * copy sorted lines to output
+ * Ignore duplicates (marked with -ve keylen)
*/
void
-append(const RECHEADER **keylist, int nelem, FILE *fp, put_func_t put, u_char *wts)
+append(RECHEADER **keylist, int nelem, FILE *fp, put_func_t put)
{
- const RECHEADER **cpos, **lastkey;
- const RECHEADER *crec, *prec;
- size_t plen;
+ RECHEADER **cpos, **lastkey;
+ RECHEADER *crec;
lastkey = keylist + nelem;
- if (!UNIQUE || wts == NULL) {
- for (cpos = keylist; cpos < lastkey; cpos++)
- put(*cpos, fp);
- return;
- }
-
- if (nelem == 0)
- return;
-
- cpos = keylist;
- prec = *cpos;
-
- if (!SINGL_FLD) {
- /* Key for each line is already in adjacent bytes */
- plen = prec->offset;
- for (cpos = &keylist[1]; cpos < lastkey; cpos++) {
+ if (REVERSE) {
+ for (cpos = lastkey; cpos-- > keylist;) {
crec = *cpos;
- if (crec->offset == plen
- && memcmp(crec->data, prec->data, plen) == 0) {
- /* Duplicate key */
- continue;
- }
- put(prec, fp);
- prec = crec;
- plen = prec->offset;
+ if (crec->keylen >= 0)
+ put(crec, fp);
}
- put(prec, fp);
- return;
- }
-
- /* We have to compare the raw data - which means applying weight */
-
- /* Key for each line is already in adjacent bytes */
- plen = prec->length;
- for (cpos = &keylist[1]; cpos < lastkey; cpos++) {
- crec = *cpos;
- if (crec->length == plen
- && wt_cmp(crec->data, prec->data, plen, wts) == 0) {
- /* Duplicate key */
- continue;
+ } else {
+ for (cpos = keylist; cpos < lastkey; cpos++) {
+ crec = *cpos;
+ if (crec->keylen >= 0)
+ put(crec, fp);
}
- put(prec, fp);
- prec = crec;
- plen = prec->length;
}
- put(prec, fp);
- return;
}
Index: src/usr.bin/sort/fields.c
diff -u src/usr.bin/sort/fields.c:1.27 src/usr.bin/sort/fields.c:1.28
--- src/usr.bin/sort/fields.c:1.27 Sat Aug 22 21:28:55 2009
+++ src/usr.bin/sort/fields.c Thu Sep 10 22:02:40 2009
@@ -1,4 +1,4 @@
-/* $NetBSD: fields.c,v 1.27 2009/08/22 21:28:55 dsl Exp $ */
+/* $NetBSD: fields.c,v 1.28 2009/09/10 22:02:40 dsl Exp $ */
/*-
* Copyright (c) 2000-2003 The NetBSD Foundation, Inc.
@@ -66,7 +66,7 @@
#include "sort.h"
#ifndef lint
-__RCSID("$NetBSD: fields.c,v 1.27 2009/08/22 21:28:55 dsl Exp $");
+__RCSID("$NetBSD: fields.c,v 1.28 2009/09/10 22:02:40 dsl Exp $");
__SCCSID("@(#)fields.c 8.1 (Berkeley) 6/6/93");
#endif /* not lint */
@@ -152,11 +152,19 @@
fieldtable->flags)) == NULL)
return (1);
}
- *keypos++ = 0;
keybuf->offset = keypos - keybuf->data;
keybuf->length = keybuf->offset + line_size;
+ /*
+ * Posix requires that equal keys be further sorted by the
+ * entire original record.
+ * NetBSD has (at least for some time) kept equal keys in
+ * their original order.
+ * For 'sort -u' posix_sort is unset.
+ */
+ keybuf->keylen = posix_sort ? keybuf->length : keybuf->offset;
+
memcpy(keypos, line_data, line_size);
return (0);
}
@@ -209,33 +217,31 @@
*tablepos++ = lweight[*start];
}
}
- /* Add extra byte to sort short keys correctly */
- *tablepos++ = flags & R ? 255 : 1;
+ /* Add extra byte (absent from lweight) to sort short keys correctly */
+ *tablepos++ = lweight[REC_D];
return tablepos;
}
/*
* Numbers are converted to a floating point format (exponent & mantissa)
* so that they compare correctly as sequence of unsigned bytes.
- * The output cannot contain a 0x00 byte (the record separator).
- * Bytes 0x01 and 0xff are used to terminate positive and negative numbers
+ * Bytes 0x00 and 0xff are used to terminate positive and negative numbers
* to ensure that 0.123 sorts after 0.12 and -0.123 sorts before -0.12.
*
* The first byte contain the overall sign, exponent sign and some of the
* exponent. These have to be ordered (-ve value, decreasing exponent),
* zero, (+ve value, increasing exponent).
- * After excluding 0, 1, 0xff and 0x80 (used for zero) there are 61
- * exponent values available, this isn't quite enough and the highest
- * values are used to encode large exponents in multiple bytes.
*
- * An exponent of zero has value 0xc0 for +ve numbers and 0x40 for -ves.
+ * The first byte is 0x80 for zero, 0x40 for -ve with exponent 0 and
+ * 0xc0 for +ve with exponent zero.
+ * This only leaves 62 byte values for +ve exponents - which isn't enough.
+ * The largest 5 exponent values are used to hold a byte count of the
+ * number of following bytes that contain 7 exponent bits per byte,
*
* The mantissa is stored 2 digits per byte offset by 0x40, for negative
* numbers the order must be reversed (they are subtracted from 0x100).
*
* Reverse sorts are done by inverting the sign of the number.
- *
- * We don't have to worry about REC_D, the key is terminated by 0x00.
*/
#define SIGNED(reverse, value) ((reverse) ? 0x100 - (value) : (value))
@@ -298,16 +304,14 @@
/* Maybe here we should allow for e+12 (etc) */
- /* exponent 0 is 0xc0 for +ve numbers and 0x40 for -ve ones */
- exponent += 0xc0;
-
- if (exponent > 0x80 + MAX_EXP_ENC && exponent < 0x100 - MAX_EXP_ENC) {
+ if (exponent < 0x3f - MAX_EXP_ENC && -exponent < 0x3f - MAX_EXP_ENC) {
/* Value ok for simple encoding */
+ /* exponent 0 is 0xc0 for +ve numbers and 0x40 for -ve ones */
+ exponent += 0xc0;
*pos++ = SIGNED(reverse, exponent);
} else {
/* Out or range for a single byte */
int c, t;
- exponent -= 0xc0;
t = exponent > 0 ? exponent : -exponent;
/* Count how many 7-bit blocks are needed */
for (c = 0; ; c++) {
Index: src/usr.bin/sort/sort.h
diff -u src/usr.bin/sort/sort.h:1.27 src/usr.bin/sort/sort.h:1.28
--- src/usr.bin/sort/sort.h:1.27 Sat Sep 5 12:00:25 2009
+++ src/usr.bin/sort/sort.h Thu Sep 10 22:02:40 2009
@@ -1,4 +1,4 @@
-/* $NetBSD: sort.h,v 1.27 2009/09/05 12:00:25 dsl Exp $ */
+/* $NetBSD: sort.h,v 1.28 2009/09/10 22:02:40 dsl Exp $ */
/*-
* Copyright (c) 2000-2003 The NetBSD Foundation, Inc.
@@ -104,16 +104,20 @@
err(2, NULL); \
}
-/* length of record is currently limited to maximum string length (size_t) */
-typedef size_t length_t;
+/* Records are limited to MAXBUFSIZE (8MB) and less if you want to sort
+ * in a sane way.
+ * Anyone who wants to sort data records longer than 2GB definitely needs a
+ * different program! */
+typedef unsigned int length_t;
/* A record is a key/line pair starting at rec.data. It has a total length
* and an offset to the start of the line half of the pair.
*/
typedef struct recheader {
- length_t length;
- length_t offset;
- u_char data[1];
+ length_t length; /* total length of key and line */
+ length_t offset; /* to line */
+ int keylen; /* length of key */
+ u_char data[]; /* key then line */
} RECHEADER;
/* This is the column as seen by struct field. It is used by enterfield.
@@ -161,18 +165,17 @@
extern u_char ascii[NBINS], Rascii[NBINS], Ftable[NBINS], RFtable[NBINS];
extern u_char *const weight_tables[4]; /* ascii, Rascii, Ftable, RFtable */
extern u_char d_mask[NBINS];
-extern int SINGL_FLD, SEP_FLAG, UNIQUE;
+extern int SINGL_FLD, SEP_FLAG, UNIQUE, REVERSE;
+extern int posix_sort;
extern int REC_D;
extern const char *tmpdir;
-extern u_char unweighted[NBINS];
extern struct coldesc *clist;
extern int ncols;
#define DEBUG(ch) (debug_flags & (1 << ((ch) & 31)))
extern unsigned int debug_flags;
-void append(const RECHEADER **, int, FILE *,
- void (*)(const RECHEADER *, FILE *), u_char *);
+void append(RECHEADER **, int, FILE *, void (*)(const RECHEADER *, FILE *));
void concat(FILE *, FILE *);
length_t enterkey(RECHEADER *, const u_char *, u_char *, size_t, struct field *);
void fixit(int *, char **);
@@ -193,6 +196,6 @@
void putrec(const RECHEADER *, FILE *);
void putkeydump(const RECHEADER *, FILE *);
void rd_append(int, int, int, FILE *, u_char *, u_char *);
-int radix_sort(const RECHEADER **, const RECHEADER **, int, const u_char *, u_int);
+void radix_sort(RECHEADER **, RECHEADER **, int);
int setfield(const char *, struct field *, int);
void settables(void);
Index: src/usr.bin/sort/files.c
diff -u src/usr.bin/sort/files.c:1.36 src/usr.bin/sort/files.c:1.37
--- src/usr.bin/sort/files.c:1.36 Sat Sep 5 12:00:25 2009
+++ src/usr.bin/sort/files.c Thu Sep 10 22:02:40 2009
@@ -1,4 +1,4 @@
-/* $NetBSD: files.c,v 1.36 2009/09/05 12:00:25 dsl Exp $ */
+/* $NetBSD: files.c,v 1.37 2009/09/10 22:02:40 dsl Exp $ */
/*-
* Copyright (c) 2000-2003 The NetBSD Foundation, Inc.
@@ -65,7 +65,7 @@
#include "fsort.h"
#ifndef lint
-__RCSID("$NetBSD: files.c,v 1.36 2009/09/05 12:00:25 dsl Exp $");
+__RCSID("$NetBSD: files.c,v 1.37 2009/09/10 22:02:40 dsl Exp $");
__SCCSID("@(#)files.c 8.1 (Berkeley) 6/6/93");
#endif /* not lint */
@@ -123,6 +123,7 @@
if (c == REC_D) {
recbuf->offset = 0;
recbuf->length = pos - recbuf->data;
+ recbuf->keylen = recbuf->length - 1;
return (0);
}
}
@@ -138,6 +139,7 @@
*pos++ = REC_D;
recbuf->offset = 0;
recbuf->length = pos - recbuf->data;
+ recbuf->keylen = recbuf->length - 1;
return (0);
}
FCLOSE(fp);
@@ -154,6 +156,7 @@
recbuf->offset = 0;
recbuf->length = 0;
+ recbuf->keylen = 0;
return (BUFFEND);
}
Index: src/usr.bin/sort/fsort.c
diff -u src/usr.bin/sort/fsort.c:1.40 src/usr.bin/sort/fsort.c:1.41
--- src/usr.bin/sort/fsort.c:1.40 Sat Sep 5 12:00:25 2009
+++ src/usr.bin/sort/fsort.c Thu Sep 10 22:02:40 2009
@@ -1,4 +1,4 @@
-/* $NetBSD: fsort.c,v 1.40 2009/09/05 12:00:25 dsl Exp $ */
+/* $NetBSD: fsort.c,v 1.41 2009/09/10 22:02:40 dsl Exp $ */
/*-
* Copyright (c) 2000-2003 The NetBSD Foundation, Inc.
@@ -72,7 +72,7 @@
#include "fsort.h"
#ifndef lint
-__RCSID("$NetBSD: fsort.c,v 1.40 2009/09/05 12:00:25 dsl Exp $");
+__RCSID("$NetBSD: fsort.c,v 1.41 2009/09/10 22:02:40 dsl Exp $");
__SCCSID("@(#)fsort.c 8.1 (Berkeley) 6/6/93");
#endif /* not lint */
@@ -86,8 +86,8 @@
void
fsort(struct filelist *filelist, int nfiles, FILE *outfp, struct field *ftbl)
{
- const RECHEADER **keylist;
- const RECHEADER **keypos, **keyp;
+ RECHEADER **keylist;
+ RECHEADER **keypos, **keyp;
RECHEADER *buffer;
size_t bufsize = DEFBUFSIZE;
u_char *bufend;
@@ -158,25 +158,19 @@
}
/* Sort this set of records */
- if (SINGL_FLD) {
- nelem = radix_sort(keylist, keylist + MAXNUM, nelem,
- ftbl[0].weights, REC_D);
- } else {
- nelem = radix_sort(keylist, keylist + MAXNUM, nelem,
- unweighted, 0);
- }
+ radix_sort(keylist, keylist + MAXNUM, nelem);
if (c == EOF && mfct == 0) {
/* all the data is (sorted) in the buffer */
append(keylist, nelem, outfp,
- DEBUG('k') ? putkeydump : putline, ftbl->weights);
+ DEBUG('k') ? putkeydump : putline);
break;
}
/* Save current data to a temporary file for a later merge */
fp = ftmp();
fstack[mfct].fp = fp;
- append(keylist, nelem, fp, putrec, NULL);
+ append(keylist, nelem, fp, putrec);
mfct++;
if (c == EOF) {
Index: src/usr.bin/sort/init.c
diff -u src/usr.bin/sort/init.c:1.22 src/usr.bin/sort/init.c:1.23
--- src/usr.bin/sort/init.c:1.22 Sat Sep 5 09:16:18 2009
+++ src/usr.bin/sort/init.c Thu Sep 10 22:02:40 2009
@@ -1,4 +1,4 @@
-/* $NetBSD: init.c,v 1.22 2009/09/05 09:16:18 dsl Exp $ */
+/* $NetBSD: init.c,v 1.23 2009/09/10 22:02:40 dsl Exp $ */
/*-
* Copyright (c) 2000-2003 The NetBSD Foundation, Inc.
@@ -64,7 +64,7 @@
#include "sort.h"
#ifndef lint
-__RCSID("$NetBSD: init.c,v 1.22 2009/09/05 09:16:18 dsl Exp $");
+__RCSID("$NetBSD: init.c,v 1.23 2009/09/10 22:02:40 dsl Exp $");
__SCCSID("@(#)init.c 8.1 (Berkeley) 6/6/93");
#endif /* not lint */
@@ -72,7 +72,7 @@
#include <string.h>
static void insertcol(struct field *);
-static const char *setcolumn(const char *, struct field *, int);
+static const char *setcolumn(const char *, struct field *);
/*
* masks of ignored characters.
@@ -152,7 +152,7 @@
* interprets a column in a -k field
*/
static const char *
-setcolumn(const char *pos, struct field *cur_fld, int gflag)
+setcolumn(const char *pos, struct field *cur_fld)
{
struct column *col;
char *npos;
@@ -183,27 +183,27 @@
int
setfield(const char *pos, struct field *cur_fld, int gflag)
{
- int tmp;
-
cur_fld->mask = NULL;
- pos = setcolumn(pos, cur_fld, gflag);
+ pos = setcolumn(pos, cur_fld);
if (*pos == '\0') /* key extends to EOL. */
cur_fld->tcol.num = 0;
else {
if (*pos != ',')
errx(2, "illegal field descriptor");
- setcolumn((++pos), cur_fld, gflag);
+ setcolumn((++pos), cur_fld);
}
if (!cur_fld->flags)
cur_fld->flags = gflag;
- tmp = cur_fld->flags;
+ if (REVERSE)
+ /* A local 'r' doesn't invert the global one */
+ cur_fld->flags &= ~R;
/* Assign appropriate mask table and weight table. */
- cur_fld->weights = weight_tables[tmp & (R | F)];
- if (tmp & I)
+ cur_fld->weights = weight_tables[cur_fld->flags & (R | F)];
+ if (cur_fld->flags & I)
cur_fld->mask = itable;
- else if (tmp & D)
+ else if (cur_fld->flags & D)
cur_fld->mask = dtable;
cur_fld->flags |= (gflag & (BI | BT));
@@ -217,6 +217,7 @@
&& cur_fld->tcol.indent != 0
&& cur_fld->tcol.indent < cur_fld->icol.indent))
errx(2, "fields out of order");
+
insertcol(cur_fld);
return (cur_fld->tcol.num);
}
@@ -319,18 +320,27 @@
* Convert 'ascii' characters into their sort order.
* The 'F' variants fold lower case to upper equivalent
* The 'R' variants are for reverse sorting.
- * The record separator (REC_D) always maps to 0.
- * One is reserved for field separators (added when key is generated)
- * The field separator (from -t<ch>) map to 1 or 255 (unless SINGL_FLD)
+ *
+ * The record separator (REC_D) never needs a weight, this frees one
+ * byte value as an 'end of key' marker. This must be 0 for normal
+ * weight tables, and 0xff for reverse weight tables - and is used
+ * to terminate keys so that short keys sort before (after if reverse)
+ * longer keys.
+ *
+ * The field separator has a normal weight - although it cannot occur
+ * within a key unless it is the default (space+tab).
+ *
* All other bytes map to the appropriate value for the sort order.
* Numeric sorts don't need any tables, they are reversed by negation.
*
+ * Global reverse sorts are done by writing the sorted keys in reverse
+ * order - the sort itself is stil forwards.
+ * This means that weights are only ever used when generating keys, any
+ * sort of the original data bytes is always forwards and unweighted.
+ *
* Note: this is only good for ASCII sorting. For different LC 's,
* all bets are off.
*
- * If SINGL_FLD then the weights have to be applied during the actual sort.
- * Otherwise they are applied when the key bytes are built.
- *
* itable[] and dtable[] are the masks for -i (ignore non-printables)
* and -d (only sort blank and alphanumerics).
*/
@@ -338,21 +348,17 @@
settables(void)
{
int i;
- int next_weight = SINGL_FLD ? 1 : 2;
- int rev_weight = SINGL_FLD ? 255 : 254;
- int had_field_sep = SINGL_FLD;
+ int next_weight = 1;
+ int rev_weight = 254;
+
+ ascii[REC_D] = 0;
+ Rascii[REC_D] = 255;
+ Ftable[REC_D] = 0;
+ RFtable[REC_D] = 255;
for (i = 0; i < 256; i++) {
- unweighted[i] = i;
- if (d_mask[i] & REC_D_F)
+ if (i == REC_D)
continue;
- if (!had_field_sep && d_mask[i] & FLD_D) {
- /* First/only separator sorts before any data */
- ascii[i] = 1;
- Rascii[i] = 255;
- had_field_sep = 1;
- continue;
- }
ascii[i] = next_weight;
Rascii[i] = rev_weight;
if (Ftable[i] == 0) {
Index: src/usr.bin/sort/msort.c
diff -u src/usr.bin/sort/msort.c:1.25 src/usr.bin/sort/msort.c:1.26
--- src/usr.bin/sort/msort.c:1.25 Sat Sep 5 12:00:25 2009
+++ src/usr.bin/sort/msort.c Thu Sep 10 22:02:40 2009
@@ -1,4 +1,4 @@
-/* $NetBSD: msort.c,v 1.25 2009/09/05 12:00:25 dsl Exp $ */
+/* $NetBSD: msort.c,v 1.26 2009/09/10 22:02:40 dsl Exp $ */
/*-
* Copyright (c) 2000-2003 The NetBSD Foundation, Inc.
@@ -65,7 +65,7 @@
#include "fsort.h"
#ifndef lint
-__RCSID("$NetBSD: msort.c,v 1.25 2009/09/05 12:00:25 dsl Exp $");
+__RCSID("$NetBSD: msort.c,v 1.26 2009/09/10 22:02:40 dsl Exp $");
__SCCSID("@(#)msort.c 8.1 (Berkeley) 6/6/93");
#endif /* not lint */
@@ -78,12 +78,10 @@
typedef struct mfile {
u_char *end;
- short flno;
+ int flno;
RECHEADER rec[1];
} MFILE;
-static u_char *wts;
-
static int cmp(RECHEADER *, RECHEADER *);
static int insert(struct mfile **, struct mfile **, int, int);
static void merge(int, int, get_func_t, FILE *, put_func_t, struct field *);
@@ -96,8 +94,6 @@
int i, j, last;
put_func_t put;
- wts = ftbl->weights;
-
while (nfiles) {
put = putrec;
for (j = 0; j < nfiles; j += MERGE_FNUM) {
@@ -278,6 +274,8 @@
* comes here).
*/
cmpv = tmprec->flno - flist[mid]->flno;
+ if (REVERSE)
+ cmpv = -cmpv;
}
if (cmpv < 0)
top = mid;
@@ -332,7 +330,6 @@
crec_end = crec->data + DEFLLEN;
prec = malloc(offsetof(RECHEADER, data[DEFLLEN]));
prec_end = prec->data + DEFLLEN;
- wts = ftbl->weights;
/* XXX this does exit(0) for overlong lines */
if (get(-1, 0, filelist, 1, prec, prec_end, ftbl) != 0)
@@ -365,27 +362,15 @@
static int
cmp(RECHEADER *rec1, RECHEADER *rec2)
{
+ int len;
int r;
- size_t len, i;
- u_char *pos1, *pos2;
- u_char *cwts;
-
- if (!SINGL_FLD)
- /* key is weights, and is 0x00 terminated */
- return memcmp(rec1->data, rec2->data, rec1->offset);
-
- /* We have to apply the weights ourselves */
- cwts = wts;
-
- pos1 = rec1->data;
- pos2 = rec2->data;
- len = rec1->length;
-
- for (i = 0; i < len; i++) {
- r = cwts[pos1[i]] - cwts[pos2[i]];
- if (r)
- return r;
- }
- return (0);
+ /* key is weights */
+ len = min(rec1->keylen, rec2->keylen);
+ r = memcmp(rec1->data, rec2->data, len);
+ if (r == 0)
+ r = rec1->keylen - rec2->keylen;
+ if (REVERSE)
+ r = -r;
+ return r;
}
Index: src/usr.bin/sort/radix_sort.c
diff -u src/usr.bin/sort/radix_sort.c:1.2 src/usr.bin/sort/radix_sort.c:1.3
--- src/usr.bin/sort/radix_sort.c:1.2 Sat Sep 5 12:00:25 2009
+++ src/usr.bin/sort/radix_sort.c Thu Sep 10 22:02:40 2009
@@ -1,4 +1,4 @@
-/* $NetBSD: radix_sort.c,v 1.2 2009/09/05 12:00:25 dsl Exp $ */
+/* $NetBSD: radix_sort.c,v 1.3 2009/09/10 22:02:40 dsl Exp $ */
/*-
* Copyright (c) 1990, 1993
@@ -37,7 +37,7 @@
#if 0
static char sccsid[] = "@(#)radixsort.c 8.2 (Berkeley) 4/28/95";
#else
-__RCSID("$NetBSD: radix_sort.c,v 1.2 2009/09/05 12:00:25 dsl Exp $");
+__RCSID("$NetBSD: radix_sort.c,v 1.3 2009/09/10 22:02:40 dsl Exp $");
#endif
#endif /* LIBC_SCCS and not lint */
@@ -49,82 +49,96 @@
#include <assert.h>
#include <errno.h>
+#include <util.h>
#include "sort.h"
typedef struct {
- const RECHEADER **sa; /* Base of saved area */
- int sn; /* Number of entries */
- int si; /* index into data for compare */
+ RECHEADER **sa; /* Base of saved area */
+ int sn; /* Number of entries */
+ int si; /* index into data for compare */
} stack;
-static inline int simplesort(const RECHEADER **, int, int, const u_char *, u_int);
-static int r_sort_b(const RECHEADER **,
- const RECHEADER **, int, int, const u_char *, u_int);
+static void simplesort(RECHEADER **, int, int);
#define THRESHOLD 20 /* Divert to simplesort(). */
-#define SIZE 512 /* Default stack size. */
#define empty(s) (s >= sp)
#define pop(a, n, i) a = (--sp)->sa, n = sp->sn, i = sp->si
#define push(a, n, i) sp->sa = a, sp->sn = n, (sp++)->si = i
#define swap(a, b, t) t = a, a = b, b = t
-int
-radix_sort(const RECHEADER **a, const RECHEADER **ta, int n, const u_char *tab, u_int endch)
+void
+radix_sort(RECHEADER **a, RECHEADER **ta, int n)
{
- endch = tab[endch];
+ u_int count[256], nc, bmin;
+ u_int c;
+ RECHEADER **ak, **tai, **lim;
+ RECHEADER *hdr;
+ int stack_size = 512;
+ stack *s, *sp, *sp0, *sp1, temp;
+ RECHEADER **top[256];
+ u_int *cp, bigc;
+ int data_index = 0;
+
if (n < THRESHOLD && !DEBUG('r')) {
- return simplesort(a, n, 0, tab, endch);
+ simplesort(a, n, 0);
+ return;
}
- return r_sort_b(a, ta, n, 0, tab, endch);
-}
-static int
-r_sort_b(const RECHEADER **a, const RECHEADER **ta, int n, int i, const u_char *tr, u_int endch)
-{
- static u_int count[256], nc, bmin;
- u_int c;
- const RECHEADER **ak, **ai;
- stack s[512], *sp, *sp0, *sp1, temp;
- const RECHEADER **top[256];
- u_int *cp, bigc;
- int nrec = n;
+ s = emalloc(stack_size * sizeof *s);
+ memset(&count, 0, sizeof count);
+ /* Technically 'top' doesn't need zeroing */
+ memset(&top, 0, sizeof top);
sp = s;
- push(a, n, i);
+ push(a, n, data_index);
while (!empty(s)) {
- pop(a, n, i);
+ pop(a, n, data_index);
if (n < THRESHOLD && !DEBUG('r')) {
- simplesort(a, n, i, tr, endch);
+ simplesort(a, n, data_index);
continue;
}
- if (nc == 0) {
- bmin = 255;
- for (ak = a + n; --ak >= a;) {
- c = tr[(*ak)->data[i]];
- if (++count[c] == 1 && c != endch) {
- if (c < bmin)
- bmin = c;
- nc++;
- }
- }
- if (sp + nc > s + SIZE) {
- r_sort_b(a, ta, n, i, tr, endch);
+ /* Count number of times each 'next byte' occurs */
+ nc = 0;
+ bmin = 255;
+ lim = a + n;
+ for (ak = a, tai = ta; ak < lim; ak++) {
+ hdr = *ak;
+ if (data_index >= hdr->keylen) {
+ /* Short key, copy to start of output */
+ if (UNIQUE && a != sp->sa)
+ /* Stop duplicate being written out */
+ hdr->keylen = -1;
+ *a++ = hdr;
+ n--;
continue;
}
+ /* Save in temp buffer for distribute */
+ *tai++ = hdr;
+ c = hdr->data[data_index];
+ if (++count[c] == 1) {
+ if (c < bmin)
+ bmin = c;
+ nc++;
+ }
+ }
+ /*
+ * We need save the bounds for each 'next byte' that
+ * occurs more so we can sort each block.
+ */
+ if (sp + nc > s + stack_size) {
+ stack_size *= 2;
+ sp1 = erealloc(s, stack_size * sizeof *s);
+ sp = sp1 + (sp - s);
+ s = sp1;
}
+ /* Minor optimisation to do the largest set last */
sp0 = sp1 = sp;
bigc = 2;
- if (endch == 0) {
- top[0] = ak = a + count[0];
- count[0] = 0;
- } else {
- ak = a;
- top[255] = a + n;
- count[255] = 0;
- }
+ /* Convert 'counts' positions, saving bounds for later sorts */
+ ak = a;
for (cp = count + bmin; nc > 0; cp++) {
while (*cp == 0)
cp++;
@@ -133,48 +147,69 @@
bigc = c;
sp1 = sp;
}
- push(ak, c, i+1);
+ push(ak, c, data_index+1);
}
- top[cp-count] = ak += c;
+ ak += c;
+ top[cp-count] = ak;
*cp = 0; /* Reset count[]. */
nc--;
}
swap(*sp0, *sp1, temp);
- for (ak = ta + n, ai = a+n; ak > ta;) /* Copy to temp. */
- *--ak = *--ai;
for (ak = ta+n; --ak >= ta;) /* Deal to piles. */
- *--top[tr[(*ak)->data[i]]] = *ak;
+ *--top[(*ak)->data[data_index]] = *ak;
}
- return nrec;
+ free(s);
}
-/* insertion sort */
-static inline int
-simplesort(const RECHEADER **a, int n, int b, const u_char *tr, u_int endch)
+/* insertion sort, short records are sorted before long ones */
+static void
+simplesort(RECHEADER **a, int n, int data_index)
{
- const RECHEADER **ak, **ai, *tmp;
- const RECHEADER **lim = a + n;
+ RECHEADER **ak, **ai;
+ RECHEADER *akh;
+ RECHEADER **lim = a + n;
const u_char *s, *t;
- u_char ch;
+ int s_len, t_len;
+ int i;
+ int r;
if (n <= 1)
- return n;
+ return;
for (ak = a+1; ak < lim; ak++) {
- for (ai = ak; ai > a; ai--) {
- for (s = ai[0]->data + b, t = ai[-1]->data + b;
- (ch = tr[*s]) != endch; s++, t++)
- if (ch != tr[*t])
+ akh = *ak;
+ s = akh->data;
+ s_len = akh->keylen;
+ for (ai = ak; ;) {
+ ai--;
+ t = (*ai)->data;
+ t_len = (*ai)->keylen;
+ for (i = data_index; ; i++) {
+ if (i >= s_len || i >= t_len) {
+ r = s_len - t_len;
break;
- if (ch >= tr[*t]) {
-
+ }
+ r = s[i] - t[i];
+ if (r != 0)
+ break;
+ }
+ if (r >= 0) {
+ if (r == 0 && UNIQUE) {
+ /* Put record below existing */
+ ai[1] = ai[0];
+ /* Mark so ignored by output() */
+ akh->keylen = -1;
+ } else {
+ ai++;
+ }
break;
}
- swap(ai[0], ai[-1], tmp);
+ ai[1] = ai[0];
+ if (ai == a)
+ break;
}
+ ai[0] = akh;
}
-
- return n;
}
Index: src/usr.bin/sort/sort.c
diff -u src/usr.bin/sort/sort.c:1.54 src/usr.bin/sort/sort.c:1.55
--- src/usr.bin/sort/sort.c:1.54 Sat Sep 5 09:16:18 2009
+++ src/usr.bin/sort/sort.c Thu Sep 10 22:02:40 2009
@@ -1,4 +1,4 @@
-/* $NetBSD: sort.c,v 1.54 2009/09/05 09:16:18 dsl Exp $ */
+/* $NetBSD: sort.c,v 1.55 2009/09/10 22:02:40 dsl Exp $ */
/*-
* Copyright (c) 2000-2003 The NetBSD Foundation, Inc.
@@ -66,6 +66,7 @@
* a choice of merge sort and radix sort for external sorting.
*/
+#include <util.h>
#include "sort.h"
#include "fsort.h"
#include "pathnames.h"
@@ -76,7 +77,7 @@
#endif /* not lint */
#ifndef lint
-__RCSID("$NetBSD: sort.c,v 1.54 2009/09/05 09:16:18 dsl Exp $");
+__RCSID("$NetBSD: sort.c,v 1.55 2009/09/10 22:02:40 dsl Exp $");
__SCCSID("@(#)sort.c 8.1 (Berkeley) 6/6/93");
#endif /* not lint */
@@ -100,8 +101,10 @@
*/
u_char *const weight_tables[4] = { ascii, Rascii, Ftable, RFtable };
u_char ascii[NBINS], Rascii[NBINS], RFtable[NBINS], Ftable[NBINS];
-u_char unweighted[NBINS];
+
int SINGL_FLD = 0, SEP_FLAG = 0, UNIQUE = 0;
+int REVERSE = 0;
+int posix_sort;
unsigned int debug_flags = 0;
@@ -122,7 +125,7 @@
int ch, i, stdinflag = 0;
char cflag = 0, mflag = 0;
char *outfile, *outpath = 0;
- struct field *fldtab, *p;
+ struct field *fldtab;
size_t fldtab_sz, fld_cnt;
struct filelist filelist;
int num_input_files;
@@ -145,7 +148,7 @@
/* fldtab[0] is the global options. */
fldtab_sz = 3;
fld_cnt = 0;
- fldtab = malloc(fldtab_sz * sizeof(*fldtab));
+ fldtab = emalloc(fldtab_sz * sizeof(*fldtab));
memset(fldtab, 0, fldtab_sz * sizeof(*fldtab));
/* Convert "+field" args to -f format */
@@ -166,7 +169,7 @@
for (i = 0; optarg[i]; i++)
debug_flags |= 1 << (optarg[i] & 31);
break;
- case 'd': case 'f': case 'i': case 'n': case 'r':
+ case 'd': case 'f': case 'i': case 'n':
fldtab[0].flags |= optval(ch, 0);
break;
case 'H':
@@ -174,10 +177,7 @@
/* That is now the default. */
break;
case 'k':
- p = realloc(fldtab, (fldtab_sz + 1) * sizeof(*fldtab));
- if (!p)
- err(1, "realloc");
- fldtab = p;
+ fldtab = erealloc(fldtab, (fldtab_sz + 1) * sizeof(*fldtab));
memset(&fldtab[fldtab_sz], 0, sizeof(fldtab[0]));
fldtab_sz++;
@@ -189,12 +189,16 @@
case 'o':
outpath = optarg;
break;
+ case 'r':
+ REVERSE = 1;
+ break;
case 's':
/*
* Nominally 'stable sort', keep lines with equal keys
* in input file order. (Default for NetBSD)
* (-s for GNU sort compatibility.)
*/
+ posix_sort = 0;
break;
case 'S':
/*
@@ -205,6 +209,7 @@
* (using libc radixsort() v sradixsort() doesn't
* have the desired effect.)
*/
+ posix_sort = 1;
break;
case 't':
if (SEP_FLAG)
@@ -249,6 +254,10 @@
}
}
+ if (UNIQUE)
+ /* Don't sort on raw record if keys match */
+ posix_sort = 0;
+
if (cflag && argc > optind+1)
errx(2, "too many input files for -c option");
if (argc - 2 > optind && !strcmp(argv[argc-2], "-o")) {
@@ -277,17 +286,22 @@
err(2, "%s", argv[i]);
}
- if (!(fldtab[0].flags & (I|D|N) || fldtab[1].icol.num)) {
- SINGL_FLD = 1;
- fldtab[0].icol.num = 1;
- fldtab[0].weights = weight_tables[fldtab->flags & (R | F)];
- } else {
- if (!fldtab[1].icol.num) {
+ if (fldtab[1].icol.num == 0) {
+ /* No sort key specified */
+ if (fldtab[0].flags & (I|D|F|N)) {
+ /* Modified - generate a key that covers the line */
fldtab[0].flags &= ~(BI|BT);
setfield("1", &fldtab[++fld_cnt], fldtab->flags);
+ fldreset(fldtab);
+ } else {
+ /* Unmodified, just compare the line */
+ SINGL_FLD = 1;
+ fldtab[0].icol.num = 1;
}
+ } else {
fldreset(fldtab);
}
+
settables();
if (optind == argc) {