Module Name: src Committed By: dsl Date: Thu Sep 10 22:02:40 UTC 2009
Modified Files: src/usr.bin/sort: Makefile append.c fields.c files.c fsort.c init.c msort.c radix_sort.c sort.c sort.h Log Message: Save length of key instead of relying of the weight of the record sep. This frees a byte value to use for 'end of key' (to correctly sort short keys) while still having a weight assigned to the field sep. (Unless -t is given, the field sep is in the field data.) Do reverse sorts by writing the output file in reverse order (rather than reversing the sort - apart from merges). All key compares are now unweighted. For 'sort -u' mark duplicates keys during the sort and don't write to the output. Use -S to mean a posix sort - where equal keys are sorted using the raw record (rather than being kept in the original order). For 'sort -f' (no keys) generate a key of the folded data (as for -n -i and -d), simplifies the code and allows a 'posix' sort. To generate a diff of this commit: cvs rdiff -u -r1.7 -r1.8 src/usr.bin/sort/Makefile cvs rdiff -u -r1.21 -r1.22 src/usr.bin/sort/append.c cvs rdiff -u -r1.27 -r1.28 src/usr.bin/sort/fields.c src/usr.bin/sort/sort.h cvs rdiff -u -r1.36 -r1.37 src/usr.bin/sort/files.c cvs rdiff -u -r1.40 -r1.41 src/usr.bin/sort/fsort.c cvs rdiff -u -r1.22 -r1.23 src/usr.bin/sort/init.c cvs rdiff -u -r1.25 -r1.26 src/usr.bin/sort/msort.c cvs rdiff -u -r1.2 -r1.3 src/usr.bin/sort/radix_sort.c cvs rdiff -u -r1.54 -r1.55 src/usr.bin/sort/sort.c Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files.
Modified files: Index: src/usr.bin/sort/Makefile diff -u src/usr.bin/sort/Makefile:1.7 src/usr.bin/sort/Makefile:1.8 --- src/usr.bin/sort/Makefile:1.7 Sat Sep 5 09:16:18 2009 +++ src/usr.bin/sort/Makefile Thu Sep 10 22:02:40 2009 @@ -1,8 +1,11 @@ -# $NetBSD: Makefile,v 1.7 2009/09/05 09:16:18 dsl Exp $ +# $NetBSD: Makefile,v 1.8 2009/09/10 22:02:40 dsl Exp $ # from: @(#)Makefile 8.1 (Berkeley) 6/6/93 PROG= sort SRCS= append.c fields.c files.c fsort.c init.c msort.c sort.c tmp.c SRCS+= radix_sort.c +LDADD+=-lutil +DPADD+=${LIBUTIL} + .include <bsd.prog.mk> Index: src/usr.bin/sort/append.c diff -u src/usr.bin/sort/append.c:1.21 src/usr.bin/sort/append.c:1.22 --- src/usr.bin/sort/append.c:1.21 Sat Sep 5 12:00:25 2009 +++ src/usr.bin/sort/append.c Thu Sep 10 22:02:40 2009 @@ -1,4 +1,4 @@ -/* $NetBSD: append.c,v 1.21 2009/09/05 12:00:25 dsl Exp $ */ +/* $NetBSD: append.c,v 1.22 2009/09/10 22:02:40 dsl Exp $ */ /*- * Copyright (c) 2000-2003 The NetBSD Foundation, Inc. @@ -64,82 +64,34 @@ #include "sort.h" #ifndef lint -__RCSID("$NetBSD: append.c,v 1.21 2009/09/05 12:00:25 dsl Exp $"); +__RCSID("$NetBSD: append.c,v 1.22 2009/09/10 22:02:40 dsl Exp $"); __SCCSID("@(#)append.c 8.1 (Berkeley) 6/6/93"); #endif /* not lint */ #include <stdlib.h> -#include <string.h> - -static int -wt_cmp(const u_char *a, const u_char *b, size_t len, u_char *wts) -{ - size_t i; - - for (i = 0; i < len; i++) { - if (wts[*a++] != wts[*b++]) - return 1; - } - - return 0; -} /* - * copy sorted lines to output; check for uniqueness + * copy sorted lines to output + * Ignore duplicates (marked with -ve keylen) */ void -append(const RECHEADER **keylist, int nelem, FILE *fp, put_func_t put, u_char *wts) +append(RECHEADER **keylist, int nelem, FILE *fp, put_func_t put) { - const RECHEADER **cpos, **lastkey; - const RECHEADER *crec, *prec; - size_t plen; + RECHEADER **cpos, **lastkey; + RECHEADER *crec; lastkey = keylist + nelem; - if (!UNIQUE || wts == NULL) { - for (cpos = keylist; cpos < lastkey; cpos++) - put(*cpos, fp); - return; - } - - if (nelem == 0) - return; - - cpos = keylist; - prec = *cpos; - - if (!SINGL_FLD) { - /* Key for each line is already in adjacent bytes */ - plen = prec->offset; - for (cpos = &keylist[1]; cpos < lastkey; cpos++) { + if (REVERSE) { + for (cpos = lastkey; cpos-- > keylist;) { crec = *cpos; - if (crec->offset == plen - && memcmp(crec->data, prec->data, plen) == 0) { - /* Duplicate key */ - continue; - } - put(prec, fp); - prec = crec; - plen = prec->offset; + if (crec->keylen >= 0) + put(crec, fp); } - put(prec, fp); - return; - } - - /* We have to compare the raw data - which means applying weight */ - - /* Key for each line is already in adjacent bytes */ - plen = prec->length; - for (cpos = &keylist[1]; cpos < lastkey; cpos++) { - crec = *cpos; - if (crec->length == plen - && wt_cmp(crec->data, prec->data, plen, wts) == 0) { - /* Duplicate key */ - continue; + } else { + for (cpos = keylist; cpos < lastkey; cpos++) { + crec = *cpos; + if (crec->keylen >= 0) + put(crec, fp); } - put(prec, fp); - prec = crec; - plen = prec->length; } - put(prec, fp); - return; } Index: src/usr.bin/sort/fields.c diff -u src/usr.bin/sort/fields.c:1.27 src/usr.bin/sort/fields.c:1.28 --- src/usr.bin/sort/fields.c:1.27 Sat Aug 22 21:28:55 2009 +++ src/usr.bin/sort/fields.c Thu Sep 10 22:02:40 2009 @@ -1,4 +1,4 @@ -/* $NetBSD: fields.c,v 1.27 2009/08/22 21:28:55 dsl Exp $ */ +/* $NetBSD: fields.c,v 1.28 2009/09/10 22:02:40 dsl Exp $ */ /*- * Copyright (c) 2000-2003 The NetBSD Foundation, Inc. @@ -66,7 +66,7 @@ #include "sort.h" #ifndef lint -__RCSID("$NetBSD: fields.c,v 1.27 2009/08/22 21:28:55 dsl Exp $"); +__RCSID("$NetBSD: fields.c,v 1.28 2009/09/10 22:02:40 dsl Exp $"); __SCCSID("@(#)fields.c 8.1 (Berkeley) 6/6/93"); #endif /* not lint */ @@ -152,11 +152,19 @@ fieldtable->flags)) == NULL) return (1); } - *keypos++ = 0; keybuf->offset = keypos - keybuf->data; keybuf->length = keybuf->offset + line_size; + /* + * Posix requires that equal keys be further sorted by the + * entire original record. + * NetBSD has (at least for some time) kept equal keys in + * their original order. + * For 'sort -u' posix_sort is unset. + */ + keybuf->keylen = posix_sort ? keybuf->length : keybuf->offset; + memcpy(keypos, line_data, line_size); return (0); } @@ -209,33 +217,31 @@ *tablepos++ = lweight[*start]; } } - /* Add extra byte to sort short keys correctly */ - *tablepos++ = flags & R ? 255 : 1; + /* Add extra byte (absent from lweight) to sort short keys correctly */ + *tablepos++ = lweight[REC_D]; return tablepos; } /* * Numbers are converted to a floating point format (exponent & mantissa) * so that they compare correctly as sequence of unsigned bytes. - * The output cannot contain a 0x00 byte (the record separator). - * Bytes 0x01 and 0xff are used to terminate positive and negative numbers + * Bytes 0x00 and 0xff are used to terminate positive and negative numbers * to ensure that 0.123 sorts after 0.12 and -0.123 sorts before -0.12. * * The first byte contain the overall sign, exponent sign and some of the * exponent. These have to be ordered (-ve value, decreasing exponent), * zero, (+ve value, increasing exponent). - * After excluding 0, 1, 0xff and 0x80 (used for zero) there are 61 - * exponent values available, this isn't quite enough and the highest - * values are used to encode large exponents in multiple bytes. * - * An exponent of zero has value 0xc0 for +ve numbers and 0x40 for -ves. + * The first byte is 0x80 for zero, 0x40 for -ve with exponent 0 and + * 0xc0 for +ve with exponent zero. + * This only leaves 62 byte values for +ve exponents - which isn't enough. + * The largest 5 exponent values are used to hold a byte count of the + * number of following bytes that contain 7 exponent bits per byte, * * The mantissa is stored 2 digits per byte offset by 0x40, for negative * numbers the order must be reversed (they are subtracted from 0x100). * * Reverse sorts are done by inverting the sign of the number. - * - * We don't have to worry about REC_D, the key is terminated by 0x00. */ #define SIGNED(reverse, value) ((reverse) ? 0x100 - (value) : (value)) @@ -298,16 +304,14 @@ /* Maybe here we should allow for e+12 (etc) */ - /* exponent 0 is 0xc0 for +ve numbers and 0x40 for -ve ones */ - exponent += 0xc0; - - if (exponent > 0x80 + MAX_EXP_ENC && exponent < 0x100 - MAX_EXP_ENC) { + if (exponent < 0x3f - MAX_EXP_ENC && -exponent < 0x3f - MAX_EXP_ENC) { /* Value ok for simple encoding */ + /* exponent 0 is 0xc0 for +ve numbers and 0x40 for -ve ones */ + exponent += 0xc0; *pos++ = SIGNED(reverse, exponent); } else { /* Out or range for a single byte */ int c, t; - exponent -= 0xc0; t = exponent > 0 ? exponent : -exponent; /* Count how many 7-bit blocks are needed */ for (c = 0; ; c++) { Index: src/usr.bin/sort/sort.h diff -u src/usr.bin/sort/sort.h:1.27 src/usr.bin/sort/sort.h:1.28 --- src/usr.bin/sort/sort.h:1.27 Sat Sep 5 12:00:25 2009 +++ src/usr.bin/sort/sort.h Thu Sep 10 22:02:40 2009 @@ -1,4 +1,4 @@ -/* $NetBSD: sort.h,v 1.27 2009/09/05 12:00:25 dsl Exp $ */ +/* $NetBSD: sort.h,v 1.28 2009/09/10 22:02:40 dsl Exp $ */ /*- * Copyright (c) 2000-2003 The NetBSD Foundation, Inc. @@ -104,16 +104,20 @@ err(2, NULL); \ } -/* length of record is currently limited to maximum string length (size_t) */ -typedef size_t length_t; +/* Records are limited to MAXBUFSIZE (8MB) and less if you want to sort + * in a sane way. + * Anyone who wants to sort data records longer than 2GB definitely needs a + * different program! */ +typedef unsigned int length_t; /* A record is a key/line pair starting at rec.data. It has a total length * and an offset to the start of the line half of the pair. */ typedef struct recheader { - length_t length; - length_t offset; - u_char data[1]; + length_t length; /* total length of key and line */ + length_t offset; /* to line */ + int keylen; /* length of key */ + u_char data[]; /* key then line */ } RECHEADER; /* This is the column as seen by struct field. It is used by enterfield. @@ -161,18 +165,17 @@ extern u_char ascii[NBINS], Rascii[NBINS], Ftable[NBINS], RFtable[NBINS]; extern u_char *const weight_tables[4]; /* ascii, Rascii, Ftable, RFtable */ extern u_char d_mask[NBINS]; -extern int SINGL_FLD, SEP_FLAG, UNIQUE; +extern int SINGL_FLD, SEP_FLAG, UNIQUE, REVERSE; +extern int posix_sort; extern int REC_D; extern const char *tmpdir; -extern u_char unweighted[NBINS]; extern struct coldesc *clist; extern int ncols; #define DEBUG(ch) (debug_flags & (1 << ((ch) & 31))) extern unsigned int debug_flags; -void append(const RECHEADER **, int, FILE *, - void (*)(const RECHEADER *, FILE *), u_char *); +void append(RECHEADER **, int, FILE *, void (*)(const RECHEADER *, FILE *)); void concat(FILE *, FILE *); length_t enterkey(RECHEADER *, const u_char *, u_char *, size_t, struct field *); void fixit(int *, char **); @@ -193,6 +196,6 @@ void putrec(const RECHEADER *, FILE *); void putkeydump(const RECHEADER *, FILE *); void rd_append(int, int, int, FILE *, u_char *, u_char *); -int radix_sort(const RECHEADER **, const RECHEADER **, int, const u_char *, u_int); +void radix_sort(RECHEADER **, RECHEADER **, int); int setfield(const char *, struct field *, int); void settables(void); Index: src/usr.bin/sort/files.c diff -u src/usr.bin/sort/files.c:1.36 src/usr.bin/sort/files.c:1.37 --- src/usr.bin/sort/files.c:1.36 Sat Sep 5 12:00:25 2009 +++ src/usr.bin/sort/files.c Thu Sep 10 22:02:40 2009 @@ -1,4 +1,4 @@ -/* $NetBSD: files.c,v 1.36 2009/09/05 12:00:25 dsl Exp $ */ +/* $NetBSD: files.c,v 1.37 2009/09/10 22:02:40 dsl Exp $ */ /*- * Copyright (c) 2000-2003 The NetBSD Foundation, Inc. @@ -65,7 +65,7 @@ #include "fsort.h" #ifndef lint -__RCSID("$NetBSD: files.c,v 1.36 2009/09/05 12:00:25 dsl Exp $"); +__RCSID("$NetBSD: files.c,v 1.37 2009/09/10 22:02:40 dsl Exp $"); __SCCSID("@(#)files.c 8.1 (Berkeley) 6/6/93"); #endif /* not lint */ @@ -123,6 +123,7 @@ if (c == REC_D) { recbuf->offset = 0; recbuf->length = pos - recbuf->data; + recbuf->keylen = recbuf->length - 1; return (0); } } @@ -138,6 +139,7 @@ *pos++ = REC_D; recbuf->offset = 0; recbuf->length = pos - recbuf->data; + recbuf->keylen = recbuf->length - 1; return (0); } FCLOSE(fp); @@ -154,6 +156,7 @@ recbuf->offset = 0; recbuf->length = 0; + recbuf->keylen = 0; return (BUFFEND); } Index: src/usr.bin/sort/fsort.c diff -u src/usr.bin/sort/fsort.c:1.40 src/usr.bin/sort/fsort.c:1.41 --- src/usr.bin/sort/fsort.c:1.40 Sat Sep 5 12:00:25 2009 +++ src/usr.bin/sort/fsort.c Thu Sep 10 22:02:40 2009 @@ -1,4 +1,4 @@ -/* $NetBSD: fsort.c,v 1.40 2009/09/05 12:00:25 dsl Exp $ */ +/* $NetBSD: fsort.c,v 1.41 2009/09/10 22:02:40 dsl Exp $ */ /*- * Copyright (c) 2000-2003 The NetBSD Foundation, Inc. @@ -72,7 +72,7 @@ #include "fsort.h" #ifndef lint -__RCSID("$NetBSD: fsort.c,v 1.40 2009/09/05 12:00:25 dsl Exp $"); +__RCSID("$NetBSD: fsort.c,v 1.41 2009/09/10 22:02:40 dsl Exp $"); __SCCSID("@(#)fsort.c 8.1 (Berkeley) 6/6/93"); #endif /* not lint */ @@ -86,8 +86,8 @@ void fsort(struct filelist *filelist, int nfiles, FILE *outfp, struct field *ftbl) { - const RECHEADER **keylist; - const RECHEADER **keypos, **keyp; + RECHEADER **keylist; + RECHEADER **keypos, **keyp; RECHEADER *buffer; size_t bufsize = DEFBUFSIZE; u_char *bufend; @@ -158,25 +158,19 @@ } /* Sort this set of records */ - if (SINGL_FLD) { - nelem = radix_sort(keylist, keylist + MAXNUM, nelem, - ftbl[0].weights, REC_D); - } else { - nelem = radix_sort(keylist, keylist + MAXNUM, nelem, - unweighted, 0); - } + radix_sort(keylist, keylist + MAXNUM, nelem); if (c == EOF && mfct == 0) { /* all the data is (sorted) in the buffer */ append(keylist, nelem, outfp, - DEBUG('k') ? putkeydump : putline, ftbl->weights); + DEBUG('k') ? putkeydump : putline); break; } /* Save current data to a temporary file for a later merge */ fp = ftmp(); fstack[mfct].fp = fp; - append(keylist, nelem, fp, putrec, NULL); + append(keylist, nelem, fp, putrec); mfct++; if (c == EOF) { Index: src/usr.bin/sort/init.c diff -u src/usr.bin/sort/init.c:1.22 src/usr.bin/sort/init.c:1.23 --- src/usr.bin/sort/init.c:1.22 Sat Sep 5 09:16:18 2009 +++ src/usr.bin/sort/init.c Thu Sep 10 22:02:40 2009 @@ -1,4 +1,4 @@ -/* $NetBSD: init.c,v 1.22 2009/09/05 09:16:18 dsl Exp $ */ +/* $NetBSD: init.c,v 1.23 2009/09/10 22:02:40 dsl Exp $ */ /*- * Copyright (c) 2000-2003 The NetBSD Foundation, Inc. @@ -64,7 +64,7 @@ #include "sort.h" #ifndef lint -__RCSID("$NetBSD: init.c,v 1.22 2009/09/05 09:16:18 dsl Exp $"); +__RCSID("$NetBSD: init.c,v 1.23 2009/09/10 22:02:40 dsl Exp $"); __SCCSID("@(#)init.c 8.1 (Berkeley) 6/6/93"); #endif /* not lint */ @@ -72,7 +72,7 @@ #include <string.h> static void insertcol(struct field *); -static const char *setcolumn(const char *, struct field *, int); +static const char *setcolumn(const char *, struct field *); /* * masks of ignored characters. @@ -152,7 +152,7 @@ * interprets a column in a -k field */ static const char * -setcolumn(const char *pos, struct field *cur_fld, int gflag) +setcolumn(const char *pos, struct field *cur_fld) { struct column *col; char *npos; @@ -183,27 +183,27 @@ int setfield(const char *pos, struct field *cur_fld, int gflag) { - int tmp; - cur_fld->mask = NULL; - pos = setcolumn(pos, cur_fld, gflag); + pos = setcolumn(pos, cur_fld); if (*pos == '\0') /* key extends to EOL. */ cur_fld->tcol.num = 0; else { if (*pos != ',') errx(2, "illegal field descriptor"); - setcolumn((++pos), cur_fld, gflag); + setcolumn((++pos), cur_fld); } if (!cur_fld->flags) cur_fld->flags = gflag; - tmp = cur_fld->flags; + if (REVERSE) + /* A local 'r' doesn't invert the global one */ + cur_fld->flags &= ~R; /* Assign appropriate mask table and weight table. */ - cur_fld->weights = weight_tables[tmp & (R | F)]; - if (tmp & I) + cur_fld->weights = weight_tables[cur_fld->flags & (R | F)]; + if (cur_fld->flags & I) cur_fld->mask = itable; - else if (tmp & D) + else if (cur_fld->flags & D) cur_fld->mask = dtable; cur_fld->flags |= (gflag & (BI | BT)); @@ -217,6 +217,7 @@ && cur_fld->tcol.indent != 0 && cur_fld->tcol.indent < cur_fld->icol.indent)) errx(2, "fields out of order"); + insertcol(cur_fld); return (cur_fld->tcol.num); } @@ -319,18 +320,27 @@ * Convert 'ascii' characters into their sort order. * The 'F' variants fold lower case to upper equivalent * The 'R' variants are for reverse sorting. - * The record separator (REC_D) always maps to 0. - * One is reserved for field separators (added when key is generated) - * The field separator (from -t<ch>) map to 1 or 255 (unless SINGL_FLD) + * + * The record separator (REC_D) never needs a weight, this frees one + * byte value as an 'end of key' marker. This must be 0 for normal + * weight tables, and 0xff for reverse weight tables - and is used + * to terminate keys so that short keys sort before (after if reverse) + * longer keys. + * + * The field separator has a normal weight - although it cannot occur + * within a key unless it is the default (space+tab). + * * All other bytes map to the appropriate value for the sort order. * Numeric sorts don't need any tables, they are reversed by negation. * + * Global reverse sorts are done by writing the sorted keys in reverse + * order - the sort itself is stil forwards. + * This means that weights are only ever used when generating keys, any + * sort of the original data bytes is always forwards and unweighted. + * * Note: this is only good for ASCII sorting. For different LC 's, * all bets are off. * - * If SINGL_FLD then the weights have to be applied during the actual sort. - * Otherwise they are applied when the key bytes are built. - * * itable[] and dtable[] are the masks for -i (ignore non-printables) * and -d (only sort blank and alphanumerics). */ @@ -338,21 +348,17 @@ settables(void) { int i; - int next_weight = SINGL_FLD ? 1 : 2; - int rev_weight = SINGL_FLD ? 255 : 254; - int had_field_sep = SINGL_FLD; + int next_weight = 1; + int rev_weight = 254; + + ascii[REC_D] = 0; + Rascii[REC_D] = 255; + Ftable[REC_D] = 0; + RFtable[REC_D] = 255; for (i = 0; i < 256; i++) { - unweighted[i] = i; - if (d_mask[i] & REC_D_F) + if (i == REC_D) continue; - if (!had_field_sep && d_mask[i] & FLD_D) { - /* First/only separator sorts before any data */ - ascii[i] = 1; - Rascii[i] = 255; - had_field_sep = 1; - continue; - } ascii[i] = next_weight; Rascii[i] = rev_weight; if (Ftable[i] == 0) { Index: src/usr.bin/sort/msort.c diff -u src/usr.bin/sort/msort.c:1.25 src/usr.bin/sort/msort.c:1.26 --- src/usr.bin/sort/msort.c:1.25 Sat Sep 5 12:00:25 2009 +++ src/usr.bin/sort/msort.c Thu Sep 10 22:02:40 2009 @@ -1,4 +1,4 @@ -/* $NetBSD: msort.c,v 1.25 2009/09/05 12:00:25 dsl Exp $ */ +/* $NetBSD: msort.c,v 1.26 2009/09/10 22:02:40 dsl Exp $ */ /*- * Copyright (c) 2000-2003 The NetBSD Foundation, Inc. @@ -65,7 +65,7 @@ #include "fsort.h" #ifndef lint -__RCSID("$NetBSD: msort.c,v 1.25 2009/09/05 12:00:25 dsl Exp $"); +__RCSID("$NetBSD: msort.c,v 1.26 2009/09/10 22:02:40 dsl Exp $"); __SCCSID("@(#)msort.c 8.1 (Berkeley) 6/6/93"); #endif /* not lint */ @@ -78,12 +78,10 @@ typedef struct mfile { u_char *end; - short flno; + int flno; RECHEADER rec[1]; } MFILE; -static u_char *wts; - static int cmp(RECHEADER *, RECHEADER *); static int insert(struct mfile **, struct mfile **, int, int); static void merge(int, int, get_func_t, FILE *, put_func_t, struct field *); @@ -96,8 +94,6 @@ int i, j, last; put_func_t put; - wts = ftbl->weights; - while (nfiles) { put = putrec; for (j = 0; j < nfiles; j += MERGE_FNUM) { @@ -278,6 +274,8 @@ * comes here). */ cmpv = tmprec->flno - flist[mid]->flno; + if (REVERSE) + cmpv = -cmpv; } if (cmpv < 0) top = mid; @@ -332,7 +330,6 @@ crec_end = crec->data + DEFLLEN; prec = malloc(offsetof(RECHEADER, data[DEFLLEN])); prec_end = prec->data + DEFLLEN; - wts = ftbl->weights; /* XXX this does exit(0) for overlong lines */ if (get(-1, 0, filelist, 1, prec, prec_end, ftbl) != 0) @@ -365,27 +362,15 @@ static int cmp(RECHEADER *rec1, RECHEADER *rec2) { + int len; int r; - size_t len, i; - u_char *pos1, *pos2; - u_char *cwts; - - if (!SINGL_FLD) - /* key is weights, and is 0x00 terminated */ - return memcmp(rec1->data, rec2->data, rec1->offset); - - /* We have to apply the weights ourselves */ - cwts = wts; - - pos1 = rec1->data; - pos2 = rec2->data; - len = rec1->length; - - for (i = 0; i < len; i++) { - r = cwts[pos1[i]] - cwts[pos2[i]]; - if (r) - return r; - } - return (0); + /* key is weights */ + len = min(rec1->keylen, rec2->keylen); + r = memcmp(rec1->data, rec2->data, len); + if (r == 0) + r = rec1->keylen - rec2->keylen; + if (REVERSE) + r = -r; + return r; } Index: src/usr.bin/sort/radix_sort.c diff -u src/usr.bin/sort/radix_sort.c:1.2 src/usr.bin/sort/radix_sort.c:1.3 --- src/usr.bin/sort/radix_sort.c:1.2 Sat Sep 5 12:00:25 2009 +++ src/usr.bin/sort/radix_sort.c Thu Sep 10 22:02:40 2009 @@ -1,4 +1,4 @@ -/* $NetBSD: radix_sort.c,v 1.2 2009/09/05 12:00:25 dsl Exp $ */ +/* $NetBSD: radix_sort.c,v 1.3 2009/09/10 22:02:40 dsl Exp $ */ /*- * Copyright (c) 1990, 1993 @@ -37,7 +37,7 @@ #if 0 static char sccsid[] = "@(#)radixsort.c 8.2 (Berkeley) 4/28/95"; #else -__RCSID("$NetBSD: radix_sort.c,v 1.2 2009/09/05 12:00:25 dsl Exp $"); +__RCSID("$NetBSD: radix_sort.c,v 1.3 2009/09/10 22:02:40 dsl Exp $"); #endif #endif /* LIBC_SCCS and not lint */ @@ -49,82 +49,96 @@ #include <assert.h> #include <errno.h> +#include <util.h> #include "sort.h" typedef struct { - const RECHEADER **sa; /* Base of saved area */ - int sn; /* Number of entries */ - int si; /* index into data for compare */ + RECHEADER **sa; /* Base of saved area */ + int sn; /* Number of entries */ + int si; /* index into data for compare */ } stack; -static inline int simplesort(const RECHEADER **, int, int, const u_char *, u_int); -static int r_sort_b(const RECHEADER **, - const RECHEADER **, int, int, const u_char *, u_int); +static void simplesort(RECHEADER **, int, int); #define THRESHOLD 20 /* Divert to simplesort(). */ -#define SIZE 512 /* Default stack size. */ #define empty(s) (s >= sp) #define pop(a, n, i) a = (--sp)->sa, n = sp->sn, i = sp->si #define push(a, n, i) sp->sa = a, sp->sn = n, (sp++)->si = i #define swap(a, b, t) t = a, a = b, b = t -int -radix_sort(const RECHEADER **a, const RECHEADER **ta, int n, const u_char *tab, u_int endch) +void +radix_sort(RECHEADER **a, RECHEADER **ta, int n) { - endch = tab[endch]; + u_int count[256], nc, bmin; + u_int c; + RECHEADER **ak, **tai, **lim; + RECHEADER *hdr; + int stack_size = 512; + stack *s, *sp, *sp0, *sp1, temp; + RECHEADER **top[256]; + u_int *cp, bigc; + int data_index = 0; + if (n < THRESHOLD && !DEBUG('r')) { - return simplesort(a, n, 0, tab, endch); + simplesort(a, n, 0); + return; } - return r_sort_b(a, ta, n, 0, tab, endch); -} -static int -r_sort_b(const RECHEADER **a, const RECHEADER **ta, int n, int i, const u_char *tr, u_int endch) -{ - static u_int count[256], nc, bmin; - u_int c; - const RECHEADER **ak, **ai; - stack s[512], *sp, *sp0, *sp1, temp; - const RECHEADER **top[256]; - u_int *cp, bigc; - int nrec = n; + s = emalloc(stack_size * sizeof *s); + memset(&count, 0, sizeof count); + /* Technically 'top' doesn't need zeroing */ + memset(&top, 0, sizeof top); sp = s; - push(a, n, i); + push(a, n, data_index); while (!empty(s)) { - pop(a, n, i); + pop(a, n, data_index); if (n < THRESHOLD && !DEBUG('r')) { - simplesort(a, n, i, tr, endch); + simplesort(a, n, data_index); continue; } - if (nc == 0) { - bmin = 255; - for (ak = a + n; --ak >= a;) { - c = tr[(*ak)->data[i]]; - if (++count[c] == 1 && c != endch) { - if (c < bmin) - bmin = c; - nc++; - } - } - if (sp + nc > s + SIZE) { - r_sort_b(a, ta, n, i, tr, endch); + /* Count number of times each 'next byte' occurs */ + nc = 0; + bmin = 255; + lim = a + n; + for (ak = a, tai = ta; ak < lim; ak++) { + hdr = *ak; + if (data_index >= hdr->keylen) { + /* Short key, copy to start of output */ + if (UNIQUE && a != sp->sa) + /* Stop duplicate being written out */ + hdr->keylen = -1; + *a++ = hdr; + n--; continue; } + /* Save in temp buffer for distribute */ + *tai++ = hdr; + c = hdr->data[data_index]; + if (++count[c] == 1) { + if (c < bmin) + bmin = c; + nc++; + } + } + /* + * We need save the bounds for each 'next byte' that + * occurs more so we can sort each block. + */ + if (sp + nc > s + stack_size) { + stack_size *= 2; + sp1 = erealloc(s, stack_size * sizeof *s); + sp = sp1 + (sp - s); + s = sp1; } + /* Minor optimisation to do the largest set last */ sp0 = sp1 = sp; bigc = 2; - if (endch == 0) { - top[0] = ak = a + count[0]; - count[0] = 0; - } else { - ak = a; - top[255] = a + n; - count[255] = 0; - } + /* Convert 'counts' positions, saving bounds for later sorts */ + ak = a; for (cp = count + bmin; nc > 0; cp++) { while (*cp == 0) cp++; @@ -133,48 +147,69 @@ bigc = c; sp1 = sp; } - push(ak, c, i+1); + push(ak, c, data_index+1); } - top[cp-count] = ak += c; + ak += c; + top[cp-count] = ak; *cp = 0; /* Reset count[]. */ nc--; } swap(*sp0, *sp1, temp); - for (ak = ta + n, ai = a+n; ak > ta;) /* Copy to temp. */ - *--ak = *--ai; for (ak = ta+n; --ak >= ta;) /* Deal to piles. */ - *--top[tr[(*ak)->data[i]]] = *ak; + *--top[(*ak)->data[data_index]] = *ak; } - return nrec; + free(s); } -/* insertion sort */ -static inline int -simplesort(const RECHEADER **a, int n, int b, const u_char *tr, u_int endch) +/* insertion sort, short records are sorted before long ones */ +static void +simplesort(RECHEADER **a, int n, int data_index) { - const RECHEADER **ak, **ai, *tmp; - const RECHEADER **lim = a + n; + RECHEADER **ak, **ai; + RECHEADER *akh; + RECHEADER **lim = a + n; const u_char *s, *t; - u_char ch; + int s_len, t_len; + int i; + int r; if (n <= 1) - return n; + return; for (ak = a+1; ak < lim; ak++) { - for (ai = ak; ai > a; ai--) { - for (s = ai[0]->data + b, t = ai[-1]->data + b; - (ch = tr[*s]) != endch; s++, t++) - if (ch != tr[*t]) + akh = *ak; + s = akh->data; + s_len = akh->keylen; + for (ai = ak; ;) { + ai--; + t = (*ai)->data; + t_len = (*ai)->keylen; + for (i = data_index; ; i++) { + if (i >= s_len || i >= t_len) { + r = s_len - t_len; break; - if (ch >= tr[*t]) { - + } + r = s[i] - t[i]; + if (r != 0) + break; + } + if (r >= 0) { + if (r == 0 && UNIQUE) { + /* Put record below existing */ + ai[1] = ai[0]; + /* Mark so ignored by output() */ + akh->keylen = -1; + } else { + ai++; + } break; } - swap(ai[0], ai[-1], tmp); + ai[1] = ai[0]; + if (ai == a) + break; } + ai[0] = akh; } - - return n; } Index: src/usr.bin/sort/sort.c diff -u src/usr.bin/sort/sort.c:1.54 src/usr.bin/sort/sort.c:1.55 --- src/usr.bin/sort/sort.c:1.54 Sat Sep 5 09:16:18 2009 +++ src/usr.bin/sort/sort.c Thu Sep 10 22:02:40 2009 @@ -1,4 +1,4 @@ -/* $NetBSD: sort.c,v 1.54 2009/09/05 09:16:18 dsl Exp $ */ +/* $NetBSD: sort.c,v 1.55 2009/09/10 22:02:40 dsl Exp $ */ /*- * Copyright (c) 2000-2003 The NetBSD Foundation, Inc. @@ -66,6 +66,7 @@ * a choice of merge sort and radix sort for external sorting. */ +#include <util.h> #include "sort.h" #include "fsort.h" #include "pathnames.h" @@ -76,7 +77,7 @@ #endif /* not lint */ #ifndef lint -__RCSID("$NetBSD: sort.c,v 1.54 2009/09/05 09:16:18 dsl Exp $"); +__RCSID("$NetBSD: sort.c,v 1.55 2009/09/10 22:02:40 dsl Exp $"); __SCCSID("@(#)sort.c 8.1 (Berkeley) 6/6/93"); #endif /* not lint */ @@ -100,8 +101,10 @@ */ u_char *const weight_tables[4] = { ascii, Rascii, Ftable, RFtable }; u_char ascii[NBINS], Rascii[NBINS], RFtable[NBINS], Ftable[NBINS]; -u_char unweighted[NBINS]; + int SINGL_FLD = 0, SEP_FLAG = 0, UNIQUE = 0; +int REVERSE = 0; +int posix_sort; unsigned int debug_flags = 0; @@ -122,7 +125,7 @@ int ch, i, stdinflag = 0; char cflag = 0, mflag = 0; char *outfile, *outpath = 0; - struct field *fldtab, *p; + struct field *fldtab; size_t fldtab_sz, fld_cnt; struct filelist filelist; int num_input_files; @@ -145,7 +148,7 @@ /* fldtab[0] is the global options. */ fldtab_sz = 3; fld_cnt = 0; - fldtab = malloc(fldtab_sz * sizeof(*fldtab)); + fldtab = emalloc(fldtab_sz * sizeof(*fldtab)); memset(fldtab, 0, fldtab_sz * sizeof(*fldtab)); /* Convert "+field" args to -f format */ @@ -166,7 +169,7 @@ for (i = 0; optarg[i]; i++) debug_flags |= 1 << (optarg[i] & 31); break; - case 'd': case 'f': case 'i': case 'n': case 'r': + case 'd': case 'f': case 'i': case 'n': fldtab[0].flags |= optval(ch, 0); break; case 'H': @@ -174,10 +177,7 @@ /* That is now the default. */ break; case 'k': - p = realloc(fldtab, (fldtab_sz + 1) * sizeof(*fldtab)); - if (!p) - err(1, "realloc"); - fldtab = p; + fldtab = erealloc(fldtab, (fldtab_sz + 1) * sizeof(*fldtab)); memset(&fldtab[fldtab_sz], 0, sizeof(fldtab[0])); fldtab_sz++; @@ -189,12 +189,16 @@ case 'o': outpath = optarg; break; + case 'r': + REVERSE = 1; + break; case 's': /* * Nominally 'stable sort', keep lines with equal keys * in input file order. (Default for NetBSD) * (-s for GNU sort compatibility.) */ + posix_sort = 0; break; case 'S': /* @@ -205,6 +209,7 @@ * (using libc radixsort() v sradixsort() doesn't * have the desired effect.) */ + posix_sort = 1; break; case 't': if (SEP_FLAG) @@ -249,6 +254,10 @@ } } + if (UNIQUE) + /* Don't sort on raw record if keys match */ + posix_sort = 0; + if (cflag && argc > optind+1) errx(2, "too many input files for -c option"); if (argc - 2 > optind && !strcmp(argv[argc-2], "-o")) { @@ -277,17 +286,22 @@ err(2, "%s", argv[i]); } - if (!(fldtab[0].flags & (I|D|N) || fldtab[1].icol.num)) { - SINGL_FLD = 1; - fldtab[0].icol.num = 1; - fldtab[0].weights = weight_tables[fldtab->flags & (R | F)]; - } else { - if (!fldtab[1].icol.num) { + if (fldtab[1].icol.num == 0) { + /* No sort key specified */ + if (fldtab[0].flags & (I|D|F|N)) { + /* Modified - generate a key that covers the line */ fldtab[0].flags &= ~(BI|BT); setfield("1", &fldtab[++fld_cnt], fldtab->flags); + fldreset(fldtab); + } else { + /* Unmodified, just compare the line */ + SINGL_FLD = 1; + fldtab[0].icol.num = 1; } + } else { fldreset(fldtab); } + settables(); if (optind == argc) {