Module Name: src
Committed By: dsl
Date: Tue Aug 18 18:00:28 UTC 2009
Modified Files:
src/usr.bin/sort: append.c files.c fsort.c sort.c sort.h
Log Message:
The code that attempted to sort large files by sorting each chunk by the
first key byte and writing to a temp file, then sorting the records from
each temp file that had the same first key byte (and repeating for upto
4 key bytes) was a nice idea, but completely doomed to failure.
Eg PR/9308 where a 70MB file has all but one record the same and short keys.
Not only does the code not work, it is rather guaranteed to be slow.
Instead always use a merge sort for fully sorted chunk of records (each
temporary file contains one lot of sorted records).
The -H option already did this, so just rip out all the code and variables
that can't be used when -H was specified.
Further cleanup to come ...
To generate a diff of this commit:
cvs rdiff -u -r1.17 -r1.18 src/usr.bin/sort/append.c
cvs rdiff -u -r1.33 -r1.34 src/usr.bin/sort/files.c
cvs rdiff -u -r1.36 -r1.37 src/usr.bin/sort/fsort.c
cvs rdiff -u -r1.49 -r1.50 src/usr.bin/sort/sort.c
cvs rdiff -u -r1.22 -r1.23 src/usr.bin/sort/sort.h
Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.
Modified files:
Index: src/usr.bin/sort/append.c
diff -u src/usr.bin/sort/append.c:1.17 src/usr.bin/sort/append.c:1.18
--- src/usr.bin/sort/append.c:1.17 Sun Aug 16 20:02:04 2009
+++ src/usr.bin/sort/append.c Tue Aug 18 18:00:28 2009
@@ -1,4 +1,4 @@
-/* $NetBSD: append.c,v 1.17 2009/08/16 20:02:04 dsl Exp $ */
+/* $NetBSD: append.c,v 1.18 2009/08/18 18:00:28 dsl Exp $ */
/*-
* Copyright (c) 2000-2003 The NetBSD Foundation, Inc.
@@ -64,7 +64,7 @@
#include "sort.h"
#ifndef lint
-__RCSID("$NetBSD: append.c,v 1.17 2009/08/16 20:02:04 dsl Exp $");
+__RCSID("$NetBSD: append.c,v 1.18 2009/08/18 18:00:28 dsl Exp $");
__SCCSID("@(#)append.c 8.1 (Berkeley) 6/6/93");
#endif /* not lint */
@@ -183,38 +183,3 @@
put(crec, fp);
}
}
-
-/*
- * output the already sorted eol bin.
- */
-void
-rd_append(int binno, int infl0, int nfiles, FILE *outfp, u_char *buffer,
- u_char *bufend)
-{
- RECHEADER *rec;
-
- rec = (RECHEADER *) buffer;
- if (!getnext(binno, infl0, NULL, nfiles,
- (RECHEADER *) buffer, bufend, 0)) {
- putline(rec, outfp);
- while (getnext(binno, infl0, NULL, nfiles, (RECHEADER *) buffer,
- bufend, 0) == 0) {
- if (!UNIQUE)
- putline(rec, outfp);
- }
- }
-}
-
-/*
- * append plain text--used after sorting the biggest bin.
- */
-void
-concat(FILE *a, FILE *b)
-{
- int nread;
- char buffer[4096];
-
- rewind(b);
- while ((nread = fread(buffer, 1, 4096, b)) > 0)
- EWRITE(buffer, 1, nread, a);
-}
Index: src/usr.bin/sort/files.c
diff -u src/usr.bin/sort/files.c:1.33 src/usr.bin/sort/files.c:1.34
--- src/usr.bin/sort/files.c:1.33 Sun Aug 16 19:53:43 2009
+++ src/usr.bin/sort/files.c Tue Aug 18 18:00:28 2009
@@ -1,4 +1,4 @@
-/* $NetBSD: files.c,v 1.33 2009/08/16 19:53:43 dsl Exp $ */
+/* $NetBSD: files.c,v 1.34 2009/08/18 18:00:28 dsl Exp $ */
/*-
* Copyright (c) 2000-2003 The NetBSD Foundation, Inc.
@@ -65,7 +65,7 @@
#include "fsort.h"
#ifndef lint
-__RCSID("$NetBSD: files.c,v 1.33 2009/08/16 19:53:43 dsl Exp $");
+__RCSID("$NetBSD: files.c,v 1.34 2009/08/18 18:00:28 dsl Exp $");
__SCCSID("@(#)files.c 8.1 (Berkeley) 6/6/93");
#endif /* not lint */
@@ -74,73 +74,6 @@
static ssize_t seq(FILE *, u_char **);
/*
- * this is the subroutine for file management for fsort().
- * It keeps the buffers for all temporary files.
- */
-int
-getnext(int binno, int infl0, struct filelist *filelist, int nfiles,
- RECHEADER *pos, u_char *end, struct field *dummy)
-{
- int i;
- u_char *hp;
- static size_t nleft = 0;
- static int cnt = 0, flag = -1;
- static u_char maxb = 0;
- static FILE *fp;
-
- if (nleft == 0) {
- if (binno < 0) /* reset files. */ {
- for (i = 0; i < nfiles; i++) {
- rewind(fstack[infl0 + i].fp);
- fstack[infl0 + i].max_o = 0;
- }
- flag = -1;
- nleft = cnt = 0;
- return (-1);
- }
- maxb = fstack[infl0].maxb;
- for (; nleft == 0; cnt++) {
- if (cnt >= nfiles) {
- cnt = 0;
- return (EOF);
- }
- fp = fstack[infl0 + cnt].fp;
- fread(&nleft, sizeof(nleft), 1, fp);
- if (binno < maxb)
- fstack[infl0+cnt].max_o
- += sizeof(nleft) + nleft;
- else if (binno == maxb) {
- if (binno != fstack[infl0].lastb) {
- fseeko(fp, fstack[infl0+
- cnt].max_o, SEEK_SET);
- fread(&nleft, sizeof(nleft), 1, fp);
- }
- if (nleft == 0)
- fclose(fp);
- } else if (binno == maxb + 1) { /* skip a bin */
- fseek(fp, nleft, SEEK_CUR);
- fread(&nleft, sizeof(nleft), 1, fp);
- flag = cnt;
- }
- }
- }
- if ((u_char *) pos > end - REC_DATA_OFFSET)
- return (BUFFEND);
- fread(pos, REC_DATA_OFFSET, 1, fp);
- if (end - pos->data < (ptrdiff_t)pos->length) {
- hp = ((u_char *)pos) + REC_DATA_OFFSET;
- for (i = REC_DATA_OFFSET; i ; i--)
- ungetc(*--hp, fp);
- return (BUFFEND);
- }
- fread(pos->data, pos->length, 1, fp);
- nleft -= pos->length + REC_DATA_OFFSET;
- if (nleft == 0 && binno == fstack[infl0].maxb)
- fclose(fp);
- return (0);
-}
-
-/*
* this is called when there is no special key. It's only called
* in the first fsort pass.
*/
Index: src/usr.bin/sort/fsort.c
diff -u src/usr.bin/sort/fsort.c:1.36 src/usr.bin/sort/fsort.c:1.37
--- src/usr.bin/sort/fsort.c:1.36 Sun Aug 16 20:02:04 2009
+++ src/usr.bin/sort/fsort.c Tue Aug 18 18:00:28 2009
@@ -1,4 +1,4 @@
-/* $NetBSD: fsort.c,v 1.36 2009/08/16 20:02:04 dsl Exp $ */
+/* $NetBSD: fsort.c,v 1.37 2009/08/18 18:00:28 dsl Exp $ */
/*-
* Copyright (c) 2000-2003 The NetBSD Foundation, Inc.
@@ -62,17 +62,17 @@
*/
/*
- * Read in the next bin. If it fits in one segment sort it;
- * otherwise refine it by segment deeper by one character,
- * and try again on smaller bins. Sort the final bin at this level
- * of recursion to keep the head of fstack at 0.
- * After PANIC passes, abort to merge sort.
+ * Read in a block of records (until 'enough').
+ * sort, write to temp file.
+ * Merge sort temp files into output file
+ * Small files miss out the temp file stage.
+ * Large files might get multiple merges.
*/
#include "sort.h"
#include "fsort.h"
#ifndef lint
-__RCSID("$NetBSD: fsort.c,v 1.36 2009/08/16 20:02:04 dsl Exp $");
+__RCSID("$NetBSD: fsort.c,v 1.37 2009/08/18 18:00:28 dsl Exp $");
__SCCSID("@(#)fsort.c 8.1 (Berkeley) 6/6/93");
#endif /* not lint */
@@ -100,17 +100,13 @@
const u_char **keypos;
u_char *bufend;
u_char *weights;
- int ntfiles, mfct = 0, total, i, maxb, lastb, panic = 0;
- int c, nelem, base;
- long sizes [NBINS + 1];
+ int ntfiles, mfct = 0;
+ int c, nelem;
get_func_t get;
struct recheader *crec;
struct field tfield[2];
- FILE *prevfp, *tailfp[FSORTMAX + 1];
u_char *nbuffer;
- memset(tailfp, 0, sizeof(tailfp));
- prevfp = outfp;
memset(tfield, 0, sizeof(tfield));
if (ftbl[0].flags & R)
tfield[0].weights = Rascii;
@@ -124,256 +120,113 @@
memset(keylist, 0, MAXNUM * sizeof(u_char *));
}
bufend = buffer + bufsize;
- if (binno >= 0) {
- base = top + nfiles;
- get = getnext;
- } else {
- base = 0;
- if (SINGL_FLD)
- get = makeline;
- else
- get = makekey;
- }
- for (;;) {
- memset(sizes, 0, sizeof(sizes));
- c = nelem = ntfiles = 0; /* XXXGCC -Wuninitialized m68k */
- if (binno == weights[REC_D] &&
- !(SINGL_FLD && ftbl[0].flags & F)) { /* pop */
- rd_append(weights[REC_D], top,
- nfiles, prevfp, buffer, bufend);
- break;
- } else if (binno == weights[REC_D]) {
- depth = 0; /* start over on flat weights */
- ftbl = tfield;
- weights = ftbl[0].weights;
- }
- keypos = keylist; /* XXXGCC -Wuninitialized m68k */
- crec = (RECHEADER *) buffer; /* XXXGCC -Wuninitialized m68k */
- while (c != EOF) {
- keypos = keylist;
- nelem = 0;
- crec = (RECHEADER *) buffer;
-
- do_read:
- while ((c = get(binno, top, filelist, nfiles, crec,
- bufend, ftbl)) == 0) {
- *keypos++ = crec->data + depth;
- if (++nelem == MAXNUM) {
- c = BUFFEND;
- break;
- }
- crec =(RECHEADER *)((char *) crec +
- SALIGN(crec->length) + REC_DATA_OFFSET);
- }
-
- if (c == BUFFEND && nelem < MAXNUM
- && bufsize < MAXBUFSIZE) {
- const u_char **keyp;
- u_char *oldb = buffer;
-
- /* buffer was too small for data, allocate
- * bigger buffer */
- nbuffer = realloc(buffer, bufsize * 2);
- if (!nbuffer) {
- err(2, "failed to realloc buffer to %lu bytes",
- (unsigned long) bufsize * 2);
- }
- buffer = nbuffer;
- bufsize *= 2;
- bufend = buffer + bufsize;
-
- /* patch up keylist[] */
- for (keyp = &keypos[-1]; keyp >= keylist; keyp--)
- *keyp = buffer + (*keyp - oldb);
-
- crec = (RECHEADER *) (buffer + ((u_char *)crec - oldb));
- goto do_read;
- }
-
- if (c != BUFFEND && !ntfiles && !mfct) {
- /* do not push */
- continue;
- }
+ if (SINGL_FLD)
+ get = makeline;
+ else
+ get = makekey;
- /* push */
- if (panic >= PANIC) {
- fstack[MSTART + mfct].fp = ftmp();
- if ((stable_sort)
- ? sradixsort(keylist, nelem,
- weights, REC_D)
- : radixsort(keylist, nelem,
- weights, REC_D) )
- err(2, NULL);
- append(keylist, nelem, depth,
- fstack[MSTART + mfct].fp, putrec,
- ftbl);
- mfct++;
- /* reduce number of open files */
- if (mfct == MERGE_FNUM ||(c == EOF && ntfiles)) {
- /*
- * Only copy extra incomplete crec
- * data if there are any.
- */
- int nodata = (bufend >= (u_char *)crec
- && bufend <= crec->data);
- size_t sz=0;
- u_char *tmpbuf=NULL;
-
- if (!nodata) {
- sz = bufend - crec->data;
- tmpbuf = malloc(sz);
- memmove(tmpbuf, crec->data, sz);
- }
-
- CHECKFSTACK(base + ntfiles);
- fstack[base + ntfiles].fp = ftmp();
- fmerge(0, MSTART, filelist,
- mfct, geteasy,
- fstack[base + ntfiles].fp,
- putrec, ftbl);
- ntfiles++;
- mfct = 0;
-
- if (!nodata) {
- memmove(crec->data, tmpbuf, sz);
- free(tmpbuf);
- }
- }
- } else {
- CHECKFSTACK(base + ntfiles);
- fstack[base + ntfiles].fp = ftmp();
- onepass(keylist, depth, nelem, sizes,
- weights, fstack[base + ntfiles].fp);
- ntfiles++;
+ c = nelem = ntfiles = 0; /* XXXGCC -Wuninitialized m68k */
+ keypos = keylist; /* XXXGCC -Wuninitialized m68k */
+ crec = (RECHEADER *) buffer; /* XXXGCC -Wuninitialized m68k */
+ while (c != EOF) {
+ keypos = keylist;
+ nelem = 0;
+ crec = (RECHEADER *) buffer;
+
+ do_read:
+ while ((c = get(-1, top, filelist, nfiles, crec,
+ bufend, ftbl)) == 0) {
+ *keypos++ = crec->data + depth;
+ if (++nelem == MAXNUM) {
+ c = BUFFEND;
+ break;
}
+ crec =(RECHEADER *)((char *) crec +
+ SALIGN(crec->length) + REC_DATA_OFFSET);
}
- if (!ntfiles && !mfct) { /* everything in memory--pop */
- if (nelem > 1
- && ((stable_sort)
- ? sradixsort(keylist, nelem, weights, REC_D)
- : radixsort(keylist, nelem, weights, REC_D) ))
- err(2, NULL);
- if (nelem > 0)
- append(keylist, nelem, depth, outfp, putline, ftbl);
- break; /* pop */
- }
- if (panic >= PANIC) {
- if (!ntfiles)
- fmerge(0, MSTART, filelist, mfct, geteasy,
- outfp, putline, ftbl);
- else
- fmerge(0, base, filelist, ntfiles, geteasy,
- outfp, putline, ftbl);
- break;
-
- }
- total = maxb = lastb = 0; /* find if one bin dominates */
- for (i = 0; i < NBINS; i++)
- if (sizes[i]) {
- if (sizes[i] > sizes[maxb])
- maxb = i;
- lastb = i;
- total += sizes[i];
+
+ if (c == BUFFEND && nelem < MAXNUM
+ && bufsize < MAXBUFSIZE) {
+ const u_char **keyp;
+ u_char *oldb = buffer;
+
+ /* buffer was too small for data, allocate
+ * bigger buffer */
+ nbuffer = realloc(buffer, bufsize * 2);
+ if (!nbuffer) {
+ err(2, "failed to realloc buffer to %lu bytes",
+ (unsigned long) bufsize * 2);
}
- if (sizes[maxb] < max((total / 2) , BUFSIZE))
- maxb = lastb; /* otherwise pop after last bin */
- fstack[base].lastb = lastb;
- fstack[base].maxb = maxb;
-
- /* start refining next level. */
- getnext(-1, base, NULL, ntfiles, crec, bufend, 0); /* rewind */
- for (i = 0; i < maxb; i++) {
- if (!sizes[i]) /* bin empty; step ahead file offset */
- getnext(i, base, NULL,ntfiles, crec, bufend, 0);
- else
- fsort(i, depth + 1, base, filelist, ntfiles,
- outfp, ftbl);
- }
+ buffer = nbuffer;
+ bufsize *= 2;
+ bufend = buffer + bufsize;
- get = getnext;
+ /* patch up keylist[] */
+ for (keyp = &keypos[-1]; keyp >= keylist; keyp--)
+ *keyp = buffer + (*keyp - oldb);
- if (lastb != maxb) {
- if (prevfp != outfp)
- tailfp[panic] = prevfp;
- prevfp = ftmp();
- for (i = maxb + 1; i <= lastb; i++)
- if (!sizes[i])
- getnext(i, base, NULL, ntfiles, crec,
- bufend,0);
- else
- fsort(i, depth + 1, base, filelist,
- ntfiles, prevfp, ftbl);
+ crec = (RECHEADER *) (buffer + ((u_char *)crec - oldb));
+ goto do_read;
}
- /* sort biggest (or last) bin at this level */
- depth++;
- panic++;
- binno = maxb;
- top = base;
- nfiles = ntfiles; /* so overwrite them */
- }
- if (prevfp != outfp) {
- concat(outfp, prevfp);
- fclose(prevfp);
- }
- for (i = panic; i >= 0; --i)
- if (tailfp[i]) {
- concat(outfp, tailfp[i]);
- fclose(tailfp[i]);
+ if (c != BUFFEND && !ntfiles && !mfct) {
+ /* do not push */
+ continue;
}
- /* If on top level, free our structures */
- if (depth == 0) {
- free(keylist), keylist = NULL;
- free(buffer), buffer = NULL;
- }
-}
+ /* push */
+ fstack[MSTART + mfct].fp = ftmp();
+ if (radix_sort(keylist, nelem, weights, REC_D))
+ err(2, NULL);
+ append(keylist, nelem, depth, fstack[MSTART + mfct].fp, putrec,
+ ftbl);
+ mfct++;
+ /* reduce number of open files */
+ if (mfct == MERGE_FNUM ||(c == EOF && ntfiles)) {
+ /*
+ * Only copy extra incomplete crec
+ * data if there are any.
+ */
+ int nodata = (bufend >= (u_char *)crec
+ && bufend <= crec->data);
+ size_t sz=0;
+ u_char *tmpbuf=NULL;
+
+ if (!nodata) {
+ sz = bufend - crec->data;
+ tmpbuf = malloc(sz);
+ memmove(tmpbuf, crec->data, sz);
+ }
-/*
- * This is one pass of radix exchange, dumping the bins to disk.
- */
-#define swap(a, b, t) t = a, a = b, b = t
-void
-onepass(const u_char **a, int depth, long n, long sizes[], u_char *tr, FILE *fp)
-{
- size_t tsizes[NBINS + 1];
- const u_char **bin[257], ***bp, ***bpmax, **top[256], ***tp;
- static int histo[256];
- int *hp;
- int c;
- int hdr_off;
- const u_char **an, *t, **aj;
- const u_char **ak, *r;
-
- memset(tsizes, 0, sizeof(tsizes));
- hdr_off = REC_DATA_OFFSET + depth;
- an = &a[n];
- for (ak = a; ak < an; ak++) {
- histo[c = tr[**ak]]++;
- tsizes[c] += ((const RECHEADER *) (*ak -= hdr_off))->length;
+ CHECKFSTACK(ntfiles);
+ fstack[ntfiles].fp = ftmp();
+ fmerge(0, MSTART, filelist, mfct, geteasy,
+ fstack[ntfiles].fp, putrec, ftbl);
+ ntfiles++;
+ mfct = 0;
+
+ if (!nodata) {
+ memmove(crec->data, tmpbuf, sz);
+ free(tmpbuf);
+ }
+ }
}
- bin[0] = a;
- bpmax = bin + 256;
- tp = top, hp = histo;
- for (bp = bin; bp < bpmax; bp++) {
- *tp++ = *(bp + 1) = *bp + (c = *hp);
- *hp++ = 0;
- if (c <= 1)
- continue;
- }
- for (aj = a; aj < an; *aj = r, aj = bin[c + 1])
- for (r = *aj; aj < (ak = --top[c = tr[r[hdr_off]]]) ;)
- swap(*ak, r, t);
-
- for (ak = a, c = 0; c < 256; c++) {
- an = bin[c + 1];
- n = an - ak;
- tsizes[c] += n * REC_DATA_OFFSET;
- /* tell getnext how many elements in this bin, this segment. */
- EWRITE(&tsizes[c], sizeof(size_t), 1, fp);
- sizes[c] += tsizes[c];
- for (; ak < an; ++ak)
- putrec((const RECHEADER *) *ak, fp);
- }
+ if (!ntfiles && !mfct) { /* everything in memory--pop */
+ if (nelem > 1 && radix_sort(keylist, nelem, weights, REC_D))
+ err(2, NULL);
+ if (nelem > 0)
+ append(keylist, nelem, depth, outfp, putline, ftbl);
+ }
+ if (!ntfiles)
+ fmerge(0, MSTART, filelist, mfct, geteasy,
+ outfp, putline, ftbl);
+ else
+ fmerge(0, 0, filelist, ntfiles, geteasy,
+ outfp, putline, ftbl);
+
+ free(keylist);
+ keylist = NULL;
+ free(buffer);
+ buffer = NULL;
}
Index: src/usr.bin/sort/sort.c
diff -u src/usr.bin/sort/sort.c:1.49 src/usr.bin/sort/sort.c:1.50
--- src/usr.bin/sort/sort.c:1.49 Sat Aug 15 09:48:46 2009
+++ src/usr.bin/sort/sort.c Tue Aug 18 18:00:28 2009
@@ -1,4 +1,4 @@
-/* $NetBSD: sort.c,v 1.49 2009/08/15 09:48:46 dsl Exp $ */
+/* $NetBSD: sort.c,v 1.50 2009/08/18 18:00:28 dsl Exp $ */
/*-
* Copyright (c) 2000-2003 The NetBSD Foundation, Inc.
@@ -76,7 +76,7 @@
#endif /* not lint */
#ifndef lint
-__RCSID("$NetBSD: sort.c,v 1.49 2009/08/15 09:48:46 dsl Exp $");
+__RCSID("$NetBSD: sort.c,v 1.50 2009/08/18 18:00:28 dsl Exp $");
__SCCSID("@(#)sort.c 8.1 (Berkeley) 6/6/93");
#endif /* not lint */
@@ -105,6 +105,7 @@
* Default to stable sort.
*/
int stable_sort = 1;
+int (*radix_sort)(const u_char **, int, const u_char *, u_int) = sradixsort;
static char toutpath[MAXPATHLEN];
@@ -169,7 +170,8 @@
fldtab->flags |= tmp;
break;
case 'H':
- PANIC = 0;
+ /* -H was ; use merge sort for blocks of large files' */
+ /* That is now the default. */
break;
case 'k':
p = realloc(fldtab, (fldtab_sz + 1) * sizeof(*fldtab));
@@ -191,9 +193,11 @@
case 's':
/* for GNU sort compatibility (this is our default) */
stable_sort = 1;
+ radix_sort = radixsort;
break;
case 'S':
stable_sort = 0;
+ radix_sort = sradixsort;
break;
case 't':
if (SEP_FLAG)
Index: src/usr.bin/sort/sort.h
diff -u src/usr.bin/sort/sort.h:1.22 src/usr.bin/sort/sort.h:1.23
--- src/usr.bin/sort/sort.h:1.22 Sun Aug 16 19:53:43 2009
+++ src/usr.bin/sort/sort.h Tue Aug 18 18:00:28 2009
@@ -1,4 +1,4 @@
-/* $NetBSD: sort.h,v 1.22 2009/08/16 19:53:43 dsl Exp $ */
+/* $NetBSD: sort.h,v 1.23 2009/08/18 18:00:28 dsl Exp $ */
/*-
* Copyright (c) 2000-2003 The NetBSD Foundation, Inc.
@@ -160,13 +160,13 @@
RECHEADER *, u_char *, struct field *);
typedef void (*put_func_t)(const struct recheader *, FILE *);
-extern int PANIC; /* maximum depth of fsort before fmerge is called */
extern u_char ascii[NBINS], Rascii[NBINS], Ftable[NBINS], RFtable[NBINS];
extern u_char d_mask[NBINS];
extern int SINGL_FLD, SEP_FLAG, UNIQUE;
extern int REC_D;
extern const char *tmpdir;
extern int stable_sort;
+extern int (*radix_sort)(const u_char **, int, const u_char *, u_int);
extern u_char gweights[NBINS];
extern struct coldesc *clist;
extern int ncols;
@@ -184,8 +184,6 @@
struct field *);
int geteasy(int, int, struct filelist *,
int, RECHEADER *, u_char *, struct field *);
-int getnext(int, int, struct filelist *,
- int, RECHEADER *, u_char *, struct field *);
int makekey(int, int, struct filelist *,
int, RECHEADER *, u_char *, struct field *);
int makeline(int, int, struct filelist *,