Module Name: src Committed By: dsl Date: Fri Oct 9 20:29:44 UTC 2009
Modified Files: src/usr.bin/sort: msort.c Log Message: When we need to merge more than 16 files, do them in a hierarchy. Reduces the amount of data written to temporary files. The 3-level stack has to do a simple reduce after 4352 input files, for a normal file sort this is 35GB of data or about 500 million records. This needs about 50 open fd's - which should be ok. Clearly the merge sort could process more input files in one go - speeding up the sort, but at some point the number of input files would exceed whatever limit was applied. To generate a diff of this commit: cvs rdiff -u -r1.27 -r1.28 src/usr.bin/sort/msort.c Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files.
Modified files: Index: src/usr.bin/sort/msort.c diff -u src/usr.bin/sort/msort.c:1.27 src/usr.bin/sort/msort.c:1.28 --- src/usr.bin/sort/msort.c:1.27 Sat Sep 26 21:16:55 2009 +++ src/usr.bin/sort/msort.c Fri Oct 9 20:29:43 2009 @@ -1,4 +1,4 @@ -/* $NetBSD: msort.c,v 1.27 2009/09/26 21:16:55 dsl Exp $ */ +/* $NetBSD: msort.c,v 1.28 2009/10/09 20:29:43 dsl Exp $ */ /*- * Copyright (c) 2000-2003 The NetBSD Foundation, Inc. @@ -65,7 +65,7 @@ #include "fsort.h" #ifndef lint -__RCSID("$NetBSD: msort.c,v 1.27 2009/09/26 21:16:55 dsl Exp $"); +__RCSID("$NetBSD: msort.c,v 1.28 2009/10/09 20:29:43 dsl Exp $"); __SCCSID("@(#)msort.c 8.1 (Berkeley) 6/6/93"); #endif /* not lint */ @@ -86,29 +86,49 @@ static int cmp(RECHEADER *, RECHEADER *); static int insert(struct mfile **, struct mfile *, int, int); +static void merge_sort_fstack(FILE *, put_func_t, struct field *); /* * Number of files merge() can merge in one pass. - * This should be power of two so that it's possible to use this value - * for rouding. */ #define MERGE_FNUM 16 static struct mfile fstack[MERGE_FNUM]; -static int fstack_count; +static struct mfile fstack_1[MERGE_FNUM]; +static struct mfile fstack_2[MERGE_FNUM]; +static int fstack_count, fstack_1_count, fstack_2_count; void save_for_merge(FILE *fp, get_func_t get, struct field *ftbl) { - FILE *mfp; + FILE *mfp, *mfp1, *mfp2; if (fstack_count == MERGE_FNUM) { /* Must reduce the number of temporary files */ mfp = ftmp(); - merge_sort(mfp, putrec, ftbl); - fstack[0].fp = mfp; - fstack[0].get = geteasy; - fstack_count = 1; + merge_sort_fstack(mfp, putrec, ftbl); + /* Save output in next layer */ + if (fstack_1_count == MERGE_FNUM) { + mfp1 = ftmp(); + memcpy(fstack, fstack_1, sizeof fstack); + merge_sort_fstack(mfp1, putrec, ftbl); + if (fstack_2_count == MERGE_FNUM) { + /* More than 4096 files! */ + mfp2 = ftmp(); + memcpy(fstack, fstack_2, sizeof fstack); + merge_sort_fstack(mfp2, putrec, ftbl); + fstack_2[0].fp = mfp2; + fstack_2_count = 1; + } + fstack_2[fstack_2_count].fp = mfp1; + fstack_2[fstack_2_count].get = geteasy; + fstack_2_count++; + fstack_1_count = 0; + } + fstack_1[fstack_1_count].fp = mfp; + fstack_1[fstack_1_count].get = geteasy; + fstack_1_count++; + fstack_count = 0; } fstack[fstack_count].fp = fp; @@ -135,6 +155,49 @@ void merge_sort(FILE *outfp, put_func_t put, struct field *ftbl) { + int count = fstack_1_count + fstack_2_count; + FILE *mfp; + int i; + + if (count == 0) { + /* All files in initial array */ + merge_sort_fstack(outfp, put, ftbl); + return; + } + + count += fstack_count; + + /* Too many files for one merge sort */ + for (;;) { + /* Sort latest 16 files */ + i = count; + if (i > MERGE_FNUM) + i = MERGE_FNUM; + while (fstack_count > 0) + fstack[--i] = fstack[--fstack_count]; + while (i > 0 && fstack_1_count > 0) + fstack[--i] = fstack_1[--fstack_1_count]; + while (i > 0) + fstack[--i] = fstack_2[--fstack_2_count]; + if (count <= MERGE_FNUM) { + /* Got all the data */ + fstack_count = count; + merge_sort_fstack(outfp, put, ftbl); + return; + } + mfp = ftmp(); + fstack_count = count > MERGE_FNUM ? MERGE_FNUM : count; + merge_sort_fstack(mfp, putrec, ftbl); + fstack[0].fp = mfp; + fstack[0].get = geteasy; + fstack_count = 1; + count -= MERGE_FNUM - 1; + } +} + +static void +merge_sort_fstack(FILE *outfp, put_func_t put, struct field *ftbl) +{ struct mfile *flistb[MERGE_FNUM], **flist = flistb, *cfile; RECHEADER *new_rec; u_char *new_end;