Hi,
I had another look at this and thought of a few improvements I'd like
to include.
I added an rlimit check for open file descriptors. I also added some
clearer error messages and a few more tests.
Thanks,
Bo
From 093c965cbf5458c05c841ce1269499bd08aeff16 Mon Sep 17 00:00:00 2001
From: Bo Borgerson <[EMAIL PROTECTED]>
Date: Sat, 5 Apr 2008 13:33:51 -0400
Subject: [PATCH] Added --batch-size=NMERGE option to sort
* src/sort.c: (static unsigned int nmerge) Replace constant NMERGE.
(static void specify_nmerge) Validate and apply new option.
(mergefps) Replace some arrays with pointers to xnmalloc'd storage.
* tests/misc/sort-merge: Test new option.
* doc/coreutils.texi: Describe new option.
* NEWS: Advertise new option.
Signed-off-by: Bo Borgerson <[EMAIL PROTECTED]>
---
NEWS | 4 ++
doc/coreutils.texi | 21 +++++++++
src/sort.c | 116 +++++++++++++++++++++++++++++++++++++++++++------
tests/misc/sort-merge | 45 ++++++++++++++++++-
4 files changed, 170 insertions(+), 16 deletions(-)
diff --git a/NEWS b/NEWS
index c05e0ad..c8b727c 100644
--- a/NEWS
+++ b/NEWS
@@ -50,6 +50,10 @@ GNU coreutils NEWS -*- outline -*-
options --general-numeric-sort/-g, --month-sort/-M, --numeric-sort/-n
and --random-sort/-R, resp.
+ sort accepts a new option --batch-size=NMERGE, where NMERGE
+ represents the maximum number of inputs that will be merged at once.
+ When more than NMERGE inputs are present temporary files are used.
+
** Improvements
id and groups work around an AFS-related bug whereby those programs
diff --git a/doc/coreutils.texi b/doc/coreutils.texi
index ee7dbb2..9a2ee7f 100644
--- a/doc/coreutils.texi
+++ b/doc/coreutils.texi
@@ -3690,6 +3690,27 @@ multiple fields.
Example: To sort on the second field, use @option{--key=2,2}
(@option{-k 2,2}). See below for more examples.
[EMAIL PROTECTED] [EMAIL PROTECTED]
[EMAIL PROTECTED] --batch-size
[EMAIL PROTECTED] number of inputs to merge, nmerge
+Merge at most @var{nmerge} inputs at once.
+
+If more than @var{nmerge} inputs are to be merged, then temporary files
+will be used.
+
+A large value of @var{nmerge} may improve merge performance and decrease
+temporary storage utilization at the expense of increased memory usage
+and I/0. Conversely a small value of @var{nmerge} may reduce memory
+requirements and I/0 at the expense of temporary storage consumption and
+merge performance.
+
+The value of @var{nmerge} must be at least 2.
+
+The value of @var{nmerge} may be bounded by a resource limit for open
+file descriptors. Try @samp{ulimit -n} to check for such a limit. If
+the value of @var{nmerge} exceeds this limit, then @command{sort} will
+issue a warning to standard error and exit with a nonzero status.
+
@item -o @var{output-file}
@itemx [EMAIL PROTECTED]
@opindex -o
diff --git a/src/sort.c b/src/sort.c
index 8b2eec5..8905420 100644
--- a/src/sort.c
+++ b/src/sort.c
@@ -223,13 +223,16 @@ static struct month monthtab[] =
};
/* During the merge phase, the number of files to merge at once. */
-#define NMERGE 16
+#define NMERGE_DEFAULT 16
/* Minimum size for a merge or check buffer. */
#define MIN_MERGE_BUFFER_SIZE (2 + sizeof (struct line))
/* Minimum sort size; the code might not work with smaller sizes. */
-#define MIN_SORT_SIZE (NMERGE * MIN_MERGE_BUFFER_SIZE)
+#define MIN_SORT_SIZE (nmerge * MIN_MERGE_BUFFER_SIZE)
+
+/* Maximum merge buffers we can theoretically support */
+#define MAX_NMERGE (SIZE_MAX / MIN_MERGE_BUFFER_SIZE)
/* The number of bytes needed for a merge or check buffer, which can
function relatively efficiently even if it holds only one line. If
@@ -281,6 +284,10 @@ static struct keyfield *keylist;
/* Program used to (de)compress temp files. Must accept -d. */
static char const *compress_program;
+/* Maximum number of files to merge in one go. If more than this
+ number are present, temp files will be used. */
+static unsigned int nmerge = NMERGE_DEFAULT;
+
static void sortlines_temp (struct line *, size_t, struct line *);
/* Report MESSAGE for FILE, then clean up and exit.
@@ -344,6 +351,8 @@ Other options:\n\
decompress them with PROG -d\n\
-k, --key=POS1[,POS2] start a key at POS1, end it at POS2 (origin 1)\n\
-m, --merge merge already sorted files; do not sort\n\
+ --batch-size=NMERGE merge at most NMERGE inputs at once;\n\
+ for more use temp files\n\
"), stdout);
fputs (_("\
-o, --output=FILE write result to FILE instead of standard output\n\
@@ -395,7 +404,8 @@ enum
CHECK_OPTION = CHAR_MAX + 1,
COMPRESS_PROGRAM_OPTION,
RANDOM_SOURCE_OPTION,
- SORT_OPTION
+ SORT_OPTION,
+ NMERGE_OPTION
};
static char const short_options[] = "-bcCdfgik:mMno:rRsS:t:T:uy:z";
@@ -419,6 +429,7 @@ static struct option const long_options[] =
{"output", required_argument, NULL, 'o'},
{"reverse", no_argument, NULL, 'r'},
{"stable", no_argument, NULL, 's'},
+ {"batch-size", required_argument, NULL, NMERGE_OPTION},
{"buffer-size", required_argument, NULL, 'S'},
{"field-separator", required_argument, NULL, 't'},
{"temporary-directory", required_argument, NULL, 'T'},
@@ -1045,6 +1056,71 @@ inittables (void)
#endif
}
+/* Specify how many inputs may be merged at once.
+ This may be set on the command-line with the
+ --batch-size option. */
+static void
+specify_nmerge (int oi, char c, char const *s)
+{
+ uintmax_t n;
+ struct rlimit rlimit;
+ unsigned int max_nmerge = MAX_NMERGE;
+ char max_nmerge_buf[INT_BUFSIZE_BOUND (unsigned int)];
+ enum strtol_error e = xstrtoumax (s, NULL, 10, &n, NULL);
+
+ /* Try to find out how many file descriptors we'll be able
+ to open. We need at least nmerge + 3 (STDIN_FILENO,
+ STDOUT_FILENO and STDERR_FILENO). */
+ if (getrlimit (RLIMIT_NOFILE, &rlimit) == 0)
+ max_nmerge = MIN (max_nmerge, rlimit.rlim_cur - 3);
+
+ if (e == LONGINT_OK)
+ {
+ nmerge = n;
+ if (nmerge == n)
+ {
+ if (nmerge < 2)
+ {
+ error (0, 0, _("invalid --%s argument %s"),
+ long_options[oi].name, quote(s));
+ error (SORT_FAILURE, 0,
+ _("minimum --%s argument is %s"),
+ long_options[oi].name, quote("2"));
+ }
+ else if (nmerge > max_nmerge)
+ {
+ e = LONGINT_OVERFLOW;
+ }
+ else
+ {
+ /* Need to re-check that we meet the minimum
+ requirement for memory usage with the new,
+ potentially larger, nmerge. */
+ sort_size = MAX (sort_size, MIN_SORT_SIZE);
+
+ return;
+ }
+ }
+ else
+ e = LONGINT_OVERFLOW;
+ }
+
+ if (e == LONGINT_OVERFLOW)
+ {
+
+ sprintf (max_nmerge_buf, "%u", max_nmerge);
+
+ error (0, 0, _("--%s argument %s too large"),
+ long_options[oi].name, quote(s));
+
+ error (SORT_FAILURE, 0,
+ _("maximum --%s argument with current rlimit is %s"),
+ long_options[oi].name, quote (max_nmerge_buf));
+ }
+ else
+ xstrtol_fatal (e, oi, c, long_options, s);
+}
+
/* Specify the amount of main memory to use when sorting. */
static void
specify_sort_size (int oi, char c, char const *s)
@@ -2029,15 +2105,20 @@ static void
mergefps (struct sortfile *files, size_t ntemps, size_t nfiles,
FILE *ofp, char const *output_file)
{
- FILE *fps[NMERGE]; /* Input streams for each file. */
- struct buffer buffer[NMERGE]; /* Input buffers for each file. */
+ FILE **fps = xnmalloc (nmerge, sizeof *fps);
+ /* Input streams for each file. */
+ struct buffer *buffer = xnmalloc (nmerge, sizeof *buffer);
+ /* Input buffers for each file. */
struct line saved; /* Saved line storage for unique check. */
struct line const *savedline = NULL;
/* &saved if there is a saved line. */
size_t savealloc = 0; /* Size allocated for the saved line. */
- struct line const *cur[NMERGE]; /* Current line in each line table. */
- struct line const *base[NMERGE]; /* Base of each line table. */
- size_t ord[NMERGE]; /* Table representing a permutation of fps,
+ struct line const **cur = xnmalloc (nmerge, sizeof *cur);
+ /* Current line in each line table. */
+ struct line const **base = xnmalloc (nmerge, sizeof *base);
+ /* Base of each line table. */
+ size_t *ord = xnmalloc (nmerge, sizeof *ord);
+ /* Table representing a permutation of fps,
such that cur[ord[0]] is the smallest line
and will be next output. */
size_t i;
@@ -2210,6 +2291,11 @@ mergefps (struct sortfile *files, size_t ntemps, size_t nfiles,
}
xfclose (ofp, output_file);
+ free(fps);
+ free(buffer);
+ free(ord);
+ free(base);
+ free(cur);
}
/* Merge into T the two sorted arrays of lines LO (with NLO members)
@@ -2397,7 +2483,7 @@ static void
merge (struct sortfile *files, size_t ntemps, size_t nfiles,
char const *output_file)
{
- while (NMERGE < nfiles)
+ while (nmerge < nfiles)
{
/* Number of input files processed so far. */
size_t in;
@@ -2413,20 +2499,20 @@ merge (struct sortfile *files, size_t ntemps, size_t nfiles,
size_t cheap_slots;
/* Do as many NMERGE-size merges as possible. */
- for (out = in = 0; out < nfiles / NMERGE; out++, in += NMERGE)
+ for (out = in = 0; out < nfiles / nmerge; out++, in += nmerge)
{
FILE *tfp;
pid_t pid;
char *temp = create_temp (&tfp, &pid);
- size_t nt = MIN (ntemps, NMERGE);
+ size_t nt = MIN (ntemps, nmerge);
ntemps -= nt;
- mergefps (&files[in], nt, NMERGE, tfp, temp);
+ mergefps (&files[in], nt, nmerge, tfp, temp);
files[out].name = temp;
files[out].pid = pid;
}
remainder = nfiles - in;
- cheap_slots = NMERGE - out % NMERGE;
+ cheap_slots = nmerge - out % nmerge;
if (cheap_slots < remainder)
{
@@ -3013,6 +3099,10 @@ main (int argc, char **argv)
mergeonly = true;
break;
+ case NMERGE_OPTION:
+ specify_nmerge (oi, c, optarg);
+ break;
+
case 'o':
if (outfile && !STREQ (outfile, optarg))
error (SORT_FAILURE, 0, _("multiple output files specified"));
diff --git a/tests/misc/sort-merge b/tests/misc/sort-merge
index 7ac9f84..c40dc80 100755
--- a/tests/misc/sort-merge
+++ b/tests/misc/sort-merge
@@ -1,7 +1,7 @@
#!/bin/sh
# Test "sort -m".
-# Copyright (C) 2002, 2003, 2005-2007 Free Software Foundation, Inc.
+# Copyright (C) 2002, 2003, 2005-2008 Free Software Foundation, Inc.
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -25,19 +25,58 @@ require 5.003;
use strict;
(my $program_name = $0) =~ s|.*/||;
+my $prog = 'sort';
# Turn off localisation of executable's ouput.
@ENV{qw(LANGUAGE LANG LC_ALL)} = ('C') x 3;
+# three empty files and one that says 'foo'
+my @inputs = (+(map{{IN=> {"empty$_"=> ''}}}1..3), {IN=> {foo=> "foo\n"}});
+
+# don't need to check for existence, since we're running in a temp dir
+my $badtmp = 'does/not/exist';
+
+# 2^64+1
+my $bigint = "18446744073709551617";
+
my @Tests =
(
- ['m1', '-m', {IN=> {empty=> ''}}, {IN=> {f=> "foo\n"}}, {OUT=>"foo\n"}],
+ ['m1', '-m', @inputs, {OUT=>"foo\n"}],
+
+ # check validation of --batch-size option
+ ['nmerge-0', "-m --batch-size=0", @inputs,
+ {ERR=>"$prog: invalid --batch-size argument `0'\n".
+ "$prog: minimum --batch-size argument is `2'\n"}, {EXIT=>2}],
+
+ ['nmerge-1', "-m --batch-size=1", @inputs,
+ {ERR=>"$prog: invalid --batch-size argument `1'\n".
+ "$prog: minimum --batch-size argument is `2'\n"}, {EXIT=>2}],
+
+ ['nmerge-neg', "-m --batch-size=-1", @inputs,
+ {ERR=>"$prog: invalid --batch-size argument `-1'\n"}, {EXIT=>2}],
+
+ ['nmerge-nan', "-m --batch-size=a", @inputs,
+ {ERR=>"$prog: invalid --batch-size argument `a'\n"}, {EXIT=>2}],
+
+ ['nmerge-big', "-m --batch-size=$bigint", @inputs,
+ {ERR_SUBST=>'s/current rlimit is .+\n/current rlimit is/'},
+ {ERR=>"$prog: --batch-size argument `$bigint' too large\n".
+ "$prog: maximum --batch-size argument with current rlimit is"},
+ {EXIT=>2}],
+
+ # This should work since nmerge >= the number of input files
+ ['nmerge-yes', "-m --batch-size=4 -T$badtmp", @inputs, {OUT=>"foo\n"}],
+
+ # this should fail since nmerge < # of input files, so
+ # temp files are needed
+ ['nmerge-no', "-m --batch-size=2 -T$badtmp", @inputs,
+ {ERR_SUBST=>"s|: $badtmp/sort.+||"},
+ {ERR=>"$prog: cannot create temporary file\n"}, {EXIT=>2}],
);
my $save_temps = $ENV{DEBUG};
my $verbose = $ENV{VERBOSE};
-my $prog = 'sort';
my $fail = run_tests ($program_name, $prog, [EMAIL PROTECTED], $save_temps, $verbose);
exit $fail;
EOF
--
1.5.2.5
_______________________________________________
Bug-coreutils mailing list
[email protected]
http://lists.gnu.org/mailman/listinfo/bug-coreutils