Okay, I've added some tests and documentation as per your contribution
guidelines.

I realize it may be a little premature for a "signoff", but I want to
verify that I'm using the tools correctly.

Thanks,

Bo
From 70b1e98b5bbf119e9b14b63780c4d02769cdb8ea Mon Sep 17 00:00:00 2001
From: Bo Borgerson <[EMAIL PROTECTED]>
Date: Mon, 31 Mar 2008 16:58:21 -0400
Subject: [PATCH] sort: added --merge-batch-size=NMERGE option.

* src/sort.c: Replace constant NMERGE with static int nmerge.
Validate and apply nmerge command-line settings. Replace some (now
variable-length) arrays with pointers to xnmalloc'd storage.
* tests/misc/sort-merge: Test new option
* doc/coreutils.texi: Describe new option
* NEWS: Advertise new option

Signed-off-by: Bo Borgerson <[EMAIL PROTECTED]>
---
 NEWS                  |    4 ++
 doc/coreutils.texi    |   16 +++++++++
 src/sort.c            |   85 +++++++++++++++++++++++++++++++++++++++----------
 tests/misc/sort-merge |   28 +++++++++++++++-
 4 files changed, 114 insertions(+), 19 deletions(-)

diff --git a/NEWS b/NEWS
index c05e0ad..92bdea1 100644
--- a/NEWS
+++ b/NEWS
@@ -50,6 +50,10 @@ GNU coreutils NEWS                                    -*- outline -*-
   options --general-numeric-sort/-g, --month-sort/-M, --numeric-sort/-n
   and --random-sort/-R, resp.
 
+  sort accepts a new option --merge-batch-size=NMERGE, where NMERGE
+  represents the maximum number of inputs that will be merged at once.
+  When more than NMERGE inputs are present temporary files are used.
+
 ** Improvements
 
   id and groups work around an AFS-related bug whereby those programs
diff --git a/doc/coreutils.texi b/doc/coreutils.texi
index f161c4d..7ccbb8a 100644
--- a/doc/coreutils.texi
+++ b/doc/coreutils.texi
@@ -3689,6 +3689,22 @@ multiple fields.
 Example:  To sort on the second field, use @option{--key=2,2}
 (@option{-k 2,2}).  See below for more examples.
 
[EMAIL PROTECTED] [EMAIL PROTECTED]
[EMAIL PROTECTED] --merge-batch-size
[EMAIL PROTECTED] number of inputs to merge, nmerge
+Merge at most @var{nmerge} inputs at once.
+
+If more than @var{nmerge} inputs are to be merged, then temporary files
+will be used.
+
+A large value of @var{nmerge} may improve merge performance and decrease
+temporary storage utilization at the expense of increased memory usage
+an I/0.  Conversely a small value of @var{nmerge} may reduce memory
+requirements and I/0 at the expense of temporary storage consumption and
+merge performance.
+
+The value of @var{nmerge} must be at least 2.
+
 @item -o @var{output-file}
 @itemx [EMAIL PROTECTED]
 @opindex -o
diff --git a/src/sort.c b/src/sort.c
index 8b2eec5..3d4f5a6 100644
--- a/src/sort.c
+++ b/src/sort.c
@@ -223,13 +223,13 @@ static struct month monthtab[] =
 };
 
 /* During the merge phase, the number of files to merge at once. */
-#define NMERGE 16
+#define NMERGE_DEFAULT 16
 
 /* Minimum size for a merge or check buffer.  */
 #define MIN_MERGE_BUFFER_SIZE (2 + sizeof (struct line))
 
 /* Minimum sort size; the code might not work with smaller sizes.  */
-#define MIN_SORT_SIZE (NMERGE * MIN_MERGE_BUFFER_SIZE)
+#define MIN_SORT_SIZE (nmerge * MIN_MERGE_BUFFER_SIZE)
 
 /* The number of bytes needed for a merge or check buffer, which can
    function relatively efficiently even if it holds only one line.  If
@@ -281,6 +281,10 @@ static struct keyfield *keylist;
 /* Program used to (de)compress temp files.  Must accept -d.  */
 static char const *compress_program;
 
+/* Maximum number of files to merge in one go.  If more than this
+   number are present, temp files will be used. */
+static int nmerge = NMERGE_DEFAULT;
+
 static void sortlines_temp (struct line *, size_t, struct line *);
 
 /* Report MESSAGE for FILE, then clean up and exit.
@@ -344,6 +348,8 @@ Other options:\n\
                               decompress them with PROG -d\n\
   -k, --key=POS1[,POS2]     start a key at POS1, end it at POS2 (origin 1)\n\
   -m, --merge               merge already sorted files; do not sort\n\
+      --merge-batch-size=NMERGE  merge at most NMERGE inputs at once;\n\
+                                for more use temp files\n\
 "), stdout);
       fputs (_("\
   -o, --output=FILE         write result to FILE instead of standard output\n\
@@ -395,7 +401,8 @@ enum
   CHECK_OPTION = CHAR_MAX + 1,
   COMPRESS_PROGRAM_OPTION,
   RANDOM_SOURCE_OPTION,
-  SORT_OPTION
+  SORT_OPTION,
+  NMERGE_OPTION
 };
 
 static char const short_options[] = "-bcCdfgik:mMno:rRsS:t:T:uy:z";
@@ -419,6 +426,7 @@ static struct option const long_options[] =
   {"output", required_argument, NULL, 'o'},
   {"reverse", no_argument, NULL, 'r'},
   {"stable", no_argument, NULL, 's'},
+  {"merge-batch-size", required_argument, NULL, NMERGE_OPTION},
   {"buffer-size", required_argument, NULL, 'S'},
   {"field-separator", required_argument, NULL, 't'},
   {"temporary-directory", required_argument, NULL, 'T'},
@@ -1045,6 +1053,35 @@ inittables (void)
 #endif
 }
 
+/* Specify how many inputs may be merged at once. */
+static void
+specify_nmerge (int oi, char c, char const *s)
+{
+  uintmax_t n;
+  enum strtol_error e = xstrtoumax (s, NULL, 10, &n, NULL);
+
+  if (e == LONGINT_OK)
+    {
+      nmerge = n;
+      if (nmerge == n)
+	{
+	  if (nmerge >= 2)
+	    {
+	      /* Need to re-check that we meet the minimum
+		 requirement for memory usage with the new,
+		 potentially larger, nmerge */
+	      sort_size = MAX (sort_size, MIN_SORT_SIZE);
+	      return;
+	    }
+	  e = LONGINT_INVALID;
+	}
+      else
+	e = LONGINT_OVERFLOW;
+    }
+
+  xstrtol_fatal (e, oi, c, long_options, s);
+}
+
 /* Specify the amount of main memory to use when sorting.  */
 static void
 specify_sort_size (int oi, char c, char const *s)
@@ -2029,15 +2066,20 @@ static void
 mergefps (struct sortfile *files, size_t ntemps, size_t nfiles,
 	  FILE *ofp, char const *output_file)
 {
-  FILE *fps[NMERGE];		/* Input streams for each file.  */
-  struct buffer buffer[NMERGE];	/* Input buffers for each file. */
+  FILE **fps = xnmalloc(nmerge, sizeof *fps);
+				/* Input streams for each file.  */
+  struct buffer *buffer = xnmalloc(nmerge, sizeof *buffer);
+				/* Input buffers for each file. */
   struct line saved;		/* Saved line storage for unique check. */
   struct line const *savedline = NULL;
 				/* &saved if there is a saved line. */
   size_t savealloc = 0;		/* Size allocated for the saved line. */
-  struct line const *cur[NMERGE]; /* Current line in each line table. */
-  struct line const *base[NMERGE]; /* Base of each line table.  */
-  size_t ord[NMERGE];		/* Table representing a permutation of fps,
+  struct line const **cur = xnmalloc(nmerge, sizeof *cur);
+				/* Current line in each line table. */
+  struct line const **base = xnmalloc(nmerge, sizeof *base);
+				/* Base of each line table.  */
+  size_t *ord = xnmalloc(nmerge, sizeof ord);
+				/* Table representing a permutation of fps,
 				   such that cur[ord[0]] is the smallest line
 				   and will be next output. */
   size_t i;
@@ -2210,6 +2252,11 @@ mergefps (struct sortfile *files, size_t ntemps, size_t nfiles,
     }
 
   xfclose (ofp, output_file);
+  free(fps);
+  free(buffer);
+  free(ord);
+  free(base);
+  free(cur);
 }
 
 /* Merge into T the two sorted arrays of lines LO (with NLO members)
@@ -2397,7 +2444,7 @@ static void
 merge (struct sortfile *files, size_t ntemps, size_t nfiles,
        char const *output_file)
 {
-  while (NMERGE < nfiles)
+  while (nmerge < nfiles)
     {
       /* Number of input files processed so far.  */
       size_t in;
@@ -2405,33 +2452,33 @@ merge (struct sortfile *files, size_t ntemps, size_t nfiles,
       /* Number of output files generated so far.  */
       size_t out;
 
-      /* nfiles % NMERGE; this counts input files that are left over
+      /* nfiles % nmerge; this counts input files that are left over
 	 after all full-sized merges have been done.  */
       size_t remainder;
 
       /* Number of easily-available slots at the next loop iteration.  */
       size_t cheap_slots;
 
-      /* Do as many NMERGE-size merges as possible.  */
-      for (out = in = 0; out < nfiles / NMERGE; out++, in += NMERGE)
+      /* Do as many nmerge-size merges as possible.  */
+      for (out = in = 0; out < nfiles / nmerge; out++, in += nmerge)
 	{
 	  FILE *tfp;
 	  pid_t pid;
 	  char *temp = create_temp (&tfp, &pid);
-	  size_t nt = MIN (ntemps, NMERGE);
+	  size_t nt = MIN (ntemps, nmerge);
 	  ntemps -= nt;
-	  mergefps (&files[in], nt, NMERGE, tfp, temp);
+	  mergefps (&files[in], nt, nmerge, tfp, temp);
 	  files[out].name = temp;
 	  files[out].pid = pid;
 	}
 
       remainder = nfiles - in;
-      cheap_slots = NMERGE - out % NMERGE;
+      cheap_slots = nmerge - out % nmerge;
 
       if (cheap_slots < remainder)
 	{
 	  /* So many files remain that they can't all be put into the last
-	     NMERGE-sized output window.  Do one more merge.  Merge as few
+	     nmerge-sized output window.  Do one more merge.  Merge as few
 	     files as possible, to avoid needless I/O.  */
 	  size_t nshortmerge = remainder - cheap_slots + 1;
 	  FILE *tfp;
@@ -2445,7 +2492,7 @@ merge (struct sortfile *files, size_t ntemps, size_t nfiles,
 	  in += nshortmerge;
 	}
 
-      /* Put the remaining input files into the last NMERGE-sized output
+      /* Put the remaining input files into the last nmerge-sized output
 	 window, so they will be merged in the next pass.  */
       memmove(&files[out], &files[in], (nfiles - in) * sizeof *files);
       ntemps += out;
@@ -3013,6 +3060,10 @@ main (int argc, char **argv)
 	  mergeonly = true;
 	  break;
 
+	case NMERGE_OPTION:
+	  specify_nmerge (oi, c, optarg);
+	  break;
+
 	case 'o':
 	  if (outfile && !STREQ (outfile, optarg))
 	    error (SORT_FAILURE, 0, _("multiple output files specified"));
diff --git a/tests/misc/sort-merge b/tests/misc/sort-merge
index 7ac9f84..544d367 100755
--- a/tests/misc/sort-merge
+++ b/tests/misc/sort-merge
@@ -25,19 +25,43 @@ require 5.003;
 use strict;
 
 (my $program_name = $0) =~ s|.*/||;
+my $prog = 'sort';
+
 
 # Turn off localisation of executable's ouput.
 @ENV{qw(LANGUAGE LANG LC_ALL)} = ('C') x 3;
 
+my $mbs = '--merge-batch-size';
+
+# three empty files and one that says 'foo'
+my @inputs = (+(map{{IN=> {"empty$_"=> ''}}}1..3), {IN=> {foo=> "foo\n"}});
+
+my $badtmp = 'does/not/exist';
+-d $badtmp and die "$badtmp exists";
+
 my @Tests =
     (
-     ['m1', '-m', {IN=> {empty=> ''}}, {IN=> {f=> "foo\n"}}, {OUT=>"foo\n"}],
+     ['m1', '-m', @inputs, {OUT=>"foo\n"}],
+
+     # check bounds on --merge-batch-size
+     ['m2', "-m $mbs=1", @inputs,
+	{ERR=>"$prog: invalid $mbs argument `1'\n"}, {EXIT=>2}],
+     ['m3', "-m $mbs=2147483649", @inputs,
+	{ERR=>"$prog: $mbs argument `2147483649' too large\n"}, {EXIT=>2}],
+
+     # This should work since nmerge >= the number of input files
+     ['m4', "-m $mbs=4 -T$badtmp", @inputs, {OUT=>"foo\n"}],
+
+     # this should fail since nmerge < # of input files, so
+     # temp files are needed
+     ['m5', "-m $mbs=2 -T$badtmp", @inputs,
+	{ERR_SUBST=>"s|: $badtmp/sort.+||"},
+	{ERR=>"$prog: cannot create temporary file\n"}, {EXIT=>2}],
     );
 
 my $save_temps = $ENV{DEBUG};
 my $verbose = $ENV{VERBOSE};
 
-my $prog = 'sort';
 my $fail = run_tests ($program_name, $prog, [EMAIL PROTECTED], $save_temps, $verbose);
 exit $fail;
 EOF
-- 
1.5.2.5

_______________________________________________
Bug-coreutils mailing list
Bug-coreutils@gnu.org
http://lists.gnu.org/mailman/listinfo/bug-coreutils

Reply via email to