On Sun, Jun 29, 2014 at 06:48:32PM -0400, Jared Yanovich wrote:

> Hi,
> 
> sort(1) does some funky things and isn't hard to break:
> 
>   $ perl -e 'print "\n"x117000,"x\n"' | sort | sort -c
> 
> This patch contains a few changes from NetBSD to correct the behavior 
> regarding
> ordering of appending bins to output in certain circumstances which helps pass
> more of our own regress tests and improves performance (e.g. regress suite
> runtime is <40% with new code compared to old/current code on my box).  The 
> new
> code is also much easier to understand..
> 
> NetBSD logs:
> 
>   msort.c -r 1.9
>     merge(): use array of buffers instead of one big buffer for all records, 
> and
>            enlarge them as necessary to read records from merged files; the 
> buffers
>            are allocated once per program run, so there shouldn't be any
>            performance difference
>     This makes sort(1) pass also regression 40B and should make it
>     fully arbitrary long record capable.
>     XXX the buffer array could probably be freed on end of fmerge() to save 
> memory
> 
>   fsort.c -r 1.37
>     The code that attempted to sort large files by sorting each chunk by the
>     first key byte and writing to a temp file, then sorting the records from
>     each temp file that had the same first key byte (and repeating for upto
>     4 key bytes) was a nice idea, but completely doomed to failure.
>     Eg PR/9308 where a 70MB file has all but one record the same and short 
> keys.
>     Not only does the code not work, it is rather guaranteed to be slow.
>     Instead always use a merge sort for fully sorted chunk of records (each
>     temporary file contains one lot of sorted records).
>     The -H option already did this, so just rip out all the code and variables
>     that can't be used when -H was specified.

Hi Jared, good to see you active again ;-)

This indeed solves some problems, but I have a test file on which it cores.

The testfile is available at http://www.drijf.net/openbsd/test4.gz

        -Otto


> 
> Index: append.c
> ===================================================================
> RCS file: /cvs/src/usr.bin/sort/append.c,v
> retrieving revision 1.10
> diff -u -p -r1.10 append.c
> --- append.c  27 Oct 2009 23:59:43 -0000      1.10
> +++ append.c  29 Jun 2014 22:17:16 -0000
> @@ -148,37 +148,3 @@ append(u_char **keylist, int nelem, int 
>               put(crec, fp);
>       }
>  }
> -
> -/*
> - * output the already sorted eol bin.
> - */
> -void
> -rd_append(int binno, union f_handle infl0, int nfiles, FILE *outfp,
> -    u_char *buffer, u_char *bufend)
> -{
> -     RECHEADER *rec;
> -
> -     rec = (RECHEADER *) buffer;
> -     if (!getnext(binno, infl0, nfiles, (RECHEADER *) buffer, bufend, 0)) {
> -             putline(rec, outfp);
> -             while (getnext(binno, infl0, nfiles, (RECHEADER *) buffer,
> -                     bufend, 0) == 0) {
> -                     if (!UNIQUE)
> -                             putline(rec, outfp);
> -             }
> -     }
> -}
> -
> -/*
> - * append plain text--used after sorting the biggest bin.
> - */
> -void
> -concat(FILE *a, FILE *b)
> -{
> -        int nread;
> -        char buffer[4096];
> -
> -     rewind(b);
> -        while ((nread = fread(buffer, 1, 4096, b)) > 0)
> -                EWRITE(buffer, 1, nread, a);
> -}
> Index: extern.h
> ===================================================================
> RCS file: /cvs/src/usr.bin/sort/extern.h,v
> retrieving revision 1.8
> diff -u -p -r1.8 extern.h
> --- extern.h  22 Dec 2009 19:47:02 -0000      1.8
> +++ extern.h  29 Jun 2014 22:17:16 -0000
> @@ -36,7 +36,6 @@
>  
>  void  append(u_char **, int, int, FILE *, void (*)(RECHEADER *, FILE *),
>           struct field *);
> -void  concat(FILE *, FILE *);
>  length_t enterkey(RECHEADER *, DBT *, int, struct field *);
>  void  fixit(int *, char **);
>  void  fldreset(struct field *);
> @@ -44,11 +43,9 @@ FILE       *ftmp(void);
>  void  fmerge(int, union f_handle, int,
>           int (*)(int, union f_handle, int, RECHEADER *, u_char *, struct 
> field *),
>           FILE *, void (*)(RECHEADER *, FILE *), struct field *);
> -void  fsort(int, int, union f_handle, int, FILE *, struct field *);
> +void  fsort(union f_handle, int, FILE *, struct field *);
>  int   geteasy(int, union f_handle,
>           int, RECHEADER *, u_char *, struct field *);
> -int   getnext(int, union f_handle,
> -         int, RECHEADER *, u_char *, struct field *);
>  int   makekey(int, union f_handle,
>           int, RECHEADER *, u_char *, struct field *);
>  int   makeline(int, union f_handle,
> @@ -57,7 +54,6 @@ void         merge(int, int,
>           int (*)(int, union f_handle, int, RECHEADER *, u_char *, struct 
> field *),
>           FILE *, void (*)(RECHEADER *, FILE *), struct field *);
>  void  num_init(void);
> -void  onepass(u_char **, int, long, long *, u_char *, FILE *);
>  int   optval(int, int);
>  void  order(union f_handle,
>           int (*)(int, union f_handle, int, RECHEADER *, u_char *, struct 
> field *),
> @@ -65,6 +61,5 @@ void         order(union f_handle,
>           int c_warn);
>  void  putline(RECHEADER *, FILE *);
>  void  putrec(RECHEADER *, FILE *);
> -void  rd_append(int, union f_handle, int, FILE *, u_char *, u_char *);
>  int   setfield(char *, struct field *, int);
>  void  settables(int);
> Index: files.c
> ===================================================================
> RCS file: /cvs/src/usr.bin/sort/files.c,v
> retrieving revision 1.13
> diff -u -p -r1.13 files.c
> --- files.c   27 Oct 2009 23:59:43 -0000      1.13
> +++ files.c   29 Jun 2014 22:17:16 -0000
> @@ -40,73 +40,6 @@
>  static int   seq(FILE *, DBT *, DBT *);
>  
>  /*
> - * this is the subroutine for file management for fsort().
> - * It keeps the buffers for all temporary files.
> - */
> -int
> -getnext(int binno, union f_handle infl0, int nfiles, RECHEADER *pos, u_char 
> *end,
> -    struct field *dummy)
> -{
> -     int i;
> -     u_char *hp;
> -     static size_t nleft = 0;
> -     static int cnt = 0, flag = -1;
> -     static u_char maxb = 0;
> -     static FILE *fp;
> -
> -     if (nleft == 0) {
> -             if (binno < 0)  /* reset files. */ {
> -                     for (i = 0; i < nfiles; i++) {
> -                             rewind(fstack[infl0.top + i].fp);
> -                             fstack[infl0.top + i].max_o = 0;
> -                     }
> -                     flag = -1;
> -                     nleft = cnt = 0;
> -                     return (-1);
> -             }
> -             maxb = fstack[infl0.top].maxb;
> -             for (; nleft == 0; cnt++) {
> -                     if (cnt >= nfiles) {
> -                             cnt = 0;
> -                             return (EOF);
> -                     }
> -                     fp = fstack[infl0.top + cnt].fp;
> -                     fread(&nleft, sizeof(nleft), 1, fp);
> -                     if (binno < maxb)
> -                             fstack[infl0.top+cnt].max_o
> -                                     += sizeof(nleft) + nleft;
> -                     else if (binno == maxb) {
> -                             if (binno != fstack[infl0.top].lastb) {
> -                                     fseek(fp, fstack[infl0.top+
> -                                             cnt].max_o, SEEK_SET);
> -                                     fread(&nleft, sizeof(nleft), 1, fp);
> -                             }
> -                             if (nleft == 0)
> -                                     fclose(fp);
> -                     } else if (binno == maxb + 1) {         /* skip a bin */
> -                             fseek(fp, nleft, SEEK_CUR);
> -                             fread(&nleft, sizeof(nleft), 1, fp);
> -                             flag = cnt;
> -                     }
> -             }
> -     }
> -     if ((u_char *) pos > end - sizeof(TRECHEADER))
> -             return (BUFFEND);
> -     fread(pos, sizeof(TRECHEADER), 1, fp);
> -     if (end - pos->data < pos->length) {
> -             hp = ((u_char *)pos) + sizeof(TRECHEADER);
> -             for (i = sizeof(TRECHEADER); i ;  i--)
> -                     ungetc(*--hp, fp);
> -             return (BUFFEND);
> -     }
> -     fread(pos->data, pos->length, 1, fp);
> -     nleft -= pos->length + sizeof(TRECHEADER);
> -     if (nleft == 0 && binno == fstack[infl0.top].maxb)
> -             fclose(fp);
> -     return (0);
> -}
> -
> -/*
>   * this is called when there is no special key. It's only called
>   * in the first fsort pass.
>   */
> Index: fsort.c
> ===================================================================
> RCS file: /cvs/src/usr.bin/sort/fsort.c,v
> retrieving revision 1.21
> diff -u -p -r1.21 fsort.c
> --- fsort.c   27 Oct 2009 23:59:43 -0000      1.21
> +++ fsort.c   29 Jun 2014 22:17:16 -0000
> @@ -33,11 +33,11 @@
>   */
>  
>  /*
> - * Read in the next bin.  If it fits in one segment sort it;
> - * otherwise refine it by segment deeper by one character,
> - * and try again on smaller bins.  Sort the final bin at this level
> - * of recursion to keep the head of fstack at 0.
> - * After PANIC passes, abort to merge sort.
> + * Read in a block of records (until 'enough').
> + * sort, write to temp file.
> + * Merge sort temp files into output file
> + * Small files miss out the temp file stage.
> + * Large files might get multiple merges.
>   */
>  #include "sort.h"
>  #include "fsort.h"
> @@ -48,28 +48,26 @@
>  u_char *linebuf;
>  size_t linebuf_size = MAXLLEN;
>  struct tempfile fstack[MAXFCT];
> -#define FSORTMAX 4
> -int PANIC = FSORTMAX;
> +
> +#define CHECKFSTACK(n)                                       \
> +     if (n >= MAXFCT)                                \
> +             errx(2, "fstack: too many temporary files; sort in pieces")
>  
>  void
> -fsort(int binno, int depth, union f_handle infiles, int nfiles, FILE *outfp,
> +fsort(union f_handle infiles, int nfiles, FILE *outfp,
>      struct field *ftbl)
>  {
> -     u_char *weights, **keypos, *bufend, *tmpbuf;
> +     u_char *weights, **keypos, *bufend;
>       static u_char *buffer, **keylist;
>       static size_t bufsize;
> -     int ntfiles, mfct = 0, total, i, maxb, lastb, panic = 0;
> +     int ntfiles, mfct = 0;
>       int c, nelem;
> -     long sizes[NBINS+1];
> -     union f_handle tfiles, mstart = {MAXFCT-16};
> +     union f_handle tfiles, mstart = { MAXFCT - MERGE_FNUM };
>       int (*get)(int, union f_handle, int, RECHEADER *,
>               u_char *, struct field *);
>       RECHEADER *crec;
>       struct field tfield[2];
> -     FILE *prevfp, *tailfp[FSORTMAX+1];
>  
> -     memset(tailfp, 0, sizeof(tailfp));
> -     prevfp = outfp;
>       memset(tfield, 0, sizeof(tfield));
>       if (ftbl[0].flags & R)
>               tfield[0].weights = Rascii;
> @@ -84,205 +82,114 @@ fsort(int binno, int depth, union f_hand
>                       err(2, NULL);
>       }
>       bufend = buffer + bufsize - 1;
> -     if (binno >= 0) {
> -             tfiles.top = infiles.top + nfiles;
> -             get = getnext;
> -     } else {
> -             tfiles.top = 0;
> -             if (SINGL_FLD)
> -                     get = makeline;
> -             else
> -                     get = makekey;
> -     }                               
> -     for (;;) {
> -             memset(sizes, 0, sizeof(sizes));
> -             c = ntfiles = 0;
> -             if (binno == weights[REC_D] &&
> -                 !(SINGL_FLD && ftbl[0].flags & F)) {        /* pop */
> -                     rd_append(weights[REC_D],
> -                         infiles, nfiles, prevfp, buffer, bufend);
> -                     break;
> -             } else if (binno == weights[REC_D]) {
> -                     depth = 0;              /* start over on flat weights */
> -                     ftbl = tfield;
> -                     weights = ftbl[0].weights;
> +
> +     if (SINGL_FLD)
> +             get = makeline;
> +     else
> +             get = makekey;
> +
> +     c = nelem = ntfiles = 0;
> +     keypos = keylist;
> +     crec = (RECHEADER *) buffer;
> +     while (c != EOF) {
> +             keypos = keylist;
> +             nelem = 0;
> +             crec = (RECHEADER *) buffer;
> +
> + do_read:
> +             while ((c = get(-1, infiles, nfiles, crec, bufend,
> +                 ftbl)) == 0) {
> +                     *keypos++ = crec->data;
> +                     if (++nelem == MAXNUM) {
> +                             c = BUFFEND;
> +                             break;
> +                     }
> +                     crec = (RECHEADER *)((char *) crec +
> +                         SALIGN(crec->length) + sizeof(TRECHEADER));
>               }
> -             while (c != EOF) {
> -                     keypos = keylist;
> -                     nelem = 0;
> -                     crec = (RECHEADER *) buffer;
> -                     while ((c = get(binno, infiles, nfiles, crec, bufend,
> -                         ftbl)) == 0) {
> -                             *keypos++ = crec->data + depth;
> -                             if (++nelem == MAXNUM) {
> -                                     c = BUFFEND;
> -                                     break;
> -                             }
> -                             crec = (RECHEADER *)((char *)crec +
> -                                 SALIGN(crec->length) + sizeof(TRECHEADER));
> +             if (c == BUFFEND && nelem < MAXNUM &&
> +                 bufsize < MAXBUFSIZE) {
> +                     u_char **keyp;
> +                     u_char *oldb = buffer;
> +                     u_char *nbuffer;
> +
> +                     /* buffer was too small for data, allocate
> +                      * bigger buffer */
> +                     nbuffer = realloc(buffer, bufsize * 2);
> +                     if (!nbuffer) {
> +                             err(2, "failed to realloc buffer to %lu bytes",
> +                                 (unsigned long) bufsize * 2);
>                       }
> +                     buffer = nbuffer;
> +                     bufsize *= 2;
> +                     bufend = buffer + bufsize;
> +
> +                     /* patch up keylist[] */
> +                     for (keyp = &keypos[-1]; keyp >= keylist; keyp--)
> +                             *keyp = buffer + (*keyp - oldb);
> +
> +                     crec = (RECHEADER *) (buffer +
> +                         ((u_char *)crec - oldb));
> +                     goto do_read;
> +             }
> +             if (c != BUFFEND && !ntfiles && !mfct) {
> +                     /* do not push */
> +                     continue;
> +             }
> +
> +             /* push */
> +             fstack[mstart.top + mfct].fp = ftmp();
> +             if (radix_sort((const u_char **)keylist, nelem, weights,
> +                 REC_D))
> +                     err(2, NULL);
> +             append(keylist, nelem, 0, fstack[mstart.top + mfct].fp,
> +                 putrec, ftbl);
> +             mfct++;
> +
> +             /* reduce number of open files */
> +             if (mfct == MERGE_FNUM || (c == EOF && ntfiles)) {
>                       /*
> -                      * buffer was too small for data, allocate
> -                      * a bigger buffer.
> +                      * Only copy extra incomplete crec
> +                      * data if there are any.
>                        */
> -                     if (c == BUFFEND && nelem == 0) {
> -                             bufsize *= 2;
> -                             tmpbuf = realloc(buffer, bufsize);
> -                             if (!tmpbuf)
> -                                     err(2, "failed to realloc buffer");
> -                             crec = (RECHEADER *)
> -                                 (tmpbuf + ((u_char *)crec - buffer));
> -                             buffer = tmpbuf;
> -                             bufend = buffer + bufsize - 1;
> -                             continue;
> +                     int nodata = (bufend >= (u_char *)crec &&
> +                         bufend <= crec->data);
> +                     size_t sz = 0;
> +                     u_char *tmpbuf = NULL;
> +
> +                     if (!nodata) {
> +                             sz = bufend - crec->data;
> +                             tmpbuf = malloc(sz);
> +                             memmove(tmpbuf, crec->data, sz);
>                       }
> -                     if (c == BUFFEND || ntfiles || mfct) {  /* push */
> -                             if (panic >= PANIC) {
> -                                     fstack[MAXFCT-16+mfct].fp = ftmp();
> -                                     if (radixsort((const u_char **)keylist,
> -                                         nelem, weights, REC_D))
> -                                             err(2, NULL);
> -                                     append(keylist, nelem, depth, fstack[
> -                                      MAXFCT-16+mfct].fp, putrec, ftbl);
> -                                     mfct++;
> -                                     /* reduce number of open files */
> -                                     if (mfct == 16 ||(c == EOF && ntfiles)) 
> {
> -                                             fstack[tfiles.top + ntfiles].fp
> -                                                 = ftmp();
> -                                             fmerge(0, mstart, mfct, geteasy,
> -                                               fstack[tfiles.top+ntfiles].fp,
> -                                               putrec, ftbl);
> -                                             ntfiles++;
> -                                             mfct = 0;
> -                                     }
> -                             } else {
> -                                     fstack[tfiles.top + ntfiles].fp= ftmp();
> -                                     onepass(keylist, depth, nelem, sizes,
> -                                     weights, fstack[tfiles.top+ntfiles].fp);
> -                                     ntfiles++;
> -                             }
> +                     CHECKFSTACK(ntfiles);
> +                     fstack[ntfiles].fp = ftmp();
> +                     fmerge(0, mstart, mfct, geteasy,
> +                         fstack[ntfiles].fp, putrec, ftbl);
> +                     ntfiles++;
> +                     mfct = 0;
> +
> +                     if (!nodata) {
> +                             memmove(crec->data, tmpbuf, sz);
> +                             free(tmpbuf);
>                       }
>               }
> -             get = getnext;
> -             if (!ntfiles && !mfct) {        /* everything in memory--pop */
> -                     if (nelem > 1) {
> -                             if (STABLE) {
> -                                     i = sradixsort((const u_char **)keylist,
> -                                         nelem, weights, REC_D);
> -                             } else {
> -                                     i = radixsort((const u_char **)keylist,
> -                                         nelem, weights, REC_D);
> -                             }
> -                             if (i)
> -                                     err(2, NULL);
> -                     }
> -                     append(keylist, nelem, depth, outfp, putline, ftbl);
> -                     break;                                  /* pop */
> -             }
> -             if (panic >= PANIC) {
> -                     if (!ntfiles)
> -                             fmerge(0, mstart, mfct, geteasy,
> -                                 outfp, putline, ftbl);
> -                     else
> -                             fmerge(0, tfiles, ntfiles, geteasy,
> -                                 outfp, putline, ftbl);
> -                     break;
> -             }
> -             total = maxb = lastb = 0;       /* find if one bin dominates */
> -             for (i = 0; i < NBINS; i++)
> -               if (sizes[i]) {
> -                     if (sizes[i] > sizes[maxb])
> -                             maxb = i;
> -                     lastb = i;
> -                     total += sizes[i];
> -             }
> -             if (sizes[maxb] < max((total / 2) , BUFSIZE))
> -                     maxb = lastb;   /* otherwise pop after last bin */
> -             fstack[tfiles.top].lastb = lastb;
> -             fstack[tfiles.top].maxb = maxb;
> -
> -                     /* start refining next level. */
> -             get(-1, tfiles, ntfiles, crec, bufend, 0);      /* rewind */
> -             for (i = 0; i < maxb; i++) {
> -                     if (!sizes[i])  /* bin empty; step ahead file offset */
> -                             get(i, tfiles, ntfiles, crec, bufend, 0);
> -                     else
> -                             fsort(i, depth+1, tfiles, ntfiles, outfp, ftbl);
> -             }
> -             if (lastb != maxb) {
> -                     if (prevfp != outfp)
> -                             tailfp[panic] = prevfp;
> -                     prevfp = ftmp();
> -                     for (i = maxb+1; i <= lastb; i++)
> -                             if (!sizes[i])
> -                                     get(i, tfiles, ntfiles, crec, bufend,0);
> -                             else
> -                                     fsort(i, depth+1, tfiles, ntfiles,
> -                                         prevfp, ftbl);
> -             }
> -
> -             /* sort biggest (or last) bin at this level */
> -             depth++;
> -             panic++;
> -             binno = maxb;
> -             infiles.top = tfiles.top;       /* getnext will free tfiles, */
> -             nfiles = ntfiles;               /* so overwrite them */
> -     }
> -     if (prevfp != outfp) {
> -             concat(outfp, prevfp);
> -             fclose(prevfp);
>       }
> -     for (i = panic; i >= 0; --i)
> -             if (tailfp[i]) {
> -                     concat(outfp, tailfp[i]);
> -                     fclose(tailfp[i]);
> -             }
> -}
> -
> -/*
> - * This is one pass of radix exchange, dumping the bins to disk.
> - */
> -#define swap(a, b, t) t = a, a = b, b = t
> -void
> -onepass(u_char **a, int depth, long n, long sizes[], u_char *tr, FILE *fp)
> -{
> -     size_t tsizes[NBINS+1];
> -     u_char **bin[257], **top[256], ***bp, ***bpmax, ***tp;
> -     static int histo[256];
> -     int *hp;
> -     int c;
> -     u_char **an, *t, **aj;
> -     u_char **ak, *r;
> -
> -     memset(tsizes, 0, sizeof(tsizes));
> -     depth += sizeof(TRECHEADER);
> -     an = &a[n];
> -     for (ak = a; ak < an; ak++) {
> -             histo[c = tr[**ak]]++;
> -             tsizes[c] += ((RECHEADER *) (*ak -= depth))->length;
> +     if (!ntfiles && !mfct) {        /* everything in memory--pop */
> +             if (nelem > 1 && radix_sort((const u_char **)keylist,
> +                 nelem, weights, REC_D))
> +                     err(2, NULL);
> +             if (nelem > 0)
> +                     append(keylist, nelem, 0, outfp, putline, ftbl);
>       }
> +     if (!ntfiles)
> +             fmerge(0, mstart, mfct, geteasy, outfp, putline, ftbl);
> +     else
> +             fmerge(0, tfiles, ntfiles, geteasy, outfp, putline,
> +                 ftbl);
>  
> -     bin[0] = a;
> -     bpmax = bin + 256;
> -     tp = top, hp = histo;
> -     for (bp = bin; bp < bpmax; bp++) {
> -             *tp++ = *(bp + 1) = *bp + (c = *hp);
> -             *hp++ = 0;
> -             if (c <= 1)
> -                     continue;
> -     }
> -     for (aj = a; aj < an; *aj = r, aj = bin[c + 1]) 
> -             for (r = *aj; aj < (ak = --top[c = tr[r[depth]]]); )
> -                     swap(*ak, r, t);
> -
> -     for (ak = a, c = 0; c < 256; c++) {
> -             an = bin[c + 1];
> -             n = an - ak;
> -             tsizes[c] += n * sizeof(TRECHEADER);
> -             /* tell getnext how many elements in this bin, this segment. */
> -             EWRITE(&tsizes[c], sizeof(size_t), 1, fp);
> -             sizes[c] += tsizes[c];
> -             for (; ak < an; ++ak)
> -                     putrec((RECHEADER *) *ak, fp);
> -     }
> +     free(keylist);
> +     keylist = NULL;
> +     free(buffer);
> +     buffer = NULL;
>  }
> Index: fsort.h
> ===================================================================
> RCS file: /cvs/src/usr.bin/sort/fsort.h,v
> retrieving revision 1.10
> diff -u -p -r1.10 fsort.h
> --- fsort.h   13 Mar 2007 17:33:58 -0000      1.10
> +++ fsort.h   29 Jun 2014 22:17:16 -0000
> @@ -38,9 +38,17 @@
>  #define BUFSIZE (1 << POW)
>  #define MAXNUM (BUFSIZE/10)  /* lowish guess at average record size */
>  #define BUFFEND (EOF-2)
> -#define BUFFSMALL (EOF-3)    /* buffer is too small to hold line */
>  #define MAXFCT 1000
>  #define MAXLLEN ((1 << min(POW-4, 16)) - 14)
> +#define MERGE_FNUM   16 
> +
> +/*
> + * Default (initial) and maximum size of record buffer for fsort().
> + * Note that no more than MAXNUM records are stored in the buffer,
> + * even if the buffer is not full yet.
> + */
> +#define DEFBUFSIZE      (1 << 20)       /* 1MB */
> +#define MAXBUFSIZE      (8 << 20)       /* 10 MB */
>  
>  extern u_char *linebuf;
>  extern size_t linebuf_size;
> Index: msort.c
> ===================================================================
> RCS file: /cvs/src/usr.bin/sort/msort.c,v
> retrieving revision 1.24
> diff -u -p -r1.24 msort.c
> --- msort.c   13 Nov 2013 15:07:27 -0000      1.24
> +++ msort.c   29 Jun 2014 22:17:16 -0000
> @@ -65,6 +65,7 @@ fmerge(int binno, union f_handle files, 
>       int i, j, last;
>       void (*put)(RECHEADER *, FILE *);
>       struct tempfile *l_fstack;
> +     size_t tbufsiz;
>  
>       wts = ftbl->weights;
>       if (!UNIQUE && SINGL_FLD && (ftbl->flags & F))
> @@ -72,12 +73,12 @@ fmerge(int binno, union f_handle files, 
>       if (cfilebuf == NULL) {
>               cfilebuf = malloc(MAXLLEN + sizeof(MFILE));
>               if (cfilebuf == NULL)
> -                     errx(2, "cannot allocate memory");
> +                     errx(2, NULL);
>       }
>  
> -     i = min(16, nfiles) * LALIGN(MAXLLEN + sizeof(MFILE));
> -     if (i > bufsize) {
> -             bufsize = i;
> +     tbufsiz = min(16, nfiles) * LALIGN(MAXLLEN + sizeof(MFILE));
> +     if (tbufsiz > bufsize) {
> +             bufsize = tbufsiz;
>               if ((buffer = realloc(buffer, bufsize)) == NULL)
>                       err(2, NULL);
>       }
> @@ -127,22 +128,54 @@ merge(int infl0, int nfiles,
>      int (*get)(int, union f_handle, int, RECHEADER *, u_char *, struct field 
> *),
>      FILE *outfp, void (*put)(RECHEADER *, FILE *), struct field *ftbl)
>  {
> -     int c, i, j;
> +     int c, i, j, nf = nfiles;
>       union f_handle dummy = {0};
> -     struct mfile *flist[16], *cfile;
> +     struct mfile *flistb[MERGE_FNUM], **flist = flistb, *cfile;
> +     size_t availsz = bufsize;
> +     static void *bufs[MERGE_FNUM+1];
> +     static size_t bufs_sz[MERGE_FNUM+1];
> +
> +     /*
> +      * We need nfiles + 1 buffers. One is 'buffer', the
> +      * rest needs to be allocated.
> +      */
> +     bufs[0] = buffer;
> +     bufs_sz[0] = bufsize;
> +     for(i=1; i < nfiles+1; i++) {
> +             if (bufs[i])
> +                     continue;
> +
> +             bufs[i] = malloc(MAXLLEN);
> +             if (!bufs[i])
> +                     err(2, NULL);
> +             bufs_sz[i] = MAXLLEN;
> +     }
>  
>       for (i = j = 0; i < nfiles; i++) {
> -             cfile = (MFILE *) (buffer +
> -                 i * LALIGN(MAXLLEN + sizeof(MFILE)));
> -             cfile->flno = j + infl0;
> -             cfile->end = cfile->rec->data + MAXLLEN;
> +             cfile = (struct mfile *) bufs[j];
> +             cfile->flno = infl0 + j;
> +             cfile->end = (u_char *) bufs[j] + bufs_sz[j];
>               for (c = 1; c == 1;) {
> -                     if (EOF == (c = get(j+infl0, dummy, nfiles,
> +                     if (EOF == (c = get(cfile->flno, dummy, nfiles,
>                          cfile->rec, cfile->end, ftbl))) {
>                               i--;
>                               nfiles--;
>                               break;
>                       }
> +
> +                     if (c == BUFFEND) {
> +                             cfile = realloc(bufs[j], bufs_sz[j] *= 2);
> +                             bufs[j] = (void *)cfile;
> +
> +                             if (!cfile)
> +                                     err(2, NULL);
> +
> +                             cfile->end = (u_char *)cfile + bufs_sz[j];
> +
> +                             c = 1;
> +                             continue;
> +                     }
> +
>                       if (i)
>                               c = insert(flist, &cfile, i, !DELETE);
>                       else
> @@ -151,24 +184,49 @@ merge(int infl0, int nfiles,
>               j++;
>       }
>       if (nfiles > 0) {
> -             cfile = cfilebuf;
> +             cfile = (struct mfile *) bufs[nf];
>               cfile->flno = flist[0]->flno;
> -             cfile->end = cfile->rec->data + MAXLLEN;
> +             cfile->end = (u_char *) cfile + bufs_sz[nf];
>               while (nfiles) {
>                       for (c = 1; c == 1;) {
>                               if (EOF == (c = get(cfile->flno, dummy, nfiles,
>                                  cfile->rec, cfile->end, ftbl))) {
>                                       put(flist[0]->rec, outfp);
> -                                     memmove(flist, flist + 1,
> -                                         sizeof(MFILE *) * (--nfiles));
> -                                     cfile->flno = flist[0]->flno;
> +                                     if (--nfiles > 0) {
> +                                             flist++;
> +                                             cfile->flno = flist[0]->flno;
> +                                     }
>                                       break;
>                               }
> +                             if (c == BUFFEND) {
> +                                     char *oldbuf = (char *)cfile;
> +                                     availsz = (char *)cfile->end - oldbuf;
> +                                     availsz *= 2;
> +                                     cfile = realloc(oldbuf, availsz);
> +                                     if (!cfile)
> +                                             err(2, NULL);
> +
> +                                     for (i = 0; i < nf + 1; i++) {
> +                                             if (bufs[i] == oldbuf) {
> +                                                     bufs[i] = (char *)cfile;
> +                                                     bufs_sz[i] = availsz;
> +                                                     break;
> +                                             }
> +                                     }
> +
> +                                     cfile->end = (u_char *)cfile + availsz;
> +                                     c = 1;
> +                                     continue;
> +                             }
>                               if (!(c = insert(flist, &cfile, nfiles, 
> DELETE)))
>                                       put(cfile->rec, outfp);
>                       }
> -             }       
> -     }       
> +             }
> +             if (bufs_sz[0] > bufsize) {
> +                     buffer = bufs[0];
> +                     bufsize = bufs_sz[0];
> +             }
> +     }
>  }
>  
>  /*
> Index: sort.c
> ===================================================================
> RCS file: /cvs/src/usr.bin/sort/sort.c,v
> retrieving revision 1.41
> diff -u -p -r1.41 sort.c
> --- sort.c    13 Nov 2013 15:07:27 -0000      1.41
> +++ sort.c    29 Jun 2014 22:17:16 -0000
> @@ -44,13 +44,14 @@
>  
>  #include <sys/types.h>
>  #include <sys/stat.h>
> +
> +#include <err.h>
>  #include <locale.h>
>  #include <paths.h>
>  #include <signal.h>
>  #include <stdlib.h>
>  #include <string.h>
>  #include <unistd.h>
> -#include <err.h>
>  
>  int REC_D = '\n';
>  u_char d_mask[NBINS];                /* flags for rec_d, field_d, <blank> */
> @@ -71,6 +72,8 @@ struct coldesc *clist;
>  int ncols = 0;
>  int ND = 10;                 /* limit on number of -k options. */
>  
> +int (*radix_sort)(const u_char **, int, const u_char *, u_int) = radixsort;
> +
>  char *devstdin = _PATH_STDIN;
>  char *tmpdir = _PATH_VARTMP;
>  char toutpath[PATH_MAX];
> @@ -172,7 +175,6 @@ main(int argc, char *argv[])
>                       mflag = 1;
>                       break;
>               case 'H':
> -                     PANIC = 0;
>                       break;
>               case 'y':
>                       /* accept -y for backwards compat. */
> @@ -186,6 +188,7 @@ main(int argc, char *argv[])
>                       break;
>               case 's':
>                       STABLE = 1;
> +                     radix_sort = sradixsort;
>                       break;
>               case '?':
>               default:
> @@ -292,8 +295,8 @@ main(int argc, char *argv[])
>               act.sa_handler = onsig;
>               for (i = 0; sigtable[i]; ++i)   /* always unlink toutpath */
>                       if (sigaction(sigtable[i], NULL, &oact) < 0 ||
> -                         oact.sa_handler != SIG_IGN &&
> -                         sigaction(sigtable[i], &act, NULL) < 0)
> +                         (oact.sa_handler != SIG_IGN &&
> +                         sigaction(sigtable[i], &act, NULL) < 0))
>                               err(2, "sigaction");
>       } else
>               outfile = outpath;
> @@ -302,7 +305,7 @@ main(int argc, char *argv[])
>       if (mflag)
>               fmerge(-1, filelist, argc-optind, get, outfp, putline, fldtab);
>       else
> -             fsort(-1, 0, filelist, argc-optind, outfp, fldtab);
> +             fsort(filelist, argc-optind, outfp, fldtab);
>       if (outfile != outpath) {
>               if (access(outfile, 0))
>                       err(2, "%s", outfile);
> Index: sort.h
> ===================================================================
> RCS file: /cvs/src/usr.bin/sort/sort.h,v
> retrieving revision 1.7
> diff -u -p -r1.7 sort.h
> --- sort.h    21 Aug 2007 20:29:25 -0000      1.7
> +++ sort.h    29 Jun 2014 22:17:16 -0000
> @@ -129,13 +129,14 @@ union f_handle {
>       int top;
>       char **names;
>  };
> -extern int PANIC;    /* maximum depth of fsort before fmerge is called */
> +
>  extern u_char ascii[NBINS], Rascii[NBINS], Ftable[NBINS], RFtable[NBINS];
>  extern u_char alltable[NBINS], dtable[NBINS], itable[NBINS];
>  extern u_char d_mask[NBINS];
>  extern int SINGL_FLD, SEP_FLAG, UNIQUE, STABLE;
>  extern int REC_D;
>  extern char *tmpdir;
> +extern int (*radix_sort)(const u_char **, int, const u_char *, u_int);
>  extern int ND;               /* limit on number of -k options. */
>  
>  #include "extern.h"


Reply via email to