uniq(1): don't skip() lines more than once

Scott Cheloha Tue, 14 Dec 2021 09:26:18 -0800

In uniq(1), calling skip() to skip fields and/or characters on each
input line is extremely expensive.  Currently we call it twice per
input line: once for the new line and once more for the most recent
unique line, even though that line has not changed.  One way to reduce
the cost is to skip() each line only once and remember the result.


The performance improvement for this trivial change is enormous.

# -current uniq(1)
$ for i in $(jot 10); do nanotime uniq -s 1 < /usr/share/dict/words > /dev/null; done
        0.059518016 real         0.060 user         0.000 sys
        0.059492033 real         0.050 user         0.000 sys
        0.059647217 real         0.060 user         0.000 sys
        0.059649349 real         0.060 user         0.000 sys
        0.059546094 real         0.050 user         0.010 sys
        0.059637451 real         0.060 user         0.000 sys
        0.059734855 real         0.060 user         0.000 sys
        0.059553137 real         0.060 user         0.000 sys
        0.059252731 real         0.060 user         0.000 sys
        0.059624800 real         0.050 user         0.000 sys
# patched uniq(1)
jetsam$ for i in $(jot 10); do nanotime obj/uniq -s 1 < /usr/share/dict/words > /dev/null; done
        0.051529548 real         0.040 user         0.010 sys
        0.051341628 real         0.050 user         0.000 sys
        0.051586453 real         0.050 user         0.000 sys
        0.051452066 real         0.050 user         0.000 sys
        0.051535854 real         0.050 user         0.010 sys
        0.051293136 real         0.050 user         0.000 sys
        0.051344077 real         0.040 user         0.010 sys
        0.051351220 real         0.050 user         0.010 sys
        0.051422239 real         0.050 user         0.000 sys
        0.051358392 real         0.050 user         0.000 sys

That is about 14-16% faster when skipping a single character on each
input line.  The improvement grows as you skip more characters or
fields, so roughly 15% is the floor, which is already awesome.

ok?

Index: uniq.c
===================================================================
RCS file: /cvs/src/usr.bin/uniq/uniq.c,v
retrieving revision 1.29
diff -u -p -r1.29 uniq.c
--- uniq.c      17 Nov 2021 23:09:38 -0000      1.29
+++ uniq.c      14 Dec 2021 17:23:35 -0000
@@ -57,7 +57,7 @@ __dead void   usage(void);
 int
 main(int argc, char *argv[])
 {
-       char *prevline, *t1, *t2, *thisline;
+       char *p, *prevline, *t, *thisline, *tmp;
        FILE *ifp = NULL, *ofp = NULL;
        size_t prevsize, thissize, tmpsize;
        ssize_t len;
@@ -142,6 +142,10 @@ main(int argc, char *argv[])
        }
        if (prevline[len - 1] == '\n')
                prevline[len - 1] = '\0';
+       if (numfields || numchars)
+               p = skip(prevline);
+       else
+               p = prevline;
        
        thissize = 0;
        thisline = NULL;
@@ -150,20 +154,20 @@ main(int argc, char *argv[])
                        thisline[len - 1] = '\0';
 
                /* If requested get the chosen fields + character offsets. */
-               if (numfields || numchars) {
-                       t1 = skip(thisline);
-                       t2 = skip(prevline);
-               } else {
-                       t1 = thisline;
-                       t2 = prevline;
-               }
+               if (numfields || numchars)
+                       t = skip(thisline);
+               else
+                       t = thisline;
 
                /* If different, print; set previous to new value. */
-               if ((iflag ? strcasecmp : strcmp)(t1, t2)) {
+               if ((iflag ? strcasecmp : strcmp)(p, t)) {
                        show(ofp, prevline);
-                       t1 = prevline;
+                       tmp = prevline;
                        prevline = thisline;
-                       thisline = t1;
+                       thisline = tmp;
+                       tmp = p;
+                       p = t;
+                       t = tmp;
                        tmpsize = prevsize;
                        prevsize = thissize;
                        thissize = tmpsize;

uniq(1): don't skip() lines more than once

Reply via email to