commit 35f08239c07c43ff53587d8846d50a0d70d4cfea
Author: FRIGN <[email protected]>
Date:   Tue Feb 17 17:04:36 2015 +0100

    Add UTF-8-support to strings(1), add t-flag and refactor code
    
    Previously, the string-length was limited to BUFSIZ, which is an
    obvious deficiency.
    Now the buffer only needs to hold as many characters as the
    minimal string length specified by the user.
    I added UTF-8-support, because that's how POSIX wants it and there
    are cases where you need this. It doesn't add ELF-barf compared to
    the previous implementation.
    The t-flag is also pretty important for POSIX-compliance, so I added
    it.
    The only trouble previously was the a-flag, but given that POSIX
    leaves undefined what the a-flag actually does, we set it as default
    and don't care about parsing ELF-headers, which has already
    turned out to be a security issue in GNU binutils[0].
    
    [0]: http://lcamtuf.blogspot.ro/2014/10/psa-dont-run-strings-on-untrusted-files.html

diff --git a/README b/README
index 49f681f..6b9e854 100644
--- a/README
+++ b/README
@@ -67,7 +67,7 @@ The following tools are implemented ('*' == finished, '#' == UTF-8 support,
    sort            no                           -m, -o, -d, -f, -i
 =* split           yes                          none
 =* sponge          non-posix                    none
-   strings         no                           -t
+#* strings         yes                          none
 =* sync            non-posix                    none
 =* tail            yes                          none
 =* tar             non-posix                    none
diff --git a/strings.1 b/strings.1
index b5c65c2..aa4b81f 100644
--- a/strings.1
+++ b/strings.1
@@ -1,32 +1,52 @@
-.Dd November 23, 2014
+.Dd February 17, 2015
 .Dt STRINGS 1
 .Os sbase
 .Sh NAME
 .Nm strings
-.Nd print the strings of printable characters in files
+.Nd print strings of printable characters in files
 .Sh SYNOPSIS
 .Nm
 .Op Fl a
-.Op Fl n Ar len
+.Op Fl n Ar num
+.Op Fl t Ar format
 .Op Ar file ...
 .Sh DESCRIPTION
 .Nm
-prints the printable character sequences that are at least 4 characters
-long. If no
-.Ar files
-are given,
+writes sequences of at least 4 printable characters in each
+.Ar file
+to stdout.
+If no
+.Ar file
+is given,
 .Nm
 reads from stdin.
 .Sh OPTIONS
 .Bl -tag -width Ds
 .It Fl a
-Scan files in their entirety. This is the default.
-.It Fl n Ar len
-Only print sequences that are at least
-.Ar len
-characters.  The default is 4 characters.
+Scan each
+.Ar file
+entirely. This is the default.
+.It Fl n Ar num
+Print sequences of at least
+.Ar num
+characters.  The default is 4.
+.It Fl t Ar format
+Prepend each string with its byte offset, with
+.Ar format
+being one of
+.Sy d , o , x
+for decimal, octal or hexadecimal numbers.
 .El
 .Sh STANDARDS
+The
 .Nm
-mirrors the semantics of Plan9
-.Xr strings 1 .
+utility is compliant with the
+.St -p1003.1-2008
+specification.
+.Pp
+The
+.Op Fl t
+output format has been changed from "%F %s" to "%8lF: %s", with
+.Sy F
+being one of
+.Sy d , o , x .
diff --git a/strings.c b/strings.c
index 27cecc5..9bbf1e2 100644
--- a/strings.c
+++ b/strings.c
@@ -1,50 +1,75 @@
 /* See LICENSE file for copyright and license details. */
-#include <ctype.h>
 #include <limits.h>
 #include <stdio.h>
 #include <stdlib.h>
 
+#include "utf.h"
 #include "util.h"
 
+static char *format = "";
+
 static void
-strings(FILE *fp, const char *fname, int len)
+strings(FILE *fp, const char *fname, size_t len)
 {
-       unsigned char buf[BUFSIZ];
-       int c, i = 0;
-       off_t offset = 0;
+       Rune r, *rbuf;
+       size_t i, bread;
+       off_t off;
+
+       rbuf = emalloc(len * sizeof(*rbuf));
 
-       do {
-               offset++;
-               if (isprint(c = getc(fp)))
-                       buf[i++] = c;
-               if ((!isprint(c) && i >= len) || i == sizeof(buf) - 1) {
-                       buf[i] = '\0';
-                       printf("%8ld: %s\n", (long)offset - i - 1, buf);
+       for (off = 0, i = 0; (bread = efgetrune(&r, fp, fname)); ) {
+               off += bread;
+               if (r == Runeerror)
+                       continue;
+               else if (!isprintrune(r)) {
+                       if (i > len)
+                               putchar('\n');
                        i = 0;
+                       continue;
+               }
+               if (i < len) {
+                       rbuf[i++] = r;
+                       continue;
+               } else if (i > len) {
+                       efputrune(&r, stdout, "<stdout>");
+                       continue;
                }
-       } while (c != EOF);
-       if (ferror(fp))
-               eprintf("%s: read error:", fname);
+               printf(format, (long)off - i);
+               for (i = 0; i < len; i++) {
+                       efputrune(rbuf + i, stdout, "<stdout>");
+               }
+               i++;
+       }
+       free(rbuf);
 }
 
 static void
 usage(void)
 {
-       eprintf("usage: %s [-a] [-n len] [file ...]\n", argv0);
+       eprintf("usage: %s [-a] [-n num] [-t format] [file ...]\n", argv0);
 }
 
 int
 main(int argc, char *argv[])
 {
        FILE *fp;
+       size_t len = 4;
        int ret = 0;
-       int len = 4;
+       char f;
 
        ARGBEGIN {
        case 'a':
                break;
        case 'n':
-               len = estrtonum(EARGF(usage()), 1, INT_MAX);
+               len = estrtonum(EARGF(usage()), 1, LLONG_MAX);
+               break;
+       case 't':
+               format = estrdup("%8l#: ");
+               f = *EARGF(usage());
+               if (f == 'd' || f == 'o' || f == 'x')
+                       format[3] = f;
+               else
+                       usage();
                break;
        default:
                usage();

Reply via email to