Hi On Sat, Sep 09, 2017 at 10:29:21AM -0700, Michael Forney wrote: > On Sat, Sep 9, 2017 at 2:08 AM, Silvan Jegen <s.je...@gmail.com> wrote: > > From: Jim Beveridge <ji...@chromium.org> > > > > The original code is by Jim Beveridge working on Fuchsia. I merged it > > with slight changes. > > > > Time to tar two 1GB files: > > > > Before patch: > > > > real 0m6.428s > > user 0m0.245s > > sys 0m4.881s > > > > real 0m6.454s > > user 0m0.239s > > sys 0m4.883s > > > > real 0m6.515s > > user 0m0.259s > > sys 0m4.839s > > > > After patch: > > > > real 0m4.755s > > user 0m0.026s > > sys 0m1.598s > > > > real 0m4.788s > > user 0m0.063s > > sys 0m1.578s > > > > real 0m4.822s > > user 0m0.007s > > sys 0m1.662s > > > > A similar speedup can be observed for untaring. > > > > In addition to the buffer size increase we change the code to only create > > directories for non-compliant tar files and we check for st to be NULL > > in the recursive copy function. > > He also sent me a pull request on my github branch for oasis: > https://github.com/michaelforney/sbase/pull/2
Ah, I did not see that one. > I think we should work on fixing correctness of tar before trying to > optimize it. Currently it does not handle short reads or writes at all > (when working with pipes). I was thinking we should add a readall in > libutil analogous to writeall and then make use of that. That sounds good to me. > Regarding COPY_CHUNK_SIZE, it is probably a good idea to put that in > util.h (perhaps with a different name). concat has the same problem > with a small BUFSIZ (musl's is only 1024). What do you think of Hiltjo's idea of querying for the page size with long sz = sysconf(_SC_PAGESIZE); or similar? Such code could still be put inot util.h of course. > > --- > > tar.c | 72 > > +++++++++++++++++++++++++++++++++++++++++++++++++++---------------- > > 1 file changed, 55 insertions(+), 17 deletions(-) > > > > diff --git a/tar.c b/tar.c > > index 53a737c..8cd1abe 100644 > > --- a/tar.c > > +++ b/tar.c > > @@ -16,6 +16,8 @@ > > #include "util.h" > > > > #define BLKSIZ 512 > > +// COPY_CHUNK_SIZE must be a power of 2 > > +#define COPY_CHUNK_SIZE 8192 > > > > enum Type { > > REG = '0', > > @@ -236,10 +238,13 @@ archive(const char *path) > > ewrite(tarfd, b, BLKSIZ); > > > > if (fd != -1) { > > - while ((l = eread(fd, b, BLKSIZ)) > 0) { > > - if (l < BLKSIZ) > > - memset(b + l, 0, BLKSIZ - l); > > - ewrite(tarfd, b, BLKSIZ); > > + char chunk[COPY_CHUNK_SIZE]; > > + while ((l = eread(fd, chunk, COPY_CHUNK_SIZE)) > 0) { > > + // Ceiling to BLKSIZ boundary > > + int ceilsize = (l + (BLKSIZ-1)) & ~(BLKSIZ-1); > > + if (l < ceilsize) > > + memset(chunk + l, 0, ceilsize - l); > > + ewrite(tarfd, chunk, ceilsize); > > } > > close(fd); > > } > > @@ -250,7 +255,7 @@ archive(const char *path) > > static int > > unarchive(char *fname, ssize_t l, char b[BLKSIZ]) > > { > > - char lname[101], *tmp, *p; > > + char lname[101], *p; > > long mode, major, minor, type, mtime, uid, gid; > > struct header *h = (struct header *)b; > > int fd = -1; > > @@ -261,9 +266,13 @@ unarchive(char *fname, ssize_t l, char b[BLKSIZ]) > > if (remove(fname) < 0 && errno != ENOENT) > > weprintf("remove %s:", fname); > > > > - tmp = estrdup(fname); > > - mkdirp(dirname(tmp), 0777, 0777); > > - free(tmp); > > + // tar files normally create the directory chain. This is a fallback > > + // for noncompliant tar files. > > + if (h->type != DIRECTORY) { > > + char* tmp = estrdup(fname); > > + mkdirp(dirname(tmp), 0777, 0777); > > + free(tmp); > > + } > > If you tar a file within another directory, you end up with just one > entry. This check means that the parent directory won't be created > when trying to extract this file. Other tar implementations are able > to extract such an archive. > > > > > switch (h->type) { > > case REG: > > @@ -319,9 +328,25 @@ unarchive(char *fname, ssize_t l, char b[BLKSIZ]) > > eprintf("strtol %s: invalid number\n", h->gid); > > > > if (fd != -1) { > > - for (; l > 0; l -= BLKSIZ) > > - if (eread(tarfd, b, BLKSIZ) > 0) > > - ewrite(fd, b, MIN(l, BLKSIZ)); > > + // Ceiling to BLKSIZ boundary > > + int readsize = (l + (BLKSIZ-1)) & ~(BLKSIZ-1); > > + char chunk[COPY_CHUNK_SIZE]; > > + int lastread = 0; > > + > > + for (; readsize > 0; l -= lastread, readsize -= lastread) { > > + int chunk_size = MIN(readsize, COPY_CHUNK_SIZE); > > + // Short reads are legal, so don't expect to read > > + // everything that was requested. > > + lastread = eread(tarfd, chunk, chunk_size); > > + if (lastread == 0) { > > + close(fd); > > + remove(fname); > > + eprintf("unexpected end of file reading > > %s.\n", > > + fname); > > + } > > + > > + ewrite(fd, chunk, MIN(l, lastread)); > > + } > > close(fd); > > } > > > > @@ -331,7 +356,7 @@ unarchive(char *fname, ssize_t l, char b[BLKSIZ]) > > times[0].tv_sec = times[1].tv_sec = mtime; > > times[0].tv_nsec = times[1].tv_nsec = 0; > > if (!mflag && utimensat(AT_FDCWD, fname, times, > > AT_SYMLINK_NOFOLLOW) < 0) > > - weprintf("utimensat %s:\n", fname); > > + weprintf("utimensat %s %d:\n", fname, errno); > > if (h->type == SYMLINK) { > > if (!getuid() && lchown(fname, uid, gid)) > > weprintf("lchown %s:\n", fname); > > @@ -349,10 +374,23 @@ static void > > skipblk(ssize_t l) > > { > > char b[BLKSIZ]; > > - > > - for (; l > 0; l -= BLKSIZ) > > - if (!eread(tarfd, b, BLKSIZ)) > > - break; > > + int lastread = 0; > > + // Ceiling to BLKSIZ boundary > > + int ceilsize = (l + (BLKSIZ-1)) & ~(BLKSIZ-1); > > + > > + off_t offset = lseek(tarfd, ceilsize, SEEK_CUR); > > + if (offset >= ceilsize) > > + return; > > + if (errno != ESPIPE) > > + eprintf("unexpected end of file.\n"); > > + > > + // This is a pipe, socket or FIFO. Fall back to a sequential read. > > + for (; ceilsize > 0; ceilsize -= lastread) { > > + int chunk_size = MIN(ceilsize, BLKSIZ); > > + lastread = eread(tarfd, b, chunk_size); > > + if (lastread == 0) > > + eprintf("unexpected end of file %d.\n", errno); > > + } > > } > > > > static int > > @@ -370,7 +408,7 @@ c(const char *path, struct stat *st, void *data, struct > > recursor *r) > > if (vflag) > > puts(path); > > > > - if (S_ISDIR(st->st_mode)) > > + if (st && S_ISDIR(st->st_mode)) > > recurse(path, NULL, r); > > } > > I don't understand this. Can you explain? Perhaps this was just a bad > merge of > https://git.suckless.org/sbase/commit/?id=a5612b0d08b9abb4e65acaa3d322581af2fd7a39? Ah, I faintly remembered something like this but then thought I could keep it in just in case. According to that commit, st can never be NULL when these functions are called so we can remove that check. Cheers, Silvan