I've added a manpage and applied all of FRIGN and Hiltjo's style changes.
Regards, -- Wolfgang Corcoran-Mathe
#include <ctype.h> #include <stdint.h> #include <stdio.h> #include <stdlib.h> #include <string.h> #include "arg.h" #include "text.h" #include "utf.h" #include "util.h" enum { INIT = 1, GROW = 2, }; enum { EXPAND = 0, RESET = 1, }; enum { FIELD_ERROR = -2, }; struct field { char *s; size_t len; }; struct line { char *text; size_t nf; size_t maxf; struct field *fields; }; struct spec { size_t fileno; size_t fldno; }; struct outlist { size_t ns; size_t maxs; struct spec **specs; }; struct span { size_t nl; size_t maxl; struct line **lines; }; static char *sep = NULL; static char *replace = NULL; static const char defaultofs = ' '; static const int jfield = 1; /* POSIX default join field */ static int unpairsa = 0, unpairsb = 0; static int oflag = 0; static int pairs = 1; static size_t seplen; static struct outlist output; char *argv0; static void usage(void) { eprintf("usage: %s [-1 field] [-2 field] [-o list] [-e string] " "[-a | -v fileno] [-t delim] file1 file2\n", argv0); } static void prfield(struct field *fp) { if (fwrite(fp->s, 1, fp->len, stdout) != fp->len) eprintf("fwrite:"); } static void prsep(void) { if (sep) fwrite(sep, 1, seplen, stdout); else putchar(defaultofs); } static void swaplines(struct line *la, struct line *lb) { struct line tmp; tmp = *la; *la = *lb; *lb = tmp; } static void prjoin(struct line *la, struct line *lb, size_t jfa, size_t jfb) { struct spec *sp; struct field *joinfield; size_t i; if (jfa >= la->nf || jfb >= lb->nf) return; joinfield = &la->fields[jfa]; if (oflag) { for (i = 0; i < output.ns; i++) { sp = output.specs[i]; if (sp->fileno == 1) { if (sp->fldno < la->nf) prfield(&la->fields[sp->fldno]); else if (replace) fputs(replace, stdout); } else if (sp->fileno == 2) { if (sp->fldno < lb->nf) prfield(&lb->fields[sp->fldno]); else if (replace) fputs(replace, stdout); } else if (sp->fileno == 0) { prfield(joinfield); } if (i < output.ns - 1) prsep(); } } else { prfield(joinfield); prsep(); for (i = 0; i < la->nf; i++) { if (i != 
jfa) { prfield(&la->fields[i]); prsep(); } } for (i = 0; i < lb->nf; i++) { if (i != jfb) { prfield(&lb->fields[i]); if (i < la->nf - 1) prsep(); } } } putchar('\n'); } static void prline(struct line *lp) { size_t len = strlen(lp->text); if (fwrite(lp->text, 1, len, stdout) != len) eprintf("fwrite:"); putchar('\n'); } static int linecmp(struct line *la, struct line *lb, size_t jfa, size_t jfb) { int status; /* return FIELD_ERROR if both lines are short */ if (jfa >= la->nf) { status = jfb >= lb->nf ? FIELD_ERROR : -1; } else if (jfb >= lb->nf) { status = 1; } else { status = memcmp(la->fields[jfa].s, lb->fields[jfb].s, MAX (la->fields[jfa].len, lb->fields[jfb].len)); if (status > 0) status = 1; else if (status < 0) status = -1; } return status; } static void addfield(struct line *lp, char *sp, size_t len) { if (lp->nf >= lp->maxf) { lp->fields = ereallocarray(lp->fields, (GROW * lp->maxf), sizeof(struct field)); lp->maxf *= GROW; } lp->fields[lp->nf].s = sp; lp->fields[lp->nf].len = len; lp->nf++; } static void prspanjoin(struct span *spa, struct span *spb, size_t jfa, size_t jfb) { size_t i, j; for (i = 0; i < (spa->nl - 1); i++) for (j = 0; j < (spb->nl - 1); j++) prjoin(spa->lines[i], spb->lines[j], jfa, jfb); } static struct line * makeline(char *s, size_t len) { struct line *lp; char *sp, *beg, *end; size_t i; int eol = 0; if (s[len-1] == '\n') s[len-1] = '\0'; lp = ereallocarray(NULL, INIT, sizeof(struct line)); lp->text = s; lp->fields = ereallocarray(NULL, INIT, sizeof(struct field)); lp->nf = 0; lp->maxf = INIT; for (sp = lp->text; isblank(*sp); sp++) ; while (!eol) { beg = sp; if (sep) { if (!(end = utfutf(sp, sep))) eol = 1; if (!eol) { addfield(lp, beg, end - beg); for (i = 0; i < seplen; i++) end++; } } else { for (end = sp; !(isblank(*end)); end++) { if (*end == '\0') { eol = 1; break; } } if (!eol) addfield(lp, beg, end - beg); while (isblank(*++end)) ; } if (eol) addfield(lp, beg, strlen(sp)); sp = end; } return lp; } static int addtospan(struct 
span *sp, FILE *fp, int reset) { char *newl = NULL; size_t len, size = 0; if ((len = getline(&newl, &size, fp)) == -1) { if (ferror(fp)) eprintf("getline:"); else return 0; } if (reset) sp->nl = 0; if (sp->nl >= sp->maxl) { sp->lines = ereallocarray(sp->lines, (GROW * sp->maxl), sizeof(struct line *)); sp->maxl *= GROW; } sp->lines[sp->nl] = makeline(newl, len); sp->nl++; return 1; } static void initspan(struct span *sp) { sp->nl = 0; sp->maxl = INIT; sp->lines = ereallocarray(NULL, INIT, sizeof(struct line *));; } static void freespan(struct span *sp) { size_t i; for (i = 0; i < sp->nl; i++) { free(sp->lines[i]->fields); free(sp->lines[i]->text); } free(sp->lines); } static void initolist(struct outlist *olp) { olp->ns = 0; olp->maxs = 1; olp->specs = ereallocarray(NULL, INIT, sizeof(struct spec *)); } static void addspec(struct outlist *olp, struct spec *sp) { if (olp->ns >= olp->maxs) { olp->specs = ereallocarray(olp->specs, (GROW * olp->maxs), sizeof(struct spec *)); olp->maxs *= GROW; } olp->specs[olp->ns] = sp; olp->ns++; } static struct spec * makespec(char *s) { struct spec *sp; int fileno; size_t fldno; switch (s[0]) { case '0': /* join field */ fileno = 0; fldno = 0; break; case '1': case '2': if (sscanf(s, "%d.%zu", &fileno, &fldno) != 2) eprintf("\"%s\": invalid format\n", s); fldno--; /* ugly */ break; default: eprintf("%c: invalid file number (must be 0, 1 or 2)\n", s[0]); break; } sp = ereallocarray(NULL, INIT, sizeof(struct spec)); sp->fileno = fileno; sp->fldno = fldno; return sp; } static void makeolist(struct outlist *olp, char *s) { char *item, *sp; sp = s; while (sp) { item = sp; sp = strpbrk(sp, ", \t"); if (sp) *sp++ = '\0'; addspec(olp, makespec(item)); } } static void freespecs(struct outlist *olp) { size_t i; for (i = 0; i < olp->ns; i++) free(olp->specs[i]); } static void join(FILE *fa, FILE *fb, size_t jfa, size_t jfb) { struct span spa, spb; int cmp, eofa, eofb; initspan(&spa); initspan(&spb); cmp = eofa = eofb = 0; addtospan(&spa, fa, 
RESET); addtospan(&spb, fb, RESET); while (spa.nl && spb.nl) { if ((cmp = linecmp(spa.lines[0], spb.lines[0], jfa, jfb)) < 0) { if (unpairsa) prline(spa.lines[0]); if (!addtospan(&spa, fa, RESET)) { if (unpairsb) { /* a is EOF'd; print the rest of b */ do prline(spb.lines[0]); while (addtospan(&spb, fb, RESET)); } eofa = eofb = 1; } else { continue; } } else if (cmp > 0) { if (unpairsb) prline(spb.lines[0]); if (!addtospan(&spb, fb, RESET)) { if (unpairsa) { /* b is EOF'd; print the rest of a */ do prline(spa.lines[0]); while (addtospan(&spa, fa, RESET)); } eofa = eofb = 1; } else { continue; } } else if (cmp == 0) { /* read all consecutive matching lines from a */ do { if (!addtospan(&spa, fa, EXPAND)) { eofa = 1; spa.nl++; break; } } while (linecmp(spa.lines[spa.nl-1], spb.lines[0], jfa, jfb) == 0); /* read all consecutive matching lines from b */ do { if (!addtospan(&spb, fb, EXPAND)) { eofb = 1; spb.nl++; break; } } while (linecmp(spa.lines[0], spb.lines[spb.nl-1], jfa, jfb) == 0); if (pairs) prspanjoin(&spa, &spb, jfa, jfb); } else { /* FIELD_ERROR: both lines lacked join fields */ if (unpairsa) prline(spa.lines[0]); if (unpairsb) prline(spb.lines[0]); eofa = addtospan(&spa, fa, RESET) ? 0 : 1; eofb = addtospan(&spb, fb, RESET) ? 
0 : 1; if (!eofa && !eofb) continue; } if (eofa) { spa.nl = 0; } else { swaplines(spa.lines[0], spa.lines[spa.nl - 1]); /* ugly */ spa.nl = 1; } if (eofb) { spb.nl = 0; } else { swaplines(spb.lines[0], spb.lines[spb.nl - 1]); /* ugly */ spb.nl = 1; } } freespan(&spa); freespan(&spb); } int main(int argc, char *argv[]) { size_t jf[2] = { jfield, jfield, }; FILE *fp[2]; int n; char *fno; ARGBEGIN { case '1': jf[0] = estrtonum(EARGF(usage()), 1, MIN(LLONG_MAX, SIZE_MAX)); break; case '2': jf[1] = estrtonum(EARGF(usage()), 1, MIN(LLONG_MAX, SIZE_MAX)); break; case 'a': fno = EARGF(usage()); if (strcmp(fno, "1") == 0) unpairsa = 1; else if (strcmp(fno, "2") == 0) unpairsb = 1; else usage(); break; case 'e': replace = EARGF(usage()); break; case 'o': oflag = 1; initolist(&output); makeolist(&output, EARGF(usage())); break; case 't': sep = EARGF(usage()); break; case 'v': pairs = 0; fno = EARGF(usage()); if (strcmp(fno, "1") == 0) unpairsa = 1; else if (strcmp(fno, "2") == 0) unpairsb = 1; else usage(); break; default: usage(); } ARGEND; if (sep) seplen = unescape(sep); if (argc != 2) usage(); for (n = 0; n < 2; n++) { if (argv[n][0] == '-' && !argv[n][1]) { argv[n] = "<stdin>"; fp[n] = stdin; } else if (!(fp[n] = fopen(argv[n], "r"))) { eprintf("fopen %s:", argv[n]); } } jf[0]--; jf[1]--; join(fp[0], fp[1], jf[0], jf[1]); if (oflag) freespecs(&output); enfshut(2, fp[0], argv[0]); if (fp[0] != fp[1]) enfshut(2, fp[1], argv[1]); enfshut(2, stdout, "<stdout>"); exit(0); }
.Dd April 18, 2015 .Dt JOIN 1 .Os sbase .Sh NAME .Nm join .Nd relational database operator .Sh SYNOPSIS .Nm .Op Fl 1 Ar field .Op Fl 2 Ar field .Op Fl o Ar list .Op Fl e Ar string .Op Fl a Ar fileno | Fl v Ar fileno .Op Fl t Ar delim .Ar file1 file2 .Sh DESCRIPTION .Nm lines from .Ar file1 and .Ar file2 on a matching field. If one of the input files is '-', standard input is read for that file. .Pp Files are read sequentially and are assumed to be sorted on the join field. .Nm does not check the order of input, and joining two unsorted files will produce unexpected output. .Pp By default, input lines are matched on the first blank-separated field; output lines are space-separated and consist of the join field followed by the remaining fields from .Ar file1 Ns , then the remaining fields from .Ar file2 Ns . .Sh OPTIONS .Bl -tag -width Ds .It Fl 1 Ar field Join on the .Ar field Ns eth field of file 1. .It Fl 2 Ar field Join on the .Ar field Ns eth field of file 2. .It Fl a Ar fileno Print unpairable lines from file .Ar fileno in addition to normal output. .It Fl e Ar string When used with .Fl o Ns , replace empty fields in the output list with .Ar string Ns . .It Fl o Ar list Format output according to the string .Ar list Ns . Each element of .Ar list may be either .Ar fileno.field or 0 (representing the join field). Elements in .Ar list may be separated by blanks or commas. For example, .Bd -literal -offset indent join -o "0 2.1 1.3" .Ed .Pp would print the join field, the first field of .Ar file2 Ns , then the third field of .Ar file1 Ns . .Pp Only paired lines are formatted with the .Fl o option. Unpairable lines (selected with .Fl a or .Fl v Ns ) are printed raw. .It Fl t Ar delim Use the arbitrary string .Ar delim as field delimiter for both input and output. .It Fl v Ar fileno Print unpairable lines from file .Ar fileno instead of normal output. 
.El
.Sh STANDARDS
The
.Nm
utility is compliant with the
.St -p1003.1-2013
specification with the following exception:
.Bl -bullet -offset indent
.It
Unpairable lines ignore formatting specified with
.Fl o Ns .
.El
.Pp
The possibility of specifying multibyte delimiters of arbitrary length is
an extension to the specification.