Hello, Here's an improved version of the 'auto-output-format' feature for join. Includes code, tests, NEWS and documentation.
The patch is attached below, and is also available here: http://cancan.cshl.edu/labmembers/gordon/coreutils8/join_auto_format.patch See email below for a use-case example. Thanks, -gordon Assaf Gordon wrote, On 11/04/2009 08:36 PM: > > This feature allows join to automatically guess the output format > without specifying '-o', allowing easier use (IMHO) of "-e". This is > mostly a convenience, DWIM kind of feature. > Here a simple use case: > > $ cat 1.txt > 1 alice > 2 bob > 4 dave > > $ cat 2.txt > 1 red > 2 green > 3 blue > > Joining with "-a 1 -a 2" will display the third and fourth items without > proper field 'fillers': > > $ join -j1 -a1 -a2 1.txt 2.txt > 1 alice red > 2 bob green > 3 blue > 4 dave > > This behavior is of course by design. > If one needs the empty columns to be filled, it requires both "-e" and > "-o", and to use "-o" properly, one needs to know beforehand the columns > in the input files: > > $ join -j1 -a1 -a2 -e FOO -o 0,1.2,2.2 1.txt 2.txt > 1 alice red > 2 bob green > 3 FOO blue > 4 dave FOO > > If there are many columns in the input fields, writing the proper "-o" > format string is cumbersome. > > I suggest a simple feature: > When adding "--auto-format" argument, join will automatically generate > an output format (simulating "-o"), by putting the joined field first, > followed by all the fields from file1, followed by all fields from file2. > (This feature assumes the number of columns in the first lines > represents the number of columns in all lines). 
> This allows using "-e" without specifying "-o", as so: > > $ join -j1 -a1 -a2 -e FOO --auto-format 1.txt 2.txt > 1 alice red > 2 bob green > 3 FOO blue > 4 dave FOO > NEWS | 3 +++ doc/coreutils.texi | 10 ++++++++++ src/join.c | 36 +++++++++++++++++++++++++++++++++++- tests/misc/join | 21 +++++++++++++++++++++ 4 files changed, 69 insertions(+), 1 deletions(-) diff --git a/NEWS b/NEWS index 5b75dbb..8655faa 100644 --- a/NEWS +++ b/NEWS @@ -77,6 +77,9 @@ GNU coreutils NEWS -*- outline -*- touch now accepts the option --no-dereference (-h), as a means to change symlink timestamps on platforms with enough support. + join now accepts the option --auto-format (-F), to automatically + detect the output format without requiring explicit -o. + * Noteworthy changes in release 8.0 (2009-10-06) [beta] diff --git a/doc/coreutils.texi b/doc/coreutils.texi index 227014c..f692f47 100644 --- a/doc/coreutils.texi +++ b/doc/coreutils.texi @@ -5512,6 +5512,16 @@ Do not check that both input files are in sorted order. This is the default. Replace those output fields that are missing in the input with @var{string}. +@item -F +@itemx --auto-format +@opindex -F +@opindex --auto-format +Automatically detects output format based on the number of fields in the +first line of each input file (as if the user explicitly specified @samp{-o}). +Allows using @samp{-e} without a-priori knowledge of the fields in the input files. +The join field is printed first, followed by the remaining fields from the first +file and the second file. 
+ @item -i @itemx --ignore-case @opindex -i diff --git a/src/join.c b/src/join.c index d734a91..07112eb 100644 --- a/src/join.c +++ b/src/join.c @@ -146,6 +146,7 @@ static struct option const longopts[] = {"ignore-case", no_argument, NULL, 'i'}, {"check-order", no_argument, NULL, CHECK_ORDER_OPTION}, {"nocheck-order", no_argument, NULL, NOCHECK_ORDER_OPTION}, + {"auto-format", no_argument, NULL, 'F'}, {GETOPT_HELP_OPTION_DECL}, {GETOPT_VERSION_OPTION_DECL}, {NULL, 0, NULL, 0} @@ -157,6 +158,12 @@ static struct line uni_blank; /* If nonzero, ignore case when comparing join fields. */ static bool ignore_case; +/* if nonzero, automatically build a specific output field list, + based on the first line of each input file */ +static bool auto_output_format; + +static void build_output_format(struct line const *line1, struct line const* line2); + void usage (int status) { @@ -191,6 +198,8 @@ by whitespace. When FILE1 or FILE2 (not both) is -, read standard input.\n\ --check-order check that the input is correctly sorted, even\n\ if all input lines are pairable\n\ --nocheck-order do not check that the input is correctly sorted\n\ + -F, --auto-format Automatically build output format, based on the first\n\ + line of each input file. 
Allows '-e' without using '-o'.\n\ "), stdout); fputs (HELP_OPTION_DESCRIPTION, stdout); fputs (VERSION_OPTION_DESCRIPTION, stdout); @@ -616,6 +625,9 @@ join (FILE *fp1, FILE *fp2) initseq (&seq2); getseq (fp2, &seq2, 2); + if (auto_output_format && seq1.count && seq2.count) + build_output_format(seq1.lines[0],seq2.lines[0]); + while (seq1.count && seq2.count) { size_t i; @@ -926,6 +938,24 @@ add_file_name (char *name, char *names[2], *optc_status = MIGHT_BE_O_ARG; } +static void +build_output_format(struct line const *line1, struct line const* line2) +{ + int i ; + if (outlist_head.next) + return; + + add_field(0,0); + for (i = 0; i < join_field_1 && i < line1->nfields; ++i) + add_field(1,i); + for (i = join_field_1 + 1; i < line1->nfields; ++i) + add_field(1,i); + for (i = 0; i < join_field_2 && i < line2->nfields; ++i) + add_field(2,i); + for (i = join_field_2 + 1; i < line2->nfields; ++i) + add_field(2,i); +} + int main (int argc, char **argv) { @@ -954,7 +984,7 @@ main (int argc, char **argv) issued_disorder_warning[0] = issued_disorder_warning[1] = false; check_input_order = CHECK_ORDER_DEFAULT; - while ((optc = getopt_long (argc, argv, "-a:e:i1:2:j:o:t:v:", + while ((optc = getopt_long (argc, argv, "-a:e:i1:2:j:o:t:v:F", longopts, NULL)) != -1) { @@ -1052,6 +1082,10 @@ main (int argc, char **argv) &nfiles, &prev_optc_status, &optc_status); break; + case 'F': + auto_output_format = true; + break; + case_GETOPT_HELP_CHAR; case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS); diff --git a/tests/misc/join b/tests/misc/join index d1f1677..3f1e590 100755 --- a/tests/misc/join +++ b/tests/misc/join @@ -185,6 +185,27 @@ my @tv = ( # Before 6.10.143, this would mistakenly fail with the diagnostic: # join: File 1 is not in sorted order ['chkodr-7', '-12', ["2 a\n1 b\n", ""], "", 0], + +# Auto-format +['autoformat-1', '-j1 -a1 -a2 -F -e FOO', + ["1 a\n2 b\n", "1 X\n3 Y\n"], "1 a X\n2 b FOO\n3 FOO Y\n", 0], + +# Auto-format, with empty filler (no '-e' specified)- +# should 
print column delimiters (space characters), but no filler. +# This should be equivalent to specifying "-o 0,1.2,2.2" without "-e". +['autoformat-2', '-j1 -a1 -a2 -F', + ["1 a\n2 b\n", "1 X\n3 Y\n"], "1 a X\n2 b \n3 Y\n", 0], + +# auto-format sanity check: specify explicit output format without -e, +# make sure it matches the above test. +['autoformat-3', '-j1 -a1 -a2 -o 0,1.2,2.2', + ["1 a\n2 b\n", "1 X\n3 Y\n"], "1 a X\n2 b \n3 Y\n", 0], + +# both auto-format and explicit output format (different format than 'auto'), +# auto-format should be silently ignored. +['autoformat-4', '-j1 -a1 -a2 -e FOO -F -o 0,2.2,1.2', + ["1 a\n2 b\n", "1 X\n3 Y\n"], "1 X a\n2 FOO b\n3 Y FOO\n", 0], + ); # Convert the above old-style test vectors to the newer
