[ https://issues.apache.org/jira/browse/AVRO-1858?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16703784#comment-16703784 ]
ASF GitHub Bot commented on AVRO-1858: -------------------------------------- dkulp closed pull request #100: AVRO-1858 add tojson head mode URL: https://github.com/apache/avro/pull/100 This is a PR merged from a forked repository. As GitHub hides the original diff on merge, it is displayed below for the sake of provenance: As this is a foreign pull request (from a fork), the diff is supplied below (as it won't show otherwise due to GitHub magic): diff --git a/lang/java/tools/src/main/java/org/apache/avro/tool/DataFileReadTool.java b/lang/java/tools/src/main/java/org/apache/avro/tool/DataFileReadTool.java index fcc89caf0..79625e3cd 100644 --- a/lang/java/tools/src/main/java/org/apache/avro/tool/DataFileReadTool.java +++ b/lang/java/tools/src/main/java/org/apache/avro/tool/DataFileReadTool.java @@ -20,12 +20,14 @@ import java.io.BufferedInputStream; import java.io.InputStream; import java.io.PrintStream; +import java.util.ArrayList; import java.util.List; import joptsimple.OptionParser; import joptsimple.OptionSet; import joptsimple.OptionSpec; +import org.apache.avro.AvroRuntimeException; import org.apache.avro.Schema; import org.apache.avro.file.DataFileStream; import org.apache.avro.io.DatumWriter; @@ -36,6 +38,7 @@ /** Reads a data file and dumps to JSON */ public class DataFileReadTool implements Tool { + private static final long DEFAULT_HEAD_COUNT = 10; @Override public String getName() { @@ -53,10 +56,14 @@ public int run(InputStream stdin, PrintStream out, PrintStream err, OptionParser optionParser = new OptionParser(); OptionSpec<Void> prettyOption = optionParser .accepts("pretty", "Turns on pretty printing."); + String headDesc = String.format("Converts the first X records (default is %d).", DEFAULT_HEAD_COUNT); + OptionSpec<String> headOption = optionParser.accepts("head", headDesc).withOptionalArg(); OptionSet optionSet = optionParser.parse(args.toArray(new String[0])); Boolean pretty = optionSet.has(prettyOption); - List<String> nargs = (List<String>)optionSet.nonOptionArguments(); + List<String> nargs = new ArrayList<String>((List<String>)optionSet.nonOptionArguments()); + + long headCount = getHeadCount(optionSet, headOption, nargs); if (nargs.size() != 1) { printHelp(err); @@ -73,8 +80,10 @@ public int run(InputStream stdin, PrintStream out, PrintStream err, Schema schema = streamReader.getSchema(); DatumWriter<Object> writer = new GenericDatumWriter<Object>(schema); JsonEncoder encoder = EncoderFactory.get().jsonEncoder(schema, out, pretty); - for (Object datum : streamReader) + for(long recordCount = 0; streamReader.hasNext() && recordCount < headCount; recordCount++) { + Object datum = streamReader.next(); writer.write(datum, encoder); + } encoder.flush(); out.println(); out.flush(); @@ -84,8 +93,28 @@ public int run(InputStream stdin, PrintStream out, PrintStream err, return 0; } + private static long getHeadCount(OptionSet optionSet, OptionSpec<String> headOption, List<String> nargs) { + long headCount = Long.MAX_VALUE; + if(optionSet.has(headOption)) { + headCount = DEFAULT_HEAD_COUNT; + List<String> headValues = optionSet.valuesOf(headOption); + if(headValues.size() > 0) { + // if the value parses to int, assume it's meant to go with --head + // otherwise assume it was an optionSet.nonOptionArgument and add back to the list + // TODO: support input filenames whose whole path+name is int parsable? + try { + headCount = Long.parseLong(headValues.get(0)); + if(headCount < 0) throw new AvroRuntimeException("--head count must not be negative"); + } catch(NumberFormatException ex) { + nargs.addAll(headValues); + } + } + } + return headCount; + } + private void printHelp(PrintStream ps) { - ps.println("tojson --pretty input-file"); + ps.println("tojson [--pretty] [--head[=X]] input-file"); ps.println(); ps.println(getShortDescription()); ps.println("A dash ('-') can be given as an input file to use stdin"); diff --git a/lang/java/tools/src/test/java/org/apache/avro/tool/TestDataFileTools.java b/lang/java/tools/src/test/java/org/apache/avro/tool/TestDataFileTools.java index 0270b713f..473ac2d4d 100644 --- a/lang/java/tools/src/test/java/org/apache/avro/tool/TestDataFileTools.java +++ b/lang/java/tools/src/test/java/org/apache/avro/tool/TestDataFileTools.java @@ -35,6 +35,7 @@ import java.util.Collections; import java.util.List; +import org.apache.avro.AvroRuntimeException; import org.apache.avro.AvroTestUtil; import org.apache.avro.Schema; import org.apache.avro.Schema.Type; @@ -47,7 +48,7 @@ @SuppressWarnings("deprecation") public class TestDataFileTools { - static final int COUNT = 10; + static final int COUNT = 15; static File sampleFile; static String jsonData; static Schema schema; @@ -117,6 +118,43 @@ public void testReadToJsonPretty() throws Exception { run(new DataFileReadTool(), "--pretty", sampleFile.getPath())); } + @Test + public void testReadHeadDefaultCount() throws Exception { + String expectedJson = jsonData.substring(0, 20); // first 10 numbers + assertEquals(expectedJson, + run(new DataFileReadTool(), "--head", sampleFile.getPath())); + } + + @Test + public void testReadHeadEquals3Count() throws Exception { + String expectedJson = jsonData.substring(0, 6); // first 3 numbers + assertEquals(expectedJson, + run(new DataFileReadTool(), "--head=3", sampleFile.getPath())); + } + + @Test + public void testReadHeadSpace5Count() throws Exception { + String expectedJson = jsonData.substring(0, 10); // first 5 numbers + assertEquals(expectedJson, + run(new DataFileReadTool(), "--head", "5", sampleFile.getPath())); + } + + @Test + public void testReadHeadLongCount() throws Exception { + assertEquals(jsonData, + run(new DataFileReadTool(), "--head=3000000000", sampleFile.getPath())); + } + + @Test + public void testReadHeadEqualsZeroCount() throws Exception { + assertEquals("\n", run(new DataFileReadTool(), "--head=0", sampleFile.getPath())); + } + + @Test(expected = AvroRuntimeException.class) + public void testReadHeadNegativeCount() throws Exception { + assertEquals("\n", run(new DataFileReadTool(), "--head=-5", sampleFile.getPath())); + } + @Test public void testGetMeta() throws Exception { String output = run(new DataFileGetMetaTool(), sampleFile.getPath()); ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > Update DataFileReadTool (tojson) to support a "head" concept > ------------------------------------------------------------ > > Key: AVRO-1858 > URL: https://issues.apache.org/jira/browse/AVRO-1858 > Project: Apache Avro > Issue Type: Improvement > Components: java > Affects Versions: 1.8.1 > Reporter: Mike Hurley > Assignee: Mike Hurley > Priority: Major > > It would be nice if the tojson operator supported a "head" concept in order > to get a sampling of records in an Avro file. > Allow specifying a maximum record count to display. If no max is given in > head mode, use a reasonable default (like 10). -- This message was sent by Atlassian JIRA (v7.6.3#76005)