Author: cutting
Date: Mon Jan 6 23:33:37 2014
New Revision: 1556069
URL: http://svn.apache.org/r1556069
Log:
AVRO-1426. Java: Add mapreduce word count example. Contributed by Jesse
Anderson.
Added:
avro/trunk/doc/examples/mr-example/src/main/java/example/MapReduceAvroWordCount.java
(with props)
Modified:
avro/trunk/CHANGES.txt
avro/trunk/doc/src/content/xdocs/mr.xml
Modified: avro/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/avro/trunk/CHANGES.txt?rev=1556069&r1=1556068&r2=1556069&view=diff
==============================================================================
--- avro/trunk/CHANGES.txt (original)
+++ avro/trunk/CHANGES.txt Mon Jan 6 23:33:37 2014
@@ -67,6 +67,9 @@ Trunk (not yet released)
AVRO-1225. Java: Add guide for MapReduce API. (Brock Noland via cutting)
+ AVRO-1426. Java: Add mapreduce word count example.
+ (Jesse Anderson via cutting)
+
BUG FIXES
AVRO-1368. Fix SpecificDatumWriter to, when writing a string
Added:
avro/trunk/doc/examples/mr-example/src/main/java/example/MapReduceAvroWordCount.java
URL:
http://svn.apache.org/viewvc/avro/trunk/doc/examples/mr-example/src/main/java/example/MapReduceAvroWordCount.java?rev=1556069&view=auto
==============================================================================
--- avro/trunk/doc/examples/mr-example/src/main/java/example/MapReduceAvroWordCount.java (added)
+++ avro/trunk/doc/examples/mr-example/src/main/java/example/MapReduceAvroWordCount.java Mon Jan 6 23:33:37 2014
@@ -0,0 +1,124 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package example;
+
+import java.io.IOException;
+import java.util.*;
+
+import org.apache.avro.Schema;
+import org.apache.avro.Schema.Type;
+import org.apache.avro.mapred.AvroWrapper;
+import org.apache.avro.mapred.Pair;
+import org.apache.avro.mapreduce.AvroJob;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.NullWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+
+/**
+ * The classic WordCount example on the new MapReduce API, modified to output
+ * Avro {@code Pair<CharSequence, Integer>} records instead of text.
+ */
+public class MapReduceAvroWordCount extends Configured implements Tool {
+
+  /** Emits {@code (token, 1)} for each whitespace-delimited token of a line. */
+  public static class Map
+    extends Mapper<LongWritable, Text, Text, IntWritable> {
+    private final static IntWritable one = new IntWritable(1);
+    private final Text word = new Text();
+
+    @Override
+    public void map(LongWritable key, Text value, Context context)
+      throws IOException, InterruptedException {
+      StringTokenizer tokenizer = new StringTokenizer(value.toString());
+      while (tokenizer.hasMoreTokens()) {
+        word.set(tokenizer.nextToken());
+        context.write(word, one);
+      }
+    }
+  }
+
+  /** Sums the counts of a word and writes a wrapped Avro (word, total) pair. */
+  public static class Reduce
+    extends Reducer<Text, IntWritable,
+                    AvroWrapper<Pair<CharSequence, Integer>>, NullWritable> {
+    @Override
+    public void reduce(Text key, Iterable<IntWritable> values, Context context)
+      throws IOException, InterruptedException {
+      int sum = 0;
+      for (IntWritable value : values) {
+        sum += value.get();
+      }
+      context.write(new AvroWrapper<Pair<CharSequence, Integer>>
+                    (new Pair<CharSequence, Integer>(key.toString(), sum)),
+                    NullWritable.get());
+    }
+  }
+
+  /**
+   * Configures and runs the word count job.
+   * @param args the input path followed by the output path
+   * @return 0 if the job succeeded, 1 if it failed, -1 on a usage error
+   */
+  @Override
+  public int run(String[] args) throws Exception {
+    if (args.length != 2) {
+      System.err.println("Usage: AvroWordCount <input path> <output path>");
+      return -1;
+    }
+
+    Job job = new Job(getConf());
+    job.setJarByClass(MapReduceAvroWordCount.class);
+    job.setJobName("wordcount");
+    // Call AvroJob.setOutputKeySchema first so that the explicit settings
+    // below can override the configuration parameters it sets.
+    AvroJob.setOutputKeySchema(job,
+                               Pair.getPairSchema(Schema.create(Type.STRING),
+                                                  Schema.create(Type.INT)));
+    job.setOutputValueClass(NullWritable.class);
+    job.setMapperClass(Map.class);
+    job.setReducerClass(Reduce.class);
+    job.setInputFormatClass(TextInputFormat.class);
+    job.setMapOutputKeyClass(Text.class);
+    job.setMapOutputValueClass(IntWritable.class);
+    job.setSortComparatorClass(Text.Comparator.class);
+    FileInputFormat.setInputPaths(job, new Path(args[0]));
+    FileOutputFormat.setOutputPath(job, new Path(args[1]));
+
+    // Propagate job failure to the caller: a failed job yields a non-zero status.
+    return job.waitForCompletion(true) ? 0 : 1;
+  }
+
+  /** Runs the job through ToolRunner and exits with the status it returns. */
+  public static void main(String[] args) throws Exception {
+    System.exit(ToolRunner.run(new Configuration(),
+                               new MapReduceAvroWordCount(), args));
+  }
+}
Propchange:
avro/trunk/doc/examples/mr-example/src/main/java/example/MapReduceAvroWordCount.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: avro/trunk/doc/src/content/xdocs/mr.xml
URL:
http://svn.apache.org/viewvc/avro/trunk/doc/src/content/xdocs/mr.xml?rev=1556069&r1=1556068&r2=1556069&view=diff
==============================================================================
--- avro/trunk/doc/src/content/xdocs/mr.xml (original)
+++ avro/trunk/doc/src/content/xdocs/mr.xml Mon Jan 6 23:33:37 2014
@@ -39,7 +39,8 @@
See the <a href="http://hadoop.apache.org/docs/current/">Hadoop
documentation</a> and the <a href="gettingstartedjava.html">Avro getting
started guide</a> for introductions to these projects. This guide uses
- the old MapReduce API (<code>org.apache.hadoop.mapred</code>).
+ the old MapReduce API (<code>org.apache.hadoop.mapred</code>) and the new
+ MapReduce API (<code>org.apache.hadoop.mapreduce</code>).
</p>
<section>
<title>Setup</title>
@@ -289,7 +290,10 @@ public class MapReduceColorCount extends
ColorCount reads in data files containing <code>User</code> records,
defined in <em>examples/user.avsc</em>, and counts the number of
instances of each favorite color. (This example draws inspiration from
- the canonical WordCount MapReduce application.) The <code>User</code>
+ the canonical WordCount MapReduce application.) This example uses the
+ old MapReduce API. See MapReduceAvroWordCount, found under
+ <em>doc/examples/mr-example/src/main/java/example/</em> to see the new MapReduce
+ API example. The <code>User</code>
schema is defined as follows:
</p>
<source>
@@ -547,7 +551,7 @@ AvroJob.setOutputSchema(conf, Pair.getPa
</p>
<p>
- The mapred package has api <a
+ The mapred package has API <a
href="http://avro.apache.org/docs/current/api/java/org/apache/avro/mapred/package-summary.html">
<code>org.apache.avro.mapred</code> documentation</a> as does the <a
href="http://avro.apache.org/docs/current/api/java/org/apache/avro/mapreduce/package-summary.html">
@@ -558,7 +562,11 @@ AvroJob.setOutputSchema(conf, Pair.getPa
these libraries. See the AvroWordCount application, found under
<em>examples/mr-example/src/main/java/example/AvroWordCount.java</em> in
the Avro documentation, for an example of implementing a
- <code>Reducer</code> that outputs Avro data.
+ <code>Reducer</code> that outputs Avro data using the old MapReduce API.
+ See the MapReduceAvroWordCount application, found under
+ <em>examples/mr-example/src/main/java/example/MapReduceAvroWordCount.java</em> in
+ the Avro documentation, for an example of implementing a
+ <code>Reducer</code> that outputs Avro data using the new MapReduce API.
</p>
</section>
</body>