HADOOP-13110. add a streaming subcommand to mapred

Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo
Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/584a9156
Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/584a9156
Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/584a9156

Branch: refs/heads/HADOOP-12930
Commit: 584a915611fada6779bb74cb15b511aeed9c3a36
Parents: 1dcd9a9
Author: Allen Wittenauer <a...@apache.org>
Authored: Fri May 6 14:00:56 2016 -0700
Committer: Allen Wittenauer <a...@apache.org>
Committed: Sun May 15 07:50:15 2016 -0700

----------------------------------------------------------------------
 .../main/resources/assemblies/hadoop-tools.xml  |  8 +++
 .../apache/hadoop/streaming/DumpTypedBytes.java |  3 +-
 .../hadoop/streaming/HadoopStreaming.java       |  3 +-
 .../apache/hadoop/streaming/LoadTypedBytes.java |  3 +-
 .../src/main/shellprofile.d/hadoop-streaming.sh | 55 ++++++++++++++++++++
 .../src/site/markdown/HadoopStreaming.md.vm     | 30 +++++------
 6 files changed, 81 insertions(+), 21 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hadoop/blob/584a9156/hadoop-assemblies/src/main/resources/assemblies/hadoop-tools.xml
----------------------------------------------------------------------
diff --git a/hadoop-assemblies/src/main/resources/assemblies/hadoop-tools.xml b/hadoop-assemblies/src/main/resources/assemblies/hadoop-tools.xml
index 8606e23..3909277 100644
--- a/hadoop-assemblies/src/main/resources/assemblies/hadoop-tools.xml
+++ b/hadoop-assemblies/src/main/resources/assemblies/hadoop-tools.xml
@@ -148,6 +148,14 @@
       </includes>
     </fileSet>
     <fileSet>
+      <directory>../hadoop-streaming/src/main/shellprofile.d</directory>
+      <includes>
+        <include>*</include>
+      </includes>
+      <outputDirectory>/libexec/shellprofile.d</outputDirectory>
+      <fileMode>0755</fileMode>
+    </fileSet>
+    <fileSet>
       <directory>../hadoop-sls/target</directory>
       
<outputDirectory>/share/hadoop/${hadoop.component}/sources</outputDirectory>
       <includes>

http://git-wip-us.apache.org/repos/asf/hadoop/blob/584a9156/hadoop-tools/hadoop-streaming/src/main/java/org/apache/hadoop/streaming/DumpTypedBytes.java
----------------------------------------------------------------------
diff --git a/hadoop-tools/hadoop-streaming/src/main/java/org/apache/hadoop/streaming/DumpTypedBytes.java b/hadoop-tools/hadoop-streaming/src/main/java/org/apache/hadoop/streaming/DumpTypedBytes.java
index 5a07cc3..ffddc7c 100644
--- a/hadoop-tools/hadoop-streaming/src/main/java/org/apache/hadoop/streaming/DumpTypedBytes.java
+++ b/hadoop-tools/hadoop-streaming/src/main/java/org/apache/hadoop/streaming/DumpTypedBytes.java
@@ -91,8 +91,7 @@ public class DumpTypedBytes implements Tool {
   }
 
   private void printUsage() {
-    System.out.println("Usage: $HADOOP_HOME/bin/hadoop jar hadoop-streaming.jar"
-        + " dumptb <glob-pattern>");
+    System.out.println("Usage: mapred streaming dumptb <glob-pattern>");
     System.out.println("  Dumps all files that match the given pattern to " +
         "standard output as typed bytes.");
     System.out.println("  The files can be text or sequence files");

http://git-wip-us.apache.org/repos/asf/hadoop/blob/584a9156/hadoop-tools/hadoop-streaming/src/main/java/org/apache/hadoop/streaming/HadoopStreaming.java
----------------------------------------------------------------------
diff --git a/hadoop-tools/hadoop-streaming/src/main/java/org/apache/hadoop/streaming/HadoopStreaming.java b/hadoop-tools/hadoop-streaming/src/main/java/org/apache/hadoop/streaming/HadoopStreaming.java
index eabf46c..92f9d03 100644
--- a/hadoop-tools/hadoop-streaming/src/main/java/org/apache/hadoop/streaming/HadoopStreaming.java
+++ b/hadoop-tools/hadoop-streaming/src/main/java/org/apache/hadoop/streaming/HadoopStreaming.java
@@ -56,8 +56,7 @@ public class HadoopStreaming {
   }
   
   private static void printUsage() {
-    System.out.println("Usage: $HADOOP_HOME/bin/hadoop jar hadoop-streaming.jar"
-        + " [options]");
+    System.out.println("Usage: mapred streaming [options]");
     System.out.println("Options:");
     System.out.println("  dumptb <glob-pattern> Dumps all files that match the" 
         + " given pattern to ");

http://git-wip-us.apache.org/repos/asf/hadoop/blob/584a9156/hadoop-tools/hadoop-streaming/src/main/java/org/apache/hadoop/streaming/LoadTypedBytes.java
----------------------------------------------------------------------
diff --git a/hadoop-tools/hadoop-streaming/src/main/java/org/apache/hadoop/streaming/LoadTypedBytes.java b/hadoop-tools/hadoop-streaming/src/main/java/org/apache/hadoop/streaming/LoadTypedBytes.java
index a7a001c..838cfa1 100644
--- a/hadoop-tools/hadoop-streaming/src/main/java/org/apache/hadoop/streaming/LoadTypedBytes.java
+++ b/hadoop-tools/hadoop-streaming/src/main/java/org/apache/hadoop/streaming/LoadTypedBytes.java
@@ -89,8 +89,7 @@ public class LoadTypedBytes implements Tool {
   }
 
   private void printUsage() {
-    System.out.println("Usage: $HADOOP_HOME/bin/hadoop jar hadoop-streaming.jar"
-        + " loadtb <path>");
+    System.out.println("Usage: mapred streaming loadtb <path>");
     System.out.println("  Reads typed bytes from standard input" +
     " and stores them in a sequence file in");
     System.out.println("  the specified path");

http://git-wip-us.apache.org/repos/asf/hadoop/blob/584a9156/hadoop-tools/hadoop-streaming/src/main/shellprofile.d/hadoop-streaming.sh
----------------------------------------------------------------------
diff --git a/hadoop-tools/hadoop-streaming/src/main/shellprofile.d/hadoop-streaming.sh b/hadoop-tools/hadoop-streaming/src/main/shellprofile.d/hadoop-streaming.sh
new file mode 100755
index 0000000..cca016d
--- /dev/null
+++ b/hadoop-tools/hadoop-streaming/src/main/shellprofile.d/hadoop-streaming.sh
@@ -0,0 +1,55 @@
+#!/usr/bin/env bash
+
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if ! declare -f mapred_subcommand_streaming >/dev/null 2>/dev/null; then
+
+  if [[ "${HADOOP_SHELL_EXECNAME}" = mapred ]]; then
+    hadoop_add_subcommand "streaming" "launch a mapreduce streaming job"
+  fi
+
+## @description  streaming command for mapred
+## @audience     public
+## @stability    stable
+## @replaceable  yes
+function mapred_subcommand_streaming
+{
+  declare jarname
+  declare oldifs
+
+  # shellcheck disable=SC2034
+  HADOOP_CLASSNAME=org.apache.hadoop.util.RunJar
+  hadoop_add_to_classpath_tools hadoop-streaming
+
+  # locate the streaming jar so we have something to
+  # give to RunJar
+  oldifs=${IFS}
+  IFS=:
+  for jarname in ${CLASSPATH}; do
+    if [[ "${jarname}" =~ hadoop-streaming-[0-9] ]]; then
+      HADOOP_SUBCMD_ARGS=("${jarname}" "${HADOOP_SUBCMD_ARGS[@]}")
+      break
+    fi
+  done
+
+  IFS=${oldifs}
+
+  hadoop_debug "Appending HADOOP_CLIENT_OPTS onto HADOOP_OPTS"
+  HADOOP_OPTS="${HADOOP_OPTS} ${HADOOP_CLIENT_OPTS}"
+
+}
+
+fi

http://git-wip-us.apache.org/repos/asf/hadoop/blob/584a9156/hadoop-tools/hadoop-streaming/src/site/markdown/HadoopStreaming.md.vm
----------------------------------------------------------------------
diff --git a/hadoop-tools/hadoop-streaming/src/site/markdown/HadoopStreaming.md.vm b/hadoop-tools/hadoop-streaming/src/site/markdown/HadoopStreaming.md.vm
index cc8ed69..072a68b 100644
--- a/hadoop-tools/hadoop-streaming/src/site/markdown/HadoopStreaming.md.vm
+++ b/hadoop-tools/hadoop-streaming/src/site/markdown/HadoopStreaming.md.vm
@@ -62,7 +62,7 @@ Hadoop Streaming
 
 Hadoop streaming is a utility that comes with the Hadoop distribution. The utility allows you to create and run Map/Reduce jobs with any executable or script as the mapper and/or the reducer. For example:
 
-    hadoop jar hadoop-streaming-${project.version}.jar \
+    mapred streaming \
       -input myInputDirs \
       -output myOutputDir \
       -mapper /bin/cat \
@@ -88,7 +88,7 @@ Streaming supports streaming command options as well as [generic command options
 
 **Note:** Be sure to place the generic options before the streaming options, otherwise the command will fail. For an example, see [Making Archives Available to Tasks](#Making_Archives_Available_to_Tasks).
 
-    hadoop command [genericOptions] [streamingOptions]
+    mapred streaming [genericOptions] [streamingOptions]
 
 The Hadoop streaming command options are listed here:
 
@@ -115,7 +115,7 @@ $H3 Specifying a Java Class as the Mapper/Reducer
 
 You can supply a Java class as the mapper and/or the reducer.
 
-    hadoop jar hadoop-streaming-${project.version}.jar \
+    mapred streaming \
       -input myInputDirs \
       -output myOutputDir \
       -inputformat org.apache.hadoop.mapred.KeyValueTextInputFormat \
@@ -128,7 +128,7 @@ $H3 Packaging Files With Job Submissions
 
 You can specify any executable as the mapper and/or the reducer. The executables do not need to pre-exist on the machines in the cluster; however, if they don't, you will need to use "-file" option to tell the framework to pack your executable files as a part of job submission. For example:
 
-    hadoop jar hadoop-streaming-${project.version}.jar \
+    mapred streaming \
       -input myInputDirs \
       -output myOutputDir \
       -mapper myPythonScript.py \
@@ -139,7 +139,7 @@ The above example specifies a user defined Python executable as the mapper. The
 
 In addition to executable files, you can also package other auxiliary files (such as dictionaries, configuration files, etc) that may be used by the mapper and/or the reducer. For example:
 
-    hadoop jar hadoop-streaming-${project.version}.jar \
+    mapred streaming \
       -input myInputDirs \
       -output myOutputDir \
       -mapper myPythonScript.py \
@@ -216,7 +216,7 @@ $H4 Specifying the Number of Reducers
 
 To specify the number of reducers, for example two, use:
 
-    hadoop jar hadoop-streaming-${project.version}.jar \
+    mapred streaming \
       -D mapreduce.job.reduces=2 \
       -input myInputDirs \
       -output myOutputDir \
@@ -229,7 +229,7 @@ As noted earlier, when the Map/Reduce framework reads a line from the stdout of
 
 However, you can customize this default. You can specify a field separator other than the tab character (the default), and you can specify the nth (n \>= 1) character rather than the first character in a line (the default) as the separator between the key and value. For example:
 
-    hadoop jar hadoop-streaming-${project.version}.jar \
+    mapred streaming \
       -D stream.map.output.field.separator=. \
       -D stream.num.map.output.key.fields=4 \
       -input myInputDirs \
@@ -279,7 +279,7 @@ User can specify a different symlink name for -archives using \#.
 
 In this example, the input.txt file has two lines specifying the names of the two files: cachedir.jar/cache.txt and cachedir.jar/cache2.txt. "cachedir.jar" is a symlink to the archived directory, which has the files "cache.txt" and "cache2.txt".
 
-    hadoop jar hadoop-streaming-${project.version}.jar \
+    mapred streaming \
                     -archives 'hdfs://hadoop-nn1.example.com/user/me/samples/cachefile/cachedir.jar' \
                     -D mapreduce.job.maps=1 \
                     -D mapreduce.job.reduces=1 \
@@ -325,7 +325,7 @@ $H3 Hadoop Partitioner Class
 
 Hadoop has a library class, [KeyFieldBasedPartitioner](../api/org/apache/hadoop/mapred/lib/KeyFieldBasedPartitioner.html), that is useful for many applications. This class allows the Map/Reduce framework to partition the map outputs based on certain key fields, not the whole keys. For example:
 
-    hadoop jar hadoop-streaming-${project.version}.jar \
+    mapred streaming \
       -D stream.map.output.field.separator=. \
       -D stream.num.map.output.key.fields=4 \
       -D map.output.key.field.separator=. \
@@ -375,7 +375,7 @@ $H3 Hadoop Comparator Class
 
 Hadoop has a library class, [KeyFieldBasedComparator](../api/org/apache/hadoop/mapreduce/lib/partition/KeyFieldBasedComparator.html), that is useful for many applications. This class provides a subset of features provided by the Unix/GNU Sort. For example:
 
-    hadoop jar hadoop-streaming-${project.version}.jar \
+    mapred streaming \
       -D mapreduce.job.output.key.comparator.class=org.apache.hadoop.mapreduce.lib.partition.KeyFieldBasedComparator \
       -D stream.map.output.field.separator=. \
       -D stream.num.map.output.key.fields=4 \
@@ -411,7 +411,7 @@ Hadoop has a library package called [Aggregate](../api/org/apache/hadoop/mapred/
 
 To use Aggregate, simply specify "-reducer aggregate":
 
-    hadoop jar hadoop-streaming-${project.version}.jar \
+    mapred streaming \
       -input myInputDirs \
       -output myOutputDir \
       -mapper myAggregatorForKeyCount.py \
@@ -444,7 +444,7 @@ $H3 Hadoop Field Selection Class
 
 Hadoop has a library class, [FieldSelectionMapReduce](../api/org/apache/hadoop/mapred/lib/FieldSelectionMapReduce.html), that effectively allows you to process text data like the unix "cut" utility. The map function defined in the class treats each input key/value pair as a list of fields. You can specify the field separator (the default is the tab character). You can select an arbitrary list of fields as the map output key, and an arbitrary list of fields as the map output value. Similarly, the reduce function defined in the class treats each input key/value pair as a list of fields. You can select an arbitrary list of fields as the reduce output key, and an arbitrary list of fields as the reduce output value. For example:
 
-    hadoop jar hadoop-streaming-${project.version}.jar \
+    mapred streaming \
       -D mapreduce.map.output.key.field.separator=. \
       -D mapreduce.partition.keypartitioner.options=-k1,2 \
       -D mapreduce.fieldsel.data.field.separator=. \
@@ -495,7 +495,7 @@ Using an alias will not work, but variable substitution is allowed as shown in t
     charlie 80
     dan     75
 
-    $ c2='cut -f2'; hadoop jar hadoop-streaming-${project.version}.jar \
+    $ c2='cut -f2'; mapred streaming \
       -D mapreduce.job.name='Experiment' \
       -input /user/me/samples/student_marks \
       -output /user/me/samples/student_out \
@@ -525,7 +525,7 @@ $H3 How do I specify multiple input directories?
 
 You can specify multiple input directories with multiple '-input' options:
 
-    hadoop jar hadoop-streaming-${project.version}.jar \
+    mapred streaming \
       -input '/user/foo/dir1' -input '/user/foo/dir2' \
         (rest of the command)
 
@@ -541,7 +541,7 @@ $H3 How do I parse XML documents using streaming?
 
 You can use the record reader StreamXmlRecordReader to process XML documents.
 
-    hadoop jar hadoop-streaming-${project.version}.jar \
+    mapred streaming \
       -inputreader "StreamXmlRecord,begin=BEGIN_STRING,end=END_STRING" \
         (rest of the command)
 


---------------------------------------------------------------------
To unsubscribe, e-mail: common-commits-unsubscr...@hadoop.apache.org
For additional commands, e-mail: common-commits-h...@hadoop.apache.org

Reply via email to