Author: drew
Date: Sat Jan 22 15:08:23 2011
New Revision: 1062169
URL: http://svn.apache.org/viewvc?rev=1062169&view=rev
Log:
MAHOUT-520: Add example scripts / integration tests for various algorithms.
Added:
mahout/trunk/examples/bin/build-20news-bayes.sh (with props)
mahout/trunk/examples/bin/build-cluster-syntheticcontrol.sh (with props)
Modified:
mahout/trunk/examples/bin/build-reuters.sh
Added: mahout/trunk/examples/bin/build-20news-bayes.sh
URL:
http://svn.apache.org/viewvc/mahout/trunk/examples/bin/build-20news-bayes.sh?rev=1062169&view=auto
==============================================================================
--- mahout/trunk/examples/bin/build-20news-bayes.sh (added)
+++ mahout/trunk/examples/bin/build-20news-bayes.sh Sat Jan 22 15:08:23 2011
@@ -0,0 +1,108 @@
+#!/bin/sh
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+#
+# Downloads the 20newsgroups dataset, trains and tests a bayes classifier.
+#
+# To run: change into the mahout directory and type:
+# examples/bin/build-20news-bayes.sh
+
+# Work from the directory holding this script so the relative work/ paths
+# below resolve the same way no matter where the script is launched from.
+SCRIPT_PATH=${0%/*}
+if [ "$0" != "$SCRIPT_PATH" ] && [ "$SCRIPT_PATH" != "" ]; then
+  cd "$SCRIPT_PATH" || exit 1
+fi
+
+# Download and unpack the corpus, skipping whatever an earlier run left behind.
+mkdir -p work
+if [ ! -e work/20news-bayesinput ]; then
+  if [ ! -e work/20news-bydate ]; then
+    if [ ! -f work/20news-bydate.tar.gz ]; then
+      echo "Downloading 20news-bydate"
+      curl http://people.csail.mit.edu/jrennie/20Newsgroups/20news-bydate.tar.gz \
+        -o work/20news-bydate.tar.gz
+    fi
+    mkdir -p work/20news-bydate
+    echo "Extracting..."
+    # Subshell: the working directory is restored even if tar fails.
+    (cd work/20news-bydate && tar xzf ../20news-bydate.tar.gz)
+  fi
+fi
+
+# Step up two levels; every path below is relative to that directory.
+cd ../..
+
+# From here on, stop at the first failing command.
+set -e
+
+# Run PrepareTwentyNewsgroups over the train and test splits; the results land
+# in bayes-train-input and bayes-test-input.
+./bin/mahout org.apache.mahout.classifier.bayes.PrepareTwentyNewsgroups \
+  -p examples/bin/work/20news-bydate/20news-bydate-train \
+  -o examples/bin/work/20news-bydate/bayes-train-input \
+  -a org.apache.mahout.vectorizer.DefaultAnalyzer \
+  -c UTF-8
+
+./bin/mahout org.apache.mahout.classifier.bayes.PrepareTwentyNewsgroups \
+  -p examples/bin/work/20news-bydate/20news-bydate-test \
+  -o examples/bin/work/20news-bydate/bayes-test-input \
+  -a org.apache.mahout.vectorizer.DefaultAnalyzer \
+  -c UTF-8
+
+# Test method passed to testclassifier; overridden below when HADOOP_HOME is set.
+TEST_METHOD="sequential"
+
+# if we're set up to run on a cluster..
+if [ "$HADOOP_HOME" != "" ]; then
+  # mapreduce test method used on hadoop
+  TEST_METHOD="mapreduce"
+
+  # Failures of the two removals are deliberately ignored (set +e).
+  set +e
+  hadoop dfs -rmr \
+    examples/bin/work/20news-bydate/bayes-train-input
+
+  hadoop dfs -rmr \
+    examples/bin/work/20news-bydate/bayes-test-input
+
+  # The uploads must succeed, so failures are fatal again.
+  set -e
+  hadoop dfs -put \
+    examples/bin/work/20news-bydate/bayes-train-input \
+    examples/bin/work/20news-bydate/bayes-train-input
+
+  hadoop dfs -put \
+    examples/bin/work/20news-bydate/bayes-test-input \
+    examples/bin/work/20news-bydate/bayes-test-input
+fi
+
+
+./bin/mahout trainclassifier \
+  -i examples/bin/work/20news-bydate/bayes-train-input \
+  -o examples/bin/work/20news-bydate/bayes-model \
+  -type bayes \
+  -ng 1 \
+  -source hdfs
+
+./bin/mahout testclassifier \
+  -m examples/bin/work/20news-bydate/bayes-model \
+  -d examples/bin/work/20news-bydate/bayes-test-input \
+  -type bayes \
+  -ng 1 \
+  -source hdfs \
+  -method "${TEST_METHOD}"
Propchange: mahout/trunk/examples/bin/build-20news-bayes.sh
------------------------------------------------------------------------------
svn:executable = *
Added: mahout/trunk/examples/bin/build-cluster-syntheticcontrol.sh
URL:
http://svn.apache.org/viewvc/mahout/trunk/examples/bin/build-cluster-syntheticcontrol.sh?rev=1062169&view=auto
==============================================================================
--- mahout/trunk/examples/bin/build-cluster-syntheticcontrol.sh (added)
+++ mahout/trunk/examples/bin/build-cluster-syntheticcontrol.sh Sat Jan 22 15:08:23 2011
@@ -0,0 +1,77 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+#
+# Downloads the Synthetic control dataset and prepares it for clustering
+#
+# To run: change into the mahout directory and type:
+# examples/bin/build-cluster-syntheticcontrol.sh
+#
+# Pass -ni as the first argument to skip the prompt and use canopy.
+# Needs bash: the menu below uses arrays and "read -p".
+
+if [ "$1" = "-ni" ]; then
+  clustertype=canopy
+else
+  algorithm=( canopy kmeans fuzzykmeans dirichlet meanshift )
+
+  echo "Please select a number to choose the corresponding clustering algorithm"
+  echo "1. ${algorithm[0]} clustering"
+  echo "2. ${algorithm[1]} clustering"
+  echo "3. ${algorithm[2]} clustering"
+  echo "4. ${algorithm[3]} clustering"
+  echo "5. ${algorithm[4]} clustering"
+  read -r -p "Enter your choice : " choice
+
+  # Reject anything outside the menu before it is used as an array index.
+  case "$choice" in
+    [1-5]) ;;
+    *) echo "Invalid choice: $choice" >&2; exit 1 ;;
+  esac
+
+  echo "ok. You chose $choice and we'll use ${algorithm[$choice-1]} Clustering"
+  clustertype=${algorithm[$choice-1]}
+fi
+
+cd examples/bin/ || exit 1
+mkdir -p work
+if [ ! -f work/synthetic_control.data ]; then
+  echo "Downloading Synthetic control data"
+  curl http://archive.ics.uci.edu/ml/databases/synthetic_control/synthetic_control.data \
+    -o work/synthetic_control.data
+fi
+
+# The clustering job is only launched when HADOOP_HOME is set.
+if [ "$HADOOP_HOME" != "" ]; then
+  echo "Checking the health of DFS..."
+  # A successful listing is treated as the DFS health check.
+  if "$HADOOP_HOME/bin/hadoop" fs -ls; then
+    echo "DFS is healthy... "
+    echo "Uploading Synthetic control data to HDFS"
+    "$HADOOP_HOME/bin/hadoop" fs -rmr testdata
+    "$HADOOP_HOME/bin/hadoop" fs -mkdir testdata
+    "$HADOOP_HOME/bin/hadoop" fs -put work/synthetic_control.data testdata
+    echo "Successfully Uploaded Synthetic control data to HDFS "
+
+    ../../bin/mahout org.apache.mahout.clustering.syntheticcontrol."${clustertype}".Job
+  else
+    echo " HADOOP is not running. Please make sure you hadoop is running. "
+  fi
+else
+  echo " HADOOP_HOME variable is not set. Please set this environment variable and rerun the script"
+fi
\ No newline at end of file
Propchange: mahout/trunk/examples/bin/build-cluster-syntheticcontrol.sh
------------------------------------------------------------------------------
svn:executable = *
Modified: mahout/trunk/examples/bin/build-reuters.sh
URL:
http://svn.apache.org/viewvc/mahout/trunk/examples/bin/build-reuters.sh?rev=1062169&r1=1062168&r2=1062169&view=diff
==============================================================================
--- mahout/trunk/examples/bin/build-reuters.sh (original)
+++ mahout/trunk/examples/bin/build-reuters.sh Sat Jan 22 15:08:23 2011
@@ -1,26 +1,43 @@
-#/**
-# * Licensed to the Apache Software Foundation (ASF) under one or more
-# * contributor license agreements. See the NOTICE file distributed with
-# * this work for additional information regarding copyright ownership.
-# * The ASF licenses this file to You under the Apache License, Version 2.0
-# * (the "License"); you may not use this file except in compliance with
-# * the License. You may obtain a copy of the License at
-# *
-# * http://www.apache.org/licenses/LICENSE-2.0
-# *
-# * Unless required by applicable law or agreed to in writing, software
-# * distributed under the License is distributed on an "AS IS" BASIS,
-# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# * See the License for the specific language governing permissions and
-# * limitations under the License.
-# */
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
#
# Downloads the Reuters dataset and prepares it for clustering
#
# To run: change into the mahout directory and type:
# examples/bin/build-reuters.sh
-#!/bin/sh
+#
+# Pass -ni as the first argument to skip the prompt and use kmeans.
+# Needs bash: the menu below uses arrays and "read -p".
+
+if [ "$1" = "-ni" ]; then
+  clustertype=kmeans
+else
+  algorithm=( kmeans lda )
+
+  echo "Please select a number to choose the corresponding clustering algorithm"
+  echo "1. ${algorithm[0]} clustering"
+  echo "2. ${algorithm[1]} clustering"
+  read -r -p "Enter your choice : " choice
+
+  echo "ok. You chose $choice and we'll use ${algorithm[$choice-1]} Clustering"
+  clustertype=${algorithm[$choice-1]}
+fi
cd examples/bin/
mkdir -p work
@@ -35,18 +49,52 @@ if [ ! -e work/reuters-out ]; then
cd work/reuters-sgm && tar xzf ../reuters21578.tar.gz && cd .. && cd ..
fi
fi
-
cd ../..
-./bin/mahout org.apache.lucene.benchmark.utils.ExtractReuters ./examples/bin/work/reuters-sgm/ ./examples/bin/work/reuters-out/
-./bin/mahout seqdirectory -i ./examples/bin/work/reuters-out/ -o ./examples/bin/work/reuters-out-seqdir -c UTF-8 -chunk 5
-# to use k-Means clustering, uncomment the next three lines
-./bin/mahout seq2sparse -i ./examples/bin/work/reuters-out-seqdir/ -o ./examples/bin/work/reuters-out-seqdir-sparse
-./bin/mahout kmeans -i ./examples/bin/work/reuters-out-seqdir-sparse/tfidf-vectors/ -c ./examples/bin/work/clusters -o ./examples/bin/work/reuters-kmeans -x 10 -k 20 -ow
-./bin/mahout clusterdump -s examples/bin/work/reuters-kmeans/clusters-10 -d examples/bin/work/reuters-out-seqdir-sparse/dictionary.file-0 -dt sequencefile -b 100 -n 20
-
-# to use LDA clustering, uncomment the next three lines
-#./bin/mahout seq2sparse -i ./examples/bin/work/reuters-out-seqdir/ -o ./examples/bin/work/reuters-out-seqdir-sparse -wt tf -seq -nr 3
-#./bin/mahout lda -i ./examples/bin/work/reuters-out-seqdir-sparse/tf-vectors -o ./examples/bin/work/reuters-lda -k 20 -v 50000 -ow -x 20
-#./bin/mahout ldatopics -i ./examples/bin/work/reuters-lda/state-20 -d ./examples/bin/work/reuters-out-seqdir-sparse/dictionary.file-0 -dt sequencefile
+# Run ExtractReuters, then seqdirectory on its output; the second step is
+# skipped if the first fails.
+./bin/mahout org.apache.lucene.benchmark.utils.ExtractReuters \
+  ./examples/bin/work/reuters-sgm/ \
+  ./examples/bin/work/reuters-out/ \
+&& \
+./bin/mahout seqdirectory \
+  -i ./examples/bin/work/reuters-out/ \
+  -o ./examples/bin/work/reuters-out-seqdir \
+  -c UTF-8 -chunk 5
+
+# Each branch chains its steps with && so a failure stops that branch.
+# "=" is used for the comparison because "==" is not portable across sh
+# implementations.
+if [ "x$clustertype" = "xkmeans" ]; then
+  ./bin/mahout seq2sparse \
+    -i ./examples/bin/work/reuters-out-seqdir/ \
+    -o ./examples/bin/work/reuters-out-seqdir-sparse \
+  && \
+  ./bin/mahout kmeans \
+    -i ./examples/bin/work/reuters-out-seqdir-sparse/tfidf-vectors/ \
+    -c ./examples/bin/work/clusters \
+    -o ./examples/bin/work/reuters-kmeans \
+    -x 10 -k 20 -ow \
+  && \
+  ./bin/mahout clusterdump \
+    -s examples/bin/work/reuters-kmeans/clusters-10 \
+    -d examples/bin/work/reuters-out-seqdir-sparse/dictionary.file-0 \
+    -dt sequencefile -b 100 -n 20
+elif [ "x$clustertype" = "xlda" ]; then
+  ./bin/mahout seq2sparse \
+    -i ./examples/bin/work/reuters-out-seqdir/ \
+    -o ./examples/bin/work/reuters-out-seqdir-sparse \
+    -wt tf -seq -nr 3 \
+  && \
+  ./bin/mahout lda \
+    -i ./examples/bin/work/reuters-out-seqdir-sparse/tf-vectors \
+    -o ./examples/bin/work/reuters-lda -k 20 -v 50000 -ow -x 20 \
+  && \
+  ./bin/mahout ldatopics \
+    -i ./examples/bin/work/reuters-lda/state-20 \
+    -d ./examples/bin/work/reuters-out-seqdir-sparse/dictionary.file-0 \
+    -dt sequencefile
+else
+  echo "unknown cluster type: $clustertype";
+fi