Author: drew
Date: Sat Jan 22 15:08:23 2011
New Revision: 1062169

URL: http://svn.apache.org/viewvc?rev=1062169&view=rev
Log:
MAHOUT-520: Add example scripts / integration tests for various algorithms.

Added:
    mahout/trunk/examples/bin/build-20news-bayes.sh   (with props)
    mahout/trunk/examples/bin/build-cluster-syntheticcontrol.sh   (with props)
Modified:
    mahout/trunk/examples/bin/build-reuters.sh

Added: mahout/trunk/examples/bin/build-20news-bayes.sh
URL: 
http://svn.apache.org/viewvc/mahout/trunk/examples/bin/build-20news-bayes.sh?rev=1062169&view=auto
==============================================================================
--- mahout/trunk/examples/bin/build-20news-bayes.sh (added)
+++ mahout/trunk/examples/bin/build-20news-bayes.sh Sat Jan 22 15:08:23 2011
@@ -0,0 +1,110 @@
+#!/bin/sh
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+#
+# Downloads the 20newsgroups dataset, trains and tests a bayes classifier.
+#
+# To run:  change into the mahout directory and type:
+#  examples/bin/build-20news-bayes.sh
+#
+# When HADOOP_HOME is set, the prepared input is also copied to the DFS and
+# the classifier is tested with the mapreduce method; otherwise the
+# sequential method is used.
+
+# Work from the directory holding this script when it was invoked via a path.
+SCRIPT_PATH=${0%/*}
+if [ "$0" != "$SCRIPT_PATH" ] && [ "$SCRIPT_PATH" != "" ]; then
+  cd "$SCRIPT_PATH" || exit 1
+fi
+
+# Download and unpack the dataset unless it is already present under work/.
+mkdir -p work
+if [ ! -e work/20news-bayesinput ]; then
+  if [ ! -e work/20news-bydate ]; then
+    if [ ! -f work/20news-bydate.tar.gz ]; then
+      echo "Downloading 20news-bydate"
+      curl http://people.csail.mit.edu/jrennie/20Newsgroups/20news-bydate.tar.gz \
+        -o work/20news-bydate.tar.gz || exit 1
+    fi
+    mkdir -p work/20news-bydate
+    echo "Extracting..."
+    # Extract in a subshell so that a failed tar cannot leave the script in
+    # the wrong directory for the 'cd ../..' below.
+    (cd work/20news-bydate && tar xzf ../20news-bydate.tar.gz) || exit 1
+  fi
+fi
+
+# Step up two directory levels; the commands below use paths relative to there.
+cd ../.. || exit 1
+
+# From here on, stop at the first failing command.
+set -e
+
+./bin/mahout org.apache.mahout.classifier.bayes.PrepareTwentyNewsgroups \
+  -p examples/bin/work/20news-bydate/20news-bydate-train \
+  -o examples/bin/work/20news-bydate/bayes-train-input \
+  -a org.apache.mahout.vectorizer.DefaultAnalyzer \
+  -c UTF-8
+
+./bin/mahout org.apache.mahout.classifier.bayes.PrepareTwentyNewsgroups \
+  -p examples/bin/work/20news-bydate/20news-bydate-test \
+  -o examples/bin/work/20news-bydate/bayes-test-input \
+  -a org.apache.mahout.vectorizer.DefaultAnalyzer \
+  -c UTF-8
+
+TEST_METHOD="sequential"
+
+# if we're set up to run on a cluster..
+if [ "$HADOOP_HOME" != "" ]; then
+    # mapreduce test method used on hadoop
+    TEST_METHOD="mapreduce"
+
+    # Failures of the two removals are tolerated (presumably the paths may
+    # not exist yet on the DFS).
+    set +e
+    "$HADOOP_HOME/bin/hadoop" dfs -rmr \
+      examples/bin/work/20news-bydate/bayes-train-input
+
+    "$HADOOP_HOME/bin/hadoop" dfs -rmr \
+      examples/bin/work/20news-bydate/bayes-test-input
+
+    set -e
+    "$HADOOP_HOME/bin/hadoop" dfs -put \
+      examples/bin/work/20news-bydate/bayes-train-input \
+      examples/bin/work/20news-bydate/bayes-train-input
+
+    "$HADOOP_HOME/bin/hadoop" dfs -put \
+      examples/bin/work/20news-bydate/bayes-test-input \
+      examples/bin/work/20news-bydate/bayes-test-input
+fi
+
+
+./bin/mahout trainclassifier \
+  -i examples/bin/work/20news-bydate/bayes-train-input \
+  -o examples/bin/work/20news-bydate/bayes-model \
+  -type bayes \
+  -ng 1 \
+  -source hdfs
+
+./bin/mahout testclassifier \
+  -m examples/bin/work/20news-bydate/bayes-model \
+  -d examples/bin/work/20news-bydate/bayes-test-input \
+  -type bayes \
+  -ng 1 \
+  -source hdfs \
+  -method "${TEST_METHOD}"

Propchange: mahout/trunk/examples/bin/build-20news-bayes.sh
------------------------------------------------------------------------------
    svn:executable = *

Added: mahout/trunk/examples/bin/build-cluster-syntheticcontrol.sh
URL: 
http://svn.apache.org/viewvc/mahout/trunk/examples/bin/build-cluster-syntheticcontrol.sh?rev=1062169&view=auto
==============================================================================
--- mahout/trunk/examples/bin/build-cluster-syntheticcontrol.sh (added)
+++ mahout/trunk/examples/bin/build-cluster-syntheticcontrol.sh Sat Jan 22 15:08:23 2011
@@ -0,0 +1,82 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+#
+# Downloads the Synthetic control dataset and prepares it for clustering
+#
+# To run:  change into the mahout directory and type:
+#  examples/bin/build-cluster-syntheticcontrol.sh
+#
+# Pass -ni as the first argument to skip the menu and use canopy clustering.
+# Needs bash (arrays, read -p) and HADOOP_HOME set.
+
+if [ "$1" = "-ni" ]; then
+  clustertype=canopy
+else
+  algorithm=( canopy kmeans fuzzykmeans dirichlet meanshift )
+
+  echo "Please select a number to choose the corresponding clustering algorithm"
+  echo "1. ${algorithm[0]} clustering"
+  echo "2. ${algorithm[1]} clustering"
+  echo "3. ${algorithm[2]} clustering"
+  echo "4. ${algorithm[3]} clustering"
+  echo "5. ${algorithm[4]} clustering"
+  read -r -p "Enter your choice : " choice
+
+  # Reject anything that is not a listed menu number before indexing the array.
+  case "$choice" in
+    [1-5]) ;;
+    *)
+      echo "Invalid choice: $choice" >&2
+      exit 1
+      ;;
+  esac
+
+  echo "ok. You chose $choice and we'll use ${algorithm[$choice-1]} Clustering"
+  clustertype=${algorithm[$choice-1]}
+fi
+
+cd examples/bin/ || exit 1
+mkdir -p work
+if [ ! -f work/synthetic_control.data ]; then
+  echo "Downloading Synthetic control data"
+  curl http://archive.ics.uci.edu/ml/databases/synthetic_control/synthetic_control.data \
+    -o work/synthetic_control.data || exit 1
+fi
+
+if [ "$HADOOP_HOME" != "" ]; then
+  echo "Checking the health of DFS..."
+  # A successful listing is treated as the DFS being usable.
+  if "$HADOOP_HOME/bin/hadoop" fs -ls; then
+    echo "DFS is healthy... "
+    echo "Uploading Synthetic control data to HDFS"
+    # The result of the removal is ignored; mkdir and put must succeed.
+    "$HADOOP_HOME/bin/hadoop" fs -rmr testdata
+    "$HADOOP_HOME/bin/hadoop" fs -mkdir testdata || exit 1
+    "$HADOOP_HOME/bin/hadoop" fs -put work/synthetic_control.data testdata || exit 1
+    echo "Successfully Uploaded Synthetic control data to HDFS "
+
+    ../../bin/mahout org.apache.mahout.clustering.syntheticcontrol."${clustertype}".Job
+  else
+    echo " HADOOP is not running. Please make sure you hadoop is running. "
+    exit 1
+  fi
+else
+  echo " HADOOP_HOME variable is not set. Please set this environment variable and rerun the script"
+  exit 1
+fi
\ No newline at end of file

Propchange: mahout/trunk/examples/bin/build-cluster-syntheticcontrol.sh
------------------------------------------------------------------------------
    svn:executable = *

Modified: mahout/trunk/examples/bin/build-reuters.sh
URL: 
http://svn.apache.org/viewvc/mahout/trunk/examples/bin/build-reuters.sh?rev=1062169&r1=1062168&r2=1062169&view=diff
==============================================================================
--- mahout/trunk/examples/bin/build-reuters.sh (original)
+++ mahout/trunk/examples/bin/build-reuters.sh Sat Jan 22 15:08:23 2011
@@ -1,26 +1,51 @@
-#/**
-# * Licensed to the Apache Software Foundation (ASF) under one or more
-# * contributor license agreements.  See the NOTICE file distributed with
-# * this work for additional information regarding copyright ownership.
-# * The ASF licenses this file to You under the Apache License, Version 2.0
-# * (the "License"); you may not use this file except in compliance with
-# * the License.  You may obtain a copy of the License at
-# *
-# *     http://www.apache.org/licenses/LICENSE-2.0
-# *
-# * Unless required by applicable law or agreed to in writing, software
-# * distributed under the License is distributed on an "AS IS" BASIS,
-# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# * See the License for the specific language governing permissions and
-# * limitations under the License.
-# */
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
 
 #
 # Downloads the Reuters dataset and prepares it for clustering
 #
 # To run:  change into the mahout directory and type:
 #  examples/bin/build-reuters.sh
-#!/bin/sh
+#
+# Pass -ni as the first argument to skip the menu and use kmeans clustering.
+
+if [ "$1" = "-ni" ]; then
+  clustertype=kmeans
+else
+  algorithm=( kmeans lda )
+
+  echo "Please select a number to choose the corresponding clustering algorithm"
+  echo "1. ${algorithm[0]} clustering"
+  echo "2. ${algorithm[1]} clustering"
+  read -r -p "Enter your choice : " choice
+
+  # Reject anything that is not a listed menu number before indexing the array.
+  case "$choice" in
+    [1-2]) ;;
+    *)
+      echo "Invalid choice: $choice" >&2
+      exit 1
+      ;;
+  esac
+
+  echo "ok. You chose $choice and we'll use ${algorithm[$choice-1]} Clustering"
+  clustertype=${algorithm[$choice-1]}
+fi
 
 cd examples/bin/
 mkdir -p work
@@ -35,18 +60,50 @@ if [ ! -e work/reuters-out ]; then
     cd work/reuters-sgm && tar xzf ../reuters21578.tar.gz && cd .. && cd ..
   fi
 fi
-
 cd ../..
-./bin/mahout org.apache.lucene.benchmark.utils.ExtractReuters ./examples/bin/work/reuters-sgm/ ./examples/bin/work/reuters-out/
-./bin/mahout seqdirectory -i ./examples/bin/work/reuters-out/ -o ./examples/bin/work/reuters-out-seqdir -c UTF-8 -chunk 5
 
-# to use k-Means clustering, uncomment the next three lines
-./bin/mahout seq2sparse -i ./examples/bin/work/reuters-out-seqdir/ -o ./examples/bin/work/reuters-out-seqdir-sparse
-./bin/mahout kmeans -i ./examples/bin/work/reuters-out-seqdir-sparse/tfidf-vectors/ -c ./examples/bin/work/clusters -o ./examples/bin/work/reuters-kmeans -x 10 -k 20 -ow
-./bin/mahout clusterdump -s examples/bin/work/reuters-kmeans/clusters-10 -d examples/bin/work/reuters-out-seqdir-sparse/dictionary.file-0 -dt sequencefile -b 100 -n 20
-
-# to use LDA clustering, uncomment the next three lines
-#./bin/mahout seq2sparse -i ./examples/bin/work/reuters-out-seqdir/ -o ./examples/bin/work/reuters-out-seqdir-sparse -wt tf -seq -nr 3
-#./bin/mahout lda -i ./examples/bin/work/reuters-out-seqdir-sparse/tf-vectors -o ./examples/bin/work/reuters-lda -k 20 -v 50000 -ow -x 20
-#./bin/mahout ldatopics -i ./examples/bin/work/reuters-lda/state-20 -d ./examples/bin/work/reuters-out-seqdir-sparse/dictionary.file-0 -dt sequencefile
+# Run ExtractReuters, then seqdirectory on its output; stop if either fails.
+./bin/mahout org.apache.lucene.benchmark.utils.ExtractReuters \
+  ./examples/bin/work/reuters-sgm/ \
+  ./examples/bin/work/reuters-out/ \
+&& \
+./bin/mahout seqdirectory \
+  -i ./examples/bin/work/reuters-out/ \
+  -o ./examples/bin/work/reuters-out-seqdir \
+  -c UTF-8 -chunk 5 || exit 1
+
+# Run only the steps for the selected cluster type.
+if [ "x$clustertype" = "xkmeans" ]; then
+  ./bin/mahout seq2sparse \
+    -i ./examples/bin/work/reuters-out-seqdir/ \
+    -o ./examples/bin/work/reuters-out-seqdir-sparse \
+  && \
+  ./bin/mahout kmeans \
+    -i ./examples/bin/work/reuters-out-seqdir-sparse/tfidf-vectors/ \
+    -c ./examples/bin/work/clusters \
+    -o ./examples/bin/work/reuters-kmeans \
+    -x 10 -k 20 -ow \
+  && \
+  ./bin/mahout clusterdump \
+    -s examples/bin/work/reuters-kmeans/clusters-10 \
+    -d examples/bin/work/reuters-out-seqdir-sparse/dictionary.file-0 \
+    -dt sequencefile -b 100 -n 20
+elif [ "x$clustertype" = "xlda" ]; then
+  ./bin/mahout seq2sparse \
+    -i ./examples/bin/work/reuters-out-seqdir/ \
+    -o ./examples/bin/work/reuters-out-seqdir-sparse \
+    -wt tf -seq -nr 3 \
+  && \
+  ./bin/mahout lda \
+    -i ./examples/bin/work/reuters-out-seqdir-sparse/tf-vectors \
+    -o ./examples/bin/work/reuters-lda -k 20 -v 50000 -ow -x 20 \
+  && \
+  ./bin/mahout ldatopics \
+    -i ./examples/bin/work/reuters-lda/state-20 \
+    -d ./examples/bin/work/reuters-out-seqdir-sparse/dictionary.file-0 \
+    -dt sequencefile
+else
+  echo "unknown cluster type: $clustertype" >&2
+  exit 1
+fi
 


Reply via email to