Author: srowen
Date: Wed Sep 14 16:12:53 2011
New Revision: 1170702
URL: http://svn.apache.org/viewvc?rev=1170702&view=rev
Log:
MAHOUT-811 move work dir to /tmp
Modified:
mahout/trunk/examples/bin/build-20news-bayes.sh
mahout/trunk/examples/bin/build-cluster-syntheticcontrol.sh
mahout/trunk/examples/bin/build-reuters.sh
mahout/trunk/examples/bin/factorize-movielens-1M.sh
Modified: mahout/trunk/examples/bin/build-20news-bayes.sh
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/bin/build-20news-bayes.sh?rev=1170702&r1=1170701&r2=1170702&view=diff
==============================================================================
--- mahout/trunk/examples/bin/build-20news-bayes.sh (original)
+++ mahout/trunk/examples/bin/build-20news-bayes.sh Wed Sep 14 16:12:53 2011
@@ -27,16 +27,20 @@ if [ "$0" != "$SCRIPT_PATH" ] && [ "$SCR
cd $SCRIPT_PATH
fi
-mkdir -p work
-if [ ! -e work/20news-bayesinput ]; then
- if [ ! -e work/20news-bydate ]; then
- if [ ! -f work/20news-bydate.tar.gz ]; then
+WORK_DIR=/tmp/mahout-work-${USER}
+
+echo "creating work directory at ${WORK_DIR}"
+
+mkdir -p ${WORK_DIR}
+if [ ! -e ${WORK_DIR}/20news-bayesinput ]; then
+ if [ ! -e ${WORK_DIR}/20news-bydate ]; then
+ if [ ! -f ${WORK_DIR}/20news-bydate.tar.gz ]; then
echo "Downloading 20news-bydate"
- curl http://people.csail.mit.edu/jrennie/20Newsgroups/20news-bydate.tar.gz -o work/20news-bydate.tar.gz
+ curl http://people.csail.mit.edu/jrennie/20Newsgroups/20news-bydate.tar.gz -o ${WORK_DIR}/20news-bydate.tar.gz
fi
- mkdir -p work/20news-bydate
+ mkdir -p ${WORK_DIR}/20news-bydate
echo "Extracting..."
- cd work/20news-bydate && tar xzf ../20news-bydate.tar.gz && cd .. && cd ..
+ cd ${WORK_DIR}/20news-bydate && tar xzf ../20news-bydate.tar.gz && cd .. && cd ..
fi
fi
@@ -45,14 +49,14 @@ cd ../..
set -e
./bin/mahout org.apache.mahout.classifier.bayes.PrepareTwentyNewsgroups \
- -p examples/bin/work/20news-bydate/20news-bydate-train \
- -o examples/bin/work/20news-bydate/bayes-train-input \
+ -p ${WORK_DIR}/20news-bydate/20news-bydate-train \
+ -o ${WORK_DIR}/20news-bydate/bayes-train-input \
-a org.apache.mahout.vectorizer.DefaultAnalyzer \
-c UTF-8
./bin/mahout org.apache.mahout.classifier.bayes.PrepareTwentyNewsgroups \
- -p examples/bin/work/20news-bydate/20news-bydate-test \
- -o examples/bin/work/20news-bydate/bayes-test-input \
+ -p ${WORK_DIR}/20news-bydate/20news-bydate-test \
+ -o ${WORK_DIR}/20news-bydate/bayes-test-input \
-a org.apache.mahout.vectorizer.DefaultAnalyzer \
-c UTF-8
@@ -65,33 +69,36 @@ if [ "$HADOOP_HOME" != "" ]; then
set +e
hadoop dfs -rmr \
- examples/bin/work/20news-bydate/bayes-train-input
+ ${WORK_DIR}/20news-bydate/bayes-train-input
hadoop dfs -rmr \
- examples/bin/work/20news-bydate/bayes-test-input
+ ${WORK_DIR}/20news-bydate/bayes-test-input
set -e
hadoop dfs -put \
- examples/bin/work/20news-bydate/bayes-train-input \
- examples/bin/work/20news-bydate/bayes-train-input
+ ${WORK_DIR}/20news-bydate/bayes-train-input \
+ ${WORK_DIR}/20news-bydate/bayes-train-input
hadoop dfs -put \
- examples/bin/work/20news-bydate/bayes-test-input \
- examples/bin/work/20news-bydate/bayes-test-input
+ ${WORK_DIR}/20news-bydate/bayes-test-input \
+ ${WORK_DIR}/20news-bydate/bayes-test-input
fi
./bin/mahout trainclassifier \
- -i examples/bin/work/20news-bydate/bayes-train-input \
- -o examples/bin/work/20news-bydate/bayes-model \
+ -i ${WORK_DIR}/20news-bydate/bayes-train-input \
+ -o ${WORK_DIR}/20news-bydate/bayes-model \
-type bayes \
-ng 1 \
-source hdfs
./bin/mahout testclassifier \
- -m examples/bin/work/20news-bydate/bayes-model \
- -d examples/bin/work/20news-bydate/bayes-test-input \
+ -m ${WORK_DIR}/20news-bydate/bayes-model \
+ -d ${WORK_DIR}/20news-bydate/bayes-test-input \
-type bayes \
-ng 1 \
-source hdfs \
-method ${TEST_METHOD}
+
+# Remove the work directory
+rm -rf ${WORK_DIR}
Modified: mahout/trunk/examples/bin/build-cluster-syntheticcontrol.sh
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/bin/build-cluster-syntheticcontrol.sh?rev=1170702&r1=1170701&r2=1170702&view=diff
==============================================================================
--- mahout/trunk/examples/bin/build-cluster-syntheticcontrol.sh (original)
+++ mahout/trunk/examples/bin/build-cluster-syntheticcontrol.sh Wed Sep 14 16:12:53 2011
@@ -40,10 +40,14 @@ else
fi
cd examples/bin/
-mkdir -p work
-if [ ! -f work/synthetic_control.data ]; then
+
+WORK_DIR=/tmp/mahout-work-${USER}
+
+echo "creating work directory at ${WORK_DIR}"
+mkdir -p ${WORK_DIR}
+if [ ! -f ${WORK_DIR}/synthetic_control.data ]; then
echo "Downloading Synthetic control data"
- curl http://archive.ics.uci.edu/ml/databases/synthetic_control/synthetic_control.data -o work/synthetic_control.data
+ curl http://archive.ics.uci.edu/ml/databases/synthetic_control/synthetic_control.data -o ${WORK_DIR}/synthetic_control.data
fi
if [ "$HADOOP_HOME" != "" ]; then
@@ -54,7 +58,7 @@ if [ "$HADOOP_HOME" != "" ]; then
echo "Uploading Synthetic control data to HDFS"
$HADOOP_HOME/bin/hadoop fs -rmr testdata
$HADOOP_HOME/bin/hadoop fs -mkdir testdata
- $HADOOP_HOME/bin/hadoop fs -put work/synthetic_control.data testdata
+ $HADOOP_HOME/bin/hadoop fs -put ${WORK_DIR}/synthetic_control.data testdata
echo "Successfully Uploaded Synthetic control data to HDFS "
../../bin/mahout org.apache.mahout.clustering.syntheticcontrol."${clustertype}".Job
@@ -63,4 +67,7 @@ if [ "$HADOOP_HOME" != "" ]; then
fi
else
echo " HADOOP_HOME variable is not set. Please set this environment variable and rerun the script"
-fi
\ No newline at end of file
+fi
+
+# Remove the work directory
+rm -rf ${WORK_DIR}
Modified: mahout/trunk/examples/bin/build-reuters.sh
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/bin/build-reuters.sh?rev=1170702&r1=1170701&r2=1170702&view=diff
==============================================================================
--- mahout/trunk/examples/bin/build-reuters.sh (original)
+++ mahout/trunk/examples/bin/build-reuters.sh Wed Sep 14 16:12:53 2011
@@ -48,29 +48,32 @@ else
clustertype=${algorithm[$choice-1]}
fi
-mkdir -p mahout-work
+WORK_DIR=/tmp/mahout-work-${USER}
+echo "creating work directory at ${WORK_DIR}"
-if [ ! -e mahout-work/reuters-out-seqdir ]; then
- if [ ! -e mahout-work/reuters-out ]; then
- if [ ! -e mahout-work/reuters-sgm ]; then
- if [ ! -f mahout-work/reuters21578.tar.gz ]; then
+mkdir -p ${WORK_DIR}
+
+if [ ! -e ${WORK_DIR}/reuters-out-seqdir ]; then
+ if [ ! -e ${WORK_DIR}/reuters-out ]; then
+ if [ ! -e ${WORK_DIR}/reuters-sgm ]; then
+ if [ ! -f ${WORK_DIR}/reuters21578.tar.gz ]; then
echo "Downloading Reuters-21578"
curl http://kdd.ics.uci.edu/databases/reuters21578/reuters21578.tar.gz \
- -o mahout-work/reuters21578.tar.gz
+ -o ${WORK_DIR}/reuters21578.tar.gz
fi
- mkdir -p mahout-work/reuters-sgm
+ mkdir -p ${WORK_DIR}/reuters-sgm
echo "Extracting..."
- cd mahout-work/reuters-sgm && tar xzf ../reuters21578.tar.gz && cd .. && cd ..
+ cd ${WORK_DIR}/reuters-sgm && tar xzf ../reuters21578.tar.gz && cd .. && cd ..
fi
$MAHOUT org.apache.lucene.benchmark.utils.ExtractReuters \
- mahout-work/reuters-sgm \
- mahout-work/reuters-out
+ ${WORK_DIR}/reuters-sgm \
+ ${WORK_DIR}/reuters-out
fi
MAHOUT_LOCAL=true $MAHOUT seqdirectory \
- -i mahout-work/reuters-out \
- -o mahout-work/reuters-out-seqdir \
+ -i ${WORK_DIR}/reuters-out \
+ -o ${WORK_DIR}/reuters-out-seqdir \
-c UTF-8 -chunk 5
fi
@@ -86,42 +89,45 @@ if [ "$HADOOP_HOME" != "" ] && [ "$MAHOU
set +e
$HADOOP dfs -rmr \
- mahout-work/reuters-out-seqdir
+ ${WORK_DIR}/reuters-out-seqdir
set -e
$HADOOP dfs -put \
- mahout-work/reuters-out-seqdir \
- mahout-work/reuters-out-seqdir
+ ${WORK_DIR}/reuters-out-seqdir \
+ ${WORK_DIR}/reuters-out-seqdir
fi
if [ "x$clustertype" == "xkmeans" ]; then
$MAHOUT seq2sparse \
- -i mahout-work/reuters-out-seqdir/ \
- -o mahout-work/reuters-out-seqdir-sparse-kmeans \
+ -i ${WORK_DIR}/reuters-out-seqdir/ \
+ -o ${WORK_DIR}/reuters-out-seqdir-sparse-kmeans \
&& \
$MAHOUT kmeans \
- -i mahout-work/reuters-out-seqdir-sparse-kmeans/tfidf-vectors/ \
- -c mahout-work/reuters-kmeans-clusters \
- -o mahout-work/reuters-kmeans \
+ -i ${WORK_DIR}/reuters-out-seqdir-sparse-kmeans/tfidf-vectors/ \
+ -c ${WORK_DIR}/reuters-kmeans-clusters \
+ -o ${WORK_DIR}/reuters-kmeans \
-x 10 -k 20 -ow \
&& \
$MAHOUT clusterdump \
- -s mahout-work/reuters-kmeans/clusters-10 \
- -d mahout-work/reuters-out-seqdir-sparse-kmeans/dictionary.file-0 \
+ -s ${WORK_DIR}/reuters-kmeans/clusters-10 \
+ -d ${WORK_DIR}/reuters-out-seqdir-sparse-kmeans/dictionary.file-0 \
-dt sequencefile -b 100 -n 20
elif [ "x$clustertype" == "xlda" ]; then
$MAHOUT seq2sparse \
- -i mahout-work/reuters-out-seqdir/ \
- -o mahout-work/reuters-out-seqdir-sparse-lda \
+ -i ${WORK_DIR}/reuters-out-seqdir/ \
+ -o ${WORK_DIR}/reuters-out-seqdir-sparse-lda \
-wt tf -seq -nr 3 \
&& \
$MAHOUT lda \
- -i mahout-work/reuters-out-seqdir-sparse-lda/tf-vectors \
- -o mahout-work/reuters-lda -k 20 -v 50000 -ow -x 20 \
+ -i ${WORK_DIR}/reuters-out-seqdir-sparse-lda/tf-vectors \
+ -o ${WORK_DIR}/reuters-lda -k 20 -v 50000 -ow -x 20 \
&& \
$MAHOUT ldatopics \
- -i mahout-work/reuters-lda/state-20 \
- -d mahout-work/reuters-out-seqdir-sparse-lda/dictionary.file-0 \
+ -i ${WORK_DIR}/reuters-lda/state-20 \
+ -d ${WORK_DIR}/reuters-out-seqdir-sparse-lda/dictionary.file-0 \
-dt sequencefile
else
echo "unknown cluster type: $clustertype";
fi
+
+# Remove the work directory
+rm -rf ${WORK_DIR}
Modified: mahout/trunk/examples/bin/factorize-movielens-1M.sh
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/bin/factorize-movielens-1M.sh?rev=1170702&r1=1170701&r2=1170702&view=diff
==============================================================================
--- mahout/trunk/examples/bin/factorize-movielens-1M.sh (original)
+++ mahout/trunk/examples/bin/factorize-movielens-1M.sh Wed Sep 14 16:12:53 2011
@@ -32,28 +32,29 @@ then
exit -1
fi
-echo "creating work directory"
-mkdir -p work/movielens
+WORK_DIR=/tmp/mahout-work-${USER}
+echo "creating work directory at ${WORK_DIR}"
+mkdir -p ${WORK_DIR}/movielens
echo "Converting ratings..."
-cat $1 |sed -e s/::/,/g| cut -d, -f1,2,3 > work/movielens/ratings.csv
+cat $1 |sed -e s/::/,/g| cut -d, -f1,2,3 > ${WORK_DIR}/movielens/ratings.csv
#create a 90% percent training set and a 10% probe set
-bin/mahout splitDataset --input work/movielens/ratings.csv --output work/dataset \
- --trainingPercentage 0.9 --probePercentage 0.1 --tempDir work/dataset/tmp
+bin/mahout splitDataset --input ${WORK_DIR}/movielens/ratings.csv --output ${WORK_DIR}/dataset \
+ --trainingPercentage 0.9 --probePercentage 0.1 --tempDir ${WORK_DIR}/dataset/tmp
#run distributed ALS-WR to factorize the rating matrix based on the training set
-bin/mahout parallelALS --input work/dataset/trainingSet/ --output work/als/out \
- --tempDir work/als/tmp --numFeatures 20 --numIterations 10 --lambda 0.065
+bin/mahout parallelALS --input ${WORK_DIR}/dataset/trainingSet/ --output ${WORK_DIR}/als/out \
+ --tempDir ${WORK_DIR}/als/tmp --numFeatures 20 --numIterations 10 --lambda 0.065
# compute predictions against the probe set, measure the error
-bin/mahout evaluateFactorizationParallel --output work/als/rmse --pairs work/dataset/probeSet/ \
- --userFeatures work/als/out/U/ --itemFeatures work/als/out/M/
+bin/mahout evaluateFactorizationParallel --output ${WORK_DIR}/als/rmse --pairs ${WORK_DIR}/dataset/probeSet/ \
+ --userFeatures ${WORK_DIR}/als/out/U/ --itemFeatures ${WORK_DIR}/als/out/M/
# print the error
echo -e "\nRMSE is:\n"
-cat work/als/rmse/rmse.txt
+cat ${WORK_DIR}/als/rmse/rmse.txt
echo -e "\n\n"
echo "removing work directory"
-rm -rf work
\ No newline at end of file
+rm -rf ${WORK_DIR}
\ No newline at end of file