http://git-wip-us.apache.org/repos/asf/mahout/blob/f7b69fab/samsara/src/test/scala/org/apache/mahout/math/scalabindings/RLikeMatrixOpsSuite.scala ---------------------------------------------------------------------- diff --git a/samsara/src/test/scala/org/apache/mahout/math/scalabindings/RLikeMatrixOpsSuite.scala b/samsara/src/test/scala/org/apache/mahout/math/scalabindings/RLikeMatrixOpsSuite.scala new file mode 100644 index 0000000..a943c5f --- /dev/null +++ b/samsara/src/test/scala/org/apache/mahout/math/scalabindings/RLikeMatrixOpsSuite.scala @@ -0,0 +1,80 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */

package org.apache.mahout.math.scalabindings

import org.scalatest.FunSuite
import RLikeOps._
import org.apache.mahout.test.MahoutSuite

/** Checks the R-like matrix operators (`%*%`, `*`, `*=` and the scalar forms) imported from `RLikeOps`. */
class RLikeMatrixOpsSuite extends FunSuite with MahoutSuite {

  test("multiplication") {
    val lhs = dense((1, 2, 3), (3, 4, 5))
    val rhs = dense(1, 4, 5)

    val product = lhs %*% rhs

    // only column 0 of the product is inspected: 1*1 + 2*4 + 3*5 and 3*1 + 4*4 + 5*5
    assert(product(0, 0) == 24)
    assert(product(1, 0) == 44)
    println(product.toString)
  }

  test("Hadamard") {
    val left = dense(
      (1, 2, 3),
      (3, 4, 5)
    )
    val right = dense(
      (1, 1, 2),
      (2, 1, 1)
    )

    // element-wise product into a new matrix
    val elementwise = left * right
    printf("C=\n%s\n", elementwise)
    assert(elementwise(0, 0) == 1)
    assert(elementwise(1, 2) == 5)
    println(elementwise.toString)

    // matrix times scalar
    val scaled = left * 5.0
    assert(scaled(0, 0) == 5)
    assert(scaled(1, 1) == 20)

    // in-place variant: afterwards `left` holds the same values as `elementwise`
    left *= right
    assert(left(0, 0) == 1)
    assert(left(1, 2) == 5)
    println(left.toString)
  }

  /** Test dsl overloads over scala operations over matrices */
  test ("scalarOps") {
    val m = dense(
      (1, 2, 3),
      (3, 4, 5)
    )

    // each scalar-on-the-left form must agree with its colon-suffixed counterpart
    val timesDiff = 10 * m - (10 *: m)
    timesDiff.norm shouldBe 0

    val plusDiff = 10 + m - (10 +: m)
    plusDiff.norm shouldBe 0

    val minusDiff = 10 - m - (10 -: m)
    minusDiff.norm shouldBe 0

    val divDiff = 10 / m - (10 /: m)
    divDiff.norm shouldBe 0
  }

}
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7b69fab/samsara/src/test/scala/org/apache/mahout/math/scalabindings/RLikeVectorOpsSuite.scala ---------------------------------------------------------------------- diff --git a/samsara/src/test/scala/org/apache/mahout/math/scalabindings/RLikeVectorOpsSuite.scala b/samsara/src/test/scala/org/apache/mahout/math/scalabindings/RLikeVectorOpsSuite.scala new file mode 100644 index 0000000..832937b --- /dev/null +++ b/samsara/src/test/scala/org/apache/mahout/math/scalabindings/RLikeVectorOpsSuite.scala @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */

package org.apache.mahout.math.scalabindings

import org.scalatest.FunSuite
import org.apache.mahout.math.Vector
import RLikeOps._
import org.apache.mahout.test.MahoutSuite

/** Checks the R-like element-wise vector product imported from `RLikeOps`. */
class RLikeVectorOpsSuite extends FunSuite with MahoutSuite {

  test("Hadamard") {
    // the left operand is typed as Vector; the right one stays a plain tuple
    val left: Vector = (1, 2, 3)
    val right = (3, 4, 5)

    val product = left * right
    println(product)

    assert(product ===(3, 8, 15))
  }

}
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7b69fab/samsara/src/test/scala/org/apache/mahout/math/scalabindings/VectorOpsSuite.scala ---------------------------------------------------------------------- diff --git a/samsara/src/test/scala/org/apache/mahout/math/scalabindings/VectorOpsSuite.scala b/samsara/src/test/scala/org/apache/mahout/math/scalabindings/VectorOpsSuite.scala new file mode 100644 index 0000000..037f562 --- /dev/null +++ b/samsara/src/test/scala/org/apache/mahout/math/scalabindings/VectorOpsSuite.scala @@ -0,0 +1,82 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */

package org.apache.mahout.math.scalabindings

import org.scalatest.FunSuite
import org.apache.mahout.math.{RandomAccessSparseVector, Vector}
import RLikeOps._
import org.apache.mahout.test.MahoutSuite

/** VectorOps Suite */
class VectorOpsSuite extends FunSuite with MahoutSuite {

  // Construction smoke test: every vector is only printed, nothing is asserted.
  test("inline create") {
    val fromSvec = svec((5 -> 1) :: (10 -> 2.0) :: Nil)
    println(fromSvec)

    val fromPairs: Vector = (5 -> 1.0) :: (10 -> 2.0) :: Nil
    println(fromPairs)

    val assigned: Vector = new RandomAccessSparseVector(100) := (5 -> 1.0) :: Nil
    println(assigned)

    val fromTuple: Vector = (1.0, 1.1, 1.2)
    println(fromTuple)

    // NOTE(review): four elements (1, 0, 1.1, 1.2) - possibly meant 1.0, 1.1, 1.2; confirm
    val fromDvec = dvec(1, 0, 1.1, 1.2)
    println(fromDvec)
  }

  test("plus minus") {
    val dense: Vector = (1, 2, 3)
    val sparse: Vector = (0 -> 3) :: (1 -> 4) :: (2 -> 5) :: Nil

    val sum = dense + sparse
    val difference = sparse - dense
    val negatedDifference = -sparse - dense

    assert(sum ===(4, 6, 8))
    assert(difference ===(2, 2, 2))
    assert(negatedDifference ===(-4, -6, -8))
  }

  test("dot") {
    val left: Vector = (1, 2, 3)
    val right = (3, 4, 5)

    // 1*3 + 2*4 + 3*5
    val product = left.dot(right)
    println(product)
    assert(product == 26)
  }

  test ("scalarOps") {
    val v: Vector = dvec(1 to 5)

    // scalar-on-the-left forms must agree with their colon-suffixed counterparts
    (10 * v) shouldBe (10 *: v)
    (10 + v) shouldBe (10 +: v)
    (10 - v) shouldBe (10 -: v)
    (10 / v) shouldBe (10 /: v)
  }

}
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7b69fab/samsara/src/test/scala/org/apache/mahout/nlp/tfidf/TFIDFtestBase.scala ---------------------------------------------------------------------- diff --git a/samsara/src/test/scala/org/apache/mahout/nlp/tfidf/TFIDFtestBase.scala b/samsara/src/test/scala/org/apache/mahout/nlp/tfidf/TFIDFtestBase.scala new file mode 100644 index 0000000..3ec5ec1 --- /dev/null +++ b/samsara/src/test/scala/org/apache/mahout/nlp/tfidf/TFIDFtestBase.scala @@ -0,0 +1,184 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */

package org.apache.mahout.nlp.tfidf

import org.apache.mahout.math._
import org.apache.mahout.math.scalabindings._
import org.apache.mahout.test.DistributedMahoutSuite
import org.scalatest.{FunSuite, Matchers}
import scala.collection._
import RLikeOps._
import scala.math._


/**
 * Term-weighting tests (TF, TFIDF, MLlibTFIDF) over a small fixed corpus.
 *
 * The trait carries the tests themselves; it must be mixed into a `FunSuite`
 * (see the self-type).
 */
trait TFIDFtestBase extends DistributedMahoutSuite with Matchers {
  this: FunSuite =>

  /** Absolute tolerance used when comparing computed weights with expected values. */
  val epsilon = 1E-6

  /** Test corpus as (1-based document id, space-separated text) pairs. */
  val documents: List[(Int, String)] = List(
    (1, "the first document contains 5 terms"),
    (2, "document two document contains 4 terms"),
    (3, "document three three terms"),
    (4, "each document including this document contain the term document"))

  // For the corpus above the original author recorded the following maps. The tests below
  // use these indices literally, so the dictionary must keep being built from the iteration
  // order of the `groupBy` result.
  //
  // dictionary:
  // (this -> 0, 4 -> 1, three -> 2, document -> 3, two -> 4, term -> 5, 5 -> 6, contain -> 7,
  //  each -> 8, first -> 9, terms -> 10, contains -> 11, including -> 12, the -> 13)
  //
  // dfMap:
  // (0 -> 1, 5 -> 1, 10 -> 3, 1 -> 1, 6 -> 1, 9 -> 1, 13 -> 2, 2 -> 1, 12 -> 1, 7 -> 1, 3 -> 4,
  //  11 -> 2, 8 -> 1, 4 -> 1)

  /**
   * Builds the term dictionary and the document-frequency map for a corpus.
   *
   * @param documents (id, text) pairs; text is lower-cased and split on single spaces
   * @return a pair of (term -> dictionary index, dictionary index -> number of documents
   *         containing the term)
   */
  def createDictionaryAndDfMaps(documents: List[(Int, String)]): (Map[String, Int], Map[Int, Int]) = {

    // group the tokens of the whole corpus by term; only the distinct terms (keys) are needed
    val termGroups = documents.map(_._2).mkString(" ").toLowerCase.split(" ").groupBy(identity)

    // create a dictionary with an index for each term, in the iteration order of the grouping
    val dictIndex = termGroups.zipWithIndex.map { case ((term, _), index) => term -> index }.toMap

    // tokenize every document once instead of once per dictionary term
    val docTermSets = documents.map { case (_, text) => text.toLowerCase.split(" ").toSet }

    // df of a term = number of documents whose token set contains it
    val docFrequencyMap = dictIndex.map { case (term, index) =>
      index -> docTermSets.count(_.contains(term))
    }

    (dictIndex, docFrequencyMap)
  }

  /**
   * Turns one document into a sparse vector of term weights.
   *
   * @param document      text to vectorize; lower-cased and split on single spaces
   * @param dictionaryMap term -> vector index; terms missing from it are ignored
   * @param dfMap         vector index -> document frequency of the term
   * @param weight        weighting scheme applied to every known term
   * @return a sparse vector with one slot per dictionary entry
   */
  def vectorizeDocument(document: String,
                        dictionaryMap: Map[String, Int],
                        dfMap: Map[Int, Int], weight: TermWeight = new TFIDF): Vector = {

    val wordCounts = document.toLowerCase.split(" ").groupBy(identity).mapValues(_.length)

    val vec = new RandomAccessSparseVector(dictionaryMap.size)

    // NOTE(review): the last two arguments handed to `calculate` are the number of *distinct*
    // words in the document and the dictionary size. The expected values in the tests were
    // produced with exactly these inputs - confirm against TermWeight before changing them.
    val totalDFSize = dictionaryMap.size
    val docSize = wordCounts.size

    wordCounts.foreach { case (term, termFreq) =>
      // a single lookup; terms outside the dictionary contribute nothing
      dictionaryMap.get(term).foreach { dictIndex =>
        vec(dictIndex) = weight.calculate(termFreq, dfMap(dictIndex), docSize, totalDFSize)
      }
    }
    vec
  }

  /** Vectorizes every entry of `documents` with `weight`, one matrix row per document. */
  private def vectorizeCorpus(weight: TermWeight): Matrix = {
    val (dictionary, dfMap) = createDictionaryAndDfMaps(documents)

    val vectorized: Matrix = new SparseMatrix(documents.size, dictionary.size)

    for ((docId, text) <- documents) {
      // document ids are 1-based, matrix rows are 0-based
      vectorized(docId - 1, ::) := vectorizeDocument(text, dictionary, dfMap, weight)
    }
    vectorized
  }

  test("TF test") {
    val vectorizedDocuments = vectorizeCorpus(new TF())

    // (row, column) = (document id - 1, dictionary index): column 0 is "this",
    // 13 is "the", 3 is "document"
    vectorizedDocuments(0, 0).toInt should be (0)
    vectorizedDocuments(0, 13).toInt should be (1)
    vectorizedDocuments(1, 3).toInt should be (2)
    vectorizedDocuments(3, 3).toInt should be (3)
  }

  test("TFIDF test") {
    val vectorizedDocuments = vectorizeCorpus(new TFIDF())

    abs(vectorizedDocuments(0, 0) - 0.0) should be < epsilon
    abs(vectorizedDocuments(0, 13) - 2.540445) should be < epsilon
    abs(vectorizedDocuments(1, 3) - 2.870315) should be < epsilon
    abs(vectorizedDocuments(3, 3) - 3.515403) should be < epsilon
  }

  test("MLlib TFIDF test") {
    val vectorizedDocuments = vectorizeCorpus(new MLlibTFIDF())

    abs(vectorizedDocuments(0, 0) - 0.0) should be < epsilon
    abs(vectorizedDocuments(0, 13) - 1.609437) should be < epsilon
    abs(vectorizedDocuments(1, 3) - 2.197224) should be < epsilon
    abs(vectorizedDocuments(3, 3) - 3.295836) should be < epsilon
  }

}
\ No newline at end of file http://git-wip-us.apache.org/repos/asf/mahout/blob/f7b69fab/samsara/src/test/scala/org/apache/mahout/test/DistributedMahoutSuite.scala ---------------------------------------------------------------------- diff --git a/samsara/src/test/scala/org/apache/mahout/test/DistributedMahoutSuite.scala b/samsara/src/test/scala/org/apache/mahout/test/DistributedMahoutSuite.scala new file mode 100644 index 0000000..3538991 --- /dev/null +++ b/samsara/src/test/scala/org/apache/mahout/test/DistributedMahoutSuite.scala @@ -0,0 +1,28 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */

package org.apache.mahout.test

import org.apache.mahout.math.drm.DistributedContext
import org.scalatest.{Suite, FunSuite, Matchers}

/**
 * Unit tests that use a distributed context to run
 */
trait DistributedMahoutSuite extends MahoutSuite {
  this: Suite =>

  /** Distributed context offered implicitly to the tests; left abstract here. */
  protected implicit var mahoutCtx: DistributedContext
}
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7b69fab/samsara/src/test/scala/org/apache/mahout/test/LoggerConfiguration.scala ---------------------------------------------------------------------- diff --git a/samsara/src/test/scala/org/apache/mahout/test/LoggerConfiguration.scala b/samsara/src/test/scala/org/apache/mahout/test/LoggerConfiguration.scala new file mode 100644 index 0000000..7a34aa2 --- /dev/null +++ b/samsara/src/test/scala/org/apache/mahout/test/LoggerConfiguration.scala @@ -0,0 +1,16 @@
package org.apache.mahout.test

import org.scalatest._
import org.apache.log4j.{Level, Logger, BasicConfigurator}

/** Sets up log4j once per suite, before any test runs. */
trait LoggerConfiguration extends BeforeAndAfterAllConfigMap {
  this: Suite =>

  override protected def beforeAll(configMap: ConfigMap): Unit = {
    super.beforeAll(configMap)

    // reset log4j, then apply its basic configuration
    BasicConfigurator.resetConfiguration()
    BasicConfigurator.configure()

    // ERROR for everything, DEBUG for the scala bindings package
    val rootLogger = Logger.getRootLogger
    rootLogger.setLevel(Level.ERROR)

    val bindingsLogger = Logger.getLogger("org.apache.mahout.math.scalabindings")
    bindingsLogger.setLevel(Level.DEBUG)
  }
}
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7b69fab/samsara/src/test/scala/org/apache/mahout/test/MahoutSuite.scala ---------------------------------------------------------------------- diff --git a/samsara/src/test/scala/org/apache/mahout/test/MahoutSuite.scala b/samsara/src/test/scala/org/apache/mahout/test/MahoutSuite.scala new file mode 100644 index 0000000..d3b8a38 --- /dev/null +++ b/samsara/src/test/scala/org/apache/mahout/test/MahoutSuite.scala @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */
package org.apache.mahout.test

import java.io.File
import org.scalatest._
import org.apache.mahout.common.RandomUtils

/**
 * Base trait for the test suites: installs the test random seed before every test and
 * keeps the relative `tmp/` directory clean (wiped once before the suite and again
 * after every test).
 */
trait MahoutSuite extends BeforeAndAfterEach with LoggerConfiguration with Matchers {
  this: Suite =>

  /** Scratch directory, relative to the working directory, that tests may write to. */
  final val TmpDir = "tmp/"

  /** Switches `RandomUtils` to its test seed before each test. */
  override protected def beforeEach(): Unit = {
    super.beforeEach()
    RandomUtils.useTestSeed()
  }

  /** Runs the inherited setup, then removes any tmp dir left over from an earlier run. */
  override protected def beforeAll(configMap: ConfigMap): Unit = {
    super.beforeAll(configMap)

    // just in case there is an existing tmp dir clean it before every suite
    deleteDirectory(new File(TmpDir))
  }

  /** Removes the tmp dir after every test, then hands over to the inherited teardown. */
  override protected def afterEach(): Unit = {
    // clean the tmp dir after every test; the inherited teardown must run even if
    // the cleanup throws
    try deleteDirectory(new File(TmpDir))
    finally super.afterEach()
  }

  /**
   * Recursively deletes `path`. No symlink checking is done and exceptions are not caught.
   * The result of `delete` is ignored, so a file that cannot be removed is silently left.
   */
  private def deleteDirectory(path: File): Unit = {
    // listFiles yields null for anything that is not a directory and also when an I/O
    // error occurs; Option(...) turns both cases into "nothing to recurse into"
    Option(path.listFiles).foreach(_.foreach(deleteDirectory))
    path.delete()
  }
}
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7b69fab/spark-shell/pom.xml ---------------------------------------------------------------------- diff --git a/spark-shell/pom.xml b/spark-shell/pom.xml index 0903534..87fb187 100644 --- a/spark-shell/pom.xml +++ b/spark-shell/pom.xml @@ -112,7 +112,7 @@ <dependency> <groupId>org.apache.mahout</groupId> - 
<artifactId>mahout-math-scala_${scala.compat.version}</artifactId> + <artifactId>mahout-samsara_${scala.compat.version}</artifactId> <classifier>tests</classifier> <scope>test</scope> </dependency> http://git-wip-us.apache.org/repos/asf/mahout/blob/f7b69fab/spark/pom.xml ---------------------------------------------------------------------- diff --git a/spark/pom.xml b/spark/pom.xml index 5646c25..885d5f2 100644 --- a/spark/pom.xml +++ b/spark/pom.xml @@ -134,7 +134,7 @@ <dependency> <groupId>org.apache.mahout</groupId> - <artifactId>mahout-math-scala_${scala.compat.version}</artifactId> + <artifactId>mahout-samsara_${scala.compat.version}</artifactId> </dependency> <dependency> @@ -150,7 +150,7 @@ <dependency> <groupId>org.apache.mahout</groupId> - <artifactId>mahout-math-scala_${scala.compat.version}</artifactId> + <artifactId>mahout-samsara_${scala.compat.version}</artifactId> <classifier>tests</classifier> <scope>test</scope> </dependency>
