Repository: mahout Updated Branches: refs/heads/master f8596b866 -> 9a31923ea
MAHOUT-1856 Add Framework for Models, Fitters, and Tests closes apache/mahout#246 Project: http://git-wip-us.apache.org/repos/asf/mahout/repo Commit: http://git-wip-us.apache.org/repos/asf/mahout/commit/9a31923e Tree: http://git-wip-us.apache.org/repos/asf/mahout/tree/9a31923e Diff: http://git-wip-us.apache.org/repos/asf/mahout/diff/9a31923e Branch: refs/heads/master Commit: 9a31923eae3727d9d91bd2c2ed8df12a616a577e Parents: f8596b8 Author: rawkintrevo <[email protected]> Authored: Tue Jan 31 21:23:10 2017 -0600 Committer: rawkintrevo <[email protected]> Committed: Tue Jan 31 21:23:10 2017 -0600 ---------------------------------------------------------------------- .gitignore | 1 + .../standard/PreprocessorSuite.scala | 26 ++++ .../standard/RegressionSuite.scala | 27 ++++ .../standard/RegressionTestsSuite.scala | 26 ++++ .../math/algorithms/PreprocessorSuite.scala | 24 ++++ .../math/algorithms/RegressionSuite.scala | 25 ++++ .../math/algorithms/RegressionTestsSuite.scala | 24 ++++ .../apache/mahout/math/algorithms/Fitter.scala | 27 ++++ .../apache/mahout/math/algorithms/Model.scala | 26 ++++ .../math/algorithms/SupervisedFitter.scala | 29 +++++ .../math/algorithms/SupervisedModel.scala | 26 ++++ .../math/algorithms/UnsupervisedFitter.scala | 28 ++++ .../math/algorithms/UnsupervisedModel.scala | 24 ++++ .../algorithms/preprocessing/AsFactor.scala | 127 +++++++++++++++++++ .../algorithms/preprocessing/MeanCenter.scala | 91 +++++++++++++ .../preprocessing/PreprocessorModel.scala | 58 +++++++++ .../preprocessing/StandardScaler.scala | 97 ++++++++++++++ .../regression/CochraneOrcuttModel.scala | 100 +++++++++++++++ .../regression/LinearRegressorModel.scala | 124 ++++++++++++++++++ .../regression/OrdinaryLeastSquaresModel.scala | 66 ++++++++++ .../algorithms/regression/RegressorModel.scala | 58 +++++++++ .../regression/tests/AutocorrelationTests.scala | 57 +++++++++ .../regression/tests/FittnessTests.scala | 56 ++++++++ .../math/algorithms/PreprocessorSuiteBase.scala | 59 
+++++++++ .../math/algorithms/RegressionSuiteBase.scala | 81 ++++++++++++ .../algorithms/RegressionTestsSuiteBase.scala | 87 +++++++++++++ .../math/algorithms/PreprocessorSuite.scala | 24 ++++ .../math/algorithms/RegressionSuite.scala | 25 ++++ .../math/algorithms/RegressionTestsSuite.scala | 25 ++++ 29 files changed, 1448 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/mahout/blob/9a31923e/.gitignore ---------------------------------------------------------------------- diff --git a/.gitignore b/.gitignore index 63490bf..3aee83a 100644 --- a/.gitignore +++ b/.gitignore @@ -20,3 +20,4 @@ mr/temp temp foo math-tests/ +metastore_db/* \ No newline at end of file http://git-wip-us.apache.org/repos/asf/mahout/blob/9a31923e/flink/src/test/scala/org/apache/mahout/flinkbindings/standard/PreprocessorSuite.scala ---------------------------------------------------------------------- diff --git a/flink/src/test/scala/org/apache/mahout/flinkbindings/standard/PreprocessorSuite.scala b/flink/src/test/scala/org/apache/mahout/flinkbindings/standard/PreprocessorSuite.scala new file mode 100644 index 0000000..5e2b4ee --- /dev/null +++ b/flink/src/test/scala/org/apache/mahout/flinkbindings/standard/PreprocessorSuite.scala @@ -0,0 +1,26 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.flinkbindings.standard + + +import org.apache.mahout.flinkbindings.DistributedFlinkSuite +import org.apache.mahout.math.algorithms.PreprocessorSuiteBase +import org.scalatest.FunSuite + +/** + * Concrete ScalaTest suite for the Flink bindings: mixes PreprocessorSuiteBase into a + * FunSuite together with DistributedFlinkSuite. It declares no members of its own. + */ +class PreprocessorSuite extends FunSuite + with DistributedFlinkSuite with PreprocessorSuiteBase http://git-wip-us.apache.org/repos/asf/mahout/blob/9a31923e/flink/src/test/scala/org/apache/mahout/flinkbindings/standard/RegressionSuite.scala ---------------------------------------------------------------------- diff --git a/flink/src/test/scala/org/apache/mahout/flinkbindings/standard/RegressionSuite.scala b/flink/src/test/scala/org/apache/mahout/flinkbindings/standard/RegressionSuite.scala new file mode 100644 index 0000000..5cb6183 --- /dev/null +++ b/flink/src/test/scala/org/apache/mahout/flinkbindings/standard/RegressionSuite.scala @@ -0,0 +1,27 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.flinkbindings.standard + + +import org.apache.mahout.flinkbindings.DistributedFlinkSuite +import org.apache.mahout.math.algorithms.RegressionSuiteBase +import org.scalatest.FunSuite + +/** + * Concrete ScalaTest suite for the Flink bindings: mixes RegressionSuiteBase into a + * FunSuite together with DistributedFlinkSuite. It declares no members of its own. + */ +class RegressionSuite extends FunSuite + with DistributedFlinkSuite with RegressionSuiteBase + http://git-wip-us.apache.org/repos/asf/mahout/blob/9a31923e/flink/src/test/scala/org/apache/mahout/flinkbindings/standard/RegressionTestsSuite.scala ---------------------------------------------------------------------- diff --git a/flink/src/test/scala/org/apache/mahout/flinkbindings/standard/RegressionTestsSuite.scala b/flink/src/test/scala/org/apache/mahout/flinkbindings/standard/RegressionTestsSuite.scala new file mode 100644 index 0000000..8ddab41 --- /dev/null +++ b/flink/src/test/scala/org/apache/mahout/flinkbindings/standard/RegressionTestsSuite.scala @@ -0,0 +1,26 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.flinkbindings.standard + +import org.apache.mahout.flinkbindings.DistributedFlinkSuite +import org.apache.mahout.math.algorithms.RegressionTestsSuiteBase +import org.scalatest.FunSuite + +/** + * Concrete ScalaTest suite for the Flink bindings: mixes RegressionTestsSuiteBase into a + * FunSuite together with DistributedFlinkSuite. It declares no members of its own. + */ +class RegressionTestsSuite extends FunSuite + with DistributedFlinkSuite with RegressionTestsSuiteBase + http://git-wip-us.apache.org/repos/asf/mahout/blob/9a31923e/h2o/src/test/scala/org/apache/mahout/math/algorithms/PreprocessorSuite.scala ---------------------------------------------------------------------- diff --git a/h2o/src/test/scala/org/apache/mahout/math/algorithms/PreprocessorSuite.scala b/h2o/src/test/scala/org/apache/mahout/math/algorithms/PreprocessorSuite.scala new file mode 100644 index 0000000..e777f8b --- /dev/null +++ b/h2o/src/test/scala/org/apache/mahout/math/algorithms/PreprocessorSuite.scala @@ -0,0 +1,24 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.math.algorithms + +import org.apache.mahout.h2obindings.test.DistributedH2OSuite +import org.scalatest.FunSuite + +/** + * Concrete ScalaTest suite for the H2O bindings: mixes PreprocessorSuiteBase into a + * FunSuite together with DistributedH2OSuite. It declares no members of its own. + */ +class PreprocessorSuite extends FunSuite + with DistributedH2OSuite with PreprocessorSuiteBase \ No newline at end of file http://git-wip-us.apache.org/repos/asf/mahout/blob/9a31923e/h2o/src/test/scala/org/apache/mahout/math/algorithms/RegressionSuite.scala ---------------------------------------------------------------------- diff --git a/h2o/src/test/scala/org/apache/mahout/math/algorithms/RegressionSuite.scala b/h2o/src/test/scala/org/apache/mahout/math/algorithms/RegressionSuite.scala new file mode 100644 index 0000000..503eb06 --- /dev/null +++ b/h2o/src/test/scala/org/apache/mahout/math/algorithms/RegressionSuite.scala @@ -0,0 +1,25 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.mahout.math.algorithms + +import org.apache.mahout.h2obindings.test.DistributedH2OSuite +import org.scalatest.FunSuite + +/** + * Concrete ScalaTest suite for the H2O bindings: mixes RegressionSuiteBase into a + * FunSuite together with DistributedH2OSuite. It declares no members of its own. + */ +class RegressionSuite extends FunSuite + with DistributedH2OSuite with RegressionSuiteBase + http://git-wip-us.apache.org/repos/asf/mahout/blob/9a31923e/h2o/src/test/scala/org/apache/mahout/math/algorithms/RegressionTestsSuite.scala ---------------------------------------------------------------------- diff --git a/h2o/src/test/scala/org/apache/mahout/math/algorithms/RegressionTestsSuite.scala b/h2o/src/test/scala/org/apache/mahout/math/algorithms/RegressionTestsSuite.scala new file mode 100644 index 0000000..864b045 --- /dev/null +++ b/h2o/src/test/scala/org/apache/mahout/math/algorithms/RegressionTestsSuite.scala @@ -0,0 +1,24 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.mahout.math.algorithms + +import org.apache.mahout.h2obindings.test.DistributedH2OSuite +import org.scalatest.FunSuite + +/** + * Concrete ScalaTest suite for the H2O bindings: mixes RegressionTestsSuiteBase into a + * FunSuite together with DistributedH2OSuite. It declares no members of its own. + */ +class RegressionTestsSuite extends FunSuite + with DistributedH2OSuite with RegressionTestsSuiteBase http://git-wip-us.apache.org/repos/asf/mahout/blob/9a31923e/math-scala/src/main/scala/org/apache/mahout/math/algorithms/Fitter.scala ---------------------------------------------------------------------- diff --git a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/Fitter.scala b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/Fitter.scala new file mode 100644 index 0000000..244cefc --- /dev/null +++ b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/Fitter.scala @@ -0,0 +1,27 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.mahout.math.algorithms + +trait Fitter { + + // Every fitter has a fit method, but its signature differs between fitter kinds, so none is declared here. 
+ // This trait is kept as a placeholder in case we decide there are some things all fitters must have in common. + +} http://git-wip-us.apache.org/repos/asf/mahout/blob/9a31923e/math-scala/src/main/scala/org/apache/mahout/math/algorithms/Model.scala ---------------------------------------------------------------------- diff --git a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/Model.scala b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/Model.scala new file mode 100644 index 0000000..0fbe8ac --- /dev/null +++ b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/Model.scala @@ -0,0 +1,26 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.mahout.math.algorithms + +/** + * Root trait for models. A Model is Serializable and carries one piece of mutable state, + * the `summary` string. + */ +trait Model extends Serializable { + + // Free-form text describing the model; starts out empty and is reassignable (var). + var summary: String = "" + +} http://git-wip-us.apache.org/repos/asf/mahout/blob/9a31923e/math-scala/src/main/scala/org/apache/mahout/math/algorithms/SupervisedFitter.scala ---------------------------------------------------------------------- diff --git a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/SupervisedFitter.scala b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/SupervisedFitter.scala new file mode 100644 index 0000000..bf85dee --- /dev/null +++ b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/SupervisedFitter.scala @@ -0,0 +1,29 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.mahout.math.algorithms + +import org.apache.mahout.math.drm.DrmLike + +/** + * Contract for fitters that build a model from an input DRM together with a target DRM. + * + * @tparam K row-key type shared by both DRMs + * @tparam M concrete SupervisedModel[K] type returned by fit + */ +trait SupervisedFitter[K, M <: SupervisedModel[K]] extends Fitter { + + /** + * @param drmX input DRM (presumably the predictors -- confirm against implementations) + * @param drmTarget target DRM, keyed by the same K as drmX + * @param hyperparameters optional (name, value) pairs; their meaning is left to each implementation + * @return the fitted model + */ + def fit(drmX : DrmLike[K], + drmTarget: DrmLike[K], + hyperparameters: (Symbol, Any)*): M +} http://git-wip-us.apache.org/repos/asf/mahout/blob/9a31923e/math-scala/src/main/scala/org/apache/mahout/math/algorithms/SupervisedModel.scala ---------------------------------------------------------------------- diff --git a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/SupervisedModel.scala b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/SupervisedModel.scala new file mode 100644 index 0000000..57c20e7 --- /dev/null +++ b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/SupervisedModel.scala @@ -0,0 +1,26 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.mahout.math.algorithms + +import scala.collection.mutable + +/** + * A Model that additionally carries a mutable map of test results. + * + * @tparam K row-key type; not used by any member declared here + */ +trait SupervisedModel[K] extends Model { + // Results keyed by Symbol; starts empty. Both the reference (var) and the map itself are mutable. + var testResults: mutable.Map[Symbol, Any] = mutable.Map[Symbol, Any]() +} http://git-wip-us.apache.org/repos/asf/mahout/blob/9a31923e/math-scala/src/main/scala/org/apache/mahout/math/algorithms/UnsupervisedFitter.scala ---------------------------------------------------------------------- diff --git a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/UnsupervisedFitter.scala b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/UnsupervisedFitter.scala new file mode 100644 index 0000000..5c191d1 --- /dev/null +++ b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/UnsupervisedFitter.scala @@ -0,0 +1,28 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.mahout.math.algorithms + +import org.apache.mahout.math.drm.DrmLike + +/** + * Contract for fitters that build a model from a single input DRM (no target). + */ +trait UnsupervisedFitter extends Fitter { + + /** + * @param input DRM to fit on + * @param hyperparameters optional (name, value) pairs; their meaning is left to each implementation + * @tparam K row-key type of the input + * @return the fitted model + */ + def fit[K](input: DrmLike[K], + hyperparameters: (Symbol, Any)*): UnsupervisedModel +} http://git-wip-us.apache.org/repos/asf/mahout/blob/9a31923e/math-scala/src/main/scala/org/apache/mahout/math/algorithms/UnsupervisedModel.scala ---------------------------------------------------------------------- diff --git a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/UnsupervisedModel.scala b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/UnsupervisedModel.scala new file mode 100644 index 0000000..f8ff341 --- /dev/null +++ b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/UnsupervisedModel.scala @@ -0,0 +1,24 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.mahout.math.algorithms + +/** + * Marker trait for models returned by an UnsupervisedFitter; it adds no members to Model. + */ +trait UnsupervisedModel extends Model { + +} http://git-wip-us.apache.org/repos/asf/mahout/blob/9a31923e/math-scala/src/main/scala/org/apache/mahout/math/algorithms/preprocessing/AsFactor.scala ---------------------------------------------------------------------- diff --git a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/preprocessing/AsFactor.scala b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/preprocessing/AsFactor.scala new file mode 100644 index 0000000..9d8e10f --- /dev/null +++ b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/preprocessing/AsFactor.scala @@ -0,0 +1,127 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */
+
+package org.apache.mahout.math.algorithms.preprocessing
+
+
+
+import collection._
+import JavaConversions._
+import org.apache.mahout.math._
+import org.apache.mahout.math.drm._
+import org.apache.mahout.math.{Vector => MahoutVector}
+import org.apache.mahout.math.drm.RLikeDrmOps._
+import org.apache.mahout.math.scalabindings._
+import org.apache.mahout.math.scalabindings.RLikeOps._
+import MahoutCollections._
+
+/**
+ * Fits a one-hot ("factor") encoding for a DRM whose cells hold level codes.
+ *
+ * The codes are assumed to be non-negative integers stored as doubles; nothing here
+ * validates that.
+ */
+class AsFactor extends PreprocessorFitter {
+
+  /**
+   * Finds the largest code in every column and derives the layout of the encoded matrix.
+   *
+   * A column whose largest code is m gets m + 1 output slots (codes 0..m). For example,
+   * column maxima (3, 2, 1) give slot counts (4, 3, 2), start offsets (0, 4, 7) and a
+   * total of 9 output columns.
+   *
+   * @param input           DRM of level codes
+   * @param hyperparameters accepted for interface compatibility; not read
+   * @return a model holding the total number of output columns and each input column's
+   *         first output index
+   */
+  def fit[K](input: DrmLike[K],
+             hyperparameters: (Symbol, Any)*): AsFactorModel = {
+
+    import org.apache.mahout.math.function.VectorFunction
+
+    // The VectorFunction instances are created inside the closures (as the original did)
+    // so that nothing from the enclosing fitter has to be shipped to the workers.
+    val maxPerColumn = input.allreduceBlock(
+      // map: a 1 x ncol matrix holding this block's column maxima
+      { case (keys, block: Matrix) =>
+        dense(block.aggregateColumns(new VectorFunction {
+          def apply(f: Vector): Double = f.max
+        }))
+      },
+      // reduce: element-wise max of two partial results.
+      // NOTE(review): without an explicit reducer allreduceBlock falls back to its default,
+      // which adds the partial results, so maxima from different blocks were summed.
+      (a: Matrix, b: Matrix) =>
+        dense((a rbind b).aggregateColumns(new VectorFunction {
+          def apply(f: Vector): Double = f.max
+        }))
+    )(0, ::)
+
+    // Codes run 0..max, so each column needs max + 1 slots.
+    val levels = maxPerColumn.toArray.map(_ + 1.0)
+    // Running total of the slots *before* a column = that column's first output index.
+    val offsets = levels.scanLeft(0.0)(_ + _).take(levels.length)
+
+    new AsFactorModel(levels.sum.toInt, dvec(offsets))
+  }
+
+}
+
+/**
+ * One-hot encoder produced by AsFactor.
+ *
+ * @param cardinality total number of columns in the encoded output
+ * @param factorVec   for each input column, the index of its first output column
+ */
+class AsFactorModel(cardinality: Int, factorVec: MahoutVector) extends PreprocessorModel {
+
+  val factorMap: MahoutVector = factorVec
+
+  /**
+   * Encodes every cell (column j, code c) as a 1.0 at output column factorMap(j) + c.
+   *
+   * @param input DRM of level codes with the same columns the model was fit on
+   * @return DRM with `cardinality` columns, same keys and row count as the input
+   */
+  def transform[K](input: DrmLike[K]): DrmLike[K] = {
+
+    implicit val ctx = input.context
+
+    // Both values are read from broadcasts inside the block closure.
+    val bcastK = drmBroadcast(dvec(cardinality))
+    val bcastFactorMap = drmBroadcast(factorMap)
+
+    implicit val ktag = input.keyClassTag
+
+    input.mapBlock(cardinality) {
+      case (keys, block: Matrix) =>
+        val ncolOut: Int = bcastK.value.get(0).toInt
+        val offsets = bcastFactorMap.value
+        val output = new SparseMatrix(block.nrow, ncolOut)
+        // all() visits every cell of the row, zeros included, so code 0 is encoded too
+        for (n <- 0 until output.nrow; e <- block(n, ::).all()) {
+          output(n, offsets.get(e.index).toInt + e.get().toInt) = 1.0
+        }
+        (keys, output)
+    }
+  }
+
+  /**
+   * Decodes a matrix produced by transform back into level codes.
+   *
+   * Each non-zero cell at output index i belongs to the last input column whose start
+   * offset is <= i, and its code is i minus that offset. Decoding by index range does
+   * not depend on the order in which non-zero cells are iterated or on the concrete
+   * vector class of the row.
+   *
+   * @param input one-hot encoded DRM with `cardinality` columns
+   * @return DRM with one column per originally fit column, holding the codes
+   */
+  override def invTransform[K](input: DrmLike[K]): DrmLike[K] = {
+    implicit val ctx = input.context
+
+    val bcastFactorMap = drmBroadcast(factorMap)
+
+    implicit val ktag = input.keyClassTag
+
+    // The decoded matrix has one column per offset, not `cardinality` columns.
+    input.mapBlock(factorMap.length) {
+      case (keys, block: Matrix) =>
+        val offsets = bcastFactorMap.value.toArray
+        val output = new DenseMatrix(block.nrow, offsets.length)
+        for (n <- 0 until block.nrow; e <- block(n, ::).nonZeroes()) {
+          // offsets(0) is 0, so every non-negative index matches at least one column
+          val col = offsets.lastIndexWhere(_ <= e.index)
+          output(n, col) = e.index - offsets(col)
+        }
+        (keys, output)
+    }
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/mahout/blob/9a31923e/math-scala/src/main/scala/org/apache/mahout/math/algorithms/preprocessing/MeanCenter.scala ---------------------------------------------------------------------- diff --git a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/preprocessing/MeanCenter.scala b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/preprocessing/MeanCenter.scala new file mode 100644 index 0000000..258ad1b --- /dev/null +++ b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/preprocessing/MeanCenter.scala @@ -0,0 +1,91 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */
+
+package org.apache.mahout.math.algorithms.preprocessing
+
+import collection._
+import JavaConversions._
+import org.apache.mahout.math.drm._
+import org.apache.mahout.math.drm.RLikeDrmOps._
+import org.apache.mahout.math.Matrix
+import org.apache.mahout.math.scalabindings.RLikeOps._
+import org.apache.mahout.math.{Vector => MahoutVector}
+
+
+
+/**
+ * Fitter that records the column means of a DRM so they can later be subtracted.
+ */
+class MeanCenter extends PreprocessorFitter {
+
+  /**
+   * Computes the column means of the input and wraps them in a model.
+   *
+   * @param input           DRM whose column means are taken
+   * @param hyperparameters accepted for interface compatibility; not read
+   * @return a model that, until setCenters is called, centres every column at 0
+   */
+  def fit[K](input: DrmLike[K],
+             hyperparameters: (Symbol, Any)*): MeanCenterModel = {
+    new MeanCenterModel(input.colMeans()) // could add centers here
+  }
+
+}
+
+/**
+ * A model for mean centering each column of a data set at 0, or at the values given
+ * to setCenters.
+ *
+ * @param means column means of the data the model was fit on
+ */
+class MeanCenterModel(means: MahoutVector) extends PreprocessorModel {
+
+  // Vector subtracted from every row by transform and added back by invTransform.
+  var colCentersV: MahoutVector = means
+
+  /**
+   * Makes transform centre column j at centers(j) instead of 0.
+   *
+   * @param centers desired column centres; must have one entry per fitted column
+   * @throws IllegalArgumentException if the length differs from the fitted means
+   */
+  def setCenters(centers: MahoutVector): Unit = {
+    if (means.length != centers.length) {
+      throw new IllegalArgumentException(
+        s"Length of centers vector (${centers.length}) must equal length of means vector " +
+          s"(${means.length}) (i.e. the number of columns in the originally fit input).")
+    }
+    // transform subtracts colCentersV, so subtracting (means - centers) leaves each
+    // column with mean `centers`. The previous `means + centers` centred at -centers.
+    colCentersV = means - centers
+  }
+
+  /**
+   * Subtracts colCentersV from every row of the input.
+   *
+   * @param input DRM with the same number of columns as the fitted means
+   * @return a new DRM; input blocks are cloned before being modified
+   */
+  def transform[K](input: DrmLike[K]): DrmLike[K] = {
+
+    implicit val ctx = input.context
+    implicit val ktag = input.keyClassTag
+
+    val bcastV = drmBroadcast(colCentersV)
+
+    input.mapBlock(input.ncol) {
+      case (keys, block: Matrix) =>
+        val shifted: Matrix = block.cloned
+        for (row <- shifted) row -= bcastV.value
+        (keys, shifted)
+    }
+  }
+
+  /**
+   * Undoes transform by adding colCentersV back to every row.
+   *
+   * @param input DRM previously produced by transform
+   * @return a new DRM; input blocks are cloned before being modified
+   */
+  def invTransform[K](input: DrmLike[K]): DrmLike[K] = {
+
+    implicit val ctx = input.context
+    implicit val ktag = input.keyClassTag
+
+    val bcastV = drmBroadcast(colCentersV)
+
+    input.mapBlock(input.ncol) {
+      case (keys, block: Matrix) =>
+        val restored: Matrix = block.cloned
+        for (row <- restored) row += bcastV.value
+        (keys, restored)
+    }
+  }
+
+}
\ No newline at end of file http://git-wip-us.apache.org/repos/asf/mahout/blob/9a31923e/math-scala/src/main/scala/org/apache/mahout/math/algorithms/preprocessing/PreprocessorModel.scala ---------------------------------------------------------------------- diff --git a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/preprocessing/PreprocessorModel.scala b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/preprocessing/PreprocessorModel.scala new file mode 100644 index 0000000..5adb87d --- /dev/null +++ b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/preprocessing/PreprocessorModel.scala @@ -0,0 +1,58 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */
+
+package org.apache.mahout.math.algorithms.preprocessing
+
+import org.apache.mahout.math.algorithms.{UnsupervisedFitter, UnsupervisedModel}
+import org.apache.mahout.math.drm.DrmLike
+
+/**
+ * An unsupervised model that maps a DRM to a DRM and can map the result back.
+ */
+trait PreprocessorModel extends UnsupervisedModel {
+
+  /**
+   * Maps transformed data back to its original form.
+   *
+   * @param input DRM previously produced by transform
+   * @tparam K row-key type, preserved in the result
+   * @return DRM in the original representation
+   */
+  def invTransform[K](input: DrmLike[K]): DrmLike[K]
+
+  /**
+   * Applies the fitted transformation to a DRM.
+   *
+   * @param input DRM to transform
+   * @tparam K row-key type, preserved in the result
+   * @return the transformed DRM
+   */
+  def transform[K](input: DrmLike[K]): DrmLike[K]
+
+}
+
+/**
+ * An unsupervised fitter whose fit result is a PreprocessorModel.
+ */
+trait PreprocessorFitter extends UnsupervisedFitter {
+
+  /**
+   * @param input           DRM to fit on
+   * @param hyperparameters optional (name, value) pairs; meaning is left to the implementation
+   * @return the fitted preprocessor
+   */
+  def fit[K](input: DrmLike[K],
+             hyperparameters: (Symbol, Any)*): PreprocessorModel
+
+  /**
+   * Fits on the input, remembers the fitted model in `model`, and returns the
+   * transformed input.
+   */
+  def fitTransform[K](input: DrmLike[K],
+                      hyperparameters: (Symbol, Any)*): DrmLike[K] = {
+    val fitted = fit(input, hyperparameters: _*)
+    model = fitted
+    fitted.transform(input)
+  }
+
+  // Holds the model from the most recent fitTransform call; unset until then.
+  var model: PreprocessorModel = _
+}
\ No newline at end of file http://git-wip-us.apache.org/repos/asf/mahout/blob/9a31923e/math-scala/src/main/scala/org/apache/mahout/math/algorithms/preprocessing/StandardScaler.scala ---------------------------------------------------------------------- diff --git a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/preprocessing/StandardScaler.scala b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/preprocessing/StandardScaler.scala new file mode 100644 index 0000000..98d0be1 --- /dev/null +++ 
b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/preprocessing/StandardScaler.scala
@@ -0,0 +1,97 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.mahout.math.algorithms.preprocessing
+
+import collection._
+import JavaConversions._
+
+import org.apache.mahout.math.drm._
+import org.apache.mahout.math.drm.RLikeDrmOps._
+import org.apache.mahout.math.scalabindings.RLikeOps._
+import org.apache.mahout.math.{Vector => MahoutVector, Matrix}
+
+/**
+ * Scales columns to mean 0 and unit variance.
+ */
+class StandardScaler extends PreprocessorFitter {
+
+  /**
+   * Learn the column means and standard deviations of `input`.
+   *
+   * @param input the DRM whose columns are measured
+   * @param hyperparameters accepted for interface compatibility; not read here
+   * @return a model holding the column means and the square roots of the column variances
+   */
+  def fit[K](input: DrmLike[K],
+             hyperparameters: (Symbol, Any)*): StandardScalerModel = {
+    val (columnMeans, columnVariances) = dcolMeanVars(input)
+    new StandardScalerModel(columnMeans, columnVariances.sqrt)
+  }
+
+}
+
+/**
+ * Fitted scaler: subtracts `meanVec` from every row and divides by `stdev`, or undoes that.
+ *
+ * NOTE(review): a column whose `stdev` entry is 0 makes `transform` divide by zero
+ * for that column -- confirm whether callers need a guard for constant columns.
+ */
+class StandardScalerModel(meanVec: MahoutVector,
+                          stdev: MahoutVector
+                         ) extends PreprocessorModel {
+
+  /**
+   * Replace every row `r` of `input` by `(r - meanVec) / stdev`.
+   *
+   * @param input the DRM to scale; blocks are cloned before being modified
+   * @tparam K row key type
+   * @return the scaled DRM
+   */
+  def transform[K](input: DrmLike[K]): DrmLike[K] = {
+    implicit val distributedContext = input.context
+
+    val mu = drmBroadcast(meanVec)
+    val sigma = drmBroadcast(stdev)
+
+    // mapBlock() needs the key class tag in implicit scope
+    implicit val keyTag = input.keyClassTag
+
+    input.mapBlock(input.ncol) {
+      case (keys, block: Matrix) =>
+        val scaled: Matrix = block.cloned
+        scaled.foreach(row => row := (row - mu) / sigma )
+        (keys, scaled)
+    }
+  }
+
+  /**
+   * Given an output, transform it back into the original,
+   * e.g. a normalized column back to its original values: every row `r`
+   * becomes `(r * stdev) + meanVec`.
+   *
+   * @param input a DRM in the scaled representation; blocks are cloned before being modified
+   * @tparam K row key type
+   * @return the DRM in the original representation
+   */
+  def invTransform[K](input: DrmLike[K]): DrmLike[K] = { // [K: ClassTag]
+
+    implicit val distributedContext = input.context
+
+    // mapBlock() needs the key class tag in implicit scope
+    implicit val keyTag = input.keyClassTag
+
+    val mu = drmBroadcast(meanVec)
+    val sigma = drmBroadcast(stdev)
+
+    input.mapBlock(input.ncol) {
+      case (keys, block: Matrix) =>
+        val restored: Matrix = block.cloned
+        restored.foreach(row => row := (row * sigma ) + mu)
+        (keys, restored)
+    }
+  }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/mahout/blob/9a31923e/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/CochraneOrcuttModel.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/CochraneOrcuttModel.scala b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/CochraneOrcuttModel.scala
new file mode 100644
index 0000000..844e72f
--- /dev/null
+++ b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/CochraneOrcuttModel.scala
@@ -0,0 +1,100 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.
You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.mahout.math.algorithms.regression
+
+import org.apache.mahout.math.{Vector => MahoutVector}
+import org.apache.mahout.math.drm.{CacheHint, DrmLike, safeToNonNegInt}
+import org.apache.mahout.math.drm.RLikeDrmOps._
+import org.apache.mahout.math.scalabindings.RLikeOps._
+
+/**
+ * Result of a Cochrane-Orcutt fit: wraps the regression model of the last iteration
+ * and keeps the coefficient vector recorded after every iteration.
+ */
+class CochraneOrcuttModel[K](regressor: LinearRegressorModel[K]) extends LinearRegressorModel[K] {
+  // https://en.wikipedia.org/wiki/Cochrane%E2%80%93Orcutt_estimation
+
+  // coefficient vector after each iteration; index 0 is the plain (untransformed) fit
+  var betas: Array[MahoutVector] = _
+
+  /** Delegates prediction to the wrapped regression model of the final iteration. */
+  def predict(drmPredictors: DrmLike[K]): DrmLike[K] = {
+    regressor.predict(drmPredictors)
+  }
+
+}
+
+/**
+ * Cochrane-Orcutt estimation: a linear regression that is repeatedly refit on
+ * quasi-differenced data to account for AR(1) serial correlation in the residuals.
+ *
+ * Recognised hyperparameters (constructor or `setHyperparameters`):
+ *  - `'regressor`  the inner [[LinearRegressorFitter]], default `OrdinaryLeastSquares`
+ *  - `'iterations` total number of fits including the initial one, default 3
+ *  - `'cacheHint`  cache hint for the lagged slices, default `CacheHint.MEMORY_ONLY`
+ *  - the standard flags read by `setStandardHyperparameters`
+ *
+ * NOTE(review): this rewrite has not been compiled or run against a Mahout
+ * backend; it needs a unit test against a reference implementation.
+ */
+class CochraneOrcutt[K](hyperparameters: (Symbol, Any)*) extends LinearRegressorFitter[K] {
+
+  // Defaults only; `setHyperparameters` below applies the constructor arguments.
+  // (The varargs arrive as a Seq, which cannot be cast to a Map -- the previous
+  // initialisers did exactly that and threw ClassCastException on construction.)
+  var regressor: LinearRegressorFitter[K] = new OrdinaryLeastSquares[K]()
+  var iterations: Int = 3
+  // For larger inputs, CacheHint.MEMORY_AND_DISK2 is recommended.
+  var cacheHint: CacheHint.CacheHint = CacheHint.MEMORY_ONLY
+
+  /**
+   * (Re)configure the fitter. Any key that is absent falls back to its default,
+   * including keys that were set by an earlier call.
+   */
+  def setHyperparameters(hyperparameters: Map[Symbol, Any] = Map('foo -> None)): Unit = {
+    setStandardHyperparameters(hyperparameters)
+    regressor = hyperparameters.get('regressor)
+      .map(_.asInstanceOf[LinearRegressorFitter[K]])
+      .getOrElse(new OrdinaryLeastSquares[K]())
+    regressor.calcStandardErrors = false
+    regressor.calcCommonStatistics = false
+    iterations = hyperparameters.get('iterations)
+      .map(_.asInstanceOf[Int])
+      .getOrElse(3)
+    cacheHint = hyperparameters.get('cacheHint)
+      .map(_.asInstanceOf[CacheHint.CacheHint])
+      .getOrElse(CacheHint.MEMORY_ONLY)
+  }
+
+  setHyperparameters(hyperparameters.toMap)
+
+  // Settings handed to the inner regressor on every fit. They are passed explicitly
+  // because OrdinaryLeastSquares.fit re-reads its flags from the fit-time
+  // hyperparameters, so flags assigned on the regressor instance are overwritten.
+  // Standard errors and common statistics are only requested on the final pass.
+  private def regressorHyperparameters(finalPass: Boolean): Seq[(Symbol, Any)] = Seq(
+    'addIntercept -> addIntercept,
+    'calcStandardErrors -> (finalPass && calcStandardErrors),
+    'calcCommonStatistics -> (finalPass && calcCommonStatistics))
+
+  // Append the constant column when `beta` carries one coefficient more than `drm`
+  // has columns (OrdinaryLeastSquares appends the intercept as the last column).
+  private def designMatrix(drm: DrmLike[K], beta: MahoutVector): DrmLike[K] =
+    if (beta.size == drm.ncol + 1) drm cbind 1 else drm
+
+  /**
+   * Run the Cochrane-Orcutt procedure.
+   *
+   * Iteration 0 fits the inner regressor on the data as given. Every further iteration
+   *  1. computes the residuals of the previous coefficients on the original data,
+   *  2. estimates rho as the no-intercept regression slope of e(t) on e(t-1),
+   *  3. refits on y(t) - rho * y(t-1) and X(t) - rho * X(t-1),
+   *  4. rescales the fitted intercept by 1 / (1 - rho) back to the original scale.
+   *
+   * @param drmFeatures predictors, one observation per row, in time order
+   * @param drmTarget single-column target, rows aligned with `drmFeatures`
+   * @param hyperparameters not read here; configure the fitter through the
+   *                        constructor or `setHyperparameters`
+   * @return a model wrapping the final regression and holding the per-iteration betas
+   */
+  def fit(drmFeatures: DrmLike[K], drmTarget: DrmLike[K], hyperparameters: (Symbol, Any)*): CochraneOrcuttModel[K] = {
+
+    require(iterations >= 1, s"'iterations must be at least 1, got ${iterations}")
+
+    val betas = new Array[MahoutVector](iterations)
+    var regressionModel: LinearRegressorModel[K] =
+      regressor.fit(drmFeatures, drmTarget, regressorHyperparameters(iterations == 1): _*)
+    betas(0) = regressionModel.beta
+    // todo add dw test option on each iteration
+
+    val n = safeToNonNegInt(drmTarget.nrow)
+    val featureCols = 0 until drmFeatures.ncol
+
+    // rows 1..n-1 are "time t", rows 0..n-2 are "time t-1"
+    val Y = drmTarget(1 until n, 0 until 1).checkpoint(cacheHint)
+    val Y_lag = drmTarget(0 until n - 1, 0 until 1).checkpoint(cacheHint)
+    val X = drmFeatures(1 until n, featureCols).checkpoint(cacheHint)
+    val X_lag = drmFeatures(0 until n - 1, featureCols).checkpoint(cacheHint)
+
+    for (i <- 1 until iterations) {
+      // residuals of the previous (original-scale) coefficients on the original data
+      val previousBeta = betas(i - 1)
+      val residuals =
+        (drmTarget - (designMatrix(drmFeatures, previousBeta) %*% previousBeta)).checkpoint(cacheHint)
+      val e = residuals(1 until n, 0 until 1)
+      val e_lag = residuals(0 until n - 1, 0 until 1)
+
+      // slope of the no-intercept regression e(t) = rho * e(t-1) + noise
+      val rho = (e_lag.t %*% e).collect(0, 0) / (e_lag.t %*% e_lag).collect(0, 0)
+
+      val drmYprime = Y - Y_lag * rho
+      val drmXprime = X - X_lag * rho
+
+      // common stats and SE are requested on the last iteration only
+      regressionModel =
+        regressor.fit(drmXprime, drmYprime, regressorHyperparameters(i == iterations - 1): _*)
+
+      val betaPrime = regressionModel.beta
+      if (addIntercept) {
+        // the transformed regression estimates b0 * (1 - rho); the intercept is the last coefficient
+        val interceptIndex = betaPrime.size - 1
+        betaPrime(interceptIndex) = betaPrime(interceptIndex) / (1 - rho)
+      }
+      betas(i) = betaPrime
+    }
+
+    val model = new CochraneOrcuttModel[K](regressionModel)
+    model.betas = betas
+    model.beta = betas(iterations - 1)
+    model.summary = (0 until iterations).map(i => s"Beta estimates on iteration " + i + ": "
+      + betas(i).toString + "\n").mkString("") + "\n\n" + "Final Model:\n\n" + regressionModel.summary
+
+    model
+  }
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/mahout/blob/9a31923e/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/LinearRegressorModel.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/LinearRegressorModel.scala b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/LinearRegressorModel.scala
new file mode 100644
index 0000000..555ee6c
--- /dev/null
+++ b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/LinearRegressorModel.scala
@@ -0,0 +1,124 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.
See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.mahout.math.algorithms.regression
+
+import org.apache.mahout.math.algorithms.regression.tests.FittnessTests
+import org.apache.mahout.math.drm._
+import org.apache.mahout.math.drm.DrmLike
+import org.apache.mahout.math.drm.RLikeDrmOps._
+import org.apache.mahout.math.scalabindings.dvec
+import org.apache.mahout.math.{Matrix, Vector => MahoutVector}
+import org.apache.mahout.math.scalabindings.RLikeOps._
+import scala.language.higherKinds
+
+/**
+ * A regression model described by a coefficient vector plus optional
+ * per-coefficient statistics filled in by [[LinearRegressorFitter]].
+ */
+trait LinearRegressorModel[K] extends RegressorModel[K] {
+
+  var beta: MahoutVector = _
+  var se: MahoutVector = _
+  var tScore: MahoutVector = _
+  var pval: MahoutVector = _
+  var degreesFreedom: Int = _
+
+}
+
+/**
+ * Shared behaviour of linear regression fitters: the standard hyperparameters and
+ * the post-processing that attaches standard errors, fit statistics and a summary.
+ */
+trait LinearRegressorFitter[K] extends RegressorFitter[K] {
+
+  var addIntercept: Boolean = _
+  var calcStandardErrors: Boolean = _
+  var calcCommonStatistics: Boolean = _
+
+  def fit(drmX: DrmLike[K],
+          drmTarget: DrmLike[K],
+          hyperparameters: (Symbol, Any)*): LinearRegressorModel[K]
+
+
+  /**
+   * Read `'calcCommonStatistics`, `'calcStandardErrors` and `'addIntercept` from the
+   * map; each one defaults to `true` when absent.
+   */
+  def setStandardHyperparameters(hyperparameters: Map[Symbol, Any] = Map('foo -> None)): Unit = {
+    calcCommonStatistics = hyperparameters.asInstanceOf[Map[Symbol, Boolean]].getOrElse('calcCommonStatistics, true)
+    calcStandardErrors = hyperparameters.asInstanceOf[Map[Symbol, Boolean]].getOrElse('calcStandardErrors, true)
+    addIntercept = hyperparameters.asInstanceOf[Map[Symbol, Boolean]].getOrElse('addIntercept, true)
+  }
+
+  /**
+   * Attach standard errors, t-scores and two-sided p-values to `model` and build its
+   * summary table; also adds the common statistics when `calcCommonStatistics` is set.
+   *
+   * @param X design matrix used for the fit (its column count is used as k)
+   * @param drmTarget target column
+   * @param drmXtXinv in-core inverse of X'X
+   * @param model model whose `beta` is already set; it is updated in place and returned
+   */
+  def calculateStandardError[M[K] <: LinearRegressorModel[K]](X: DrmLike[K],
+                                                              drmTarget: DrmLike[K],
+                                                              drmXtXinv: Matrix,
+                                                              model: M[K]): M[K] = {
+    import org.apache.mahout.math.function.Functions.SQRT
+    import org.apache.mahout.math.scalabindings.MahoutCollections._
+    var modelOut = model
+    val yhat = X %*% model.beta
+    val residuals = drmTarget - yhat
+    val ete = (residuals.t %*% residuals).collect // 1x1
+    val n = drmTarget.nrow
+    val k = safeToNonNegInt(X.ncol)
+    val invDegFreedomKindOf = 1.0 / (n - k)
+    val varCovarMatrix = invDegFreedomKindOf * ete(0,0) * drmXtXinv
+    val se = varCovarMatrix.viewDiagonal.assign(SQRT)
+    val tScore = model.beta / se
+    val tDist = new org.apache.commons.math3.distribution.TDistribution(n-k)
+    // Two-sided p-value: the tail beyond |t|, doubled. Using the signed t gave
+    // values above 1 for every negative t-score.
+    val pval = dvec(tScore.toArray.map(t => 2 * (1.0 - tDist.cumulativeProbability(math.abs(t))) ))
+    modelOut.summary = "Coef.\t\tEstimate\t\tStd. Error\t\tt-score\t\t\tPr(Beta=0)\n" +
+      (0 until k).map(i => s"X${i}\t${model.beta(i)}\t${se(i)}\t${tScore(i)}\t${pval(i)}").mkString("\n")
+
+    modelOut.se = se
+    modelOut.tScore = tScore
+    modelOut.pval = pval
+    // NOTE(review): this stores the number of columns, while the t distribution above
+    // uses n - k -- confirm which one `degreesFreedom` is meant to hold.
+    modelOut.degreesFreedom = X.ncol
+
+    if (calcCommonStatistics){
+      modelOut = calculateCommonStatistics(modelOut, drmTarget, residuals)
+    }
+    modelOut
+  }
+
+  /**
+   * Run the coefficient-of-determination and mean-square-error tests on `model`.
+   *
+   * @param model model to annotate; returned after both tests have been applied
+   * @param drmTarget target column
+   * @param residuals target minus prediction
+   */
+  def calculateCommonStatistics[M[K] <: LinearRegressorModel[K]](model: M[K],
+                                                                 drmTarget: DrmLike[K],
+                                                                 residuals: DrmLike[K]): M[K] ={
+    var modelOut = model
+    modelOut = FittnessTests.CoefficientOfDetermination(modelOut, drmTarget, residuals)
+    modelOut = FittnessTests.MeanSquareError(modelOut, residuals)
+    modelOut
+  }
+
+  /**
+   * Finish a freshly fitted model: standard errors and/or common statistics as
+   * configured, a coefficient summary, and the "(Intercept)" label for the last
+   * coefficient when `addIntercept` is set.
+   *
+   * @param model model whose `beta` is already set
+   * @param X design matrix used for the fit, including the intercept column if one was added
+   * @param drmTarget target column
+   * @param drmXtXinv in-core inverse of X'X
+   */
+  def modelPostprocessing[M[K] <: LinearRegressorModel[K]](model: M[K],
+                                                           X: DrmLike[K],
+                                                           drmTarget: DrmLike[K],
+                                                           drmXtXinv: Matrix): M[K] = {
+    var modelOut = model
+    if (calcStandardErrors) {
+      modelOut = calculateStandardError(X, drmTarget, drmXtXinv, model )
+    } else {
+      modelOut.summary = "Coef.\t\tEstimate\n" +
+        (0 until X.ncol).map(i => s"X${i}\t${modelOut.beta(i)}").mkString("\n")
+      if (calcCommonStatistics) { // we do this in calcStandard errors to avoid calculating residuals twice
+        val residuals = drmTarget - (X %*% modelOut.beta)
+        modelOut = calculateCommonStatistics(modelOut, drmTarget, residuals)
+      }
+    }
+
+    if (addIntercept) {
+      // Strings are immutable: the relabelled text has to be assigned back,
+      // otherwise the summary keeps the generic "X<n>" label for the intercept.
+      modelOut.summary = modelOut.summary.replace(s"X${X.ncol - 1}", "(Intercept)")
+    }
+    modelOut
+  }
+}
http://git-wip-us.apache.org/repos/asf/mahout/blob/9a31923e/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/OrdinaryLeastSquaresModel.scala ---------------------------------------------------------------------- diff --git a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/OrdinaryLeastSquaresModel.scala b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/OrdinaryLeastSquaresModel.scala new file mode 100644 index 0000000..d59701a --- /dev/null +++ b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/OrdinaryLeastSquaresModel.scala @@ -0,0 +1,66 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */
+
+package org.apache.mahout.math.algorithms.regression
+
+import org.apache.mahout.math.drm.RLikeDrmOps._
+import org.apache.mahout.math.drm.DrmLike
+import org.apache.mahout.math.scalabindings._
+import org.apache.mahout.math.scalabindings.RLikeOps._
+
+/**
+ * Linear model produced by [[OrdinaryLeastSquares]].
+ */
+class OrdinaryLeastSquaresModel[K]
+  extends LinearRegressorModel[K] {
+  // https://en.wikipedia.org/wiki/Ordinary_least_squares
+
+  /**
+   * Predict the target for every row of `drmPredictors`.
+   *
+   * When the model was fit with an intercept, `beta` has one entry more than the
+   * predictors have columns; the constant column is then appended here as the last
+   * column, mirroring what `OrdinaryLeastSquares.fit` does. Previously that case
+   * multiplied operands of mismatching geometry.
+   *
+   * @param drmPredictors predictors, with or without the trailing constant column
+   * @return single-column DRM of predictions
+   */
+  def predict(drmPredictors: DrmLike[K]): DrmLike[K] = {
+    val X = if (beta.size == drmPredictors.ncol + 1) drmPredictors cbind 1 else drmPredictors
+    X %*% beta
+  }
+
+}
+
+/**
+ * Ordinary least squares via the normal equations: beta = (X'X)^-1 X'y.
+ */
+class OrdinaryLeastSquares[K] extends LinearRegressorFitter[K] {
+
+
+  /**
+   * Fit the model.
+   *
+   * @param drmFeatures predictors, one observation per row
+   * @param drmTarget target column, same number of rows as `drmFeatures`
+   * @param hyperparameters standard flags, see `setStandardHyperparameters`
+   * @return the fitted model after `modelPostprocessing`
+   * @throws IllegalArgumentException if features and target differ in row count
+   */
+  def fit(drmFeatures: DrmLike[K],
+          drmTarget: DrmLike[K],
+          hyperparameters: (Symbol, Any)*): OrdinaryLeastSquaresModel[K] = {
+
+    val model = new OrdinaryLeastSquaresModel[K]()
+    setStandardHyperparameters(hyperparameters.toMap)
+
+
+    if (drmFeatures.nrow != drmTarget.nrow){
+      throw new IllegalArgumentException(s"${drmFeatures.nrow} observations in features, ${drmTarget.nrow} observations in target, must be equal.")
+    }
+
+    // the intercept, when requested, is a constant column appended as the LAST column
+    val X = if (addIntercept) drmFeatures cbind 1 else drmFeatures
+
+    // X'X is computed once and brought in-core; it was previously evaluated twice
+    // (one collect whose result was discarded, then again as the argument of solve).
+    // this fails when number of columns^2 size matrix won't fit in driver
+    val XtX = (X.t %*% X).collect
+    val drmXtXinv = solve(XtX)
+    val drmXty = (X.t %*% drmTarget).collect
+    model.beta = (drmXtXinv %*% drmXty)(::, 0)
+
+
+    this.modelPostprocessing(model, X, drmTarget, drmXtXinv)
+  }
+}
http://git-wip-us.apache.org/repos/asf/mahout/blob/9a31923e/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/RegressorModel.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/RegressorModel.scala b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/RegressorModel.scala
new file mode 100644
index 0000000..bdddb29
--- /dev/null
+++ b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/RegressorModel.scala
@@ -0,0 +1,58 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+
* or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.mahout.math.algorithms.regression
+
+import org.apache.mahout.math.algorithms.{SupervisedFitter, SupervisedModel}
+import org.apache.mahout.math.drm.DrmLike
+
+/**
+ * A supervised model that predicts a numeric target.
+ */
+trait RegressorModel[K] extends SupervisedModel[K] {
+
+  def predict(drmPredictors: DrmLike[K]): DrmLike[K]
+
+  // Common Applicable Tests- here only for convenience.
+  var mse: Double = _
+  var r2: Double = _
+
+  /**
+   * Syntactic sugar for fetching test results. Returns the stored result when one
+   * exists under `testSymbol`, otherwise `None`. (The lookup result used to be
+   * wrapped in `Some` and the Option itself cast to `T`, so `None` was never returned.)
+   *
+   * Presumably `testResults` is the Scala map declared on `SupervisedModel` --
+   * it is not visible in this file; verify that its `get` returns an `Option`.
+   *
+   * @param testSymbol - symbol of the test result to fetch, e.g. `'mse`
+   * @tparam T - the type the stored value is cast to (unchecked)
+   * @return the stored value, or `None` if no result was recorded for `testSymbol`
+   */
+  def getTestResult[T](testSymbol: Symbol): Option[T] = {
+    testResults.get(testSymbol).map(_.asInstanceOf[T])
+  }
+}
+
+/**
+ * Fitter that produces a [[RegressorModel]].
+ */
+trait RegressorFitter[K] extends SupervisedFitter[K, RegressorModel[K]] {
+
+
+  /**
+   * Fit a model, remember it in `model`, and return its predictions for `drmX`.
+   *
+   * @param drmX predictors
+   * @param drmTarget target
+   * @param hyperparameters forwarded unchanged to `fit`
+   */
+  def fitPredict(drmX: DrmLike[K],
+                 drmTarget: DrmLike[K],
+                 hyperparameters: (Symbol, Any)* ): DrmLike[K] = {
+
+    model = this.fit(drmX, drmTarget, hyperparameters: _* )
+    model.predict(drmX)
+  }
+
+  // used to store the model if `fitPredict` method called
+  var model: RegressorModel[K] = _
+
+}
http://git-wip-us.apache.org/repos/asf/mahout/blob/9a31923e/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/tests/AutocorrelationTests.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/tests/AutocorrelationTests.scala b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/tests/AutocorrelationTests.scala
new file mode 100644
index 0000000..2b16b74
--- /dev/null
+++ b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/tests/AutocorrelationTests.scala
@@ -0,0 +1,57 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.
See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.mahout.math.algorithms.regression.tests
+
+import org.apache.mahout.math.algorithms.regression.RegressorModel
+import org.apache.mahout.math.drm._
+import org.apache.mahout.math.drm.DrmLike
+import org.apache.mahout.math.drm.RLikeDrmOps._
+import org.apache.mahout.math.function.Functions.SQUARE
+import org.apache.mahout.math.scalabindings.RLikeOps._
+import scala.language.higherKinds
+
+object AutocorrelationTests {
+
+  //https://en.wikipedia.org/wiki/Durbin%E2%80%93Watson_statistic
+  /*
+  To test for positive autocorrelation at significance α, the test statistic d is compared to lower and upper critical values (dL,α and dU,α):
+      If d < dL,α, there is statistical evidence that the error terms are positively autocorrelated.
+      If d > dU,α, there is no statistical evidence that the error terms are positively autocorrelated.
+      If dL,α < d < dU,α, the test is inconclusive.
+
+      Rule of Thumb:
+       d < 2 : positive auto-correlation
+       d = 2 : no auto-correlation
+       d > 2 : negative auto-correlation
+  */
+  /**
+   * Durbin-Watson statistic: the sum of squared differences between successive
+   * residuals divided by the sum of squared residuals.
+   *
+   * The value is stored in `model.testResults` under `'durbinWatsonTestStatistic`
+   * and a line is appended to `model.summary`.
+   *
+   * @param model the model to annotate; returned after being updated
+   * @param residuals residuals in time order; only column 0 is differenced
+   */
+  def DurbinWatson[R[K] <: RegressorModel[K], K](model: R[K], residuals: DrmLike[K]): R[K] = {
+
+    val n = safeToNonNegInt(residuals.nrow)
+    val e: DrmLike[K] = residuals(1 until n , 0 until 1)
+    val e_t_1: DrmLike[K] = residuals(0 until n - 1, 0 until 1)
+    val numerator = (e - e_t_1).assign(SQUARE).colSums()
+    val denominator = residuals.assign(SQUARE).colSums()
+    val dw = numerator / denominator
+    // the tuple arrow arrived as a mis-encoded character in this message; plain `->` restores it
+    model.testResults += ('durbinWatsonTestStatistic -> dw.get(0))
+    model.summary += s"\nDurbin Watson Test Statistic: ${dw.toString}"
+    model
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/mahout/blob/9a31923e/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/tests/FittnessTests.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/tests/FittnessTests.scala b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/tests/FittnessTests.scala
new file mode 100644
index 0000000..d1dd3bb
--- /dev/null
+++ b/math-scala/src/main/scala/org/apache/mahout/math/algorithms/regression/tests/FittnessTests.scala
@@ -0,0 +1,56 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.
You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.mahout.math.algorithms.regression.tests
+
+import org.apache.mahout.math.algorithms.regression.RegressorModel
+import org.apache.mahout.math.algorithms.preprocessing.MeanCenter
+import org.apache.mahout.math.drm.DrmLike
+import org.apache.mahout.math.function.Functions.SQUARE
+import org.apache.mahout.math.scalabindings.RLikeOps._
+
+import scala.language.higherKinds
+import scala.reflect.ClassTag
+
+/**
+ * Goodness-of-fit measures. Each test stores its value on the model, records it in
+ * `model.testResults`, appends a line to `model.summary` and returns the model.
+ */
+object FittnessTests {
+
+  /**
+   * R^2 = 1 - (sum of squared residuals / sum of squared mean-centered target).
+   * See https://en.wikipedia.org/wiki/Coefficient_of_determination
+   *
+   * @param model model to annotate (`r2`, `testResults('r2)`, `summary`)
+   * @param drmTarget observed target values
+   * @param residuals target minus prediction
+   */
+  def CoefficientOfDetermination[R[K] <: RegressorModel[K], K](model: R[K],
+                                                               drmTarget: DrmLike[K],
+                                                               residuals: DrmLike[K]): R[K] = {
+    val ssResiduals = residuals.assign(SQUARE).sum
+    val centeredTarget = new MeanCenter().fitTransform(drmTarget)
+    val ssTotal = centeredTarget.assign(SQUARE).sum
+    val rSquared = 1 - (ssResiduals / ssTotal)
+    model.r2 = rSquared
+    // need setResult and setSummary method in case you change in future, also to initialize map if none exists or update value if it does
+    model.testResults += ('r2 -> rSquared)
+    model.summary += s"\nR^2: ${rSquared}"
+    model
+  }
+
+  /**
+   * Mean squared error: the sum of squared residuals divided by the number of rows.
+   * See https://en.wikipedia.org/wiki/Mean_squared_error
+   *
+   * @param model model to annotate (`mse`, `testResults('mse)`, `summary`)
+   * @param residuals target minus prediction
+   */
+  def MeanSquareError[R[K] <: RegressorModel[K], K](model: R[K], residuals: DrmLike[K]): R[K] = {
+    val meanSquaredError = residuals.assign(SQUARE).sum / residuals.nrow
+    model.mse = meanSquaredError
+    model.testResults += ('mse -> meanSquaredError)
+    model.summary += s"\nMean Squared Error: ${meanSquaredError}"
+    model
+  }
+}
http://git-wip-us.apache.org/repos/asf/mahout/blob/9a31923e/math-scala/src/test/scala/org/apache/mahout/math/algorithms/PreprocessorSuiteBase.scala ---------------------------------------------------------------------- diff --git a/math-scala/src/test/scala/org/apache/mahout/math/algorithms/PreprocessorSuiteBase.scala b/math-scala/src/test/scala/org/apache/mahout/math/algorithms/PreprocessorSuiteBase.scala new file mode 100644 index 0000000..9e8f029 --- /dev/null +++ b/math-scala/src/test/scala/org/apache/mahout/math/algorithms/PreprocessorSuiteBase.scala @@ -0,0 +1,59 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */
+
+package org.apache.mahout.math.algorithms
+
+import org.apache.mahout.math.algorithms.preprocessing.{AsFactor, AsFactorModel}
+import org.apache.mahout.math.drm.drmParallelize
+import org.apache.mahout.math.scalabindings.{dense, sparse, svec}
+import org.apache.mahout.math.scalabindings.RLikeOps._
+import org.apache.mahout.test.DistributedMahoutSuite
+import org.scalatest.{FunSuite, Matchers}
+
+/**
+ * Backend-independent tests for the preprocessing algorithms; mixed into a
+ * concrete suite that supplies the distributed context.
+ */
+trait PreprocessorSuiteBase extends DistributedMahoutSuite with Matchers {
+
+  this: FunSuite =>
+
+  test("asfactor test") {
+    val A = drmParallelize(dense(
+      (3, 2, 1, 2),
+      (0, 0, 0, 0),
+      (1, 1, 1, 1)), numPartitions = 2)
+
+    // 0 -> 2, 3 -> 5, 6 -> 9
+    val factorizer: AsFactorModel = new AsFactor().fit(A)
+
+    val factoredA = factorizer.transform(A)
+
+    println(factoredA)
+    println(factorizer.factorMap)
+    val correctAnswer = sparse(
+      svec((3 -> 1.0) :: (6 -> 1.0) :: (8 -> 1.0) :: (11 -> 1.0) :: Nil, cardinality = 12),
+      svec((0 -> 1.0) :: (4 -> 1.0) :: (7 -> 1.0) :: ( 9 -> 1.0) :: Nil, cardinality = 12),
+      svec((1 -> 1.0) :: (5 -> 1.0) :: (8 -> 1.0) :: (10 -> 1.0) :: Nil, cardinality = 12)
+    )
+
+    val myAnswer = factoredA.collect
+
+    val epsilon = 1E-6
+    // Compare element-wise: the norm of the difference is zero only when every cell
+    // matches. Comparing the two norms (signed, and twice) accepted any matrix with
+    // the same or a smaller norm, whatever the positions of its ones.
+    (myAnswer - correctAnswer).norm should be <= epsilon
+
+  }
+}
http://git-wip-us.apache.org/repos/asf/mahout/blob/9a31923e/math-scala/src/test/scala/org/apache/mahout/math/algorithms/RegressionSuiteBase.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/test/scala/org/apache/mahout/math/algorithms/RegressionSuiteBase.scala b/math-scala/src/test/scala/org/apache/mahout/math/algorithms/RegressionSuiteBase.scala
new file mode 100644
index 0000000..2bb0343
--- /dev/null
+++ b/math-scala/src/test/scala/org/apache/mahout/math/algorithms/RegressionSuiteBase.scala
@@ -0,0 +1,81 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.
See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math.algorithms
+
+import org.apache.mahout.math.algorithms.regression.OrdinaryLeastSquares
+import org.apache.mahout.math.drm._
+import org.apache.mahout.math.drm.RLikeDrmOps._
+import org.apache.mahout.math.scalabindings._
+import org.apache.mahout.math.scalabindings.RLikeOps._
+import org.apache.mahout.test.DistributedMahoutSuite
+import org.scalatest.{FunSuite, Matchers}
+
+/**
+ * Backend-independent tests for the regression fitters; mixed into a concrete
+ * suite that supplies the distributed context.
+ */
+trait RegressionSuiteBase extends DistributedMahoutSuite with Matchers {
+  this: FunSuite =>
+
+  test("ordinary least squares") {
+    /*
+    R Prototype:
+    dataM <- matrix( c(2, 2, 10.5, 10, 29.509541,
+      1, 2, 12,   12, 18.042851,
+      1, 1, 12,   13, 22.736446,
+      2, 1, 11,   13, 32.207582,
+      1, 2, 12,   11, 21.871292,
+      2, 1, 16,   8,  36.187559,
+      6, 2, 17,   1,  50.764999,
+      3, 2, 13,   7,  40.400208,
+      3, 3, 13,   4,  45.811716), nrow=9, ncol=5, byrow=TRUE)
+
+
+    X = dataM[, c(1,2,3,4)]
+    y = dataM[, c(5)]
+
+    model <- lm(y ~ X )
+    summary(model)
+
+     */
+
+    val drmData = drmParallelize(dense(
+      (2, 2, 10.5, 10, 29.509541),  // Apple Cinnamon Cheerios
+      (1, 2, 12,   12, 18.042851),  // Cap'n'Crunch
+      (1, 1, 12,   13, 22.736446),  // Cocoa Puffs
+      (2, 1, 11,   13, 32.207582),  // Froot Loops
+      (1, 2, 12,   11, 21.871292),  // Honey Graham Ohs
+      (2, 1, 16,   8,  36.187559),  // Wheaties Honey Gold
+      (6, 2, 17,   1,  50.764999),  // Cheerios
+      (3, 2, 13,   7,  40.400208),  // Clusters
+      (3, 3, 13,   4,  45.811716)), numPartitions = 2)
+
+
+    val drmX = drmData(::, 0 until 4)
+    val drmY = drmData(::, 4 until 5)
+
+    val model = new OrdinaryLeastSquares[Int]().fit(drmX, drmY, 'calcCommonStatistics -> false)
+
+    val estimate = model.beta
+    val Ranswers = dvec(-1.336265, -13.157702, -4.152654, -5.679908, 163.179329)
+
+    // The reference coefficients are rounded to 6 decimals, so the distance to them
+    // can legitimately reach about 1e-6; 1e-5 leaves room for that rounding.
+    val epsilon = 1E-5
+    // Distance between the vectors. A plain sum of the differences lets errors of
+    // opposite sign cancel and passes for any negative total.
+    (estimate - Ranswers).norm(2) should be < epsilon
+
+    // TODO add test for S.E / pvalue
+  }
+
+
+
+}
http://git-wip-us.apache.org/repos/asf/mahout/blob/9a31923e/math-scala/src/test/scala/org/apache/mahout/math/algorithms/RegressionTestsSuiteBase.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/test/scala/org/apache/mahout/math/algorithms/RegressionTestsSuiteBase.scala b/math-scala/src/test/scala/org/apache/mahout/math/algorithms/RegressionTestsSuiteBase.scala
new file mode 100644
index 0000000..1178a9b
--- /dev/null
+++ b/math-scala/src/test/scala/org/apache/mahout/math/algorithms/RegressionTestsSuiteBase.scala
@@ -0,0 +1,87 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.mahout.math.algorithms
+
+import org.apache.mahout.math.algorithms.regression.OrdinaryLeastSquares
+import org.apache.mahout.math.drm.drmParallelize
+import org.apache.mahout.math.drm.RLikeDrmOps._
+import org.apache.mahout.math.scalabindings.{`::`, dense}
+import org.apache.mahout.test.DistributedMahoutSuite
+import org.scalatest.{FunSuite, Matchers}
+
+
+/**
+ * Backend-independent tests for the regression fit statistics; mixed into a
+ * concrete suite that supplies the distributed context.
+ */
+trait RegressionTestsSuiteBase extends DistributedMahoutSuite with Matchers {
+  this: FunSuite =>
+
+  val epsilon = 1E-4
+
+  test("fittness tests") {
+    /*
+    R Prototype:
+    dataM <- matrix( c(2, 2, 10.5, 10, 29.509541,
+      1, 2, 12,   12, 18.042851,
+      1, 1, 12,   13, 22.736446,
+      2, 1, 11,   13, 32.207582,
+      1, 2, 12,   11, 21.871292,
+      2, 1, 16,   8,  36.187559,
+      6, 2, 17,   1,  50.764999,
+      3, 2, 13,   7,  40.400208,
+      3, 3, 13,   4,  45.811716), nrow=9, ncol=5, byrow=TRUE)
+
+
+    X = dataM[, c(1,2,3,4)]
+    y = dataM[, c(5)]
+
+    model <- lm(y ~ X)
+    summary(model)
+
+     */
+
+    val drmData = drmParallelize(dense(
+      (2, 2, 10.5, 10, 29.509541),  // Apple Cinnamon Cheerios
+      (1, 2, 12,   12, 18.042851),  // Cap'n'Crunch
+      (1, 1, 12,   13, 22.736446),  // Cocoa Puffs
+      (2, 1, 11,   13, 32.207582),  // Froot Loops
+      (1, 2, 12,   11, 21.871292),  // Honey Graham Ohs
+      (2, 1, 16,   8,  36.187559),  // Wheaties Honey Gold
+      (6, 2, 17,   1,  50.764999),  // Cheerios
+      (3, 2, 13,   7,  40.400208),  // Clusters
+      (3, 3, 13,   4,  45.811716)), numPartitions = 2)
+
+    val drmX = drmData(::, 0 until 4)
+    val drmY = drmData(::, 4 until 5)
+
+    val model = new OrdinaryLeastSquares[Int]().fit(drmX, drmY)
+
+    println(model.summary)
+    // Answers from running similar algorithm in R
+    val rR2 = 0.9425
+    val rMSE = 6.457157
+
+    val r2: Double = model.r2
+    val mse: Double = model.mse
+    println("R2: " + r2)
+    println("MSE: " + mse)
+    // absolute differences: a signed difference passes whenever the computed
+    // value is larger than the reference, however far off it is
+    math.abs(rR2 - r2) should be < epsilon
+    math.abs(rMSE - mse) should be < epsilon
+
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/mahout/blob/9a31923e/spark/src/test/scala/org/apache/mahout/math/algorithms/PreprocessorSuite.scala
---------------------------------------------------------------------- diff --git a/spark/src/test/scala/org/apache/mahout/math/algorithms/PreprocessorSuite.scala b/spark/src/test/scala/org/apache/mahout/math/algorithms/PreprocessorSuite.scala new file mode 100644 index 0000000..4a1f074 --- /dev/null +++ b/spark/src/test/scala/org/apache/mahout/math/algorithms/PreprocessorSuite.scala @@ -0,0 +1,24 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.mahout.math.algorithms + +import org.apache.mahout.sparkbindings.test.DistributedSparkSuite +import org.scalatest.FunSuite + +/** Concrete suite in the spark module with no body of its own: it only mixes PreprocessorSuiteBase and DistributedSparkSuite into a FunSuite, so any test cases come from those traits (presumably PreprocessorSuiteBase supplies them and DistributedSparkSuite the Spark-backed context; confirm in those traits). */ +class PreprocessorSuite extends FunSuite + with DistributedSparkSuite with PreprocessorSuiteBase \ No newline at end of file http://git-wip-us.apache.org/repos/asf/mahout/blob/9a31923e/spark/src/test/scala/org/apache/mahout/math/algorithms/RegressionSuite.scala ---------------------------------------------------------------------- diff --git a/spark/src/test/scala/org/apache/mahout/math/algorithms/RegressionSuite.scala b/spark/src/test/scala/org/apache/mahout/math/algorithms/RegressionSuite.scala new file mode 100644 index 0000000..bb99d61 --- /dev/null +++ b/spark/src/test/scala/org/apache/mahout/math/algorithms/RegressionSuite.scala @@ -0,0 +1,25 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.mahout.math.algorithms + +import org.apache.mahout.sparkbindings.test.DistributedSparkSuite +import org.scalatest.FunSuite + +/** Concrete suite in the spark module with no body of its own: it only mixes RegressionSuiteBase and DistributedSparkSuite into a FunSuite, so any test cases come from those traits (presumably DistributedSparkSuite supplies the Spark-backed context; confirm in that trait). */ +class RegressionSuite extends FunSuite + with DistributedSparkSuite with RegressionSuiteBase + http://git-wip-us.apache.org/repos/asf/mahout/blob/9a31923e/spark/src/test/scala/org/apache/mahout/math/algorithms/RegressionTestsSuite.scala ---------------------------------------------------------------------- diff --git a/spark/src/test/scala/org/apache/mahout/math/algorithms/RegressionTestsSuite.scala b/spark/src/test/scala/org/apache/mahout/math/algorithms/RegressionTestsSuite.scala new file mode 100644 index 0000000..07864f8 --- /dev/null +++ b/spark/src/test/scala/org/apache/mahout/math/algorithms/RegressionTestsSuite.scala @@ -0,0 +1,25 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.math.algorithms + +import org.apache.mahout.sparkbindings.test.DistributedSparkSuite +import org.scalatest.FunSuite + +/** Concrete suite in the spark module with no body of its own: it only mixes RegressionTestsSuiteBase and DistributedSparkSuite into a FunSuite, so any test cases come from those traits (presumably DistributedSparkSuite supplies the Spark-backed context; confirm in that trait). */ +class RegressionTestsSuite extends FunSuite + with DistributedSparkSuite with RegressionTestsSuiteBase +
