This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git


The following commit(s) were added to refs/heads/main by this push:
     new bd68831  [SYSTEMDS-3261] Extended min-max normalization built-in 
functions
bd68831 is described below

commit bd688311b262bf759c6efa68999fae13d6126a7d
Author: Matthias Boehm <[email protected]>
AuthorDate: Mon Dec 27 21:19:22 2021 +0100

    [SYSTEMDS-3261] Extended min-max normalization built-in functions
    
    This patch adds a normalizeApply function, documentation, and
    extended tests for min-max normalization (which is necessary for our
    TPCx-AI implementation).
---
 scripts/builtin/normalize.dml                      | 25 +++++++++++--
 scripts/builtin/normalizeApply.dml                 | 43 ++++++++++++++++++++++
 .../java/org/apache/sysds/common/Builtins.java     |  1 +
 .../builtin/part2/BuiltinNormalizeTest.java        | 43 +++++++++++++++++++---
 .../scripts/functions/builtin/normalizeAll.dml     |  9 +++--
 5 files changed, 108 insertions(+), 13 deletions(-)

diff --git a/scripts/builtin/normalize.dml b/scripts/builtin/normalize.dml
index ac29667..e2a32be 100644
--- a/scripts/builtin/normalize.dml
+++ b/scripts/builtin/normalize.dml
@@ -19,7 +19,26 @@
 #
 #-------------------------------------------------------------
 
-m_normalize = function(Matrix[Double] X) return (Matrix[Double] Y) {
-  # normalize features to range [0,1]
-  Y = (X - colMins(X)) / (colMaxs(X) - colMins(X));
+# Min-max normalization (a.k.a. min-max scaling) to range [0,1]. For matrices 
+# of positive values, this normalization preserves the input sparsity.
+#
+# ------------------------------------------------------------------------------
+# NAME     TYPE     DEFAULT   MEANING
+# ------------------------------------------------------------------------------
+# X        Matrix    ---      Input feature matrix of shape n-by-m
+# ------------------------------------------------------------------------------
+# Y        Matrix    ---      Modified output feature matrix of shape n-by-m
+# cmin     Matrix    ---      Column minima of shape 1-by-m
+# cmax     Matrix    ---      Column maxima of shape 1-by-m
+# ------------------------------------------------------------------------------
+
+
+m_normalize = function(Matrix[Double] X)
+  return (Matrix[Double] Y, Matrix[Double] cmin, Matrix[Double] cmax)
+{
+  # compute feature ranges for transformations
+  cmin = colMins(X);
+  cmax = colMaxs(X);
+       # normalize features to range [0,1]
+  Y = normalizeApply(X, cmin, cmax);
 }
diff --git a/scripts/builtin/normalizeApply.dml 
b/scripts/builtin/normalizeApply.dml
new file mode 100644
index 0000000..07fad33
--- /dev/null
+++ b/scripts/builtin/normalizeApply.dml
@@ -0,0 +1,43 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# Min-max normalization (a.k.a. min-max scaling) to range [0,1], given 
+# existing min-max ranges. For matrices of positive values, this normalization 
+# preserves the input sparsity. The validity of the provided min-max range
+# and post-processing is under control of the caller. 
+#
+# ------------------------------------------------------------------------------
+# NAME     TYPE     DEFAULT   MEANING
+# ------------------------------------------------------------------------------
+# X        Matrix    ---      Input feature matrix of shape n-by-m
+# cmin     Matrix    ---      Column minima of shape 1-by-m
+# cmax     Matrix    ---      Column maxima of shape 1-by-m
+# ------------------------------------------------------------------------------
+# Y        Matrix    ---      Modified output feature matrix of shape n-by-m
+# ------------------------------------------------------------------------------
+
+
+m_normalizeApply = function(Matrix[Double] X, Matrix[Double] cmin, 
Matrix[Double] cmax)
+  return (Matrix[Double] Y)
+{
+  # normalize features to given range ([0,1] if indeed min/max)
+  Y = (X - cmin) / (cmax - cmin);
+}
diff --git a/src/main/java/org/apache/sysds/common/Builtins.java 
b/src/main/java/org/apache/sysds/common/Builtins.java
index f44cf42..58fced1 100644
--- a/src/main/java/org/apache/sysds/common/Builtins.java
+++ b/src/main/java/org/apache/sysds/common/Builtins.java
@@ -205,6 +205,7 @@ public enum Builtins {
        NAIVEBAYESPREDICT("naiveBayesPredict", true, false),
        NCOL("ncol", false),
        NORMALIZE("normalize", true),
+       NORMALIZEAPPLY("normalizeApply", true),
        NROW("nrow", false),
        OUTER("outer", false),
        OUTLIER("outlier", true, false), //TODO parameterize opposite
diff --git 
a/src/test/java/org/apache/sysds/test/functions/builtin/part2/BuiltinNormalizeTest.java
 
b/src/test/java/org/apache/sysds/test/functions/builtin/part2/BuiltinNormalizeTest.java
index f87f2ce..6bc7028 100644
--- 
a/src/test/java/org/apache/sysds/test/functions/builtin/part2/BuiltinNormalizeTest.java
+++ 
b/src/test/java/org/apache/sysds/test/functions/builtin/part2/BuiltinNormalizeTest.java
@@ -21,17 +21,22 @@ package org.apache.sysds.test.functions.builtin.part2;
 
 import java.util.HashMap;
 
+import org.junit.Assert;
 import org.junit.Test;
+
 import org.apache.sysds.common.Types.ExecMode;
 import org.apache.sysds.common.Types.ExecType;
 import org.apache.sysds.runtime.matrix.data.MatrixValue.CellIndex;
 import org.apache.sysds.test.AutomatedTestBase;
 import org.apache.sysds.test.TestConfiguration;
 import org.apache.sysds.test.TestUtils;
+import org.apache.sysds.utils.Statistics;
 
 public class BuiltinNormalizeTest extends AutomatedTestBase 
 {
        private final static String TEST_NAME = "normalize";
+       private final static String TEST_NAME2 = "normalizeAll";
+       
        private final static String TEST_DIR = "functions/builtin/";
        private static final String TEST_CLASS_DIR = TEST_DIR + 
BuiltinNormalizeTest.class.getSimpleName() + "/";
        
@@ -48,25 +53,45 @@ public class BuiltinNormalizeTest extends AutomatedTestBase
 
        @Test
        public void testNormalizeMatrixDenseCP() {
-               runNormalizeTest(false, false, ExecType.CP);
+               runNormalizeTest(TEST_NAME, false, ExecType.CP);
        }
        
        @Test
        public void testNormalizeMatrixSparseCP() {
-               runNormalizeTest(false, true, ExecType.CP);
+               runNormalizeTest(TEST_NAME, true, ExecType.CP);
        }
        
        @Test
        public void testNormalizeMatrixDenseSP() {
-               runNormalizeTest(false, false, ExecType.SPARK);
+               runNormalizeTest(TEST_NAME, false, ExecType.SPARK);
        }
        
        @Test
        public void testNormalizeMatrixSparseSP() {
-               runNormalizeTest(false, true, ExecType.SPARK);
+               runNormalizeTest(TEST_NAME, true, ExecType.SPARK);
+       }
+       
+       @Test
+       public void testNormalize2MatrixDenseCP() {
+               runNormalizeTest(TEST_NAME2, false, ExecType.CP);
+       }
+       
+       @Test
+       public void testNormalize2MatrixSparseCP() {
+               runNormalizeTest(TEST_NAME2, true, ExecType.CP);
        }
        
-       private void runNormalizeTest(boolean scalar, boolean sparse, ExecType 
instType)
+       @Test
+       public void testNormalize2MatrixDenseSP() {
+               runNormalizeTest(TEST_NAME2, false, ExecType.SPARK);
+       }
+       
+       @Test
+       public void testNormalize2MatrixSparseSP() {
+               runNormalizeTest(TEST_NAME2, true, ExecType.SPARK);
+       }
+       
+       private void runNormalizeTest(String testname, boolean sparse, ExecType 
instType)
        {
                ExecMode platformOld = setExecMode(instType);
                
@@ -76,7 +101,7 @@ public class BuiltinNormalizeTest extends AutomatedTestBase
                        double sparsity = sparse ? spSparse : spDense;
                        
                        String HOME = SCRIPT_DIR + TEST_DIR;
-                       fullDMLScriptName = HOME + TEST_NAME + ".dml";
+                       fullDMLScriptName = HOME + testname + ".dml";
                        programArgs = new String[]{"-args", input("A"), 
output("B") };
                        fullRScriptName = HOME + TEST_NAME + ".R";
                        rCmd = "Rscript" + " " + fullRScriptName + " " + 
inputDir() + " " + expectedDir();
@@ -92,6 +117,12 @@ public class BuiltinNormalizeTest extends AutomatedTestBase
                        HashMap<CellIndex, Double> dmlfile = 
readDMLMatrixFromOutputDir("B");
                        HashMap<CellIndex, Double> rfile  = 
readRMatrixFromExpectedDir("B");
                        TestUtils.compareMatrices(dmlfile, rfile, eps, 
"Stat-DML", "Stat-R");
+               
+                       //check number of compiled Spark instructions
+                       if( instType == ExecType.CP ) {
+                               Assert.assertEquals(1, 
Statistics.getNoOfCompiledSPInst()); //reblock
+                               Assert.assertEquals(0, 
Statistics.getNoOfExecutedSPInst());
+                       }
                }
                finally {
                        rtplatform = platformOld;
diff --git a/scripts/builtin/normalize.dml 
b/src/test/scripts/functions/builtin/normalizeAll.dml
similarity index 85%
copy from scripts/builtin/normalize.dml
copy to src/test/scripts/functions/builtin/normalizeAll.dml
index ac29667..a1c7527 100644
--- a/scripts/builtin/normalize.dml
+++ b/src/test/scripts/functions/builtin/normalizeAll.dml
@@ -19,7 +19,8 @@
 #
 #-------------------------------------------------------------
 
-m_normalize = function(Matrix[Double] X) return (Matrix[Double] Y) {
-  # normalize features to range [0,1]
-  Y = (X - colMins(X)) / (colMaxs(X) - colMins(X));
-}
+X = read($1);
+[Y, mins, maxs] = normalize(X);
+Y = normalizeApply(X, mins, maxs);
+
+write(Y, $2);

Reply via email to