This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git


The following commit(s) were added to refs/heads/master by this push:
     new bd1c7c9  [SYSTEMDS-3092] New builtin function selByVarThresh (feature 
selection)
bd1c7c9 is described below

commit bd1c7c95f26494f9285d6b9ed2aaa595ce4f5b24
Author: Matthias Boehm <[email protected]>
AuthorDate: Wed Aug 11 23:41:50 2021 +0200

    [SYSTEMDS-3092] New builtin function selByVarThresh (feature selection)
    
    Small utility builtin function for feature selection that drops features
    whose column variance is at or below a threshold. By default (thresh = 0)
    we drop constant features, which are not useful for model training (other
    than as an explicit intercept).
---
 scripts/builtin/selectByVarThresh.dml              | 28 ++++++++++++++++++++++
 .../java/org/apache/sysds/common/Builtins.java     |  1 +
 2 files changed, 29 insertions(+)

diff --git a/scripts/builtin/selectByVarThresh.dml 
b/scripts/builtin/selectByVarThresh.dml
new file mode 100644
index 0000000..a07ce35
--- /dev/null
+++ b/scripts/builtin/selectByVarThresh.dml
@@ -0,0 +1,28 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+m_selectByVarThresh = function(Matrix[Double] X, Double thresh = 0)
+  return (Matrix[Double] Xp, Matrix[Double] I)
+{
+  # I marks columns of X with variance > thresh; Xp keeps only those columns (default thresh=0 drops constant features)
+  I = (colVars(X) > thresh);
+  Xp = removeEmpty(target=X, margin="cols", select=I);
+}
diff --git a/src/main/java/org/apache/sysds/common/Builtins.java 
b/src/main/java/org/apache/sysds/common/Builtins.java
index 7bb386a..8ea0bd6 100644
--- a/src/main/java/org/apache/sysds/common/Builtins.java
+++ b/src/main/java/org/apache/sysds/common/Builtins.java
@@ -232,6 +232,7 @@ public enum Builtins {
        ROWVAR("rowVars", false),
        SAMPLE("sample", false),
        SD("sd", false),
+       SELVARTHRESH("selectByVarThresh", true),
        SEQ("seq", false),
        SHERLOCK("sherlock", true),
        SHERLOCKPREDICT("sherlockPredict", true),

Reply via email to