Repository: systemml Updated Branches: refs/heads/master 2610a79d2 -> 5b3b990ad
[SYSTEMML-1846] Fix transformapply w/ subset of column names This patch fixes special cases of transformapply with a transform specification based on column names, where the input data contains only a subset of the columns of the given metadata frame. Previously, the join over column names used a binary search on the metadata column names and thus mistakenly assumed that these names are sorted, which failed for certain scenarios. This patch also adds tests to better cover these scenarios. Project: http://git-wip-us.apache.org/repos/asf/systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/5b3b990a Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/5b3b990a Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/5b3b990a Branch: refs/heads/master Commit: 5b3b990ad283d28dc6b13d166311c787bff7039c Parents: 2610a79 Author: Matthias Boehm <mboe...@gmail.com> Authored: Wed Aug 16 22:32:21 2017 -0700 Committer: Matthias Boehm <mboe...@gmail.com> Committed: Wed Aug 16 22:32:21 2017 -0700 ---------------------------------------------------------------------- .../transform/encode/EncoderFactory.java | 15 +++++++- .../TransformFrameEncodeApplySubsetTest.java | 38 ++++++++++++++------ .../TransformFrameEncodeApplySubset.dml | 32 ----------------- .../TransformFrameEncodeApplySubset1.dml | 32 +++++++++++++++++ .../TransformFrameEncodeApplySubset2.dml | 32 +++++++++++++++++ 5 files changed, 106 insertions(+), 43 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/systemml/blob/5b3b990a/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderFactory.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderFactory.java b/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderFactory.java index df506e0..8cc22a8 100644 --- a/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderFactory.java +++ 
b/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderFactory.java @@ -21,6 +21,7 @@ package org.apache.sysml.runtime.transform.encode; import java.util.ArrayList; import java.util.Arrays; +import java.util.HashMap; import java.util.List; import org.apache.commons.collections.CollectionUtils; @@ -101,11 +102,16 @@ public class EncoderFactory if( !TfMetaUtils.isIDSpec(jSpec) && colnames!=null && colnames2!=null && !ArrayUtils.isEquals(colnames, colnames2) ) { + HashMap<String, Integer> colPos = getColumnPositions(colnames2); //create temporary meta frame block w/ shallow column copy FrameBlock meta2 = new FrameBlock(meta.getSchema(), colnames2); meta2.setNumRows(meta.getNumRows()); for( int i=0; i<colnames.length; i++ ) { - int pos = Arrays.binarySearch(colnames2, colnames[i]); + if( !colPos.containsKey(colnames[i]) ) { + throw new DMLRuntimeException("Column name not found in meta data: " + +colnames[i]+" (meta: "+Arrays.toString(colnames2)+")"); + } + int pos = colPos.get(colnames[i]); meta2.setColumn(i, meta.getColumn(pos)); meta2.setColumnMetadata(i, meta.getColumnMetadata(pos)); } @@ -120,4 +126,11 @@ public class EncoderFactory return encoder; } + + private static HashMap<String, Integer> getColumnPositions(String[] colnames) { + HashMap<String, Integer> ret = new HashMap<>(); + for(int i=0; i<colnames.length; i++) + ret.put(colnames[i], i); + return ret; + } } http://git-wip-us.apache.org/repos/asf/systemml/blob/5b3b990a/src/test/java/org/apache/sysml/test/integration/functions/transform/TransformFrameEncodeApplySubsetTest.java ---------------------------------------------------------------------- diff --git a/src/test/java/org/apache/sysml/test/integration/functions/transform/TransformFrameEncodeApplySubsetTest.java b/src/test/java/org/apache/sysml/test/integration/functions/transform/TransformFrameEncodeApplySubsetTest.java index b06bf92..16e1057 100644 --- 
a/src/test/java/org/apache/sysml/test/integration/functions/transform/TransformFrameEncodeApplySubsetTest.java +++ b/src/test/java/org/apache/sysml/test/integration/functions/transform/TransformFrameEncodeApplySubsetTest.java @@ -30,7 +30,9 @@ import org.apache.sysml.test.utils.TestUtils; public class TransformFrameEncodeApplySubsetTest extends AutomatedTestBase { - private final static String TEST_NAME1 = "TransformFrameEncodeApplySubset"; + private final static String TEST_NAME1 = "TransformFrameEncodeApplySubset1"; + private final static String TEST_NAME2 = "TransformFrameEncodeApplySubset2"; + private final static String TEST_DIR = "functions/transform/"; private final static String TEST_CLASS_DIR = TEST_DIR + TransformFrameEncodeApplySubsetTest.class.getSimpleName() + "/"; @@ -41,21 +43,37 @@ public class TransformFrameEncodeApplySubsetTest extends AutomatedTestBase public void setUp() { TestUtils.clearAssertionInformation(); addTestConfiguration(TEST_NAME1, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME1, new String[] { "y" }) ); + addTestConfiguration(TEST_NAME2, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME2, new String[] { "y" }) ); + } + + @Test + public void testHomesRecodeColnames1SingleNodeCSV() { + runTransformTest(TEST_NAME1, RUNTIME_PLATFORM.SINGLE_NODE, "csv", true); + } + + @Test + public void testHomesRecodeColnames1SparkCSV() { + runTransformTest(TEST_NAME1, RUNTIME_PLATFORM.SPARK, "csv", true); + } + + @Test + public void testHomesRecodeColnames1HybridCSV() { + runTransformTest(TEST_NAME1, RUNTIME_PLATFORM.HYBRID_SPARK, "csv", true); } @Test - public void testHomesRecodeColnamesSingleNodeCSV() { - runTransformTest(RUNTIME_PLATFORM.SINGLE_NODE, "csv", true); + public void testHomesRecodeColnames2SingleNodeCSV() { + runTransformTest(TEST_NAME2, RUNTIME_PLATFORM.SINGLE_NODE, "csv", true); } @Test - public void testHomesRecodeColnamesSparkCSV() { - runTransformTest(RUNTIME_PLATFORM.SPARK, "csv", true); + public void 
testHomesRecodeColnames2SparkCSV() { + runTransformTest(TEST_NAME2, RUNTIME_PLATFORM.SPARK, "csv", true); } @Test - public void testHomesRecodeColnamesHybridCSV() { - runTransformTest(RUNTIME_PLATFORM.HYBRID_SPARK, "csv", true); + public void testHomesRecodeColnames2HybridCSV() { + runTransformTest(TEST_NAME2, RUNTIME_PLATFORM.HYBRID_SPARK, "csv", true); } @@ -65,7 +83,7 @@ public class TransformFrameEncodeApplySubsetTest extends AutomatedTestBase * @param ofmt * @param dataset */ - private void runTransformTest(RUNTIME_PLATFORM rt, String ofmt, boolean colnames) + private void runTransformTest(String testname, RUNTIME_PLATFORM rt, String ofmt, boolean colnames) { //set runtime platform RUNTIME_PLATFORM rtold = rtplatform; @@ -80,10 +98,10 @@ public class TransformFrameEncodeApplySubsetTest extends AutomatedTestBase try { - getAndLoadTestConfiguration(TEST_NAME1); + getAndLoadTestConfiguration(testname); String HOME = SCRIPT_DIR + TEST_DIR; - fullDMLScriptName = HOME + TEST_NAME1 + ".dml"; + fullDMLScriptName = HOME + testname + ".dml"; programArgs = new String[]{"-explain", "recompile_hops", "-args", HOME + "input/" + DATASET1, output("R") }; http://git-wip-us.apache.org/repos/asf/systemml/blob/5b3b990a/src/test/scripts/functions/transform/TransformFrameEncodeApplySubset.dml ---------------------------------------------------------------------- diff --git a/src/test/scripts/functions/transform/TransformFrameEncodeApplySubset.dml b/src/test/scripts/functions/transform/TransformFrameEncodeApplySubset.dml deleted file mode 100644 index 1e55af4..0000000 --- a/src/test/scripts/functions/transform/TransformFrameEncodeApplySubset.dml +++ /dev/null @@ -1,32 +0,0 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -F = read($1, data_type="frame", format="csv"); - -spec = "{ids: false, recode: [ zipcode, district, view ]}"; -[X, M] = transformencode(target=F, spec=spec); - -spec2 = "{ids: false, recode: [ district ]}"; -X2 = transformapply(target=F[,2], spec=spec2, meta=M); - -R = as.matrix(sum(X[,2]==X2)); - -write(R, $2); http://git-wip-us.apache.org/repos/asf/systemml/blob/5b3b990a/src/test/scripts/functions/transform/TransformFrameEncodeApplySubset1.dml ---------------------------------------------------------------------- diff --git a/src/test/scripts/functions/transform/TransformFrameEncodeApplySubset1.dml b/src/test/scripts/functions/transform/TransformFrameEncodeApplySubset1.dml new file mode 100644 index 0000000..1e55af4 --- /dev/null +++ b/src/test/scripts/functions/transform/TransformFrameEncodeApplySubset1.dml @@ -0,0 +1,32 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +F = read($1, data_type="frame", format="csv"); + +spec = "{ids: false, recode: [ zipcode, district, view ]}"; +[X, M] = transformencode(target=F, spec=spec); + +spec2 = "{ids: false, recode: [ district ]}"; +X2 = transformapply(target=F[,2], spec=spec2, meta=M); + +R = as.matrix(sum(X[,2]==X2)); + +write(R, $2); http://git-wip-us.apache.org/repos/asf/systemml/blob/5b3b990a/src/test/scripts/functions/transform/TransformFrameEncodeApplySubset2.dml ---------------------------------------------------------------------- diff --git a/src/test/scripts/functions/transform/TransformFrameEncodeApplySubset2.dml b/src/test/scripts/functions/transform/TransformFrameEncodeApplySubset2.dml new file mode 100644 index 0000000..d586e11 --- /dev/null +++ b/src/test/scripts/functions/transform/TransformFrameEncodeApplySubset2.dml @@ -0,0 +1,32 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +F = read($1, data_type="frame", format="csv"); + +spec = "{ids: false, recode: [ zipcode, district, view ]}"; +[X, M] = transformencode(target=F, spec=spec); + +spec2 = "{ids: false, recode: [ zipcode ]}"; +X2 = transformapply(target=F[,1], spec=spec2, meta=M); + +R = as.matrix(sum(X[,1]==X2)); + +write(R, $2);