This is an automated email from the ASF dual-hosted git repository. zaleslaw pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/ignite.git
The following commit(s) were added to refs/heads/master by this push: new fb2e1e2 IGNITE-10697: [ML] Add Frequency Encoding (#6784) fb2e1e2 is described below commit fb2e1e28818d3d96a7a9fcc86043d28f28e76e47 Author: Alexey Zinoviev <zaleslaw....@gmail.com> AuthorDate: Fri Aug 16 16:21:55 2019 +0300 IGNITE-10697: [ML] Add Frequency Encoding (#6784) --- .../ml/preprocessing/encoding/EncoderTrainer.java | 70 ++++++++++++----- .../ml/preprocessing/encoding/EncoderType.java | 5 +- .../frequency/FrequencyEncoderPreprocessor.java | 89 ++++++++++++++++++++++ .../package-info.java} | 15 +--- .../ml/preprocessing/PreprocessingTestSuite.java | 4 +- .../preprocessing/encoding/EncoderTrainerTest.java | 31 ++++++++ .../encoding/FrequencyEncoderPreprocessorTest.java | 82 ++++++++++++++++++++ 7 files changed, 264 insertions(+), 32 deletions(-) diff --git a/modules/ml/src/main/java/org/apache/ignite/ml/preprocessing/encoding/EncoderTrainer.java b/modules/ml/src/main/java/org/apache/ignite/ml/preprocessing/encoding/EncoderTrainer.java index 2e6442d..5703ea0 100644 --- a/modules/ml/src/main/java/org/apache/ignite/ml/preprocessing/encoding/EncoderTrainer.java +++ b/modules/ml/src/main/java/org/apache/ignite/ml/preprocessing/encoding/EncoderTrainer.java @@ -32,6 +32,7 @@ import org.apache.ignite.ml.dataset.primitive.context.EmptyContext; import org.apache.ignite.ml.environment.LearningEnvironmentBuilder; import org.apache.ignite.ml.preprocessing.PreprocessingTrainer; import org.apache.ignite.ml.preprocessing.Preprocessor; +import org.apache.ignite.ml.preprocessing.encoding.frequency.FrequencyEncoderPreprocessor; import org.apache.ignite.ml.preprocessing.encoding.onehotencoder.OneHotEncoderPreprocessor; import org.apache.ignite.ml.preprocessing.encoding.stringencoder.StringEncoderPreprocessor; import org.apache.ignite.ml.structures.LabeledVector; @@ -71,19 +72,19 @@ public class EncoderTrainer<K, V> implements PreprocessingTrainer<K, V> { while (upstream.hasNext()) { UpstreamEntry<K, V> entity = upstream.next(); LabeledVector<Double> row = basePreprocessor.apply(entity.getKey(), entity.getValue()); - categoryFrequencies = calculateFrequencies(row, categoryFrequencies); + categoryFrequencies = updateFrequenciesForNextRow(row, categoryFrequencies); } return new EncoderPartitionData() .withCategoryFrequencies(categoryFrequencies); } )) { - Map<String, Integer>[] encodingValues = calculateEncodingValuesByFrequencies(dataset); - switch (encoderType) { case ONE_HOT_ENCODER: - return new OneHotEncoderPreprocessor<>(encodingValues, basePreprocessor, handledIndices); + return new OneHotEncoderPreprocessor<>(calculateEncodingValuesByFrequencies(dataset), basePreprocessor, handledIndices); case STRING_ENCODER: - return new StringEncoderPreprocessor<>(encodingValues, basePreprocessor, handledIndices); + return new StringEncoderPreprocessor<>(calculateEncodingValuesByFrequencies(dataset), basePreprocessor, handledIndices); + case FREQUENCY_ENCODER: + return new FrequencyEncoderPreprocessor<>(calculateEncodingFrequencies(dataset), basePreprocessor, handledIndices); default: throw new IllegalStateException("Define the type of the resulting prerocessor."); } @@ -94,14 +95,38 @@ public class EncoderTrainer<K, V> implements PreprocessingTrainer<K, V> { } /** - * Calculates the encoding values values by frequencies keeping in the given dataset. + * Calculates encoding frequencies as frequency divided on amount of rows in dataset. * - * @param dataset The dataset of frequencies for each feature aggregated in each partition. - * @return Encoding values for each feature. + * NOTE: The amount of rows is calculated as sum of absolute frequencies. + * + * @param dataset Dataset. + * @return Encoding frequency for each feature. */ - private Map<String, Integer>[] calculateEncodingValuesByFrequencies( - Dataset<EmptyContext, EncoderPartitionData> dataset) { - Map<String, Integer>[] frequencies = dataset.compute( + private Map<String, Double>[] calculateEncodingFrequencies(Dataset<EmptyContext, EncoderPartitionData> dataset) { + Map<String, Integer>[] frequencies = calculateFrequencies(dataset); + + Map<String, Double>[] res = new Map[frequencies.length]; + + int[] counters = new int[frequencies.length]; + + for (int i = 0; i < frequencies.length; i++) { + counters[i] = frequencies[i].values().stream().reduce(0, Integer::sum); + int locI = i; + res[locI] = new HashMap<>(); + frequencies[i].forEach((k, v) -> res[locI].put(k, (double)v / counters[locI])); + } + + return res; + } + + /** + * Calculates frequencies for each feature. + * + * @param dataset Dataset. + * @return Frequency for each feature. + */ + private Map<String, Integer>[] calculateFrequencies(Dataset<EmptyContext, EncoderPartitionData> dataset) { + return dataset.compute( EncoderPartitionData::categoryFrequencies, (a, b) -> { if (a == null) @@ -121,8 +146,19 @@ public class EncoderTrainer<K, V> implements PreprocessingTrainer<K, V> { return b; } ); + } + + /** + * Calculates the encoding values values by frequencies keeping in the given dataset. + * + * @param dataset The dataset of frequencies for each feature aggregated in each partition. + * @return Encoding values for each feature. + */ + private Map<String, Integer>[] calculateEncodingValuesByFrequencies( + Dataset<EmptyContext, EncoderPartitionData> dataset) { + Map<String, Integer>[] frequencies = calculateFrequencies(dataset); - Map<String, Integer>[] res = new HashMap[frequencies.length]; + Map<String, Integer>[] res = new Map[frequencies.length]; for (int i = 0; i < frequencies.length; i++) if (handledIndices.contains(i)) @@ -140,10 +176,7 @@ public class EncoderTrainer<K, V> implements PreprocessingTrainer<K, V> { private Map<String, Integer> transformFrequenciesToEncodingValues(Map<String, Integer> frequencies) { Comparator<Map.Entry<String, Integer>> comp; - if (encoderSortingStgy.equals(EncoderSortingStrategy.FREQUENCY_DESC)) - comp = Map.Entry.comparingByValue(); - else - comp = Collections.reverseOrder(Map.Entry.comparingByValue()); + comp = encoderSortingStgy == EncoderSortingStrategy.FREQUENCY_DESC ? Map.Entry.comparingByValue() : Collections.reverseOrder(Map.Entry.comparingByValue()); final HashMap<String, Integer> resMap = frequencies.entrySet() .stream() @@ -166,7 +199,8 @@ public class EncoderTrainer<K, V> implements PreprocessingTrainer<K, V> { * @param categoryFrequencies Holds the frequencies of categories by values and features. * @return Updated frequencies by values and features. */ - private Map<String, Integer>[] calculateFrequencies(LabeledVector row, Map<String, Integer>[] categoryFrequencies) { + private Map<String, Integer>[] updateFrequenciesForNextRow(LabeledVector row, + Map<String, Integer>[] categoryFrequencies) { if (categoryFrequencies == null) categoryFrequencies = initializeCategoryFrequencies(row); else @@ -206,7 +240,7 @@ public class EncoderTrainer<K, V> implements PreprocessingTrainer<K, V> { * @return The array contains not null values for handled indices. */ @NotNull private Map<String, Integer>[] initializeCategoryFrequencies(LabeledVector row) { - Map<String, Integer>[] categoryFrequencies = new HashMap[row.size()]; + Map<String, Integer>[] categoryFrequencies = new Map[row.size()]; for (int i = 0; i < categoryFrequencies.length; i++) if (handledIndices.contains(i)) diff --git a/modules/ml/src/main/java/org/apache/ignite/ml/preprocessing/encoding/EncoderType.java b/modules/ml/src/main/java/org/apache/ignite/ml/preprocessing/encoding/EncoderType.java index 79e216c..2a35958 100644 --- a/modules/ml/src/main/java/org/apache/ignite/ml/preprocessing/encoding/EncoderType.java +++ b/modules/ml/src/main/java/org/apache/ignite/ml/preprocessing/encoding/EncoderType.java @@ -27,5 +27,8 @@ public enum EncoderType { ONE_HOT_ENCODER, /** String encoder. */ - STRING_ENCODER + STRING_ENCODER, + + /** Frequency encoder. */ + FREQUENCY_ENCODER } diff --git a/modules/ml/src/main/java/org/apache/ignite/ml/preprocessing/encoding/frequency/FrequencyEncoderPreprocessor.java b/modules/ml/src/main/java/org/apache/ignite/ml/preprocessing/encoding/frequency/FrequencyEncoderPreprocessor.java new file mode 100644 index 0000000..533581e --- /dev/null +++ b/modules/ml/src/main/java/org/apache/ignite/ml/preprocessing/encoding/frequency/FrequencyEncoderPreprocessor.java @@ -0,0 +1,89 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.ignite.ml.preprocessing.encoding.frequency; + +import java.util.Map; +import java.util.Set; +import org.apache.ignite.ml.math.exceptions.preprocessing.UnknownCategorialFeatureValue; +import org.apache.ignite.ml.math.primitives.vector.VectorUtils; +import org.apache.ignite.ml.preprocessing.Preprocessor; +import org.apache.ignite.ml.preprocessing.encoding.EncoderPreprocessor; +import org.apache.ignite.ml.structures.LabeledVector; + +/** + * Preprocessing function that makes Frequency encoding. + * + * The Frequency Encoder Preprocessor encodes string values (categories) to double values + * in range [0.0, 1], where the value will be presented as a fraction of all the labels. + * <p> + * This preprocessor can transform multiple columns which indices are handled during training process. + * These indexes could be defined via .withEncodedFeature(featureIndex) call. + * </p> + * <p> + * NOTE: it doesn’t add new column but change data in-place. + * </p> + * + * @param <K> Type of a key in {@code upstream} data. + * @param <V> Type of a value in {@code upstream} data. + */ +public class FrequencyEncoderPreprocessor<K, V> extends EncoderPreprocessor<K, V> { + /** */ + protected static final long serialVersionUID = 6237711236382623488L; + + /** Filling values. */ + protected final Map<String, Double>[] encodingFrequencies; + + /** + * Constructs a new instance of Frequency Encoder preprocessor. + * + * @param basePreprocessor Base preprocessor. + * @param handledIndices Handled indices. + */ + public FrequencyEncoderPreprocessor(Map<String, Double>[] encodingFrequencies, + Preprocessor<K, V> basePreprocessor, Set<Integer> handledIndices) { + super(null, basePreprocessor, handledIndices); + this.encodingFrequencies = encodingFrequencies; + } + + /** + * Applies this preprocessor. + * + * @param k Key. + * @param v Value. + * @return Preprocessed row. + */ + @Override public LabeledVector apply(K k, V v) { + LabeledVector tmp = basePreprocessor.apply(k, v); + double[] res = new double[tmp.size()]; + + for (int i = 0; i < res.length; i++) { + Object tmpObj = tmp.getRaw(i); + if (handledIndices.contains(i)) { + if (tmpObj.equals(Double.NaN) && encodingFrequencies[i].containsKey(KEY_FOR_NULL_VALUES)) + res[i] = encodingValues[i].get(KEY_FOR_NULL_VALUES); + else if (encodingFrequencies[i].containsKey(tmpObj)) + res[i] = encodingFrequencies[i].get(tmpObj); + else + throw new UnknownCategorialFeatureValue(tmpObj.toString()); + } + else + res[i] = (double)tmpObj; + } + return new LabeledVector(VectorUtils.of(res), tmp.label()); + } +} diff --git a/modules/ml/src/main/java/org/apache/ignite/ml/preprocessing/encoding/EncoderType.java b/modules/ml/src/main/java/org/apache/ignite/ml/preprocessing/encoding/frequency/package-info.java similarity index 73% copy from modules/ml/src/main/java/org/apache/ignite/ml/preprocessing/encoding/EncoderType.java copy to modules/ml/src/main/java/org/apache/ignite/ml/preprocessing/encoding/frequency/package-info.java index 79e216c..2168750 100644 --- a/modules/ml/src/main/java/org/apache/ignite/ml/preprocessing/encoding/EncoderType.java +++ b/modules/ml/src/main/java/org/apache/ignite/ml/preprocessing/encoding/frequency/package-info.java @@ -15,17 +15,8 @@ * limitations under the License. */ -package org.apache.ignite.ml.preprocessing.encoding; - /** - * Describes Encoder preprocessor types to define resulting model in EncoderTrainer. - * - * @see EncoderTrainer + * <!-- Package description. --> + * Contains frequency encoding preprocessor. */ -public enum EncoderType { - /** One hot encoder. */ - ONE_HOT_ENCODER, - - /** String encoder. */ - STRING_ENCODER -} +package org.apache.ignite.ml.preprocessing.encoding.frequency; diff --git a/modules/ml/src/test/java/org/apache/ignite/ml/preprocessing/PreprocessingTestSuite.java b/modules/ml/src/test/java/org/apache/ignite/ml/preprocessing/PreprocessingTestSuite.java index 7b3d5fc..1822704 100644 --- a/modules/ml/src/test/java/org/apache/ignite/ml/preprocessing/PreprocessingTestSuite.java +++ b/modules/ml/src/test/java/org/apache/ignite/ml/preprocessing/PreprocessingTestSuite.java @@ -19,9 +19,10 @@ package org.apache.ignite.ml.preprocessing; import org.apache.ignite.ml.preprocessing.binarization.BinarizationPreprocessorTest; import org.apache.ignite.ml.preprocessing.binarization.BinarizationTrainerTest; +import org.apache.ignite.ml.preprocessing.encoding.EncoderTrainerTest; +import org.apache.ignite.ml.preprocessing.encoding.FrequencyEncoderPreprocessorTest; import org.apache.ignite.ml.preprocessing.encoding.OneHotEncoderPreprocessorTest; import org.apache.ignite.ml.preprocessing.encoding.StringEncoderPreprocessorTest; -import org.apache.ignite.ml.preprocessing.encoding.EncoderTrainerTest; import org.apache.ignite.ml.preprocessing.imputing.ImputerPreprocessorTest; import org.apache.ignite.ml.preprocessing.imputing.ImputerTrainerTest; import org.apache.ignite.ml.preprocessing.minmaxscaling.MinMaxScalerPreprocessorTest; @@ -44,6 +45,7 @@ import org.junit.runners.Suite; ImputerTrainerTest.class, EncoderTrainerTest.class, OneHotEncoderPreprocessorTest.class, + FrequencyEncoderPreprocessorTest.class, StringEncoderPreprocessorTest.class, NormalizationTrainerTest.class, NormalizationPreprocessorTest.class diff --git a/modules/ml/src/test/java/org/apache/ignite/ml/preprocessing/encoding/EncoderTrainerTest.java b/modules/ml/src/test/java/org/apache/ignite/ml/preprocessing/encoding/EncoderTrainerTest.java index 6fb760e..bc75647 100644 --- a/modules/ml/src/test/java/org/apache/ignite/ml/preprocessing/encoding/EncoderTrainerTest.java +++ b/modules/ml/src/test/java/org/apache/ignite/ml/preprocessing/encoding/EncoderTrainerTest.java @@ -161,4 +161,35 @@ public class EncoderTrainerTest extends TrainerTest { assertArrayEquals(new double[] {2.0, 0.0}, preprocessor.apply(7, new DenseVector(new Serializable[]{"Monday", "September"})).features().asArray(), 1e-8); } + + /** Tests {@code fit()} method. */ + @Test + public void testFitOnStringCategorialFeaturesWithFrequencyEncoding() { + Map<Integer, Vector> data = new HashMap<>(); + data.put(1, new DenseVector(new Serializable[] {"Monday", "September"})); + data.put(2, new DenseVector(new Serializable[] {"Monday", "August"})); + data.put(3, new DenseVector(new Serializable[] {"Monday", "August"})); + data.put(4, new DenseVector(new Serializable[] {"Friday", "June"})); + data.put(5, new DenseVector(new Serializable[] {"Friday", "June"})); + data.put(6, new DenseVector(new Serializable[] {"Sunday", "August"})); + + final Vectorizer<Integer, Vector, Integer, Double> vectorizer = new DummyVectorizer<>(0, 1); + + DatasetBuilder<Integer, Vector> datasetBuilder = new LocalDatasetBuilder<>(data, parts); + + EncoderTrainer<Integer, Vector> strEncoderTrainer = new EncoderTrainer<Integer, Vector>() + .withEncoderType(EncoderType.FREQUENCY_ENCODER) + .withEncodedFeature(0) + .withEncodedFeature(1); + + EncoderPreprocessor<Integer, Vector> preprocessor = strEncoderTrainer.fit( + TestUtils.testEnvBuilder(), + datasetBuilder, + vectorizer + ); + + assertArrayEquals(new double[] {0.5, 0.166}, preprocessor.apply(7, new DenseVector(new Serializable[] {"Monday", "September"})).features().asArray(), 0.1); + assertArrayEquals(new double[] {0.33, 0.5}, preprocessor.apply(7, new DenseVector(new Serializable[] {"Friday", "August"})).features().asArray(), 0.1); + assertArrayEquals(new double[] {0.166, 0.33}, preprocessor.apply(7, new DenseVector(new Serializable[] {"Sunday", "June"})).features().asArray(), 0.1); + } } diff --git a/modules/ml/src/test/java/org/apache/ignite/ml/preprocessing/encoding/FrequencyEncoderPreprocessorTest.java b/modules/ml/src/test/java/org/apache/ignite/ml/preprocessing/encoding/FrequencyEncoderPreprocessorTest.java new file mode 100644 index 0000000..4d9d6d1 --- /dev/null +++ b/modules/ml/src/test/java/org/apache/ignite/ml/preprocessing/encoding/FrequencyEncoderPreprocessorTest.java @@ -0,0 +1,82 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.ignite.ml.preprocessing.encoding; + +import java.io.Serializable; +import java.util.HashMap; +import java.util.HashSet; +import org.apache.ignite.ml.dataset.feature.extractor.Vectorizer; +import org.apache.ignite.ml.dataset.feature.extractor.impl.DummyVectorizer; +import org.apache.ignite.ml.math.primitives.vector.Vector; +import org.apache.ignite.ml.math.primitives.vector.impl.DenseVector; +import org.apache.ignite.ml.preprocessing.encoding.frequency.FrequencyEncoderPreprocessor; +import org.junit.Test; + +import static org.junit.Assert.assertArrayEquals; + +/** + * Tests for {@link FrequencyEncoderPreprocessor}. + */ +public class FrequencyEncoderPreprocessorTest { + /** Tests {@code apply()} method. */ + @Test + public void testApply() { + Vector[] data = new Vector[] { + new DenseVector(new Serializable[] {"1", "Moscow", "A"}), + new DenseVector(new Serializable[] {"2", "Moscow", "B"}), + new DenseVector(new Serializable[] {"2", "Moscow", "B"}), + }; + + Vectorizer<Integer, Vector, Integer, Double> vectorizer = new DummyVectorizer<>(0, 1, 2); + + FrequencyEncoderPreprocessor<Integer, Vector> preprocessor = new FrequencyEncoderPreprocessor<Integer, Vector>( + new HashMap[] { + new HashMap() { + { + put("1", 0.33); + put("2", 0.66); + } + }, new HashMap() { + { + put("Moscow", 1.0); + } + }, new HashMap() { + { + put("A", 0.33); + put("B", 0.66); + } + }}, + vectorizer, + new HashSet() { + { + add(0); + add(1); + add(2); + } + }); + + double[][] postProcessedData = new double[][] { + {0.33, 1.0, 0.33}, + {0.66, 1.0, 0.66}, + {0.66, 1.0, 0.66}, + }; + + for (int i = 0; i < data.length; i++) + assertArrayEquals(postProcessedData[i], preprocessor.apply(i, data[i]).features().asArray(), 0.1); + } +}