IGNITE-10543: [ML] Test/train sample generator This closes #5727
Project: http://git-wip-us.apache.org/repos/asf/ignite/repo Commit: http://git-wip-us.apache.org/repos/asf/ignite/commit/d0facb26 Tree: http://git-wip-us.apache.org/repos/asf/ignite/tree/d0facb26 Diff: http://git-wip-us.apache.org/repos/asf/ignite/diff/d0facb26 Branch: refs/heads/master Commit: d0facb260773c3527237e038ce7c48121bb3f0f3 Parents: 581fdca Author: Alexey Platonov <aplaton...@gmail.com> Authored: Tue Dec 25 18:42:19 2018 +0300 Committer: Yury Babak <yba...@gridgain.com> Committed: Tue Dec 25 18:42:19 2018 +0300 ---------------------------------------------------------------------- .../ml/util/generators/DataStreamGenerator.java | 134 +++++++++++ .../util/generators/DatasetBuilderAdapter.java | 69 ++++++ .../ignite/ml/util/generators/package-info.java | 25 +++ .../generators/primitives/package-info.java | 22 ++ .../scalar/DiscreteRandomProducer.java | 204 +++++++++++++++++ .../primitives/scalar/GaussRandomProducer.java | 77 +++++++ .../primitives/scalar/RandomProducer.java | 78 +++++++ .../scalar/RandomProducerWithGenerator.java | 51 +++++ .../scalar/UniformRandomProducer.java | 66 ++++++ .../primitives/scalar/package-info.java | 22 ++ .../vector/ParametricVectorGenerator.java | 62 +++++ .../primitives/vector/VectorGenerator.java | 224 +++++++++++++++++++ .../vector/VectorGeneratorPrimitives.java | 154 +++++++++++++ .../vector/VectorGeneratorsFamily.java | 189 ++++++++++++++++ .../primitives/vector/package-info.java | 22 ++ .../standard/GaussianMixtureDataStream.java | 99 ++++++++ .../standard/RegressionDataStream.java | 119 ++++++++++ .../generators/standard/RingsDataStream.java | 91 ++++++++ .../standard/TwoSeparableClassesDataStream.java | 95 ++++++++ .../util/generators/standard/package-info.java | 22 ++ .../generators/DataStreamGeneratorTest.java | 210 +++++++++++++++++ .../scalar/DiscreteRandomProducerTest.java | 102 +++++++++ .../scalar/GaussRandomProducerTest.java | 66 ++++++ .../primitives/scalar/RandomProducerTest.java | 79 +++++++ 
.../scalar/UniformRandomProducerTest.java | 68 ++++++ .../vector/ParametricVectorGeneratorTest.java | 50 +++++ .../vector/VectorGeneratorPrimitivesTest.java | 110 +++++++++ .../primitives/vector/VectorGeneratorTest.java | 194 ++++++++++++++++ .../vector/VectorGeneratorsFamilyTest.java | 118 ++++++++++ 29 files changed, 2822 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/ignite/blob/d0facb26/modules/ml/src/main/java/org/apache/ignite/ml/util/generators/DataStreamGenerator.java ---------------------------------------------------------------------- diff --git a/modules/ml/src/main/java/org/apache/ignite/ml/util/generators/DataStreamGenerator.java b/modules/ml/src/main/java/org/apache/ignite/ml/util/generators/DataStreamGenerator.java new file mode 100644 index 0000000..c2fd652 --- /dev/null +++ b/modules/ml/src/main/java/org/apache/ignite/ml/util/generators/DataStreamGenerator.java @@ -0,0 +1,134 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.ignite.ml.util.generators; + +import java.util.Map; +import java.util.stream.Collectors; +import java.util.stream.Stream; +import org.apache.ignite.lang.IgniteBiPredicate; +import org.apache.ignite.ml.dataset.DatasetBuilder; +import org.apache.ignite.ml.dataset.UpstreamTransformerBuilder; +import org.apache.ignite.ml.math.functions.IgniteFunction; +import org.apache.ignite.ml.math.primitives.vector.Vector; +import org.apache.ignite.ml.structures.DatasetRow; +import org.apache.ignite.ml.structures.LabeledVector; +import org.apache.ignite.ml.util.generators.primitives.scalar.RandomProducer; + +/** + * Provides general interface for generation of pseudorandom vectors according to shape defined + * by logic of specific data stream generator. + */ +public interface DataStreamGenerator { + /** + * @return Stream of {@link LabeledVector} in according to dataset shape. + */ + public Stream<LabeledVector<Vector, Double>> labeled(); + + /** + * @return Stream of unlabeled {@link Vector} in according to dataset shape. + */ + public default Stream<Vector> unlabeled() { + return labeled().map(DatasetRow::features); + } + + /** + * @param classifier User defined classifier for vectors stream. + * @return Stream of {@link LabeledVector} in according to dataset shape and user's classifier. + */ + public default Stream<LabeledVector<Vector, Double>> labeled(IgniteFunction<Vector, Double> classifier) { + return labeled().map(DatasetRow::features).map(v -> new LabeledVector<>(v, classifier.apply(v))); + } + + /** + * Apply user defined mapper to vectors stream without labels hiding. + * + * @param f Mapper of vectors of data stream. + * @return Stream of mapped vectors. 
+ */ + public default DataStreamGenerator mapVectors(IgniteFunction<Vector, Vector> f) { + return new DataStreamGenerator() { + @Override public Stream<LabeledVector<Vector, Double>> labeled() { + return DataStreamGenerator.this.labeled() + .map(v -> new LabeledVector<>(f.apply(v.features()), v.label())); + } + }; + } + + /** + * Apply pseudorandom noize to vectors without labels mapping. Such method can be useful in cases + * when vectors with different labels should be mixed between them on class bounds. + * + * @param rnd Generator of pseudorandom scalars modifying vector components with label saving. + * @return Stream of blurred vectors with same labels. + */ + public default DataStreamGenerator blur(RandomProducer rnd) { + return mapVectors(rnd::noizify); + } + + /** + * Convert first N values from stream to map. + * + * @param datasetSize Dataset size. + * @return Map of vectors and labels. + */ + public default Map<Vector, Double> asMap(int datasetSize) { + return labeled().limit(datasetSize) + .collect(Collectors.toMap(DatasetRow::features, LabeledVector::label)); + } + + /** + * Convert first N values from stream to {@link DatasetBuilder}. + * + * @param datasetSize Dataset size. + * @param partitions Partitions count. + * @return Dataset builder. + */ + public default DatasetBuilder<Vector, Double> asDatasetBuilder(int datasetSize, int partitions) { + return new DatasetBuilderAdapter(this, datasetSize, partitions); + } + + /** + * Convert first N values from stream to {@link DatasetBuilder}. + * + * @param datasetSize Dataset size. + * @param filter Data filter. + * @param partitions Partitions count. + * @return Dataset builder. + */ + public default DatasetBuilder<Vector, Double> asDatasetBuilder(int datasetSize, IgniteBiPredicate<Vector, Double> filter, + int partitions) { + + return new DatasetBuilderAdapter(this, datasetSize, filter, partitions); + } + + /** + * Convert first N values from stream to {@link DatasetBuilder}. 
+ * + * @param datasetSize Dataset size. + * @param filter Data filter. + * @param partitions Partitions count. + * @param upstreamTransformerBuilder Upstream transformer builder. + * @return Dataset builder. + */ + public default DatasetBuilder<Vector, Double> asDatasetBuilder(int datasetSize, IgniteBiPredicate<Vector, Double> filter, + int partitions, UpstreamTransformerBuilder<Vector, Double> upstreamTransformerBuilder) { + + return new DatasetBuilderAdapter(this, datasetSize, filter, partitions, upstreamTransformerBuilder); + } + +} http://git-wip-us.apache.org/repos/asf/ignite/blob/d0facb26/modules/ml/src/main/java/org/apache/ignite/ml/util/generators/DatasetBuilderAdapter.java ---------------------------------------------------------------------- diff --git a/modules/ml/src/main/java/org/apache/ignite/ml/util/generators/DatasetBuilderAdapter.java b/modules/ml/src/main/java/org/apache/ignite/ml/util/generators/DatasetBuilderAdapter.java new file mode 100644 index 0000000..189e053 --- /dev/null +++ b/modules/ml/src/main/java/org/apache/ignite/ml/util/generators/DatasetBuilderAdapter.java @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.ignite.ml.util.generators; + +import org.apache.ignite.lang.IgniteBiPredicate; +import org.apache.ignite.ml.dataset.UpstreamTransformerBuilder; +import org.apache.ignite.ml.dataset.impl.local.LocalDatasetBuilder; +import org.apache.ignite.ml.math.primitives.vector.Vector; + +/** + * DataStreamGenerator to DatasetBuilder adapter. + */ +class DatasetBuilderAdapter extends LocalDatasetBuilder<Vector, Double> { + /** + * Constructs an instance of DatasetBuilderAdapter. + * + * @param generator Generator. + * @param datasetSize Dataset size. + * @param partitions Partitions. + */ + public DatasetBuilderAdapter(DataStreamGenerator generator, int datasetSize, int partitions) { + super(generator.asMap(datasetSize), partitions); + } + + /** + * Constructs an instance of DatasetBuilderAdapter. + * + * @param generator Generator. + * @param datasetSize Dataset size. + * @param filter Filter. + * @param partitions Partitions. + * @param upstreamTransformerBuilder Upstream transformer builder. + */ + public DatasetBuilderAdapter(DataStreamGenerator generator, int datasetSize, + IgniteBiPredicate<Vector, Double> filter, int partitions, + UpstreamTransformerBuilder<Vector, Double> upstreamTransformerBuilder) { + + super(generator.asMap(datasetSize), filter, partitions, upstreamTransformerBuilder); + } + + /** + * Constructs an instance of DatasetBuilderAdapter. + * + * @param generator Generator. + * @param datasetSize Dataset size. + * @param filter Filter. + * @param partitions Partitions. 
+ */ + public DatasetBuilderAdapter(DataStreamGenerator generator, int datasetSize, + IgniteBiPredicate<Vector, Double> filter, int partitions) { + + super(generator.asMap(datasetSize), filter, partitions); + } +} http://git-wip-us.apache.org/repos/asf/ignite/blob/d0facb26/modules/ml/src/main/java/org/apache/ignite/ml/util/generators/package-info.java ---------------------------------------------------------------------- diff --git a/modules/ml/src/main/java/org/apache/ignite/ml/util/generators/package-info.java b/modules/ml/src/main/java/org/apache/ignite/ml/util/generators/package-info.java new file mode 100644 index 0000000..6ebcc09 --- /dev/null +++ b/modules/ml/src/main/java/org/apache/ignite/ml/util/generators/package-info.java @@ -0,0 +1,25 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * <!-- Package description. --> + * Contains utility classes for data streams generation. Entry point for all data streams is a + * {@link org.apache.ignite.ml.util.generators.DataStreamGenerator} class providing streams of + * labeled and unlabeled vectors. There are predefined generators like + * {@link org.apache.ignite.ml.util.generators.standard.RingsDataStream}. 
+ */ +package org.apache.ignite.ml.util.generators; http://git-wip-us.apache.org/repos/asf/ignite/blob/d0facb26/modules/ml/src/main/java/org/apache/ignite/ml/util/generators/primitives/package-info.java ---------------------------------------------------------------------- diff --git a/modules/ml/src/main/java/org/apache/ignite/ml/util/generators/primitives/package-info.java b/modules/ml/src/main/java/org/apache/ignite/ml/util/generators/primitives/package-info.java new file mode 100644 index 0000000..57c79c0 --- /dev/null +++ b/modules/ml/src/main/java/org/apache/ignite/ml/util/generators/primitives/package-info.java @@ -0,0 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * <!-- Package description. --> + * Contains primitives like random scalars and random vector generators for composing own data stream generator. 
+ */ +package org.apache.ignite.ml.util.generators.primitives; http://git-wip-us.apache.org/repos/asf/ignite/blob/d0facb26/modules/ml/src/main/java/org/apache/ignite/ml/util/generators/primitives/scalar/DiscreteRandomProducer.java ---------------------------------------------------------------------- diff --git a/modules/ml/src/main/java/org/apache/ignite/ml/util/generators/primitives/scalar/DiscreteRandomProducer.java b/modules/ml/src/main/java/org/apache/ignite/ml/util/generators/primitives/scalar/DiscreteRandomProducer.java new file mode 100644 index 0000000..ef80db7 --- /dev/null +++ b/modules/ml/src/main/java/org/apache/ignite/ml/util/generators/primitives/scalar/DiscreteRandomProducer.java @@ -0,0 +1,204 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.ignite.ml.util.generators.primitives.scalar; + +import java.util.Arrays; +import java.util.Random; +import java.util.stream.IntStream; +import org.apache.ignite.internal.util.typedef.internal.A; + +/** + * Pseudorandom producer generating values from user provided discrete distribution. + */ +public class DiscreteRandomProducer extends RandomProducerWithGenerator { + /** */ + private static final double EPS = 1e-5; + + /** Probabilities. 
*/ + private final double[] probs; + + /** Random variable values. */ + private final int[] ids; + + /** + * Creates an instance of DiscreteRandomProducer. + * + * @param probs Discrete distribution probabilities. + */ + public DiscreteRandomProducer(double... probs) { + this(System.currentTimeMillis(), probs); + } + + /** + * Creates an instance of DiscreteRandomProducer. + * + * @param seed Seed. + * @param probs Discrete distribution probabilities. + */ + public DiscreteRandomProducer(long seed, double... probs) { + super(seed); + + boolean allElementsAreGEZero = Arrays.stream(probs).allMatch(p -> p >= 0.0); + boolean sumOfProbsEqOne = Math.abs(Arrays.stream(probs).sum() - 1.0) < EPS; + A.ensure(allElementsAreGEZero, "all elements should be great or equals 0.0"); + A.ensure(sumOfProbsEqOne, "sum of probs should equal 1.0"); + + this.probs = Arrays.copyOf(probs, probs.length); + this.ids = IntStream.range(0, probs.length).toArray(); + sort(this.probs, ids, 0, probs.length - 1); + + int i = 0; + int j = probs.length - 1; + while (i < j) { + double temp = this.probs[i]; + this.probs[i] = this.probs[j]; + this.probs[j] = temp; + + int idxTmp = this.ids[i]; + this.ids[i] = this.ids[j]; + this.ids[j] = idxTmp; + + i++; + j--; + } + + for (i = 1; i < this.probs.length; i++) + this.probs[i] += this.probs[i - 1]; + } + + /** + * Creates a producer of random values from uniform discrete distribution. + * + * @param numOfValues Number of distinct values. + * @return Producer. + */ + public static DiscreteRandomProducer uniform(int numOfValues) { + return uniform(numOfValues, System.currentTimeMillis()); + } + + /** + * Creates a producer of random values from uniform discrete distribution. + * + * @param numOfValues Number of distinct values. + * @param seed Seed. + * @return Producer. 
+ */ + public static DiscreteRandomProducer uniform(int numOfValues, long seed) { + double[] probs = new double[numOfValues]; + Arrays.fill(probs, 1.0 / numOfValues); + return new DiscreteRandomProducer(seed, probs); + } + + /** + * Generates pseudorandom discrete distribution. + * + * @param numOfValues Number of distinct values of pseudorandom variable. + * @return Probabilities array. + */ + public static double[] randomDistribution(int numOfValues) { + return randomDistribution(numOfValues, System.currentTimeMillis()); + } + + /** + * Generates pseudorandom discrete distribution. + * + * @param numOfValues Number of distinct values of pseudorandom variable. + * @param seed Seed. + * @return Probabilities array. + */ + public static double[] randomDistribution(int numOfValues, long seed) { + A.ensure(numOfValues > 0, "numberOfValues > 0"); + + Random random = new Random(seed); + long[] rnd = IntStream.range(0, numOfValues) + .mapToLong(i -> random.nextInt(Integer.MAX_VALUE)) + .limit(numOfValues) + .toArray(); + long sum = Arrays.stream(rnd).sum(); + + double[] res = new double[numOfValues]; + for (int i = 0; i < res.length; i++) + res[i] = rnd[i] / Math.max(1.0, sum); + + return res; + } + + /** {@inheritDoc} */ + @Override public Double get() { + double p = generator().nextDouble(); + for (int i = 0; i < probs.length; i++) { + if (probs[i] > p) + return (double)ids[i]; + } + + return (double)ids[probs.length - 1]; + } + + /** + * @return Value of pseudorandom discrete variable. + */ + public int getInt() { + return get().intValue(); + } + + /** + * @return Count of distinct values of distribution. + */ + public int size() { + return probs.length; + } + + /** + * Sorts probability values together with their corresponding indices. + * + * @param probs Probabilities. + * @param idx Random variable values. + * @param from From. + * @param to To. 
+ */ + private void sort(double[] probs, int[] idx, int from, int to) { + if (from < to) { + double pivot = probs[(from + to) / 2]; + + int i = from, j = to; + + while (i <= j) { + while (probs[i] < pivot) + i++; + while (probs[j] > pivot) + j--; + + if (i <= j) { + double tmpFeature = probs[i]; + probs[i] = probs[j]; + probs[j] = tmpFeature; + + int tmpLb = idx[i]; + idx[i] = idx[j]; + idx[j] = tmpLb; + + i++; + j--; + } + } + + sort(probs, idx, from, j); + sort(probs, idx, i, to); + } + } +} http://git-wip-us.apache.org/repos/asf/ignite/blob/d0facb26/modules/ml/src/main/java/org/apache/ignite/ml/util/generators/primitives/scalar/GaussRandomProducer.java ---------------------------------------------------------------------- diff --git a/modules/ml/src/main/java/org/apache/ignite/ml/util/generators/primitives/scalar/GaussRandomProducer.java b/modules/ml/src/main/java/org/apache/ignite/ml/util/generators/primitives/scalar/GaussRandomProducer.java new file mode 100644 index 0000000..0fcfcdf --- /dev/null +++ b/modules/ml/src/main/java/org/apache/ignite/ml/util/generators/primitives/scalar/GaussRandomProducer.java @@ -0,0 +1,77 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.ignite.ml.util.generators.primitives.scalar; + +import org.apache.ignite.internal.util.typedef.internal.A; + +/** + * Pseudorandom producer generating values from gauss distribution. + */ +public class GaussRandomProducer extends RandomProducerWithGenerator { + /** Mean. */ + private final double mean; + /** Variance. */ + private final double variance; + + /** + * Creates an instance of GaussRandomProducer with mean = 0 and variance = 1.0. + */ + public GaussRandomProducer() { + this(0.0, 1.0, System.currentTimeMillis()); + } + + /** + * Creates an instance of GaussRandomProducer with mean = 0 and variance = 1.0. + * + * @param seed Seed. + */ + public GaussRandomProducer(long seed) { + this(0.0, 1.0, seed); + } + + /** + * Creates an instance of GaussRandomProducer. + * + * @param mean Mean. + * @param variance Variance. + */ + public GaussRandomProducer(double mean, double variance) { + this(mean, variance, System.currentTimeMillis()); + } + + /** + * Creates an instance of GaussRandomProducer. + * + * @param mean Mean. + * @param variance Variance. + * @param seed Seed. 
+ */ + public GaussRandomProducer(double mean, double variance, long seed) { + super(seed); + + A.ensure(variance > 0, "variance > 0"); + + this.mean = mean; + this.variance = variance; + } + + /** {@inheritDoc} */ + @Override public Double get() { + return mean + generator().nextGaussian() * Math.sqrt(variance); + } +} http://git-wip-us.apache.org/repos/asf/ignite/blob/d0facb26/modules/ml/src/main/java/org/apache/ignite/ml/util/generators/primitives/scalar/RandomProducer.java ---------------------------------------------------------------------- diff --git a/modules/ml/src/main/java/org/apache/ignite/ml/util/generators/primitives/scalar/RandomProducer.java b/modules/ml/src/main/java/org/apache/ignite/ml/util/generators/primitives/scalar/RandomProducer.java new file mode 100644 index 0000000..35c8e1f --- /dev/null +++ b/modules/ml/src/main/java/org/apache/ignite/ml/util/generators/primitives/scalar/RandomProducer.java @@ -0,0 +1,78 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.ignite.ml.util.generators.primitives.scalar; + +import java.util.Arrays; +import java.util.function.Supplier; +import java.util.stream.IntStream; +import org.apache.ignite.internal.util.typedef.internal.A; +import org.apache.ignite.ml.math.functions.IgniteFunction; +import org.apache.ignite.ml.math.primitives.vector.Vector; +import org.apache.ignite.ml.math.primitives.vector.VectorUtils; +import org.apache.ignite.ml.util.generators.primitives.vector.VectorGenerator; + +/** + * Represents a generator of pseudorandom scalar values. + */ +public interface RandomProducer extends Supplier<Double> { + /** + * Create {@link VectorGenerator} with vectors having feature values generated by random producer. + * + * @param vectorSize Generated vector size. + * @return Vector generator. + */ + public default VectorGenerator vectorize(int vectorSize) { + return () -> VectorUtils.of(IntStream.range(0, vectorSize).mapToDouble(x -> get()).toArray()); + } + + /** + * Adds value generated by random producer to function value. + * + * @param f Function. + * @return New function with noise. + */ + public default IgniteFunction<Double, Double> noizify(IgniteFunction<Double, Double> f) { + return t -> f.apply(t) + get(); + } + + /** + * Adds values generated by random producer to each vector value. + * + * @param vector Vector. + * @return New vector. + */ + public default Vector noizify(Vector vector) { + Vector cp = vector.copy(); + for (int i = 0; i < vector.size(); i++) + cp.set(i, cp.get(i) + get()); + return cp; + } + + /** + * Creates {@link VectorGenerator} with vectors having feature values according to + * pseudorandom producers. + * + * @param producers Feature value producers. + * @return Vector generator. + */ + public static VectorGenerator vectorize(RandomProducer... 
producers) { + A.notEmpty(producers, "producers"); + + return () -> VectorUtils.of(Arrays.stream(producers).mapToDouble(Supplier::get).toArray()); + } +} http://git-wip-us.apache.org/repos/asf/ignite/blob/d0facb26/modules/ml/src/main/java/org/apache/ignite/ml/util/generators/primitives/scalar/RandomProducerWithGenerator.java ---------------------------------------------------------------------- diff --git a/modules/ml/src/main/java/org/apache/ignite/ml/util/generators/primitives/scalar/RandomProducerWithGenerator.java b/modules/ml/src/main/java/org/apache/ignite/ml/util/generators/primitives/scalar/RandomProducerWithGenerator.java new file mode 100644 index 0000000..f15de29 --- /dev/null +++ b/modules/ml/src/main/java/org/apache/ignite/ml/util/generators/primitives/scalar/RandomProducerWithGenerator.java @@ -0,0 +1,51 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.ignite.ml.util.generators.primitives.scalar; + +import java.util.Random; + +/** + * Base class for generators based on basic java Random. + */ +abstract class RandomProducerWithGenerator implements RandomProducer { + /** Rnd. */ + private final Random rnd; + + /** + * Creates an instance of RandomProducerWithGenerator. 
+ */ + protected RandomProducerWithGenerator() { + this(System.currentTimeMillis()); + } + + /** + * Creates an instance of RandomProducerWithGenerator. + * + * @param seed Seed. + */ + protected RandomProducerWithGenerator(long seed) { + this.rnd = new Random(seed); + } + + /** + * @return Java pseudorandom values generator. + */ + protected Random generator() { + return rnd; + } +} http://git-wip-us.apache.org/repos/asf/ignite/blob/d0facb26/modules/ml/src/main/java/org/apache/ignite/ml/util/generators/primitives/scalar/UniformRandomProducer.java ---------------------------------------------------------------------- diff --git a/modules/ml/src/main/java/org/apache/ignite/ml/util/generators/primitives/scalar/UniformRandomProducer.java b/modules/ml/src/main/java/org/apache/ignite/ml/util/generators/primitives/scalar/UniformRandomProducer.java new file mode 100644 index 0000000..91c598b --- /dev/null +++ b/modules/ml/src/main/java/org/apache/ignite/ml/util/generators/primitives/scalar/UniformRandomProducer.java @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.ignite.ml.util.generators.primitives.scalar; + +import org.apache.ignite.internal.util.typedef.internal.A; + +/** + * Pseudorandom producer generating values from uniform continuous distribution. + */ +public class UniformRandomProducer extends RandomProducerWithGenerator { + /** Generate values from this value. */ + private final double from; + + /** Generate values to this value. */ + private final double to; + + /** + * Creates an instance of UniformRandomProducer. + * + * @param from Generate values from this value. + * @param to Generate values to this value. + */ + public UniformRandomProducer(double from, double to) { + this(from, to, System.currentTimeMillis()); + } + + /** + * Creates an instance of UniformRandomProducer. + * + * @param from Generate values from this value. + * @param to Generate values to this value. + * @param seed Seed. + */ + public UniformRandomProducer(double from, double to, long seed) { + super(seed); + + A.ensure(to >= from, "from >= to"); + + this.from = from; + this.to = to; + } + + /** {@inheritDoc} */ + @Override public Double get() { + double result = generator().nextDouble() * (to - from) + from; + if (result > to) + result = to; + + return result; + } +} http://git-wip-us.apache.org/repos/asf/ignite/blob/d0facb26/modules/ml/src/main/java/org/apache/ignite/ml/util/generators/primitives/scalar/package-info.java ---------------------------------------------------------------------- diff --git a/modules/ml/src/main/java/org/apache/ignite/ml/util/generators/primitives/scalar/package-info.java b/modules/ml/src/main/java/org/apache/ignite/ml/util/generators/primitives/scalar/package-info.java new file mode 100644 index 0000000..264c69f --- /dev/null +++ b/modules/ml/src/main/java/org/apache/ignite/ml/util/generators/primitives/scalar/package-info.java @@ -0,0 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * <!-- Package description. --> + * Contains generators of pseudo-random scalars according to a specific distribution. + */ +package org.apache.ignite.ml.util.generators.primitives.scalar; http://git-wip-us.apache.org/repos/asf/ignite/blob/d0facb26/modules/ml/src/main/java/org/apache/ignite/ml/util/generators/primitives/vector/ParametricVectorGenerator.java ---------------------------------------------------------------------- diff --git a/modules/ml/src/main/java/org/apache/ignite/ml/util/generators/primitives/vector/ParametricVectorGenerator.java b/modules/ml/src/main/java/org/apache/ignite/ml/util/generators/primitives/vector/ParametricVectorGenerator.java new file mode 100644 index 0000000..c9257cb --- /dev/null +++ b/modules/ml/src/main/java/org/apache/ignite/ml/util/generators/primitives/vector/ParametricVectorGenerator.java @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.ignite.ml.util.generators.primitives.vector; + +import java.util.Arrays; +import java.util.List; +import org.apache.ignite.internal.util.typedef.internal.A; +import org.apache.ignite.ml.math.functions.IgniteFunction; +import org.apache.ignite.ml.math.primitives.vector.Vector; +import org.apache.ignite.ml.math.primitives.vector.VectorUtils; +import org.apache.ignite.ml.util.generators.primitives.scalar.RandomProducer; + +/** + * Generate vectors having components generated by parametrized function. + * For each vector v: v = [F1(t), F2(t), ..., Fn(t)], where t is a value + * from user defined distribution. + * + */ +public class ParametricVectorGenerator implements VectorGenerator { + /** Per dimension generators. */ + private final List<IgniteFunction<Double, Double>> perDimensionGenerators; + + /** Random producer. */ + private final RandomProducer randomProducer; + + /** + * Create an intance of ParametricVectorGenerator. + * + * @param paramGenerator Parameter generator. + * @param perDimensionGenerators Per dimension generators. + */ + public ParametricVectorGenerator(RandomProducer paramGenerator, + IgniteFunction<Double, Double>... 
perDimensionGenerators) { + + A.notEmpty(perDimensionGenerators, "perDimensionGenerators.length != 0"); + + this.perDimensionGenerators = Arrays.asList(perDimensionGenerators); + this.randomProducer = paramGenerator; + } + + /** {@inheritDoc} */ + @Override public Vector get() { + Double t = randomProducer.get(); + return VectorUtils.of(perDimensionGenerators.stream() + .mapToDouble(f -> f.apply(t)).toArray()); + } +} http://git-wip-us.apache.org/repos/asf/ignite/blob/d0facb26/modules/ml/src/main/java/org/apache/ignite/ml/util/generators/primitives/vector/VectorGenerator.java ---------------------------------------------------------------------- diff --git a/modules/ml/src/main/java/org/apache/ignite/ml/util/generators/primitives/vector/VectorGenerator.java b/modules/ml/src/main/java/org/apache/ignite/ml/util/generators/primitives/vector/VectorGenerator.java new file mode 100644 index 0000000..7a38cbe --- /dev/null +++ b/modules/ml/src/main/java/org/apache/ignite/ml/util/generators/primitives/vector/VectorGenerator.java @@ -0,0 +1,224 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+package org.apache.ignite.ml.util.generators.primitives.vector;
+
+import java.util.Collections;
+import java.util.List;
+import java.util.Random;
+import java.util.function.Supplier;
+import java.util.stream.Collectors;
+import java.util.stream.IntStream;
+import java.util.stream.Stream;
+import org.apache.ignite.internal.util.typedef.internal.A;
+import org.apache.ignite.lang.IgnitePredicate;
+import org.apache.ignite.ml.math.functions.IgniteFunction;
+import org.apache.ignite.ml.math.primitives.vector.Vector;
+import org.apache.ignite.ml.math.primitives.vector.VectorUtils;
+import org.apache.ignite.ml.structures.LabeledVector;
+import org.apache.ignite.ml.util.generators.DataStreamGenerator;
+import org.apache.ignite.ml.util.generators.primitives.scalar.RandomProducer;
+
+/**
+ * Basic interface for pseudorandom vector generators. Every default method returns a new generator
+ * that pulls vectors from this one via {@link #get()} and transforms them.
+ */
+public interface VectorGenerator extends Supplier<Vector> {
+    /**
+     * Maps values of vector generator using mapper.
+     *
+     * @param mapper Mapper.
+     * @return Vector generator with mapped vectors.
+     */
+    public default VectorGenerator map(IgniteFunction<Vector, Vector> mapper) {
+        return () -> mapper.apply(get());
+    }
+
+    /**
+     * Filters values of vector generator using predicate. The returned generator keeps drawing vectors
+     * from this generator until one satisfies the predicate, so it does not return while the predicate
+     * keeps rejecting.
+     *
+     * @param predicate Predicate.
+     * @return Vector generator with filtered vectors.
+     */
+    public default VectorGenerator filter(IgnitePredicate<Vector> predicate) {
+        return () -> {
+            Vector v = null;
+            do {
+                v = get();
+            }
+            while (!predicate.apply(v));
+
+            return v;
+        };
+    }
+
+    /**
+     * Creates new generator by concatenation of vectors of this generator and other.
+     *
+     * @param other Other.
+     * @return Generator of concatenated vectors.
+     */
+    public default VectorGenerator concat(VectorGenerator other) {
+        return () -> VectorUtils.concat(this.get(), other.get());
+    }
+
+    /**
+     * Creates new generator by concatenation of vectors of this generator and one value taken from
+     * random producer.
+     *
+     * @param producer Producer.
+     * @return Generator of concatenated vector and noise.
+     */
+    public default VectorGenerator concat(RandomProducer producer) {
+        return () -> VectorUtils.concat(this.get(), VectorUtils.of(producer.get()));
+    }
+
+    /**
+     * Creates new generator by sum of vectors of this generator and other.
+     *
+     * @param other Other.
+     * @return Generator of vector sums.
+     */
+    public default VectorGenerator plus(VectorGenerator other) {
+        return () -> this.get().plus(other.get());
+    }
+
+    /**
+     * Creates a permanent rearrangement mapping of features in vector and applies this rearrangement to each vector
+     * of current generator. The current time in milliseconds is used as seed.
+     *
+     * @return Generator of vectors with shuffled features.
+     */
+    public default VectorGenerator shuffle() {
+        return shuffle(System.currentTimeMillis());
+    }
+
+    /**
+     * Creates a permanent rearrangement mapping of features in vector and applies this rearrangement to each vector
+     * of current generator. The rearrangement is built once, when this method is called; to learn the vector size
+     * one vector is taken from this generator at that moment.
+     *
+     * @param seed Seed.
+     * @return Generator of vectors with shuffled features.
+     */
+    public default VectorGenerator shuffle(Long seed) {
+        Random rnd = new Random(seed);
+        List<Integer> shuffledIds = IntStream.range(0, get().size()).boxed().collect(Collectors.toList());
+        Collections.shuffle(shuffledIds, rnd);
+
+        return map(original -> {
+            Vector cp = original.copy();
+            for (int to = 0; to < cp.size(); to++) {
+                int from = shuffledIds.get(to);
+                cp.set(to, original.get(from));
+            }
+            return cp;
+        });
+    }
+
+    /**
+     * Increases size of vectors of generator by increaseSize and fills the new components with randomly selected
+     * feature values from already set components. The current time in milliseconds is used as seed.
+     *
+     * @param increaseSize Increase size.
+     * @return Generator.
+     */
+    public default VectorGenerator duplicateRandomFeatures(int increaseSize) {
+        return duplicateRandomFeatures(increaseSize, System.currentTimeMillis());
+    }
+
+    /**
+     * Increases size of vectors of generator by increaseSize and fills the new components with randomly selected
+     * feature values from already set components. The source components are selected anew for every
+     * generated vector.
+     *
+     * @param increaseSize Increase size, must be positive.
+     * @param seed Seed.
+     * @return Generator.
+     */
+    public default VectorGenerator duplicateRandomFeatures(int increaseSize, Long seed) {
+        A.ensure(increaseSize > 0, "increaseSize > 0");
+
+        Random rnd = new Random(seed);
+        return map(original -> {
+            double[] values = new double[original.size() + increaseSize];
+            for (int i = 0; i < original.size(); i++)
+                values[i] = original.get(i);
+            for (int i = 0; i < increaseSize; i++) {
+                int rndId = rnd.nextInt(original.size());
+                values[original.size() + i] = original.get(rndId);
+            }
+            return VectorUtils.of(values);
+        });
+    }
+
+    /**
+     * Moves all vectors to other position by summing with input vector.
+     *
+     * @param v Vector.
+     * @return Generator with old vectors plus input vector.
+     */
+    public default VectorGenerator move(Vector v) {
+        return map(x -> x.plus(v));
+    }
+
+    /**
+     * Rotate first two components (indices 0 and 1) of all vectors of generator by angle around zero.
+     *
+     * @param angle Angle in radians.
+     * @return Generator.
+     */
+    public default VectorGenerator rotate(double angle) {
+        return rotate(angle, 0, 1);
+    }
+
+    /**
+     * Rotate selected two components of all vectors of generator by angle around zero. With a and b being
+     * the first and the second selected components, the new values are
+     * {@code a * cos(angle) + b * sin(angle)} and {@code -a * sin(angle) + b * cos(angle)}; both are computed
+     * from the original vector and written into a copy.
+     *
+     * @param angle Angle in radians.
+     * @param firstComponent First component id.
+     * @param secondComponent Second component id.
+     * @return Generator.
+     */
+    public default VectorGenerator rotate(double angle, int firstComponent, int secondComponent) {
+        return map(x -> x.copy()
+            .set(firstComponent, x.get(firstComponent) * Math.cos(angle) + x.get(secondComponent) * Math.sin(angle))
+            .set(secondComponent, -x.get(firstComponent) * Math.sin(angle) + x.get(secondComponent) * Math.cos(angle))
+        );
+    }
+
+    /**
+     * Adds noise to all components of generated vectors. To learn the vector size one vector is taken from
+     * this generator when this method is called.
+     *
+     * @param randomProducer Random producer.
+     * @return Generator.
+     */
+    public default VectorGenerator noisify(RandomProducer randomProducer) {
+        int vectorSize = get().size();
+        return plus(randomProducer.vectorize(vectorSize));
+    }
+
+    /**
+     * Converts vectors generator to unlabeled data stream generator: every vector gets the label 0.0.
+     *
+     * @return Data stream generator.
+     */
+    public default DataStreamGenerator asDataStream() {
+        final VectorGenerator gen = this;
+        return new DataStreamGenerator() {
+            @Override public Stream<LabeledVector<Vector, Double>> labeled() {
+                return Stream.generate(gen).map(v -> new LabeledVector<>(v, 0.0));
+            }
+        };
+    }
+}
http://git-wip-us.apache.org/repos/asf/ignite/blob/d0facb26/modules/ml/src/main/java/org/apache/ignite/ml/util/generators/primitives/vector/VectorGeneratorPrimitives.java ---------------------------------------------------------------------- diff --git a/modules/ml/src/main/java/org/apache/ignite/ml/util/generators/primitives/vector/VectorGeneratorPrimitives.java b/modules/ml/src/main/java/org/apache/ignite/ml/util/generators/primitives/vector/VectorGeneratorPrimitives.java new file mode 100644 index 0000000..1c49643 --- /dev/null +++ b/modules/ml/src/main/java/org/apache/ignite/ml/util/generators/primitives/vector/VectorGeneratorPrimitives.java @@ -0,0 +1,154 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.ignite.ml.util.generators.primitives.vector; + +import org.apache.ignite.internal.util.typedef.internal.A; +import org.apache.ignite.ml.math.primitives.vector.Vector; +import org.apache.ignite.ml.math.primitives.vector.VectorUtils; +import org.apache.ignite.ml.util.generators.primitives.scalar.GaussRandomProducer; +import org.apache.ignite.ml.util.generators.primitives.scalar.RandomProducer; +import org.apache.ignite.ml.util.generators.primitives.scalar.UniformRandomProducer; + +/** + * Collection of predefined vector generators. + */ +public class VectorGeneratorPrimitives { + /** + * Returns vector generator of vectors from multidimension gauss distribution. + * + * @param means Mean values per dimension. + * @param variances Variance values per dimension. + * @param seed Seed. + * @return Generator. + */ + public static VectorGenerator gauss(Vector means, Vector variances, Long seed) { + A.notEmpty(means.asArray(), "mean.size() != 0"); + A.ensure(means.size() == variances.size(), "mean.size() == variances.size()"); + + RandomProducer[] producers = new RandomProducer[means.size()]; + for (int i = 0; i < producers.length; i++) + producers[i] = new GaussRandomProducer(means.get(i), variances.get(i), seed *= 2); + return RandomProducer.vectorize(producers); + } + + /** + * Returns vector generator of vectors from multidimension gauss distribution. + * + * @param means Mean values per dimension. + * @param variances Variance values per dimension. + * @return Generator. + */ + public static VectorGenerator gauss(Vector means, Vector variances) { + return gauss(means, variances, System.currentTimeMillis()); + } + + /** + * Returns vector generator of 2D-vectors from ring-like distribution. + * + * @param radius Ring radius. + * @param fromAngle From angle. + * @param toAngle To angle. + * @return Generator. 
+ */ + public static VectorGenerator ring(double radius, double fromAngle, double toAngle) { + return ring(radius, fromAngle, toAngle, System.currentTimeMillis()); + } + + /** + * Returns vector generator of 2D-vectors from ring-like distribution around zero. + * + * @param radius Ring radius. + * @param fromAngle From angle. + * @param toAngle To angle. + * @param seed Seed. + * @return Generator. + */ + public static VectorGenerator ring(double radius, double fromAngle, double toAngle, long seed) { + return new ParametricVectorGenerator( + new UniformRandomProducer(fromAngle, toAngle, seed), + t -> radius * Math.sin(t), + t -> radius * Math.cos(t) + ); + } + + /** + * Returns vector generator of vectors from multidimension uniform distribution around zero. + * + * @param bounds Parallelogram bounds. + * @return Generator. + */ + public static VectorGenerator parallelogram(Vector bounds) { + return parallelogram(bounds, System.currentTimeMillis()); + } + + /** + * Returns vector generator of vectors from multidimension uniform distribution around zero. + * + * @param bounds Parallelogram bounds. + * @param seed Seed. + * @return Generator. + */ + public static VectorGenerator parallelogram(Vector bounds, long seed) { + A.ensure(bounds.size() != 0, "bounds.size() != 0"); + + UniformRandomProducer[] producers = new UniformRandomProducer[bounds.size()]; + for (int i = 0; i < producers.length; i++) + producers[i] = new UniformRandomProducer(-bounds.get(i), bounds.get(i), seed *= 2); + + return RandomProducer.vectorize(producers); + } + + /** + * Returns vector generator of 2D-vectors from circle-like distribution around zero. + * + * @param radius Circle radius. + * @return Generator. + */ + public static VectorGenerator circle(double radius) { + return circle(radius, System.currentTimeMillis()); + } + + /** + * Returns vector generator of 2D-vectors from circle-like distribution around zero. + * + * @param radius Circle radius. + * @param seed Seed. 
+ * @return Generator. + */ + public static VectorGenerator circle(double radius, long seed) { + return new UniformRandomProducer(-radius, radius, seed) + .vectorize(2) + .filter(v -> Math.sqrt(v.getLengthSquared()) <= radius); + } + + /** + * @param size Vector size. + * @return Generator of constant vector = zero. + */ + public static VectorGenerator zero(int size) { + return constant(VectorUtils.zeroes(size)); + } + + /** + * @param v Constant. + * @return Generator of constant vector. + */ + public static VectorGenerator constant(Vector v) { + return () -> v; + } +} http://git-wip-us.apache.org/repos/asf/ignite/blob/d0facb26/modules/ml/src/main/java/org/apache/ignite/ml/util/generators/primitives/vector/VectorGeneratorsFamily.java ---------------------------------------------------------------------- diff --git a/modules/ml/src/main/java/org/apache/ignite/ml/util/generators/primitives/vector/VectorGeneratorsFamily.java b/modules/ml/src/main/java/org/apache/ignite/ml/util/generators/primitives/vector/VectorGeneratorsFamily.java new file mode 100644 index 0000000..5359fe9 --- /dev/null +++ b/modules/ml/src/main/java/org/apache/ignite/ml/util/generators/primitives/vector/VectorGeneratorsFamily.java @@ -0,0 +1,189 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.ignite.ml.util.generators.primitives.vector; + +import java.util.ArrayList; +import java.util.List; +import java.util.stream.Collectors; +import java.util.stream.Stream; +import org.apache.ignite.internal.util.typedef.internal.A; +import org.apache.ignite.ml.math.functions.IgniteFunction; +import org.apache.ignite.ml.math.primitives.vector.Vector; +import org.apache.ignite.ml.structures.LabeledVector; +import org.apache.ignite.ml.util.generators.DataStreamGenerator; +import org.apache.ignite.ml.util.generators.primitives.scalar.DiscreteRandomProducer; + +/** + * Represents a distribution family of district vector generators. + */ +public class VectorGeneratorsFamily implements VectorGenerator { + /** Family of generators. */ + private final List<VectorGenerator> family; + + /** Randomized selector of vector generator from family. */ + private final DiscreteRandomProducer selector; + + /** + * Creates an instance of VectorGeneratorsFamily. + * + * @param family Family of generators. + * @param selector Randomized selector of generator from family. + */ + private VectorGeneratorsFamily(List<VectorGenerator> family, DiscreteRandomProducer selector) { + this.family = family; + this.selector = selector; + } + + /** {@inheritDoc} */ + @Override public Vector get() { + return family.get(selector.getInt()).get(); + } + + /** + * @return pseudo random vector with parent distribution id. + */ + public VectorWithDistributionId getWithId() { + int id = selector.getInt(); + return new VectorWithDistributionId(family.get(id).get(), id); + } + + /** + * Creates data stream where label of vector == id of distribution from family. + * + * @return Data stream generator. 
+ */ + @Override public DataStreamGenerator asDataStream() { + VectorGeneratorsFamily gen = this; + return new DataStreamGenerator() { + @Override public Stream<LabeledVector<Vector, Double>> labeled() { + return Stream.generate(gen::getWithId) + .map(v -> new LabeledVector<>(v.vector, (double)v.distributionId)); + } + }; + } + + /** + * Helper for distribution family building. + */ + public static class Builder { + /** Family. */ + private final List<VectorGenerator> family = new ArrayList<>(); + + /** Weights of generators. */ + private final List<Double> weights = new ArrayList<>(); + + /** + * Mapper for generators in family. + * It as applied before create an instance of VectorGeneratorsFamily + */ + private IgniteFunction<VectorGenerator, VectorGenerator> mapper = x -> x; + + /** + * Add generator to family with weight proportional to it selection probability. + * + * @param generator Generator. + * @param weight Weight. + * @return This builder. + */ + public Builder add(VectorGenerator generator, double weight) { + A.ensure(weight > 0, "weight > 0"); + + family.add(generator); + weights.add(weight); + return this; + } + + /** + * Adds generator to family with weight = 1. + * + * @param generator Generator. + * @return This builder. + */ + public Builder add(VectorGenerator generator) { + return add(generator, 1); + } + + /** + * Adds map function for all generators in family. + * + * @param mapper Mapper. + * @return This builder. + */ + public Builder map(IgniteFunction<VectorGenerator, VectorGenerator> mapper) { + final IgniteFunction<VectorGenerator, VectorGenerator> old = this.mapper; + this.mapper = x -> mapper.apply(old.apply(x)); + return this; + } + + /** + * Builds VectorGeneratorsFamily instance. + * + * @return Vector generators family. + */ + public VectorGeneratorsFamily build() { + return build(System.currentTimeMillis()); + } + + /** + * Builds VectorGeneratorsFamily instance. + * + * @param seed Seed. + * @return Vector generators family. 
+ */ + public VectorGeneratorsFamily build(long seed) { + A.notEmpty(family, "family.size != 0"); + double sumOfWeigts = weights.stream().mapToDouble(x -> x).sum(); + double[] probs = weights.stream().mapToDouble(w -> w / sumOfWeigts).toArray(); + + List<VectorGenerator> mappedFamilily = family.stream().map(mapper).collect(Collectors.toList()); + return new VectorGeneratorsFamily(mappedFamilily, new DiscreteRandomProducer(seed, probs)); + } + } + + /** */ + public static class VectorWithDistributionId { + /** Vector. */ + private final Vector vector; + + /** Distribution id. */ + private final int distributionId; + + /** + * @param vector Vector. + * @param distributionId Distribution id. + */ + public VectorWithDistributionId(Vector vector, int distributionId) { + this.vector = vector; + this.distributionId = distributionId; + } + + /** + * @return Vector. + */ + public Vector vector() { + return vector; + } + + /** + * @return Distribution id. + */ + public int distributionId() { + return distributionId; + } + } +} http://git-wip-us.apache.org/repos/asf/ignite/blob/d0facb26/modules/ml/src/main/java/org/apache/ignite/ml/util/generators/primitives/vector/package-info.java ---------------------------------------------------------------------- diff --git a/modules/ml/src/main/java/org/apache/ignite/ml/util/generators/primitives/vector/package-info.java b/modules/ml/src/main/java/org/apache/ignite/ml/util/generators/primitives/vector/package-info.java new file mode 100644 index 0000000..ded85c5 --- /dev/null +++ b/modules/ml/src/main/java/org/apache/ignite/ml/util/generators/primitives/vector/package-info.java @@ -0,0 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * <!-- Package description. --> + * Contains generators of pseudo-random vectors according to a specific distribution. + */ +package org.apache.ignite.ml.util.generators.primitives.vector; http://git-wip-us.apache.org/repos/asf/ignite/blob/d0facb26/modules/ml/src/main/java/org/apache/ignite/ml/util/generators/standard/GaussianMixtureDataStream.java ---------------------------------------------------------------------- diff --git a/modules/ml/src/main/java/org/apache/ignite/ml/util/generators/standard/GaussianMixtureDataStream.java b/modules/ml/src/main/java/org/apache/ignite/ml/util/generators/standard/GaussianMixtureDataStream.java new file mode 100644 index 0000000..6ea3ed0 --- /dev/null +++ b/modules/ml/src/main/java/org/apache/ignite/ml/util/generators/standard/GaussianMixtureDataStream.java @@ -0,0 +1,99 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.ignite.ml.util.generators.standard; + +import java.util.ArrayList; +import java.util.List; +import java.util.stream.Stream; +import org.apache.ignite.internal.util.typedef.internal.A; +import org.apache.ignite.ml.math.functions.IgniteFunction; +import org.apache.ignite.ml.math.primitives.vector.Vector; +import org.apache.ignite.ml.structures.LabeledVector; +import org.apache.ignite.ml.util.generators.DataStreamGenerator; +import org.apache.ignite.ml.util.generators.primitives.vector.VectorGenerator; +import org.apache.ignite.ml.util.generators.primitives.vector.VectorGeneratorsFamily; + +import static org.apache.ignite.ml.util.generators.primitives.vector.VectorGeneratorPrimitives.gauss; + +/** + * Data stream generator representing gaussian mixture. + */ +public class GaussianMixtureDataStream implements DataStreamGenerator { + /** Gaussian component generators. */ + private final List<IgniteFunction<Long, VectorGenerator>> componentGenerators; + + /** Seed. */ + private long seed; + + /** + * Create an instance of GaussianMixtureDataStream. + * + * @param componentGenerators Component generators. + * @param seed Seed. 
+ */ + private GaussianMixtureDataStream(List<IgniteFunction<Long, VectorGenerator>> componentGenerators, long seed) { + this.componentGenerators = componentGenerators; + this.seed = seed; + } + + /** {@inheritDoc} */ + @Override public Stream<LabeledVector<Vector, Double>> labeled() { + VectorGeneratorsFamily.Builder builder = new VectorGeneratorsFamily.Builder(); + for (int i = 0; i < componentGenerators.size(); i++) { + builder = builder.add(componentGenerators.get(i).apply(seed), 1.0); + seed *= 2; + } + + return builder.build().asDataStream().labeled(); + } + + /** + * Builder for gaussian mixture. + */ + public static class Builder { + /** Gaussian component generators. */ + private List<IgniteFunction<Long, VectorGenerator>> componentGenerators = new ArrayList<>(); + + /** + * Adds multidimentional gaussian component. + * + * @param mean Mean value. + * @param variance Variance for each component. + */ + public Builder add(Vector mean, Vector variance) { + componentGenerators.add(seed -> gauss(mean, variance, seed)); + return this; + } + + /** + * @return GaussianMixtureDataStream instance. + */ + public GaussianMixtureDataStream build() { + return build(System.currentTimeMillis()); + } + + /** + * @param seed Seed. + * @return GaussianMixtureDataStream instance. 
+ */ + public GaussianMixtureDataStream build(long seed) { + A.notEmpty(componentGenerators, "this.means.size()"); + return new GaussianMixtureDataStream(componentGenerators, seed); + } + } +} http://git-wip-us.apache.org/repos/asf/ignite/blob/d0facb26/modules/ml/src/main/java/org/apache/ignite/ml/util/generators/standard/RegressionDataStream.java ---------------------------------------------------------------------- diff --git a/modules/ml/src/main/java/org/apache/ignite/ml/util/generators/standard/RegressionDataStream.java b/modules/ml/src/main/java/org/apache/ignite/ml/util/generators/standard/RegressionDataStream.java new file mode 100644 index 0000000..6d7291b --- /dev/null +++ b/modules/ml/src/main/java/org/apache/ignite/ml/util/generators/standard/RegressionDataStream.java @@ -0,0 +1,119 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.ignite.ml.util.generators.standard; + +import java.util.stream.Stream; +import org.apache.ignite.internal.util.typedef.internal.A; +import org.apache.ignite.ml.math.functions.IgniteFunction; +import org.apache.ignite.ml.math.primitives.vector.Vector; +import org.apache.ignite.ml.structures.LabeledVector; +import org.apache.ignite.ml.util.generators.DataStreamGenerator; +import org.apache.ignite.ml.util.generators.primitives.scalar.UniformRandomProducer; + +/** + * Represents a generator of regression data stream based on Vector->Double function where each Vector + * was produced from hypercube with sides = [minXValue, maxXValue]. + */ +public class RegressionDataStream implements DataStreamGenerator { + /** Function. */ + private final IgniteFunction<Vector, Double> function; + + /** Min x value for each dimension. */ + private final double minXVal; + + /** Max x value. */ + private final double maxXVal; + + /** Vector size. */ + private final int vectorSize; + + /** Seed. */ + private long seed; + + /** + * Creates an instance of RegressionDataStream. + * + * @param vectorSize Vector size. + * @param function Function. + * @param minXVal Min x value. + * @param maxXVal Max x value. + * @param seed Seed. + */ + private RegressionDataStream(int vectorSize, IgniteFunction<Vector, Double> function, + double minXVal, double maxXVal, long seed) { + + A.ensure(vectorSize > 0, "vectorSize > 0"); + A.ensure(minXVal <= maxXVal, "minXValue <= maxXValue"); + + this.function = function; + this.minXVal = minXVal; + this.maxXVal = maxXVal; + this.seed = seed; + this.vectorSize = vectorSize; + } + + /** + * Creates an instance of RegressionDataStream. + * + * @param vectorSize Vector size. + * @param function Function. + * @param minXVal Min x value. + * @param maxXVal Max x value. 
+ */ + public RegressionDataStream(int vectorSize, IgniteFunction<Vector, Double> function, double minXVal, + double maxXVal) { + this(vectorSize, function, minXVal, maxXVal, System.currentTimeMillis()); + } + + /** {@inheritDoc} */ + @Override public Stream<LabeledVector<Vector, Double>> labeled() { + seed *= 2; + return new UniformRandomProducer(minXVal, maxXVal, seed) + .vectorize(vectorSize).asDataStream() + .labeled(function); + } + + /** + * Creates a two-dimensional regression data stream, i.e. pairs (x, function(x)) with a single feature x. + * + * @param function Double->double function. + * @param minXVal Min x value. + * @param maxXVal Max x value. + * @return RegressionDataStream instance. + */ + public static RegressionDataStream twoDimensional(IgniteFunction<Double, Double> function, + double minXVal, double maxXVal) { + + return twoDimensional(function, minXVal, maxXVal, System.currentTimeMillis()); + } + + /** + * Creates a two-dimensional regression data stream, i.e. pairs (x, function(x)) with a single feature x. + * + * @param function Double->double function. + * @param minXVal Min x value. + * @param maxXVal Max x value. + * @param seed Seed. + * @return RegressionDataStream instance. 
+ */ + public static RegressionDataStream twoDimensional(IgniteFunction<Double, Double> function, + double minXVal, double maxXVal, long seed) { + + return new RegressionDataStream(1, v -> function.apply(v.get(0)), minXVal, maxXVal, seed); + } +} http://git-wip-us.apache.org/repos/asf/ignite/blob/d0facb26/modules/ml/src/main/java/org/apache/ignite/ml/util/generators/standard/RingsDataStream.java ---------------------------------------------------------------------- diff --git a/modules/ml/src/main/java/org/apache/ignite/ml/util/generators/standard/RingsDataStream.java b/modules/ml/src/main/java/org/apache/ignite/ml/util/generators/standard/RingsDataStream.java new file mode 100644 index 0000000..27211b0 --- /dev/null +++ b/modules/ml/src/main/java/org/apache/ignite/ml/util/generators/standard/RingsDataStream.java @@ -0,0 +1,91 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.ignite.ml.util.generators.standard; + +import java.util.stream.Stream; +import org.apache.ignite.internal.util.typedef.internal.A; +import org.apache.ignite.ml.math.primitives.vector.Vector; +import org.apache.ignite.ml.structures.LabeledVector; +import org.apache.ignite.ml.util.generators.DataStreamGenerator; +import org.apache.ignite.ml.util.generators.primitives.scalar.GaussRandomProducer; +import org.apache.ignite.ml.util.generators.primitives.vector.VectorGeneratorsFamily; + +import static org.apache.ignite.ml.util.generators.primitives.vector.VectorGeneratorPrimitives.ring; + +/** + * Represents a data stream of vectors produced by a family of ring-like distributions around zero, blurred + * by a Gaussian distribution. The first ring has radius minRadius; each next ring has radius = prev_radius + distanceBetweenRings. + */ +public class RingsDataStream implements DataStreamGenerator { + /** Count of rings. */ + private final int cntOfRings; + + /** Min radius. */ + private final double minRadius; + + /** Distance between rings. */ + private final double distanceBetweenRings; + + /** Seed. */ + private long seed; + + /** + * Create an instance of RingsDataStream. + * + * @param cntOfRings Count of rings. + * @param minRadius Min radius. + * @param distanceBetweenRings Distance between rings. + */ + public RingsDataStream(int cntOfRings, double minRadius, double distanceBetweenRings) { + this(cntOfRings, minRadius, distanceBetweenRings, System.currentTimeMillis()); + } + + /** + * Create an instance of RingsDataStream. + * + * @param cntOfRings Count of rings. + * @param minRadius Min radius. + * @param distanceBetweenRings Distance between rings. + * @param seed Seed. 
+ */ + public RingsDataStream(int cntOfRings, double minRadius, double distanceBetweenRings, long seed) { + A.ensure(cntOfRings > 0, "countOfRings > 0"); + A.ensure(minRadius > 0, "minRadius > 0"); + A.ensure(distanceBetweenRings > 0, "distanceBetweenRings > 0"); + + this.cntOfRings = cntOfRings; + this.minRadius = minRadius; + this.distanceBetweenRings = distanceBetweenRings; + this.seed = seed; + } + + /** {@inheritDoc} */ + @Override public Stream<LabeledVector<Vector, Double>> labeled() { + VectorGeneratorsFamily.Builder builder = new VectorGeneratorsFamily.Builder(); + for (int i = 0; i < cntOfRings; i++) { + final double radius = minRadius + distanceBetweenRings * i; + final double variance = 0.1 * (i + 1); + + GaussRandomProducer gauss = new GaussRandomProducer(0, variance, seed); + builder = builder.add(ring(radius, 0, 2 * Math.PI).noisify(gauss)); + seed *= 2; + } + + return builder.build().asDataStream().labeled(); + } +} http://git-wip-us.apache.org/repos/asf/ignite/blob/d0facb26/modules/ml/src/main/java/org/apache/ignite/ml/util/generators/standard/TwoSeparableClassesDataStream.java ---------------------------------------------------------------------- diff --git a/modules/ml/src/main/java/org/apache/ignite/ml/util/generators/standard/TwoSeparableClassesDataStream.java b/modules/ml/src/main/java/org/apache/ignite/ml/util/generators/standard/TwoSeparableClassesDataStream.java new file mode 100644 index 0000000..49c69b7 --- /dev/null +++ b/modules/ml/src/main/java/org/apache/ignite/ml/util/generators/standard/TwoSeparableClassesDataStream.java @@ -0,0 +1,95 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.ignite.ml.util.generators.standard; + +import java.util.stream.Stream; +import org.apache.ignite.ml.math.primitives.vector.Vector; +import org.apache.ignite.ml.structures.LabeledVector; +import org.apache.ignite.ml.util.generators.DataStreamGenerator; +import org.apache.ignite.ml.util.generators.primitives.scalar.UniformRandomProducer; + +/** + * 2D-Vectors data stream with two separable classes. + */ +public class TwoSeparableClassesDataStream implements DataStreamGenerator { + /** Margin. */ + private final double margin; + + /** Variance. */ + private final double variance; + + /** Seed. */ + private long seed; + + /** + * Create an instance of TwoSeparableClassesDataStream. Note that margin can be less than zero. + * + * @param margin Margin. + * @param variance Variance. + */ + public TwoSeparableClassesDataStream(double margin, double variance) { + this(margin, variance, System.currentTimeMillis()); + } + + /** + * Create an instance of TwoSeparableClassesDataStream. Note that margin can be less than zero. + * + * @param margin Margin. + * @param variance Variance. + * @param seed Seed. 
+ */ + public TwoSeparableClassesDataStream(double margin, double variance, long seed) { + this.margin = margin; + this.variance = variance; + this.seed = seed; + } + + /** {@inheritDoc} */ + @Override public Stream<LabeledVector<Vector, Double>> labeled() { + seed *= 2; + + double minCordVal = -variance - Math.abs(margin); + double maxCordVal = variance + Math.abs(margin); + + return new UniformRandomProducer(minCordVal, maxCordVal, seed) + .vectorize(2).asDataStream().labeled(this::classify) + .map(v -> new LabeledVector<>(applyMargin(v.features()), v.label())) + .filter(v -> between(v.features().get(0), -variance, variance)) + .filter(v -> between(v.features().get(1), -variance, variance)); + } + + /** */ + private boolean between(double x, double min, double max) { + return x >= min && x <= max; + } + + /** */ + private double classify(Vector v) { + return v.get(0) - v.get(1) > 0 ? -1.0 : 1.0; + } + + /** */ + private Vector applyMargin(Vector v) { + Vector cp = v.copy(); + + cp.set(0, cp.get(0) + Math.signum(v.get(0) - v.get(1)) * margin); + cp.set(1, cp.get(1) - Math.signum(v.get(0) - v.get(1)) * margin); + + return cp; + } +} http://git-wip-us.apache.org/repos/asf/ignite/blob/d0facb26/modules/ml/src/main/java/org/apache/ignite/ml/util/generators/standard/package-info.java ---------------------------------------------------------------------- diff --git a/modules/ml/src/main/java/org/apache/ignite/ml/util/generators/standard/package-info.java b/modules/ml/src/main/java/org/apache/ignite/ml/util/generators/standard/package-info.java new file mode 100644 index 0000000..4aaf4bf --- /dev/null +++ b/modules/ml/src/main/java/org/apache/ignite/ml/util/generators/standard/package-info.java @@ -0,0 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * <!-- Package description. --> + * Contains classes for predefined data stream generators. + */ +package org.apache.ignite.ml.util.generators.standard;