IGNITE-6495: performance measurement of decision trees algorithms this closes #3213
Project: http://git-wip-us.apache.org/repos/asf/ignite/repo Commit: http://git-wip-us.apache.org/repos/asf/ignite/commit/a0516f9e Tree: http://git-wip-us.apache.org/repos/asf/ignite/tree/a0516f9e Diff: http://git-wip-us.apache.org/repos/asf/ignite/diff/a0516f9e Branch: refs/heads/ignite-zk-ce Commit: a0516f9e8ccb39125de67e5196b943ea6d55f7df Parents: 47e7daf Author: Oleg Ignatenko <[email protected]> Authored: Wed Dec 13 20:10:29 2017 +0300 Committer: Yury Babak <[email protected]> Committed: Wed Dec 13 20:10:29 2017 +0300 ---------------------------------------------------------------------- .../yardstick/config/benchmark-ml.properties | 2 + modules/yardstick/pom-standalone.xml | 12 + modules/yardstick/pom.xml | 12 + .../IgniteColumnDecisionTreeGiniBenchmark.java | 70 +++ ...niteColumnDecisionTreeVarianceBenchmark.java | 71 ++++ .../yardstick/ml/trees/SplitDataGenerator.java | 426 +++++++++++++++++++ .../ignite/yardstick/ml/trees/package-info.java | 22 + 7 files changed, 615 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/ignite/blob/a0516f9e/modules/yardstick/config/benchmark-ml.properties ---------------------------------------------------------------------- diff --git a/modules/yardstick/config/benchmark-ml.properties b/modules/yardstick/config/benchmark-ml.properties index dbf90dd..4867c0f 100644 --- a/modules/yardstick/config/benchmark-ml.properties +++ b/modules/yardstick/config/benchmark-ml.properties @@ -85,4 +85,6 @@ CONFIGS="\ -cfg ${SCRIPT_DIR}/../config/ignite-localhost-config.xml -nn ${nodesNum} -b ${b} -w ${w} -d ${d} -t ${t} -sm ${sm} -dn IgniteDenseLocalOffHeapMatrixMulBenchmark -sn IgniteNode -ds ${ver}dense-local-offheap-matrix-mul-${b}-backup,\ -cfg ${SCRIPT_DIR}/../config/ignite-localhost-config.xml -nn ${nodesNum} -b ${b} -w ${w} -d ${d} -t ${t} -sm ${sm} -dn IgniteSparseLocalMatrixMulBenchmark -sn IgniteNode -ds ${ver}sparse-local-matrix-mul-${b}-backup,\ -cfg ${SCRIPT_DIR}/../config/ignite-localhost-config.xml -nn ${nodesNum} -b ${b} -w ${w} -d ${d} -t ${t} -sm ${sm} -dn IgniteSparseBlockDistributedMatrixMulBenchmark -sn IgniteNode -ds ${ver}sparse-block-distributed-matrix-mul-${b}-backup,\ +-cfg ${SCRIPT_DIR}/../config/ignite-localhost-config.xml -nn ${nodesNum} -b ${b} -w ${w} -d ${d} -t ${t} -sm ${sm} -dn IgniteColumnDecisionTreeVarianceBenchmark -sn IgniteNode -ds ${ver}column-decision-tree-variance-${b}-backup,\ +-cfg ${SCRIPT_DIR}/../config/ignite-localhost-config.xml -nn ${nodesNum} -b ${b} -w ${w} -d ${d} -t ${t} -sm ${sm} -dn IgniteColumnDecisionTreeGiniBenchmark -sn IgniteNode -ds ${ver}column-decision-tree-gini-${b}-backup,\ " http://git-wip-us.apache.org/repos/asf/ignite/blob/a0516f9e/modules/yardstick/pom-standalone.xml ---------------------------------------------------------------------- diff --git a/modules/yardstick/pom-standalone.xml b/modules/yardstick/pom-standalone.xml index 78569a3..58920b9 100644 --- a/modules/yardstick/pom-standalone.xml +++ b/modules/yardstick/pom-standalone.xml @@ -123,6 +123,18 @@ <version>${project.version}</version> </dependency> </dependencies> + + <build> + <plugins> + <plugin> + <artifactId>maven-compiler-plugin</artifactId> + <configuration> + <source>1.8</source> + <target>1.8</target> + </configuration> + </plugin> + </plugins> + </build> </profile> </profiles> http://git-wip-us.apache.org/repos/asf/ignite/blob/a0516f9e/modules/yardstick/pom.xml ---------------------------------------------------------------------- diff --git a/modules/yardstick/pom.xml b/modules/yardstick/pom.xml index 6a98584..8dda6a7 100644 --- a/modules/yardstick/pom.xml +++ b/modules/yardstick/pom.xml @@ -143,6 +143,18 @@ <version>${project.version}</version> </dependency> </dependencies> + + <build> + <plugins> + <plugin> + <artifactId>maven-compiler-plugin</artifactId> + <configuration> + <source>1.8</source> + <target>1.8</target> + </configuration> + </plugin> + </plugins> + </build> </profile> </profiles> http://git-wip-us.apache.org/repos/asf/ignite/blob/a0516f9e/modules/yardstick/src/main/ml/org/apache/ignite/yardstick/ml/trees/IgniteColumnDecisionTreeGiniBenchmark.java ---------------------------------------------------------------------- diff --git a/modules/yardstick/src/main/ml/org/apache/ignite/yardstick/ml/trees/IgniteColumnDecisionTreeGiniBenchmark.java b/modules/yardstick/src/main/ml/org/apache/ignite/yardstick/ml/trees/IgniteColumnDecisionTreeGiniBenchmark.java new file mode 100644 index 0000000..f8a7c08 --- /dev/null +++ b/modules/yardstick/src/main/ml/org/apache/ignite/yardstick/ml/trees/IgniteColumnDecisionTreeGiniBenchmark.java @@ -0,0 +1,70 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.ignite.yardstick.ml.trees; + +import java.util.HashMap; +import java.util.Map; +import org.apache.ignite.Ignite; +import org.apache.ignite.ml.math.impls.vector.DenseLocalOnHeapVector; +import org.apache.ignite.ml.trees.trainers.columnbased.contsplitcalcs.ContinuousSplitCalculators; +import org.apache.ignite.ml.trees.trainers.columnbased.regcalcs.RegionCalculators; +import org.apache.ignite.resources.IgniteInstanceResource; +import org.apache.ignite.thread.IgniteThread; +import org.apache.ignite.yardstick.IgniteAbstractBenchmark; + +/** + * Ignite benchmark that performs ML Grid operations. + */ +@SuppressWarnings("unused") +public class IgniteColumnDecisionTreeGiniBenchmark extends IgniteAbstractBenchmark { + /** */ + @IgniteInstanceResource + private Ignite ignite; + + /** {@inheritDoc} */ + @Override public boolean test(Map<Object, Object> ctx) throws Exception { + // Create IgniteThread, we must work with SparseDistributedMatrix inside IgniteThread + // because we create ignite cache internally. + IgniteThread igniteThread = new IgniteThread(ignite.configuration().getIgniteInstanceName(), + this.getClass().getSimpleName(), new Runnable() { + /** {@inheritDoc} */ + @Override public void run() { + // IMPL NOTE originally taken from ColumnDecisionTreeTrainerTest#testCacheMixedGini + int totalPts = 1 << 10; + int featCnt = 2; + + HashMap<Integer, Integer> catsInfo = new HashMap<>(); + catsInfo.put(1, 3); + + SplitDataGenerator<DenseLocalOnHeapVector> gen = new SplitDataGenerator<>( + featCnt, catsInfo, () -> new DenseLocalOnHeapVector(featCnt + 1)). + split(0, 1, new int[] {0, 2}). + split(1, 0, -10.0); + + gen.testByGen(totalPts, ContinuousSplitCalculators.GINI.apply(ignite), + RegionCalculators.GINI, RegionCalculators.MEAN, ignite); + } + }); + + igniteThread.start(); + + igniteThread.join(); + + return true; + } +} http://git-wip-us.apache.org/repos/asf/ignite/blob/a0516f9e/modules/yardstick/src/main/ml/org/apache/ignite/yardstick/ml/trees/IgniteColumnDecisionTreeVarianceBenchmark.java ---------------------------------------------------------------------- diff --git a/modules/yardstick/src/main/ml/org/apache/ignite/yardstick/ml/trees/IgniteColumnDecisionTreeVarianceBenchmark.java b/modules/yardstick/src/main/ml/org/apache/ignite/yardstick/ml/trees/IgniteColumnDecisionTreeVarianceBenchmark.java new file mode 100644 index 0000000..f9d417f --- /dev/null +++ b/modules/yardstick/src/main/ml/org/apache/ignite/yardstick/ml/trees/IgniteColumnDecisionTreeVarianceBenchmark.java @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.ignite.yardstick.ml.trees; + +import java.util.HashMap; +import java.util.Map; +import org.apache.ignite.Ignite; +import org.apache.ignite.ml.math.impls.vector.DenseLocalOnHeapVector; +import org.apache.ignite.ml.trees.trainers.columnbased.contsplitcalcs.ContinuousSplitCalculators; +import org.apache.ignite.ml.trees.trainers.columnbased.regcalcs.RegionCalculators; +import org.apache.ignite.resources.IgniteInstanceResource; +import org.apache.ignite.thread.IgniteThread; +import org.apache.ignite.yardstick.IgniteAbstractBenchmark; + +/** + * Ignite benchmark that performs ML Grid operations. + */ +@SuppressWarnings("unused") +public class IgniteColumnDecisionTreeVarianceBenchmark extends IgniteAbstractBenchmark { + /** */ + @IgniteInstanceResource + private Ignite ignite; + + /** {@inheritDoc} */ + @Override public boolean test(Map<Object, Object> ctx) throws Exception { + // Create IgniteThread, we must work with SparseDistributedMatrix inside IgniteThread + // because we create ignite cache internally. + IgniteThread igniteThread = new IgniteThread(ignite.configuration().getIgniteInstanceName(), + this.getClass().getSimpleName(), new Runnable() { + /** {@inheritDoc} */ + @Override public void run() { + // IMPL NOTE originally taken from ColumnDecisionTreeTrainerTest#testCacheMixed + int totalPts = 1 << 10; + int featCnt = 2; + + HashMap<Integer, Integer> catsInfo = new HashMap<>(); + catsInfo.put(1, 3); + + SplitDataGenerator<DenseLocalOnHeapVector> gen + = new SplitDataGenerator<>( + featCnt, catsInfo, () -> new DenseLocalOnHeapVector(featCnt + 1)). + split(0, 1, new int[] {0, 2}). + split(1, 0, -10.0); + + gen.testByGen(totalPts, + ContinuousSplitCalculators.VARIANCE, RegionCalculators.VARIANCE, RegionCalculators.MEAN, ignite); + } + }); + + igniteThread.start(); + + igniteThread.join(); + + return true; + } +} http://git-wip-us.apache.org/repos/asf/ignite/blob/a0516f9e/modules/yardstick/src/main/ml/org/apache/ignite/yardstick/ml/trees/SplitDataGenerator.java ---------------------------------------------------------------------- diff --git a/modules/yardstick/src/main/ml/org/apache/ignite/yardstick/ml/trees/SplitDataGenerator.java b/modules/yardstick/src/main/ml/org/apache/ignite/yardstick/ml/trees/SplitDataGenerator.java new file mode 100644 index 0000000..f530300 --- /dev/null +++ b/modules/yardstick/src/main/ml/org/apache/ignite/yardstick/ml/trees/SplitDataGenerator.java @@ -0,0 +1,426 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.ignite.yardstick.ml.trees; + +import java.io.Serializable; +import java.util.Arrays; +import java.util.BitSet; +import java.util.Collections; +import java.util.HashMap; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Random; +import java.util.function.BiFunction; +import java.util.function.Function; +import java.util.function.Supplier; +import java.util.stream.Collectors; +import java.util.stream.DoubleStream; +import java.util.stream.IntStream; +import java.util.stream.Stream; +import org.apache.ignite.Ignite; +import org.apache.ignite.lang.IgniteBiTuple; +import org.apache.ignite.ml.math.StorageConstants; +import org.apache.ignite.ml.math.Vector; +import org.apache.ignite.ml.math.exceptions.MathIllegalArgumentException; +import org.apache.ignite.ml.math.functions.IgniteFunction; +import org.apache.ignite.ml.math.impls.matrix.SparseDistributedMatrix; +import org.apache.ignite.ml.math.impls.vector.DenseLocalOnHeapVector; +import org.apache.ignite.ml.structures.LabeledVectorDouble; +import org.apache.ignite.ml.trees.ContinuousRegionInfo; +import org.apache.ignite.ml.trees.ContinuousSplitCalculator; +import org.apache.ignite.ml.trees.models.DecisionTreeModel; +import org.apache.ignite.ml.trees.trainers.columnbased.ColumnDecisionTreeTrainer; +import org.apache.ignite.ml.trees.trainers.columnbased.ColumnDecisionTreeTrainerInput; +import org.apache.ignite.ml.trees.trainers.columnbased.MatrixColumnDecisionTreeTrainerInput; +import org.apache.ignite.ml.util.Utils; + +/** */ +class SplitDataGenerator<V extends Vector> { + /** */ + private static final Random rnd = new Random(12349L); + + /** */ + private static final double DELTA = 100.0; + + /** Map of the form of (is categorical -> list of region indexes). */ + private final Map<Boolean, List<Integer>> di; + + /** List of regions. */ + private final List<Region> regs; + + /** Data of bounds of regions. */ + private final Map<Integer, IgniteBiTuple<Double, Double>> boundsData; + + /** */ + private final Map<Integer, Integer> catFeaturesInfo; + + /** Supplier of vectors. */ + private final Supplier<V> supplier; + + /** Features count. */ + private final int featCnt; + + /** + * Create SplitDataGenerator. + * + * @param featCnt Features count. + * @param catFeaturesInfo Information about categorical features in form of map (feature index -> categories + * count). + * @param supplier Supplier of vectors. + */ + SplitDataGenerator(int featCnt, Map<Integer, Integer> catFeaturesInfo, Supplier<V> supplier) { + regs = new LinkedList<>(); + boundsData = new HashMap<>(); + this.supplier = supplier; + this.featCnt = featCnt; + this.catFeaturesInfo = catFeaturesInfo; + + // Divide indexes into indexes of categorical coordinates and indexes of continuous coordinates. + di = IntStream.range(0, featCnt). + boxed(). + collect(Collectors.partitioningBy(catFeaturesInfo::containsKey)); + + // Categorical coordinates info. + Map<Integer, CatCoordInfo> catCoords = new HashMap<>(); + di.get(true).forEach(i -> { + BitSet bs = new BitSet(); + bs.set(0, catFeaturesInfo.get(i)); + catCoords.put(i, new CatCoordInfo(bs)); + }); + + // Continuous coordinates info. + Map<Integer, ContCoordInfo> contCoords = new HashMap<>(); + di.get(false).forEach(i -> { + contCoords.put(i, new ContCoordInfo()); + boundsData.put(i, new IgniteBiTuple<>(-1.0, 1.0)); + }); + + Region firstReg = new Region(catCoords, contCoords, 0); + regs.add(firstReg); + } + + /** */ + <D extends ContinuousRegionInfo> void testByGen(int totalPts, + IgniteFunction<ColumnDecisionTreeTrainerInput, ? extends ContinuousSplitCalculator<D>> calc, + IgniteFunction<ColumnDecisionTreeTrainerInput, IgniteFunction<DoubleStream, Double>> catImpCalc, + IgniteFunction<DoubleStream, Double> regCalc, Ignite ignite) { + + List<IgniteBiTuple<Integer, V>> lst = points(totalPts, (i, rn) -> i).collect(Collectors.toList()); + + Collections.shuffle(lst, rnd); + + SparseDistributedMatrix m = new SparseDistributedMatrix(totalPts, + featCnt + 1, StorageConstants.COLUMN_STORAGE_MODE, StorageConstants.RANDOM_ACCESS_MODE); + + Map<Integer, List<LabeledVectorDouble>> byRegion = new HashMap<>(); + + int i = 0; + for (IgniteBiTuple<Integer, V> bt : lst) { + byRegion.putIfAbsent(bt.get1(), new LinkedList<>()); + byRegion.get(bt.get1()).add(asLabeledVector(bt.get2().getStorage().data())); + m.setRow(i, bt.get2().getStorage().data()); + i++; + } + + ColumnDecisionTreeTrainer<D> trainer = + new ColumnDecisionTreeTrainer<>(3, calc, catImpCalc, regCalc, ignite); + + DecisionTreeModel mdl = trainer.train(new MatrixColumnDecisionTreeTrainerInput(m, catFeaturesInfo)); + + byRegion.keySet().forEach(k -> mdl.predict(byRegion.get(k).get(0).features())); + } + + /** + * Split region by continuous coordinate using given threshold. + * + * @param regIdx Region index. + * @param coordIdx Coordinate index. + * @param threshold Threshold. + * @return {@code this}. + */ + SplitDataGenerator<V> split(int regIdx, int coordIdx, double threshold) { + Region regToSplit = regs.get(regIdx); + ContCoordInfo cci = regToSplit.contCoords.get(coordIdx); + + double left = cci.left; + double right = cci.right; + + if (threshold < left || threshold > right) + throw new MathIllegalArgumentException("Threshold is out of region bounds."); + + regToSplit.incTwoPow(); + + Region newReg = Utils.copy(regToSplit); + newReg.contCoords.get(coordIdx).left = threshold; + + regs.add(regIdx + 1, newReg); + cci.right = threshold; + + IgniteBiTuple<Double, Double> bounds = boundsData.get(coordIdx); + double min = bounds.get1(); + double max = bounds.get2(); + boundsData.put(coordIdx, new IgniteBiTuple<>(Math.min(threshold, min), Math.max(max, threshold))); + + return this; + } + + /** + * Split region by categorical coordinate. + * + * @param regIdx Region index. + * @param coordIdx Coordinate index. + * @param cats Categories allowed for the left sub region. + * @return {@code this}. + */ + SplitDataGenerator<V> split(int regIdx, int coordIdx, int[] cats) { + BitSet subset = new BitSet(); + Arrays.stream(cats).forEach(subset::set); + Region regToSplit = regs.get(regIdx); + CatCoordInfo cci = regToSplit.catCoords.get(coordIdx); + + BitSet ssc = (BitSet)subset.clone(); + BitSet set = cci.bs; + ssc.and(set); + if (ssc.length() != subset.length()) + throw new MathIllegalArgumentException("Splitter set is not a subset of a parent subset."); + + ssc.xor(set); + set.and(subset); + + regToSplit.incTwoPow(); + Region newReg = Utils.copy(regToSplit); + newReg.catCoords.put(coordIdx, new CatCoordInfo(ssc)); + + regs.add(regIdx + 1, newReg); + + return this; + } + + /** + * Get stream of points generated by this generator. + * + * @param ptsCnt Points count. + */ + private Stream<IgniteBiTuple<Integer, V>> points(int ptsCnt, BiFunction<Double, Random, Double> f) { + return IntStream.range(0, regs.size()). + boxed(). + map(i -> regs.get(i).generatePoints(ptsCnt, f.apply((double)i, rnd), boundsData, di, supplier, rnd) + .map(v -> new IgniteBiTuple<>(i, v))).flatMap(Function.identity()); + } + + /** + * Convert double array to {@link LabeledVectorDouble} + * + * @param arr Array for conversion. + * @return LabeledVectorDouble. + */ + private static LabeledVectorDouble<DenseLocalOnHeapVector> asLabeledVector(double arr[]) { + return new LabeledVectorDouble<>(new DenseLocalOnHeapVector( + Arrays.copyOf(arr, arr.length - 1)), arr[arr.length - 1]); + } + + /** + * Categorical coordinate info. + */ + private static class CatCoordInfo implements Serializable { + /** + * Defines categories which are included in this region + */ + private final BitSet bs; + + /** + * Construct CatCoordInfo. + * + * @param bs Bitset. + */ + CatCoordInfo(BitSet bs) { + this.bs = bs; + } + + /** {@inheritDoc} */ + @Override public String toString() { + return "CatCoordInfo [" + + "bs=" + bs + + ']'; + } + } + + /** + * Continuous coordinate info. + */ + private static class ContCoordInfo implements Serializable { + /** + * Left (min) bound of region. + */ + private double left; + + /** + * Right (max) bound of region. + */ + private double right; + + /** + * Construct ContCoordInfo. + */ + ContCoordInfo() { + left = Double.NEGATIVE_INFINITY; + right = Double.POSITIVE_INFINITY; + } + + /** {@inheritDoc} */ + @Override public String toString() { + return "ContCoordInfo [" + + "left=" + left + + ", right=" + right + + ']'; + } + } + + /** + * Class representing information about region. + */ + private static class Region implements Serializable { + /** + * Information about categorical coordinates restrictions of this region in form of + * (coordinate index -> restriction) + */ + private final Map<Integer, CatCoordInfo> catCoords; + + /** + * Information about continuous coordinates restrictions of this region in form of + * (coordinate index -> restriction) + */ + private final Map<Integer, ContCoordInfo> contCoords; + + /** + * Region should contain {@code 1/2^twoPow * totalPoints} points. + */ + private int twoPow; + + /** + * Construct region by information about restrictions on coordinates (features) values. + * + * @param catCoords Restrictions on categorical coordinates. + * @param contCoords Restrictions on continuous coordinates + * @param twoPow Region should contain {@code 1/2^twoPow * totalPoints} points. + */ + Region(Map<Integer, CatCoordInfo> catCoords, Map<Integer, ContCoordInfo> contCoords, int twoPow) { + this.catCoords = catCoords; + this.contCoords = contCoords; + this.twoPow = twoPow; + } + + /** */ + int divideBy() { + return 1 << twoPow; + } + + /** */ + void incTwoPow() { + twoPow++; + } + + /** {@inheritDoc} */ + @Override public String toString() { + return "Region [" + + "catCoords=" + catCoords + + ", contCoords=" + contCoords + + ", twoPow=" + twoPow + + ']'; + } + + /** + * Generate continuous coordinate for this region. + * + * @param coordIdx Coordinate index. + * @param boundsData Data with bounds + * @param rnd Random numbers generator. + * @return Categorical coordinate value. + */ + double generateContCoord(int coordIdx, Map<Integer, IgniteBiTuple<Double, Double>> boundsData, + Random rnd) { + ContCoordInfo cci = contCoords.get(coordIdx); + double left = cci.left; + double right = cci.right; + + if (left == Double.NEGATIVE_INFINITY) + left = boundsData.get(coordIdx).get1() - DELTA; + + if (right == Double.POSITIVE_INFINITY) + right = boundsData.get(coordIdx).get2() + DELTA; + + double size = right - left; + + return left + rnd.nextDouble() * size; + } + + /** + * Generate categorical coordinate value for this region. + * + * @param coordIdx Coordinate index. + * @param rnd Random numbers generator. + * @return Categorical coordinate value. + */ + double generateCatCoord(int coordIdx, Random rnd) { + // Pick random bit. + BitSet bs = catCoords.get(coordIdx).bs; + int j = rnd.nextInt(bs.length()); + + int i = 0; + int bn = 0; + int bnp = 0; + + while ((bn = bs.nextSetBit(bn)) != -1 && i <= j) { + i++; + bnp = bn; + bn++; + } + + return bnp; + } + + /** + * Generate points for this region. + * + * @param ptsCnt Count of points to generate. + * @param val Label for all points in this region. + * @param boundsData Data about bounds of continuous coordinates. + * @param catCont Data about which categories can be in this region in the form (coordinate index -> list of + * categories indexes). + * @param s Vectors supplier. + * @param rnd Random numbers generator. + * @param <V> Type of vectors. + * @return Stream of generated points for this region. + */ + <V extends Vector> Stream<V> generatePoints(int ptsCnt, double val, + Map<Integer, IgniteBiTuple<Double, Double>> boundsData, Map<Boolean, List<Integer>> catCont, + Supplier<V> s, + Random rnd) { + return IntStream.range(0, ptsCnt / divideBy()).mapToObj(i -> { + V v = s.get(); + int coordsCnt = v.size(); + catCont.get(false).forEach(ci -> v.setX(ci, generateContCoord(ci, boundsData, rnd))); + catCont.get(true).forEach(ci -> v.setX(ci, generateCatCoord(ci, rnd))); + + v.setX(coordsCnt - 1, val); + return v; + }); + } + } +} http://git-wip-us.apache.org/repos/asf/ignite/blob/a0516f9e/modules/yardstick/src/main/ml/org/apache/ignite/yardstick/ml/trees/package-info.java ---------------------------------------------------------------------- diff --git a/modules/yardstick/src/main/ml/org/apache/ignite/yardstick/ml/trees/package-info.java b/modules/yardstick/src/main/ml/org/apache/ignite/yardstick/ml/trees/package-info.java new file mode 100644 index 0000000..fc379a6 --- /dev/null +++ b/modules/yardstick/src/main/ml/org/apache/ignite/yardstick/ml/trees/package-info.java @@ -0,0 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * <!-- Package description. --> + * ML Grid decision tree benchmarks. + */ +package org.apache.ignite.yardstick.ml.trees; \ No newline at end of file
