This is an automated email from the ASF dual-hosted git repository. zaleslaw pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/ignite.git
The following commit(s) were added to refs/heads/master by this push: new 4b8b7ff IGNITE-12168: [ML] Flaky ML example tests (#6866) 4b8b7ff is described below commit 4b8b7ff254db202f2f9af1130f0183057bb3f6aa Author: Alexey Zinoviev <zaleslaw....@gmail.com> AuthorDate: Fri Sep 13 22:13:51 2019 +0300 IGNITE-12168: [ML] Flaky ML example tests (#6866) --- .../apache/ignite/ml/util/MLSandboxDatasets.java | 26 ++++++------- .../org/apache/ignite/ml/util/SandboxMLCache.java | 45 ++++++++++++---------- 2 files changed, 37 insertions(+), 34 deletions(-) diff --git a/modules/ml/src/main/java/org/apache/ignite/ml/util/MLSandboxDatasets.java b/modules/ml/src/main/java/org/apache/ignite/ml/util/MLSandboxDatasets.java index 4d423e9..12bff53 100644 --- a/modules/ml/src/main/java/org/apache/ignite/ml/util/MLSandboxDatasets.java +++ b/modules/ml/src/main/java/org/apache/ignite/ml/util/MLSandboxDatasets.java @@ -22,50 +22,50 @@ package org.apache.ignite.ml.util; */ public enum MLSandboxDatasets { /** Movielens dataset with ratings. */ - MOVIELENS("ratings.csv", true, ","), + MOVIELENS("modules/ml/src/main/resources/datasets/ratings.csv", true, ","), /** The full Iris dataset from Machine Learning Repository. */ - IRIS("iris.txt", false, "\t"), + IRIS("modules/ml/src/main/resources/datasets/iris.txt", false, "\t"), /** The Titanic dataset from Kaggle competition. */ - TITANIC("titanic.csv", true, ";"), + TITANIC("modules/ml/src/main/resources/datasets/titanic.csv", true, ";"), /** The 1st and 2nd classes from the Iris dataset. */ - TWO_CLASSED_IRIS("two_classed_iris.csv", false, "\t"), + TWO_CLASSED_IRIS("modules/ml/src/main/resources/datasets/two_classed_iris.csv", false, "\t"), /** The dataset is about different computers' properties based on https://archive.ics.uci.edu/ml/datasets/Computer+Hardware. */ - CLEARED_MACHINES("cleared_machines.csv", false, ";"), + CLEARED_MACHINES("modules/ml/src/main/resources/datasets/cleared_machines.csv", false, ";"), /** * The health data is related to death rate based on; doctor availability, hospital availability, * annual per capita income, and population density people per square mile. */ - MORTALITY_DATA("mortalitydata.csv", false, ";"), + MORTALITY_DATA("modules/ml/src/main/resources/datasets/mortalitydata.csv", false, ";"), /** * The preprocessed Glass dataset from the Machine Learning Repository https://archive.ics.uci.edu/ml/datasets/Glass+Identification * There are 3 classes with labels: 1 {building_windows_float_processed}, 3 {vehicle_windows_float_processed}, 7 {headlamps}. * Feature names: 'Na-Sodium', 'Mg-Magnesium', 'Al-Aluminum', 'Ba-Barium', 'Fe-Iron'. */ - GLASS_IDENTIFICATION("glass_identification.csv", false, ";"), + GLASS_IDENTIFICATION("modules/ml/src/main/resources/datasets/glass_identification.csv", false, ";"), /** The Wine recognition data. Could be found <a href="https://archive.ics.uci.edu/ml/machine-learning-databases/wine/">here</a>. */ - WINE_RECOGNITION("wine.txt", false, ","), + WINE_RECOGNITION("modules/ml/src/main/resources/datasets/wine.txt", false, ","), /** The Boston house-prices dataset. Could be found <a href="https://archive.ics.uci.edu/ml/machine-learning-databases/housing/">here</a>. */ - BOSTON_HOUSE_PRICES("boston_housing_dataset.txt", false, ","), + BOSTON_HOUSE_PRICES("modules/ml/src/main/resources/datasets/boston_housing_dataset.txt", false, ","), /** Example from book Barber D. Bayesian reasoning and machine learning. Chapter 10. */ - ENGLISH_VS_SCOTTISH("english_vs_scottish_binary_dataset.csv", true, ","), + ENGLISH_VS_SCOTTISH("modules/ml/src/main/resources/datasets/english_vs_scottish_binary_dataset.csv", true, ","), /** Wholesale customers dataset. Could be found <a href="https://archive.ics.uci.edu/ml/datasets/Wholesale+customers">here</a>. */ - WHOLESALE_CUSTOMERS("wholesale_customers.csv", true, ","), + WHOLESALE_CUSTOMERS("modules/ml/src/main/resources/datasets/wholesale_customers.csv", true, ","), /** Fraud detection problem [part of whole dataset]. Could be found <a href="https://www.kaggle.com/mlg-ulb/creditcardfraud/">here</a>. */ - FRAUD_DETECTION("fraud_detection.csv", false, ","), + FRAUD_DETECTION("modules/ml/src/main/resources/datasets/fraud_detection.csv", false, ","), /** A dataset with discrete and continious features. */ - MIXED_DATASET("mixed_dataset.csv", true, ","); + MIXED_DATASET("modules/ml/src/main/resources/datasets/mixed_dataset.csv", true, ","); /** Filename. */ private final String filename; diff --git a/modules/ml/src/main/java/org/apache/ignite/ml/util/SandboxMLCache.java b/modules/ml/src/main/java/org/apache/ignite/ml/util/SandboxMLCache.java index 23febff..ef2e260 100644 --- a/modules/ml/src/main/java/org/apache/ignite/ml/util/SandboxMLCache.java +++ b/modules/ml/src/main/java/org/apache/ignite/ml/util/SandboxMLCache.java @@ -17,6 +17,7 @@ package org.apache.ignite.ml.util; +import java.io.File; import java.io.FileNotFoundException; import java.io.IOException; import java.nio.file.Paths; @@ -31,13 +32,10 @@ import org.apache.ignite.Ignite; import org.apache.ignite.IgniteCache; import org.apache.ignite.cache.affinity.rendezvous.RendezvousAffinityFunction; import org.apache.ignite.configuration.CacheConfiguration; -import org.apache.ignite.internal.util.typedef.internal.A; +import org.apache.ignite.internal.util.IgniteUtils; import org.apache.ignite.ml.math.exceptions.knn.FileParsingException; import org.apache.ignite.ml.math.primitives.vector.Vector; import org.apache.ignite.ml.math.primitives.vector.VectorUtils; -import org.springframework.core.io.Resource; -import org.springframework.core.io.support.PathMatchingResourcePatternResolver; -import org.springframework.core.io.support.ResourcePatternResolver; /** * Common utility code used in some ML examples to set up test cache. @@ -46,10 +44,6 @@ public class SandboxMLCache { /** */ private final Ignite ignite; - /** Resource resolver. */ - private static final ResourcePatternResolver RESOURCE_RESOLVER = - new PathMatchingResourcePatternResolver(SandboxMLCache.class.getClassLoader()); - /** */ public SandboxMLCache(Ignite ignite) { this.ignite = ignite; @@ -74,6 +68,7 @@ public class SandboxMLCache { return cache; } + /** * Loads dataset as a list of rows. * @@ -84,10 +79,15 @@ public class SandboxMLCache { public List<String> loadDataset(MLSandboxDatasets dataset) throws IOException { List<String> res = new ArrayList<>(); - Resource[] resources = RESOURCE_RESOLVER.getResources("classpath*:*/" + dataset.getFileName()); - A.ensure(resources.length == 1, "Cannot find resource"); + String fileName = dataset.getFileName(); + + File file = IgniteUtils.resolveIgnitePath(fileName); + + if (file == null) + throw new FileNotFoundException(fileName); + + Scanner scanner = new Scanner(file); - Scanner scanner = new Scanner(resources[0].getInputStream()); if (dataset.hasHeader() && scanner.hasNextLine()) scanner.nextLine(); @@ -99,6 +99,7 @@ public class SandboxMLCache { return res; } + /** * Fills cache with data and returns it. * @@ -106,19 +107,23 @@ public class SandboxMLCache { * @return Filled Ignite Cache. * @throws FileNotFoundException If file not found. */ - public IgniteCache<Integer, Vector> fillCacheWith(MLSandboxDatasets dataset) throws IOException { + public IgniteCache<Integer, Vector> fillCacheWith(MLSandboxDatasets dataset) throws FileNotFoundException { + IgniteCache<Integer, Vector> cache = getCache(); String fileName = dataset.getFileName(); - Resource[] resources = RESOURCE_RESOLVER.getResources("classpath*:*/" + fileName); - A.ensure(resources.length == 1, "Cannot find resource"); - Scanner scanner = new Scanner(resources[0].getInputStream()); + File file = IgniteUtils.resolveIgnitePath(fileName); + + if (file == null) + throw new FileNotFoundException(fileName); + + Scanner scanner = new Scanner(file); int cnt = 0; while (scanner.hasNextLine()) { String row = scanner.nextLine(); - if (dataset.hasHeader() && cnt == 0) { + if(dataset.hasHeader() && cnt == 0) { cnt++; continue; } @@ -129,11 +134,9 @@ public class SandboxMLCache { NumberFormat format = NumberFormat.getInstance(Locale.FRANCE); for (int i = 0; i < cells.length; i++) - try { - if (cells[i].equals("")) - data[i] = Double.NaN; - else - data[i] = Double.valueOf(cells[i]); + try{ + if(cells[i].equals("")) data[i] = Double.NaN; + else data[i] = Double.valueOf(cells[i]); } catch (java.lang.NumberFormatException e) { try { data[i] = format.parse(cells[i]).doubleValue();