Repository: incubator-reef Updated Branches: refs/heads/master 4a9796792 -> 9b16d54cd
[REEF-613] DataLoadingRequestBuilder should use the desired number of input splits defined by the user. We are currently overriding the numberOfDesiredInputSplits defined by the user with 0 when they invoke the DataLoadingRequestBuilder.setInputPath method. This change allows the DataLoadingRequestBuilder.setInputPath and DataLoadingRequestBuilder.setNumberOfDesiredSplits methods to be used together safely. We now set the correct number of desired splits on the DistributedDataSet object we create. This also un-deprecates some of the API to avoid the overhead of creating the multi data center strategy, and throws an exception if the setInputPath and setDistributedDataSet methods are both called (in any order). JIRA: [REEF-613](https://issues.apache.org/jira/browse/REEF-613) Pull Request: Closes #391 Project: http://git-wip-us.apache.org/repos/asf/incubator-reef/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-reef/commit/9b16d54c Tree: http://git-wip-us.apache.org/repos/asf/incubator-reef/tree/9b16d54c Diff: http://git-wip-us.apache.org/repos/asf/incubator-reef/diff/9b16d54c Branch: refs/heads/master Commit: 9b16d54cdef4c68e580e3dc0625ef9fc9043cfdf Parents: 4a97967 Author: Ignacio Cano <[email protected]> Authored: Thu Aug 20 11:34:46 2015 -0700 Committer: Jason (Joo Seong) Jeong <[email protected]> Committed: Tue Aug 25 17:48:45 2015 +0900 ---------------------------------------------------------------------- .../loading/api/DataLoadingRequestBuilder.java | 51 ++++++++++++++------ 1 file changed, 36 insertions(+), 15 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-reef/blob/9b16d54c/lang/java/reef-io/src/main/java/org/apache/reef/io/data/loading/api/DataLoadingRequestBuilder.java ---------------------------------------------------------------------- diff --git a/lang/java/reef-io/src/main/java/org/apache/reef/io/data/loading/api/DataLoadingRequestBuilder.java 
b/lang/java/reef-io/src/main/java/org/apache/reef/io/data/loading/api/DataLoadingRequestBuilder.java index 79275a0..46085fe 100644 --- a/lang/java/reef-io/src/main/java/org/apache/reef/io/data/loading/api/DataLoadingRequestBuilder.java +++ b/lang/java/reef-io/src/main/java/org/apache/reef/io/data/loading/api/DataLoadingRequestBuilder.java @@ -83,6 +83,11 @@ public final class DataLoadingRequestBuilder */ private DistributedDataSet distributedDataSet; + /** + * The input path of the data to be loaded. + */ + private String inputPath; + public DataLoadingRequestBuilder setNumberOfDesiredSplits(final int numberOfDesiredSplits) { this.numberOfDesiredSplits = numberOfDesiredSplits; return this; @@ -205,29 +210,23 @@ public final class DataLoadingRequestBuilder } /** - * Sets the path of the folder where the data is. Internally it constructs a - * distributed data set with one partition, no splits and the data can be - * loaded from anywhere. + * Sets the path of the folder where the data is. + * Internally, a distributed dataset with a unique partition is created, + * and {@link SingleDataCenterEvaluatorToPartitionStrategy} is binded. * - * @deprecated since 0.12. Should use instead - * {@link DataLoadingRequestBuilder#setDistributedDataSet(DistributedDataSet)} * @param inputPath * the input path * @return this */ - @Deprecated public DataLoadingRequestBuilder setInputPath(final String inputPath) { - final DistributedDataSet dds = new DistributedDataSet(); - dds.addPartition(DistributedDataSetPartition.newBuilder().setPath(inputPath) - .setLocation(DistributedDataSetPartition.LOAD_INTO_ANY_LOCATION) - .setDesiredSplits(Integer.valueOf(NumberOfDesiredSplits.DEFAULT_DESIRED_SPLITS)).build()); + this.inputPath = inputPath; this.singleDataCenterStrategy = true; - this.distributedDataSet = dds; return this; } /** * Sets the distributed data set. + * Internally, a {@link MultiDataCenterEvaluatorToPartitionStrategy} is binded. 
* * @param distributedDataSet * the distributed data set @@ -245,6 +244,32 @@ public final class DataLoadingRequestBuilder throw new BindException("Driver Configuration Module is a required parameter."); } + // need to create the distributed data set + if (this.singleDataCenterStrategy) { + if (this.inputPath == null) { + throw new BindException("Should specify an input path."); + } + if (this.distributedDataSet != null && !this.distributedDataSet.isEmpty()) { + throw new BindException("You should either call setInputPath or setDistributedDataSet, but not both"); + } + // Create a distributed data set with one partition, the splits defined by + // the user if greater than 0 or no splits, and data to be loaded from + // anywhere. + final DistributedDataSet dds = new DistributedDataSet(); + dds.addPartition(DistributedDataSetPartition + .newBuilder() + .setPath(inputPath) + .setLocation(DistributedDataSetPartition.LOAD_INTO_ANY_LOCATION) + .setDesiredSplits( + numberOfDesiredSplits > 0 ? numberOfDesiredSplits : Integer + .valueOf(NumberOfDesiredSplits.DEFAULT_DESIRED_SPLITS)).build()); + this.distributedDataSet = dds; + } else { + if (this.inputPath != null) { + throw new BindException("You should either call setInputPath or setDistributedDataSet, but not both"); + } + } + if (this.distributedDataSet == null || this.distributedDataSet.isEmpty()) { throw new BindException("Distributed Data Set is a required parameter."); } @@ -270,10 +295,6 @@ public final class DataLoadingRequestBuilder final JavaConfigurationBuilder jcb = Tang.Factory.getTang().newConfigurationBuilder(driverConfiguration); - if (this.numberOfDesiredSplits > 0) { - jcb.bindNamedParameter(NumberOfDesiredSplits.class, "" + this.numberOfDesiredSplits); - } - // if empty, then the user code still uses the deprecated fields. // we create a dataLoadRequest object based on them (or their default values) if (this.dataRequests.isEmpty()) {
