[jira] [Work logged] (BEAM-3246) BigtableIO should merge splits if they exceed 15K
[ https://issues.apache.org/jira/browse/BEAM-3246?focusedWorklogId=101242=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-101242 ] ASF GitHub Bot logged work on BEAM-3246: Author: ASF GitHub Bot Created on: 11/May/18 18:08 Start Date: 11/May/18 18:08 Worklog Time Spent: 10m Work Description: chamikaramj closed pull request #4517: [BEAM-3246] Bigtable: Merge splits if they exceed 15K URL: https://github.com/apache/beam/pull/4517 This is a PR merged from a forked repository. As GitHub hides the original diff on merge, it is displayed below for the sake of provenance: As this is a foreign pull request (from a fork), the diff is supplied below (as it won't show otherwise due to GitHub magic): diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableIO.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableIO.java index 4e602699e58..2e67648bb58 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableIO.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableIO.java @@ -34,6 +34,7 @@ import com.google.common.collect.Lists; import com.google.protobuf.ByteString; import java.io.IOException; +import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.Iterator; @@ -836,6 +837,8 @@ protected BigtableSource withEstimatedSizeBytes(Long estimatedSizeBytes) { return config.getBigtableService(pipelineOptions).getSampleRowKeys(this); } +private static final long MAX_SPLIT_COUNT = 15_360L; + @Override public List split( long desiredBundleSizeBytes, PipelineOptions options) throws Exception { @@ -847,7 +850,103 @@ protected BigtableSource withEstimatedSizeBytes(Long estimatedSizeBytes) { Math.max(sizeEstimate / maximumNumberOfSplits, desiredBundleSizeBytes); // Delegate to testable helper. - return splitBasedOnSamples(desiredBundleSizeBytes, getSampleRowKeys(options)); + List splits = + splitBasedOnSamples(desiredBundleSizeBytes, getSampleRowKeys(options)); + return reduceSplits(splits, options, MAX_SPLIT_COUNT); +} + +@VisibleForTesting +protected List reduceSplits( +List splits, PipelineOptions options, long maxSplitCounts) +throws IOException { + int numberToCombine = + (int) ((splits.size() + maxSplitCounts - 1) / maxSplitCounts); + if (splits.size() < maxSplitCounts || numberToCombine < 2) { +return splits; + } + ImmutableList.Builder reducedSplits = ImmutableList.builder(); + List previousSourceRanges = new ArrayList(); + int counter = 0; + long size = 0; + for (BigtableSource source : splits) { +if (counter == numberToCombine +|| !checkRangeAdjacency(previousSourceRanges, source.getRanges())) { + reducedSplits.add( + new BigtableSource( + config, + filter, + previousSourceRanges, + size)); + counter = 0; + size = 0; + previousSourceRanges = new ArrayList(); +} +previousSourceRanges.addAll(source.getRanges()); +previousSourceRanges = mergeRanges(previousSourceRanges); +size += source.getEstimatedSizeBytes(options); +counter++; + } + if (size > 0) { +reducedSplits.add( +new BigtableSource( +config, +filter, +previousSourceRanges, +size)); + } + return reducedSplits.build(); +} + +/** Helper to validate range Adjacency. + * Ranges are considered adjacent if [1..100][100..200][200..300] + **/ +private static boolean checkRangeAdjacency(List ranges, +List otherRanges) { + checkArgument(ranges != null || otherRanges != null, "Both ranges cannot be null."); + ImmutableList.Builder mergedRanges = ImmutableList.builder(); + if (ranges != null) { +mergedRanges.addAll(ranges); + } + if (otherRanges != null) { +mergedRanges.addAll(otherRanges); + } + return checkRangeAdjacency(mergedRanges.build()); +} + +/** Helper to validate range Adjacency. + * Ranges are considered adjacent if [1..100][100..200][200..300] + **/ +private static boolean checkRangeAdjacency(List ranges) { + int index = 0; + if (ranges.size() < 2) { +return true; + } + ByteKey lastEndKey = ranges.get(index++).getEndKey(); + while (index < ranges.size()) { +ByteKeyRange currentKeyRange = ranges.get(index++); +if (!lastEndKey.equals(currentKeyRange.getStartKey())) { + return false; +
[jira] [Work logged] (BEAM-3246) BigtableIO should merge splits if they exceed 15K
[ https://issues.apache.org/jira/browse/BEAM-3246?focusedWorklogId=101241=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-101241 ] ASF GitHub Bot logged work on BEAM-3246: Author: ASF GitHub Bot Created on: 11/May/18 18:07 Start Date: 11/May/18 18:07 Worklog Time Spent: 10m Work Description: chamikaramj commented on issue #4517: [BEAM-3246] Bigtable: Merge splits if they exceed 15K URL: https://github.com/apache/beam/pull/4517#issuecomment-388441826 https://builds.apache.org/job/beam_PreCommit_Java_GradleBuild/5255/ Failures is unrelated. Merging. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking --- Worklog Id: (was: 101241) Time Spent: 6h (was: 5h 50m) > BigtableIO should merge splits if they exceed 15K > - > > Key: BEAM-3246 > URL: https://issues.apache.org/jira/browse/BEAM-3246 > Project: Beam > Issue Type: Bug > Components: io-java-gcp >Reporter: Solomon Duskis >Assignee: Solomon Duskis >Priority: Major > Time Spent: 6h > Remaining Estimate: 0h > > A customer hit a problem with a large number of splits. CloudBitableIO fixes > that here > https://github.com/GoogleCloudPlatform/cloud-bigtable-client/blob/master/bigtable-dataflow-parent/bigtable-hbase-beam/src/main/java/com/google/cloud/bigtable/beam/CloudBigtableIO.java#L241 > BigtableIO should have similar logic. -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Work logged] (BEAM-3246) BigtableIO should merge splits if they exceed 15K
[ https://issues.apache.org/jira/browse/BEAM-3246?focusedWorklogId=101184=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-101184 ] ASF GitHub Bot logged work on BEAM-3246: Author: ASF GitHub Bot Created on: 11/May/18 16:38 Start Date: 11/May/18 16:38 Worklog Time Spent: 10m Work Description: chamikaramj commented on issue #4517: [BEAM-3246] Bigtable: Merge splits if they exceed 15K URL: https://github.com/apache/beam/pull/4517#issuecomment-388417813 Just waiting for tests to pass This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking --- Worklog Id: (was: 101184) Time Spent: 5h 50m (was: 5h 40m) > BigtableIO should merge splits if they exceed 15K > - > > Key: BEAM-3246 > URL: https://issues.apache.org/jira/browse/BEAM-3246 > Project: Beam > Issue Type: Bug > Components: io-java-gcp >Reporter: Solomon Duskis >Assignee: Solomon Duskis >Priority: Major > Time Spent: 5h 50m > Remaining Estimate: 0h > > A customer hit a problem with a large number of splits. CloudBitableIO fixes > that here > https://github.com/GoogleCloudPlatform/cloud-bigtable-client/blob/master/bigtable-dataflow-parent/bigtable-hbase-beam/src/main/java/com/google/cloud/bigtable/beam/CloudBigtableIO.java#L241 > BigtableIO should have similar logic. -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Work logged] (BEAM-3246) BigtableIO should merge splits if they exceed 15K
[ https://issues.apache.org/jira/browse/BEAM-3246?focusedWorklogId=101183=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-101183 ] ASF GitHub Bot logged work on BEAM-3246: Author: ASF GitHub Bot Created on: 11/May/18 16:38 Start Date: 11/May/18 16:38 Worklog Time Spent: 10m Work Description: chamikaramj commented on issue #4517: [BEAM-3246] Bigtable: Merge splits if they exceed 15K URL: https://github.com/apache/beam/pull/4517#issuecomment-388417682 Retest this please This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking --- Worklog Id: (was: 101183) Time Spent: 5h 40m (was: 5.5h) > BigtableIO should merge splits if they exceed 15K > - > > Key: BEAM-3246 > URL: https://issues.apache.org/jira/browse/BEAM-3246 > Project: Beam > Issue Type: Bug > Components: io-java-gcp >Reporter: Solomon Duskis >Assignee: Solomon Duskis >Priority: Major > Time Spent: 5h 40m > Remaining Estimate: 0h > > A customer hit a problem with a large number of splits. CloudBitableIO fixes > that here > https://github.com/GoogleCloudPlatform/cloud-bigtable-client/blob/master/bigtable-dataflow-parent/bigtable-hbase-beam/src/main/java/com/google/cloud/bigtable/beam/CloudBigtableIO.java#L241 > BigtableIO should have similar logic. -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Work logged] (BEAM-3246) BigtableIO should merge splits if they exceed 15K
[ https://issues.apache.org/jira/browse/BEAM-3246?focusedWorklogId=10=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-10 ] ASF GitHub Bot logged work on BEAM-3246: Author: ASF GitHub Bot Created on: 11/May/18 13:02 Start Date: 11/May/18 13:02 Worklog Time Spent: 10m Work Description: sduskis commented on issue #4517: [BEAM-3246] Bigtable: Merge splits if they exceed 15K URL: https://github.com/apache/beam/pull/4517#issuecomment-388356853 @chamikaramj, what can we do to move this forward? This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking --- Worklog Id: (was: 10) Time Spent: 5.5h (was: 5h 20m) > BigtableIO should merge splits if they exceed 15K > - > > Key: BEAM-3246 > URL: https://issues.apache.org/jira/browse/BEAM-3246 > Project: Beam > Issue Type: Bug > Components: io-java-gcp >Reporter: Solomon Duskis >Assignee: Solomon Duskis >Priority: Major > Time Spent: 5.5h > Remaining Estimate: 0h > > A customer hit a problem with a large number of splits. CloudBitableIO fixes > that here > https://github.com/GoogleCloudPlatform/cloud-bigtable-client/blob/master/bigtable-dataflow-parent/bigtable-hbase-beam/src/main/java/com/google/cloud/bigtable/beam/CloudBigtableIO.java#L241 > BigtableIO should have similar logic. -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Work logged] (BEAM-3246) BigtableIO should merge splits if they exceed 15K
[ https://issues.apache.org/jira/browse/BEAM-3246?focusedWorklogId=101110=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-101110 ] ASF GitHub Bot logged work on BEAM-3246: Author: ASF GitHub Bot Created on: 11/May/18 13:01 Start Date: 11/May/18 13:01 Worklog Time Spent: 10m Work Description: sduskis commented on issue #4517: [BEAM-3246] Bigtable: Merge splits if they exceed 15K URL: https://github.com/apache/beam/pull/4517#issuecomment-388356612 Run Java Precommit This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking --- Worklog Id: (was: 101110) Time Spent: 5h 20m (was: 5h 10m) > BigtableIO should merge splits if they exceed 15K > - > > Key: BEAM-3246 > URL: https://issues.apache.org/jira/browse/BEAM-3246 > Project: Beam > Issue Type: Bug > Components: io-java-gcp >Reporter: Solomon Duskis >Assignee: Solomon Duskis >Priority: Major > Time Spent: 5h 20m > Remaining Estimate: 0h > > A customer hit a problem with a large number of splits. CloudBitableIO fixes > that here > https://github.com/GoogleCloudPlatform/cloud-bigtable-client/blob/master/bigtable-dataflow-parent/bigtable-hbase-beam/src/main/java/com/google/cloud/bigtable/beam/CloudBigtableIO.java#L241 > BigtableIO should have similar logic. -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Work logged] (BEAM-3246) BigtableIO should merge splits if they exceed 15K
[ https://issues.apache.org/jira/browse/BEAM-3246?focusedWorklogId=97529=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-97529 ] ASF GitHub Bot logged work on BEAM-3246: Author: ASF GitHub Bot Created on: 02/May/18 15:12 Start Date: 02/May/18 15:12 Worklog Time Spent: 10m Work Description: chamikaramj commented on issue #4517: [BEAM-3246] Bigtable: Merge splits if they exceed 15K URL: https://github.com/apache/beam/pull/4517#issuecomment-386012815 Run Java Precommit This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking --- Worklog Id: (was: 97529) Time Spent: 5h 10m (was: 5h) > BigtableIO should merge splits if they exceed 15K > - > > Key: BEAM-3246 > URL: https://issues.apache.org/jira/browse/BEAM-3246 > Project: Beam > Issue Type: Bug > Components: io-java-gcp >Reporter: Solomon Duskis >Assignee: Solomon Duskis >Priority: Major > Time Spent: 5h 10m > Remaining Estimate: 0h > > A customer hit a problem with a large number of splits. CloudBitableIO fixes > that here > https://github.com/GoogleCloudPlatform/cloud-bigtable-client/blob/master/bigtable-dataflow-parent/bigtable-hbase-beam/src/main/java/com/google/cloud/bigtable/beam/CloudBigtableIO.java#L241 > BigtableIO should have similar logic. -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Work logged] (BEAM-3246) BigtableIO should merge splits if they exceed 15K
[ https://issues.apache.org/jira/browse/BEAM-3246?focusedWorklogId=96549=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-96549 ] ASF GitHub Bot logged work on BEAM-3246: Author: ASF GitHub Bot Created on: 30/Apr/18 05:36 Start Date: 30/Apr/18 05:36 Worklog Time Spent: 10m Work Description: chamikaramj commented on issue #4517: [BEAM-3246] Bigtable: Merge splits if they exceed 15K URL: https://github.com/apache/beam/pull/4517#issuecomment-385316670 Retest this please This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking --- Worklog Id: (was: 96549) Time Spent: 5h (was: 4h 50m) > BigtableIO should merge splits if they exceed 15K > - > > Key: BEAM-3246 > URL: https://issues.apache.org/jira/browse/BEAM-3246 > Project: Beam > Issue Type: Bug > Components: io-java-gcp >Reporter: Solomon Duskis >Assignee: Solomon Duskis >Priority: Major > Time Spent: 5h > Remaining Estimate: 0h > > A customer hit a problem with a large number of splits. CloudBitableIO fixes > that here > https://github.com/GoogleCloudPlatform/cloud-bigtable-client/blob/master/bigtable-dataflow-parent/bigtable-hbase-beam/src/main/java/com/google/cloud/bigtable/beam/CloudBigtableIO.java#L241 > BigtableIO should have similar logic. -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Work logged] (BEAM-3246) BigtableIO should merge splits if they exceed 15K
[ https://issues.apache.org/jira/browse/BEAM-3246?focusedWorklogId=81673=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-81673 ] ASF GitHub Bot logged work on BEAM-3246: Author: ASF GitHub Bot Created on: 18/Mar/18 18:03 Start Date: 18/Mar/18 18:03 Worklog Time Spent: 10m Work Description: sduskis commented on issue #4517: [BEAM-3246] Bigtable: Merge splits if they exceed 15K URL: https://github.com/apache/beam/pull/4517#issuecomment-374024571 @chamikaramj I think that we're ready for a final review. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking --- Worklog Id: (was: 81673) Time Spent: 4h 50m (was: 4h 40m) > BigtableIO should merge splits if they exceed 15K > - > > Key: BEAM-3246 > URL: https://issues.apache.org/jira/browse/BEAM-3246 > Project: Beam > Issue Type: Bug > Components: io-java-gcp >Reporter: Solomon Duskis >Assignee: Solomon Duskis >Priority: Major > Time Spent: 4h 50m > Remaining Estimate: 0h > > A customer hit a problem with a large number of splits. CloudBitableIO fixes > that here > https://github.com/GoogleCloudPlatform/cloud-bigtable-client/blob/master/bigtable-dataflow-parent/bigtable-hbase-beam/src/main/java/com/google/cloud/bigtable/beam/CloudBigtableIO.java#L241 > BigtableIO should have similar logic. -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Work logged] (BEAM-3246) BigtableIO should merge splits if they exceed 15K
[ https://issues.apache.org/jira/browse/BEAM-3246?focusedWorklogId=81669=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-81669 ] ASF GitHub Bot logged work on BEAM-3246: Author: ASF GitHub Bot Created on: 18/Mar/18 17:18 Start Date: 18/Mar/18 17:18 Worklog Time Spent: 10m Work Description: arkash commented on a change in pull request #4517: [BEAM-3246] Bigtable: Merge splits if they exceed 15K URL: https://github.com/apache/beam/pull/4517#discussion_r175296227 ## File path: sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableIOTest.java ## @@ -595,6 +596,180 @@ public void testReadingWithSplits() throws Exception { assertSourcesEqualReferenceSource(source, splits, null /* options */); } + private void assertAllSourcesHaveSingleAdjacentRanges(List sources) { +if (sources.size() > 0) { + assertThat(sources.get(0).getRanges(), hasSize(1)); + for (int i = 1; i < sources.size(); i++) { +assertThat(sources.get(i).getRanges(), hasSize(1)); +ByteKey lastEndKey = sources.get(i - 1).getRanges().get(0).getEndKey(); +ByteKey currentStartKey = sources.get(i).getRanges().get(0).getStartKey(); +assertEquals(lastEndKey, currentStartKey); + } +} + } + + private void assertAllSourcesHaveSingleRanges(List sources) { +for (BigtableSource source : sources) { + assertThat(source.getRanges(), hasSize(1)); +} + } + + private ByteKey createByteKey(int key) { +return ByteKey.copyFrom(String.format("key%09d", key).getBytes()); + } + + /** Tests reduce splits with few non adjacent ranges. */ + @Test + public void testReduceSplitsWithSomeNonAdjacentRanges() throws Exception { +final String table = "TEST-MANY-ROWS-SPLITS-TABLE"; +final int numRows = 10; +final int numSamples = 10; +final long bytesPerRow = 100L; +final int maxSplit = 3; + +// Set up test table data and sample row keys for size estimation and splitting. +makeTableData(table, numRows); Review comment: Below method internally called by split in BigtableSource private List splitBasedOnSamples( long desiredBundleSizeBytes, List sampleRowKeys) { // There are no regions, or no samples available. Just scan the entire range. if (sampleRowKeys.isEmpty()) { LOG.info("Not splitting source {} because no sample row keys are available.", this); return Collections.singletonList(this); } LOG.info( "About to split into bundles of size {} with sampleRowKeys length {} first element {}", desiredBundleSizeBytes, sampleRowKeys.size(), sampleRowKeys.get(0)); ImmutableList.Builder splits = ImmutableList.builder(); for (ByteKeyRange range : ranges) { splits.addAll(splitRangeBasedOnSamples(desiredBundleSizeBytes, sampleRowKeys, range)); } return splits.build(); } This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking --- Worklog Id: (was: 81669) Time Spent: 4h 40m (was: 4.5h) > BigtableIO should merge splits if they exceed 15K > - > > Key: BEAM-3246 > URL: https://issues.apache.org/jira/browse/BEAM-3246 > Project: Beam > Issue Type: Bug > Components: io-java-gcp >Reporter: Solomon Duskis >Assignee: Solomon Duskis >Priority: Major > Time Spent: 4h 40m > Remaining Estimate: 0h > > A customer hit a problem with a large number of splits. CloudBitableIO fixes > that here > https://github.com/GoogleCloudPlatform/cloud-bigtable-client/blob/master/bigtable-dataflow-parent/bigtable-hbase-beam/src/main/java/com/google/cloud/bigtable/beam/CloudBigtableIO.java#L241 > BigtableIO should have similar logic. -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Work logged] (BEAM-3246) BigtableIO should merge splits if they exceed 15K
[ https://issues.apache.org/jira/browse/BEAM-3246?focusedWorklogId=81668=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-81668 ] ASF GitHub Bot logged work on BEAM-3246: Author: ASF GitHub Bot Created on: 18/Mar/18 17:11 Start Date: 18/Mar/18 17:11 Worklog Time Spent: 10m Work Description: arkash commented on a change in pull request #4517: [BEAM-3246] Bigtable: Merge splits if they exceed 15K URL: https://github.com/apache/beam/pull/4517#discussion_r175296060 ## File path: sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableIOTest.java ## @@ -595,6 +596,180 @@ public void testReadingWithSplits() throws Exception { assertSourcesEqualReferenceSource(source, splits, null /* options */); } + private void assertAllSourcesHaveSingleAdjacentRanges(List sources) { +if (sources.size() > 0) { + assertThat(sources.get(0).getRanges(), hasSize(1)); + for (int i = 1; i < sources.size(); i++) { +assertThat(sources.get(i).getRanges(), hasSize(1)); +ByteKey lastEndKey = sources.get(i - 1).getRanges().get(0).getEndKey(); +ByteKey currentStartKey = sources.get(i).getRanges().get(0).getStartKey(); +assertEquals(lastEndKey, currentStartKey); + } +} + } + + private void assertAllSourcesHaveSingleRanges(List sources) { +for (BigtableSource source : sources) { + assertThat(source.getRanges(), hasSize(1)); +} + } + + private ByteKey createByteKey(int key) { +return ByteKey.copyFrom(String.format("key%09d", key).getBytes()); + } + + /** Tests reduce splits with few non adjacent ranges. */ + @Test + public void testReduceSplitsWithSomeNonAdjacentRanges() throws Exception { +final String table = "TEST-MANY-ROWS-SPLITS-TABLE"; +final int numRows = 10; +final int numSamples = 10; +final long bytesPerRow = 100L; +final int maxSplit = 3; + +// Set up test table data and sample row keys for size estimation and splitting. +makeTableData(table, numRows); Review comment: `source.split(numRows * bytesPerRow / numSamples, null /* options */);` uses samepleRowKeys generated by makeTableData This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking --- Worklog Id: (was: 81668) Time Spent: 4.5h (was: 4h 20m) > BigtableIO should merge splits if they exceed 15K > - > > Key: BEAM-3246 > URL: https://issues.apache.org/jira/browse/BEAM-3246 > Project: Beam > Issue Type: Bug > Components: io-java-gcp >Reporter: Solomon Duskis >Assignee: Solomon Duskis >Priority: Major > Time Spent: 4.5h > Remaining Estimate: 0h > > A customer hit a problem with a large number of splits. CloudBitableIO fixes > that here > https://github.com/GoogleCloudPlatform/cloud-bigtable-client/blob/master/bigtable-dataflow-parent/bigtable-hbase-beam/src/main/java/com/google/cloud/bigtable/beam/CloudBigtableIO.java#L241 > BigtableIO should have similar logic. -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Work logged] (BEAM-3246) BigtableIO should merge splits if they exceed 15K
[ https://issues.apache.org/jira/browse/BEAM-3246?focusedWorklogId=81656=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-81656 ] ASF GitHub Bot logged work on BEAM-3246: Author: ASF GitHub Bot Created on: 18/Mar/18 15:24 Start Date: 18/Mar/18 15:24 Worklog Time Spent: 10m Work Description: sduskis commented on a change in pull request #4517: [BEAM-3246] Bigtable: Merge splits if they exceed 15K URL: https://github.com/apache/beam/pull/4517#discussion_r175292694 ## File path: sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableIOTest.java ## @@ -595,6 +596,180 @@ public void testReadingWithSplits() throws Exception { assertSourcesEqualReferenceSource(source, splits, null /* options */); } + private void assertAllSourcesHaveSingleAdjacentRanges(List sources) { +if (sources.size() > 0) { + assertThat(sources.get(0).getRanges(), hasSize(1)); + for (int i = 1; i < sources.size(); i++) { +assertThat(sources.get(i).getRanges(), hasSize(1)); +ByteKey lastEndKey = sources.get(i - 1).getRanges().get(0).getEndKey(); +ByteKey currentStartKey = sources.get(i).getRanges().get(0).getStartKey(); +assertEquals(lastEndKey, currentStartKey); + } +} + } + + private void assertAllSourcesHaveSingleRanges(List sources) { +for (BigtableSource source : sources) { + assertThat(source.getRanges(), hasSize(1)); +} + } + + private ByteKey createByteKey(int key) { +return ByteKey.copyFrom(String.format("key%09d", key).getBytes()); + } + + /** Tests reduce splits with few non adjacent ranges. */ + @Test + public void testReduceSplitsWithSomeNonAdjacentRanges() throws Exception { +final String table = "TEST-MANY-ROWS-SPLITS-TABLE"; +final int numRows = 10; +final int numSamples = 10; +final long bytesPerRow = 100L; +final int maxSplit = 3; + +// Set up test table data and sample row keys for size estimation and splitting. +makeTableData(table, numRows); Review comment: I don't see any uses of the following lines: ``` makeTableData(table, numRows); service.setupSampleRowKeys(table, numSamples, bytesPerRow); ``` Please remove the lines if they are no longer used, and any unused variables. If there is a usage, can you please explain where it is? This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking --- Worklog Id: (was: 81656) Time Spent: 4h 20m (was: 4h 10m) > BigtableIO should merge splits if they exceed 15K > - > > Key: BEAM-3246 > URL: https://issues.apache.org/jira/browse/BEAM-3246 > Project: Beam > Issue Type: Bug > Components: io-java-gcp >Reporter: Solomon Duskis >Assignee: Solomon Duskis >Priority: Major > Time Spent: 4h 20m > Remaining Estimate: 0h > > A customer hit a problem with a large number of splits. CloudBitableIO fixes > that here > https://github.com/GoogleCloudPlatform/cloud-bigtable-client/blob/master/bigtable-dataflow-parent/bigtable-hbase-beam/src/main/java/com/google/cloud/bigtable/beam/CloudBigtableIO.java#L241 > BigtableIO should have similar logic. -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Work logged] (BEAM-3246) BigtableIO should merge splits if they exceed 15K
[ https://issues.apache.org/jira/browse/BEAM-3246?focusedWorklogId=81652=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-81652 ] ASF GitHub Bot logged work on BEAM-3246: Author: ASF GitHub Bot Created on: 18/Mar/18 14:22 Start Date: 18/Mar/18 14:22 Worklog Time Spent: 10m Work Description: arkash commented on issue #4517: [BEAM-3246] Bigtable: Merge splits if they exceed 15K URL: https://github.com/apache/beam/pull/4517#issuecomment-374003075 @sduskis please review remove the table ranges. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking --- Worklog Id: (was: 81652) Time Spent: 4h 10m (was: 4h) > BigtableIO should merge splits if they exceed 15K > - > > Key: BEAM-3246 > URL: https://issues.apache.org/jira/browse/BEAM-3246 > Project: Beam > Issue Type: Bug > Components: io-java-gcp >Reporter: Solomon Duskis >Assignee: Solomon Duskis >Priority: Major > Time Spent: 4h 10m > Remaining Estimate: 0h > > A customer hit a problem with a large number of splits. CloudBitableIO fixes > that here > https://github.com/GoogleCloudPlatform/cloud-bigtable-client/blob/master/bigtable-dataflow-parent/bigtable-hbase-beam/src/main/java/com/google/cloud/bigtable/beam/CloudBigtableIO.java#L241 > BigtableIO should have similar logic. -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Work logged] (BEAM-3246) BigtableIO should merge splits if they exceed 15K
[ https://issues.apache.org/jira/browse/BEAM-3246?focusedWorklogId=81651=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-81651 ] ASF GitHub Bot logged work on BEAM-3246: Author: ASF GitHub Bot Created on: 18/Mar/18 12:33 Start Date: 18/Mar/18 12:33 Worklog Time Spent: 10m Work Description: sduskis commented on a change in pull request #4517: [BEAM-3246] Bigtable: Merge splits if they exceed 15K URL: https://github.com/apache/beam/pull/4517#discussion_r175287451 ## File path: sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableIOTest.java ## @@ -595,6 +596,183 @@ public void testReadingWithSplits() throws Exception { assertSourcesEqualReferenceSource(source, splits, null /* options */); } + private void assertAllSourcesHaveSingleAdjacentRanges(List sources) { +if (sources.size() > 0) { + assertThat(sources.get(0).getRanges(), hasSize(1)); + for (int i = 1; i < sources.size(); i++) { +assertThat(sources.get(i).getRanges(), hasSize(1)); +ByteKey lastEndKey = sources.get(i - 1).getRanges().get(0).getEndKey(); +ByteKey currentStartKey = sources.get(i).getRanges().get(0).getStartKey(); +assertEquals(lastEndKey, currentStartKey); + } +} + } + + private void assertAllSourcesHaveSingleRanges(List sources) { +for (BigtableSource source : sources) { + assertThat(source.getRanges(), hasSize(1)); +} + } + + private ByteKey createByteKey(int key) { +return ByteKey.copyFrom(String.format("key%09d", key).getBytes()); + } + + /** Tests reduce splits with few non adjacent ranges. */ + @Test + public void testReduceSplitsWithSomeNonAdjacentRanges() throws Exception { +final String table = "TEST-MANY-ROWS-SPLITS-TABLE"; +final int numRows = 10; +final int numSamples = 10; +final long bytesPerRow = 100L; +final int maxSplit = 3; + +// Set up test table data and sample row keys for size estimation and splitting. +makeTableData(table, numRows); +service.setupSampleRowKeys(table, numSamples, bytesPerRow); + +ByteKeyRange tableRange = service.getTableRange(table); +//Construct few non contiguous key ranges [..1][1..2][3..4][4..5][6..7][8..] +List keyRanges = Arrays.asList( +tableRange.withEndKey(createByteKey(1)), Review comment: Please use `ByteKeyRange.of(start, end)` instead of tableRage.withEndKey(). there's no need to involve `tableRange` here. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking --- Worklog Id: (was: 81651) Time Spent: 4h (was: 3h 50m) > BigtableIO should merge splits if they exceed 15K > - > > Key: BEAM-3246 > URL: https://issues.apache.org/jira/browse/BEAM-3246 > Project: Beam > Issue Type: Bug > Components: io-java-gcp >Reporter: Solomon Duskis >Assignee: Solomon Duskis >Priority: Major > Time Spent: 4h > Remaining Estimate: 0h > > A customer hit a problem with a large number of splits. CloudBitableIO fixes > that here > https://github.com/GoogleCloudPlatform/cloud-bigtable-client/blob/master/bigtable-dataflow-parent/bigtable-hbase-beam/src/main/java/com/google/cloud/bigtable/beam/CloudBigtableIO.java#L241 > BigtableIO should have similar logic. -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Work logged] (BEAM-3246) BigtableIO should merge splits if they exceed 15K
[ https://issues.apache.org/jira/browse/BEAM-3246?focusedWorklogId=81649=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-81649 ] ASF GitHub Bot logged work on BEAM-3246: Author: ASF GitHub Bot Created on: 18/Mar/18 12:22 Start Date: 18/Mar/18 12:22 Worklog Time Spent: 10m Work Description: arkash commented on a change in pull request #4517: [BEAM-3246] Bigtable: Merge splits if they exceed 15K URL: https://github.com/apache/beam/pull/4517#discussion_r175287056 ## File path: sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableIOTest.java ## @@ -595,6 +596,183 @@ public void testReadingWithSplits() throws Exception { assertSourcesEqualReferenceSource(source, splits, null /* options */); } + private void assertAllSourcesHaveSingleAdjacentRanges(List sources) { +if (sources.size() > 0) { + assertThat(sources.get(0).getRanges(), hasSize(1)); + for (int i = 1; i < sources.size(); i++) { +assertThat(sources.get(i).getRanges(), hasSize(1)); +ByteKey lastEndKey = sources.get(i - 1).getRanges().get(0).getEndKey(); +ByteKey currentStartKey = sources.get(i).getRanges().get(0).getStartKey(); +assertEquals(lastEndKey, currentStartKey); + } +} + } + + private void assertAllSourcesHaveSingleRanges(List sources) { +for (BigtableSource source : sources) { + assertThat(source.getRanges(), hasSize(1)); +} + } + + private ByteKey createByteKey(int key) { +return ByteKey.copyFrom(String.format("key%09d", key).getBytes()); + } + + /** Tests reduce splits with few non adjacent ranges. */ + @Test + public void testReduceSplitsWithSomeNonAdjacentRanges() throws Exception { +final String table = "TEST-MANY-ROWS-SPLITS-TABLE"; +final int numRows = 10; +final int numSamples = 10; +final long bytesPerRow = 100L; +final int maxSplit = 3; + +// Set up test table data and sample row keys for size estimation and splitting. +makeTableData(table, numRows); +service.setupSampleRowKeys(table, numSamples, bytesPerRow); + +ByteKeyRange tableRange = service.getTableRange(table); Review comment: We do use the data created by makeTable to setup sampleRowKeys of the service object which is further used to split the source. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking --- Worklog Id: (was: 81649) Time Spent: 3h 50m (was: 3h 40m) > BigtableIO should merge splits if they exceed 15K > - > > Key: BEAM-3246 > URL: https://issues.apache.org/jira/browse/BEAM-3246 > Project: Beam > Issue Type: Bug > Components: io-java-gcp >Reporter: Solomon Duskis >Assignee: Solomon Duskis >Priority: Major > Time Spent: 3h 50m > Remaining Estimate: 0h > > A customer hit a problem with a large number of splits. CloudBitableIO fixes > that here > https://github.com/GoogleCloudPlatform/cloud-bigtable-client/blob/master/bigtable-dataflow-parent/bigtable-hbase-beam/src/main/java/com/google/cloud/bigtable/beam/CloudBigtableIO.java#L241 > BigtableIO should have similar logic. -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Work logged] (BEAM-3246) BigtableIO should merge splits if they exceed 15K
[ https://issues.apache.org/jira/browse/BEAM-3246?focusedWorklogId=81180=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-81180 ] ASF GitHub Bot logged work on BEAM-3246: Author: ASF GitHub Bot Created on: 16/Mar/18 14:11 Start Date: 16/Mar/18 14:11 Worklog Time Spent: 10m Work Description: sduskis commented on a change in pull request #4517: [BEAM-3246] Bigtable: Merge splits if they exceed 15K URL: https://github.com/apache/beam/pull/4517#discussion_r175101148 ## File path: sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableIOTest.java ## @@ -595,6 +596,183 @@ public void testReadingWithSplits() throws Exception { assertSourcesEqualReferenceSource(source, splits, null /* options */); } + private void assertAllSourcesHaveSingleAdjacentRanges(List sources) { +if (sources.size() > 0) { + assertThat(sources.get(0).getRanges(), hasSize(1)); + for (int i = 1; i < sources.size(); i++) { +assertThat(sources.get(i).getRanges(), hasSize(1)); +ByteKey lastEndKey = sources.get(i - 1).getRanges().get(0).getEndKey(); +ByteKey currentStartKey = sources.get(i).getRanges().get(0).getStartKey(); +assertEquals(lastEndKey, currentStartKey); + } +} + } + + private void assertAllSourcesHaveSingleRanges(List sources) { +for (BigtableSource source : sources) { + assertThat(source.getRanges(), hasSize(1)); +} + } + + private ByteKey createByteKey(int key) { +return ByteKey.copyFrom(String.format("key%09d", key).getBytes()); + } + + /** Tests reduce splits with few non adjacent ranges. */ + @Test + public void testReduceSplitsWithSomeNonAdjacentRanges() throws Exception { +final String table = "TEST-MANY-ROWS-SPLITS-TABLE"; +final int numRows = 10; +final int numSamples = 10; +final long bytesPerRow = 100L; +final int maxSplit = 3; + +// Set up test table data and sample row keys for size estimation and splitting. +makeTableData(table, numRows); +service.setupSampleRowKeys(table, numSamples, bytesPerRow); + +ByteKeyRange tableRange = service.getTableRange(table); Review comment: Please remove the `tableRange` and `service` variables and the call to `makeTableData ` as they are not used in this test. Please use `ByteKeyRange.of(start, end)` instead of `tableRange.with` methods. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking --- Worklog Id: (was: 81180) Time Spent: 3h 40m (was: 3.5h) > BigtableIO should merge splits if they exceed 15K > - > > Key: BEAM-3246 > URL: https://issues.apache.org/jira/browse/BEAM-3246 > Project: Beam > Issue Type: Bug > Components: io-java-gcp >Reporter: Solomon Duskis >Assignee: Solomon Duskis >Priority: Major > Time Spent: 3h 40m > Remaining Estimate: 0h > > A customer hit a problem with a large number of splits. CloudBitableIO fixes > that here > https://github.com/GoogleCloudPlatform/cloud-bigtable-client/blob/master/bigtable-dataflow-parent/bigtable-hbase-beam/src/main/java/com/google/cloud/bigtable/beam/CloudBigtableIO.java#L241 > BigtableIO should have similar logic. -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Work logged] (BEAM-3246) BigtableIO should merge splits if they exceed 15K
[ https://issues.apache.org/jira/browse/BEAM-3246?focusedWorklogId=81179=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-81179 ] ASF GitHub Bot logged work on BEAM-3246: Author: ASF GitHub Bot Created on: 16/Mar/18 14:09 Start Date: 16/Mar/18 14:09 Worklog Time Spent: 10m Work Description: sduskis commented on a change in pull request #4517: [BEAM-3246] Bigtable: Merge splits if they exceed 15K URL: https://github.com/apache/beam/pull/4517#discussion_r173053528 ## File path: sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableIOTest.java ## @@ -595,6 +595,34 @@ public void testReadingWithSplits() throws Exception { assertSourcesEqualReferenceSource(source, splits, null /* options */); } + /** Tests Max Split Count. */ + @Test + public void testMaxSplits() throws Exception { +final String table = "TEST-MANY-ROWS-SPLITS-TABLE"; +final int numRows = 2; +final int numSamples = 16000; +final long bytesPerRow = 100L; +final int maxSplit = 15_360; + +// Set up test table data and sample row keys for size estimation and splitting. +makeTableData(table, numRows); +service.setupSampleRowKeys(table, numSamples, bytesPerRow); + Review comment: Please confirm that there are indeed more than maxSplit sample row keys. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking --- Worklog Id: (was: 81179) Time Spent: 3.5h (was: 3h 20m) > BigtableIO should merge splits if they exceed 15K > - > > Key: BEAM-3246 > URL: https://issues.apache.org/jira/browse/BEAM-3246 > Project: Beam > Issue Type: Bug > Components: io-java-gcp >Reporter: Solomon Duskis >Assignee: Solomon Duskis >Priority: Major > Time Spent: 3.5h > Remaining Estimate: 0h > > A customer hit a problem with a large number of splits. CloudBitableIO fixes > that here > https://github.com/GoogleCloudPlatform/cloud-bigtable-client/blob/master/bigtable-dataflow-parent/bigtable-hbase-beam/src/main/java/com/google/cloud/bigtable/beam/CloudBigtableIO.java#L241 > BigtableIO should have similar logic. -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Work logged] (BEAM-3246) BigtableIO should merge splits if they exceed 15K
[ https://issues.apache.org/jira/browse/BEAM-3246?focusedWorklogId=81035=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-81035 ] ASF GitHub Bot logged work on BEAM-3246: Author: ASF GitHub Bot Created on: 16/Mar/18 01:24 Start Date: 16/Mar/18 01:24 Worklog Time Spent: 10m Work Description: arkash commented on issue #4517: [BEAM-3246] Bigtable: Merge splits if they exceed 15K URL: https://github.com/apache/beam/pull/4517#issuecomment-373574146 @sduskis have completed discussed changes and added additional test cases please review. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking --- Worklog Id: (was: 81035) Time Spent: 3h 20m (was: 3h 10m) > BigtableIO should merge splits if they exceed 15K > - > > Key: BEAM-3246 > URL: https://issues.apache.org/jira/browse/BEAM-3246 > Project: Beam > Issue Type: Bug > Components: io-java-gcp >Reporter: Solomon Duskis >Assignee: Solomon Duskis >Priority: Major > Time Spent: 3h 20m > Remaining Estimate: 0h > > A customer hit a problem with a large number of splits. CloudBitableIO fixes > that here > https://github.com/GoogleCloudPlatform/cloud-bigtable-client/blob/master/bigtable-dataflow-parent/bigtable-hbase-beam/src/main/java/com/google/cloud/bigtable/beam/CloudBigtableIO.java#L241 > BigtableIO should have similar logic. -- This message was sent by Atlassian JIRA (v7.6.3#76005)