This is an automated email from the ASF dual-hosted git repository. leerho pushed a commit to branch methods_to_compute_partition_limits in repository https://gitbox.apache.org/repos/asf/datasketches-java.git
commit 20166c7f681a272bd9277d7eaa490829c50c5000 Author: Lee Rhodes <[email protected]> AuthorDate: Thu Apr 11 21:37:02 2024 -0700 temp commit --- .../apache/datasketches/kll/KllItemsSketch.java | 17 +++-- .../datasketches/partitions/Partitioner.java | 4 +- .../apache/datasketches/quantiles/ItemsSketch.java | 17 +++-- .../GenericPartitionBoundaries.java | 45 ++++++++++--- .../quantilescommon/GenericSortedView.java | 2 +- .../quantilescommon/ItemsSketchSortedView.java | 46 +++++++------ .../quantilescommon/PartitionBoundaries.java | 67 ------------------- .../quantilescommon/PartitioningFeature.java | 78 ++++++++++++++++++---- .../quantilescommon/QuantilesGenericAPI.java | 2 +- .../quantilescommon/SketchPartitionLimits.java | 63 +++++++++++++++++ .../datasketches/quantiles/ItemsSketchTest.java | 4 +- .../quantilescommon/CrossCheckQuantilesTest.java | 6 +- .../quantilescommon/PartitionBoundariesTest.java | 18 +++-- 13 files changed, 228 insertions(+), 141 deletions(-) diff --git a/src/main/java/org/apache/datasketches/kll/KllItemsSketch.java b/src/main/java/org/apache/datasketches/kll/KllItemsSketch.java index 9f5a5ae7..efcca934 100644 --- a/src/main/java/org/apache/datasketches/kll/KllItemsSketch.java +++ b/src/main/java/org/apache/datasketches/kll/KllItemsSketch.java @@ -156,11 +156,21 @@ public abstract class KllItemsSketch<T> extends KllSketch implements QuantilesGe } @Override - public GenericPartitionBoundaries<T> getPartitionBoundaries(final int numEquallySized, + public GenericPartitionBoundaries<T> getPartitionBoundariesFromNumParts( + final int numEquallySizedParts, final QuantileSearchCriteria searchCrit) { if (isEmpty()) { throw new IllegalArgumentException(EMPTY_MSG); } refreshSortedView(); - return itemsSV.getPartitionBoundaries(numEquallySized, searchCrit); + return itemsSV.getPartitionBoundariesFromNumParts(numEquallySizedParts, searchCrit); + } + + @Override + public GenericPartitionBoundaries<T> getPartitionBoundariesFromPartSize( + final long nominalPartSizeItems, + final QuantileSearchCriteria searchCrit) { + if (isEmpty()) { throw new IllegalArgumentException(EMPTY_MSG); } + refreshSortedView(); + return itemsSV.getPartitionBoundariesFromPartSize(nominalPartSizeItems, searchCrit); } @Override @@ -424,9 +434,8 @@ public abstract class KllItemsSketch<T> extends KllSketch implements QuantilesGe quantiles = (T[]) Array.newInstance(serDe.getClassOfT(), numQuantiles); cumWeights = new long[numQuantiles]; populateFromSketch(srcQuantiles, srcLevels, srcNumLevels, numQuantiles); - final double normRankErr = getNormalizedRankError(getK(), true); return new ItemsSketchSortedView( - quantiles, cumWeights, getN(), comparator, getMaxItem(), getMinItem(), normRankErr); + quantiles, cumWeights, getN(), comparator, getMaxItem(), getMinItem()); } private void populateFromSketch(final Object[] srcQuantiles, final int[] srcLevels, diff --git a/src/main/java/org/apache/datasketches/partitions/Partitioner.java b/src/main/java/org/apache/datasketches/partitions/Partitioner.java index be1247ca..66030fb2 100644 --- a/src/main/java/org/apache/datasketches/partitions/Partitioner.java +++ b/src/main/java/org/apache/datasketches/partitions/Partitioner.java @@ -117,7 +117,7 @@ public class Partitioner<T, S extends QuantilesGenericAPI<T> & PartitioningFeatu this.numLevels = (int)max(1, ceil(log(guessNumParts) / log(maxPartsPerSk))); final int partsPerSk = (int)round(pow(guessNumParts, 1.0 / numLevels)); this.partitionsPerSk = min(partsPerSk, maxPartsPerSk); - final GenericPartitionBoundaries<T> gpb = sk.getPartitionBoundaries(partitionsPerSk, criteria); + final GenericPartitionBoundaries<T> gpb = sk.getPartitionBoundariesFromNumParts(partitionsPerSk, criteria); final StackElement<T> se = new StackElement<>(gpb, 0, "1"); stack.push(se); partitionSearch(stack); @@ -144,7 +144,7 @@ public class Partitioner<T, S extends QuantilesGenericAPI<T> & PartitioningFeatu if (++se.part <= numParts) { final PartitionBoundsRow<T> row = new PartitionBoundsRow<>(se); final S sk = fillReq.getRange(row.lowerBound, row.upperBound, row.rule); - final GenericPartitionBoundaries<T> gpb2 = sk.getPartitionBoundaries(this.partitionsPerSk, criteria); + final GenericPartitionBoundaries<T> gpb2 = sk.getPartitionBoundariesFromNumParts(this.partitionsPerSk, criteria); final int level = stack.size() + 1; final String partId = se.levelPartId + "." + se.part + "," + level; final StackElement<T> se2 = new StackElement<>(gpb2, 0, partId); diff --git a/src/main/java/org/apache/datasketches/quantiles/ItemsSketch.java b/src/main/java/org/apache/datasketches/quantiles/ItemsSketch.java index 9361c6cd..3d2d3388 100644 --- a/src/main/java/org/apache/datasketches/quantiles/ItemsSketch.java +++ b/src/main/java/org/apache/datasketches/quantiles/ItemsSketch.java @@ -277,11 +277,21 @@ public final class ItemsSketch<T> implements QuantilesGenericAPI<T> { } @Override - public GenericPartitionBoundaries<T> getPartitionBoundaries(final int numEquallySized, + public GenericPartitionBoundaries<T> getPartitionBoundariesFromNumParts( + final int numEquallySizedParts, final QuantileSearchCriteria searchCrit) { if (isEmpty()) { throw new IllegalArgumentException(QuantilesAPI.EMPTY_MSG); } refreshSortedView(); - return classicQisSV.getPartitionBoundaries(numEquallySized, searchCrit); + return classicQisSV.getPartitionBoundariesFromNumParts(numEquallySizedParts, searchCrit); + } + + @Override + public GenericPartitionBoundaries<T> getPartitionBoundariesFromPartSize( + final long nominalPartSizeItems, + final QuantileSearchCriteria searchCrit) { + if (isEmpty()) { throw new IllegalArgumentException(QuantilesAPI.EMPTY_MSG); } + refreshSortedView(); + return classicQisSV.getPartitionBoundariesFromPartSize(nominalPartSizeItems, searchCrit); } @Override @@ -656,9 +666,8 @@ public final class ItemsSketch<T> implements QuantilesGenericAPI<T> { throw new SketchesStateException("Sorted View is misconfigured. TotalN does not match cumWeights."); } - final double normRankErr = getNormalizedRankError(sk.getK(), true); return new ItemsSketchSortedView<>( - svQuantiles, svCumWeights, sk.getN(), comparator, sk.getMaxItem(), sk.getMinItem(), normRankErr); + svQuantiles, svCumWeights, sk.getN(), comparator, sk.getMaxItem(), sk.getMinItem()); } diff --git a/src/main/java/org/apache/datasketches/quantilescommon/GenericPartitionBoundaries.java b/src/main/java/org/apache/datasketches/quantilescommon/GenericPartitionBoundaries.java index b21cde77..ee53e910 100644 --- a/src/main/java/org/apache/datasketches/quantilescommon/GenericPartitionBoundaries.java +++ b/src/main/java/org/apache/datasketches/quantilescommon/GenericPartitionBoundaries.java @@ -25,9 +25,10 @@ import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.INC import org.apache.datasketches.common.SketchesStateException; /** - * Implements PartitionBoundaries + * This defines the returned results of the getParitionBoundaries() function and + * includes the basic methods needed to construct actual partitions. */ -final public class GenericPartitionBoundaries<T> implements PartitionBoundaries { +public final class GenericPartitionBoundaries<T> { private long totalN; //totalN of source sketch private T[] boundaries; //quantiles at the boundaries private long[] natRanks; //natural ranks at the boundaries @@ -36,7 +37,7 @@ final public class GenericPartitionBoundaries<T> implements PartitionBoundaries private T minItem; //of the source sketch private QuantileSearchCriteria searchCrit; //of the source sketch query to getPartitionBoundaries. //computed - private long[] numDeltaItems; //num of items in each part + private long[] numDeltaItems; //num of items in each partition private int numPartitions; //num of partitions public GenericPartitionBoundaries( @@ -48,7 +49,7 @@ final public class GenericPartitionBoundaries<T> implements PartitionBoundaries final T minItem, final QuantileSearchCriteria searchCrit) { this.totalN = totalN; - this.boundaries = boundaries; //SpotBugs EI_EXPOSE_REP2 copying from sketch class to this "friend" class. + this.boundaries = boundaries; //SpotBugs EI_EXPOSE_REP2 OK: copying from sketch class to this "friend" class. this.natRanks = natRanks; // " this.normRanks = normRanks; // " this.maxItem = maxItem; @@ -56,7 +57,7 @@ final public class GenericPartitionBoundaries<T> implements PartitionBoundaries this.searchCrit = searchCrit; //check and compute final int len = boundaries.length; - if (len < 2) { throw new SketchesStateException("Source sketch is empty"); } + if (len < 2) { throw new SketchesStateException("Source sketch is empty"); } //class is final, this is ok numDeltaItems = new long[len]; numDeltaItems[0] = 0; // index 0 is always 0 for (int i = 1; i < len; i++) { @@ -67,7 +68,10 @@ final public class GenericPartitionBoundaries<T> implements PartitionBoundaries this.numPartitions = len - 1; } - @Override + /** + * Gets the length of the input stream offered to the underlying sketch. + * @return the length of the input stream offered to the underlying sketch. + */ public long getN() { return totalN; } /** @@ -100,16 +104,32 @@ final public class GenericPartitionBoundaries<T> implements PartitionBoundaries */ public T[] getBoundaries() { return boundaries.clone(); } - @Override + /** + * Gets an ordered array of natural ranks of the associated array of partition boundaries utilizing + * a specified search criterion. Natural ranks are integral values on the interval [1, N] + * @return an array of natural ranks. + */ public long[] getNaturalRanks() { return natRanks.clone(); } - @Override + /** + * Gets an ordered array of normalized ranks of the associated array of partition boundaries utilizing + * a specified search criterion. Normalized ranks are double values on the interval [0.0, 1.0]. + * @return an array of normalized ranks. + */ public double[] getNormalizedRanks() { return normRanks.clone(); } - @Override + /** + * Gets the number of items to be included for each partition as an array. + * The count at index 0 is 0. The number of items included in the first partition, defined by the boundaries at + * index 0 and index 1, is at index 1 in this array, etc. + * @return the number of items to be included for each partition as an array. + */ public long[] getNumDeltaItems() { return numDeltaItems.clone(); } - @Override + /** + * Gets the number of partitions + * @return the number of partitions + */ public int getNumPartitions() { return numPartitions; } /** @@ -130,7 +150,10 @@ final public class GenericPartitionBoundaries<T> implements PartitionBoundaries */ public T getMinItem() { return minItem; } - @Override + /** + * Gets the search criteria specified for the source sketch + * @return The search criteria specified for the source sketch + */ public QuantileSearchCriteria getSearchCriteria() { return searchCrit; } } diff --git a/src/main/java/org/apache/datasketches/quantilescommon/GenericSortedView.java b/src/main/java/org/apache/datasketches/quantilescommon/GenericSortedView.java index 1c54395f..4f9da76a 100644 --- a/src/main/java/org/apache/datasketches/quantilescommon/GenericSortedView.java +++ b/src/main/java/org/apache/datasketches/quantilescommon/GenericSortedView.java @@ -30,7 +30,7 @@ import org.apache.datasketches.common.SketchesArgumentException; * @author Alexander Saydakov * @author Lee Rhodes */ -public interface GenericSortedView<T> extends PartitioningFeature<T>, SortedView { +public interface GenericSortedView<T> extends PartitioningFeature<T>, SketchPartitionLimits, SortedView { /** * Returns an approximation to the Cumulative Distribution Function (CDF) of the input stream diff --git a/src/main/java/org/apache/datasketches/quantilescommon/ItemsSketchSortedView.java b/src/main/java/org/apache/datasketches/quantilescommon/ItemsSketchSortedView.java index 5fbcf434..57735c6d 100644 --- a/src/main/java/org/apache/datasketches/quantilescommon/ItemsSketchSortedView.java +++ b/src/main/java/org/apache/datasketches/quantilescommon/ItemsSketchSortedView.java @@ -19,6 +19,7 @@ package org.apache.datasketches.quantilescommon; +import static java.lang.Math.min; import static org.apache.datasketches.quantilescommon.GenericInequalitySearch.find; import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.INCLUSIVE; import static org.apache.datasketches.quantilescommon.QuantilesAPI.EMPTY_MSG; @@ -38,7 +39,6 @@ import org.apache.datasketches.quantilescommon.GenericInequalitySearch.Inequalit * @author Lee Rhodes */ public class ItemsSketchSortedView<T> implements GenericSortedView<T> { - private static final double PARTITIONING_ERROR_FACTOR = 2.0; private final T[] quantiles; private final long[] cumWeights; //cumulative natural weights private final long totalN; @@ -46,7 +46,6 @@ public class ItemsSketchSortedView<T> implements GenericSortedView<T> { private final T maxItem; private final T minItem; private final Class<T> clazz; - private final double normRankErr;//assumes PMF type error /** * Construct from elements, also used in testing. @@ -56,7 +55,6 @@ public class ItemsSketchSortedView<T> implements GenericSortedView<T> { * @param comparator the Comparator for type T * @param maxItem of type T * @param minItem of type T - * @param normRankErr the normalized rank error of the originating sketch. */ @SuppressWarnings("unchecked") public ItemsSketchSortedView( @@ -65,8 +63,7 @@ public class ItemsSketchSortedView<T> implements GenericSortedView<T> { final long totalN, final Comparator<? super T> comparator, final T maxItem, - final T minItem, - final double normRankErr) { + final T minItem) { this.quantiles = quantiles; this.cumWeights = cumWeights; this.totalN = totalN; @@ -74,7 +71,6 @@ public class ItemsSketchSortedView<T> implements GenericSortedView<T> { this.maxItem = maxItem; this.minItem = minItem; this.clazz = (Class<T>)quantiles[0].getClass(); - this.normRankErr = normRankErr; } //end of constructors @@ -118,29 +114,35 @@ public class ItemsSketchSortedView<T> implements GenericSortedView<T> { } @Override - @SuppressWarnings("unchecked") - public GenericPartitionBoundaries<T> getPartitionBoundaries(final int numEquallySized, + public GenericPartitionBoundaries<T> getPartitionBoundariesFromPartSize( + final long nominalPartitionSize, final QuantileSearchCriteria searchCrit) { if (isEmpty()) { throw new SketchesArgumentException(QuantilesAPI.EMPTY_MSG); } - final long totalN = this.totalN; - final int maxParts = (int) (totalN / Math.ceil(normRankErr * PARTITIONING_ERROR_FACTOR) ); - final int svLen = cumWeights.length; - - if (numEquallySized > maxParts) { + final long partSizeItems = getMinPartitionSizeItems(); + if (nominalPartitionSize < partSizeItems) { throw new SketchesArgumentException(QuantilesAPI.UNSUPPORTED_MSG - + "The requested number of partitions is too large for the 'k' of this sketch " - + "if it exceeds the maximum number of partitions allowed by the error threshold for the 'k' of this sketch." - + "Requested Partitions: " + numEquallySized + " > " + maxParts); + + " The requested nominal partition size is too small for this sketch."); } - if (numEquallySized > svLen / 2.0) { + final long totalN = this.totalN; + final int numEquallySizedParts = (int) min(totalN / partSizeItems, getMaxPartitions()); + return getPartitionBoundariesFromNumParts(numEquallySizedParts); + } + + @Override + @SuppressWarnings("unchecked") + public GenericPartitionBoundaries<T> getPartitionBoundariesFromNumParts( + final int numEquallySizedParts, + final QuantileSearchCriteria searchCrit) { + if (isEmpty()) { throw new SketchesArgumentException(QuantilesAPI.EMPTY_MSG); } + final int maxParts = getMaxPartitions(); + if (numEquallySizedParts > maxParts) { throw new SketchesArgumentException(QuantilesAPI.UNSUPPORTED_MSG - + "The requested number of partitions is too large for the number of retained items " - + "if it exceeds maximum number of retained items divided by 2." - + "Requested Partitions: " + numEquallySized + " > " - + "Retained Items / 2: " + (svLen / 2)); + + " The requested number of partitions is too large for this sketch."); } + final long totalN = this.totalN; + final int svLen = cumWeights.length; - final double[] searchNormRanks = evenlySpacedDoubles(0, 1.0, numEquallySized + 1); + final double[] searchNormRanks = evenlySpacedDoubles(0, 1.0, numEquallySizedParts + 1); final int partArrLen = searchNormRanks.length; final T[] partQuantiles = (T[]) Array.newInstance(clazz, partArrLen); final long[] partNatRanks = new long[partArrLen]; diff --git a/src/main/java/org/apache/datasketches/quantilescommon/PartitionBoundaries.java b/src/main/java/org/apache/datasketches/quantilescommon/PartitionBoundaries.java deleted file mode 100644 index e3c59d2c..00000000 --- a/src/main/java/org/apache/datasketches/quantilescommon/PartitionBoundaries.java +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.datasketches.quantilescommon; - -/** - * This defines a set of results computed from the getParitionBoundaries() function and - * encapsulates the basic methods needed to construct actual partitions based on generic items. - */ -public interface PartitionBoundaries { - - /** - * Gets the length of the input stream offered to the underlying sketch. - * @return the length of the input stream offered to the underlying sketch. - */ - long getN(); - - /** - * Gets an ordered array of natural ranks of the associated array of partition boundaries utilizing - * a specified search criterion. Natural ranks are integral values on the interval [1, N] - * @return an array of natural ranks. - */ - long[] getNaturalRanks(); - - /** - * Gets an ordered array of normalized ranks of the associated array of partition boundaries utilizing - * a specified search criterion. Normalized ranks are double values on the interval [0.0, 1.0]. - * @return an array of normalized ranks. - */ - double[] getNormalizedRanks(); - - /** - * Gets the number of items to be included for each partition as an array. - * The count at index 0 is 0. The number of items included in the first partition, defined by the boundaries at - * index 0 and index 1, is at index 1 in this array, etc. - * @return the number of items to be included for each partition as an array. - */ - long[] getNumDeltaItems(); - - /** - * Gets the number of partitions - * @return the number of partitions - */ - int getNumPartitions(); - - /** - * Gets the search criteria specified for the source sketch - * @return The search criteria specified for the source sketch - */ - QuantileSearchCriteria getSearchCriteria(); -} diff --git a/src/main/java/org/apache/datasketches/quantilescommon/PartitioningFeature.java b/src/main/java/org/apache/datasketches/quantilescommon/PartitioningFeature.java index 2c36bb10..3d35cfe9 100644 --- a/src/main/java/org/apache/datasketches/quantilescommon/PartitioningFeature.java +++ b/src/main/java/org/apache/datasketches/quantilescommon/PartitioningFeature.java @@ -33,13 +33,15 @@ public interface PartitioningFeature<T> { * refers to an approximately equal number of items per partition. * * <p>This method is equivalent to - * {@link #getPartitionBoundaries(int, QuantileSearchCriteria) getPartitionBoundaries(numEquallySized, INCLUSIVE)}. + * {@link #getPartitionBoundariesFromNumParts(int, QuantileSearchCriteria) + * getPartitionBoundariesFromNumParts(numEquallySizedParts, INCLUSIVE)}. * </p> * - * @param numEquallySized an integer that specifies the number of equally sized partitions between + * @param numEquallySizedParts an integer that specifies the number of equally sized partitions between * {@link GenericPartitionBoundaries#getMinItem() getMinItem()} and * {@link GenericPartitionBoundaries#getMaxItem() getMaxItem()}. - * This must be a positive integer greater than zero. + * This must be a positive integer less than + * {@link SketchPartitionLimits#getMaxPartitions() getMaxPartitions()} * <ul> * <li>A 1 will return: minItem, maxItem.</li> * <li>A 2 will return: minItem, median quantile, maxItem.</li> @@ -47,11 +49,12 @@ public interface PartitioningFeature<T> { * </ul> * * @return an instance of {@link GenericPartitionBoundaries GenericPartitionBoundaries}. - * @throws IllegalArgumentException if sketch is empty. - * @throws IllegalArgumentException if <i>numEquallySized</i> is less than 1. + * @throws SketchesArgumentException if sketch is empty. + * @throws SketchesArgumentException if <i>numEquallySized</i> is greater than + * {@link SketchPartitionLimits#getMaxPartitions() getMaxPartitions()} */ - default GenericPartitionBoundaries<T> getPartitionBoundaries(int numEquallySized) { - return getPartitionBoundaries(numEquallySized, INCLUSIVE); + default GenericPartitionBoundaries<T> getPartitionBoundariesFromNumParts(int numEquallySizedParts) { + return getPartitionBoundariesFromNumParts(numEquallySizedParts, INCLUSIVE); } /** @@ -60,10 +63,11 @@ public interface PartitioningFeature<T> { * sufficient information for the user to create the given number of equally sized partitions, where "equally sized" * refers to an approximately equal number of items per partition. * - * @param numEquallySized an integer that specifies the number of equally sized partitions between + * @param numEquallySizedParts an integer that specifies the number of equally sized partitions between * {@link GenericPartitionBoundaries#getMinItem() getMinItem()} and * {@link GenericPartitionBoundaries#getMaxItem() getMaxItem()}. - * This must be a positive integer greater than zero. + * This must be a positive integer less than + * {@link SketchPartitionLimits#getMaxPartitions() getMaxPartitions()} * <ul> * <li>A 1 will return: minItem, maxItem.</li> * <li>A 2 will return: minItem, median quantile, maxItem.</li> @@ -77,9 +81,59 @@ public interface PartitioningFeature<T> { * with the exception of the highest returned quantile, which is the upper boundary of the highest ranked partition. * * @return an instance of {@link GenericPartitionBoundaries GenericPartitionBoundaries}. - * @throws IllegalArgumentException if sketch is empty. - * @throws IllegalArgumentException if <i>numEquallySized</i> is less than 1. + * @throws SketchesArgumentException if sketch is empty. + * @throws SketchesArgumentException if <i>numEquallySized</i> is greater than + * {@link SketchPartitionLimits#getMaxPartitions() getMaxPartitions()} */ - GenericPartitionBoundaries<T> getPartitionBoundaries(int numEquallySized, QuantileSearchCriteria searchCrit); + GenericPartitionBoundaries<T> getPartitionBoundariesFromNumParts( + int numEquallySizedParts, QuantileSearchCriteria searchCrit); + + /** + * This method returns an instance of + * {@link GenericPartitionBoundaries GenericPartitionBoundaries} which provides + * sufficient information for the user to create the given number of equally sized partitions, where "equally sized" + * refers to an approximately equal number of items per partition. + * + * <p>This method is equivalent to + * {@link #getPartitionBoundariesFromPartSize(long, QuantileSearchCriteria) + * getPartitionBoundariesFromPartSize(nominalPartSizeItems, INCLUSIVE)}. + * </p> + * + * @param nominalPartSizeItems an integer that specifies the nominal size, in items, of each target partition. + * This must be a positive integer greater than + * {@link SketchPartitionLimits#getMinPartitionSizeItems() getMinPartitionSizeItems()} + * + * @return an instance of {@link GenericPartitionBoundaries GenericPartitionBoundaries}. + * @throws SketchesArgumentException if sketch is empty. + * @throws SketchesArgumentException if <i>nominalPartSizeItems</i> is less than + * {@link SketchPartitionLimits#getMinPartitionSizeItems() getMinPartitionSizeItems()} + */ + default GenericPartitionBoundaries<T> getPartitionBoundariesFromPartSize(long nominalPartSizeItems) { + return getPartitionBoundariesFromPartSize(nominalPartSizeItems, INCLUSIVE); + } + + /** + * This method returns an instance of + * {@link GenericPartitionBoundaries GenericPartitionBoundaries} which provides + * sufficient information for the user to create the given number of equally sized partitions, where "equally sized" + * refers to an approximately equal number of items per partition. + * + * @param nominalPartSizeItems an integer that specifies the nominal size, in items, of each target partition. + * This must be a positive integer greater than + * {@link SketchPartitionLimits#getMinPartitionSizeItems() getMinPartitionSizeItems()}. + * + * @param searchCrit + * If INCLUSIVE, all the returned quantiles are the upper boundaries of the equally sized partitions + * with the exception of the lowest returned quantile, which is the lowest boundary of the lowest ranked partition. + * If EXCLUSIVE, all the returned quantiles are the lower boundaries of the equally sized partitions + * with the exception of the highest returned quantile, which is the upper boundary of the highest ranked partition. + * + * @return an instance of {@link GenericPartitionBoundaries GenericPartitionBoundaries}. + * @throws SketchesArgumentException if sketch is empty. + * @throws SketchesArgumentException if <i>nominalPartSizeItems</i> is less than + * {@link SketchPartitionLimits#getMinPartitionSizeItems() getMinPartitionSizeItems()} + */ + GenericPartitionBoundaries<T> getPartitionBoundariesFromPartSize( + long nominalPartSizeItems, QuantileSearchCriteria searchCrit); } diff --git a/src/main/java/org/apache/datasketches/quantilescommon/QuantilesGenericAPI.java b/src/main/java/org/apache/datasketches/quantilescommon/QuantilesGenericAPI.java index 404ec7a7..d422f15c 100644 --- a/src/main/java/org/apache/datasketches/quantilescommon/QuantilesGenericAPI.java +++ b/src/main/java/org/apache/datasketches/quantilescommon/QuantilesGenericAPI.java @@ -27,7 +27,7 @@ import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.INC * @param <T> The given item type * @author Lee Rhodes */ -public interface QuantilesGenericAPI<T> extends QuantilesAPI, PartitioningFeature<T> { +public interface QuantilesGenericAPI<T> extends QuantilesAPI, PartitioningFeature<T>, SketchPartitionLimits { /** * This is equivalent to {@link #getCDF(Object[], QuantileSearchCriteria) getCDF(splitPoints, INCLUSIVE)} diff --git a/src/main/java/org/apache/datasketches/quantilescommon/SketchPartitionLimits.java b/src/main/java/org/apache/datasketches/quantilescommon/SketchPartitionLimits.java new file mode 100644 index 00000000..578f142a --- /dev/null +++ b/src/main/java/org/apache/datasketches/quantilescommon/SketchPartitionLimits.java @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.datasketches.quantilescommon; + +import static org.apache.datasketches.quantilescommon.QuantilesAPI.EMPTY_MSG; + +import org.apache.datasketches.common.SketchesArgumentException; + +/** + * This defines the methods required to compute the partition limits. + */ +public interface SketchPartitionLimits { + + /** + * Gets the maximum number of partitions this sketch will support based on the configured size <i>K</i> + * and the number of retained values of this sketch. + * @return the maximum number of partitions this sketch will support. + */ + default int getMaxPartitions() { + return getNumRetained() / 2; + } + + /** + * Gets the minimum partition size in items this sketch will support based on the configured size <i>K</i> of this + * sketch and the number of retained values of this sketch. + * @return the minimum partition size in items this sketch will support. + */ + default long getMinPartitionSizeItems() { + final long totalN = getN(); + if (totalN <= 0) { throw new SketchesArgumentException(EMPTY_MSG); } + return totalN / getMaxPartitions(); + } + + /** + * Gets the length of the input stream offered to the sketch.. + * @return the length of the input stream offered to the sketch. + */ + long getN(); + + /** + * Gets the number of quantiles retained by the sketch. + * @return the number of quantiles retained by the sketch + */ + int getNumRetained(); + +} diff --git a/src/test/java/org/apache/datasketches/quantiles/ItemsSketchTest.java b/src/test/java/org/apache/datasketches/quantiles/ItemsSketchTest.java index 02990ae0..69e73667 100644 --- a/src/test/java/org/apache/datasketches/quantiles/ItemsSketchTest.java +++ b/src/test/java/org/apache/datasketches/quantiles/ItemsSketchTest.java @@ -19,7 +19,6 @@ package org.apache.datasketches.quantiles; -import static org.apache.datasketches.quantiles.PreambleUtil.DEFAULT_K; import static org.apache.datasketches.quantilescommon.LongsAsOrderableStrings.getString; import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.EXCLUSIVE; import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.INCLUSIVE; @@ -623,8 +622,7 @@ public class ItemsSketchTest { Double[] qArr = {8.0, 10.0, 10.0, 20.0}; long[] cwArr = {1, 3, 4, 5}; Comparator<Double> comp = Comparator.naturalOrder(); - final double normRankErr = ItemsSketch.getNormalizedRankError(DEFAULT_K, true); - ItemsSketchSortedView<Double> sv = new ItemsSketchSortedView<>(qArr, cwArr, 5L, comp, 20.0, 8.0, normRankErr); + ItemsSketchSortedView<Double> sv = new ItemsSketchSortedView<>(qArr, cwArr, 5L, comp, 20.0, 8.0); double[] ranks = {0, .1, .2, .3, .6, .7, .8, .9, 1.0}; Double[] qOut = new Double[9]; for (int i = 0; i < ranks.length; i++) { diff --git a/src/test/java/org/apache/datasketches/quantilescommon/CrossCheckQuantilesTest.java b/src/test/java/org/apache/datasketches/quantilescommon/CrossCheckQuantilesTest.java index dd07ae60..00cd380f 100644 --- a/src/test/java/org/apache/datasketches/quantilescommon/CrossCheckQuantilesTest.java +++ b/src/test/java/org/apache/datasketches/quantilescommon/CrossCheckQuantilesTest.java @@ -39,7 +39,6 @@ import org.apache.datasketches.common.SketchesArgumentException; import org.apache.datasketches.kll.KllDoublesSketch; import org.apache.datasketches.kll.KllFloatsSketch; import org.apache.datasketches.kll.KllItemsSketch; -import org.apache.datasketches.kll.KllSketch; import org.apache.datasketches.quantiles.DoublesSketch; import org.apache.datasketches.quantiles.ItemsSketch; import org.apache.datasketches.quantiles.UpdateDoublesSketch; @@ -321,11 +320,10 @@ public class CrossCheckQuantilesTest { String svImin = svIValues[set][0]; kllItemsSV = new ItemsSketchSortedView<>(svIValues[set], svCumWeights[set], totalN[set], - comparator, svImax, svImin, KllSketch.getNormalizedRankError(k, true)); + comparator, svImax, svImin); classicItemsSV = new ItemsSketchSortedView<>(svIValues[set], svCumWeights[set], totalN[set], - comparator, svImax, svImin, ItemsSketch.getNormalizedRankError(k, true)); - + comparator, svImax, svImin); } /********BUILD DATA SETS**********/ diff --git a/src/test/java/org/apache/datasketches/quantilescommon/PartitionBoundariesTest.java b/src/test/java/org/apache/datasketches/quantilescommon/PartitionBoundariesTest.java index d4a3eb43..1849c04f 100644 --- a/src/test/java/org/apache/datasketches/quantilescommon/PartitionBoundariesTest.java +++ b/src/test/java/org/apache/datasketches/quantilescommon/PartitionBoundariesTest.java @@ -71,7 +71,7 @@ public class PartitionBoundariesTest { printf(rowdfmt2, j++, itr.getNormalizedRank(searchCrit), itr.getNaturalRank(searchCrit), itr.getQuantile()); } - GenericPartitionBoundaries<String> gpb = sv.getPartitionBoundaries(numParts, searchCrit); + GenericPartitionBoundaries<String> gpb = sv.getPartitionBoundariesFromNumParts(numParts, searchCrit); int arrLen = gpb.getBoundaries().length; double[] normRanks = gpb.getNormalizedRanks(); long[] natRanks = gpb.getNaturalRanks(); @@ -111,7 +111,7 @@ public class PartitionBoundariesTest { printf(rowdfmt2, j++, itr.getNormalizedRank(searchCrit), itr.getNaturalRank(searchCrit), itr.getQuantile()); } - GenericPartitionBoundaries<String> gpb = sv.getPartitionBoundaries(numParts, searchCrit); + GenericPartitionBoundaries<String> gpb = sv.getPartitionBoundariesFromNumParts(numParts, searchCrit); int arrLen = gpb.getBoundaries().length; double[] normRanks = gpb.getNormalizedRanks(); long[] natRanks = gpb.getNaturalRanks(); @@ -136,10 +136,10 @@ public class PartitionBoundariesTest { sketch.update("C"); sketch.update("D"); String[] quantiles1 = sketch.getQuantiles(new double[] {0.0, 0.5, 1.0}, EXCLUSIVE); - String[] quantiles2 = sketch.getPartitionBoundaries(2, EXCLUSIVE).getBoundaries(); + String[] quantiles2 = sketch.getPartitionBoundariesFromNumParts(2, EXCLUSIVE).getBoundaries(); assertEquals(quantiles1, quantiles2); quantiles1 = sketch.getQuantiles(new double[] {0.0, 0.5, 1.0}, INCLUSIVE); - quantiles2 = sketch.getPartitionBoundaries(2, INCLUSIVE).getBoundaries(); + quantiles2 = sketch.getPartitionBoundariesFromNumParts(2, INCLUSIVE).getBoundaries(); assertEquals(quantiles1, quantiles2); } @@ -151,10 +151,10 @@ public class PartitionBoundariesTest { sketch.update(3); sketch.update(4); Integer[] quantiles1 = sketch.getQuantiles(new double[] {0.0, 0.5, 1.0}, EXCLUSIVE); - Integer[] quantiles2 = sketch.getPartitionBoundaries(2, EXCLUSIVE).getBoundaries(); + Integer[] quantiles2 = sketch.getPartitionBoundariesFromNumParts(2, EXCLUSIVE).getBoundaries(); assertEquals(quantiles1, quantiles2); quantiles1 = sketch.getQuantiles(new double[] {0.0, 0.5, 1.0}, INCLUSIVE); - quantiles2 = sketch.getPartitionBoundaries(2, INCLUSIVE).getBoundaries(); + quantiles2 = sketch.getPartitionBoundariesFromNumParts(2, INCLUSIVE).getBoundaries(); assertEquals(quantiles1, quantiles2); } @@ -164,22 +164,20 @@ public class PartitionBoundariesTest { */ @Test public void checkSimpleEndsAdjustment() { - final int k = 128; final String[] quantiles = {"2","4","6","7"}; final long[] cumWeights = {2, 4, 6, 8}; final long totalN = 8; final Comparator<String> comparator = Comparator.naturalOrder(); final String maxItem = "8"; final String minItem = "1"; - final double normRankErr = ItemsSketch.getNormalizedRankError(k, true); ItemsSketchSortedView<String> sv = new ItemsSketchSortedView<>( - quantiles, cumWeights, totalN, comparator, maxItem, minItem, normRankErr); + quantiles, cumWeights, totalN, comparator, maxItem, minItem); GenericSortedViewIterator<String> itr = sv.iterator(); while (itr.next()) { println(itr.getNaturalRank(INCLUSIVE) + ", " + itr.getQuantile(INCLUSIVE)); } - GenericPartitionBoundaries<String> gpb = sv.getPartitionBoundaries(2); + GenericPartitionBoundaries<String> gpb = sv.getPartitionBoundariesFromNumParts(2); String[] boundaries = gpb.getBoundaries(); long[] natRanks = gpb.getNaturalRanks(); double[] normRanks = gpb.getNormalizedRanks(); --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
