proost commented on code in PR #120:
URL: https://github.com/apache/datasketches-go/pull/120#discussion_r2780597319
##########
sampling/reservoir_items_sketch.go:
##########
@@ -93,9 +107,14 @@ func NewReservoirItemsSketch[T any](
opt(options)
}
+ lgRf, err := resizeFactorLg(options.resizeFactor)
+ if err != nil {
+ return nil, err
+ }
+
ceilingLgK, _ := internal.ExactLog2(common.CeilingPowerOf2(k))
initialLgSize := startingSubMultiple(
- ceilingLgK, int(math.Log2(float64(options.resizeFactor))),
minLgArrItems,
Review Comment:
using ResizeFactor itself is ok , right?
##########
sampling/reservoir_items_sketch.go:
##########
@@ -44,11 +42,27 @@ const (
defaultResizeFactor = ResizeX8
minK = 2
+ maxItemsSeen = int64(0xFFFFFFFFFFFF)
// smallest sampling array allocated: 16
minLgArrItems = 4
)
+func resizeFactorLg(rf ResizeFactor) (int, error) {
Review Comment:
You have already defined ResizeFactor using iota. so, verbose case doesn't
need.
##########
sampling/reservoir_items_sketch_test.go:
##########
@@ -301,3 +343,34 @@ func TestReservoirItemsSketchLegacySerVerEmpty(t
*testing.T) {
assert.Equal(t, 1024, sketch.K())
assert.Equal(t, ResizeX8, sketch.rf)
}
+
+func TestReservoirItemsSketchUpdateReturnsErrorAtMaxItemsSeen(t *testing.T) {
Review Comment:
I think this case is included already defined test function as nest case.
##########
sampling/reservoir_items_sketch.go:
##########
@@ -271,8 +312,15 @@ func (s *ReservoirItemsSketch[T])
insertValueAtPosition(item T, pos int) {
}
// forceIncrementItemsSeen adds delta to the items seen count.
-func (s *ReservoirItemsSketch[T]) forceIncrementItemsSeen(delta int64) {
+func (s *ReservoirItemsSketch[T]) forceIncrementItemsSeen(delta int64) error {
Review Comment:
If you add `delta` to `s.n` first, it already change state. so guarding not
to above maxItemsSeen is correct.
##########
sampling/reservoir_items_sketch.go:
##########
@@ -106,27 +125,44 @@ func NewReservoirItemsSketch[T any](
}
// Update adds an item to the sketch using reservoir sampling algorithm.
-func (s *ReservoirItemsSketch[T]) Update(item T) {
+func (s *ReservoirItemsSketch[T]) Update(item T) error {
+ if s.n == maxItemsSeen {
+ return fmt.Errorf("sketch has exceeded capacity for total items
seen: %d", maxItemsSeen)
+ }
+
if s.n < int64(s.k) {
// Initial phase: store all items until reservoir is full
if s.n >= int64(cap(s.data)) {
- s.growReservoir()
+ if err := s.growReservoir(); err != nil {
+ return err
+ }
}
s.data = append(s.data, item)
+ s.n++
} else {
// Steady state: replace with probability k/n
- j := rand.Int63n(s.n + 1)
- if j < int64(s.k) {
- s.data[j] = item
+ s.n++
+ if rand.Float64()*float64(s.n) < float64(s.k) {
+ s.data[rand.Intn(s.k)] = item
}
}
- s.n++
+ return nil
}
-func (s *ReservoirItemsSketch[T]) growReservoir() {
- adjustedSize := adjustedSamplingAllocationSize(s.k,
cap(s.data)<<int(s.rf))
- s.data = slices.Grow(s.data, adjustedSize)
+func (s *ReservoirItemsSketch[T]) growReservoir() error {
+ lgRf, err := resizeFactorLg(s.rf)
+ if err != nil {
+ return err
+ }
+ targetCap := adjustedSamplingAllocationSize(s.k, cap(s.data)<<lgRf)
+ if targetCap <= cap(s.data) {
+ return nil
+ }
+ newData := make([]T, len(s.data), targetCap)
Review Comment:
L162 ~ L164 is too expensive. that is why i use `Grow`
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]