This is an automated email from the ASF dual-hosted git repository. alsay pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/datasketches-go.git
commit 3c48b48bdd5205f501e11d4a90d4d84a18689082 Author: Pierre Lacave <[email protected]> AuthorDate: Thu Dec 21 13:35:51 2023 +0100 Add remaing tests and cleanup --- common/testutils/utils.go | 14 ++ common/utils.go | 7 +- frequencies/dist_test.go | 13 -- frequencies/error_types.go | 39 ---- frequencies/longs_sketch.go | 232 +++++++++++++++------ frequencies/longs_sketch_serialization_test.go | 145 +++++++++++++ frequencies/longs_sketch_test.go | 273 +++++++++++++++---------- frequencies/row.go | 18 +- frequencies/utils.go | 33 +++ hll/hll_sketch_serialization_test.go | 50 ++--- hll/hll_sketch_test.go | 6 - 11 files changed, 554 insertions(+), 276 deletions(-) diff --git a/common/testutils/utils.go b/common/testutils/utils.go new file mode 100644 index 0000000..7f8586a --- /dev/null +++ b/common/testutils/utils.go @@ -0,0 +1,14 @@ +package testutils + +const ( + DSketchTestGenerateGo = "DSKETCH_TEST_GENERATE_GO" + DSketchTestCrossJava = "DSKETCH_TEST_CROSS_JAVA" + DSketchTestCrossCpp = "DSKETCH_TEST_CROSS_CPP" + DSketchTestCrossGo = "DSKETCH_TEST_CROSS_GO" +) + +const ( + JavaPath = "../serialization_test_data/java_generated_files" + CppPath = "../serialization_test_data/cpp_generated_files" + GoPath = "../serialization_test_data/go_generated_files" +) diff --git a/common/utils.go b/common/utils.go index 305fa00..80a8b0d 100644 --- a/common/utils.go +++ b/common/utils.go @@ -28,6 +28,7 @@ const ( InverseGolden = float64(0.6180339887498949025) ) + // InvPow2 returns 2^(-e). func InvPow2(e int) (float64, error) { if (e | 1024 - e - 1) < 0 { @@ -49,14 +50,14 @@ func CeilPowerOf2(n int) int { } func ExactLog2(powerOf2 int) (int, error) { - if !isPowerOf2(powerOf2) { + if !IsPowerOf2(powerOf2) { return 0, fmt.Errorf("argument 'powerOf2' must be a positive power of 2") } return bits.TrailingZeros64(uint64(powerOf2)), nil } -// isPowerOf2 returns true if the given number is a power of 2. -func isPowerOf2(powerOf2 int) bool { +// IsPowerOf2 returns true if the given number is a power of 2. +func IsPowerOf2(powerOf2 int) bool { return powerOf2 > 0 && (powerOf2&(powerOf2-1)) == 0 } diff --git a/frequencies/dist_test.go b/frequencies/dist_test.go deleted file mode 100644 index 2b67ae0..0000000 --- a/frequencies/dist_test.go +++ /dev/null @@ -1,13 +0,0 @@ -package frequencies - -import ( - "math" - "math/rand" -) - -func randomGeometricDist(prob float64) int64 { - if prob <= 0.0 || prob >= 1.0 { - panic("prob must be in (0, 1)") - } - return int64(1 + math.Log(rand.Float64())/math.Log(1.0-prob)) -} diff --git a/frequencies/error_types.go b/frequencies/error_types.go deleted file mode 100644 index 730f331..0000000 --- a/frequencies/error_types.go +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package frequencies - -type ErrorType struct { - id int - Name string -} - -type ErrorTypes struct { - NO_FALSE_POSITIVES ErrorType - NO_FALSE_NEGATIVES ErrorType -} - -var ErrorTypeEnum = &ErrorTypes{ - NO_FALSE_POSITIVES: ErrorType{ - id: 1, - Name: "NO_FALSE_POSITIVES", - }, - NO_FALSE_NEGATIVES: ErrorType{ - id: 2, - Name: "NO_FALSE_NEGATIVES", - }, -} diff --git a/frequencies/longs_sketch.go b/frequencies/longs_sketch.go index 6574c33..41c6b34 100644 --- a/frequencies/longs_sketch.go +++ b/frequencies/longs_sketch.go @@ -19,6 +19,7 @@ package frequencies import ( "encoding/binary" + "errors" "fmt" "github.com/apache/datasketches-go/common" "math/bits" @@ -59,13 +60,15 @@ const ( * map managed by this sketch. */ -// NewLongSketch returns a new LongsSketch with the given lgMaxMapSize and lgCurMapSize. +// NewLongsSketch returns a new LongsSketch with the given lgMaxMapSize and lgCurMapSize. +// // lgMaxMapSize is the log2 of the physical size of the internal hash map managed by this // sketch. The maximum capacity of this internal hash map is 0.75 times 2^lgMaxMapSize. // Both the ultimate accuracy and size of this sketch are a function of lgMaxMapSize. +// // lgCurMapSize is the log2 of the starting (current) physical size of the internal hash // map managed by this sketch. -func NewLongSketch(lgMaxMapSize int, lgCurMapSize int) (*LongsSketch, error) { +func NewLongsSketch(lgMaxMapSize int, lgCurMapSize int) (*LongsSketch, error) { //set initial size of hash map lgMaxMapSize = max(lgMaxMapSize, _LG_MIN_MAP_SIZE) lgCurMapSize = max(lgCurMapSize, _LG_MIN_MAP_SIZE) @@ -86,21 +89,26 @@ func NewLongSketch(lgMaxMapSize int, lgCurMapSize int) (*LongsSketch, error) { }, nil } -// NewLongSketchWithMaxMapSize constructs a new LongsSketch with the given maxMapSize and the +// NewLongsSketchWithMaxMapSize constructs a new LongsSketch with the given maxMapSize and the // default initialMapSize (8). +// // maxMapSize determines the physical size of the internal hash map managed by this // sketch and must be a power of 2. The maximum capacity of this internal hash map is // 0.75 times * maxMapSize. Both the ultimate accuracy and size of this sketch are a // function of maxMapSize. -func NewLongSketchWithMaxMapSize(maxMapSize int) (*LongsSketch, error) { +func NewLongsSketchWithMaxMapSize(maxMapSize int) (*LongsSketch, error) { log2OfInt, err := common.ExactLog2(maxMapSize) if err != nil { return nil, fmt.Errorf("maxMapSize, %e", err) } - return NewLongSketch(log2OfInt, _LG_MIN_MAP_SIZE) + return NewLongsSketch(log2OfInt, _LG_MIN_MAP_SIZE) } -func NewLongSketchFromSlice(slc []byte) (*LongsSketch, error) { +// NewLongsSketchFromSlice returns a sketch instance of this class from the given slice, +// which must be a byte slice representation of this sketch class. +// +// slc is a byte slice representation of a sketch of this class. +func NewLongsSketchFromSlice(slc []byte) (*LongsSketch, error) { pre0, err := checkPreambleSize(slc) if err != nil { return nil, err @@ -130,14 +138,14 @@ func NewLongSketchFromSlice(slc []byte) (*LongsSketch, error) { return nil, fmt.Errorf("Possible Corruption: Empty Flag set incorrectly: %t", preLongsEq1) } if empty { - return NewLongSketch(lgMaxMapSize, _LG_MIN_MAP_SIZE) + return NewLongsSketch(lgMaxMapSize, _LG_MIN_MAP_SIZE) } // get full preamble preArr := make([]int64, preLongs) for i := 0; i < preLongs; i++ { preArr[i] = int64(binary.LittleEndian.Uint64(slc[i<<3:])) } - fls, err := NewLongSketch(lgMaxMapSize, lgCurMapSize) + fls, err := NewLongsSketch(lgMaxMapSize, lgCurMapSize) if err != nil { return nil, err } @@ -173,7 +181,11 @@ func NewLongSketchFromSlice(slc []byte) (*LongsSketch, error) { return fls, nil } -func NewLongSketchFromString(str string) (*LongsSketch, error) { +// NewLongsSketchFromString returns a sketch instance of this class from the given string, +// which must be a string representation of this sketch class. +// +// str is a string representation of a sketch of this class. +func NewLongsSketchFromString(str string) (*LongsSketch, error) { if len(str) < 1 { return nil, fmt.Errorf("String is empty") } @@ -237,7 +249,7 @@ func NewLongSketchFromString(str string) (*LongsSketch, error) { return nil, fmt.Errorf("Possible Corruption: Incorrect # of tokens: %d, numActive: %d", numTokens, numActive) } // if ((2 * numActive) != (numTokens - STR_PREAMBLE_TOKENS - 2)) { - sk, err := NewLongSketch(int(lgMax), int(lgCur)) + sk, err := NewLongsSketch(int(lgMax), int(lgCur)) if err != nil { return nil, err } @@ -250,7 +262,43 @@ func NewLongSketchFromString(str string) (*LongsSketch, error) { return sk, nil } -func (s *LongsSketch) getEstimate(item int64) (int64, error) { +// GetAprioriErrorLongsSketch returns the estimated a priori error given the maxMapSize for the sketch and the +// estimatedTotalStreamWeight. +// +// maxMapSize is the planned map size to be used when constructing this sketch. +// estimatedTotalStreamWeight is the estimated total stream weight. +func GetAprioriErrorLongsSketch(maxMapSize int, estimatedTotalStreamWeight int64) (float64, error) { + epsilon, err := GetEpsilonLongsSketch(maxMapSize) + if err != nil { + return 0, err + } + return epsilon * float64(estimatedTotalStreamWeight), nil +} + +// GetCurrentMapCapacity returns the current number of counters the sketch is configured to support. +func (s *LongsSketch) GetCurrentMapCapacity() int { + return s.curMapCap +} + +// GetEpsilonLongsSketch returns epsilon used to compute a priori error. +// This is just the value 3.5 / maxMapSize. +// +// maxMapSize is the planned map size to be used when constructing this sketch. +func GetEpsilonLongsSketch(maxMapSize int) (float64, error) { + if !common.IsPowerOf2(maxMapSize) { + return 0, errors.New("maxMapSize is not a power of 2") + } + return 3.5 / float64(maxMapSize), nil +} + +// GetEstimate gets the estimate of the frequency of the given item. +// Note: The true frequency of a item would be the sum of the counts as a result of the +// two update functions. +// +// item is the given item +// +// return the estimate of the frequency of the given item +func (s *LongsSketch) GetEstimate(item int64) (int64, error) { itemCount, err := s.hashMap.get(item) if err != nil { return 0, err @@ -258,12 +306,25 @@ func (s *LongsSketch) getEstimate(item int64) (int64, error) { return itemCount + s.offset, nil } -func (s *LongsSketch) getLowerBound(item int64) (int64, error) { +// GetLowerBound gets the guaranteed lower bound frequency of the given item, which can never be +// negative. +// +// item is the given item. +// +// return the guaranteed lower bound frequency of the given item. That is, a number which +// is guaranteed to be no larger than the real frequency. +func (s *LongsSketch) GetLowerBound(item int64) (int64, error) { // LB = itemCount return s.hashMap.get(item) } -func (s *LongsSketch) getUpperBound(item int64) (int64, error) { +// GetUpperBound gets the guaranteed upper bound frequency of the given item. +// +// item is the given item. +// +// return the guaranteed upper bound frequency of the given item. That is, a number which +// is guaranteed to be no smaller than the real frequency. +func (s *LongsSketch) GetUpperBound(item int64) (int64, error) { itemCount, err := s.hashMap.get(item) if err != nil { return 0, err @@ -271,43 +332,91 @@ func (s *LongsSketch) getUpperBound(item int64) (int64, error) { return itemCount + s.offset, nil } -func (s *LongsSketch) getNumActiveItems() int { +// GetFrequentItemsWithThreshold returns an array of Rows that include frequent items, estimates, upper and +// lower bounds given a threshold and an ErrorCondition. If the threshold is lower than +// getMaximumError(), then getMaximumError() will be used instead. +// +// The method first examines all active items in the sketch (items that have a counter). +// +// If errorType = NO_FALSE_NEGATIVES, this will include an item in the result list if +// GetUpperBound(item) > threshold. There will be no false negatives, i.e., no Type II error. +// There may be items in the set with true frequencies less than the threshold (false positives). +// +// If errorType = NO_FALSE_POSITIVES, this will include an item in the result list if +// GetLowerBound(item) > threshold. There will be no false positives, i.e., no Type I error. +// There may be items omitted from the set with true frequencies greater than the threshold +// (false negatives). This is a subset of the NO_FALSE_NEGATIVES case. +// +// threshold to include items in the result list +// errorType determines whether no false positives or no false negatives are desired. +// an array of frequent items +func (s *LongsSketch) GetFrequentItemsWithThreshold(threshold int64, errorType ErrorType) ([]*Row, error) { + finalThreshold := s.GetMaximumError() + if threshold > finalThreshold { + finalThreshold = threshold + } + return sortItems(s, finalThreshold, errorType) +} + +// GetFrequentItems returns an array of Rows that include frequent items, estimates, upper and +// lower bounds given an ErrorCondition and the default threshold. +// This is the same as GetFrequentItemsWithThreshold(getMaximumError(), errorType) +// +// errorType determines whether no false positives or no false negatives are desired. +func (s *LongsSketch) GetFrequentItems(errorType ErrorType) ([]*Row, error) { + return sortItems(s, s.GetMaximumError(), errorType) +} + +// GetNumActiveItems returns the number of active items in the sketch. +func (s *LongsSketch) GetNumActiveItems() int { return s.hashMap.numActive } -// getMaximumMapCapacity returns the maximum number of counters the sketch is configured to +// GetMaximumError return an upper bound on the maximum error of GetEstimate(item) for any item. +// This is equivalent to the maximum distance between the upper bound and the lower bound +// for any item. +func (s *LongsSketch) GetMaximumError() int64 { + return s.offset +} + +// GetMaximumMapCapacity returns the maximum number of counters the sketch is configured to // support. -func (s *LongsSketch) getMaximumMapCapacity() int { +func (s *LongsSketch) GetMaximumMapCapacity() int { return int(float64(uint64(1<<s.lgMaxMapSize)) * reversePurgeLongHashMapLoadFactor) } -func (s *LongsSketch) getStorageBytes() int { - if s.isEmpty() { +// GetStorageBytes returns the number of bytes required to store this sketch as slice +func (s *LongsSketch) GetStorageBytes() int { + if s.IsEmpty() { return 8 } - return (4 * 8) + (16 * s.getNumActiveItems()) + return (4 * 8) + (16 * s.GetNumActiveItems()) } -func (s *LongsSketch) getCurrentMapCapacity() int { - return s.curMapCap -} - -func (s *LongsSketch) getMaximumError() int64 { - return s.offset -} - -func (s *LongsSketch) getStreamLength() int64 { +// GetStreamLength returns the sum of the frequencies (weights or counts) in the stream seen +// so far by the sketch +func (s *LongsSketch) GetStreamLength() int64 { return s.streamWeight } -func (s *LongsSketch) isEmpty() bool { - return s.getNumActiveItems() == 0 +// IsEmpty returns true if this sketch is empty +func (s *LongsSketch) IsEmpty() bool { + return s.GetNumActiveItems() == 0 } +// Update update this sketch with an item and a frequency count of one. +// +// item for which the frequency should be increased. func (s *LongsSketch) Update(item int64) error { return s.UpdateMany(item, 1) } +// UpdateMany update this sketch with a item and a positive frequency count (or weight). +// +// item for which the frequency should be increased. The item can be any long value +// and is only used by the sketch to determine uniqueness. +// count the amount by which the frequency of the item should be increased. +// An count of zero is a no-op, and a negative count will throw an exception. func (s *LongsSketch) UpdateMany(item int64, count int64) error { if count == 0 { return nil @@ -330,7 +439,7 @@ func (s *LongsSketch) UpdateMany(item int64, count int64) error { } else { // At tgt size, must purge s.offset += s.hashMap.purge(s.sampleSize) - if s.getNumActiveItems() > s.getMaximumMapCapacity() { + if s.GetNumActiveItems() > s.GetMaximumMapCapacity() { return fmt.Errorf("purge did not reduce active items") } } @@ -338,23 +447,17 @@ func (s *LongsSketch) UpdateMany(item int64, count int64) error { return nil } -func (s *LongsSketch) String() string { - var sb strings.Builder - sb.WriteString("FrequentLongsSketch:") - sb.WriteString("\n") - sb.WriteString(" Stream Length : " + strconv.FormatInt(s.streamWeight, 10)) - sb.WriteString("\n") - sb.WriteString(" Max Error Offset : " + strconv.FormatInt(s.offset, 10)) - sb.WriteString("\n") - sb.WriteString(s.hashMap.String()) - return sb.String() -} - -func (s *LongsSketch) merge(other *LongsSketch) (*LongsSketch, error) { - if other == nil || other.isEmpty() { +// Merge merges the other sketch into this one. The other sketch may be of a different size. +// +// other sketch of this class +// +// return a sketch whose estimates are within the guarantees of the largest error tolerance +// of the two merged sketches. +func (s *LongsSketch) Merge(other *LongsSketch) (*LongsSketch, error) { + if other == nil || other.IsEmpty() { return s, nil } - streamWt := s.streamWeight + other.streamWeight //capture before merge + streamWt := s.streamWeight + other.streamWeight //capture before Merge iter := other.hashMap.iterator() for iter.next() { err := s.UpdateMany(iter.getKey(), iter.getValue()) @@ -367,7 +470,8 @@ func (s *LongsSketch) merge(other *LongsSketch) (*LongsSketch, error) { return s, nil } -func (s *LongsSketch) serializeToString() (string, error) { +// ToString returns a String representation of this sketch +func (s *LongsSketch) ToString() (string, error) { var sb strings.Builder //start the string with parameters of the sketch serVer := _SER_VER //0 @@ -385,9 +489,10 @@ func (s *LongsSketch) serializeToString() (string, error) { return sb.String(), nil } -func (s *LongsSketch) toSlice() ([]byte, error) { - emtpy := s.isEmpty() - activeItems := s.getNumActiveItems() +// ToSlice returns a slice representation of this sketch +func (s *LongsSketch) ToSlice() ([]byte, error) { + emtpy := s.IsEmpty() + activeItems := s.GetNumActiveItems() preLongs := 1 outBytes := 8 if !emtpy { @@ -428,6 +533,7 @@ func (s *LongsSketch) toSlice() ([]byte, error) { return outArr, nil } +// Reset resets this sketch to a virgin state. func (s *LongsSketch) Reset() { hasMap, _ := NewReversePurgeLongHashMap(1 << _LG_MIN_MAP_SIZE) s.curMapCap = hasMap.getCapacity() @@ -436,20 +542,14 @@ func (s *LongsSketch) Reset() { s.hashMap = hasMap } -/* - public Row[] getFrequentItems(final long threshold, final ErrorType errorType) { - return sortItems(threshold > getMaximumError() ? threshold : getMaximumError(), errorType); - } -*/ - -func (s *LongsSketch) getFrequentItemsWithThreshold(threshold int64, errorType ErrorType) ([]*Row, error) { - finalThreshold := s.getMaximumError() - if threshold > finalThreshold { - finalThreshold = threshold - } - return sortItems(s, finalThreshold, errorType) -} - -func (s *LongsSketch) getFrequentItems(errorType ErrorType) ([]*Row, error) { - return sortItems(s, s.getMaximumError(), errorType) +func (s *LongsSketch) String() string { + var sb strings.Builder + sb.WriteString("FrequentLongsSketch:") + sb.WriteString("\n") + sb.WriteString(" Stream Length : " + strconv.FormatInt(s.streamWeight, 10)) + sb.WriteString("\n") + sb.WriteString(" Max Error Offset : " + strconv.FormatInt(s.offset, 10)) + sb.WriteString("\n") + sb.WriteString(s.hashMap.String()) + return sb.String() } diff --git a/frequencies/longs_sketch_serialization_test.go b/frequencies/longs_sketch_serialization_test.go new file mode 100644 index 0000000..c54351d --- /dev/null +++ b/frequencies/longs_sketch_serialization_test.go @@ -0,0 +1,145 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package frequencies + +import ( + "fmt" + "github.com/apache/datasketches-go/common/testutils" + "github.com/stretchr/testify/assert" + "os" + "testing" +) + +func TestGenerateGoBinariesForCompatibilityTestingLongsSketch(t *testing.T) { + if len(os.Getenv(testutils.DSketchTestGenerateGo)) == 0 { + t.Skipf("%s not set", testutils.DSketchTestGenerateGo) + } + + nArr := []int{0, 1, 10, 100, 1000, 10000, 100000, 1000000} + for _, n := range nArr { + sk, err := NewLongsSketchWithMaxMapSize(64) + assert.NoError(t, err) + for i := 1; i <= n; i++ { + err = sk.Update(int64(i)) + assert.NoError(t, err) + } + if n == 0 { + assert.True(t, sk.IsEmpty()) + } else { + assert.False(t, sk.IsEmpty()) + } + if n > 10 { + assert.True(t, sk.GetMaximumError() > 0) + } else { + assert.Equal(t, sk.GetMaximumError(), int64(0)) + } + err = os.MkdirAll(testutils.GoPath, os.ModePerm) + assert.NoError(t, err) + + slc, err := sk.ToSlice() + assert.NoError(t, err) + err = os.WriteFile(fmt.Sprintf("%s/frequent_long_n%d_go.sk", testutils.GoPath, n), slc, 0644) + if err != nil { + t.Errorf("err != nil") + } + } +} + +func TestGoCompat(t *testing.T) { + if len(os.Getenv(testutils.DSketchTestCrossGo)) == 0 { + t.Skipf("%s not set", testutils.DSketchTestCrossGo) + } + + nArr := []int{0, 1, 10, 100, 1000, 10000, 100000, 1000000} + for _, n := range nArr { + bytes, err := os.ReadFile(fmt.Sprintf("%s/frequent_long_n%d_go.sk", testutils.GoPath, n)) + assert.NoError(t, err) + sketch, err := NewLongsSketchFromSlice(bytes) + if err != nil { + return + } + + if n == 0 { + assert.True(t, sketch.IsEmpty()) + } else { + assert.False(t, sketch.IsEmpty()) + } + if n > 10 { + assert.True(t, sketch.GetMaximumError() > 0) + } else { + assert.Equal(t, sketch.GetMaximumError(), int64(0)) + } + assert.Equal(t, sketch.GetStreamLength(), int64(n)) + } +} + +func TestJavaCompat(t *testing.T) { + if len(os.Getenv(testutils.DSketchTestCrossJava)) == 0 { + t.Skipf("%s not set", testutils.DSketchTestCrossJava) + } + + nArr := []int{0, 1, 10, 100, 1000, 10000, 100000, 1000000} + for _, n := range nArr { + bytes, err := os.ReadFile(fmt.Sprintf("%s/frequent_long_n%d_java.sk", testutils.JavaPath, n)) + assert.NoError(t, err) + sketch, err := NewLongsSketchFromSlice(bytes) + if err != nil { + return + } + + if n == 0 { + assert.True(t, sketch.IsEmpty()) + } else { + assert.False(t, sketch.IsEmpty()) + } + if n > 10 { + assert.True(t, sketch.GetMaximumError() > 0) + } else { + assert.Equal(t, sketch.GetMaximumError(), int64(0)) + } + assert.Equal(t, sketch.GetStreamLength(), int64(n)) + } +} + +func TestCppCompat(t *testing.T) { + if len(os.Getenv(testutils.DSketchTestCrossCpp)) == 0 { + t.Skipf("%s not set", testutils.DSketchTestCrossCpp) + } + + nArr := []int{0, 1, 10, 100, 1000, 10000, 100000, 1000000} + for _, n := range nArr { + bytes, err := os.ReadFile(fmt.Sprintf("%s/frequent_long_n%d_cpp.sk", testutils.CppPath, n)) + assert.NoError(t, err) + sketch, err := NewLongsSketchFromSlice(bytes) + if err != nil { + return + } + + if n == 0 { + assert.True(t, sketch.IsEmpty()) + } else { + assert.False(t, sketch.IsEmpty()) + } + if n > 10 { + assert.True(t, sketch.GetMaximumError() > 0) + } else { + assert.Equal(t, sketch.GetMaximumError(), int64(0)) + } + assert.Equal(t, sketch.GetStreamLength(), int64(n)) + } +} diff --git a/frequencies/longs_sketch_test.go b/frequencies/longs_sketch_test.go index 20c9898..895c18b 100644 --- a/frequencies/longs_sketch_test.go +++ b/frequencies/longs_sketch_test.go @@ -27,9 +27,9 @@ import ( ) func TestFrequentItemsStringSerial(t *testing.T) { - sketch, err := NewLongSketchWithMaxMapSize(8) + sketch, err := NewLongsSketchWithMaxMapSize(8) assert.NoError(t, err) - sketch2, err := NewLongSketchWithMaxMapSize(128) + sketch2, err := NewLongsSketchWithMaxMapSize(128) assert.NoError(t, err) sketch.UpdateMany(10, 100) sketch.UpdateMany(10, 100) @@ -37,15 +37,15 @@ func TestFrequentItemsStringSerial(t *testing.T) { sketch.UpdateMany(1000001, 1010230) sketch.UpdateMany(1000002, 1010230) - ser, err := sketch.serializeToString() + ser, err := sketch.ToString() assert.NoError(t, err) - newSk0, err := NewLongSketchFromString(ser) + newSk0, err := NewLongsSketchFromString(ser) assert.NoError(t, err) - newSer0, err := newSk0.serializeToString() + newSer0, err := newSk0.ToString() assert.NoError(t, err) assert.Equal(t, ser, newSer0) - assert.Equal(t, sketch.getMaximumMapCapacity(), newSk0.getMaximumMapCapacity()) - assert.Equal(t, sketch.getCurrentMapCapacity(), newSk0.getCurrentMapCapacity()) + assert.Equal(t, sketch.GetMaximumMapCapacity(), newSk0.GetMaximumMapCapacity()) + assert.Equal(t, sketch.GetCurrentMapCapacity(), newSk0.GetCurrentMapCapacity()) sketch2.UpdateMany(190, 12902390) sketch2.UpdateMany(191, 12902390) @@ -67,44 +67,44 @@ func TestFrequentItemsStringSerial(t *testing.T) { sketch2.UpdateMany(207, 12902390) sketch2.UpdateMany(208, 12902390) - s2, err := sketch2.serializeToString() + s2, err := sketch2.ToString() assert.NoError(t, err) - newSk2, err := NewLongSketchFromString(s2) + newSk2, err := NewLongsSketchFromString(s2) assert.NoError(t, err) - newS2, err := newSk2.serializeToString() + newS2, err := newSk2.ToString() assert.NoError(t, err) assert.Equal(t, s2, newS2) - assert.Equal(t, sketch2.getMaximumMapCapacity(), newSk2.getMaximumMapCapacity()) - assert.Equal(t, sketch2.getCurrentMapCapacity(), newSk2.getCurrentMapCapacity()) - assert.Equal(t, sketch2.getStreamLength(), newSk2.getStreamLength()) + assert.Equal(t, sketch2.GetMaximumMapCapacity(), newSk2.GetMaximumMapCapacity()) + assert.Equal(t, sketch2.GetCurrentMapCapacity(), newSk2.GetCurrentMapCapacity()) + assert.Equal(t, sketch2.GetStreamLength(), newSk2.GetStreamLength()) - mergedSketch, err := sketch.merge(sketch2) + mergedSketch, err := sketch.Merge(sketch2) assert.NoError(t, err) - ms, err := mergedSketch.serializeToString() + ms, err := mergedSketch.ToString() assert.NoError(t, err) - newMs, err := NewLongSketchFromString(ms) + newMs, err := NewLongsSketchFromString(ms) assert.NoError(t, err) - newSMs, err := newMs.serializeToString() + newSMs, err := newMs.ToString() assert.NoError(t, err) assert.Equal(t, ms, newSMs) - assert.Equal(t, mergedSketch.getMaximumMapCapacity(), newMs.getMaximumMapCapacity()) - assert.Equal(t, mergedSketch.getCurrentMapCapacity(), newMs.getCurrentMapCapacity()) - assert.Equal(t, mergedSketch.getStreamLength(), newMs.getStreamLength()) + assert.Equal(t, mergedSketch.GetMaximumMapCapacity(), newMs.GetMaximumMapCapacity()) + assert.Equal(t, mergedSketch.GetCurrentMapCapacity(), newMs.GetCurrentMapCapacity()) + assert.Equal(t, mergedSketch.GetStreamLength(), newMs.GetStreamLength()) } func TestFrequentItemsByteSerial(t *testing.T) { - sketch, err := NewLongSketchWithMaxMapSize(16) + sketch, err := NewLongsSketchWithMaxMapSize(16) assert.NoError(t, err) - byteArray0, err := sketch.toSlice() - newSk0, err := NewLongSketchFromSlice(byteArray0) + byteArray0, err := sketch.ToSlice() + newSk0, err := NewLongsSketchFromSlice(byteArray0) assert.NoError(t, err) - str0, err := sketch.serializeToString() + str0, err := sketch.ToString() assert.NoError(t, err) - newStr0, err := newSk0.serializeToString() + newStr0, err := newSk0.ToString() assert.NoError(t, err) assert.Equal(t, str0, newStr0) - sketch2, err := NewLongSketchWithMaxMapSize(128) + sketch2, err := NewLongsSketchWithMaxMapSize(128) assert.NoError(t, err) sketch.UpdateMany(10, 100) sketch.UpdateMany(10, 100) @@ -112,16 +112,16 @@ func TestFrequentItemsByteSerial(t *testing.T) { sketch.UpdateMany(1000001, 1010230) sketch.UpdateMany(1000002, 1010230) - byteArray1, err := sketch.toSlice() + byteArray1, err := sketch.ToSlice() assert.NoError(t, err) - newSk1, err := NewLongSketchFromSlice(byteArray1) + newSk1, err := NewLongsSketchFromSlice(byteArray1) assert.NoError(t, err) - str1, err := sketch.serializeToString() - newStr1, err := newSk1.serializeToString() + str1, err := sketch.ToString() + newStr1, err := newSk1.ToString() assert.NoError(t, err) assert.Equal(t, str1, newStr1) - assert.Equal(t, sketch.getMaximumMapCapacity(), newSk1.getMaximumMapCapacity()) - assert.Equal(t, sketch.getCurrentMapCapacity(), newSk1.getCurrentMapCapacity()) + assert.Equal(t, sketch.GetMaximumMapCapacity(), newSk1.GetMaximumMapCapacity()) + assert.Equal(t, sketch.GetCurrentMapCapacity(), newSk1.GetCurrentMapCapacity()) sketch2.UpdateMany(190, 12902390) sketch2.UpdateMany(191, 12902390) @@ -143,37 +143,37 @@ func TestFrequentItemsByteSerial(t *testing.T) { sketch2.UpdateMany(207, 12902390) sketch2.UpdateMany(208, 12902390) - byteArray2, err := sketch2.toSlice() + byteArray2, err := sketch2.ToSlice() assert.NoError(t, err) - newSk2, err := NewLongSketchFromSlice(byteArray2) + newSk2, err := NewLongsSketchFromSlice(byteArray2) assert.NoError(t, err) - str2, err := sketch2.serializeToString() + str2, err := sketch2.ToString() assert.NoError(t, err) - newStr2, err := newSk2.serializeToString() + newStr2, err := newSk2.ToString() assert.NoError(t, err) assert.Equal(t, str2, newStr2) - assert.Equal(t, sketch2.getMaximumMapCapacity(), newSk2.getMaximumMapCapacity()) - assert.Equal(t, sketch2.getCurrentMapCapacity(), newSk2.getCurrentMapCapacity()) - assert.Equal(t, sketch2.getStreamLength(), newSk2.getStreamLength()) + assert.Equal(t, sketch2.GetMaximumMapCapacity(), newSk2.GetMaximumMapCapacity()) + assert.Equal(t, sketch2.GetCurrentMapCapacity(), newSk2.GetCurrentMapCapacity()) + assert.Equal(t, sketch2.GetStreamLength(), newSk2.GetStreamLength()) - mergedSketch, err := sketch.merge(sketch2) + mergedSketch, err := sketch.Merge(sketch2) assert.NoError(t, err) - byteArray3, err := mergedSketch.toSlice() + byteArray3, err := mergedSketch.ToSlice() assert.NoError(t, err) - newSk3, err := NewLongSketchFromSlice(byteArray3) + newSk3, err := NewLongsSketchFromSlice(byteArray3) assert.NoError(t, err) - str3, err := mergedSketch.serializeToString() + str3, err := mergedSketch.ToString() assert.NoError(t, err) - newStr3, err := newSk3.serializeToString() + newStr3, err := newSk3.ToString() assert.NoError(t, err) assert.Equal(t, str3, newStr3) - assert.Equal(t, mergedSketch.getMaximumMapCapacity(), newSk3.getMaximumMapCapacity()) - assert.Equal(t, mergedSketch.getCurrentMapCapacity(), newSk3.getCurrentMapCapacity()) - assert.Equal(t, mergedSketch.getStreamLength(), newSk3.getStreamLength()) + assert.Equal(t, mergedSketch.GetMaximumMapCapacity(), newSk3.GetMaximumMapCapacity()) + assert.Equal(t, mergedSketch.GetCurrentMapCapacity(), newSk3.GetCurrentMapCapacity()) + assert.Equal(t, mergedSketch.GetStreamLength(), newSk3.GetStreamLength()) } func TestFrequentItemsByteResetAndEmptySerial(t *testing.T) { - sketch, err := NewLongSketchWithMaxMapSize(16) + sketch, err := NewLongsSketchWithMaxMapSize(16) assert.NoError(t, err) sketch.UpdateMany(10, 100) sketch.UpdateMany(10, 100) @@ -182,22 +182,22 @@ func TestFrequentItemsByteResetAndEmptySerial(t *testing.T) { sketch.UpdateMany(1000002, 1010230) sketch.Reset() - byteArray0, err := sketch.toSlice() + byteArray0, err := sketch.ToSlice() assert.NoError(t, err) - newSk0, err := NewLongSketchFromSlice(byteArray0) + newSk0, err := NewLongsSketchFromSlice(byteArray0) assert.NoError(t, err) - str0, err := sketch.serializeToString() + str0, err := sketch.ToString() assert.NoError(t, err) - newStr0, err := newSk0.serializeToString() + newStr0, err := newSk0.ToString() assert.NoError(t, err) assert.Equal(t, str0, newStr0) - assert.Equal(t, sketch.getMaximumMapCapacity(), newSk0.getMaximumMapCapacity()) - assert.Equal(t, sketch.getCurrentMapCapacity(), newSk0.getCurrentMapCapacity()) + assert.Equal(t, sketch.GetMaximumMapCapacity(), newSk0.GetMaximumMapCapacity()) + assert.Equal(t, sketch.GetCurrentMapCapacity(), newSk0.GetCurrentMapCapacity()) } func TestFreqLongSliceSerDe(t *testing.T) { minSize := 1 << _LG_MIN_MAP_SIZE - sk1, err := NewLongSketchWithMaxMapSize(minSize) + sk1, err := NewLongsSketchWithMaxMapSize(minSize) assert.NoError(t, err) sk1.UpdateMany(10, 100) sk1.UpdateMany(10, 100) @@ -205,9 +205,9 @@ func TestFreqLongSliceSerDe(t *testing.T) { sk1.UpdateMany(1000001, 1010230) sk1.UpdateMany(1000002, 1010230) - byteArray0, err := sk1.toSlice() + byteArray0, err := sk1.ToSlice() assert.NoError(t, err) - sk2, err := NewLongSketchFromSlice(byteArray0) + sk2, err := NewLongsSketchFromSlice(byteArray0) assert.NoError(t, err) checkEquality(t, sk1, sk2) @@ -215,7 +215,7 @@ func TestFreqLongSliceSerDe(t *testing.T) { func TestFreqLongStringSerDe(t *testing.T) { minSize := 1 << _LG_MIN_MAP_SIZE - sk1, err := NewLongSketchWithMaxMapSize(minSize) + sk1, err := NewLongsSketchWithMaxMapSize(minSize) assert.NoError(t, err) sk1.UpdateMany(10, 100) sk1.UpdateMany(10, 100) @@ -223,29 +223,29 @@ func TestFreqLongStringSerDe(t *testing.T) { sk1.UpdateMany(1000001, 1010230) sk1.UpdateMany(1000002, 1010230) - str1, err := sk1.serializeToString() + str1, err := sk1.ToString() assert.NoError(t, err) - sk2, err := NewLongSketchFromString(str1) + sk2, err := NewLongsSketchFromString(str1) assert.NoError(t, err) checkEquality(t, sk1, sk2) } func checkEquality(t *testing.T, sk1, sk2 *LongsSketch) { - assert.Equal(t, sk1.getNumActiveItems(), sk2.getNumActiveItems()) - assert.Equal(t, sk1.getCurrentMapCapacity(), sk2.getCurrentMapCapacity()) - assert.Equal(t, sk1.getMaximumError(), sk2.getMaximumError()) - assert.Equal(t, sk1.getMaximumMapCapacity(), sk2.getMaximumMapCapacity()) - assert.Equal(t, sk1.getStorageBytes(), sk2.getStorageBytes()) - assert.Equal(t, sk1.getStreamLength(), sk2.getStreamLength()) - assert.Equal(t, sk1.isEmpty(), sk2.isEmpty()) + assert.Equal(t, sk1.GetNumActiveItems(), sk2.GetNumActiveItems()) + assert.Equal(t, sk1.GetCurrentMapCapacity(), sk2.GetCurrentMapCapacity()) + assert.Equal(t, sk1.GetMaximumError(), sk2.GetMaximumError()) + assert.Equal(t, sk1.GetMaximumMapCapacity(), sk2.GetMaximumMapCapacity()) + assert.Equal(t, sk1.GetStorageBytes(), sk2.GetStorageBytes()) + assert.Equal(t, sk1.GetStreamLength(), sk2.GetStreamLength()) + assert.Equal(t, sk1.IsEmpty(), sk2.IsEmpty()) NFN := ErrorTypeEnum.NO_FALSE_NEGATIVES NFP := ErrorTypeEnum.NO_FALSE_POSITIVES - rowArr1, err := sk1.getFrequentItems(NFN) + rowArr1, err := sk1.GetFrequentItems(NFN) assert.NoError(t, err) - rowArr2, err := sk2.getFrequentItems(NFN) + rowArr2, err := sk2.GetFrequentItems(NFN) assert.NoError(t, err) assert.Equal(t, len(rowArr1), len(rowArr2)) for i := 0; i < len(rowArr1); i++ { @@ -254,9 +254,9 @@ func checkEquality(t *testing.T, sk1, sk2 *LongsSketch) { assert.Equal(t, s1, s2) } - rowArr1, err = sk1.getFrequentItems(NFP) + rowArr1, err = sk1.GetFrequentItems(NFP) assert.NoError(t, err) - rowArr2, err = sk2.getFrequentItems(NFP) + rowArr2, err = sk2.GetFrequentItems(NFP) assert.NoError(t, err) assert.Equal(t, len(rowArr1), len(rowArr2)) for i := 0; i < len(rowArr1); i++ { @@ -268,11 +268,11 @@ func checkEquality(t *testing.T, sk1, sk2 *LongsSketch) { func TestFreqLongSliceSerDeError(t *testing.T) { minSize := 1 << _LG_MIN_MAP_SIZE - sk1, err := NewLongSketchWithMaxMapSize(minSize) + sk1, err := NewLongsSketchWithMaxMapSize(minSize) assert.NoError(t, err) sk1.Update(1) - byteArray0, err := sk1.toSlice() + byteArray0, err := sk1.ToSlice() pre0 := binary.LittleEndian.Uint64(byteArray0) tryBadMem(t, byteArray0, _PREAMBLE_LONGS_BYTE, 2) //Corrupt @@ -291,13 +291,13 @@ func TestFreqLongSliceSerDeError(t *testing.T) { func tryBadMem(t *testing.T, mem []byte, byteOffset, byteValue int) { binary.LittleEndian.PutUint64(mem[byteOffset:], uint64(byteValue)) - _, err := NewLongSketchFromSlice(mem) + _, err := NewLongsSketchFromSlice(mem) assert.Error(t, err) } func TestFreqLongStringSerDeError(t *testing.T) { - // sk1, err := NewLongSketchWithMaxMapSize(8) - // str1 := sk1.serializeToString() + // sk1, err := NewLongsSketchWithMaxMapSize(8) + // str1 := sk1.ToString() // correct = "1,10,2,4,0,0,0,4,"; tryBadString(t, "2,10,2,4,0,0,0,4,") //bad SerVer of 2 @@ -306,7 +306,7 @@ func TestFreqLongStringSerDeError(t *testing.T) { } func tryBadString(t *testing.T, badString string) { - _, err := NewLongSketchFromString(badString) + _, err := NewLongsSketchFromString(badString) assert.Error(t, err) } @@ -329,27 +329,27 @@ func TestFreqLongs(t *testing.T) { } for h := 0; h < numSketches; h++ { - threshold := sketches[h].getMaximumError() - rows, err := sketches[h].getFrequentItems(ErrorTypeEnum.NO_FALSE_NEGATIVES) + threshold := sketches[h].GetMaximumError() + rows, err := sketches[h].GetFrequentItems(ErrorTypeEnum.NO_FALSE_NEGATIVES) assert.NoError(t, err) for i := 0; i < len(rows); i++ { - assert.True(t, rows[i].getUpperBound() > threshold) + assert.True(t, rows[i].GetUpperBound() > threshold) } - rows, err = sketches[h].getFrequentItems(ErrorTypeEnum.NO_FALSE_POSITIVES) + rows, err = sketches[h].GetFrequentItems(ErrorTypeEnum.NO_FALSE_POSITIVES) assert.NoError(t, err) assert.Equal(t, len(rows), 0) for i := 0; i < len(rows); i++ { - assert.True(t, rows[i].getLowerBound() > threshold) + assert.True(t, rows[i].GetLowerBound() > threshold) } - rows, err = sketches[h].getFrequentItems(ErrorTypeEnum.NO_FALSE_NEGATIVES) + rows, err = sketches[h].GetFrequentItems(ErrorTypeEnum.NO_FALSE_NEGATIVES) } } func newFrequencySketch(eps float64) (*LongsSketch, error) { maxMapSize := common.CeilPowerOf2(int(1.0 / (eps * reversePurgeLongHashMapLoadFactor))) - return NewLongSketchWithMaxMapSize(maxMapSize) + return NewLongsSketchWithMaxMapSize(maxMapSize) } func TestUpdateOneTime(t *testing.T) { @@ -359,35 +359,35 @@ func TestUpdateOneTime(t *testing.T) { numSketches := 1 for h := 0; h < numSketches; h++ { sketch, _ := newFrequencySketch(errorTolerance) - ub, err := sketch.getUpperBound(13) + ub, err := sketch.GetUpperBound(13) assert.NoError(t, err) assert.Equal(t, ub, int64(0)) - lb, err := sketch.getLowerBound(13) + lb, err := sketch.GetLowerBound(13) assert.NoError(t, err) assert.Equal(t, lb, int64(0)) - assert.Equal(t, sketch.getMaximumError(), int64(0)) - est, err := sketch.getEstimate(13) + assert.Equal(t, sketch.GetMaximumError(), int64(0)) + est, err := sketch.GetEstimate(13) assert.NoError(t, err) assert.Equal(t, est, int64(0)) sketch.Update(13) - // assert.Equal(t, sketch.getEstimate(13), 1) + // assert.Equal(t, sketch.GetEstimate(13), 1) } } func TestGetInstanceSlice(t *testing.T) { sl := make([]byte, 4) - _, err := NewLongSketchFromSlice(sl) + _, err := NewLongsSketchFromSlice(sl) assert.Error(t, err) } func TestGetInstanceString(t *testing.T) { - _, err := NewLongSketchFromString("") + _, err := NewLongsSketchFromString("") assert.Error(t, err) } func TestUpdateNegative(t *testing.T) { minSize := 1 << _LG_MIN_MAP_SIZE - fls, err := NewLongSketchWithMaxMapSize(minSize) + fls, err := NewLongsSketchWithMaxMapSize(minSize) assert.NoError(t, err) err = fls.UpdateMany(1, 0) assert.NoError(t, err) @@ -397,10 +397,10 @@ func TestUpdateNegative(t *testing.T) { func TestGetFrequentItems1(t *testing.T) { minSize := 1 << _LG_MIN_MAP_SIZE - fls, err := NewLongSketchWithMaxMapSize(minSize) + fls, err := NewLongsSketchWithMaxMapSize(minSize) assert.NoError(t, err) fls.Update(1) - rowArr, err := fls.getFrequentItems(ErrorTypeEnum.NO_FALSE_POSITIVES) + rowArr, err := fls.GetFrequentItems(ErrorTypeEnum.NO_FALSE_POSITIVES) assert.NoError(t, err) assert.NotEmpty(t, rowArr) row := rowArr[0] @@ -417,44 +417,44 @@ func TestGetFrequentItems1(t *testing.T) { func TestGetStorageByes(t *testing.T) { minSize := 1 << _LG_MIN_MAP_SIZE - fls, err := NewLongSketchWithMaxMapSize(minSize) + fls, err := NewLongsSketchWithMaxMapSize(minSize) assert.NoError(t, err) - sl, err := fls.toSlice() + sl, err := fls.ToSlice() assert.NoError(t, err) - assert.Equal(t, len(sl), fls.getStorageBytes()) + assert.Equal(t, len(sl), fls.GetStorageBytes()) err = fls.Update(1) assert.NoError(t, err) - sl, err = fls.toSlice() + sl, err = fls.ToSlice() assert.NoError(t, err) - assert.Equal(t, len(sl), fls.getStorageBytes()) + assert.Equal(t, len(sl), fls.GetStorageBytes()) } func TestDeSerFromString(t *testing.T) { minSize := 1 << _LG_MIN_MAP_SIZE - fls, err := NewLongSketchWithMaxMapSize(minSize) + fls, err := NewLongsSketchWithMaxMapSize(minSize) assert.NoError(t, err) - str, err := fls.serializeToString() + str, err := fls.ToString() fmt.Println(str) assert.NoError(t, err) fls.Update(1) - str, err = fls.serializeToString() + str, err = fls.ToString() assert.NoError(t, err) fmt.Println(str) } func TestMerge(t *testing.T) { minSize := 1 << _LG_MIN_MAP_SIZE - fls1, err := NewLongSketchWithMaxMapSize(minSize) + fls1, err := NewLongsSketchWithMaxMapSize(minSize) assert.NoError(t, err) var fls2 *LongsSketch fls2 = nil - fle, err := fls1.merge(fls2) + fle, err := fls1.Merge(fls2) assert.NoError(t, err) - assert.True(t, fle.isEmpty()) + assert.True(t, fle.IsEmpty()) - fls2, err = NewLongSketchWithMaxMapSize(minSize) + fls2, err = NewLongsSketchWithMaxMapSize(minSize) assert.NoError(t, err) - fle, err = fls1.merge(fls2) + fle, err = fls1.Merge(fls2) assert.NoError(t, err) } @@ -480,11 +480,11 @@ func TestSortItems(t *testing.T) { } for h := 0; h < numSketches; h++ { - threshold := sketches[h].getMaximumError() - rows, err := sketches[h].getFrequentItems(ErrorTypeEnum.NO_FALSE_NEGATIVES) + threshold := sketches[h].GetMaximumError() + rows, err := sketches[h].GetFrequentItems(ErrorTypeEnum.NO_FALSE_NEGATIVES) assert.NoError(t, err) for i := 0; i < len(rows); i++ { - assert.True(t, rows[i].getUpperBound() > threshold) + assert.True(t, rows[i].GetUpperBound() > threshold) } first := rows[0] anItem := first.item @@ -514,7 +514,7 @@ func TestToString1(t *testing.T) { func printSketch(t *testing.T, size int, items []int64) { var sb strings.Builder - fls, err := NewLongSketchWithMaxMapSize(size) + fls, err := NewLongsSketchWithMaxMapSize(size) assert.NoError(t, err) for i := 0; i < len(items); i++ { fls.UpdateMany(int64(i+1), items[i]) @@ -529,7 +529,7 @@ func printSketch(t *testing.T, size int, items []int64) { } func printRows(t *testing.T, fls *LongsSketch, errorType ErrorType) { - rows, err := fls.getFrequentItems(errorType) + rows, err := fls.GetFrequentItems(errorType) assert.NoError(t, err) fmt.Println(errorType.Name) fmt.Printf(" %20s%20s%20s %s", "Est", "UB", "LB", "Item") @@ -544,3 +544,52 @@ func printRows(t *testing.T, fls *LongsSketch, errorType ErrorType) { assert.NotEqual(t, rows[0], nullRow) } } + +func TestStringDeserEmptyNotCorrupt(t *testing.T) { + size := 1 << _LG_MIN_MAP_SIZE + thresh := (size * 3) / 4 + format := "%6d%10s%s" + fls, err := NewLongsSketchWithMaxMapSize(size) + assert.NoError(t, err) + fmt.Printf("Sketch Size: %d\n", size) + for i := 0; i <= thresh; i++ { + err := fls.UpdateMany(int64(i+1), 1) + assert.NoError(t, err) + s, err := fls.ToString() + assert.NoError(t, err) + fmt.Printf("SER "+format+"\n", (i + 1), fmt.Sprintf("%t : ", fls.IsEmpty()), s) + fls2, err := NewLongsSketchFromString(s) + assert.NoError(t, err) + fmt.Printf("DESER "+format+"\n", (i + 1), fmt.Sprintf("%t : ", fls2.IsEmpty()), s) + } +} + +func TestStringDeserEmptyCorrupt(t *testing.T) { + var s strings.Builder + s.WriteString("1,") //serVer + s.WriteString("10,") //FamID + s.WriteString("3,") //lgMaxMapSz + s.WriteString("0,") //Empty Flag = false ... corrupted, should be true + s.WriteString("7,") //stream Len so far + s.WriteString("1,") //error offset + s.WriteString("0,") //numActive ...conflict with empty + s.WriteString("8,") //curMapLen + _, err := NewLongsSketchFromString(s.String()) + assert.Error(t, err) +} + +func TestGetEpsilon(t *testing.T) { + eps, err := GetEpsilonLongsSketch(1024) + assert.NoError(t, err) + assert.Equal(t, eps, 3.5/1024) + + _, err = GetEpsilonLongsSketch(1000) + assert.Error(t, err) +} + +func TestGetAprioriError(t *testing.T) { + eps := 3.5 / 1024 + apr, err := GetAprioriErrorLongsSketch(1024, 10_000) + assert.NoError(t, err) + assert.Equal(t, apr, eps*10_000) +} diff --git a/frequencies/row.go b/frequencies/row.go index ad105ef..696193d 100644 --- a/frequencies/row.go +++ b/frequencies/row.go @@ -42,15 +42,15 @@ func (r *Row) String() string { return fmt.Sprintf(" %20d%20d%20d %d", r.est, r.ub, r.lb, r.item) } -func (r *Row) getEstimate() int64 { +func (r *Row) GetEstimate() int64 { return r.est } -func (r *Row) getUpperBound() int64 { +func (r *Row) GetUpperBound() int64 { return r.ub } -func (r *Row) getLowerBound() int64 { +func (r *Row) GetLowerBound() int64 { return r.lb } @@ -59,15 +59,15 @@ func sortItems(sk *LongsSketch, threshold int64, errorType ErrorType) ([]*Row, e iter := sk.hashMap.iterator() if errorType == ErrorTypeEnum.NO_FALSE_NEGATIVES { for iter.next() { - est, err := sk.getEstimate(iter.getKey()) + est, err := sk.GetEstimate(iter.getKey()) if err != nil { return nil, err } - ub, err := sk.getUpperBound(iter.getKey()) + ub, err := sk.GetUpperBound(iter.getKey()) if err != nil { return nil, err } - lb, err := sk.getLowerBound(iter.getKey()) + lb, err := sk.GetLowerBound(iter.getKey()) if err != nil { return nil, err } @@ -78,15 +78,15 @@ func sortItems(sk *LongsSketch, threshold int64, errorType ErrorType) ([]*Row, e } } else { //NO_FALSE_POSITIVES for iter.next() { - est, err := sk.getEstimate(iter.getKey()) + est, err := sk.GetEstimate(iter.getKey()) if err != nil { return nil, err } - ub, err := sk.getUpperBound(iter.getKey()) + ub, err := sk.GetUpperBound(iter.getKey()) if err != nil { return nil, err } - lb, err := sk.getLowerBound(iter.getKey()) + lb, err := sk.GetLowerBound(iter.getKey()) if err != nil { return nil, err } diff --git a/frequencies/utils.go b/frequencies/utils.go index 6af4804..2778032 100644 --- a/frequencies/utils.go +++ b/frequencies/utils.go @@ -17,6 +17,11 @@ package frequencies +import ( + "math" + "math/rand" +) + const ( _LG_MIN_MAP_SIZE = 3 // This constant is large enough so that computing the median of SAMPLE_SIZE @@ -26,6 +31,27 @@ const ( _SAMPLE_SIZE = 1024 ) +type ErrorType struct { + id int + Name string +} + +type ErrorTypes struct { + NO_FALSE_POSITIVES ErrorType + NO_FALSE_NEGATIVES ErrorType +} + +var ErrorTypeEnum = &ErrorTypes{ + NO_FALSE_POSITIVES: ErrorType{ + id: 1, + Name: "NO_FALSE_POSITIVES", + }, + NO_FALSE_NEGATIVES: ErrorType{ + id: 2, + Name: "NO_FALSE_NEGATIVES", + }, +} + // hash returns an index into the hash table. // This hash function is taken from the internals of Austin Appleby's MurmurHash3 algorithm. // It is also used by the Trove for Java libraries. @@ -38,3 +64,10 @@ func hash(okey int64) int64 { key ^= key >> 33 return int64(key) } + +func randomGeometricDist(prob float64) int64 { + if prob <= 0.0 || prob >= 1.0 { + panic("prob must be in (0, 1)") + } + return int64(1 + math.Log(rand.Float64())/math.Log(1.0-prob)) +} diff --git a/hll/hll_sketch_serialization_test.go b/hll/hll_sketch_serialization_test.go index 2341af6..94a5ff1 100644 --- a/hll/hll_sketch_serialization_test.go +++ b/hll/hll_sketch_serialization_test.go @@ -19,23 +19,17 @@ package hll import ( "fmt" + "github.com/apache/datasketches-go/common/testutils" "os" "testing" "github.com/stretchr/testify/assert" ) -const ( - dSketchTestGenerateGo = "DSKETCH_TEST_GENERATE_GO" - dSketchTestCrossJava = "DSKETCH_TEST_CROSS_JAVA" - dSketchTestCrossCpp = "DSKETCH_TEST_CROSS_CPP" - dSketchTestCrossGo = "DSKETCH_TEST_CROSS_GO" -) - // Run me manually for generation func TestGenerateGoFiles(t *testing.T) { - if len(os.Getenv(dSketchTestGenerateGo)) == 0 { - t.Skipf("%s not set", dSketchTestGenerateGo) + if len(os.Getenv(testutils.DSketchTestGenerateGo)) == 0 { + t.Skipf("%s not set", testutils.DSketchTestGenerateGo) } nArr := []int{0, 1, 10, 100, 1000, 10000, 100000, 1000000} @@ -52,35 +46,35 @@ func TestGenerateGoFiles(t *testing.T) { assert.NoError(t, hll6.UpdateUInt64(uint64(i))) assert.NoError(t, hll8.UpdateUInt64(uint64(i))) } - err = os.MkdirAll(goPath, os.ModePerm) + err = os.MkdirAll(testutils.GoPath, os.ModePerm) assert.NoError(t, err) sl4, err := hll4.ToCompactSlice() assert.NoError(t, err) - err = os.WriteFile(fmt.Sprintf("%s/hll4_n%d_go.sk", goPath, n), sl4, 0644) + err = os.WriteFile(fmt.Sprintf("%s/hll4_n%d_go.sk", testutils.GoPath, n), sl4, 0644) assert.NoError(t, err) sl6, err := hll6.ToCompactSlice() assert.NoError(t, err) - err = os.WriteFile(fmt.Sprintf("%s/hll6_n%d_go.sk", goPath, n), sl6, 0644) + err = os.WriteFile(fmt.Sprintf("%s/hll6_n%d_go.sk", testutils.GoPath, n), sl6, 0644) assert.NoError(t, err) sl8, err := hll8.ToCompactSlice() assert.NoError(t, err) - err = os.WriteFile(fmt.Sprintf("%s/hll8_n%d_go.sk", goPath, n), sl8, 0644) + err = os.WriteFile(fmt.Sprintf("%s/hll8_n%d_go.sk", testutils.GoPath, n), sl8, 0644) assert.NoError(t, err) } } func TestJavaCompat(t *testing.T) { - if len(os.Getenv(dSketchTestCrossJava)) == 0 { - t.Skipf("%s not set", dSketchTestCrossJava) + if len(os.Getenv(testutils.DSketchTestCrossJava)) == 0 { + t.Skipf("%s not set", testutils.DSketchTestCrossJava) } t.Run("Java Hll4", func(t *testing.T) { nArr := []int{0, 1, 10, 100, 1000, 10000, 100000, 1000000} for _, n := range nArr { - bytes, err := os.ReadFile(fmt.Sprintf("%s/hll4_n%d_java.sk", javaPath, n)) + bytes, err := os.ReadFile(fmt.Sprintf("%s/hll4_n%d_java.sk", testutils.JavaPath, n)) assert.NoError(t, err) sketch, err := DeserializeHllSketch(bytes, true) if err != nil { @@ -97,7 +91,7 @@ func TestJavaCompat(t *testing.T) { t.Run("Java Hll6", func(t *testing.T) { nArr := []int{0, 1, 10, 100, 1000, 10000, 100000, 1000000} for _, n := range nArr { - bytes, err := os.ReadFile(fmt.Sprintf("%s/hll6_n%d_java.sk", javaPath, n)) + bytes, err := os.ReadFile(fmt.Sprintf("%s/hll6_n%d_java.sk", testutils.JavaPath, n)) assert.NoError(t, err) sketch, err := DeserializeHllSketch(bytes, true) @@ -115,7 +109,7 @@ func TestJavaCompat(t *testing.T) { t.Run("Java Hll8", func(t *testing.T) { nArr := []int{0, 1, 10, 100, 1000, 10000, 100000, 1000000} for _, n := range nArr { - bytes, err := os.ReadFile(fmt.Sprintf("%s/hll8_n%d_java.sk", javaPath, n)) + bytes, err := os.ReadFile(fmt.Sprintf("%s/hll8_n%d_java.sk", testutils.JavaPath, n)) assert.NoError(t, err) sketch, err := DeserializeHllSketch(bytes, true) if err != nil { @@ -131,14 +125,14 @@ func TestJavaCompat(t *testing.T) { } func TestCppCompat(t *testing.T) { - if len(os.Getenv(dSketchTestCrossCpp)) == 0 { - t.Skipf("%s not set", dSketchTestCrossCpp) + if len(os.Getenv(testutils.DSketchTestCrossCpp)) == 0 { + t.Skipf("%s not set", testutils.DSketchTestCrossCpp) } t.Run("Cpp Hll4", func(t *testing.T) { nArr := []int{0, 1, 10, 100, 1000, 10000, 100000, 1000000} for _, n := range nArr { - bytes, err := os.ReadFile(fmt.Sprintf("%s/hll4_n%d_cpp.sk", cppPath, n)) + bytes, err := os.ReadFile(fmt.Sprintf("%s/hll4_n%d_cpp.sk", testutils.CppPath, n)) assert.NoError(t, err) sketch, err := DeserializeHllSketch(bytes, true) if err != nil { @@ -155,7 +149,7 @@ func TestCppCompat(t *testing.T) { t.Run("Cpp Hll6", func(t *testing.T) { nArr := []int{0, 1, 10, 100, 1000, 10000, 100000, 1000000} for _, n := range nArr { - bytes, err := os.ReadFile(fmt.Sprintf("%s/hll6_n%d_cpp.sk", cppPath, n)) + bytes, err := os.ReadFile(fmt.Sprintf("%s/hll6_n%d_cpp.sk", testutils.CppPath, n)) assert.NoError(t, err) sketch, err := DeserializeHllSketch(bytes, true) @@ -173,7 +167,7 @@ func TestCppCompat(t *testing.T) { t.Run("Cpp Hll8", func(t *testing.T) { nArr := []int{0, 1, 10, 100, 1000, 10000, 100000, 1000000} for _, n := range nArr { - bytes, err := os.ReadFile(fmt.Sprintf("%s/hll8_n%d_cpp.sk", cppPath, n)) + bytes, err := os.ReadFile(fmt.Sprintf("%s/hll8_n%d_cpp.sk", testutils.CppPath, n)) assert.NoError(t, err) sketch, err := DeserializeHllSketch(bytes, true) if err != nil { @@ -189,14 +183,14 @@ func TestCppCompat(t *testing.T) { } func TestGoCompat(t *testing.T) { - if len(os.Getenv(dSketchTestCrossGo)) == 0 { - t.Skipf("%s not set", dSketchTestCrossGo) + if len(os.Getenv(testutils.DSketchTestCrossGo)) == 0 { + t.Skipf("%s not set", testutils.DSketchTestCrossGo) } t.Run("Go Hll4", func(t *testing.T) { nArr := []int{0, 1, 10, 100, 1000, 10000, 100000, 1000000} for _, n := range nArr { - bytes, err := os.ReadFile(fmt.Sprintf("%s/hll4_n%d_go.sk", goPath, n)) + bytes, err := os.ReadFile(fmt.Sprintf("%s/hll4_n%d_go.sk", testutils.GoPath, n)) assert.NoError(t, err) sketch, err := DeserializeHllSketch(bytes, true) @@ -214,7 +208,7 @@ func TestGoCompat(t *testing.T) { t.Run("Go Hll6", func(t *testing.T) { nArr := []int{0, 1, 10, 100, 1000, 10000, 100000, 1000000} for _, n := range nArr { - bytes, err := os.ReadFile(fmt.Sprintf("%s/hll6_n%d_go.sk", goPath, n)) + bytes, err := os.ReadFile(fmt.Sprintf("%s/hll6_n%d_go.sk", testutils.GoPath, n)) assert.NoError(t, err) sketch, err := DeserializeHllSketch(bytes, true) @@ -232,7 +226,7 @@ func TestGoCompat(t *testing.T) { t.Run("Go Hll8", func(t *testing.T) { nArr := []int{0, 1, 10, 100, 1000, 10000, 100000, 1000000} for _, n := range nArr { - bytes, err := os.ReadFile(fmt.Sprintf("%s/hll8_n%d_go.sk", goPath, n)) + bytes, err := os.ReadFile(fmt.Sprintf("%s/hll8_n%d_go.sk", testutils.GoPath, n)) assert.NoError(t, err) sketch, err := DeserializeHllSketch(bytes, true) diff --git a/hll/hll_sketch_test.go b/hll/hll_sketch_test.go index bb9fd02..82f7313 100644 --- a/hll/hll_sketch_test.go +++ b/hll/hll_sketch_test.go @@ -26,12 +26,6 @@ import ( "github.com/stretchr/testify/assert" ) -const ( - javaPath = "../serialization_test_data/java_generated_files" - cppPath = "../serialization_test_data/cpp_generated_files" - goPath = "../serialization_test_data/go_generated_files" -) - func TestMisc(t *testing.T) { hll, err := NewHllSketch(10, TgtHllTypeHll4) assert.NoError(t, err) --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
