This is an automated email from the ASF dual-hosted git repository. alsay pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/datasketches-go.git
commit 7d8bc368fc41d9d9c730c132d5ccef0321012098 Author: Pierre Lacave <[email protected]> AuthorDate: Thu Dec 21 11:45:51 2023 +0100 Add more test and fix sort order --- frequencies/error_types.go | 24 ++++- frequencies/longs_sketch.go | 12 +++ frequencies/longs_sketch_test.go | 162 ++++++++++++++++++++++++++++- frequencies/reverse_purge_long_hash_map.go | 13 +++ frequencies/row.go | 18 ++-- 5 files changed, 208 insertions(+), 21 deletions(-) diff --git a/frequencies/error_types.go b/frequencies/error_types.go index b649d01..730f331 100644 --- a/frequencies/error_types.go +++ b/frequencies/error_types.go @@ -17,9 +17,23 @@ package frequencies -type ErrorType = int +type ErrorType struct { + id int + Name string +} -const ( - NO_FALSE_POSITIVES = ErrorType(1) - NO_FALSE_NEGATIVES = ErrorType(2) -) +type ErrorTypes struct { + NO_FALSE_POSITIVES ErrorType + NO_FALSE_NEGATIVES ErrorType +} + +var ErrorTypeEnum = &ErrorTypes{ + NO_FALSE_POSITIVES: ErrorType{ + id: 1, + Name: "NO_FALSE_POSITIVES", + }, + NO_FALSE_NEGATIVES: ErrorType{ + id: 2, + Name: "NO_FALSE_NEGATIVES", + }, +} diff --git a/frequencies/longs_sketch.go b/frequencies/longs_sketch.go index ecb51a2..6574c33 100644 --- a/frequencies/longs_sketch.go +++ b/frequencies/longs_sketch.go @@ -338,6 +338,18 @@ func (s *LongsSketch) UpdateMany(item int64, count int64) error { return nil } +func (s *LongsSketch) String() string { + var sb strings.Builder + sb.WriteString("FrequentLongsSketch:") + sb.WriteString("\n") + sb.WriteString(" Stream Length : " + strconv.FormatInt(s.streamWeight, 10)) + sb.WriteString("\n") + sb.WriteString(" Max Error Offset : " + strconv.FormatInt(s.offset, 10)) + sb.WriteString("\n") + sb.WriteString(s.hashMap.String()) + return sb.String() +} + func (s *LongsSketch) merge(other *LongsSketch) (*LongsSketch, error) { if other == nil || other.isEmpty() { return s, nil diff --git a/frequencies/longs_sketch_test.go b/frequencies/longs_sketch_test.go index f0ec490..20c9898 100644 --- a/frequencies/longs_sketch_test.go +++ b/frequencies/longs_sketch_test.go @@ -19,8 +19,10 @@ package frequencies import ( "encoding/binary" + "fmt" "github.com/apache/datasketches-go/common" "github.com/stretchr/testify/assert" + "strings" "testing" ) @@ -238,8 +240,8 @@ func checkEquality(t *testing.T, sk1, sk2 *LongsSketch) { assert.Equal(t, sk1.getStreamLength(), sk2.getStreamLength()) assert.Equal(t, sk1.isEmpty(), sk2.isEmpty()) - NFN := NO_FALSE_NEGATIVES - NFP := NO_FALSE_POSITIVES + NFN := ErrorTypeEnum.NO_FALSE_NEGATIVES + NFP := ErrorTypeEnum.NO_FALSE_POSITIVES rowArr1, err := sk1.getFrequentItems(NFN) assert.NoError(t, err) @@ -328,20 +330,20 @@ func TestFreqLongs(t *testing.T) { for h := 0; h < numSketches; h++ { threshold := sketches[h].getMaximumError() - rows, err := sketches[h].getFrequentItems(NO_FALSE_NEGATIVES) + rows, err := sketches[h].getFrequentItems(ErrorTypeEnum.NO_FALSE_NEGATIVES) assert.NoError(t, err) for i := 0; i < len(rows); i++ { assert.True(t, rows[i].getUpperBound() > threshold) } - rows, err = sketches[h].getFrequentItems(NO_FALSE_POSITIVES) + rows, err = sketches[h].getFrequentItems(ErrorTypeEnum.NO_FALSE_POSITIVES) assert.NoError(t, err) assert.Equal(t, len(rows), 0) for i := 0; i < len(rows); i++ { assert.True(t, rows[i].getLowerBound() > threshold) } - rows, err = sketches[h].getFrequentItems(NO_FALSE_NEGATIVES) + rows, err = sketches[h].getFrequentItems(ErrorTypeEnum.NO_FALSE_NEGATIVES) } } @@ -392,3 +394,153 @@ func TestUpdateNegative(t *testing.T) { err = fls.UpdateMany(1, -1) assert.Error(t, err) } + +func TestGetFrequentItems1(t *testing.T) { + minSize := 1 << _LG_MIN_MAP_SIZE + fls, err := NewLongSketchWithMaxMapSize(minSize) + assert.NoError(t, err) + fls.Update(1) + rowArr, err := fls.getFrequentItems(ErrorTypeEnum.NO_FALSE_POSITIVES) + assert.NoError(t, err) + assert.NotEmpty(t, rowArr) + row := rowArr[0] + assert.Equal(t, row.est, int64(1)) + assert.Equal(t, row.item, int64(1)) + assert.Equal(t, row.lb, int64(1)) + assert.Equal(t, row.ub, int64(1)) + newRow := NewRow(row.item, row.est+1, row.ub, row.lb) + assert.NotEqual(t, row, newRow) + newRow = NewRow(row.item, row.est, row.ub, row.lb) + assert.Equal(t, row, newRow) + +} + +func TestGetStorageByes(t *testing.T) { + minSize := 1 << _LG_MIN_MAP_SIZE + fls, err := NewLongSketchWithMaxMapSize(minSize) + assert.NoError(t, err) + sl, err := fls.toSlice() + assert.NoError(t, err) + assert.Equal(t, len(sl), fls.getStorageBytes()) + err = fls.Update(1) + assert.NoError(t, err) + sl, err = fls.toSlice() + assert.NoError(t, err) + assert.Equal(t, len(sl), fls.getStorageBytes()) +} + +func TestDeSerFromString(t *testing.T) { + minSize := 1 << _LG_MIN_MAP_SIZE + fls, err := NewLongSketchWithMaxMapSize(minSize) + assert.NoError(t, err) + str, err := fls.serializeToString() + fmt.Println(str) + assert.NoError(t, err) + fls.Update(1) + str, err = fls.serializeToString() + assert.NoError(t, err) + fmt.Println(str) +} + +func TestMerge(t *testing.T) { + minSize := 1 << _LG_MIN_MAP_SIZE + fls1, err := NewLongSketchWithMaxMapSize(minSize) + assert.NoError(t, err) + var fls2 *LongsSketch + fls2 = nil + fle, err := fls1.merge(fls2) + assert.NoError(t, err) + assert.True(t, fle.isEmpty()) + + fls2, err = NewLongSketchWithMaxMapSize(minSize) + assert.NoError(t, err) + fle, err = fls1.merge(fls2) + assert.NoError(t, err) +} + +func TestSortItems(t *testing.T) { + numSketches := 1 + n := 2222 + errorTolerance := 1.0 / 100 + sketchSize := common.CeilPowerOf2(int(1.0 / (errorTolerance * reversePurgeLongHashMapLoadFactor))) + fmt.Printf("sketchSize: %d\n", sketchSize) + + sketches := make([]*LongsSketch, numSketches) + for h := 0; h < numSketches; h++ { + sketches[h], _ = newFrequencySketch(float64(sketchSize)) + } + + prob := .001 + for i := 0; i < n; i++ { + item := randomGeometricDist(prob) + 1 + for h := 0; h < numSketches; h++ { + err := sketches[h].Update(item) + assert.NoError(t, err) + } + } + + for h := 0; h < numSketches; h++ { + threshold := sketches[h].getMaximumError() + rows, err := sketches[h].getFrequentItems(ErrorTypeEnum.NO_FALSE_NEGATIVES) + assert.NoError(t, err) + for i := 0; i < len(rows); i++ { + assert.True(t, rows[i].getUpperBound() > threshold) + } + first := rows[0] + anItem := first.item + anEst := first.est + aLB := first.lb + s := first.String() + fmt.Println(s) + assert.True(t, anEst >= 0) + assert.True(t, aLB >= 0) + assert.Equal(t, anItem, anItem) //dummy test + + } +} + +func TestGetAndCheckPreLongs(t *testing.T) { + byteArr := make([]byte, 8) + byteArr[0] = 2 + _, err := checkPreambleSize(byteArr) + assert.Error(t, err) +} + +func TestToString1(t *testing.T) { + size := 1 << _LG_MIN_MAP_SIZE + printSketch(t, size, []int64{1, 1, 1, 1, 1, 1, 1, 2, 3, 4, 5}) + printSketch(t, size, []int64{5, 4, 3, 2, 1, 1, 1, 1, 1, 1, 1}) +} + +func printSketch(t *testing.T, size int, items []int64) { + var sb strings.Builder + fls, err := NewLongSketchWithMaxMapSize(size) + assert.NoError(t, err) + for i := 0; i < len(items); i++ { + fls.UpdateMany(int64(i+1), items[i]) + } + sb.WriteString(fmt.Sprintf("Sketch Size: %d\n", size)) + sb.WriteString(fls.String()) + fmt.Println(sb.String()) + printRows(t, fls, ErrorTypeEnum.NO_FALSE_NEGATIVES) + fmt.Println("") + printRows(t, fls, ErrorTypeEnum.NO_FALSE_POSITIVES) + fmt.Println("") +} + +func printRows(t *testing.T, fls *LongsSketch, errorType ErrorType) { + rows, err := fls.getFrequentItems(errorType) + assert.NoError(t, err) + fmt.Println(errorType.Name) + fmt.Printf(" %20s%20s%20s %s", "Est", "UB", "LB", "Item") + fmt.Print("\n") + for i := 0; i < len(rows); i++ { + row := rows[i] + s2 := row.String() + fmt.Println(s2) + } + if len(rows) > 0 { //check equals null case + var nullRow *Row + assert.NotEqual(t, rows[0], nullRow) + } +} diff --git a/frequencies/reverse_purge_long_hash_map.go b/frequencies/reverse_purge_long_hash_map.go index 03c2b16..62c83b9 100644 --- a/frequencies/reverse_purge_long_hash_map.go +++ b/frequencies/reverse_purge_long_hash_map.go @@ -352,6 +352,19 @@ func (s *reversePurgeLongHashMap) hashProbe(key int64) int { return probe } +func (s *reversePurgeLongHashMap) String() string { + var sb strings.Builder + sb.WriteString("ReversePurgeLongHashMap:\n") + sb.WriteString(fmt.Sprintf(" %12s:%11s%20s %s\n", "Index", "States", "Values", "Keys")) + for i := 0; i < len(s.keys); i++ { + if s.states[i] <= 0 { + continue + } + sb.WriteString(fmt.Sprintf(" %12d:%11d%20d %d\n", i, s.states[i], s.values[i], s.keys[i])) + } + return sb.String() +} + func newIterator(keys []int64, values []int64, states []int16, numActive int) *iteratorHashMap { stride := int(uint64(float64(len(keys))*common.InverseGolden) | 1) return &iteratorHashMap{ diff --git a/frequencies/row.go b/frequencies/row.go index 21a2c81..ad105ef 100644 --- a/frequencies/row.go +++ b/frequencies/row.go @@ -22,10 +22,6 @@ import ( "sort" ) -const ( - hfmt string = " %20s%20s%20s %s" -) - type Row struct { item int64 est int64 @@ -33,8 +29,8 @@ type Row struct { lb int64 } -func NewRow(item int64, estimate int64, ub int64, lb int64) Row { - return Row{ +func NewRow(item int64, estimate int64, ub int64, lb int64) *Row { + return &Row{ item: item, est: estimate, ub: ub, @@ -43,7 +39,7 @@ func NewRow(item int64, estimate int64, ub int64, lb int64) Row { } func (r *Row) String() string { - return fmt.Sprintf(" %20d%20d%20d %d", r.item, r.est, r.ub, r.lb) + return fmt.Sprintf(" %20d%20d%20d %d", r.est, r.ub, r.lb, r.item) } func (r *Row) getEstimate() int64 { @@ -61,7 +57,7 @@ func (r *Row) getLowerBound() int64 { func sortItems(sk *LongsSketch, threshold int64, errorType ErrorType) ([]*Row, error) { rowList := make([]*Row, 0) iter := sk.hashMap.iterator() - if errorType == NO_FALSE_NEGATIVES { + if errorType == ErrorTypeEnum.NO_FALSE_NEGATIVES { for iter.next() { est, err := sk.getEstimate(iter.getKey()) if err != nil { @@ -77,7 +73,7 @@ func sortItems(sk *LongsSketch, threshold int64, errorType ErrorType) ([]*Row, e } if ub >= threshold { row := NewRow(iter.getKey(), est, ub, lb) - rowList = append(rowList, &row) + rowList = append(rowList, row) } } } else { //NO_FALSE_POSITIVES @@ -96,13 +92,13 @@ func sortItems(sk *LongsSketch, threshold int64, errorType ErrorType) ([]*Row, e } if lb >= threshold { row := NewRow(iter.getKey(), est, ub, lb) - rowList = append(rowList, &row) + rowList = append(rowList, row) } } } sort.Slice(rowList, func(i, j int) bool { - return rowList[i].est < rowList[j].est + return rowList[i].est > rowList[j].est }) return rowList, nil --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
