This is an automated email from the ASF dual-hosted git repository. alsay pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/datasketches-go.git
commit 4917dbc6cf32d143694f87d6487ea1983d1ef523 Author: Pierre Lacave <[email protected]> AuthorDate: Wed Dec 20 17:17:50 2023 +0100 Add frequency long sketch deserialization from string --- frequencies/long_sketch.go | 94 ++++++++++++++++++++++++++++-- frequencies/long_sketch_test.go | 18 +++--- frequencies/reverse_purge_long_hash_map.go | 18 ++++++ frequencies/utils.go | 2 +- 4 files changed, 118 insertions(+), 14 deletions(-) diff --git a/frequencies/long_sketch.go b/frequencies/long_sketch.go index 75dcc3a..837d956 100644 --- a/frequencies/long_sketch.go +++ b/frequencies/long_sketch.go @@ -20,6 +20,8 @@ package frequencies import ( "fmt" "github.com/apache/datasketches-go/common" + "math/bits" + "strconv" "strings" ) @@ -71,7 +73,7 @@ func NewLongSketch(lgMaxMapSize int, lgCurMapSize int) (*LongSketch, error) { return nil, err } curMapCap := hashMap.getCapacity() - maxMapCap := int(float64(uint64((1 << lgMaxMapSize))) * loadFactor) + maxMapCap := int(float64(uint64(1<<lgMaxMapSize)) * loadFactor) offset := int64(0) sampleSize := min(sampleSize, maxMapCap) return &LongSketch{ @@ -97,6 +99,83 @@ func NewLongSketchWithDefault(maxMapSize int) (*LongSketch, error) { return NewLongSketch(log2OfInt, lgMinMapSize) } +func NewLongSketchFromString(str string) (*LongSketch, error) { + if len(str) < 1 { + return nil, fmt.Errorf("String is empty") + } + // Remove trailing comma if present + // as this will cause a problem with the split + if str[len(str)-1] == ',' { + str = str[:len(str)-1] + } + tokens := strings.Split(str, ",") + if len(tokens) < (strPreambleTokens + 2) { + return nil, fmt.Errorf("String not long enough: %d", len(tokens)) + } + serVe, err := strconv.ParseInt(tokens[0], 10, 32) + if err != nil { + return nil, err + } + famID, err := strconv.ParseInt(tokens[1], 10, 32) + if err != nil { + return nil, err + } + lgMax, err := strconv.ParseInt(tokens[2], 10, 32) + if err != nil { + return nil, err + } + flags, err := strconv.ParseInt(tokens[3], 10, 32) + if err != nil { + return nil, err + } + streamWt, err := strconv.ParseInt(tokens[4], 10, 64) + if err != nil { + return nil, err + } + offset, err := strconv.ParseInt(tokens[5], 10, 64) + if err != nil { + return nil, err + } + //should always get at least the next 2 from the map + numActive, err := strconv.ParseInt(tokens[6], 10, 32) + if err != nil { + return nil, err + } + lgCurOrigin, err := strconv.ParseUint(tokens[7], 10, 32) + if err != nil { + return nil, err + } + lgCur := bits.TrailingZeros64(lgCurOrigin) + + //checks + if serVe != serVer { + return nil, fmt.Errorf("Possible Corruption: Bad SerVer: %d", serVe) + } + if famID != common.FamilyFrequencyId { + return nil, fmt.Errorf("Possible Corruption: Bad Family: %d", famID) + } + empty := flags > 0 + if !empty && (numActive == 0) { + return nil, fmt.Errorf("Possible Corruption: !Empty && NumActive=0; strLen: %d", numActive) + } + numTokens := int64(len(tokens)) + if (2 * numActive) != (numTokens - strPreambleTokens - 2) { + return nil, fmt.Errorf("Possible Corruption: Incorrect # of tokens: %d, numActive: %d", numTokens, numActive) + } + // if ((2 * numActive) != (numTokens - STR_PREAMBLE_TOKENS - 2)) { + sk, err := NewLongSketch(int(lgMax), int(lgCur)) + if err != nil { + return nil, err + } + sk.streamWeight = streamWt + sk.offset = offset + sk.hashMap, err = deserializeFromStringArray(tokens) + if err != nil { + return nil, err + } + return sk, nil +} + func (s *LongSketch) getNumActiveItems() int { return s.hashMap.numActive } @@ -107,6 +186,10 @@ func (s *LongSketch) getMaximumMapCapacity() int { return int(float64(uint64(1<<s.lgMaxMapSize)) * loadFactor) } +func (s *LongSketch) getCurrentMapCapacity() int { + return s.curMapCap +} + func (s *LongSketch) Update(item int64, count int64) error { if count == 0 { return nil @@ -134,7 +217,7 @@ func (s *LongSketch) Update(item int64, count int64) error { return nil } -func (s *LongSketch) serializeToString() string { +func (s *LongSketch) serializeToString() (string, error) { var sb strings.Builder //start the string with parameters of the sketch serVer := serVer //0 @@ -144,7 +227,10 @@ func (s *LongSketch) serializeToString() string { if s.hashMap.numActive == 0 { flags = emptyFlagMask } - fmt.Fprintf(&sb, "%d,%d,%d,%d,%d,%d,", serVer, famID, lgMaxMapSz, flags, s.streamWeight, s.offset) + _, err := fmt.Fprintf(&sb, "%d,%d,%d,%d,%d,%d,", serVer, famID, lgMaxMapSz, flags, s.streamWeight, s.offset) + if err != nil { + return "", err + } sb.WriteString(s.hashMap.serializeToString()) //numActive, curMaplen, key[i], value[i], ... - return sb.String() + return sb.String(), nil } diff --git a/frequencies/long_sketch_test.go b/frequencies/long_sketch_test.go index 03a0c69..cc1c115 100644 --- a/frequencies/long_sketch_test.go +++ b/frequencies/long_sketch_test.go @@ -33,13 +33,13 @@ func TestFrequentItsemsStringSerialTest(t *testing.T) { sketch.Update(1000001, 1010230) sketch.Update(1000002, 1010230) - s := sketch.serializeToString() - assert.Equal(t, "1,10,5,0,2024103,0,4,32,10,200,15,3443,1000001,1010230,1000002,1010230,", s) - /* - LongsSketch new_sketch0 = LongsSketch.getInstance(string0); - String new_string0 = new_sketch0.serializeToString(); - Assert.assertTrue(string0.equals(new_string0)); - Assert.assertTrue(new_sketch0.getMaximumMapCapacity() == sketch.getMaximumMapCapacity()); - Assert.assertTrue(new_sketch0.getCurrentMapCapacity() == sketch.getCurrentMapCapacity()); - */ + ser, err := sketch.serializeToString() + assert.NoError(t, err) + newSk0, err := NewLongSketchFromString(ser) + assert.NoError(t, err) + newSer0, err := newSk0.serializeToString() + assert.NoError(t, err) + assert.Equal(t, ser, newSer0) + assert.Equal(t, sketch.getMaximumMapCapacity(), newSk0.getMaximumMapCapacity()) + assert.Equal(t, sketch.getCurrentMapCapacity(), newSk0.getCurrentMapCapacity()) } diff --git a/frequencies/reverse_purge_long_hash_map.go b/frequencies/reverse_purge_long_hash_map.go index 28df67d..82417ea 100644 --- a/frequencies/reverse_purge_long_hash_map.go +++ b/frequencies/reverse_purge_long_hash_map.go @@ -246,3 +246,21 @@ func deserializeReversePurgeLongHashMapFromString(string string) (*reversePurgeL } return table, nil } + +func deserializeFromStringArray(tokens []string) (*reversePurgeLongHashMap, error) { + ignore := strPreambleTokens + numActive, _ := strconv.ParseUint(tokens[ignore], 10, 32) + length, _ := strconv.ParseUint(tokens[ignore+1], 10, 32) + hashMap, err := NewReversePurgeLongHashMap(int(length)) + if err != nil { + return nil, err + } + j := 2 + ignore + for i := 0; i < int(numActive); i++ { + key, _ := strconv.ParseUint(tokens[j], 10, 64) + value, _ := strconv.ParseUint(tokens[j+1], 10, 64) + hashMap.adjustOrPutValue(int64(key), int64(value)) + j += 2 + } + return hashMap, nil +} diff --git a/frequencies/utils.go b/frequencies/utils.go index 05b2d57..9774709 100644 --- a/frequencies/utils.go +++ b/frequencies/utils.go @@ -18,7 +18,7 @@ package frequencies const ( - lgMinMapSize = 5 + lgMinMapSize = 3 // This constant is large enough so that computing the median of SAMPLE_SIZE // randomly selected entries from a list of numbers and outputting // the empirical median will give a constant-factor approximation to the --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
