This is an automated email from the ASF dual-hosted git repository.

alsay pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datasketches-go.git

commit 4917dbc6cf32d143694f87d6487ea1983d1ef523
Author: Pierre Lacave <[email protected]>
AuthorDate: Wed Dec 20 17:17:50 2023 +0100

    Add frequency long sketch deserialization from string
---
 frequencies/long_sketch.go                 | 94 ++++++++++++++++++++++++++++--
 frequencies/long_sketch_test.go            | 18 +++---
 frequencies/reverse_purge_long_hash_map.go | 18 ++++++
 frequencies/utils.go                       |  2 +-
 4 files changed, 118 insertions(+), 14 deletions(-)

diff --git a/frequencies/long_sketch.go b/frequencies/long_sketch.go
index 75dcc3a..837d956 100644
--- a/frequencies/long_sketch.go
+++ b/frequencies/long_sketch.go
@@ -20,6 +20,8 @@ package frequencies
 import (
        "fmt"
        "github.com/apache/datasketches-go/common"
+       "math/bits"
+       "strconv"
        "strings"
 )
 
@@ -71,7 +73,7 @@ func NewLongSketch(lgMaxMapSize int, lgCurMapSize int) (*LongSketch, error) {
                return nil, err
        }
        curMapCap := hashMap.getCapacity()
-       maxMapCap := int(float64(uint64((1 << lgMaxMapSize))) * loadFactor)
+       maxMapCap := int(float64(uint64(1<<lgMaxMapSize)) * loadFactor)
        offset := int64(0)
        sampleSize := min(sampleSize, maxMapCap)
        return &LongSketch{
@@ -97,6 +99,83 @@ func NewLongSketchWithDefault(maxMapSize int) (*LongSketch, error) {
        return NewLongSketch(log2OfInt, lgMinMapSize)
 }
 
+func NewLongSketchFromString(str string) (*LongSketch, error) {
+       if len(str) < 1 {
+               return nil, fmt.Errorf("String is empty")
+       }
+       // Remove trailing comma if present
+       // as this will cause a problem with the split
+       if str[len(str)-1] == ',' {
+               str = str[:len(str)-1]
+       }
+       tokens := strings.Split(str, ",")
+       if len(tokens) < (strPreambleTokens + 2) {
+               return nil, fmt.Errorf("String not long enough: %d", len(tokens))
+       }
+       serVe, err := strconv.ParseInt(tokens[0], 10, 32)
+       if err != nil {
+               return nil, err
+       }
+       famID, err := strconv.ParseInt(tokens[1], 10, 32)
+       if err != nil {
+               return nil, err
+       }
+       lgMax, err := strconv.ParseInt(tokens[2], 10, 32)
+       if err != nil {
+               return nil, err
+       }
+       flags, err := strconv.ParseInt(tokens[3], 10, 32)
+       if err != nil {
+               return nil, err
+       }
+       streamWt, err := strconv.ParseInt(tokens[4], 10, 64)
+       if err != nil {
+               return nil, err
+       }
+       offset, err := strconv.ParseInt(tokens[5], 10, 64)
+       if err != nil {
+               return nil, err
+       }
+       //should always get at least the next 2 from the map
+       numActive, err := strconv.ParseInt(tokens[6], 10, 32)
+       if err != nil {
+               return nil, err
+       }
+       lgCurOrigin, err := strconv.ParseUint(tokens[7], 10, 32)
+       if err != nil {
+               return nil, err
+       }
+       lgCur := bits.TrailingZeros64(lgCurOrigin)
+
+       //checks
+       if serVe != serVer {
+               return nil, fmt.Errorf("Possible Corruption: Bad SerVer: %d", serVe)
+       }
+       if famID != common.FamilyFrequencyId {
+               return nil, fmt.Errorf("Possible Corruption: Bad Family: %d", famID)
+       }
+       empty := flags > 0
+       if !empty && (numActive == 0) {
+               return nil, fmt.Errorf("Possible Corruption: !Empty && NumActive=0;  strLen: %d", numActive)
+       }
+       numTokens := int64(len(tokens))
+       if (2 * numActive) != (numTokens - strPreambleTokens - 2) {
+               return nil, fmt.Errorf("Possible Corruption: Incorrect # of tokens: %d, numActive: %d", numTokens, numActive)
+       }
+       //    if ((2 * numActive) != (numTokens - STR_PREAMBLE_TOKENS - 2)) {
+       sk, err := NewLongSketch(int(lgMax), int(lgCur))
+       if err != nil {
+               return nil, err
+       }
+       sk.streamWeight = streamWt
+       sk.offset = offset
+       sk.hashMap, err = deserializeFromStringArray(tokens)
+       if err != nil {
+               return nil, err
+       }
+       return sk, nil
+}
+
 func (s *LongSketch) getNumActiveItems() int {
        return s.hashMap.numActive
 }
@@ -107,6 +186,10 @@ func (s *LongSketch) getMaximumMapCapacity() int {
        return int(float64(uint64(1<<s.lgMaxMapSize)) * loadFactor)
 }
 
+func (s *LongSketch) getCurrentMapCapacity() int {
+       return s.curMapCap
+}
+
 func (s *LongSketch) Update(item int64, count int64) error {
        if count == 0 {
                return nil
@@ -134,7 +217,7 @@ func (s *LongSketch) Update(item int64, count int64) error {
        return nil
 }
 
-func (s *LongSketch) serializeToString() string {
+func (s *LongSketch) serializeToString() (string, error) {
        var sb strings.Builder
        //start the string with parameters of the sketch
        serVer := serVer //0
@@ -144,7 +227,10 @@ func (s *LongSketch) serializeToString() string {
        if s.hashMap.numActive == 0 {
                flags = emptyFlagMask
        }
-       fmt.Fprintf(&sb, "%d,%d,%d,%d,%d,%d,", serVer, famID, lgMaxMapSz, flags, s.streamWeight, s.offset)
+       _, err := fmt.Fprintf(&sb, "%d,%d,%d,%d,%d,%d,", serVer, famID, lgMaxMapSz, flags, s.streamWeight, s.offset)
+       if err != nil {
+               return "", err
+       }
        sb.WriteString(s.hashMap.serializeToString()) //numActive, curMaplen, key[i], value[i], ...
-       return sb.String()
+       return sb.String(), nil
 }
diff --git a/frequencies/long_sketch_test.go b/frequencies/long_sketch_test.go
index 03a0c69..cc1c115 100644
--- a/frequencies/long_sketch_test.go
+++ b/frequencies/long_sketch_test.go
@@ -33,13 +33,13 @@ func TestFrequentItsemsStringSerialTest(t *testing.T) {
        sketch.Update(1000001, 1010230)
        sketch.Update(1000002, 1010230)
 
-       s := sketch.serializeToString()
-       assert.Equal(t, "1,10,5,0,2024103,0,4,32,10,200,15,3443,1000001,1010230,1000002,1010230,", s)
-       /*
-          LongsSketch new_sketch0 = LongsSketch.getInstance(string0);
-          String new_string0 = new_sketch0.serializeToString();
-          Assert.assertTrue(string0.equals(new_string0));
-          Assert.assertTrue(new_sketch0.getMaximumMapCapacity() == sketch.getMaximumMapCapacity());
-          Assert.assertTrue(new_sketch0.getCurrentMapCapacity() == sketch.getCurrentMapCapacity());
-       */
+       ser, err := sketch.serializeToString()
+       assert.NoError(t, err)
+       newSk0, err := NewLongSketchFromString(ser)
+       assert.NoError(t, err)
+       newSer0, err := newSk0.serializeToString()
+       assert.NoError(t, err)
+       assert.Equal(t, ser, newSer0)
+       assert.Equal(t, sketch.getMaximumMapCapacity(), newSk0.getMaximumMapCapacity())
+       assert.Equal(t, sketch.getCurrentMapCapacity(), newSk0.getCurrentMapCapacity())
 }
diff --git a/frequencies/reverse_purge_long_hash_map.go b/frequencies/reverse_purge_long_hash_map.go
index 28df67d..82417ea 100644
--- a/frequencies/reverse_purge_long_hash_map.go
+++ b/frequencies/reverse_purge_long_hash_map.go
@@ -246,3 +246,21 @@ func deserializeReversePurgeLongHashMapFromString(string string) (*reversePurgeL
        }
        return table, nil
 }
+
+func deserializeFromStringArray(tokens []string) (*reversePurgeLongHashMap, error) {
+       ignore := strPreambleTokens
+       numActive, _ := strconv.ParseUint(tokens[ignore], 10, 32)
+       length, _ := strconv.ParseUint(tokens[ignore+1], 10, 32)
+       hashMap, err := NewReversePurgeLongHashMap(int(length))
+       if err != nil {
+               return nil, err
+       }
+       j := 2 + ignore
+       for i := 0; i < int(numActive); i++ {
+               key, _ := strconv.ParseUint(tokens[j], 10, 64)
+               value, _ := strconv.ParseUint(tokens[j+1], 10, 64)
+               hashMap.adjustOrPutValue(int64(key), int64(value))
+               j += 2
+       }
+       return hashMap, nil
+}
diff --git a/frequencies/utils.go b/frequencies/utils.go
index 05b2d57..9774709 100644
--- a/frequencies/utils.go
+++ b/frequencies/utils.go
@@ -18,7 +18,7 @@
 package frequencies
 
 const (
-       lgMinMapSize = 5
+       lgMinMapSize = 3
        // This constant is large enough so that computing the median of SAMPLE_SIZE
        // randomly selected entries from a list of numbers and outputting
        // the empirical median will give a constant-factor approximation to the


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to