This is an automated email from the ASF dual-hosted git repository.

placave pushed a commit to branch float64-serde
in repository https://gitbox.apache.org/repos/asf/datasketches-go.git

commit bceb87d463c52c37b34b4cef5cc2af692e41afbc
Author: Pierre Lacave <[email protected]>
AuthorDate: Sun Mar 17 13:25:59 2024 +0100

    Fix serialization issue, bug and deterministism + Add double serde
---
 common/array_of_doubles_serde.go                   |  82 +++++++++++++++++++++
 kll/items_sketch.go                                |  47 ++++++++----
 kll/items_sketch_test.go                           |  55 +++++++++++++-
 kll/items_sletch_serialization_test.go             |  69 ++++++++++++++++-
 .../java_generated_files/kll_double_n0_java.sk     | Bin 0 -> 8 bytes
 .../kll_double_n1000000_java.sk                    | Bin 0 -> 5000 bytes
 .../kll_double_n100000_java.sk                     | Bin 0 -> 4728 bytes
 .../java_generated_files/kll_double_n10000_java.sk | Bin 0 -> 4372 bytes
 .../java_generated_files/kll_double_n1000_java.sk  | Bin 0 -> 2640 bytes
 .../java_generated_files/kll_double_n100_java.sk   | Bin 0 -> 840 bytes
 .../java_generated_files/kll_double_n10_java.sk    | Bin 0 -> 120 bytes
 .../java_generated_files/kll_double_n1_java.sk     | Bin 0 -> 16 bytes
 .../kll_string_n1000000_java.sk                    | Bin 6848 -> 6848 bytes
 .../kll_string_n100000_java.sk                     | Bin 5896 -> 5896 bytes
 .../java_generated_files/kll_string_n10000_java.sk | Bin 4913 -> 4913 bytes
 .../java_generated_files/kll_string_n1000_java.sk  | Bin 2640 -> 2640 bytes
 16 files changed, 234 insertions(+), 19 deletions(-)

diff --git a/common/array_of_doubles_serde.go b/common/array_of_doubles_serde.go
new file mode 100644
index 0000000..84d5872
--- /dev/null
+++ b/common/array_of_doubles_serde.go
@@ -0,0 +1,82 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package common
+
+import (
+       "encoding/binary"
+       "github.com/twmb/murmur3"
+       "math"
+)
+
+type ArrayOfDoublesSerDe struct {
+       scratch [8]byte
+}
+
+func (f ArrayOfDoublesSerDe) Identity() float64 {
+       return 0
+}
+
+func (f ArrayOfDoublesSerDe) Hash(item float64) uint64 {
+       binary.LittleEndian.PutUint64(f.scratch[:], math.Float64bits(item))
+       return murmur3.SeedSum64(_DEFAULT_SERDE_HASH_SEED, f.scratch[:])
+}
+
+func (f ArrayOfDoublesSerDe) LessFn() LessFn[float64] {
+       return func(a float64, b float64) bool {
+               return a < b
+       }
+}
+
+func (f ArrayOfDoublesSerDe) SizeOf(item float64) int {
+       return 8
+}
+
+func (f ArrayOfDoublesSerDe) SizeOfMany(mem []byte, offsetBytes int, numItems 
int) (int, error) {
+       return numItems * 8, nil
+}
+
+func (f ArrayOfDoublesSerDe) SerializeOneToSlice(item float64) []byte {
+       bytes := make([]byte, 8)
+       binary.LittleEndian.PutUint64(bytes, math.Float64bits(item))
+       return bytes
+}
+
+func (f ArrayOfDoublesSerDe) SerializeManyToSlice(item []float64) []byte {
+       if len(item) == 0 {
+               return []byte{}
+       }
+       bytes := make([]byte, 8*len(item))
+       offset := 0
+       for i := 0; i < len(item); i++ {
+               binary.LittleEndian.PutUint64(bytes[offset:], 
math.Float64bits(item[i]))
+               offset += 8
+       }
+       return bytes
+}
+
+func (f ArrayOfDoublesSerDe) DeserializeManyFromSlice(mem []byte, offsetBytes 
int, numItems int) ([]float64, error) {
+       if numItems == 0 {
+               return []float64{}, nil
+       }
+       array := make([]float64, 0, numItems)
+       for i := 0; i < numItems; i++ {
+               array = append(array, 
math.Float64frombits(binary.LittleEndian.Uint64(mem[offsetBytes:])))
+               offsetBytes += 8
+       }
+       return array, nil
+}
diff --git a/kll/items_sketch.go b/kll/items_sketch.go
index 8728746..3cce1b9 100644
--- a/kll/items_sketch.go
+++ b/kll/items_sketch.go
@@ -31,8 +31,8 @@ import (
        "fmt"
        "github.com/apache/datasketches-go/common"
        "github.com/apache/datasketches-go/internal"
+       "math/rand"
        "sort"
-       "unsafe"
 )
 
 type ItemsSketch[C comparable] struct {
@@ -51,6 +51,9 @@ type ItemsSketch[C comparable] struct {
        maxItem           *C
        sortedView        *ItemsSketchSortedView[C]
        itemsSketchOp     common.ItemSketchOp[C]
+
+       // Force deterministic offset for test, so that we can compare results 
across implementation.
+       deterministicOffsetForTest bool
 }
 
 const (
@@ -68,6 +71,9 @@ var (
                3486784401, 10460353203, 31381059609, 94143178827, 282429536481,
                847288609443, 2541865828329, 7625597484987, 22876792454961, 
68630377364883,
                205891132094649}
+
+       // Used for deterministic rand behavior in tests
+       nextOffsetForTest = 0
 )
 
 // NewKllItemsSketch create a new ItemsSketch with the given k and m.
@@ -498,7 +504,6 @@ func (s *ItemsSketch[C]) ToSlice() ([]byte, error) {
                        return nil, err
                }
                copy(bytesOut[_DATA_START_ADR_SINGLE_ITEM:], siByteArr)
-               //wbuf.incrementPosition(-len);
                return bytesOut, nil
        }
 
@@ -607,7 +612,7 @@ func (s *ItemsSketch[C]) getSingleItemSizeBytes() (int, 
error) {
        if err != nil {
                return 0, err
        }
-       return s.itemsSketchOp.SizeOf(v) + int(unsafe.Sizeof(uint32(1))), nil
+       return s.itemsSketchOp.SizeOf(v), nil
 }
 
 func (s *ItemsSketch[C]) getSingleItemByteArr() ([]byte, error) {
@@ -736,7 +741,7 @@ func (s *ItemsSketch[C]) mergeItemsSketch(other 
*ItemsSketch[C]) {
                        otherNumLevels, otherLevelsArr, otherItemsArr, 
s.itemsSketchOp.LessFn())
 
                // notice that workbuf is being used as both the input and 
output
-               result := generalItemsCompress(s.k, s.m, provisionalNumLevels, 
workbuf, worklevels, workbuf, outlevels, s.isLevelZeroSorted, 
s.itemsSketchOp.LessFn())
+               result := generalItemsCompress(s.k, s.m, provisionalNumLevels, 
workbuf, worklevels, workbuf, outlevels, s.isLevelZeroSorted, 
s.itemsSketchOp.LessFn(), s.deterministicOffsetForTest)
                targetItemCount := result[1] //was finalCapacity. Max size 
given k, m, numLevels
                curItemCount := result[2]    //was finalPop
 
@@ -844,9 +849,9 @@ func (s *ItemsSketch[C]) compressWhileUpdatingSketch() {
                })
        }
        if popAbove == 0 {
-               randomlyHalveUpItems(myItemsArr, adjBeg, adjPop)
+               randomlyHalveUpItems(myItemsArr, adjBeg, adjPop, 
s.deterministicOffsetForTest)
        } else {
-               randomlyHalveDownItems(myItemsArr, adjBeg, adjPop)
+               randomlyHalveDownItems(myItemsArr, adjBeg, adjPop, 
s.deterministicOffsetForTest)
                mergeSortedItemsArrays(
                        myItemsArr, adjBeg, halfAdjPop,
                        myItemsArr, rawEnd, popAbove,
@@ -980,10 +985,12 @@ func intCapAuxAux(k uint16, depth uint8) uint32 {
        return uint32(k)
 }
 
-func randomlyHalveUpItems[C comparable](buf []C, start uint32, length uint32) {
+func randomlyHalveUpItems[C comparable](buf []C, start uint32, length uint32, 
deterministicOffsetForTest bool) {
        halfLength := length / 2
-       //offset := rand.Intn(2)
-       offset := 1
+       offset := rand.Intn(2)
+       if deterministicOffsetForTest {
+               offset = deterministicOffset()
+       }
        j := (start + length) - 1 - uint32(offset)
        for i := (start + length) - 1; i >= (start + halfLength); i-- {
                buf[i] = buf[j]
@@ -991,10 +998,12 @@ func randomlyHalveUpItems[C comparable](buf []C, start 
uint32, length uint32) {
        }
 }
 
-func randomlyHalveDownItems[C comparable](buf []C, start uint32, length 
uint32) {
+func randomlyHalveDownItems[C comparable](buf []C, start uint32, length 
uint32, deterministicOffsetForTest bool) {
        halfLength := length / 2
-       //offset := rand.Intn(2)
-       offset := 1
+       offset := rand.Intn(2)
+       if deterministicOffsetForTest {
+               offset = deterministicOffset()
+       }
        j := start + uint32(offset)
        for i := start; i < (start + halfLength); i++ {
                buf[i] = buf[j]
@@ -1074,7 +1083,9 @@ func generalItemsCompress[C comparable](
        outBuf []C,
        outLevels []uint32,
        isLevelZeroSorted bool,
-       lessFn common.LessFn[C]) []uint32 {
+       lessFn common.LessFn[C],
+       deterministicOffsetForTest bool,
+) []uint32 {
        numLevels := numLevelsIn
        currentItemCount := inLevels[numLevels] - inLevels[0]        // 
decreases with each compaction
        targetItemCount := computeTotalItemCapacity(k, m, numLevels) // 
increases if we add levels
@@ -1131,9 +1142,9 @@ func generalItemsCompress[C comparable](
                        }
 
                        if popAbove == 0 {
-                               randomlyHalveUpItems(inBuf, adjBeg, adjPop)
+                               randomlyHalveUpItems(inBuf, adjBeg, adjPop, 
deterministicOffsetForTest)
                        } else {
-                               randomlyHalveDownItems(inBuf, adjBeg, adjPop)
+                               randomlyHalveDownItems(inBuf, adjBeg, adjPop, 
deterministicOffsetForTest)
                                mergeSortedItemsArrays(
                                        inBuf, adjBeg, halfAdjPop,
                                        inBuf, rawLim, popAbove,
@@ -1162,3 +1173,9 @@ func generalItemsCompress[C comparable](
 
        return []uint32{uint32(numLevels), targetItemCount, currentItemCount}
 }
+
+func deterministicOffset() int {
+       result := nextOffsetForTest
+       nextOffsetForTest = 1 - nextOffsetForTest
+       return result
+}
diff --git a/kll/items_sketch_test.go b/kll/items_sketch_test.go
index 0d36c95..473aeee 100644
--- a/kll/items_sketch_test.go
+++ b/kll/items_sketch_test.go
@@ -835,7 +835,7 @@ func TestItemsSketch_SerializeDeserializeMultipleValue(t 
*testing.T) {
        assert.Equal(t, mem, mem2)
 }
 
-func TestSerializeDeserialize(t *testing.T) {
+func TestSerializeDeserializeString(t *testing.T) {
        nArr := []int{0, 1, 10, 100, 1000, 10000, 100000, 1000000}
        serde := common.ArrayOfStringsSerDe{}
        for _, n := range nArr {
@@ -888,3 +888,56 @@ func TestSerializeDeserialize(t *testing.T) {
                }
        }
 }
+
+func TestSerializeDeserializeFloat(t *testing.T) {
+       nArr := []int{0, 1, 10, 100, 1000, 10000, 100000, 1000000}
+       serde := common.ArrayOfDoublesSerDe{}
+       for _, n := range nArr {
+               sk, err := NewKllItemsSketchWithDefault[float64](serde)
+               assert.NoError(t, err)
+               for i := 1; i <= n; i++ {
+                       sk.Update(float64(i))
+               }
+               slc, err := sk.ToSlice()
+               assert.NoError(t, err)
+
+               sketch, err := NewKllItemsSketchFromSlice[float64](slc, serde)
+               if err != nil {
+                       return
+               }
+
+               assert.Equal(t, sketch.GetK(), uint16(200))
+               if n == 0 {
+                       assert.True(t, sketch.IsEmpty())
+               } else {
+                       assert.False(t, sketch.IsEmpty())
+               }
+
+               if n > 100 {
+                       assert.True(t, sketch.IsEstimationMode())
+               } else {
+                       assert.False(t, sketch.IsEstimationMode())
+               }
+
+               if n > 0 {
+                       minV, err := sketch.GetMinItem()
+                       assert.NoError(t, err)
+                       assert.Equal(t, minV, float64(1))
+
+                       maxV, err := sketch.GetMaxItem()
+                       assert.NoError(t, err)
+                       assert.Equal(t, maxV, float64(n))
+
+                       weight := int64(0)
+                       it := sketch.GetIterator()
+                       lessFn := serde.LessFn()
+                       for it.Next() {
+                               qut := it.GetQuantile()
+                               assert.True(t, lessFn(minV, qut) || minV == 
qut, fmt.Sprintf("min: \"%v\" \"%v\"", minV, qut))
+                               assert.True(t, !lessFn(maxV, qut) || maxV == 
qut, fmt.Sprintf("max: \"%v\" \"%v\"", maxV, qut))
+                               weight += it.GetWeight()
+                       }
+                       assert.Equal(t, weight, int64(n))
+               }
+       }
+}
diff --git a/kll/items_sletch_serialization_test.go 
b/kll/items_sletch_serialization_test.go
index 16ff817..4d01a7f 100644
--- a/kll/items_sletch_serialization_test.go
+++ b/kll/items_sletch_serialization_test.go
@@ -27,14 +27,17 @@ import (
 )
 
 func TestGenerateGoFiles(t *testing.T) {
-       if len(os.Getenv(internal.DSketchTestGenerateGo)) == 0 {
-               t.Skipf("%s not set", internal.DSketchTestGenerateGo)
-       }
+       //if len(os.Getenv(internal.DSketchTestGenerateGo)) == 0 {
+       //      t.Skipf("%s not set", internal.DSketchTestGenerateGo)
+       //}
+
+       os.Mkdir(internal.GoPath, 0755)
 
        nArr := []int{0, 1, 10, 100, 1000, 10000, 100000, 1000000}
        for _, n := range nArr {
                digits := numDigits(n)
                sk, err := 
NewKllItemsSketchWithDefault[string](common.ArrayOfStringsSerDe{})
+               sk.deterministicOffsetForTest = true
                assert.NoError(t, err)
                for i := 1; i <= n; i++ {
                        sk.Update(intToFixedLengthString(i, digits))
@@ -44,6 +47,19 @@ func TestGenerateGoFiles(t *testing.T) {
                err = os.WriteFile(fmt.Sprintf("%s/kll_string_n%d_go.sk", 
internal.GoPath, n), slc, 0644)
                assert.NoError(t, err)
        }
+
+       for _, n := range nArr {
+               sk, err := 
NewKllItemsSketchWithDefault[float64](common.ArrayOfDoublesSerDe{})
+               sk.deterministicOffsetForTest = true
+               assert.NoError(t, err)
+               for i := 1; i <= n; i++ {
+                       sk.Update(float64(i))
+               }
+               slc, err := sk.ToSlice()
+               assert.NoError(t, err)
+               err = os.WriteFile(fmt.Sprintf("%s/kll_double_n%d_go.sk", 
internal.GoPath, n), slc, 0644)
+               assert.NoError(t, err)
+       }
 }
 
 func TestJavaCompat(t *testing.T) {
@@ -94,4 +110,51 @@ func TestJavaCompat(t *testing.T) {
                        }
                }
        })
+
+       t.Run("Java KLL Double", func(t *testing.T) {
+               nArr := []int{0, 1, 10, 100, 1000, 10000, 100000, 1000000}
+               serde := common.ArrayOfDoublesSerDe{}
+               for _, n := range nArr {
+                       bytes, err := 
os.ReadFile(fmt.Sprintf("%s/kll_double_n%d_java.sk", internal.JavaPath, n))
+                       assert.NoError(t, err)
+                       sketch, err := 
NewKllItemsSketchFromSlice[float64](bytes, serde)
+                       if err != nil {
+                               return
+                       }
+
+                       assert.Equal(t, sketch.GetK(), uint16(200))
+                       if n == 0 {
+                               assert.True(t, sketch.IsEmpty())
+                       } else {
+                               assert.False(t, sketch.IsEmpty())
+                       }
+
+                       if n > 100 {
+                               assert.True(t, sketch.IsEstimationMode())
+                       } else {
+                               assert.False(t, sketch.IsEstimationMode())
+                       }
+
+                       if n > 0 {
+                               minV, err := sketch.GetMinItem()
+                               assert.NoError(t, err)
+                               assert.Equal(t, minV, float64(1))
+
+                               maxV, err := sketch.GetMaxItem()
+                               assert.NoError(t, err)
+                               assert.Equal(t, maxV, float64(n))
+
+                               weight := int64(0)
+                               it := sketch.GetIterator()
+                               lessFn := serde.LessFn()
+                               for it.Next() {
+                                       qut := it.GetQuantile()
+                                       assert.True(t, lessFn(minV, qut) || 
minV == qut, fmt.Sprintf("min: \"%v\" \"%v\"", minV, qut))
+                                       assert.True(t, !lessFn(maxV, qut) || 
maxV == qut, fmt.Sprintf("max: \"%v\" \"%v\"", maxV, qut))
+                                       weight += it.GetWeight()
+                               }
+                               assert.Equal(t, weight, int64(n))
+                       }
+               }
+       })
 }
diff --git a/serialization_test_data/java_generated_files/kll_double_n0_java.sk 
b/serialization_test_data/java_generated_files/kll_double_n0_java.sk
new file mode 100644
index 0000000..afd2209
Binary files /dev/null and 
b/serialization_test_data/java_generated_files/kll_double_n0_java.sk differ
diff --git 
a/serialization_test_data/java_generated_files/kll_double_n1000000_java.sk 
b/serialization_test_data/java_generated_files/kll_double_n1000000_java.sk
new file mode 100644
index 0000000..171a3a0
Binary files /dev/null and 
b/serialization_test_data/java_generated_files/kll_double_n1000000_java.sk 
differ
diff --git 
a/serialization_test_data/java_generated_files/kll_double_n100000_java.sk 
b/serialization_test_data/java_generated_files/kll_double_n100000_java.sk
new file mode 100644
index 0000000..5b57a7a
Binary files /dev/null and 
b/serialization_test_data/java_generated_files/kll_double_n100000_java.sk differ
diff --git 
a/serialization_test_data/java_generated_files/kll_double_n10000_java.sk 
b/serialization_test_data/java_generated_files/kll_double_n10000_java.sk
new file mode 100644
index 0000000..0de0b17
Binary files /dev/null and 
b/serialization_test_data/java_generated_files/kll_double_n10000_java.sk differ
diff --git 
a/serialization_test_data/java_generated_files/kll_double_n1000_java.sk 
b/serialization_test_data/java_generated_files/kll_double_n1000_java.sk
new file mode 100644
index 0000000..a2737f4
Binary files /dev/null and 
b/serialization_test_data/java_generated_files/kll_double_n1000_java.sk differ
diff --git 
a/serialization_test_data/java_generated_files/kll_double_n100_java.sk 
b/serialization_test_data/java_generated_files/kll_double_n100_java.sk
new file mode 100644
index 0000000..b0d578a
Binary files /dev/null and 
b/serialization_test_data/java_generated_files/kll_double_n100_java.sk differ
diff --git 
a/serialization_test_data/java_generated_files/kll_double_n10_java.sk 
b/serialization_test_data/java_generated_files/kll_double_n10_java.sk
new file mode 100644
index 0000000..fb526ef
Binary files /dev/null and 
b/serialization_test_data/java_generated_files/kll_double_n10_java.sk differ
diff --git a/serialization_test_data/java_generated_files/kll_double_n1_java.sk 
b/serialization_test_data/java_generated_files/kll_double_n1_java.sk
new file mode 100644
index 0000000..5eceb40
Binary files /dev/null and 
b/serialization_test_data/java_generated_files/kll_double_n1_java.sk differ
diff --git 
a/serialization_test_data/java_generated_files/kll_string_n1000000_java.sk 
b/serialization_test_data/java_generated_files/kll_string_n1000000_java.sk
index 6ba10e5..e5d0a29 100644
Binary files 
a/serialization_test_data/java_generated_files/kll_string_n1000000_java.sk and 
b/serialization_test_data/java_generated_files/kll_string_n1000000_java.sk 
differ
diff --git 
a/serialization_test_data/java_generated_files/kll_string_n100000_java.sk 
b/serialization_test_data/java_generated_files/kll_string_n100000_java.sk
index 73fa4bb..e3382f2 100644
Binary files 
a/serialization_test_data/java_generated_files/kll_string_n100000_java.sk and 
b/serialization_test_data/java_generated_files/kll_string_n100000_java.sk differ
diff --git 
a/serialization_test_data/java_generated_files/kll_string_n10000_java.sk 
b/serialization_test_data/java_generated_files/kll_string_n10000_java.sk
index 644a3e4..8c4ba2d 100644
Binary files 
a/serialization_test_data/java_generated_files/kll_string_n10000_java.sk and 
b/serialization_test_data/java_generated_files/kll_string_n10000_java.sk differ
diff --git 
a/serialization_test_data/java_generated_files/kll_string_n1000_java.sk 
b/serialization_test_data/java_generated_files/kll_string_n1000_java.sk
index f16897c..858cac2 100644
Binary files 
a/serialization_test_data/java_generated_files/kll_string_n1000_java.sk and 
b/serialization_test_data/java_generated_files/kll_string_n1000_java.sk differ


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to