This is an automated email from the ASF dual-hosted git repository. placave pushed a commit to branch float64-serde in repository https://gitbox.apache.org/repos/asf/datasketches-go.git
commit bceb87d463c52c37b34b4cef5cc2af692e41afbc Author: Pierre Lacave <[email protected]> AuthorDate: Sun Mar 17 13:25:59 2024 +0100 Fix serialization issue, bug and deterministism + Add double serde --- common/array_of_doubles_serde.go | 82 +++++++++++++++++++++ kll/items_sketch.go | 47 ++++++++---- kll/items_sketch_test.go | 55 +++++++++++++- kll/items_sletch_serialization_test.go | 69 ++++++++++++++++- .../java_generated_files/kll_double_n0_java.sk | Bin 0 -> 8 bytes .../kll_double_n1000000_java.sk | Bin 0 -> 5000 bytes .../kll_double_n100000_java.sk | Bin 0 -> 4728 bytes .../java_generated_files/kll_double_n10000_java.sk | Bin 0 -> 4372 bytes .../java_generated_files/kll_double_n1000_java.sk | Bin 0 -> 2640 bytes .../java_generated_files/kll_double_n100_java.sk | Bin 0 -> 840 bytes .../java_generated_files/kll_double_n10_java.sk | Bin 0 -> 120 bytes .../java_generated_files/kll_double_n1_java.sk | Bin 0 -> 16 bytes .../kll_string_n1000000_java.sk | Bin 6848 -> 6848 bytes .../kll_string_n100000_java.sk | Bin 5896 -> 5896 bytes .../java_generated_files/kll_string_n10000_java.sk | Bin 4913 -> 4913 bytes .../java_generated_files/kll_string_n1000_java.sk | Bin 2640 -> 2640 bytes 16 files changed, 234 insertions(+), 19 deletions(-) diff --git a/common/array_of_doubles_serde.go b/common/array_of_doubles_serde.go new file mode 100644 index 0000000..84d5872 --- /dev/null +++ b/common/array_of_doubles_serde.go @@ -0,0 +1,82 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package common + +import ( + "encoding/binary" + "github.com/twmb/murmur3" + "math" +) + +type ArrayOfDoublesSerDe struct { + scratch [8]byte +} + +func (f ArrayOfDoublesSerDe) Identity() float64 { + return 0 +} + +func (f ArrayOfDoublesSerDe) Hash(item float64) uint64 { + binary.LittleEndian.PutUint64(f.scratch[:], math.Float64bits(item)) + return murmur3.SeedSum64(_DEFAULT_SERDE_HASH_SEED, f.scratch[:]) +} + +func (f ArrayOfDoublesSerDe) LessFn() LessFn[float64] { + return func(a float64, b float64) bool { + return a < b + } +} + +func (f ArrayOfDoublesSerDe) SizeOf(item float64) int { + return 8 +} + +func (f ArrayOfDoublesSerDe) SizeOfMany(mem []byte, offsetBytes int, numItems int) (int, error) { + return numItems * 8, nil +} + +func (f ArrayOfDoublesSerDe) SerializeOneToSlice(item float64) []byte { + bytes := make([]byte, 8) + binary.LittleEndian.PutUint64(bytes, math.Float64bits(item)) + return bytes +} + +func (f ArrayOfDoublesSerDe) SerializeManyToSlice(item []float64) []byte { + if len(item) == 0 { + return []byte{} + } + bytes := make([]byte, 8*len(item)) + offset := 0 + for i := 0; i < len(item); i++ { + binary.LittleEndian.PutUint64(bytes[offset:], math.Float64bits(item[i])) + offset += 8 + } + return bytes +} + +func (f ArrayOfDoublesSerDe) DeserializeManyFromSlice(mem []byte, offsetBytes int, numItems int) ([]float64, error) { + if numItems == 0 { + return []float64{}, nil + } + array := make([]float64, 0, numItems) + for i := 0; i < numItems; i++ { + array = append(array, math.Float64frombits(binary.LittleEndian.Uint64(mem[offsetBytes:]))) + offsetBytes += 8 + } + return array, nil +} diff --git a/kll/items_sketch.go b/kll/items_sketch.go index 8728746..3cce1b9 100644 --- a/kll/items_sketch.go +++ b/kll/items_sketch.go @@ -31,8 +31,8 @@ import ( "fmt" "github.com/apache/datasketches-go/common" "github.com/apache/datasketches-go/internal" + "math/rand" "sort" - "unsafe" ) type ItemsSketch[C comparable] struct { @@ -51,6 +51,9 @@ type ItemsSketch[C comparable] struct { maxItem *C sortedView *ItemsSketchSortedView[C] itemsSketchOp common.ItemSketchOp[C] + + // Force deterministic offset for test, so that we can compare results across implementation. + deterministicOffsetForTest bool } const ( @@ -68,6 +71,9 @@ var ( 3486784401, 10460353203, 31381059609, 94143178827, 282429536481, 847288609443, 2541865828329, 7625597484987, 22876792454961, 68630377364883, 205891132094649} + + // Used for deterministic rand behavior in tests + nextOffsetForTest = 0 ) // NewKllItemsSketch create a new ItemsSketch with the given k and m. @@ -498,7 +504,6 @@ func (s *ItemsSketch[C]) ToSlice() ([]byte, error) { return nil, err } copy(bytesOut[_DATA_START_ADR_SINGLE_ITEM:], siByteArr) - //wbuf.incrementPosition(-len); return bytesOut, nil } @@ -607,7 +612,7 @@ func (s *ItemsSketch[C]) getSingleItemSizeBytes() (int, error) { if err != nil { return 0, err } - return s.itemsSketchOp.SizeOf(v) + int(unsafe.Sizeof(uint32(1))), nil + return s.itemsSketchOp.SizeOf(v), nil } func (s *ItemsSketch[C]) getSingleItemByteArr() ([]byte, error) { @@ -736,7 +741,7 @@ func (s *ItemsSketch[C]) mergeItemsSketch(other *ItemsSketch[C]) { otherNumLevels, otherLevelsArr, otherItemsArr, s.itemsSketchOp.LessFn()) // notice that workbuf is being used as both the input and output - result := generalItemsCompress(s.k, s.m, provisionalNumLevels, workbuf, worklevels, workbuf, outlevels, s.isLevelZeroSorted, s.itemsSketchOp.LessFn()) + result := generalItemsCompress(s.k, s.m, provisionalNumLevels, workbuf, worklevels, workbuf, outlevels, s.isLevelZeroSorted, s.itemsSketchOp.LessFn(), s.deterministicOffsetForTest) targetItemCount := result[1] //was finalCapacity. Max size given k, m, numLevels curItemCount := result[2] //was finalPop @@ -844,9 +849,9 @@ func (s *ItemsSketch[C]) compressWhileUpdatingSketch() { }) } if popAbove == 0 { - randomlyHalveUpItems(myItemsArr, adjBeg, adjPop) + randomlyHalveUpItems(myItemsArr, adjBeg, adjPop, s.deterministicOffsetForTest) } else { - randomlyHalveDownItems(myItemsArr, adjBeg, adjPop) + randomlyHalveDownItems(myItemsArr, adjBeg, adjPop, s.deterministicOffsetForTest) mergeSortedItemsArrays( myItemsArr, adjBeg, halfAdjPop, myItemsArr, rawEnd, popAbove, @@ -980,10 +985,12 @@ func intCapAuxAux(k uint16, depth uint8) uint32 { return uint32(k) } -func randomlyHalveUpItems[C comparable](buf []C, start uint32, length uint32) { +func randomlyHalveUpItems[C comparable](buf []C, start uint32, length uint32, deterministicOffsetForTest bool) { halfLength := length / 2 - //offset := rand.Intn(2) - offset := 1 + offset := rand.Intn(2) + if deterministicOffsetForTest { + offset = deterministicOffset() + } j := (start + length) - 1 - uint32(offset) for i := (start + length) - 1; i >= (start + halfLength); i-- { buf[i] = buf[j] @@ -991,10 +998,12 @@ func randomlyHalveUpItems[C comparable](buf []C, start uint32, length uint32) { } } -func randomlyHalveDownItems[C comparable](buf []C, start uint32, length uint32) { +func randomlyHalveDownItems[C comparable](buf []C, start uint32, length uint32, deterministicOffsetForTest bool) { halfLength := length / 2 - //offset := rand.Intn(2) - offset := 1 + offset := rand.Intn(2) + if deterministicOffsetForTest { + offset = deterministicOffset() + } j := start + uint32(offset) for i := start; i < (start + halfLength); i++ { buf[i] = buf[j] @@ -1074,7 +1083,9 @@ func generalItemsCompress[C comparable]( outBuf []C, outLevels []uint32, isLevelZeroSorted bool, - lessFn common.LessFn[C]) []uint32 { + lessFn common.LessFn[C], + deterministicOffsetForTest bool, +) []uint32 { numLevels := numLevelsIn currentItemCount := inLevels[numLevels] - inLevels[0] // decreases with each compaction targetItemCount := computeTotalItemCapacity(k, m, numLevels) // increases if we add levels @@ -1131,9 +1142,9 @@ func generalItemsCompress[C comparable]( } if popAbove == 0 { - randomlyHalveUpItems(inBuf, adjBeg, adjPop) + randomlyHalveUpItems(inBuf, adjBeg, adjPop, deterministicOffsetForTest) } else { - randomlyHalveDownItems(inBuf, adjBeg, adjPop) + randomlyHalveDownItems(inBuf, adjBeg, adjPop, deterministicOffsetForTest) mergeSortedItemsArrays( inBuf, adjBeg, halfAdjPop, inBuf, rawLim, popAbove, @@ -1162,3 +1173,9 @@ func generalItemsCompress[C comparable]( return []uint32{uint32(numLevels), targetItemCount, currentItemCount} } + +func deterministicOffset() int { + result := nextOffsetForTest + nextOffsetForTest = 1 - nextOffsetForTest + return result +} diff --git a/kll/items_sketch_test.go b/kll/items_sketch_test.go index 0d36c95..473aeee 100644 --- a/kll/items_sketch_test.go +++ b/kll/items_sketch_test.go @@ -835,7 +835,7 @@ func TestItemsSketch_SerializeDeserializeMultipleValue(t *testing.T) { assert.Equal(t, mem, mem2) } -func TestSerializeDeserialize(t *testing.T) { +func TestSerializeDeserializeString(t *testing.T) { nArr := []int{0, 1, 10, 100, 1000, 10000, 100000, 1000000} serde := common.ArrayOfStringsSerDe{} for _, n := range nArr { @@ -888,3 +888,56 @@ func TestSerializeDeserialize(t *testing.T) { } } } + +func TestSerializeDeserializeFloat(t *testing.T) { + nArr := []int{0, 1, 10, 100, 1000, 10000, 100000, 1000000} + serde := common.ArrayOfDoublesSerDe{} + for _, n := range nArr { + sk, err := NewKllItemsSketchWithDefault[float64](serde) + assert.NoError(t, err) + for i := 1; i <= n; i++ { + sk.Update(float64(i)) + } + slc, err := sk.ToSlice() + assert.NoError(t, err) + + sketch, err := NewKllItemsSketchFromSlice[float64](slc, serde) + if err != nil { + return + } + + assert.Equal(t, sketch.GetK(), uint16(200)) + if n == 0 { + assert.True(t, sketch.IsEmpty()) + } else { + assert.False(t, sketch.IsEmpty()) + } + + if n > 100 { + assert.True(t, sketch.IsEstimationMode()) + } else { + assert.False(t, sketch.IsEstimationMode()) + } + + if n > 0 { + minV, err := sketch.GetMinItem() + assert.NoError(t, err) + assert.Equal(t, minV, float64(1)) + + maxV, err := sketch.GetMaxItem() + assert.NoError(t, err) + assert.Equal(t, maxV, float64(n)) + + weight := int64(0) + it := sketch.GetIterator() + lessFn := serde.LessFn() + for it.Next() { + qut := it.GetQuantile() + assert.True(t, lessFn(minV, qut) || minV == qut, fmt.Sprintf("min: \"%v\" \"%v\"", minV, qut)) + assert.True(t, !lessFn(maxV, qut) || maxV == qut, fmt.Sprintf("max: \"%v\" \"%v\"", maxV, qut)) + weight += it.GetWeight() + } + assert.Equal(t, weight, int64(n)) + } + } +} diff --git a/kll/items_sletch_serialization_test.go b/kll/items_sletch_serialization_test.go index 16ff817..4d01a7f 100644 --- a/kll/items_sletch_serialization_test.go +++ b/kll/items_sletch_serialization_test.go @@ -27,14 +27,17 @@ import ( ) func TestGenerateGoFiles(t *testing.T) { - if len(os.Getenv(internal.DSketchTestGenerateGo)) == 0 { - t.Skipf("%s not set", internal.DSketchTestGenerateGo) - } + //if len(os.Getenv(internal.DSketchTestGenerateGo)) == 0 { + // t.Skipf("%s not set", internal.DSketchTestGenerateGo) + //} + + os.Mkdir(internal.GoPath, 0755) nArr := []int{0, 1, 10, 100, 1000, 10000, 100000, 1000000} for _, n := range nArr { digits := numDigits(n) sk, err := NewKllItemsSketchWithDefault[string](common.ArrayOfStringsSerDe{}) + sk.deterministicOffsetForTest = true assert.NoError(t, err) for i := 1; i <= n; i++ { sk.Update(intToFixedLengthString(i, digits)) @@ -44,6 +47,19 @@ func TestGenerateGoFiles(t *testing.T) { err = os.WriteFile(fmt.Sprintf("%s/kll_string_n%d_go.sk", internal.GoPath, n), slc, 0644) assert.NoError(t, err) } + + for _, n := range nArr { + sk, err := NewKllItemsSketchWithDefault[float64](common.ArrayOfDoublesSerDe{}) + sk.deterministicOffsetForTest = true + assert.NoError(t, err) + for i := 1; i <= n; i++ { + sk.Update(float64(i)) + } + slc, err := sk.ToSlice() + assert.NoError(t, err) + err = os.WriteFile(fmt.Sprintf("%s/kll_double_n%d_go.sk", internal.GoPath, n), slc, 0644) + assert.NoError(t, err) + } } func TestJavaCompat(t *testing.T) { @@ -94,4 +110,51 @@ func TestJavaCompat(t *testing.T) { } } }) + + t.Run("Java KLL Double", func(t *testing.T) { + nArr := []int{0, 1, 10, 100, 1000, 10000, 100000, 1000000} + serde := common.ArrayOfDoublesSerDe{} + for _, n := range nArr { + bytes, err := os.ReadFile(fmt.Sprintf("%s/kll_double_n%d_java.sk", internal.JavaPath, n)) + assert.NoError(t, err) + sketch, err := NewKllItemsSketchFromSlice[float64](bytes, serde) + if err != nil { + return + } + + assert.Equal(t, sketch.GetK(), uint16(200)) + if n == 0 { + assert.True(t, sketch.IsEmpty()) + } else { + assert.False(t, sketch.IsEmpty()) + } + + if n > 100 { + assert.True(t, sketch.IsEstimationMode()) + } else { + assert.False(t, sketch.IsEstimationMode()) + } + + if n > 0 { + minV, err := sketch.GetMinItem() + assert.NoError(t, err) + assert.Equal(t, minV, float64(1)) + + maxV, err := sketch.GetMaxItem() + assert.NoError(t, err) + assert.Equal(t, maxV, float64(n)) + + weight := int64(0) + it := sketch.GetIterator() + lessFn := serde.LessFn() + for it.Next() { + qut := it.GetQuantile() + assert.True(t, lessFn(minV, qut) || minV == qut, fmt.Sprintf("min: \"%v\" \"%v\"", minV, qut)) + assert.True(t, !lessFn(maxV, qut) || maxV == qut, fmt.Sprintf("max: \"%v\" \"%v\"", maxV, qut)) + weight += it.GetWeight() + } + assert.Equal(t, weight, int64(n)) + } + } + }) } diff --git a/serialization_test_data/java_generated_files/kll_double_n0_java.sk b/serialization_test_data/java_generated_files/kll_double_n0_java.sk new file mode 100644 index 0000000..afd2209 Binary files /dev/null and b/serialization_test_data/java_generated_files/kll_double_n0_java.sk differ diff --git a/serialization_test_data/java_generated_files/kll_double_n1000000_java.sk b/serialization_test_data/java_generated_files/kll_double_n1000000_java.sk new file mode 100644 index 0000000..171a3a0 Binary files /dev/null and b/serialization_test_data/java_generated_files/kll_double_n1000000_java.sk differ diff --git a/serialization_test_data/java_generated_files/kll_double_n100000_java.sk b/serialization_test_data/java_generated_files/kll_double_n100000_java.sk new file mode 100644 index 0000000..5b57a7a Binary files /dev/null and b/serialization_test_data/java_generated_files/kll_double_n100000_java.sk differ diff --git a/serialization_test_data/java_generated_files/kll_double_n10000_java.sk b/serialization_test_data/java_generated_files/kll_double_n10000_java.sk new file mode 100644 index 0000000..0de0b17 Binary files /dev/null and b/serialization_test_data/java_generated_files/kll_double_n10000_java.sk differ diff --git a/serialization_test_data/java_generated_files/kll_double_n1000_java.sk b/serialization_test_data/java_generated_files/kll_double_n1000_java.sk new file mode 100644 index 0000000..a2737f4 Binary files /dev/null and b/serialization_test_data/java_generated_files/kll_double_n1000_java.sk differ diff --git a/serialization_test_data/java_generated_files/kll_double_n100_java.sk b/serialization_test_data/java_generated_files/kll_double_n100_java.sk new file mode 100644 index 0000000..b0d578a Binary files /dev/null and b/serialization_test_data/java_generated_files/kll_double_n100_java.sk differ diff --git a/serialization_test_data/java_generated_files/kll_double_n10_java.sk b/serialization_test_data/java_generated_files/kll_double_n10_java.sk new file mode 100644 index 0000000..fb526ef Binary files /dev/null and b/serialization_test_data/java_generated_files/kll_double_n10_java.sk differ diff --git a/serialization_test_data/java_generated_files/kll_double_n1_java.sk b/serialization_test_data/java_generated_files/kll_double_n1_java.sk new file mode 100644 index 0000000..5eceb40 Binary files /dev/null and b/serialization_test_data/java_generated_files/kll_double_n1_java.sk differ diff --git a/serialization_test_data/java_generated_files/kll_string_n1000000_java.sk b/serialization_test_data/java_generated_files/kll_string_n1000000_java.sk index 6ba10e5..e5d0a29 100644 Binary files a/serialization_test_data/java_generated_files/kll_string_n1000000_java.sk and b/serialization_test_data/java_generated_files/kll_string_n1000000_java.sk differ diff --git a/serialization_test_data/java_generated_files/kll_string_n100000_java.sk b/serialization_test_data/java_generated_files/kll_string_n100000_java.sk index 73fa4bb..e3382f2 100644 Binary files a/serialization_test_data/java_generated_files/kll_string_n100000_java.sk and b/serialization_test_data/java_generated_files/kll_string_n100000_java.sk differ diff --git a/serialization_test_data/java_generated_files/kll_string_n10000_java.sk b/serialization_test_data/java_generated_files/kll_string_n10000_java.sk index 644a3e4..8c4ba2d 100644 Binary files a/serialization_test_data/java_generated_files/kll_string_n10000_java.sk and b/serialization_test_data/java_generated_files/kll_string_n10000_java.sk differ diff --git a/serialization_test_data/java_generated_files/kll_string_n1000_java.sk b/serialization_test_data/java_generated_files/kll_string_n1000_java.sk index f16897c..858cac2 100644 Binary files a/serialization_test_data/java_generated_files/kll_string_n1000_java.sk and b/serialization_test_data/java_generated_files/kll_string_n1000_java.sk differ --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
