(datasketches-characterization) branch go-characterization updated: Rebase Go implemn from Java Job runners

placave Sat, 16 Mar 2024 10:20:38 -0700

This is an automated email from the ASF dual-hosted git repository.

placave pushed a commit to branch go-characterization
in repository 
https://gitbox.apache.org/repos/asf/datasketches-characterization.git



The following commit(s) were added to refs/heads/go-characterization by this 
push:
     new e7d8724  Rebase Go implemn from Java Job runners
e7d8724 is described below

commit e7d8724b878e79256db1d77b615681c8a2e20db3
Author: Pierre Lacave <[email protected]>
AuthorDate: Sat Mar 16 18:20:20 2024 +0100

    Rebase Go implemn from Java Job runners
---
 go/configs.go                         |  35 +++++
 go/distinct_count_accuracy_profile.go | 257 ++++++++++++++++++++++++----------
 go/hll_sketch_accuracy_profile.go     |  33 +++--
 go/job.go                             |  35 ++---
 go/main.go                            |   2 +-
 5 files changed, 243 insertions(+), 119 deletions(-)

diff --git a/go/configs.go b/go/configs.go
new file mode 100644
index 0000000..b63fa46
--- /dev/null
+++ b/go/configs.go
@@ -0,0 +1,35 @@
+package main
+
+import "github.com/apache/datasketches-go/hll"
+
+// distinctCountJobConfigType groups the settings of a distinct-count accuracy
+// job: the range of unique counts to measure, the trial schedule, the
+// quantiles sketch sizing and the runner that executes each trial.
+type distinctCountJobConfigType struct {
+       lgMinU int // log2 of the smallest number of uniques (the first plotting point)
+       lgMaxU int // log2 of the largest number of uniques
+       UPPO   int // uniques points per octave: x-resolution of the plotting points
+
+       lgMinT int // log2 of the trial count at which the first results are printed
+       lgMaxT int // log2 of the total number of trials
+       TPPO   int // trials points per octave: how often intermediate results are printed
+
+       lgQK      int  // size parameter of the quantiles sketch kept per plotting point
+       interData bool // when true, print a results table at each intermediate trials point
+
+       runner DistinctCountAccuracyProfileRunner // executes one trial over all plotting points
+}
+
+var (
+       // distinctCountJobConfig is the job configuration that main passes to
+       // NewDistinctCountAccuracyProfile.
+       distinctCountJobConfig = distinctCountJobConfigType{
+               lgMinU: 0,
+               lgMaxU: 20,
+               UPPO:   16,
+
+               lgMinT: 8,
+               lgMaxT: 20,
+               TPPO:   1,
+
+               lgQK:      12,
+               interData: true,
+
+               runner: NewHllSketchAccuracyRunner(4 /* lgK */, 
hll.TgtHllTypeHll8 /* tgtType */),
+       }
+)
diff --git a/go/distinct_count_accuracy_profile.go 
b/go/distinct_count_accuracy_profile.go
index cddaa54..f399731 100644
--- a/go/distinct_count_accuracy_profile.go
+++ b/go/distinct_count_accuracy_profile.go
@@ -18,112 +18,215 @@ package main
 
 import (
        "fmt"
+       "github.com/apache/datasketches-go/common"
+       "github.com/apache/datasketches-go/kll"
+       "math"
+       "strings"
        "time"
 )
 
+// Normalized ranks at which the error distribution is sampled: the fractions
+// of a standard normal distribution lying below -4 ... +4 standard deviations.
+const (
+       M4SD = 0.0000316712418331 //minus 4 StdDev
+       M3SD = 0.0013498980316301 //minus 3 StdDev
+       M2SD = 0.0227501319481792 //minus 2 StdDev
+       M1SD = 0.1586552539314570 //minus 1 StdDev
+       MED  = 0.5                //median
+       P1SD = 0.8413447460685430 //plus  1 StdDev
+       P2SD = 0.9772498680518210 //plus  2 StdDev
+       P3SD = 0.9986501019683700 //plus  3 StdDev
+       P4SD = 0.9999683287581670 //plus  4 StdDev
+)
+
+var (
+       // GAUSSIANS_4SD lists the ranks requested from each quantiles sketch when a
+       // results row is written: 0.0 and 1.0 (reported as Min and Max) around the
+       // -4..+4 standard deviation ranks.
+       GAUSSIANS_4SD = []float64{0.0, M4SD, M3SD, M2SD, M1SD, MED, P1SD, P2SD, 
P3SD, P4SD, 1.0}
+)
+
 type DistinctCountAccuracyProfileRunner interface {
-       runTrial(stats []accuracyStats, key uint64) uint64
+       runTrial(stats []*accuracyStats, key uint64) uint64
 }
 
 type accuracyStats struct {
-       trueValue   int
+       trueValue   uint64
        sumEst      float64
        sumRelErr   float64
        sumSqRelErr float64
        count       int
-       //kll_sketch<double> rel_err_distribution;
+       // Make that a sketch of float64
+       rel_err_distribution *kll.ItemsSketch[int64]
+}
+
+// update folds the estimate of one trial into the running statistics of this
+// plotting point.
+//
+// est is the estimated number of uniques reported by the sketch under test
+// once trueValue distinct keys have been presented to it.
+func (a *accuracyStats) update(est float64) {
+       a.sumEst += est
+       relativeError := est/float64(a.trueValue) - 1.0
+       a.sumRelErr += relativeError
+       a.sumSqRelErr += relativeError * relativeError
+       // Record the (rounded) estimate itself, not the relative error: the report
+       // divides every quantile by trueValue and subtracts 1, and converting a
+       // relative error to int64 would truncate nearly every sample to 0.
+       a.rel_err_distribution.Update(int64(math.Round(est)))
+       a.count++
+}
 
 type DistinctCountAccuracyProfile struct {
-       runner DistinctCountAccuracyProfileRunner
-       stats  []accuracyStats
+       config    distinctCountJobConfigType
+       runner    DistinctCountAccuracyProfileRunner
+       stats     []*accuracyStats
+       startTime int64
 }
 
-func NewDistinctCountAccuracyProfile(runner 
DistinctCountAccuracyProfileRunner) *DistinctCountAccuracyProfile {
+func NewDistinctCountAccuracyProfile(config distinctCountJobConfigType) 
*DistinctCountAccuracyProfile {
        return &DistinctCountAccuracyProfile{
-               runner,
-               make([]accuracyStats, 0),
+               config:    config,
+               runner:    config.runner,
+               stats:     buildLog2AccuracyStatsArray(config.lgMinU, 
config.lgMaxU, config.UPPO, config.lgQK),
+               startTime: time.Now().UnixMilli(),
        }
 }
 
 func (d *DistinctCountAccuracyProfile) run() {
-       const (
-               lgMinTrials       = 4
-               lgMaxTrials       = 16
-               trialsPpo         = 4
-               printIntermediate = true
-
-               minT      = uint64(1) << lgMinTrials
-               maxTrials = 1 << lgMaxTrials
-
-               lgMinCounts = 0
-               lgMaxCounts = 32
-               countsPpo   = 16
-
-               quantilesK = 10000
-       )
-
-       var (
-               numPoints = countPoints(lgMinCounts, lgMaxCounts, countsPpo)
-               p         = uint64(1) << lgMinCounts
-               key       = uint64(0)
-       )
-
-       for i := 0; i < numPoints; i++ {
-               d.stats = append(d.stats, accuracyStats{
-                       trueValue: quantilesK,
-                       //rel_err_distribution: p,
-               })
-               p = pwr2LawNext(countsPpo, p)
-       }
+       minT := 1 << d.config.lgMinT
+       maxT := 1 << d.config.lgMaxT
+       maxU := 1 << d.config.lgMaxU
 
-       startTime := time.Now().UnixMilli()
+       vIn := uint64(0)
 
-       // this will generate a table of data up to each intermediate number of 
trials
-       lastTrials := uint64(0)
-       for lastTrials < maxTrials {
-               nextTrials := minT
-               if lastTrials != 0 {
-                       nextTrials = pwr2LawNext(trialsPpo, lastTrials)
+       // This will generate a table of data for each intermediate Trials point
+       lastTpt := 0
+       for lastTpt < maxT {
+               nextT := lastTpt
+               if lastTpt == 0 {
+                       nextT = minT
+               } else {
+                       nextT = int(pwr2SeriesNext(d.config.TPPO, 
uint64(lastTpt)))
+               }
+               delta := nextT - lastTpt
+               for i := 0; i < delta; i++ {
+                       vIn = d.runner.runTrial(d.stats, vIn)
                }
-               delta := nextTrials - lastTrials
-               for i := uint64(0); i < delta; i++ {
-                       key = d.runner.runTrial(d.stats, key)
+               lastTpt = nextT
+               sb := &strings.Builder{}
+               if nextT < maxT {
+                       if d.config.interData {
+                               fmt.Println(getHeader())
+                               process(d.stats, lastTpt, sb)
+                               fmt.Println(sb.String())
+                       }
+               } else {
+                       fmt.Println(getHeader())
+                       process(d.stats, lastTpt, sb)
+                       fmt.Println(sb.String())
                }
-               lastTrials = nextTrials
-               if printIntermediate || nextTrials == maxTrials {
-                       d.printStats()
+
+               fmt.Printf("Config:             : %+v\n", d.config)
+               fmt.Printf("Cum Trials          : %d\n", lastTpt)
+               fmt.Printf("Cum Updates         : %d\n", vIn)
+               currentTime_mS := time.Now().UnixMilli()
+               cumTime_mS := currentTime_mS - d.startTime
+               fmt.Printf("Cum Time            : %s\n", 
time.Duration(cumTime_mS*1000*1000))
+               timePerTrial_mS := float64(cumTime_mS) / float64(lastTpt)
+               avgUpdateTime_ns := timePerTrial_mS * 1e6 / float64(maxU)
+               fmt.Printf("Time Per Trial, mSec: %f\n", timePerTrial_mS)
+               fmt.Printf("Avg Update Time, nSec: %f\n", avgUpdateTime_ns)
+               fmt.Printf("Date Time           : %s\n", 
time.Now().Format(time.RFC3339))
+               timeToComplete_mS := int64(timePerTrial_mS * 
float64(maxT-lastTpt))
+               fmt.Printf("Est Time to Complete: %s\n", 
time.Duration(timeToComplete_mS*1000*1000))
+               fmt.Printf("Est Time at Completion: %s\n", 
time.Now().Add(time.Duration(timeToComplete_mS*1000*1000)).Format(time.RFC3339))
+               fmt.Println("")
+       }
+}
+
+// process renders the accumulated statistics as tab-separated text, one row
+// per plotting point, with columns in the order given by getHeader.
+//
+// qArr holds the per-point accumulators and cumTrials is the number of trials
+// folded into them so far; it is the divisor for every mean. sb is reset first,
+// so on return it contains only the freshly rendered rows.
+func process(qArr []*accuracyStats, cumTrials int, sb *strings.Builder) {
+       sb.Reset()
+       trials := float64(cumTrials)
+       for _, q := range qArr {
+               trueUniques := q.trueValue
+
+               meanEst := q.sumEst / trials
+               meanRelErr := q.sumRelErr / trials
+               // sumSqRelErr accumulates squared relative errors, which are already
+               // normalized by the true value, so the RMS relative error is simply
+               // the root of their mean. Dividing by trueUniques^2 again would
+               // normalize twice and shrink the result for every point above 1.
+               rmsRelErr := math.Sqrt(q.sumSqRelErr / trials)
+
+               fmt.Fprintf(sb, "%d\t%e\t%e\t%e\t%d\t", trueUniques, meanEst, meanRelErr, rmsRelErr, cumTrials)
+
+               // One column per rank in GAUSSIANS_4SD, each quantile expressed as a
+               // relative error against the true value.
+               // NOTE(review): the error is discarded; presumably quants is empty in
+               // that case and the quantile columns are skipped - confirm.
+               quants, _ := q.rel_err_distribution.GetQuantiles(GAUSSIANS_4SD, true)
+               for _, quant := range quants {
+                       fmt.Fprintf(sb, "%e\t", float64(quant)/float64(trueUniques)-1.0)
                }
 
-               fmt.Println("Cum Trials             : ", lastTrials)
-               fmt.Println("Cum Updates            : ", key)
-               cumTimeMs := time.Now().UnixMilli() - startTime
-               fmt.Println("Cum Time, ms           : ", cumTimeMs)
-               timePerTrialMs := float64(cumTimeMs) / float64(lastTrials)
-               fmt.Println("Avg Time Per Trial, ms : ", timePerTrialMs)
-               currentTime := time.Now()
-               fmt.Println("Current time           : ", currentTime)
-               timeToCompleteMs := float64(cumTimeMs) / float64(lastTrials) * 
float64(maxTrials-lastTrials)
-               estCompletionTime := 
currentTime.Add(time.Duration(timeToCompleteMs) * time.Millisecond)
-               fmt.Println("Est Time of Completion : ", estCompletionTime)
-               fmt.Println()
+               // Bytes and ReMerit are not measured; emit placeholder zeros.
+               sb.WriteString("0\t0\n")
        }
 }
 
-func (d *DistinctCountAccuracyProfile) printStats() {
-       for _, stat := range d.stats {
-               fmt.Println(stat.trueValue)
-               fmt.Println(stat.count)
-               fmt.Println(stat.sumEst)
-               fmt.Println(stat.sumRelErr)
-               fmt.Println(stat.sumSqRelErr)
-               // quantiles
-               //const auto quants = stat.get_quantiles(FRACTIONS, FRACT_LEN);
-               //for (size_t i = 0; i < FRACT_LEN; i++) {
-               //      const double quantile = quants[i];
-               //      std::cout << quantile;
-               //      if (i != FRACT_LEN - 1) std::cout << "\t";
-               //}
-               fmt.Println()
+// getHeader returns the tab-separated column titles of the results table, in
+// the same order as the values written for each row.
+func getHeader() string {
+       columns := []string{
+               "TrueU",
+               "MeanEst",
+               "MeanRelErr",
+               "RMS_RE",
+               "Trials",
+               "Min",
+               "Q(.0000317)",
+               "Q(.00135)",
+               "Q(.02275)",
+               "Q(.15866)",
+               "Q(.5)",
+               "Q(.84134)",
+               "Q(.97725)",
+               "Q(.99865)",
+               "Q(.9999683)",
+               "Max",
+               "Bytes",
+               "ReMerit",
+       }
+       return strings.Join(columns, "\t")
+}
+
+// buildLog2AccuracyStatsArray creates one accuracyStats accumulator per
+// plotting point of the power-of-2 series that starts at 2^lgMin, with ppo
+// points per octave; the number of points comes from countPoints.
+//
+// lgQK is the log2 of the size (k) of the quantiles sketch attached to each
+// point. The function panics if a sketch cannot be created (for example when
+// 2^lgQK does not fit the sketch's k), because an accumulator without a sketch
+// would only fail later, on its first update.
+func buildLog2AccuracyStatsArray(lgMin, lgMax, ppo, lgQK int) []*accuracyStats {
+       qLen := countPoints(lgMin, lgMax, ppo)
+       qArr := make([]*accuracyStats, qLen)
+       // lgQK is a log2 value: expand it to the actual sketch size before use.
+       qK := uint16(1) << lgQK
+       p := uint64(1) << lgMin
+       for i := 0; i < qLen; i++ {
+               kllSketch, err := kll.NewKllItemsSketch[int64](qK, 8, common.ArrayOfLongsSerDe{})
+               if err != nil {
+                       panic(fmt.Sprintf("building quantiles sketch (lgQK=%d): %v", lgQK, err))
+               }
+               qArr[i] = &accuracyStats{
+                       trueValue:            p,
+                       rel_err_distribution: kllSketch,
+               }
+               p = pwr2SeriesNext(ppo, p)
        }
+       return qArr
 }
diff --git a/go/hll_sketch_accuracy_profile.go 
b/go/hll_sketch_accuracy_profile.go
index cd7ccfa..8d667c9 100644
--- a/go/hll_sketch_accuracy_profile.go
+++ b/go/hll_sketch_accuracy_profile.go
@@ -17,29 +17,34 @@
 
 package main
 
-import "github.com/apache/datasketches-go/hll"
+import (
+       "github.com/apache/datasketches-go/hll"
+)
 
-type HllSketchAccuracyProfile struct {
+type HllSketchAccuracyRunner struct {
+       sketch hll.HllSketch
 }
 
-func NewHllSketchAccuracyProfile() *HllSketchAccuracyProfile {
-       return &HllSketchAccuracyProfile{}
+// NewHllSketchAccuracyRunner builds a runner around a single HLL sketch of the
+// given lgK and target type; every trial resets and reuses that sketch.
+//
+// It panics if the sketch cannot be created, because a runner without a sketch
+// would only fail later, on its first trial.
+func NewHllSketchAccuracyRunner(lgK int, tgtType hll.TgtHllType) *HllSketchAccuracyRunner {
+       sketch, err := hll.NewHllSketch(lgK, tgtType)
+       if err != nil {
+               panic(err)
+       }
+       return &HllSketchAccuracyRunner{
+               sketch: sketch,
+       }
 }
 
-func (HllSketchAccuracyProfile) runTrial(stats []accuracyStats, key uint64) 
uint64 {
-       lgK := 12
-
-       s, _ := hll.NewHllSketch(lgK, hll.TgtHllTypeDefault)
-       count := 0
+func (h *HllSketchAccuracyRunner) runTrial(stats []*accuracyStats, key uint64) 
uint64 {
+       h.sketch.Reset()
 
+       lastUniques := uint64(0)
        for _, stat := range stats {
-               delta := stat.trueValue - count
-               for i := 0; i < delta; i++ {
-                       s.UpdateUInt64(key)
+               delta := stat.trueValue - lastUniques
+               for u := uint64(0); u < delta; u++ {
+                       h.sketch.UpdateUInt64(key)
                        key++
                }
-               count += delta
-               //stat.update(s.get_estimate())
+               lastUniques += delta
+               est, _ := h.sketch.GetEstimate()
+               stat.update(est)
        }
 
        return key
diff --git a/go/job.go b/go/job.go
index 7fc3a5b..38f19f4 100644
--- a/go/job.go
+++ b/go/job.go
@@ -23,53 +23,34 @@ type JobProfile interface {
        run()
 }
 
-// countPoints Counts the actual number of plotting points between lgStart and 
lgEnd assuming the given PPO.
+// countPoints returns the actual number of plotting points between lgStart and
+// lgEnd, assuming the given PPO (points per octave) and a log base of 2.
 // This is not a simple linear function due to points that may be skipped in 
the low range.
-// param lgStart Log2 of the starting value
-// param lgEnd Log2 of the ending value
-// param ppo the number of logarithmically evenly spaced points per octave.
-// returns the actual number of plotting points between lgStart and lgEnd.
 func countPoints(lgStart, lgEnd, ppo int) int {
        p := uint64(1) << lgStart
        end := uint64(1) << lgEnd
        count := 0
        for p <= end {
-               p = pwr2LawNext(ppo, p)
+               p = pwr2SeriesNext(ppo, p)
                count++
        }
        return count
 }
 
-// pwr2LawNext Computes the next larger integer point in the power series
-// point = 2^( i / ppo ) given the current point in the series.
-// For illustration, this can be used in a loop as follows:
-//
-//     int maxP = 1024;
-//     int minP = 1;
-//     int ppo = 2;
-//
-//     for (int p = minP; p <= maxP; p = pwr2LawNext(ppo, p)) {
-//       System.out.print(p + " ");
-//     }
-//     //generates the following series:
-//     //1 2 3 4 6 8 11 16 23 32 45 64 91 128 181 256 362 512 724 1024
-//
-// param ppo Points-Per-Octave, or the number of points per integer powers of 
2 in the series.
-// param curPoint the current point of the series. Must be &ge; 1.
-// returns the next point in the power series.
-func pwr2LawNext(ppo int, curPoint uint64) uint64 {
+func pwr2SeriesNext(ppo int, curPoint uint64) uint64 {
        cur := curPoint
        if cur < 1 {
                cur = 1
        }
-       gi := int64(math.Log2(float64(cur)) * float64(ppo))
+       gi := int(math.Round(math.Log2(float64(cur)) * float64(ppo)))
+
        var next uint64
        for {
-               next = uint64(math.Pow(2.0, float64(gi)/float64(ppo)))
+               gi++
+               next = uint64(math.Round(math.Pow(2.0, 
float64(gi)/float64(ppo))))
                if next > curPoint {
                        break
                }
-               gi++
        }
        return next
 }
diff --git a/go/main.go b/go/main.go
index 5733394..383b915 100644
--- a/go/main.go
+++ b/go/main.go
@@ -20,7 +20,7 @@ package main
 func main() {
 
        jobs := map[string]JobProfile{
-               "distinct_count_accuracy_profile": 
NewDistinctCountAccuracyProfile(NewHllSketchAccuracyProfile()),
+               "distinct_count_accuracy_profile": 
NewDistinctCountAccuracyProfile(distinctCountJobConfig),
        }
 
        for _, job := range jobs {


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(datasketches-characterization) branch go-characterization updated: Rebase Go implemn from Java Job runners

Reply via email to