[jira] [Updated] (SPARK-30882) Inaccurate results even with higher precision ApproximatePercentile results
[ https://issues.apache.org/jira/browse/SPARK-30882?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Nandini malempati updated SPARK-30882: -- Description: Results of ApproximatePercentile should have better accuracy with increased precision as per the documentation provided here: [https://github.com/apache/spark/blob/master/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala] But i'm seeing nondeterministic behavior . On a data set of size 450 with Accuracy 100 is returning better results than 500 (P25 and P90 looks fine but P75 is very off). And accuracy with 700 gives exact results. But this behavior is not consistent. {code:scala} // Some comments here package com.microsoft.teams.test.utils import org.apache.spark.sql.{Column, DataFrame, Row, SparkSession} import org.scalatest.{Assertions, FunSpec} import org.apache.spark.sql.functions.{lit, _} import org.apache.spark.sql.catalyst.expressions.aggregate.ApproximatePercentile import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType} import scala.collection.mutable.ListBuffer class PercentilesTest extends FunSpec with Assertions { it("check percentiles with different precision") { val schema = List(StructField("MetricName", StringType), StructField("DataPoint", IntegerType)) val data = new ListBuffer[Row] for(i <- 1 to 450) { data += Row("metric", i)} import spark.implicits._ val df = createDF(schema, data.toSeq) val accuracy1000 = df.groupBy("MetricName").agg(percentile_approx($"DataPoint", typedLit(Seq(0.1, 0.25, 0.75, 0.9)), ApproximatePercentile.DEFAULT_PERCENTILE_ACCURACY)) val accuracy1M = df.groupBy("MetricName").agg(percentile_approx($"DataPoint", typedLit(Seq(0.1, 0.25, 0.75, 0.9)), 100)) val accuracy5M = df.groupBy("MetricName").agg(percentile_approx($"DataPoint", typedLit(Seq(0.1, 0.25, 0.75, 0.9)), 500)) val accuracy7M = df.groupBy("MetricName").agg(percentile_approx($"DataPoint", typedLit(Seq(0.1, 
0.25, 0.75, 0.9)), 700)) val accuracy10M = df.groupBy("MetricName").agg(percentile_approx($"DataPoint", typedLit(Seq(0.1, 0.25, 0.75, 0.9)), 1000)) accuracy1000.show(1, false) accuracy1M.show(1, false) accuracy5M.show(1, false) accuracy7M.show(1, false) accuracy10M.show(1, false) } def percentile_approx(col: Column, percentage: Column, accuracy: Column): Column = { val expr = new ApproximatePercentile( col.expr, percentage.expr, accuracy.expr ).toAggregateExpression new Column(expr) } def percentile_approx(col: Column, percentage: Column, accuracy: Int): Column = percentile_approx( col, percentage, lit(accuracy) ) lazy val spark: SparkSession = { SparkSession .builder() .master("local") .appName("spark tests") .getOrCreate() } def createDF(schema: List[StructField], data: Seq[Row]): DataFrame = { spark.createDataFrame( spark.sparkContext.parallelize(data), StructType(schema)) } } {code} Above is a test run to reproduce the error. In this example with accuracy 500 , P25 and P90 looks fine. But P75 is the max of the column. +--+--+ |MetricName|percentile_approx(DataPoint, [0.1,0.25,0.75,0.9], 500)| +--+--+ |metric |[45, 1125000, 450, 405] | +--+--+ This is breaking our reports as there is no proper definition of accuracy . we have data sets of size more than 2700. After studying the pattern found that inaccurate percentiles always have "max" of the column as value. P50 and P99 might be right in few cases but P75 can be wrong. Is there a way to define what the correct accuracy would be for a given dataset size ? was: Results of ApproximatePercentile should have better accuracy with increased precision as per the documentation provided here: [https://github.com/apache/spark/blob/master/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala] But i'm seeing nondeterministic behavior . On a data set of size 450 with Accuracy 100 is returning better results than 500. And accuracy with 700 gives exact results. 
But this behavior is not consistent. {code:scala} // Some comments here package com.microsoft.teams.test.utils import org.apache.spark.sql.{Column, DataFrame, Row, SparkSession} import org.scalatest.{Assertions, FunSpec} import org.apache.spark.sql.functions.{lit, _} import org.apache.spark.sql.catalyst.expressions.aggregate.ApproximatePercentile import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType} import
[jira] [Updated] (SPARK-30882) Inaccurate results even with higher precision ApproximatePercentile results
[ https://issues.apache.org/jira/browse/SPARK-30882?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Nandini malempati updated SPARK-30882: -- Description: Results of ApproximatePercentile should have better accuracy with increased precision as per the documentation provided here: [https://github.com/apache/spark/blob/master/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala] But i'm seeing nondeterministic behavior . On a data set of size 450 with Accuracy 100 is returning better results than 500 (P25 and P90 looks fine but P75 is very off). And accuracy with 700 gives exact results. But this behavior is not consistent. {code:scala} // Some comments here package com.microsoft.teams.test.utils import org.apache.spark.sql.{Column, DataFrame, Row, SparkSession} import org.scalatest.{Assertions, FunSpec} import org.apache.spark.sql.functions.{lit, _} import org.apache.spark.sql.catalyst.expressions.aggregate.ApproximatePercentile import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType} import scala.collection.mutable.ListBuffer class PercentilesTest extends FunSpec with Assertions { it("check percentiles with different precision") { val schema = List(StructField("MetricName", StringType), StructField("DataPoint", IntegerType)) val data = new ListBuffer[Row] for(i <- 1 to 450) { data += Row("metric", i)} import spark.implicits._ val df = createDF(schema, data.toSeq) val accuracy1000 = df.groupBy("MetricName").agg(percentile_approx($"DataPoint", typedLit(Seq(0.1, 0.25, 0.75, 0.9)), ApproximatePercentile.DEFAULT_PERCENTILE_ACCURACY)) val accuracy1M = df.groupBy("MetricName").agg(percentile_approx($"DataPoint", typedLit(Seq(0.1, 0.25, 0.75, 0.9)), 100)) val accuracy5M = df.groupBy("MetricName").agg(percentile_approx($"DataPoint", typedLit(Seq(0.1, 0.25, 0.75, 0.9)), 500)) val accuracy7M = df.groupBy("MetricName").agg(percentile_approx($"DataPoint", typedLit(Seq(0.1, 
0.25, 0.75, 0.9)), 700)) val accuracy10M = df.groupBy("MetricName").agg(percentile_approx($"DataPoint", typedLit(Seq(0.1, 0.25, 0.75, 0.9)), 1000)) accuracy1000.show(1, false) accuracy1M.show(1, false) accuracy5M.show(1, false) accuracy7M.show(1, false) accuracy10M.show(1, false) } def percentile_approx(col: Column, percentage: Column, accuracy: Column): Column = { val expr = new ApproximatePercentile( col.expr, percentage.expr, accuracy.expr ).toAggregateExpression new Column(expr) } def percentile_approx(col: Column, percentage: Column, accuracy: Int): Column = percentile_approx( col, percentage, lit(accuracy) ) lazy val spark: SparkSession = { SparkSession .builder() .master("local") .appName("spark tests") .getOrCreate() } def createDF(schema: List[StructField], data: Seq[Row]): DataFrame = { spark.createDataFrame( spark.sparkContext.parallelize(data), StructType(schema)) } } {code} Above is a test run to reproduce the error. Below are few runs with different accuracies. P25 and P90 looks fine. But P75 is the max of the column. +--+--+ |MetricName|percentile_approx(DataPoint, [0.1,0.25,0.75,0.9], 700)| +--+--+ |metric |[45, 1125000, 3375000, 405] | +--+--+ +--+--+ |MetricName|percentile_approx(DataPoint, [0.1,0.25,0.75,0.9], 500)| +--+--+ |metric |[45, 1125000, 450, 405] | +--+--+ +--+--+ |MetricName|percentile_approx(DataPoint, [0.1,0.25,0.75,0.9], 100)| +--+--+ |metric |[45, 1124998, 3374996, 405] | +--+--+ +--++ |MetricName|percentile_approx(DataPoint, [0.1,0.25,0.75,0.9], 1)| +--++ |metric |[45, 1124848, 3374638, 405] | +--++ This is breaking our reports as there is no proper definition of accuracy . we have data sets of size more than 2700. After studying the pattern found that inaccurate percentiles always have "max" of the column as
[jira] [Updated] (SPARK-30882) Inaccurate results even with higher precision ApproximatePercentile results
[ https://issues.apache.org/jira/browse/SPARK-30882?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Nandini malempati updated SPARK-30882: -- Description: Results of ApproximatePercentile should have better accuracy with increased precision as per the documentation provided here: [https://github.com/apache/spark/blob/master/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala] But i'm seeing nondeterministic behavior . On a data set of size 450 with Accuracy 100 is returning better results than 500. And accuracy with 700 gives exact results. But this behavior is not consistent. {code:scala} // Some comments here package com.microsoft.teams.test.utilsimport org.apache.spark.sql.\{Column, DataFrame, Row, SparkSession} import org.scalatest.\{Assertions, FunSpec} import org.apache.spark.sql.functions.\{lit, _} import org.apache.spark.sql.catalyst.expressions.aggregate.ApproximatePercentile import org.apache.spark.sql.types.\{IntegerType, StringType, StructField, StructType}import scala.collection.mutable.ListBuffer class PercentilesTest extends FunSpec with Assertions \{ it("check percentiles with different precision") { val schema = List(StructField("MetricName", StringType), StructField("DataPoint", IntegerType)) val data = new ListBuffer[Row] for(i <- 1 to 450) { data += Row("metric", i)} import spark.implicits._ val df = createDF(schema, data.toSeq) val accuracy1000 = df.groupBy("MetricName").agg(percentile_approx($"DataPoint", typedLit(Seq(0.1, 0.25, 0.75, 0.9)), ApproximatePercentile.DEFAULT_PERCENTILE_ACCURACY)) val accuracy1M = df.groupBy("MetricName").agg(percentile_approx($"DataPoint", typedLit(Seq(0.1, 0.25, 0.75, 0.9)), 100)) val accuracy5M = df.groupBy("MetricName").agg(percentile_approx($"DataPoint", typedLit(Seq(0.1, 0.25, 0.75, 0.9)), 500)) val accuracy7M = df.groupBy("MetricName").agg(percentile_approx($"DataPoint", typedLit(Seq(0.1, 0.25, 0.75, 0.9)), 700)) val accuracy10M = 
df.groupBy("MetricName").agg(percentile_approx($"DataPoint", typedLit(Seq(0.1, 0.25, 0.75, 0.9)), 1000)) accuracy1000.show(1, false) accuracy1M.show(1, false) accuracy5M.show(1, false) accuracy7M.show(1, false) accuracy10M.show(1, false) } def percentile_approx(col: Column, percentage: Column, accuracy: Column): Column = \{ val expr = new ApproximatePercentile( col.expr, percentage.expr, accuracy.expr ).toAggregateExpression new Column(expr) } def percentile_approx(col: Column, percentage: Column, accuracy: Int): Column = percentile_approx( col, percentage, lit(accuracy) ) lazy val spark: SparkSession = \{ SparkSession .builder() .master("local") .appName("spark tests") .getOrCreate() } def createDF(schema: List[StructField], data: Seq[Row]): DataFrame = \{ spark.createDataFrame( spark.sparkContext.parallelize(data), StructType(schema)) } } {code} Above is a test run to reproduce the error. In this example with accuracy 500 , P25 and P90 looks fine. But P75 is the max of the column. +--+--+ |MetricName|percentile_approx(DataPoint, [0.1,0.25,0.75,0.9], 500)| +--+--+ |metric |[45, 1125000, 450, 405] | +--+--+ This is breaking our reports as there is no proper definition of accuracy . we have data sets of size more than 2700. After studying the pattern found that inaccurate percentiles always have "max" of the column as value. P50 and P99 might be right in few cases but P75 can be wrong. Is there a way to define what the correct accuracy would be for a given dataset size ? was: Results of ApproximatePercentile should have better accuracy with increased precision as per the documentation provided here: [https://github.com/apache/spark/blob/master/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala] But i'm seeing nondeterministic behavior . On a data set of size 450 with Accuracy 100 is returning better results than 500. And accuracy with 700 gives exact results. But this behavior is not consistent. 
{code:java} // code placeholder {package com.microsoft.teams.test.utils import org.apache.spark.sql.{Column, DataFrame, Row, SparkSession} import org.scalatest.{Assertions, FunSpec} import org.apache.spark.sql.functions.{lit, _} import org.apache.spark.sql.catalyst.expressions.aggregate.ApproximatePercentile import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType} import scala.collection.mutable.ListBuffer class PercentilesTest extends FunSpec with Assertions { it("check percentiles with different precision") { val schema = List(StructField("MetricName", StringType),
[jira] [Updated] (SPARK-30882) Inaccurate results even with higher precision ApproximatePercentile results
[ https://issues.apache.org/jira/browse/SPARK-30882?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Nandini malempati updated SPARK-30882: -- Description: Results of ApproximatePercentile should have better accuracy with increased precision as per the documentation provided here: [https://github.com/apache/spark/blob/master/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala] But i'm seeing nondeterministic behavior . On a data set of size 450 with Accuracy 100 is returning better results than 500. And accuracy with 700 gives exact results. But this behavior is not consistent. {code:scala} // Some comments here package com.microsoft.teams.test.utils import org.apache.spark.sql.{Column, DataFrame, Row, SparkSession} import org.scalatest.{Assertions, FunSpec} import org.apache.spark.sql.functions.{lit, _} import org.apache.spark.sql.catalyst.expressions.aggregate.ApproximatePercentile import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType} import scala.collection.mutable.ListBuffer class PercentilesTest extends FunSpec with Assertions { it("check percentiles with different precision") { val schema = List(StructField("MetricName", StringType), StructField("DataPoint", IntegerType)) val data = new ListBuffer[Row] for(i <- 1 to 450) { data += Row("metric", i)} import spark.implicits._ val df = createDF(schema, data.toSeq) val accuracy1000 = df.groupBy("MetricName").agg(percentile_approx($"DataPoint", typedLit(Seq(0.1, 0.25, 0.75, 0.9)), ApproximatePercentile.DEFAULT_PERCENTILE_ACCURACY)) val accuracy1M = df.groupBy("MetricName").agg(percentile_approx($"DataPoint", typedLit(Seq(0.1, 0.25, 0.75, 0.9)), 100)) val accuracy5M = df.groupBy("MetricName").agg(percentile_approx($"DataPoint", typedLit(Seq(0.1, 0.25, 0.75, 0.9)), 500)) val accuracy7M = df.groupBy("MetricName").agg(percentile_approx($"DataPoint", typedLit(Seq(0.1, 0.25, 0.75, 0.9)), 700)) val accuracy10M = 
df.groupBy("MetricName").agg(percentile_approx($"DataPoint", typedLit(Seq(0.1, 0.25, 0.75, 0.9)), 1000)) accuracy1000.show(1, false) accuracy1M.show(1, false) accuracy5M.show(1, false) accuracy7M.show(1, false) accuracy10M.show(1, false) } def percentile_approx(col: Column, percentage: Column, accuracy: Column): Column = { val expr = new ApproximatePercentile( col.expr, percentage.expr, accuracy.expr ).toAggregateExpression new Column(expr) } def percentile_approx(col: Column, percentage: Column, accuracy: Int): Column = percentile_approx( col, percentage, lit(accuracy) ) lazy val spark: SparkSession = { SparkSession .builder() .master("local") .appName("spark tests") .getOrCreate() } def createDF(schema: List[StructField], data: Seq[Row]): DataFrame = { spark.createDataFrame( spark.sparkContext.parallelize(data), StructType(schema)) } } {code} Above is a test run to reproduce the error. In this example with accuracy 500 , P25 and P90 looks fine. But P75 is the max of the column. +--+--+ |MetricName|percentile_approx(DataPoint, [0.1,0.25,0.75,0.9], 500)| +--+--+ |metric |[45, 1125000, 450, 405] | +--+--+ This is breaking our reports as there is no proper definition of accuracy . we have data sets of size more than 2700. After studying the pattern found that inaccurate percentiles always have "max" of the column as value. P50 and P99 might be right in few cases but P75 can be wrong. Is there a way to define what the correct accuracy would be for a given dataset size ? was: Results of ApproximatePercentile should have better accuracy with increased precision as per the documentation provided here: [https://github.com/apache/spark/blob/master/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala] But i'm seeing nondeterministic behavior . On a data set of size 450 with Accuracy 100 is returning better results than 500. And accuracy with 700 gives exact results. But this behavior is not consistent. 
{code:scala} // Some comments here package com.microsoft.teams.test.utils import org.apache.spark.sql.{Column, DataFrame, Row, SparkSession} import org.scalatest.{Assertions, FunSpec} import org.apache.spark.sql.functions.{lit, _} import org.apache.spark.sql.catalyst.expressions.aggregate.ApproximatePercentile import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType} import scala.collection.mutable.ListBuffer class
[jira] [Updated] (SPARK-30882) Inaccurate results even with higher precision ApproximatePercentile results
[ https://issues.apache.org/jira/browse/SPARK-30882?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Nandini malempati updated SPARK-30882: -- Description: Results of ApproximatePercentile should have better accuracy with increased precision as per the documentation provided here: [https://github.com/apache/spark/blob/master/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala] But i'm seeing nondeterministic behavior . On a data set of size 450 with Accuracy 100 is returning better results than 500. And accuracy with 700 gives exact results. But this behavior is not consistent. {code:java} // code placeholder {package com.microsoft.teams.test.utilsimport org.apache.spark.sql.\{Column, DataFrame, Row, SparkSession} import org.scalatest.\{Assertions, FunSpec} import org.apache.spark.sql.functions.\{lit, _} import org.apache.spark.sql.catalyst.expressions.aggregate.ApproximatePercentile import org.apache.spark.sql.types.\{IntegerType, StringType, StructField, StructType}import scala.collection.mutable.ListBuffer class PercentilesTest extends FunSpec with Assertions \{ it("check percentiles with different precision") { val schema = List(StructField("MetricName", StringType), StructField("DataPoint", IntegerType)) val data = new ListBuffer[Row] for(i <- 1 to 450) { data += Row("metric", i)} import spark.implicits._ val df = createDF(schema, data.toSeq) val accuracy1000 = df.groupBy("MetricName").agg(percentile_approx($"DataPoint", typedLit(Seq(0.1, 0.25, 0.75, 0.9)), ApproximatePercentile.DEFAULT_PERCENTILE_ACCURACY)) val accuracy1M = df.groupBy("MetricName").agg(percentile_approx($"DataPoint", typedLit(Seq(0.1, 0.25, 0.75, 0.9)), 100)) val accuracy5M = df.groupBy("MetricName").agg(percentile_approx($"DataPoint", typedLit(Seq(0.1, 0.25, 0.75, 0.9)), 500)) val accuracy7M = df.groupBy("MetricName").agg(percentile_approx($"DataPoint", typedLit(Seq(0.1, 0.25, 0.75, 0.9)), 700)) val accuracy10M = 
df.groupBy("MetricName").agg(percentile_approx($"DataPoint", typedLit(Seq(0.1, 0.25, 0.75, 0.9)), 1000)) accuracy1000.show(1, false) accuracy1M.show(1, false) accuracy5M.show(1, false) accuracy7M.show(1, false) accuracy10M.show(1, false) } def percentile_approx(col: Column, percentage: Column, accuracy: Column): Column = \{ val expr = new ApproximatePercentile( col.expr, percentage.expr, accuracy.expr ).toAggregateExpression new Column(expr) } def percentile_approx(col: Column, percentage: Column, accuracy: Int): Column = percentile_approx( col, percentage, lit(accuracy) ) lazy val spark: SparkSession = \{ SparkSession .builder() .master("local") .appName("spark tests") .getOrCreate() } def createDF(schema: List[StructField], data: Seq[Row]): DataFrame = \{ spark.createDataFrame( spark.sparkContext.parallelize(data), StructType(schema)) } }} Above is a test run to reproduce the error. In this example with accuracy 500 , P25 and P90 looks fine. But P75 is the max of the column. +--+--+ |MetricName|percentile_approx(DataPoint, [0.1,0.25,0.75,0.9], 500)| +--+--+ |metric |[45, 1125000, 450, 405] | +--+--+ This is breaking our reports as there is no proper definition of accuracy . we have data sets of size more than 2700. After studying the pattern found that inaccurate percentiles always have "max" of the column as value. P50 and P99 might be right in few cases but P75 can be wrong. Is there a way to define what the correct accuracy would be for a given dataset size ? was: Results of ApproximatePercentile should have better accuracy with increased precision as per the documentation provided here: [https://github.com/apache/spark/blob/master/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala] But i'm seeing nondeterministic behavior . On a data set of size 450 with Accuracy 100 is returning better results than 500. And accuracy with 700 gives exact results. But this behavior is not consistent. 
{code:java} // code placeholder {code} package com.microsoft.teams.test.utils import org.apache.spark.sql.{Column, DataFrame, Row, SparkSession} import org.scalatest.{Assertions, FunSpec} import org.apache.spark.sql.functions.{lit, _} import org.apache.spark.sql.catalyst.expressions.aggregate.ApproximatePercentile import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType} import scala.collection.mutable.ListBuffer class PercentilesTest extends FunSpec with Assertions { it("check percentiles with different precision") { val schema = List(StructField("MetricName", StringType),
[jira] [Updated] (SPARK-30882) Inaccurate results even with higher precision ApproximatePercentile results
[ https://issues.apache.org/jira/browse/SPARK-30882?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Nandini malempati updated SPARK-30882: -- Summary: Inaccurate results even with higher precision ApproximatePercentile results (was: Inaccurate ApproximatePercentile results ) > Inaccurate results even with higher precision ApproximatePercentile results > > > Key: SPARK-30882 > URL: https://issues.apache.org/jira/browse/SPARK-30882 > Project: Spark > Issue Type: Bug > Components: Java API >Affects Versions: 2.4.4 >Reporter: Nandini malempati >Priority: Major > > Results of ApproximatePercentile should have better accuracy with increased > precision as per the documentation provided here: > [https://github.com/apache/spark/blob/master/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala] > But I'm seeing nondeterministic behavior. On a data set of size 450, an > accuracy of 100 returns better results than 500, and an accuracy of > 700 gives exact results. But this behavior is not consistent. 
> {code:java} > // code placeholder > {code} > package com.microsoft.teams.test.utilsimport org.apache.spark.sql.\{Column, > DataFrame, Row, SparkSession} import org.scalatest.\{Assertions, FunSpec} > import org.apache.spark.sql.functions.\{lit, _} import > org.apache.spark.sql.catalyst.expressions.aggregate.ApproximatePercentile > import org.apache.spark.sql.types.\{IntegerType, StringType, StructField, > StructType}import scala.collection.mutable.ListBuffer class PercentilesTest > extends FunSpec with Assertions \{ it("check percentiles with different > precision") { val schema = List(StructField("MetricName", StringType), > StructField("DataPoint", IntegerType)) val data = new ListBuffer[Row] for(i > <- 1 to 450) { data += Row("metric", i)} import spark.implicits._ val df > = createDF(schema, data.toSeq) val accuracy1000 = > df.groupBy("MetricName").agg(percentile_approx($"DataPoint", > typedLit(Seq(0.1, 0.25, 0.75, 0.9)), > ApproximatePercentile.DEFAULT_PERCENTILE_ACCURACY)) val accuracy1M = > df.groupBy("MetricName").agg(percentile_approx($"DataPoint", > typedLit(Seq(0.1, 0.25, 0.75, 0.9)), 100)) val accuracy5M = > df.groupBy("MetricName").agg(percentile_approx($"DataPoint", > typedLit(Seq(0.1, 0.25, 0.75, 0.9)), 500)) val accuracy7M = > df.groupBy("MetricName").agg(percentile_approx($"DataPoint", > typedLit(Seq(0.1, 0.25, 0.75, 0.9)), 700)) val accuracy10M = > df.groupBy("MetricName").agg(percentile_approx($"DataPoint", > typedLit(Seq(0.1, 0.25, 0.75, 0.9)), 1000)) accuracy1000.show(1, false) > accuracy1M.show(1, false) accuracy5M.show(1, false) accuracy7M.show(1, false) > accuracy10M.show(1, false) } def percentile_approx(col: Column, percentage: > Column, accuracy: Column): Column = \{ val expr = new ApproximatePercentile( > col.expr, percentage.expr, accuracy.expr ).toAggregateExpression new > Column(expr) } def percentile_approx(col: Column, percentage: Column, > accuracy: Int): Column = percentile_approx( col, percentage, lit(accuracy) ) > lazy val 
spark: SparkSession = \{ SparkSession .builder() .master("local") > .appName("spark tests") .getOrCreate() } def createDF(schema: > List[StructField], data: Seq[Row]): DataFrame = \{ spark.createDataFrame( > spark.sparkContext.parallelize(data), StructType(schema)) } } > Above is a test run to reproduce the error. In this example, with accuracy > 500, P25 and P90 look fine, but P75 is the max of the column. > +--+--+ > |MetricName|percentile_approx(DataPoint, [0.1,0.25,0.75,0.9], 500)| > +--+--+ > |metric |[45, 1125000, 450, 405] | > +--+--+ > This is breaking our reports, as there is no proper definition of accuracy. > We have data sets of size more than 2700. After studying the pattern, we > found that inaccurate percentiles always have the "max" of the column as their value. > P50 and P99 might be right in a few cases, but P75 can be wrong. > Is there a way to define what the correct accuracy would be for a given > dataset size? -- This message was sent by Atlassian Jira (v8.3.4#803005) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org