[ 
https://issues.apache.org/jira/browse/SPARK-30882?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Nandini malempati updated SPARK-30882:
--------------------------------------
    Description: 
As per the documentation here:
[https://github.com/apache/spark/blob/master/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala],
the results of ApproximatePercentile should become more accurate as the accuracy parameter increases. But I'm seeing non-deterministic behavior: on a data set of 4,500,000 rows, an accuracy of 1,000,000 returns better results than 5,000,000, and an accuracy of 7,000,000 gives exact results. This behavior is not consistent.


{code:scala}
// Repro: compute approximate percentiles of the integers 1..4500000 at several
// accuracy settings and compare the results.
package com.microsoft.teams.test.utils

import org.apache.spark.sql.{Column, DataFrame, Row, SparkSession}
import org.scalatest.{Assertions, FunSpec}
import org.apache.spark.sql.functions.{lit, _}
import org.apache.spark.sql.catalyst.expressions.aggregate.ApproximatePercentile
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

import scala.collection.mutable.ListBuffer


class PercentilesTest extends FunSpec with Assertions {

  it("check percentiles with different precision") {
    val schema = List(StructField("MetricName", StringType), StructField("DataPoint", IntegerType))
    val data = new ListBuffer[Row]
    for (i <- 1 to 4500000) { data += Row("metric", i) }

    import spark.implicits._
    val df = createDF(schema, data.toSeq)

    // Same aggregation at increasing accuracy values (DEFAULT_PERCENTILE_ACCURACY is 10000).
    val accuracy1000 = df.groupBy("MetricName").agg(percentile_approx($"DataPoint",
      typedLit(Seq(0.1, 0.25, 0.75, 0.9)), ApproximatePercentile.DEFAULT_PERCENTILE_ACCURACY))

    val accuracy1M = df.groupBy("MetricName").agg(percentile_approx($"DataPoint",
      typedLit(Seq(0.1, 0.25, 0.75, 0.9)), 1000000))

    val accuracy5M = df.groupBy("MetricName").agg(percentile_approx($"DataPoint",
      typedLit(Seq(0.1, 0.25, 0.75, 0.9)), 5000000))

    val accuracy7M = df.groupBy("MetricName").agg(percentile_approx($"DataPoint",
      typedLit(Seq(0.1, 0.25, 0.75, 0.9)), 7000000))

    val accuracy10M = df.groupBy("MetricName").agg(percentile_approx($"DataPoint",
      typedLit(Seq(0.1, 0.25, 0.75, 0.9)), 10000000))

    accuracy1000.show(1, false)
    accuracy1M.show(1, false)
    accuracy5M.show(1, false)
    accuracy7M.show(1, false)
    accuracy10M.show(1, false)
  }

  // DataFrame API wrapper around the ApproximatePercentile aggregate expression.
  def percentile_approx(col: Column, percentage: Column, accuracy: Column): Column = {
    val expr = new ApproximatePercentile(
      col.expr, percentage.expr, accuracy.expr
    ).toAggregateExpression
    new Column(expr)
  }

  def percentile_approx(col: Column, percentage: Column, accuracy: Int): Column =
    percentile_approx(col, percentage, lit(accuracy))

  lazy val spark: SparkSession = {
    SparkSession
      .builder()
      .master("local")
      .appName("spark tests")
      .getOrCreate()
  }

  def createDF(schema: List[StructField], data: Seq[Row]): DataFrame = {
    spark.createDataFrame(
      spark.sparkContext.parallelize(data),
      StructType(schema))
  }
}
{code}
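
Since the test column is just the integers 1 through 4,500,000, the exact percentiles are easy to derive by hand, which makes the inaccuracy easy to spot. A quick sketch of the expected values (plain arithmetic, not a Spark call):

{code:scala}
// For a column containing exactly the integers 1..4500000, the exact value at
// quantile q is approximately q * 4500000, so the expected output is:
val n = 4500000
Seq(0.1, 0.25, 0.75, 0.9).map(q => math.round(q * n))
// => List(450000, 1125000, 3375000, 4050000)
{code}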

The test above reproduces the error. In this example, with accuracy 5,000,000, P10, P25, and P90 look fine, but P75 comes back as the maximum of the column (4,500,000) rather than the expected 3,375,000.

+----------+----------------------------------------------------------+
|MetricName|percentile_approx(DataPoint, [0.1,0.25,0.75,0.9], 5000000)|
+----------+----------------------------------------------------------+
|metric    |[450000, 1125000, 4500000, 4050000]                       |
+----------+----------------------------------------------------------+
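
For what it's worth, the same aggregation can also be requested through the built-in SQL expression instead of constructing ApproximatePercentile by hand. A minimal cross-check sketch (this assumes the approx_percentile SQL function and the same df as in the test above, and should rule out any issue in the custom percentile_approx wrapper):

{code:scala}
import org.apache.spark.sql.functions.expr

// Same percentiles via the registered SQL expression rather than the custom wrapper.
val viaSqlExpr = df.groupBy("MetricName")
  .agg(expr("approx_percentile(DataPoint, array(0.1, 0.25, 0.75, 0.9), 5000000)"))
viaSqlExpr.show(1, false)
{code}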

This is breaking our reports, since there is no clear guidance on what accuracy to use; we have data sets of more than 27,000,000 rows. After studying the pattern, I found that the inaccurate percentiles always come back as the maximum of the column. P50 and P99 might be right in a few cases while P75 is wrong.

Is there a way to determine the correct accuracy to use for a given dataset size?
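
For context, the Scaladoc linked above states that 1.0 / accuracy is the relative error of the approximation, i.e. the returned value's rank should be within datasetSize / accuracy of the exact rank. A rough sizing helper based on that documented bound (my own sketch, not a Spark API) would be:

{code:scala}
// Rough sizing helper based on the documented bound: rank error <= datasetSize / accuracy.
// This is an illustrative sketch, not part of Spark.
object AccuracySizing {
  /** Worst-case rank error (in rows) implied by a given accuracy on `datasetSize` rows. */
  def worstCaseRankError(datasetSize: Long, accuracy: Int): Double =
    datasetSize.toDouble / accuracy

  /** Smallest accuracy whose worst-case rank error stays within `maxRankError` rows. */
  def accuracyFor(datasetSize: Long, maxRankError: Long): Int =
    math.ceil(datasetSize.toDouble / maxRankError).toInt
}

// Example: for 27,000,000 rows, an accuracy of 27,000 should keep the rank error
// within 1,000 rows.
{code}

If that bound held, an accuracy of 5,000,000 on a 4,500,000-row dataset should be essentially exact, which is not what the run above shows.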



> Inaccurate results even with higher precision ApproximatePercentile results 
> ----------------------------------------------------------------------------
>
>                 Key: SPARK-30882
>                 URL: https://issues.apache.org/jira/browse/SPARK-30882
>             Project: Spark
>          Issue Type: Bug
>          Components: Java API
>    Affects Versions: 2.4.4
>            Reporter: Nandini malempati
>            Priority: Major


