This is an automated email from the ASF dual-hosted git repository.

comphead pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion-comet.git


The following commit(s) were added to refs/heads/main by this push:
     new 383b56d88 chore: Add tolerance for `ComparisonTool`  (#2699)
383b56d88 is described below

commit 383b56d8801a2b63b0c2a0c01cca62655a9096de
Author: Oleks V <[email protected]>
AuthorDate: Thu Nov 6 13:14:34 2025 -0800

    chore: Add tolerance for `ComparisonTool`  (#2699)
    
    * chore: Add tolerance for `ComparisonTool` to identify error threshold for floating point comparisons
---
 .../scala/org/apache/comet/fuzz/ComparisonTool.scala    | 10 +++++++---
 .../main/scala/org/apache/comet/fuzz/QueryRunner.scala  | 17 +++++++++--------
 2 files changed, 16 insertions(+), 11 deletions(-)

diff --git a/fuzz-testing/src/main/scala/org/apache/comet/fuzz/ComparisonTool.scala b/fuzz-testing/src/main/scala/org/apache/comet/fuzz/ComparisonTool.scala
index 03b6f7334..055ea6553 100644
--- a/fuzz-testing/src/main/scala/org/apache/comet/fuzz/ComparisonTool.scala
+++ b/fuzz-testing/src/main/scala/org/apache/comet/fuzz/ComparisonTool.scala
@@ -31,6 +31,8 @@ class ComparisonToolConf(arguments: Seq[String]) extends ScallopConf(arguments)
       opt[String](required = true, descr = "Folder with Spark produced results in Parquet format")
     val inputCometFolder: ScallopOption[String] =
       opt[String](required = true, descr = "Folder with Comet produced results in Parquet format")
+    val tolerance: ScallopOption[Double] =
+      opt[Double](default = Some(0.000002), descr = "Tolerance for floating point comparisons")
   }
   addSubcommand(compareParquet)
   verify()
@@ -49,7 +51,8 @@ object ComparisonTool {
         compareParquetFolders(
           spark,
           conf.compareParquet.inputSparkFolder(),
-          conf.compareParquet.inputCometFolder())
+          conf.compareParquet.inputCometFolder(),
+          conf.compareParquet.tolerance())
 
       case _ =>
         // scalastyle:off println
@@ -62,7 +65,8 @@ object ComparisonTool {
   private def compareParquetFolders(
       spark: SparkSession,
       sparkFolderPath: String,
-      cometFolderPath: String): Unit = {
+      cometFolderPath: String,
+      tolerance: Double): Unit = {
 
     val output = QueryRunner.createOutputMdFile()
 
@@ -115,7 +119,7 @@ object ComparisonTool {
             val cometRows = cometDf.orderBy(cometDf.columns.map(functions.col): _*).collect()
 
             // Compare the results
-            if (QueryComparison.assertSameRows(sparkRows, cometRows, output)) {
+            if (QueryComparison.assertSameRows(sparkRows, cometRows, output, tolerance)) {
               output.write(s"Subfolder $subfolderName: ${sparkRows.length} rows matched\n\n")
             } else {
               // Output schema if dataframes are not equal
diff --git a/fuzz-testing/src/main/scala/org/apache/comet/fuzz/QueryRunner.scala b/fuzz-testing/src/main/scala/org/apache/comet/fuzz/QueryRunner.scala
index 23bfbc1c0..dc7189c53 100644
--- a/fuzz-testing/src/main/scala/org/apache/comet/fuzz/QueryRunner.scala
+++ b/fuzz-testing/src/main/scala/org/apache/comet/fuzz/QueryRunner.scala
@@ -148,7 +148,8 @@ object QueryComparison {
   def assertSameRows(
       sparkRows: Array[Row],
       cometRows: Array[Row],
-      output: BufferedWriter): Boolean = {
+      output: BufferedWriter,
+      tolerance: Double = 0.000001): Boolean = {
     if (sparkRows.length == cometRows.length) {
       var i = 0
       while (i < sparkRows.length) {
@@ -164,7 +165,7 @@ object QueryComparison {
 
         assert(l.length == r.length)
         for (j <- 0 until l.length) {
-          if (!same(l(j), r(j))) {
+          if (!same(l(j), r(j), tolerance)) {
             output.write(s"First difference at row $i:\n")
             output.write("Spark: `" + formatRow(l) + "`\n")
             output.write("Comet: `" + formatRow(r) + "`\n")
@@ -186,7 +187,7 @@ object QueryComparison {
     true
   }
 
-  private def same(l: Any, r: Any): Boolean = {
+  private def same(l: Any, r: Any, tolerance: Double): Boolean = {
     if (l == null || r == null) {
       return l == null && r == null
     }
@@ -195,20 +196,20 @@ object QueryComparison {
       case (a: Float, b: Float) if a.isNegInfinity => b.isNegInfinity
       case (a: Float, b: Float) if a.isInfinity => b.isInfinity
       case (a: Float, b: Float) if a.isNaN => b.isNaN
-      case (a: Float, b: Float) => (a - b).abs <= 0.000001f
+      case (a: Float, b: Float) => (a - b).abs <= tolerance
       case (a: Double, b: Double) if a.isPosInfinity => b.isPosInfinity
       case (a: Double, b: Double) if a.isNegInfinity => b.isNegInfinity
       case (a: Double, b: Double) if a.isInfinity => b.isInfinity
       case (a: Double, b: Double) if a.isNaN => b.isNaN
-      case (a: Double, b: Double) => (a - b).abs <= 0.000001
+      case (a: Double, b: Double) => (a - b).abs <= tolerance
       case (a: Array[_], b: Array[_]) =>
-        a.length == b.length && a.zip(b).forall(x => same(x._1, x._2))
+        a.length == b.length && a.zip(b).forall(x => same(x._1, x._2, tolerance))
       case (a: mutable.WrappedArray[_], b: mutable.WrappedArray[_]) =>
-        a.length == b.length && a.zip(b).forall(x => same(x._1, x._2))
+        a.length == b.length && a.zip(b).forall(x => same(x._1, x._2, tolerance))
       case (a: Row, b: Row) =>
         val aa = a.toSeq
         val bb = b.toSeq
-        aa.length == bb.length && aa.zip(bb).forall(x => same(x._1, x._2))
+        aa.length == bb.length && aa.zip(bb).forall(x => same(x._1, x._2, tolerance))
       case (a, b) => a == b
     }
   }


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to