Ian Manning created SPARK-53982:
-----------------------------------
Summary: Spark aggregation is incorrect (floating point error)
Key: SPARK-53982
URL: https://issues.apache.org/jira/browse/SPARK-53982
Project: Spark
Issue Type: Bug
Components: SQL
Affects Versions: 3.5.6
Reporter: Ian Manning
{code:java}
// Reproduction: group eight identical rows by "id" and compare the DoubleType
// sum of "value" against the literal 799.6.
// NOTE(review): `spark`, METADATA_COL_METRICVALUE and METADATA_COL_SUM_VALUE are
// defined outside this snippet.

// Eight identical input rows: (timestamp string, id, value = 99.95).
List<Row> data = Arrays.asList(
    RowFactory.create("2021-01-01T00:00:00.000+0000", "pod123", 99.95),
    RowFactory.create("2021-01-01T00:00:00.000+0000", "pod123", 99.95),
    RowFactory.create("2021-01-01T00:00:00.000+0000", "pod123", 99.95),
    RowFactory.create("2021-01-01T00:00:00.000+0000", "pod123", 99.95),
    RowFactory.create("2021-01-01T00:00:00.000+0000", "pod123", 99.95),
    RowFactory.create("2021-01-01T00:00:00.000+0000", "pod123", 99.95),
    RowFactory.create("2021-01-01T00:00:00.000+0000", "pod123", 99.95),
    RowFactory.create("2021-01-01T00:00:00.000+0000", "pod123", 99.95)
);

// Two non-nullable string columns and one non-nullable double column.
StructType schema = DataTypes.createStructType(new StructField[] {
    DataTypes.createStructField("timestamp", DataTypes.StringType, false),
    DataTypes.createStructField("id", DataTypes.StringType, false),
    DataTypes.createStructField("value", DataTypes.DoubleType, false)
});
Dataset<Row> df = spark.createDataFrame(data, schema);

// Show the input data
System.out.println("Input data:");
df.show();

// Perform the aggregation: average and sum of "value" per "id".
Dataset<Row> result = df.groupBy("id")
    .agg(
        avg("value").as(METADATA_COL_METRICVALUE),
        sum("value").as(METADATA_COL_SUM_VALUE)
    );

// Show the results
System.out.println("Aggregation results:");
result.show();

// Collect the results
List<Row> results = result.collectAsList();

// Print the results
System.out.println("Number of results: " + results.size());
for (Row row : results) {
    System.out.println("Metric value: " +
        row.getDouble(row.fieldIndex(METADATA_COL_METRICVALUE)));
    System.out.println("Sum value: " +
        row.getDouble(row.fieldIndex(METADATA_COL_SUM_VALUE)));
}

// Verify the results: all rows share one id, so exactly one group is expected.
assertEquals(1, results.size(), "Expected 1 aggregated result");
Row resultRow = results.get(0);
double sumValue =
    resultRow.getDouble(resultRow.fieldIndex(METADATA_COL_SUM_VALUE));
double expectedSum = 799.6; // 8 * 99.95
System.out.println("Expected sum: " + expectedSum);
System.out.println("Actual sum: " + sumValue);
System.out.println("Difference: " + Math.abs(expectedSum - sumValue));

// Check if the sum is close to the expected value (absolute tolerance 0.001).
assertTrue(Math.abs(expectedSum - sumValue) < 0.001,
    "Sum value should be close to " + expectedSum + " but was " + sumValue);
{code}
{color:#000000} {color}
Input data:
+--------------------+------+-----+
|           timestamp|    id|value|
+--------------------+------+-----+
|2021-01-01T00:00:...|pod123|99.95|
|2021-01-01T00:00:...|pod123|99.95|
|2021-01-01T00:00:...|pod123|99.95|
|2021-01-01T00:00:...|pod123|99.95|
|2021-01-01T00:00:...|pod123|99.95|
|2021-01-01T00:00:...|pod123|99.95|
|2021-01-01T00:00:...|pod123|99.95|
|2021-01-01T00:00:...|pod123|99.95|
+--------------------+------+-----+

Aggregation results:
+------+-----------------+-----------------+
|    id|     metric_value|        sum_value|
+------+-----------------+-----------------+
|pod123|99.95000000000002|799.6000000000001|
+------+-----------------+-----------------+

Number of results: 1
Metric value: 99.95000000000002
Sum value: 799.6000000000001
Expected sum: 799.6
Actual sum: 799.6000000000001
Difference: 1.1368683772161603E-13
--
This message was sent by Atlassian Jira
(v8.20.10#820010)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]