aglinxinyuan commented on code in PR #4908: URL: https://github.com/apache/texera/pull/4908#discussion_r3179893029
########## common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/aggregate/AggregationOperationSpec.scala: ########## @@ -0,0 +1,235 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.texera.amber.operator.aggregate + +import org.apache.texera.amber.core.tuple.{Attribute, AttributeType, Schema, Tuple} +import org.scalatest.flatspec.AnyFlatSpec + +class AggregationOperationSpec extends AnyFlatSpec { + + // --- helpers --------------------------------------------------------------- + + private def schemaWith(name: String, t: AttributeType): Schema = + new Schema(new Attribute(name, t)) + + private def tupleOf(name: String, t: AttributeType, value: AnyRef): Tuple = + Tuple.builder(schemaWith(name, t)).add(new Attribute(name, t), value).build() + + private def op( + func: AggregationFunction, + attribute: String = "v", + resultAttribute: String = "r" + ): AggregationOperation = { + val o = new AggregationOperation() + o.aggFunction = func + o.attribute = attribute + o.resultAttribute = resultAttribute + o + } + + // --- getAggregationAttribute ----------------------------------------------- + + "AggregationOperation.getAggregationAttribute" should "preserve the input type for SUM" in { + val attr = op(AggregationFunction.SUM).getAggregationAttribute(AttributeType.LONG) + assert(attr.getName == "r") + assert(attr.getType == AttributeType.LONG) + } + + it should "produce INTEGER for COUNT regardless of input type" in { + val attr = op(AggregationFunction.COUNT).getAggregationAttribute(AttributeType.STRING) + assert(attr.getType == AttributeType.INTEGER) + } + + it should "produce DOUBLE for AVERAGE regardless of input type" in { + val attr = op(AggregationFunction.AVERAGE).getAggregationAttribute(AttributeType.LONG) + assert(attr.getType == AttributeType.DOUBLE) + } + + it should "preserve the input type for MIN and MAX" in { + assert( + op(AggregationFunction.MIN).getAggregationAttribute(AttributeType.INTEGER).getType == + AttributeType.INTEGER + ) + assert( + op(AggregationFunction.MAX).getAggregationAttribute(AttributeType.TIMESTAMP).getType == + AttributeType.TIMESTAMP + ) + } + + it should "produce STRING for CONCAT" in { + assert( + op(AggregationFunction.CONCAT).getAggregationAttribute(AttributeType.STRING).getType == + AttributeType.STRING + ) + } + + it should "throw RuntimeException when aggFunction is null" in { + val ex = intercept[RuntimeException] { + op(null).getAggregationAttribute(AttributeType.INTEGER) + } + assert(ex.getMessage.contains("Unknown aggregation function")) + } + + // --- getAggFunc: type validation ------------------------------------------- + + "AggregationOperation.getAggFunc" should "throw for non-numeric types on SUM" in { + val ex = intercept[UnsupportedOperationException] { + op(AggregationFunction.SUM).getAggFunc(AttributeType.STRING) + } + assert(ex.getMessage.contains("Unsupported attribute type for sum")) + } + + it should "throw for non-numeric types on MIN and MAX" in { + intercept[UnsupportedOperationException] { + op(AggregationFunction.MIN).getAggFunc(AttributeType.STRING) + } + intercept[UnsupportedOperationException] { + op(AggregationFunction.MAX).getAggFunc(AttributeType.BOOLEAN) + } + } + + it should "throw UnsupportedOperationException when aggFunction is null" in { + val ex = intercept[UnsupportedOperationException] { + op(null).getAggFunc(AttributeType.INTEGER) + } + assert(ex.getMessage.contains("Unknown aggregation function")) + } + + // --- getAggFunc: SUM behavior ---------------------------------------------- + + "SUM aggregation" should "init at the type's zero, accumulate values, and merge partial sums" in { + val agg = op(AggregationFunction.SUM).getAggFunc(AttributeType.INTEGER) + val zero = agg.init().asInstanceOf[Integer] + assert(zero == 0) + val t1 = tupleOf("v", AttributeType.INTEGER, Int.box(3)) + val t2 = tupleOf("v", AttributeType.INTEGER, Int.box(5)) + val partial = agg.iterate(agg.iterate(zero, t1), t2) + assert(partial.asInstanceOf[Integer] == 8) + val merged = agg.merge(partial, partial) + assert(merged.asInstanceOf[Integer] == 16) + assert(agg.finalAgg(merged).asInstanceOf[Integer] == 16) + } Review Comment: Done in 07423b3fe9 — agreed; trimmed the spec to remove the duplicates with AggregateOpSpec. Removed all of `getAggregationAttribute`, the per-kind `iterate` tests (SUM/COUNT/AVERAGE/CONCAT), and the `getFinal` shape tests. Added a header comment in the spec documenting which behaviors AggregateOpSpec already covers, so future contributors do not re-add them. Net change: 19 → 7 tests. ########## common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/aggregate/AggregationOperationSpec.scala: ########## @@ -0,0 +1,235 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.texera.amber.operator.aggregate + +import org.apache.texera.amber.core.tuple.{Attribute, AttributeType, Schema, Tuple} +import org.scalatest.flatspec.AnyFlatSpec + +class AggregationOperationSpec extends AnyFlatSpec { + + // --- helpers --------------------------------------------------------------- + + private def schemaWith(name: String, t: AttributeType): Schema = + new Schema(new Attribute(name, t)) + + private def tupleOf(name: String, t: AttributeType, value: AnyRef): Tuple = + Tuple.builder(schemaWith(name, t)).add(new Attribute(name, t), value).build() + + private def op( + func: AggregationFunction, + attribute: String = "v", + resultAttribute: String = "r" + ): AggregationOperation = { + val o = new AggregationOperation() + o.aggFunction = func + o.attribute = attribute + o.resultAttribute = resultAttribute + o + } + + // --- getAggregationAttribute ----------------------------------------------- + + "AggregationOperation.getAggregationAttribute" should "preserve the input type for SUM" in { + val attr = op(AggregationFunction.SUM).getAggregationAttribute(AttributeType.LONG) + assert(attr.getName == "r") + assert(attr.getType == AttributeType.LONG) + } + + it should "produce INTEGER for COUNT regardless of input type" in { + val attr = op(AggregationFunction.COUNT).getAggregationAttribute(AttributeType.STRING) + assert(attr.getType == AttributeType.INTEGER) + } + + it should "produce DOUBLE for AVERAGE regardless of input type" in { + val attr = op(AggregationFunction.AVERAGE).getAggregationAttribute(AttributeType.LONG) + assert(attr.getType == AttributeType.DOUBLE) + } + + it should "preserve the input type for MIN and MAX" in { + assert( + op(AggregationFunction.MIN).getAggregationAttribute(AttributeType.INTEGER).getType == + AttributeType.INTEGER + ) + assert( + op(AggregationFunction.MAX).getAggregationAttribute(AttributeType.TIMESTAMP).getType == + AttributeType.TIMESTAMP + ) + } + + it should "produce STRING for CONCAT" in { + assert( + op(AggregationFunction.CONCAT).getAggregationAttribute(AttributeType.STRING).getType == + AttributeType.STRING + ) + } + + it should "throw RuntimeException when aggFunction is null" in { + val ex = intercept[RuntimeException] { + op(null).getAggregationAttribute(AttributeType.INTEGER) + } + assert(ex.getMessage.contains("Unknown aggregation function")) + } + + // --- getAggFunc: type validation ------------------------------------------- + + "AggregationOperation.getAggFunc" should "throw for non-numeric types on SUM" in { + val ex = intercept[UnsupportedOperationException] { + op(AggregationFunction.SUM).getAggFunc(AttributeType.STRING) + } + assert(ex.getMessage.contains("Unsupported attribute type for sum")) + } + + it should "throw for non-numeric types on MIN and MAX" in { + intercept[UnsupportedOperationException] { + op(AggregationFunction.MIN).getAggFunc(AttributeType.STRING) + } + intercept[UnsupportedOperationException] { + op(AggregationFunction.MAX).getAggFunc(AttributeType.BOOLEAN) + } + } + + it should "throw UnsupportedOperationException when aggFunction is null" in { + val ex = intercept[UnsupportedOperationException] { + op(null).getAggFunc(AttributeType.INTEGER) + } + assert(ex.getMessage.contains("Unknown aggregation function")) + } + + // --- getAggFunc: SUM behavior ---------------------------------------------- + + "SUM aggregation" should "init at the type's zero, accumulate values, and merge partial sums" in { + val agg = op(AggregationFunction.SUM).getAggFunc(AttributeType.INTEGER) + val zero = agg.init().asInstanceOf[Integer] + assert(zero == 0) + val t1 = tupleOf("v", AttributeType.INTEGER, Int.box(3)) + val t2 = tupleOf("v", AttributeType.INTEGER, Int.box(5)) + val partial = agg.iterate(agg.iterate(zero, t1), t2) + assert(partial.asInstanceOf[Integer] == 8) + val merged = agg.merge(partial, partial) + assert(merged.asInstanceOf[Integer] == 16) + assert(agg.finalAgg(merged).asInstanceOf[Integer] == 16) + } + + // --- getAggFunc: COUNT behavior -------------------------------------------- + + "COUNT aggregation" should "treat a null `attribute` as count-all (one per tuple)" in { + val agg = op(AggregationFunction.COUNT, attribute = null).getAggFunc(AttributeType.INTEGER) + val t = tupleOf("v", AttributeType.INTEGER, null) + val out = agg.iterate(agg.iterate(agg.init(), t), t).asInstanceOf[Integer] + assert(out == 2, "with attribute=null, every tuple — even null-valued — should count") + } + + it should "count only non-null values when `attribute` is set" in { + val agg = op(AggregationFunction.COUNT, attribute = "v").getAggFunc(AttributeType.INTEGER) + val nonNull = tupleOf("v", AttributeType.INTEGER, Int.box(7)) + val nullVal = tupleOf("v", AttributeType.INTEGER, null) + val out = agg + .iterate(agg.iterate(agg.iterate(agg.init(), nonNull), nullVal), nonNull) + .asInstanceOf[Integer] + assert(out == 2, "two non-null tuples + one null → count == 2") + } + + // --- getAggFunc: AVERAGE behavior ------------------------------------------ + + "AVERAGE aggregation" should "init at (0,0), accumulate sum+count, and yield sum/count" in { + // averageAgg() returns DistributedAggregation[AveragePartialObj] but is + // type-erased to Object via getAggFunc, so we cast back here. + val agg = op(AggregationFunction.AVERAGE).getAggFunc(AttributeType.DOUBLE) + val zero = agg.init().asInstanceOf[AveragePartialObj] + assert(zero == AveragePartialObj(0, 0)) + + val t1 = tupleOf("v", AttributeType.DOUBLE, java.lang.Double.valueOf(2.0)) + val t2 = tupleOf("v", AttributeType.DOUBLE, java.lang.Double.valueOf(4.0)) + val acc = agg + .iterate(agg.iterate(zero, t1), t2) + .asInstanceOf[AveragePartialObj] + assert(acc == AveragePartialObj(6.0, 2)) + val finalVal = agg.finalAgg(acc).asInstanceOf[java.lang.Double] + assert(finalVal == 3.0) + } + + it should "yield null when no non-null values were aggregated" in { + val agg = op(AggregationFunction.AVERAGE).getAggFunc(AttributeType.DOUBLE) + val zero = agg.init() + val finalVal = agg.finalAgg(zero) + assert(finalVal == null) + } + + // --- getAggFunc: CONCAT behavior ------------------------------------------- + + "CONCAT aggregation" should "concatenate non-empty values with commas and skip null gracefully" in { + val agg = op(AggregationFunction.CONCAT).getAggFunc(AttributeType.STRING) + assert(agg.init() == "") + val t1 = tupleOf("v", AttributeType.STRING, "a") + val t2 = tupleOf("v", AttributeType.STRING, "b") + val tNull = tupleOf("v", AttributeType.STRING, null) + val out = + agg.iterate(agg.iterate(agg.iterate(agg.init(), t1), tNull), t2) + assert(out == "a,,b", "null values are emitted as empty between commas") + } + + it should "merge two non-empty partial strings with a comma" in { + val agg = op(AggregationFunction.CONCAT).getAggFunc(AttributeType.STRING) + assert(agg.merge("foo", "bar") == "foo,bar") + assert(agg.merge("", "bar") == "bar") + assert(agg.merge("foo", "") == "foo") + assert(agg.merge("", "") == "") + } + + // --- getFinal -------------------------------------------------------------- Review Comment: Done in 07423b3fe9 — added two end-to-end pipeline tests under `"Worker → final aggregation pipeline"`: each runs a real worker partial aggregation (COUNT and SUM), emits a partial output tuple, then re-aggregates the partials through the operation produced by `getFinal`, asserting that the two-stage result equals a single-pass aggregation. ########## common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/aggregate/AggregationOperationSpec.scala: ########## @@ -0,0 +1,235 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.texera.amber.operator.aggregate + +import org.apache.texera.amber.core.tuple.{Attribute, AttributeType, Schema, Tuple} +import org.scalatest.flatspec.AnyFlatSpec + +class AggregationOperationSpec extends AnyFlatSpec { + + // --- helpers --------------------------------------------------------------- + + private def schemaWith(name: String, t: AttributeType): Schema = + new Schema(new Attribute(name, t)) + + private def tupleOf(name: String, t: AttributeType, value: AnyRef): Tuple = + Tuple.builder(schemaWith(name, t)).add(new Attribute(name, t), value).build() + + private def op( + func: AggregationFunction, + attribute: String = "v", + resultAttribute: String = "r" + ): AggregationOperation = { + val o = new AggregationOperation() + o.aggFunction = func + o.attribute = attribute + o.resultAttribute = resultAttribute + o + } Review Comment: Done in 07423b3fe9 — agreed. Trimmed the duplicates with AggregateOpSpec (getAggregationAttribute, per-kind iterate, getFinal shape) and added a coverage-notes header in the spec documenting what AggregateOpSpec already covers, so the two suites no longer overlap. ########## common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/aggregate/AggregationOperationSpec.scala: ########## @@ -0,0 +1,235 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.texera.amber.operator.aggregate + +import org.apache.texera.amber.core.tuple.{Attribute, AttributeType, Schema, Tuple} +import org.scalatest.flatspec.AnyFlatSpec + +class AggregationOperationSpec extends AnyFlatSpec { + + // --- helpers --------------------------------------------------------------- + + private def schemaWith(name: String, t: AttributeType): Schema = + new Schema(new Attribute(name, t)) + + private def tupleOf(name: String, t: AttributeType, value: AnyRef): Tuple = + Tuple.builder(schemaWith(name, t)).add(new Attribute(name, t), value).build() + + private def op( + func: AggregationFunction, + attribute: String = "v", + resultAttribute: String = "r" + ): AggregationOperation = { + val o = new AggregationOperation() + o.aggFunction = func + o.attribute = attribute + o.resultAttribute = resultAttribute + o + } + + // --- getAggregationAttribute ----------------------------------------------- + + "AggregationOperation.getAggregationAttribute" should "preserve the input type for SUM" in { + val attr = op(AggregationFunction.SUM).getAggregationAttribute(AttributeType.LONG) + assert(attr.getName == "r") + assert(attr.getType == AttributeType.LONG) + } + + it should "produce INTEGER for COUNT regardless of input type" in { + val attr = op(AggregationFunction.COUNT).getAggregationAttribute(AttributeType.STRING) + assert(attr.getType == AttributeType.INTEGER) + } + + it should "produce DOUBLE for AVERAGE regardless of input type" in { + val attr = op(AggregationFunction.AVERAGE).getAggregationAttribute(AttributeType.LONG) + assert(attr.getType == AttributeType.DOUBLE) + } + + it should "preserve the input type for MIN and MAX" in { + assert( + op(AggregationFunction.MIN).getAggregationAttribute(AttributeType.INTEGER).getType == + AttributeType.INTEGER + ) + assert( + op(AggregationFunction.MAX).getAggregationAttribute(AttributeType.TIMESTAMP).getType == + AttributeType.TIMESTAMP + ) + } + + it should "produce STRING for CONCAT" in { + assert( + op(AggregationFunction.CONCAT).getAggregationAttribute(AttributeType.STRING).getType == + AttributeType.STRING + ) + } + + it should "throw RuntimeException when aggFunction is null" in { + val ex = intercept[RuntimeException] { + op(null).getAggregationAttribute(AttributeType.INTEGER) + } + assert(ex.getMessage.contains("Unknown aggregation function")) + } + + // --- getAggFunc: type validation ------------------------------------------- + + "AggregationOperation.getAggFunc" should "throw for non-numeric types on SUM" in { + val ex = intercept[UnsupportedOperationException] { + op(AggregationFunction.SUM).getAggFunc(AttributeType.STRING) + } + assert(ex.getMessage.contains("Unsupported attribute type for sum")) + } + + it should "throw for non-numeric types on MIN and MAX" in { + intercept[UnsupportedOperationException] { + op(AggregationFunction.MIN).getAggFunc(AttributeType.STRING) + } + intercept[UnsupportedOperationException] { + op(AggregationFunction.MAX).getAggFunc(AttributeType.BOOLEAN) + } + } + + it should "throw UnsupportedOperationException when aggFunction is null" in { + val ex = intercept[UnsupportedOperationException] { + op(null).getAggFunc(AttributeType.INTEGER) + } + assert(ex.getMessage.contains("Unknown aggregation function")) + } + + // --- getAggFunc: SUM behavior ---------------------------------------------- + + "SUM aggregation" should "init at the type's zero, accumulate values, and merge partial sums" in { + val agg = op(AggregationFunction.SUM).getAggFunc(AttributeType.INTEGER) + val zero = agg.init().asInstanceOf[Integer] + assert(zero == 0) + val t1 = tupleOf("v", AttributeType.INTEGER, Int.box(3)) + val t2 = tupleOf("v", AttributeType.INTEGER, Int.box(5)) + val partial = agg.iterate(agg.iterate(zero, t1), t2) + assert(partial.asInstanceOf[Integer] == 8) + val merged = agg.merge(partial, partial) + assert(merged.asInstanceOf[Integer] == 16) + assert(agg.finalAgg(merged).asInstanceOf[Integer] == 16) + } + + // --- getAggFunc: COUNT behavior -------------------------------------------- + + "COUNT aggregation" should "treat a null `attribute` as count-all (one per tuple)" in { + val agg = op(AggregationFunction.COUNT, attribute = null).getAggFunc(AttributeType.INTEGER) + val t = tupleOf("v", AttributeType.INTEGER, null) + val out = agg.iterate(agg.iterate(agg.init(), t), t).asInstanceOf[Integer] + assert(out == 2, "with attribute=null, every tuple — even null-valued — should count") + } + + it should "count only non-null values when `attribute` is set" in { + val agg = op(AggregationFunction.COUNT, attribute = "v").getAggFunc(AttributeType.INTEGER) + val nonNull = tupleOf("v", AttributeType.INTEGER, Int.box(7)) + val nullVal = tupleOf("v", AttributeType.INTEGER, null) + val out = agg + .iterate(agg.iterate(agg.iterate(agg.init(), nonNull), nullVal), nonNull) + .asInstanceOf[Integer] + assert(out == 2, "two non-null tuples + one null → count == 2") + } + + // --- getAggFunc: AVERAGE behavior ------------------------------------------ + + "AVERAGE aggregation" should "init at (0,0), accumulate sum+count, and yield sum/count" in { + // averageAgg() returns DistributedAggregation[AveragePartialObj] but is + // type-erased to Object via getAggFunc, so we cast back here. + val agg = op(AggregationFunction.AVERAGE).getAggFunc(AttributeType.DOUBLE) + val zero = agg.init().asInstanceOf[AveragePartialObj] + assert(zero == AveragePartialObj(0, 0)) + + val t1 = tupleOf("v", AttributeType.DOUBLE, java.lang.Double.valueOf(2.0)) + val t2 = tupleOf("v", AttributeType.DOUBLE, java.lang.Double.valueOf(4.0)) + val acc = agg + .iterate(agg.iterate(zero, t1), t2) + .asInstanceOf[AveragePartialObj] + assert(acc == AveragePartialObj(6.0, 2)) + val finalVal = agg.finalAgg(acc).asInstanceOf[java.lang.Double] + assert(finalVal == 3.0) + } + + it should "yield null when no non-null values were aggregated" in { + val agg = op(AggregationFunction.AVERAGE).getAggFunc(AttributeType.DOUBLE) + val zero = agg.init() + val finalVal = agg.finalAgg(zero) + assert(finalVal == null) + } + + // --- getAggFunc: CONCAT behavior ------------------------------------------- + + "CONCAT aggregation" should "concatenate non-empty values with commas and skip null gracefully" in { + val agg = op(AggregationFunction.CONCAT).getAggFunc(AttributeType.STRING) + assert(agg.init() == "") + val t1 = tupleOf("v", AttributeType.STRING, "a") + val t2 = tupleOf("v", AttributeType.STRING, "b") + val tNull = tupleOf("v", AttributeType.STRING, null) + val out = + agg.iterate(agg.iterate(agg.iterate(agg.init(), t1), tNull), t2) + assert(out == "a,,b", "null values are emitted as empty between commas") + } Review Comment: Done in 07423b3fe9 — the misleading CONCAT iterate test was removed altogether (its iterate behavior is already covered by `AggregateOpSpec`'s CONCAT case). What remains in this spec is a CONCAT *merge* test that uses an unambiguous description: `"join two non-empty partials with a comma and short-circuit when either is empty"`. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
