This is an automated email from the ASF dual-hosted git repository. github-merge-queue[bot] pushed a commit to branch gh-readonly-queue/main/pr-5878-6eb4165f7ca49ba4eb9d6727971097763c9a8377 in repository https://gitbox.apache.org/repos/asf/texera.git
commit 7a9730bee0986e6d51e29aa28936665336aa7b44 Author: Xinyuan Lin <[email protected]> AuthorDate: Tue Jun 23 15:27:38 2026 -0700 test(workflow-operator): add unit test coverage for machine-learning operator types (Scorer, HyperParameters, SklearnML base) (#5878) ### What changes were proposed in this PR? Pin the behavior of three previously-untested machine-learning operator types in `common/workflow-operator/machineLearning/`. No production-code changes. | Spec | Source class | Tests | | --- | --- | --- | | `MachineLearningScorerOpDescSpec` | `MachineLearningScorerOpDesc` | 6 | | `HyperParametersSpec` | `HyperParameters` | 5 | | `SklearnMLOperatorDescriptorSpec` | `SklearnMLOperatorDescriptor` (abstract base) | 3 | **Behavior pinned** | Surface | Contract | | --- | --- | | `MachineLearningScorerOpDesc` | operatorInfo (`Machine Learning Scorer`, MACHINE_LEARNING_GENERAL_GROUP); `getOutputSchemas` branches — classification → a `Class` column, regression → empty; `generatePythonCode` structure; round-trip | | `HyperParameters` | generic config bag — `parameter`/`attribute`/`value` null defaults + `parametersSource == false`; `@JsonProperty` wire-keys; `Include.NON_NULL` omits null fields on a fresh instance; populated round-trip | | `SklearnMLOperatorDescriptor` | abstract base — operatorInfo derived from the subclass (`training` + `parameter` inputs in that order, with `parameter` depending on `training`; one output; ADVANCED_SKLEARN_GROUP); fixed `Model`/`Parameters` `getOutputSchemas`; base-field defaults | **Note for reviewers:** `SklearnMLOperatorDescriptor` is abstract, so its base contract is exercised through a **real** concrete subclass (`SklearnAdvancedKNNClassifierTrainerOpDesc`) rather than a test stub — a stub placed under `org.apache.texera.amber.operator.*` would be picked up by `PythonCodeRawInvalidTextSpec`'s classpath scanner (which instantiates every `PythonOperatorDescriptor` and runs `py_compile`). Using a real subclass keeps the spec off that scanner. 
### Any related issues, documentation, discussions? Closes #5875. ### How was this PR tested? - `sbt "WorkflowOperator/testOnly org.apache.texera.amber.operator.machineLearning.Scorer.MachineLearningScorerOpDescSpec org.apache.texera.amber.operator.machineLearning.sklearnAdvanced.base.HyperParametersSpec org.apache.texera.amber.operator.machineLearning.sklearnAdvanced.base.SklearnMLOperatorDescriptorSpec"` — 14 tests, all green - `sbt "WorkflowOperator/Test/scalafmtCheck"` and `sbt "WorkflowOperator/Test/scalafix --check"` — clean - CI to confirm ### Was this PR authored or co-authored using generative AI tooling? Generated-by: Claude Code (Opus 4.8 [1M context]) --- .../Scorer/MachineLearningScorerOpDescSpec.scala | 95 ++++++++++++++++++++++ .../sklearnAdvanced/base/HyperParametersSpec.scala | 79 ++++++++++++++++++ .../base/SklearnMLOperatorDescriptorSpec.scala | 72 ++++++++++++++++ 3 files changed, 246 insertions(+) diff --git a/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/machineLearning/Scorer/MachineLearningScorerOpDescSpec.scala b/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/machineLearning/Scorer/MachineLearningScorerOpDescSpec.scala new file mode 100644 index 0000000000..14909b5c9f --- /dev/null +++ b/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/machineLearning/Scorer/MachineLearningScorerOpDescSpec.scala @@ -0,0 +1,95 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.texera.amber.operator.machineLearning.Scorer + +import org.apache.texera.amber.core.tuple.{Attribute, AttributeType, Schema} +import org.apache.texera.amber.operator.LogicalOp +import org.apache.texera.amber.operator.metadata.OperatorGroupConstants +import org.apache.texera.amber.util.JSONUtils.objectMapper +import org.scalatest.flatspec.AnyFlatSpec +import org.scalatest.matchers.should.Matchers + +import java.nio.charset.StandardCharsets +import java.util.Base64 + +class MachineLearningScorerOpDescSpec extends AnyFlatSpec with Matchers { + + "MachineLearningScorerOpDesc.operatorInfo" should + "advertise the name and Machine Learning General group" in { + val info = (new MachineLearningScorerOpDesc).operatorInfo + info.userFriendlyName shouldBe "Machine Learning Scorer" + info.operatorDescription shouldBe "Scorer for machine learning models" + info.operatorGroupName shouldBe OperatorGroupConstants.MACHINE_LEARNING_GENERAL_GROUP + info.inputPorts should have length 1 + info.outputPorts should have length 1 + } + + "MachineLearningScorerOpDesc" should "default isRegression false and the column fields to empty" in { + val d = new MachineLearningScorerOpDesc + d.isRegression shouldBe false + d.actualValueColumn shouldBe "" + d.predictValueColumn shouldBe "" + d.classificationMetrics shouldBe empty + d.regressionMetrics shouldBe empty + } + + "MachineLearningScorerOpDesc.getOutputSchemas" should + "include a Class column for classification with no metrics" in { + val d = new MachineLearningScorerOpDesc + 
d.getOutputSchemas(Map.empty) shouldBe Map( + d.operatorInfo.outputPorts.head.id -> Schema( + List(new Attribute("Class", AttributeType.STRING)) + ) + ) + } + + it should "produce an empty schema for regression with no metrics" in { + val d = new MachineLearningScorerOpDesc + d.isRegression = true + val out = d.getOutputSchemas(Map.empty) + out.keySet shouldBe Set(d.operatorInfo.outputPorts.head.id) + out(d.operatorInfo.outputPorts.head.id).getAttributes shouldBe empty + } + + "MachineLearningScorerOpDesc.generatePythonCode" should "emit the scorer table operator" in { + val d = new MachineLearningScorerOpDesc + d.actualValueColumn = "y" + d.predictValueColumn = "yhat" + val code = d.generatePythonCode() + code should include("class ProcessTableOperator(UDFTableOperator)") + code should include("from sklearn.metrics import") + // actualValueColumn/predictValueColumn are EncodableString: base64-encoded into the emitted code. + code should include(Base64.getEncoder.encodeToString("y".getBytes(StandardCharsets.UTF_8))) + code should include(Base64.getEncoder.encodeToString("yhat".getBytes(StandardCharsets.UTF_8))) + } + + "MachineLearningScorerOpDesc" should "round-trip its config fields through the polymorphic base" in { + val d = new MachineLearningScorerOpDesc + d.isRegression = true + d.actualValueColumn = "y" + d.predictValueColumn = "yhat" + val restored = objectMapper.readValue(objectMapper.writeValueAsString(d), classOf[LogicalOp]) + restored shouldBe a[MachineLearningScorerOpDesc] + val s = restored.asInstanceOf[MachineLearningScorerOpDesc] + s.isRegression shouldBe true + s.actualValueColumn shouldBe "y" + s.predictValueColumn shouldBe "yhat" + } +} diff --git a/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/machineLearning/sklearnAdvanced/base/HyperParametersSpec.scala b/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/machineLearning/sklearnAdvanced/base/HyperParametersSpec.scala new file mode 100644 index 
0000000000..d0e6b96153 --- /dev/null +++ b/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/machineLearning/sklearnAdvanced/base/HyperParametersSpec.scala @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.texera.amber.operator.machineLearning.sklearnAdvanced.base + +import org.apache.texera.amber.util.JSONUtils.objectMapper +import org.scalatest.flatspec.AnyFlatSpec +import org.scalatest.matchers.should.Matchers + +class HyperParametersSpec extends AnyFlatSpec with Matchers { + + "HyperParameters" should + "default parameter/attribute/value to null and parametersSource to false" in { + val h = new HyperParameters[String] + h.parameter shouldBe null + h.attribute shouldBe null + h.value shouldBe null + h.parametersSource shouldBe false + } + + it should "allow all fields to be assigned post-construction" in { + val h = new HyperParameters[String] + h.parameter = "alpha" + h.attribute = "colA" + h.value = "0.5" + h.parametersSource = true + h.parameter shouldBe "alpha" + h.attribute shouldBe "colA" + h.value shouldBe "0.5" + h.parametersSource shouldBe true + } + + "HyperParameters" should "serialize attribute and value under their wire-keys" in { + val h = new HyperParameters[String] + h.attribute = "colA" + h.value = "0.5" + val tree = objectMapper.readTree(objectMapper.writeValueAsString(h)) + tree.get("attribute").asText shouldBe "colA" + tree.get("value").asText shouldBe "0.5" + } + + "HyperParameters JSON" should "omit null fields (Include.NON_NULL) for a fresh instance" in { + val tree = objectMapper.readTree(objectMapper.writeValueAsString(new HyperParameters[String])) + tree.has("parameter") shouldBe false + tree.has("attribute") shouldBe false + tree.has("value") shouldBe false + tree.has("parametersSource") shouldBe true + } + + it should "round-trip populated fields" in { + val h = new HyperParameters[String] + h.parameter = "alpha" + h.attribute = "colA" + h.value = "0.5" + h.parametersSource = true + val restored = + objectMapper.readValue(objectMapper.writeValueAsString(h), classOf[HyperParameters[String]]) + restored.parameter shouldBe "alpha" + restored.attribute shouldBe "colA" + restored.value shouldBe "0.5" + 
restored.parametersSource shouldBe true + } +} diff --git a/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/machineLearning/sklearnAdvanced/base/SklearnMLOperatorDescriptorSpec.scala b/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/machineLearning/sklearnAdvanced/base/SklearnMLOperatorDescriptorSpec.scala new file mode 100644 index 0000000000..eac74a54d6 --- /dev/null +++ b/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/machineLearning/sklearnAdvanced/base/SklearnMLOperatorDescriptorSpec.scala @@ -0,0 +1,72 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.texera.amber.operator.machineLearning.sklearnAdvanced.base + +import org.apache.texera.amber.core.tuple.{Attribute, AttributeType, Schema} +import org.apache.texera.amber.core.workflow.PortIdentity +import org.apache.texera.amber.operator.machineLearning.sklearnAdvanced.KNNTrainer.SklearnAdvancedKNNClassifierTrainerOpDesc +import org.apache.texera.amber.operator.metadata.OperatorGroupConstants +import org.scalatest.flatspec.AnyFlatSpec +import org.scalatest.matchers.should.Matchers + +class SklearnMLOperatorDescriptorSpec extends AnyFlatSpec with Matchers { + + // Exercise the abstract base's contract through a real concrete subclass — no test + // stub. A stub placed under org.apache.texera.amber.operator.* would be picked up by + // PythonCodeRawInvalidTextSpec's classpath scanner; using a real subclass avoids that. + private def newOp(): SklearnMLOperatorDescriptor[_] = + new SklearnAdvancedKNNClassifierTrainerOpDesc + + "SklearnMLOperatorDescriptor.operatorInfo" should + "derive name/description and advertise the training + parameter inputs and one output" in { + val info = newOp().operatorInfo + info.userFriendlyName shouldBe "KNN Classifier" + info.operatorDescription shouldBe "Sklearn KNN Classifier Operator" + info.operatorGroupName shouldBe OperatorGroupConstants.ADVANCED_SKLEARN_GROUP + info.inputPorts should have length 2 + info.inputPorts.head.id shouldBe PortIdentity(0) + info.inputPorts.head.displayName shouldBe "training" + info.inputPorts.last.id shouldBe PortIdentity(1) + info.inputPorts.last.displayName shouldBe "parameter" + info.inputPorts.last.dependencies shouldBe List(PortIdentity(0)) + info.outputPorts should have length 1 + } + + "SklearnMLOperatorDescriptor.getOutputSchemas" should + "produce the fixed Model/Parameters schema keyed by the declared output port" in { + val op = newOp() + op.getOutputSchemas(Map.empty) shouldBe Map( + op.operatorInfo.outputPorts.head.id -> Schema( + List( + new Attribute("Model", 
AttributeType.BINARY), + new Attribute("Parameters", AttributeType.STRING) + ) + ) + ) + } + + "SklearnMLOperatorDescriptor" should + "default paraList to empty, groundTruthAttribute to empty, and selectedFeatures to null" in { + val op = newOp() + op.paraList shouldBe empty + op.groundTruthAttribute shouldBe "" + op.selectedFeatures shouldBe null + } +}
