This is an automated email from the ASF dual-hosted git repository.
github-merge-queue[bot] pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/texera.git
The following commit(s) were added to refs/heads/main by this push:
new 2ebfc2814e test(workflow-operator): add unit test coverage for Sklearn
ridge/SGD/dummy classifier descriptors (#5946)
2ebfc2814e is described below
commit 2ebfc2814e31acb8eebda78d369ebd74ea3e1410
Author: Xinyuan Lin <[email protected]>
AuthorDate: Thu Jun 25 14:24:10 2026 -0700
test(workflow-operator): add unit test coverage for Sklearn ridge/SGD/dummy
classifier descriptors (#5946)
### What changes were proposed in this PR?
Pin behavior of four previously-untested Sklearn classifier descriptors
(ridge/SGD/dummy) in `common/workflow-operator`. No production-code
changes.
| Spec | Source class | Tests |
| --- | --- | --- |
| `SklearnRidgeOpDescSpec` | `SklearnRidgeOpDesc` | 5 |
| `SklearnRidgeCVOpDescSpec` | `SklearnRidgeCVOpDesc` | 5 |
| `SklearnSDGOpDescSpec` | `SklearnSDGOpDesc` | 5 |
| `SklearnDummyClassifierOpDescSpec` | `SklearnDummyClassifierOpDesc` | 5 |
**Behavior pinned**
| Surface | Contract |
| --- | --- |
| `operatorInfo` | exact model name + `Sklearn <name> Operator` description; Sklearn group; training/testing input ports + one blocking output |
| field defaults | `countVectorizer`/`tfidfTransformer` `false`; `target`/`text` `null` |
| `getOutputSchemas` | `model_name` (STRING) + `model` (BINARY) keyed by the declared output port |
| `generatePythonCode` | imports the matching sklearn estimator (`RidgeClassifier`/`RidgeClassifierCV`/`SGDClassifier`/`DummyClassifier`) and builds the `make_pipeline` model |
| Round-trip | config fields preserved through the polymorphic `LogicalOp` base, with the correct `operatorType` discriminator |
### Any related issues, documentation, discussions?
Part of the ongoing `workflow-operator` unit-test coverage effort
(follow-up to the Sklearn classifier coverage in #5925, #5939, #5940,
#5941).
### How was this PR tested?
- `sbt "WorkflowOperator/testOnly *SklearnRidgeOpDescSpec *SklearnRidgeCVOpDescSpec *SklearnSDGOpDescSpec *SklearnDummyClassifierOpDescSpec"` — 20 tests, all green
- `sbt "WorkflowOperator/Test/scalafmtCheck"` and `sbt "WorkflowOperator/scalafixAll --check"` — clean
- CI to confirm
### Was this PR authored or co-authored using generative AI tooling?
Generated-by: Claude Code (Opus 4.8 [1M context])
---
.../sklearn/SklearnDummyClassifierOpDescSpec.scala | 79 ++++++++++++++++++++++
.../sklearn/SklearnRidgeCVOpDescSpec.scala | 79 ++++++++++++++++++++++
.../operator/sklearn/SklearnRidgeOpDescSpec.scala | 79 ++++++++++++++++++++++
.../operator/sklearn/SklearnSDGOpDescSpec.scala | 79 ++++++++++++++++++++++
4 files changed, 316 insertions(+)
diff --git a/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/sklearn/SklearnDummyClassifierOpDescSpec.scala b/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/sklearn/SklearnDummyClassifierOpDescSpec.scala
new file mode 100644
index 0000000000..24b391b6bf
--- /dev/null
+++ b/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/sklearn/SklearnDummyClassifierOpDescSpec.scala
@@ -0,0 +1,79 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.texera.amber.operator.sklearn
+
+import org.apache.texera.amber.core.tuple.AttributeType
+import org.apache.texera.amber.operator.LogicalOp
+import org.apache.texera.amber.operator.metadata.OperatorGroupConstants
+import org.apache.texera.amber.util.JSONUtils.objectMapper
+import org.scalatest.flatspec.AnyFlatSpec
+import org.scalatest.matchers.should.Matchers
+
+class SklearnDummyClassifierOpDescSpec extends AnyFlatSpec with Matchers {
+  // Pins SklearnDummyClassifierOpDesc's contract: metadata, defaults, output schema, codegen, JSON.
+  "SklearnDummyClassifierOpDesc.operatorInfo" should
+    "advertise the model name, Sklearn group, and the training/testing port shape" in {
+    val info = (new SklearnDummyClassifierOpDesc).operatorInfo
+    info.userFriendlyName shouldBe "Dummy Classifier"
+    info.operatorDescription shouldBe "Sklearn Dummy Classifier Operator"
+    info.operatorGroupName shouldBe OperatorGroupConstants.SKLEARN_GROUP
+    info.inputPorts.map(_.displayName) shouldBe List("training", "testing")
+    info.outputPorts should have length 1
+    info.outputPorts.head.blocking shouldBe true
+  }
+
+  "SklearnDummyClassifierOpDesc" should "default its config fields" in {
+    val d = new SklearnDummyClassifierOpDesc
+    d.countVectorizer shouldBe false
+    d.tfidfTransformer shouldBe false
+    d.target shouldBe null
+    d.text shouldBe null
+  }
+
+  "SklearnDummyClassifierOpDesc.getOutputSchemas" should
+    "emit the model_name/model schema keyed by the declared output port" in {
+    val d = new SklearnDummyClassifierOpDesc
+    val schema = d.getOutputSchemas(Map.empty)(d.operatorInfo.outputPorts.head.id)
+    schema.getAttribute("model_name").getType shouldBe AttributeType.STRING
+    schema.getAttribute("model").getType shouldBe AttributeType.BINARY
+  }
+
+  "SklearnDummyClassifierOpDesc.generatePythonCode" should "import the configured sklearn estimator" in {
+    val d = new SklearnDummyClassifierOpDesc
+    d.target = "y"
+    val code = d.generatePythonCode()
+    code should include("from sklearn.dummy import DummyClassifier")
+    code should include("make_pipeline")
+    code should include("Dummy Classifier") // display name must appear in the generated code
+  }
+
+  "SklearnDummyClassifierOpDesc" should "round-trip its config fields through the polymorphic base" in {
+    val d = new SklearnDummyClassifierOpDesc
+    d.target = "label"
+    d.countVectorizer = true
+    val json = objectMapper.writeValueAsString(d)
+    json should include("\"operatorType\":\"SklearnDummyClassifier\"") // expected type discriminator
+    val restored = objectMapper.readValue(json, classOf[LogicalOp])
+    restored shouldBe a[SklearnDummyClassifierOpDesc] // type check guards the cast below
+    val r = restored.asInstanceOf[SklearnDummyClassifierOpDesc]
+    r.target shouldBe "label"
+    r.countVectorizer shouldBe true
+  }
+}
diff --git a/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/sklearn/SklearnRidgeCVOpDescSpec.scala b/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/sklearn/SklearnRidgeCVOpDescSpec.scala
new file mode 100644
index 0000000000..c3fe761c15
--- /dev/null
+++ b/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/sklearn/SklearnRidgeCVOpDescSpec.scala
@@ -0,0 +1,79 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.texera.amber.operator.sklearn
+
+import org.apache.texera.amber.core.tuple.AttributeType
+import org.apache.texera.amber.operator.LogicalOp
+import org.apache.texera.amber.operator.metadata.OperatorGroupConstants
+import org.apache.texera.amber.util.JSONUtils.objectMapper
+import org.scalatest.flatspec.AnyFlatSpec
+import org.scalatest.matchers.should.Matchers
+
+class SklearnRidgeCVOpDescSpec extends AnyFlatSpec with Matchers {
+  // Pins SklearnRidgeCVOpDesc's contract: metadata, defaults, output schema, codegen, JSON.
+  "SklearnRidgeCVOpDesc.operatorInfo" should
+    "advertise the model name, Sklearn group, and the training/testing port shape" in {
+    val info = (new SklearnRidgeCVOpDesc).operatorInfo
+    info.userFriendlyName shouldBe "Ridge Regression Cross Validation"
+    info.operatorDescription shouldBe "Sklearn Ridge Regression Cross Validation Operator"
+    info.operatorGroupName shouldBe OperatorGroupConstants.SKLEARN_GROUP
+    info.inputPorts.map(_.displayName) shouldBe List("training", "testing")
+    info.outputPorts should have length 1
+    info.outputPorts.head.blocking shouldBe true
+  }
+
+  "SklearnRidgeCVOpDesc" should "default its config fields" in {
+    val d = new SklearnRidgeCVOpDesc
+    d.countVectorizer shouldBe false
+    d.tfidfTransformer shouldBe false
+    d.target shouldBe null
+    d.text shouldBe null
+  }
+
+  "SklearnRidgeCVOpDesc.getOutputSchemas" should
+    "emit the model_name/model schema keyed by the declared output port" in {
+    val d = new SklearnRidgeCVOpDesc
+    val schema = d.getOutputSchemas(Map.empty)(d.operatorInfo.outputPorts.head.id)
+    schema.getAttribute("model_name").getType shouldBe AttributeType.STRING
+    schema.getAttribute("model").getType shouldBe AttributeType.BINARY
+  }
+
+  "SklearnRidgeCVOpDesc.generatePythonCode" should "import the configured sklearn estimator" in {
+    val d = new SklearnRidgeCVOpDesc
+    d.target = "y"
+    val code = d.generatePythonCode()
+    code should include("from sklearn.linear_model import RidgeClassifierCV")
+    code should include("make_pipeline")
+    code should include("Ridge Regression Cross Validation") // display name must appear in the code
+  }
+
+  "SklearnRidgeCVOpDesc" should "round-trip its config fields through the polymorphic base" in {
+    val d = new SklearnRidgeCVOpDesc
+    d.target = "label"
+    d.countVectorizer = true
+    val json = objectMapper.writeValueAsString(d)
+    json should include("\"operatorType\":\"SklearnRidgeCV\"") // expected type discriminator
+    val restored = objectMapper.readValue(json, classOf[LogicalOp])
+    restored shouldBe a[SklearnRidgeCVOpDesc] // type check guards the cast below
+    val r = restored.asInstanceOf[SklearnRidgeCVOpDesc]
+    r.target shouldBe "label"
+    r.countVectorizer shouldBe true
+  }
+}
diff --git a/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/sklearn/SklearnRidgeOpDescSpec.scala b/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/sklearn/SklearnRidgeOpDescSpec.scala
new file mode 100644
index 0000000000..1f24306bb6
--- /dev/null
+++ b/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/sklearn/SklearnRidgeOpDescSpec.scala
@@ -0,0 +1,79 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.texera.amber.operator.sklearn
+
+import org.apache.texera.amber.core.tuple.AttributeType
+import org.apache.texera.amber.operator.LogicalOp
+import org.apache.texera.amber.operator.metadata.OperatorGroupConstants
+import org.apache.texera.amber.util.JSONUtils.objectMapper
+import org.scalatest.flatspec.AnyFlatSpec
+import org.scalatest.matchers.should.Matchers
+
+class SklearnRidgeOpDescSpec extends AnyFlatSpec with Matchers {
+  // Pins SklearnRidgeOpDesc's contract: metadata, defaults, output schema, codegen, JSON.
+  "SklearnRidgeOpDesc.operatorInfo" should
+    "advertise the model name, Sklearn group, and the training/testing port shape" in {
+    val info = (new SklearnRidgeOpDesc).operatorInfo
+    info.userFriendlyName shouldBe "Ridge Regression"
+    info.operatorDescription shouldBe "Sklearn Ridge Regression Operator"
+    info.operatorGroupName shouldBe OperatorGroupConstants.SKLEARN_GROUP
+    info.inputPorts.map(_.displayName) shouldBe List("training", "testing")
+    info.outputPorts should have length 1
+    info.outputPorts.head.blocking shouldBe true
+  }
+
+  "SklearnRidgeOpDesc" should "default its config fields" in {
+    val d = new SklearnRidgeOpDesc
+    d.countVectorizer shouldBe false
+    d.tfidfTransformer shouldBe false
+    d.target shouldBe null
+    d.text shouldBe null
+  }
+
+  "SklearnRidgeOpDesc.getOutputSchemas" should
+    "emit the model_name/model schema keyed by the declared output port" in {
+    val d = new SklearnRidgeOpDesc
+    val schema = d.getOutputSchemas(Map.empty)(d.operatorInfo.outputPorts.head.id)
+    schema.getAttribute("model_name").getType shouldBe AttributeType.STRING
+    schema.getAttribute("model").getType shouldBe AttributeType.BINARY
+  }
+
+  "SklearnRidgeOpDesc.generatePythonCode" should "import the configured sklearn estimator" in {
+    val d = new SklearnRidgeOpDesc
+    d.target = "y"
+    val code = d.generatePythonCode()
+    code should include("from sklearn.linear_model import RidgeClassifier") // NOTE(review): also matches RidgeClassifierCV
+    code should include("make_pipeline")
+    code should include("Ridge Regression") // NOTE(review): also matches "Ridge Regression Cross Validation"
+  }
+
+  "SklearnRidgeOpDesc" should "round-trip its config fields through the polymorphic base" in {
+    val d = new SklearnRidgeOpDesc
+    d.target = "label"
+    d.countVectorizer = true
+    val json = objectMapper.writeValueAsString(d)
+    json should include("\"operatorType\":\"SklearnRidge\"") // closing quote keeps this from matching SklearnRidgeCV
+    val restored = objectMapper.readValue(json, classOf[LogicalOp])
+    restored shouldBe a[SklearnRidgeOpDesc] // type check guards the cast below
+    val r = restored.asInstanceOf[SklearnRidgeOpDesc]
+    r.target shouldBe "label"
+    r.countVectorizer shouldBe true
+  }
+}
diff --git a/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/sklearn/SklearnSDGOpDescSpec.scala b/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/sklearn/SklearnSDGOpDescSpec.scala
new file mode 100644
index 0000000000..516374b087
--- /dev/null
+++ b/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/sklearn/SklearnSDGOpDescSpec.scala
@@ -0,0 +1,79 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.texera.amber.operator.sklearn
+
+import org.apache.texera.amber.core.tuple.AttributeType
+import org.apache.texera.amber.operator.LogicalOp
+import org.apache.texera.amber.operator.metadata.OperatorGroupConstants
+import org.apache.texera.amber.util.JSONUtils.objectMapper
+import org.scalatest.flatspec.AnyFlatSpec
+import org.scalatest.matchers.should.Matchers
+
+class SklearnSDGOpDescSpec extends AnyFlatSpec with Matchers {
+  // Pins SklearnSDGOpDesc's contract: metadata, defaults, output schema, codegen, JSON.
+  "SklearnSDGOpDesc.operatorInfo" should
+    "advertise the model name, Sklearn group, and the training/testing port shape" in {
+    val info = (new SklearnSDGOpDesc).operatorInfo
+    info.userFriendlyName shouldBe "Stochastic Gradient Descent"
+    info.operatorDescription shouldBe "Sklearn Stochastic Gradient Descent Operator"
+    info.operatorGroupName shouldBe OperatorGroupConstants.SKLEARN_GROUP
+    info.inputPorts.map(_.displayName) shouldBe List("training", "testing")
+    info.outputPorts should have length 1
+    info.outputPorts.head.blocking shouldBe true
+  }
+
+  "SklearnSDGOpDesc" should "default its config fields" in {
+    val d = new SklearnSDGOpDesc
+    d.countVectorizer shouldBe false
+    d.tfidfTransformer shouldBe false
+    d.target shouldBe null
+    d.text shouldBe null
+  }
+
+  "SklearnSDGOpDesc.getOutputSchemas" should
+    "emit the model_name/model schema keyed by the declared output port" in {
+    val d = new SklearnSDGOpDesc
+    val schema = d.getOutputSchemas(Map.empty)(d.operatorInfo.outputPorts.head.id)
+    schema.getAttribute("model_name").getType shouldBe AttributeType.STRING
+    schema.getAttribute("model").getType shouldBe AttributeType.BINARY
+  }
+
+  "SklearnSDGOpDesc.generatePythonCode" should "import the configured sklearn estimator" in {
+    val d = new SklearnSDGOpDesc
+    d.target = "y"
+    val code = d.generatePythonCode()
+    code should include("from sklearn.linear_model import SGDClassifier")
+    code should include("make_pipeline")
+    code should include("Stochastic Gradient Descent") // display name must appear in the generated code
+  }
+
+  "SklearnSDGOpDesc" should "round-trip its config fields through the polymorphic base" in {
+    val d = new SklearnSDGOpDesc
+    d.target = "label"
+    d.countVectorizer = true
+    val json = objectMapper.writeValueAsString(d)
+    json should include("\"operatorType\":\"SklearnSDG\"") // expected type discriminator
+    val restored = objectMapper.readValue(json, classOf[LogicalOp])
+    restored shouldBe a[SklearnSDGOpDesc] // type check guards the cast below
+    val r = restored.asInstanceOf[SklearnSDGOpDesc]
+    r.target shouldBe "label"
+    r.countVectorizer shouldBe true
+  }
+}