This is an automated email from the ASF dual-hosted git repository.
mengw15 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/texera.git
The following commit(s) were added to refs/heads/main by this push:
new 782069dbfc feat: add ECDF plot visualization operator (#4406)
782069dbfc is described below
commit 782069dbfc76a2e168aef717bb0d6315a48cf96a
Author: Eugene Gu <[email protected]>
AuthorDate: Mon Apr 20 17:53:49 2026 -0700
feat: add ECDF plot visualization operator (#4406)
PR Description
## Purpose
Adds a new **Empirical Cumulative Distribution Function (ECDF)** plot
operator to the Statistical Visualization group, letting users visualize
the cumulative distribution of a numeric column and easily compare
distributions across groups.
## Summary
- New operator `ECDFPlotOpDesc` under
`operator/visualization/ecdfPlot/`, rendered via `plotly.express.ecdf`.
- Configurable fields:
- **Value Column** (required, numeric): column to compute ECDF on
- **Color Column** (optional): group and color lines by category
- **SeparateBy Column** (optional): split plot into facets
- **Y Axis Mode**: `probability` / `count` / `sum`
- **CDF Mode**: `standard` / `reversed` / `complementary`
- **Orientation**: `vertical` / `horizontal`
- **Show Markers** toggle (off by default)
- **Marginal Plot**: `none` (default) / `histogram` / `rug`
- Registered in `LogicalOp.scala` as the `ECDFPlot` operator type.
- Added operator icon
`frontend/src/assets/operator_images/ECDFPlot.png`.
- User-provided enum fields (`cdfMode`, `orientation`, `marginal`) use
`EncodableString` so generated Python safely passes
`PythonCodeRawInvalidTextSpec`.
- Unit tests in `ECDFPlotOpDescSpec` covering the empty-value assertion
and the generated figure with all optional parameters.
## Test
- [x] `sbt scalafmtCheckAll` passes
- [x] `sbt "scalafixAll --check"` passes
- [x] `sbt "WorkflowOperator/testOnly
org.apache.texera.amber.operator.visualization.ecdfPlot.ECDFPlotOpDescSpec
org.apache.texera.amber.util.PythonCodeRawInvalidTextSpec"`
— all tests pass (4/4), 110/110 raw-invalid OK, 110/110 py_compile OK
- [x] Manually tested end-to-end in the UI with CSV source → ECDF Plot
operator; verified the rendered plot in the result panel for all
combinations of color/facet/CDF mode/orientation/marginal options.
## Screenshots
<img width="3529" height="1962" alt="image"
src="https://github.com/user-attachments/assets/44392eaa-e6bb-48ee-80dc-a4c128425255"
/>
[ecdf_demo.csv](https://github.com/user-attachments/files/26914412/ecdf_demo.csv)
---------
Co-authored-by: Chen Li <[email protected]>
Co-authored-by: Meng Wang <[email protected]>
---
.../apache/texera/amber/operator/LogicalOp.scala | 2 +
.../visualization/ecdfPlot/ECDFPlotOpDesc.scala | 186 +++++++++++++++++++++
.../ecdfPlot/ECDFPlotOpDescSpec.scala | 61 +++++++
frontend/src/assets/operator_images/ECDFPlot.png | Bin 0 -> 1297 bytes
4 files changed, 249 insertions(+)
diff --git
a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/LogicalOp.scala
b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/LogicalOp.scala
index 3949b67be9..d9a8a97407 100644
---
a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/LogicalOp.scala
+++
b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/LogicalOp.scala
@@ -109,6 +109,7 @@ import
org.apache.texera.amber.operator.visualization.continuousErrorBands.Conti
import
org.apache.texera.amber.operator.visualization.contourPlot.ContourPlotOpDesc
import
org.apache.texera.amber.operator.visualization.dendrogram.DendrogramOpDesc
import
org.apache.texera.amber.operator.visualization.dumbbellPlot.DumbbellPlotOpDesc
+import org.apache.texera.amber.operator.visualization.ecdfPlot.ECDFPlotOpDesc
import
org.apache.texera.amber.operator.visualization.figureFactoryTable.FigureFactoryTableOpDesc
import
org.apache.texera.amber.operator.visualization.filledAreaPlot.FilledAreaPlotOpDesc
import
org.apache.texera.amber.operator.visualization.funnelPlot.FunnelPlotOpDesc
@@ -185,6 +186,7 @@ trait StateTransferFunc
new Type(value = classOf[CandlestickChartOpDesc], name =
"CandlestickChart"),
new Type(value = classOf[SplitOpDesc], name = "Split"),
new Type(value = classOf[ContourPlotOpDesc], name = "ContourPlot"),
+ new Type(value = classOf[ECDFPlotOpDesc], name = "ECDFPlot"),
new Type(value = classOf[RegexOpDesc], name = "Regex"),
new Type(value = classOf[SpecializedFilterOpDesc], name = "Filter"),
new Type(value = classOf[ProjectionOpDesc], name = "Projection"),
diff --git
a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/visualization/ecdfPlot/ECDFPlotOpDesc.scala
b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/visualization/ecdfPlot/ECDFPlotOpDesc.scala
new file mode 100644
index 0000000000..68625bdc4b
--- /dev/null
+++
b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/visualization/ecdfPlot/ECDFPlotOpDesc.scala
@@ -0,0 +1,186 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.texera.amber.operator.visualization.ecdfPlot
+
+import com.fasterxml.jackson.annotation.{JsonProperty, JsonPropertyDescription}
+import com.kjetland.jackson.jsonSchema.annotations.{JsonSchemaInject,
JsonSchemaTitle}
+import org.apache.texera.amber.core.tuple.{AttributeType, Schema}
+import org.apache.texera.amber.core.workflow.PortIdentity
+import org.apache.texera.amber.operator.PythonOperatorDescriptor
+import
org.apache.texera.amber.operator.metadata.annotations.AutofillAttributeName
+import org.apache.texera.amber.operator.metadata.{OperatorGroupConstants,
OperatorInfo}
+import org.apache.texera.amber.pybuilder.PyStringTypes.EncodableString
+import org.apache.texera.amber.pybuilder.PythonTemplateBuilder
+import
org.apache.texera.amber.pybuilder.PythonTemplateBuilder.PythonTemplateBuilderStringContext
+
+import javax.validation.constraints.NotNull
+
+/**
+  * Descriptor of the ECDF (empirical cumulative distribution function) plot operator.
+  *
+  * The descriptor generates a Python `ProcessTableOperator` that cleans the input table,
+  * calls `px.ecdf` with arguments derived from the fields below, and yields the rendered
+  * figure as HTML in a single `html-content` string column.
+  *
+  * NOTE(review): the column/enum fields are interpolated directly into `pyb` templates,
+  * exactly as in the first version of this file. Presumably the interpolator relies on
+  * their `EncodableString` type to encode user input; confirm before routing any of them
+  * through plain `String` locals or helpers.
+  */
+@JsonSchemaInject(
+  json = """{"attributeTypeRules":{"valueColumn":{"enum":["integer","long","double"]}}}"""
+)
+class ECDFPlotOpDesc extends PythonOperatorDescriptor {
+
+  @JsonProperty(required = true)
+  @JsonSchemaTitle("Value Column")
+  @JsonPropertyDescription("Numeric column used to compute the empirical cumulative distribution.")
+  @AutofillAttributeName
+  @NotNull(message = "Value column cannot be empty")
+  var valueColumn: EncodableString = ""
+
+  @JsonProperty(required = false)
+  @JsonSchemaTitle("Color Column")
+  @JsonPropertyDescription("Optional column for coloring ECDF lines by group.")
+  @AutofillAttributeName
+  var colorColumn: EncodableString = ""
+
+  @JsonProperty(required = false)
+  @JsonSchemaTitle("Separate By Column")
+  @JsonPropertyDescription("Optional column for splitting ECDF plots into subplots.")
+  @AutofillAttributeName
+  var separateBy: EncodableString = ""
+
+  @JsonProperty(required = false, defaultValue = "probability")
+  @JsonSchemaTitle("Y Axis Mode")
+  @JsonPropertyDescription("Display cumulative probability, raw count, or cumulative sum.")
+  @JsonSchemaInject(
+    json = """{ "enum": ["probability", "count", "sum"], "default": "probability" }"""
+  )
+  var yAxisMode: String = "probability"
+
+  @JsonProperty(required = false, defaultValue = "standard")
+  @JsonSchemaTitle("CDF Mode")
+  @JsonPropertyDescription(
+    "'standard' shows P(X ≤ x), 'reversed' shows P(X ≥ x), " +
+      "'complementary' shows 1 - P(X ≤ x)."
+  )
+  @JsonSchemaInject(
+    json = """{ "enum": ["standard", "reversed", "complementary"], "default": "standard" }"""
+  )
+  var cdfMode: EncodableString = "standard"
+
+  @JsonProperty(required = false, defaultValue = "vertical")
+  @JsonSchemaTitle("Orientation")
+  @JsonPropertyDescription("Plot ECDF vertically or horizontally.")
+  @JsonSchemaInject(json = """{ "enum": ["vertical", "horizontal"], "default": "vertical" }""")
+  var orientation: EncodableString = "vertical"
+
+  @JsonProperty(required = false, defaultValue = "false")
+  @JsonSchemaTitle("Show Markers")
+  @JsonPropertyDescription("Display sample markers on the ECDF line.")
+  var showMarkers: Boolean = false
+
+  @JsonProperty(required = false, defaultValue = "none")
+  @JsonSchemaTitle("Marginal Plot")
+  @JsonPropertyDescription("Optional marginal plot to display alongside the ECDF.")
+  @JsonSchemaInject(
+    json = """{ "enum": ["none", "histogram", "rug"], "default": "none" }"""
+  )
+  var marginal: EncodableString = "none"
+
+  override def operatorInfo: OperatorInfo =
+    OperatorInfo.forVisualization(
+      "Empirical Cumulative Distribution Plot",
+      "Visualize the empirical cumulative distribution of a numeric column.",
+      OperatorGroupConstants.VISUALIZATION_STATISTICAL_GROUP
+    )
+
+  /**
+    * The operator always emits a single string column, `html-content`, on its first
+    * output port; the input schemas are not consulted.
+    */
+  override def getOutputSchemas(
+      inputSchemas: Map[PortIdentity, Schema]
+  ): Map[PortIdentity, Schema] = {
+    val outputSchema = Schema().add("html-content", AttributeType.STRING)
+    Map(operatorInfo.outputPorts.head.id -> outputSchema)
+  }
+
+  /**
+    * Fails with an `AssertionError` when no value column is configured.
+    *
+    * A null column (for example an explicit JSON `null`) is reported the same way as an
+    * empty one instead of surfacing as a `NullPointerException`.
+    */
+  private def assertValueColumnSet(): Unit =
+    assert(valueColumn != null && valueColumn.nonEmpty, "Value column cannot be empty")
+
+  /**
+    * Builds the Python snippet that prepares `table` for plotting.
+    *
+    * The generated code drops rows with missing values in every column the plot uses
+    * (value column plus the color / separate-by columns when set), coerces the value
+    * column with `pd.to_numeric(..., errors='coerce')`, and drops the rows whose value
+    * could not be converted.
+    *
+    * The snippet lines are indented by eight spaces so they sit inside the body of
+    * `process_table` in [[generatePythonCode]].
+    *
+    * @return the template fragment to splice into `process_table`
+    * @throws AssertionError if the value column is not set
+    */
+  def manipulateTable(): PythonTemplateBuilder = {
+    assertValueColumnSet()
+    val requiredCols =
+      List(
+        Some(pyb"$valueColumn"),
+        Option.when(colorColumn.nonEmpty)(pyb"$colorColumn"),
+        Option.when(separateBy.nonEmpty)(pyb"$separateBy")
+      ).flatten
+    val requiredColsExpr = requiredCols.mkString(", ")
+
+    pyb"""
+       |        required_cols = [$requiredColsExpr]
+       |        table.dropna(subset=required_cols, inplace=True)
+       |        table[$valueColumn] = pd.to_numeric(table[$valueColumn], errors='coerce')
+       |        table.dropna(subset=[$valueColumn], inplace=True)
+       |"""
+  }
+
+  /**
+    * Builds the Python snippet that creates `fig` through `px.ecdf`.
+    *
+    * Arguments are emitted in a fixed order and only when they differ from the defaults
+    * this descriptor assumes:
+    *  - `table` and `x=<value column>` are always present;
+    *  - `color` / `facet_col` when the color / separate-by column is set;
+    *  - `ecdfnorm=None` for the "count" and "sum" Y axis modes, plus `y=<value column>`
+    *    for "sum";
+    *  - `ecdfmode` when the CDF mode is not "standard";
+    *  - `orientation='h'` when the orientation is "horizontal";
+    *  - `markers=True` when markers are enabled;
+    *  - `marginal` when the marginal plot is not "none".
+    *
+    * @return the template fragment to splice into `process_table`
+    * @throws AssertionError if the value column is not set
+    */
+  def createPlotlyFigure(): PythonTemplateBuilder = {
+    assertValueColumnSet()
+
+    val cumulativeSum = yAxisMode == "sum"
+    val unnormalized = cumulativeSum || yAxisMode == "count"
+
+    // Same immutable Option-list idiom as manipulateTable; order matches the call site.
+    val args: List[PythonTemplateBuilder] =
+      List(
+        Some(pyb"table"),
+        Some(pyb"x=$valueColumn"),
+        Option.when(colorColumn.nonEmpty)(pyb"color=$colorColumn"),
+        Option.when(separateBy.nonEmpty)(pyb"facet_col=$separateBy"),
+        Option.when(unnormalized)(pyb"ecdfnorm=None"),
+        Option.when(cumulativeSum)(pyb"y=$valueColumn"),
+        Option.when(cdfMode != "standard")(pyb"ecdfmode=$cdfMode"),
+        Option.when(orientation == "horizontal")(pyb"orientation='h'"),
+        Option.when(showMarkers)(pyb"markers=True"),
+        Option.when(marginal != "none")(pyb"marginal=$marginal")
+      ).flatten
+
+    val joinedArgs = args.mkString(", ")
+    pyb"""
+       |        fig = px.ecdf($joinedArgs)
+       |        fig.update_layout(margin=dict(l=0, r=0, t=30, b=0))
+       |"""
+  }
+
+  /**
+    * Generates the full Python source of the operator.
+    *
+    * The generated `process_table` yields an error page when the input table is empty or
+    * when no rows survive the cleaning step; otherwise it yields the Plotly figure
+    * rendered with `plotly.io.to_html`.
+    *
+    * @return the encoded Python code produced from the template
+    */
+  override def generatePythonCode(): String = {
+    val finalCode =
+      pyb"""
+         |from pytexera import *
+         |
+         |import pandas as pd
+         |import plotly.express as px
+         |import plotly.io
+         |
+         |class ProcessTableOperator(UDFTableOperator):
+         |    def render_error(self, error_msg):
+         |        return '''<h1>Empirical cumulative distribution plot is not available.</h1>
+         |                  <p>Reason is: {} </p>
+         |               '''.format(error_msg)
+         |
+         |    @overrides
+         |    def process_table(self, table: Table, port: int) -> Iterator[Optional[TableLike]]:
+         |        if table.empty:
+         |            yield {'html-content': self.render_error("input table is empty.")}
+         |            return
+         |        ${manipulateTable()}
+         |        if table.empty:
+         |            yield {'html-content': self.render_error("no valid rows left after removing missing or non-numeric values.")}
+         |            return
+         |        ${createPlotlyFigure()}
+         |        html = plotly.io.to_html(fig, include_plotlyjs='cdn', auto_play=False)
+         |        yield {'html-content': html}
+         |"""
+    finalCode.encode
+  }
+}
diff --git
a/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/visualization/ecdfPlot/ECDFPlotOpDescSpec.scala
b/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/visualization/ecdfPlot/ECDFPlotOpDescSpec.scala
new file mode 100644
index 0000000000..bc565e4d3a
--- /dev/null
+++
b/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/visualization/ecdfPlot/ECDFPlotOpDescSpec.scala
@@ -0,0 +1,61 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.texera.amber.operator.visualization.ecdfPlot
+
+import org.scalatest.BeforeAndAfter
+import org.scalatest.flatspec.AnyFlatSpec
+
+/** Unit tests for the Python snippets generated by [[ECDFPlotOpDesc]]. */
+class ECDFPlotOpDescSpec extends AnyFlatSpec with BeforeAndAfter {
+
+  // Descriptor under test; replaced by a fresh instance before every test case.
+  var opDesc: ECDFPlotOpDesc = _
+
+  before {
+    opDesc = new ECDFPlotOpDesc()
+  }
+
+  it should "throw assertion error if value column is empty" in {
+    // A freshly created descriptor still has its default, empty value column.
+    assertThrows[AssertionError](opDesc.manipulateTable())
+  }
+
+  it should "generate a plotly ecdf figure with optional parameters" in {
+    // Set every optional field to a non-default value so each argument is emitted.
+    opDesc.valueColumn = "score"
+    opDesc.colorColumn = "group"
+    opDesc.separateBy = "category"
+    opDesc.yAxisMode = "count"
+    opDesc.cdfMode = "reversed"
+    opDesc.orientation = "horizontal"
+    opDesc.showMarkers = true
+    opDesc.marginal = "histogram"
+
+    val rendered = opDesc.createPlotlyFigure().plain
+
+    // Fragments that must all appear in the rendered snippet, checked in this order.
+    val expectedFragments = Seq(
+      "fig = px.ecdf(table",
+      "ecdfnorm=None",
+      "ecdfmode=self.decode_python_template",
+      "orientation='h'",
+      "markers=True",
+      "marginal=self.decode_python_template",
+      "x=self.decode_python_template",
+      "color=self.decode_python_template",
+      "facet_col=self.decode_python_template"
+    )
+    expectedFragments.foreach(fragment => assert(rendered.contains(fragment)))
+  }
+}
diff --git a/frontend/src/assets/operator_images/ECDFPlot.png
b/frontend/src/assets/operator_images/ECDFPlot.png
new file mode 100644
index 0000000000..85c3bc5a95
Binary files /dev/null and b/frontend/src/assets/operator_images/ECDFPlot.png
differ