This is an automated email from the ASF dual-hosted git repository.

mengw15 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/texera.git


The following commit(s) were added to refs/heads/main by this push:
     new 782069dbfc feat: add ECDF plot visualization operator (#4406)
782069dbfc is described below

commit 782069dbfc76a2e168aef717bb0d6315a48cf96a
Author: Eugene Gu <[email protected]>
AuthorDate: Mon Apr 20 17:53:49 2026 -0700

    feat: add ECDF plot visualization operator (#4406)
    
    PR Description
    
    ## Purpose
    
    Adds a new **Empirical Cumulative Distribution Function (ECDF)** plot
    operator to the Statistical Visualization group, letting users visualize
    the cumulative distribution of a numeric column and easily compare
    distributions across groups.
    
    ## Summary
    - New operator `ECDFPlotOpDesc` under
    `operator/visualization/ecdfPlot/`, rendered via `plotly.express.ecdf`.
    
    - Configurable fields:
      - **Value Column** (required, numeric): column to compute ECDF on
      - **Color Column** (optional): group and color lines by category
      - **SeparateBy Column** (optional): split plot into facets
      - **Y Axis Mode**: `probability` / `count` / `sum`
      - **CDF Mode**: `standard` / `reversed` / `complementary`
      - **Orientation**: `vertical` / `horizontal`
      - **Show Markers** toggle
      - **Marginal Plot**: `none` / `histogram` / `rug`
    - Registered in `LogicalOp.scala` as the `ECDFPlot` operator type.
    - Added operator icon
    `frontend/src/assets/operator_images/ECDFPlot.png`.
    - User-provided enum fields (`cdfMode`, `orientation`, `marginal`) use
    `EncodableString` so generated Python safely passes
    `PythonCodeRawInvalidTextSpec`.
    - Unit tests in `ECDFPlotOpDescSpec` covering the empty-value assertion
    and the generated figure with all optional parameters.
    
    ## Test
    - [x] `sbt scalafmtCheckAll` passes
    - [x] `sbt "scalafixAll --check"` passes
    - [x] `sbt "WorkflowOperator/testOnly
    org.apache.texera.amber.operator.visualization.ecdfPlot.ECDFPlotOpDescSpec
    org.apache.texera.amber.util.PythonCodeRawInvalidTextSpec"`
    — all tests pass (4/4), 110/110 raw-invalid OK, 110/110 py_compile OK
    - [x] Manually tested end-to-end in the UI with CSV source → ECDF Plot
    operator; verified the rendered plot in the result panel for all
    combinations of color/facet/CDF mode/orientation/marginal options.
    
    ## Screenshots
    <img width="3529" height="1962" alt="image"
    src="https://github.com/user-attachments/assets/44392eaa-e6bb-48ee-80dc-a4c128425255"
    />
    
    
    [ecdf_demo.csv](https://github.com/user-attachments/files/26914412/ecdf_demo.csv)
    
    ---------
    
    Co-authored-by: Chen Li <[email protected]>
    Co-authored-by: Meng Wang <[email protected]>
---
 .../apache/texera/amber/operator/LogicalOp.scala   |   2 +
 .../visualization/ecdfPlot/ECDFPlotOpDesc.scala    | 186 +++++++++++++++++++++
 .../ecdfPlot/ECDFPlotOpDescSpec.scala              |  61 +++++++
 frontend/src/assets/operator_images/ECDFPlot.png   | Bin 0 -> 1297 bytes
 4 files changed, 249 insertions(+)

diff --git 
a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/LogicalOp.scala
 
b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/LogicalOp.scala
index 3949b67be9..d9a8a97407 100644
--- 
a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/LogicalOp.scala
+++ 
b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/LogicalOp.scala
@@ -109,6 +109,7 @@ import 
org.apache.texera.amber.operator.visualization.continuousErrorBands.Conti
 import 
org.apache.texera.amber.operator.visualization.contourPlot.ContourPlotOpDesc
 import 
org.apache.texera.amber.operator.visualization.dendrogram.DendrogramOpDesc
 import 
org.apache.texera.amber.operator.visualization.dumbbellPlot.DumbbellPlotOpDesc
+import org.apache.texera.amber.operator.visualization.ecdfPlot.ECDFPlotOpDesc
 import 
org.apache.texera.amber.operator.visualization.figureFactoryTable.FigureFactoryTableOpDesc
 import 
org.apache.texera.amber.operator.visualization.filledAreaPlot.FilledAreaPlotOpDesc
 import 
org.apache.texera.amber.operator.visualization.funnelPlot.FunnelPlotOpDesc
@@ -185,6 +186,7 @@ trait StateTransferFunc
     new Type(value = classOf[CandlestickChartOpDesc], name = 
"CandlestickChart"),
     new Type(value = classOf[SplitOpDesc], name = "Split"),
     new Type(value = classOf[ContourPlotOpDesc], name = "ContourPlot"),
+    new Type(value = classOf[ECDFPlotOpDesc], name = "ECDFPlot"),
     new Type(value = classOf[RegexOpDesc], name = "Regex"),
     new Type(value = classOf[SpecializedFilterOpDesc], name = "Filter"),
     new Type(value = classOf[ProjectionOpDesc], name = "Projection"),
diff --git 
a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/visualization/ecdfPlot/ECDFPlotOpDesc.scala
 
b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/visualization/ecdfPlot/ECDFPlotOpDesc.scala
new file mode 100644
index 0000000000..68625bdc4b
--- /dev/null
+++ 
b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/visualization/ecdfPlot/ECDFPlotOpDesc.scala
@@ -0,0 +1,186 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.texera.amber.operator.visualization.ecdfPlot
+
+import com.fasterxml.jackson.annotation.{JsonProperty, JsonPropertyDescription}
+import com.kjetland.jackson.jsonSchema.annotations.{JsonSchemaInject, 
JsonSchemaTitle}
+import org.apache.texera.amber.core.tuple.{AttributeType, Schema}
+import org.apache.texera.amber.core.workflow.PortIdentity
+import org.apache.texera.amber.operator.PythonOperatorDescriptor
+import 
org.apache.texera.amber.operator.metadata.annotations.AutofillAttributeName
+import org.apache.texera.amber.operator.metadata.{OperatorGroupConstants, 
OperatorInfo}
+import org.apache.texera.amber.pybuilder.PyStringTypes.EncodableString
+import org.apache.texera.amber.pybuilder.PythonTemplateBuilder
+import 
org.apache.texera.amber.pybuilder.PythonTemplateBuilder.PythonTemplateBuilderStringContext
+
+import javax.validation.constraints.NotNull
+
+/**
+ * Descriptor for the ECDF (empirical cumulative distribution function) plot operator.
+ *
+ * The descriptor generates the source of a Python `ProcessTableOperator` that cleans the
+ * input table, calls `plotly.express.ecdf` on it and yields the rendered figure as a
+ * single `html-content` string.
+ */
+@JsonSchemaInject(
+  json = """{"attributeTypeRules":{"valueColumn":{"enum":["integer","long","double"]}}}"""
+)
+class ECDFPlotOpDesc extends PythonOperatorDescriptor {
+
+  @JsonProperty(required = true)
+  @JsonSchemaTitle("Value Column")
+  @JsonPropertyDescription("Numeric column used to compute the empirical cumulative distribution.")
+  @AutofillAttributeName
+  @NotNull(message = "Value column cannot be empty")
+  var valueColumn: EncodableString = ""
+
+  // An empty string means "no color grouping": the `color` argument is then omitted.
+  @JsonProperty(required = false)
+  @JsonSchemaTitle("Color Column")
+  @JsonPropertyDescription("Optional column for coloring ECDF lines by group.")
+  @AutofillAttributeName
+  var colorColumn: EncodableString = ""
+
+  // An empty string means "no faceting": the `facet_col` argument is then omitted.
+  @JsonProperty(required = false)
+  @JsonSchemaTitle("Separate By Column")
+  @JsonPropertyDescription("Optional column for splitting ECDF plots into subplots.")
+  @AutofillAttributeName
+  var separateBy: EncodableString = ""
+
+  // Plain String: this value only selects which keyword arguments are emitted and is
+  // never interpolated into the generated Python itself.
+  @JsonProperty(required = false, defaultValue = "probability")
+  @JsonSchemaTitle("Y Axis Mode")
+  @JsonPropertyDescription("Display cumulative probability, raw count, or cumulative sum.")
+  @JsonSchemaInject(
+    json = """{ "enum": ["probability", "count", "sum"], "default": "probability" }"""
+  )
+  var yAxisMode: String = "probability"
+
+  @JsonProperty(required = false, defaultValue = "standard")
+  @JsonSchemaTitle("CDF Mode")
+  @JsonPropertyDescription(
+    "'standard' shows P(X ≤ x), 'reversed' shows P(X ≥ x), " +
+      "'complementary' shows 1 - P(X ≤ x)."
+  )
+  @JsonSchemaInject(
+    json = """{ "enum": ["standard", "reversed", "complementary"], "default": "standard" }"""
+  )
+  var cdfMode: EncodableString = "standard"
+
+  @JsonProperty(required = false, defaultValue = "vertical")
+  @JsonSchemaTitle("Orientation")
+  @JsonPropertyDescription("Plot ECDF vertically or horizontally.")
+  @JsonSchemaInject(json = """{ "enum": ["vertical", "horizontal"], "default": "vertical" }""")
+  var orientation: EncodableString = "vertical"
+
+  @JsonProperty(required = false, defaultValue = "false")
+  @JsonSchemaTitle("Show Markers")
+  @JsonPropertyDescription("Display sample markers on the ECDF line.")
+  var showMarkers: Boolean = false
+
+  // "none" is a sentinel: no `marginal` argument is passed to px.ecdf for it.
+  @JsonProperty(required = false, defaultValue = "none")
+  @JsonSchemaTitle("Marginal Plot")
+  @JsonPropertyDescription("Optional marginal plot to display alongside the ECDF.")
+  @JsonSchemaInject(
+    json = """{ "enum": ["none", "histogram", "rug"], "default": "none" }"""
+  )
+  var marginal: EncodableString = "none"
+
+  /** Display name, description and operator group shown for this visualization. */
+  override def operatorInfo: OperatorInfo =
+    OperatorInfo.forVisualization(
+      "Empirical Cumulative Distribution Plot",
+      "Visualize the empirical cumulative distribution of a numeric column.",
+      OperatorGroupConstants.VISUALIZATION_STATISTICAL_GROUP
+    )
+
+  /**
+   * The output always has a single STRING attribute named `html-content`, attached to the
+   * first output port; the input schemas are not consulted.
+   */
+  override def getOutputSchemas(
+      inputSchemas: Map[PortIdentity, Schema]
+  ): Map[PortIdentity, Schema] = {
+    val outputSchema = Schema().add("html-content", AttributeType.STRING)
+    Map(operatorInfo.outputPorts.head.id -> outputSchema)
+  }
+
+  /**
+   * Builds the Python fragment that cleans `table` in place before plotting.
+   *
+   * The fragment drops rows with missing values in any referenced column (value, and the
+   * color / separate-by columns when set), coerces the value column to numeric and drops
+   * the rows that could not be converted.
+   *
+   * @throws AssertionError if `valueColumn` is empty
+   */
+  def manipulateTable(): PythonTemplateBuilder = {
+    assert(valueColumn.nonEmpty, "Value column cannot be empty")
+    val requiredCols =
+      List(
+        Some(pyb"$valueColumn"),
+        Option.when(colorColumn.nonEmpty)(pyb"$colorColumn"),
+        Option.when(separateBy.nonEmpty)(pyb"$separateBy")
+      ).flatten
+    val requiredColsExpr = requiredCols.mkString(", ")
+
+    pyb"""
+       |        required_cols = [$requiredColsExpr]
+       |        table.dropna(subset=required_cols, inplace=True)
+       |        table[$valueColumn] = pd.to_numeric(table[$valueColumn], errors='coerce')
+       |        table.dropna(subset=[$valueColumn], inplace=True)
+       |"""
+  }
+
+  /**
+   * Builds the Python fragment that assigns the `px.ecdf` figure to `fig`.
+   *
+   * Only arguments that differ from the defaults are emitted, always in the order:
+   * table, x, color, facet_col, ecdfnorm, y, ecdfmode, orientation, markers, marginal.
+   *
+   * @throws AssertionError if `valueColumn` is empty
+   */
+  def createPlotlyFigure(): PythonTemplateBuilder = {
+    assert(valueColumn.nonEmpty, "Value column cannot be empty")
+
+    // "count" and "sum" both switch normalization off; "sum" additionally passes the
+    // value column as `y`.
+    val unnormalized = yAxisMode == "count" || yAxisMode == "sum"
+
+    // NOTE(review): the value column is always passed as `x`, including when
+    // orientation='h'. Confirm against the px.ecdf documentation that the horizontal
+    // plot still shows the distribution of `x` rather than expecting the values in `y`.
+    val args: List[PythonTemplateBuilder] =
+      List(
+        Some(pyb"table"),
+        Some(pyb"x=$valueColumn"),
+        Option.when(colorColumn.nonEmpty)(pyb"color=$colorColumn"),
+        Option.when(separateBy.nonEmpty)(pyb"facet_col=$separateBy"),
+        Option.when(unnormalized)(pyb"ecdfnorm=None"),
+        Option.when(yAxisMode == "sum")(pyb"y=$valueColumn"),
+        Option.when(cdfMode != "standard")(pyb"ecdfmode=$cdfMode"),
+        Option.when(orientation == "horizontal")(pyb"orientation='h'"),
+        Option.when(showMarkers)(pyb"markers=True"),
+        Option.when(marginal != "none")(pyb"marginal=$marginal")
+      ).flatten
+
+    val joinedArgs = args.mkString(", ")
+    pyb"""
+       |        fig = px.ecdf($joinedArgs)
+       |        fig.update_layout(margin=dict(l=0, r=0, t=30, b=0))
+       |"""
+  }
+
+  /**
+   * Assembles the complete Python operator source and returns its encoded form.
+   *
+   * The generated `process_table` yields an error page when the table is empty on
+   * arrival or becomes empty after cleaning; otherwise it yields the plotly HTML.
+   */
+  override def generatePythonCode(): String = {
+    val finalCode =
+      pyb"""
+         |from pytexera import *
+         |
+         |import pandas as pd
+         |import plotly.express as px
+         |import plotly.io
+         |
+         |class ProcessTableOperator(UDFTableOperator):
+         |    def render_error(self, error_msg):
+         |        return '''<h1>Empirical cumulative distribution plot is not available.</h1>
+         |                  <p>Reason is: {} </p>
+         |               '''.format(error_msg)
+         |
+         |    @overrides
+         |    def process_table(self, table: Table, port: int) -> Iterator[Optional[TableLike]]:
+         |        if table.empty:
+         |            yield {'html-content': self.render_error("input table is empty.")}
+         |            return
+         |        ${manipulateTable()}
+         |        if table.empty:
+         |            yield {'html-content': self.render_error("no valid rows left after removing missing or non-numeric values.")}
+         |            return
+         |        ${createPlotlyFigure()}
+         |        html = plotly.io.to_html(fig, include_plotlyjs='cdn', auto_play=False)
+         |        yield {'html-content': html}
+         |"""
+    finalCode.encode
+  }
+}
diff --git 
a/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/visualization/ecdfPlot/ECDFPlotOpDescSpec.scala
 
b/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/visualization/ecdfPlot/ECDFPlotOpDescSpec.scala
new file mode 100644
index 0000000000..bc565e4d3a
--- /dev/null
+++ 
b/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/visualization/ecdfPlot/ECDFPlotOpDescSpec.scala
@@ -0,0 +1,61 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.texera.amber.operator.visualization.ecdfPlot
+
+import org.scalatest.BeforeAndAfter
+import org.scalatest.flatspec.AnyFlatSpec
+
+/** Unit tests for the Python code fragments generated by [[ECDFPlotOpDesc]]. */
+class ECDFPlotOpDescSpec extends AnyFlatSpec with BeforeAndAfter {
+
+  var opDesc: ECDFPlotOpDesc = _
+
+  // Every test starts from a descriptor with default field values.
+  before {
+    opDesc = new ECDFPlotOpDesc()
+  }
+
+  it should "throw assertion error if value column is empty" in {
+    assertThrows[AssertionError] {
+      opDesc.manipulateTable()
+    }
+  }
+
+  it should "throw assertion error when building the figure with an empty value column" in {
+    assertThrows[AssertionError] {
+      opDesc.createPlotlyFigure()
+    }
+  }
+
+  it should "omit optional arguments when only the value column is set" in {
+    opDesc.valueColumn = "score"
+
+    val plain = opDesc.createPlotlyFigure().plain
+
+    assert(plain.contains("fig = px.ecdf(table"))
+    assert(plain.contains("x=self.decode_python_template"))
+    // Defaults must not leak any optional keyword argument into the call.
+    assert(!plain.contains("color="))
+    assert(!plain.contains("facet_col="))
+    assert(!plain.contains("ecdfnorm=None"))
+    assert(!plain.contains("ecdfmode="))
+    assert(!plain.contains("orientation='h'"))
+    assert(!plain.contains("markers=True"))
+    assert(!plain.contains("marginal="))
+  }
+
+  it should "generate a plotly ecdf figure with optional parameters" in {
+    opDesc.valueColumn = "score"
+    opDesc.colorColumn = "group"
+    opDesc.separateBy = "category"
+    opDesc.yAxisMode = "count"
+    opDesc.cdfMode = "reversed"
+    opDesc.orientation = "horizontal"
+    opDesc.showMarkers = true
+    opDesc.marginal = "histogram"
+
+    val plain = opDesc.createPlotlyFigure().plain
+
+    assert(plain.contains("fig = px.ecdf(table"))
+    assert(plain.contains("ecdfnorm=None"))
+    assert(plain.contains("ecdfmode=self.decode_python_template"))
+    assert(plain.contains("orientation='h'"))
+    assert(plain.contains("markers=True"))
+    assert(plain.contains("marginal=self.decode_python_template"))
+    assert(plain.contains("x=self.decode_python_template"))
+    assert(plain.contains("color=self.decode_python_template"))
+    assert(plain.contains("facet_col=self.decode_python_template"))
+  }
+}
diff --git a/frontend/src/assets/operator_images/ECDFPlot.png 
b/frontend/src/assets/operator_images/ECDFPlot.png
new file mode 100644
index 0000000000..85c3bc5a95
Binary files /dev/null and b/frontend/src/assets/operator_images/ECDFPlot.png 
differ

Reply via email to