This is an automated email from the ASF dual-hosted git repository.

xuang7 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/texera.git


The following commit(s) were added to refs/heads/main by this push:
     new ac55403592 fix: require non-null decodingMethod in 
URLFetcherOpDesc.sourceSchema (#5048)
ac55403592 is described below

commit ac5540359290afebee35889f47342ff296a875cb
Author: Matthew B. <[email protected]>
AuthorDate: Wed May 13 15:22:04 2026 -0700

    fix: require non-null decodingMethod in URLFetcherOpDesc.sourceSchema 
(#5048)
    
    ### What changes were proposed in this PR?
    `URLFetcherOpDesc.decodingMethod` defaults to `null` (from
    `var decodingMethod: DecodingMethod = _`). `sourceSchema()` branched on
    `if (decodingMethod == DecodingMethod.UTF_8) STRING else ANY`, so a null
    value silently produced an ANY column even though the JSON schema marks
    the field `required = true`. This PR adds a
    `require(decodingMethod != null, ...)` precondition at the top of
    `sourceSchema()`, surfacing the misconfiguration as a clear
    `IllegalArgumentException` instead of a silent ANY-typed output.
    
    ### Any related issues, documentation, or discussions?
    Closes: #4815
    
    ### How was this PR tested?
    Updated `URLFetcherOpDescSpec`: flipped the previously pinned "default
    to ANY when decodingMethod is left unset" case to assert that
    `sourceSchema()` now throws `IllegalArgumentException`. Existing UTF-8 /
    RAW_BYTES / physical-op / schema-propagation specs continue to pass. Ran
    `sbt "project WorkflowOperator" "testOnly org.apache.texera.amber.operator.source.fetcher.URLFetcherOpDescSpec"`;
    all 7 tests pass.
    
    ### Was this PR authored or co-authored using generative AI tooling?
    Co-authored with Claude Opus 4.7 in compliance with ASF
    
    ---------
    
    Co-authored-by: Xuan Gu <[email protected]>
---
 .../operator/source/fetcher/URLFetcherOpDesc.scala      |  4 ++++
 .../operator/source/fetcher/URLFetcherOpDescSpec.scala  | 17 ++++++++---------
 .../operator/source/fetcher/URLFetcherOpExecSpec.scala  |  3 ---
 3 files changed, 12 insertions(+), 12 deletions(-)

diff --git 
a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/source/fetcher/URLFetcherOpDesc.scala
 
b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/source/fetcher/URLFetcherOpDesc.scala
index f8b840b28c..7abe83945d 100644
--- 
a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/source/fetcher/URLFetcherOpDesc.scala
+++ 
b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/source/fetcher/URLFetcherOpDesc.scala
@@ -46,6 +46,10 @@ class URLFetcherOpDesc extends SourceOperatorDescriptor {
   var decodingMethod: DecodingMethod = _
 
   override def sourceSchema(): Schema = {
+    require(
+      decodingMethod != null,
+      "URLFetcherOpDesc.decodingMethod must be set before sourceSchema is 
computed"
+    )
     Schema()
       .add(
         "URL content",
diff --git 
a/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/source/fetcher/URLFetcherOpDescSpec.scala
 
b/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/source/fetcher/URLFetcherOpDescSpec.scala
index 3b8c306b73..34b7ec8f6f 100644
--- 
a/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/source/fetcher/URLFetcherOpDescSpec.scala
+++ 
b/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/source/fetcher/URLFetcherOpDescSpec.scala
@@ -67,17 +67,16 @@ class URLFetcherOpDescSpec extends AnyFlatSpec with 
Matchers {
     schema.getAttributes.head.getType shouldBe AttributeType.ANY
   }
 
-  it should "default to ANY when decodingMethod is left unset (current 
behavior)" in {
-    // Pin: `var decodingMethod: DecodingMethod = _` defaults to null.
-    // sourceSchema's branch is `if (decodingMethod == DecodingMethod.UTF_8)
-    // STRING else ANY`, so a null comparison falls through to ANY without
-    // raising. Documenting the current behavior so a future explicit-null
-    // check breaks this spec deliberately.
+  it should "fail loudly when decodingMethod is left unset rather than 
silently defaulting to ANY" in {
+    // `var decodingMethod: DecodingMethod = _` defaults to null. Without a
+    // guard, sourceSchema would fall through `if (decodingMethod ==
+    // DecodingMethod.UTF_8) ... else ANY` and silently produce an ANY column
+    // for a misconfigured operator. sourceSchema now requires a non-null
+    // decodingMethod and surfaces the misconfiguration as an
+    // IllegalArgumentException.
     val op = new URLFetcherOpDesc
     op.url = "https://example.test/data";
-    val schema = op.sourceSchema()
-    schema.getAttributes should have length 1
-    schema.getAttributes.head.getType shouldBe AttributeType.ANY
+    an[IllegalArgumentException] should be thrownBy op.sourceSchema()
   }
 
   "URLFetcherOpDesc.getPhysicalOp" should "wire the URLFetcherOpExec class 
name into the OpExecInitInfo" in {
diff --git 
a/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/source/fetcher/URLFetcherOpExecSpec.scala
 
b/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/source/fetcher/URLFetcherOpExecSpec.scala
index c61468c683..47770cbae8 100644
--- 
a/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/source/fetcher/URLFetcherOpExecSpec.scala
+++ 
b/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/source/fetcher/URLFetcherOpExecSpec.scala
@@ -19,14 +19,11 @@
 
 package org.apache.texera.amber.operator.source.fetcher
 
-import org.apache.texera.amber.core.tuple.Schema
 import org.apache.texera.amber.util.JSONUtils.objectMapper
 import org.scalatest.BeforeAndAfter
 import org.scalatest.flatspec.AnyFlatSpec
 class URLFetcherOpExecSpec extends AnyFlatSpec with BeforeAndAfter {
 
-  val resultSchema: Schema = new URLFetcherOpDesc().sourceSchema()
-
   val opDesc: URLFetcherOpDesc = new URLFetcherOpDesc()
 
   it should "fetch url and output one tuple with raw bytes" in {

Reply via email to