This is an automated email from the ASF dual-hosted git repository. github-merge-queue[bot] pushed a commit to branch gh-readonly-queue/main/pr-5133-e4557eeb8573759c33bf649cfc89d97778d62920 in repository https://gitbox.apache.org/repos/asf/texera.git
commit 62883b886837fc1d949e8c29b26b6e8153458d34
Author: Matthew B. <[email protected]>
AuthorDate: Thu May 21 00:06:48 2026 -0700

fix: use BINARY type for RAW_BYTES URL fetcher schema (#5133)

### What changes were proposed in this PR?

`URLFetcherOpDesc.sourceSchema()` advertised `AttributeType.ANY` for `RAW_BYTES` decoding, even though the executor already emits a concrete `byte[]`. This change returns `AttributeType.BINARY` instead, matching the runtime payload and unblocking Iceberg materialization (which rejects `ANY`). The existing `URLFetcherOpDescSpec` test that pinned the old behavior is flipped to assert `BINARY`.

Test JSON: [url-fetcher-raw-bytes-test.json](https://github.com/user-attachments/files/28089609/url-fetcher-raw-bytes-test.json)

### Any related issues, documentation, or discussions?

Closes: #5074

### How was this PR tested?

Updated `URLFetcherOpDescSpec` covers the schema; `URLFetcherOpExecSpec` already pins the runtime field as `Array[Byte]`, so static schema and runtime type now agree. CI Scala test job is expected to pass.

### Was this PR authored or co-authored using generative AI tooling?
Co-authored with Claude Opus 4.7 in compliance with ASF --- .../texera/amber/operator/source/fetcher/URLFetcherOpDesc.scala | 2 +- .../texera/amber/operator/source/fetcher/URLFetcherOpDescSpec.scala | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/source/fetcher/URLFetcherOpDesc.scala b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/source/fetcher/URLFetcherOpDesc.scala index 7abe83945d..fc12c11e36 100644 --- a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/source/fetcher/URLFetcherOpDesc.scala +++ b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/source/fetcher/URLFetcherOpDesc.scala @@ -53,7 +53,7 @@ class URLFetcherOpDesc extends SourceOperatorDescriptor { Schema() .add( "URL content", - if (decodingMethod == DecodingMethod.UTF_8) AttributeType.STRING else AttributeType.ANY + if (decodingMethod eq DecodingMethod.UTF_8) AttributeType.STRING else AttributeType.BINARY ) } diff --git a/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/source/fetcher/URLFetcherOpDescSpec.scala b/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/source/fetcher/URLFetcherOpDescSpec.scala index 34b7ec8f6f..16b9821cb1 100644 --- a/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/source/fetcher/URLFetcherOpDescSpec.scala +++ b/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/source/fetcher/URLFetcherOpDescSpec.scala @@ -59,12 +59,12 @@ class URLFetcherOpDescSpec extends AnyFlatSpec with Matchers { schema.getAttributes.head.getType shouldBe AttributeType.STRING } - it should "produce an ANY column for raw-bytes decoding" in { + it should "produce a BINARY column for raw-bytes decoding" in { val op = configured(DecodingMethod.RAW_BYTES) val schema = op.sourceSchema() schema.getAttributes should have length 1 
schema.getAttributes.head.getName shouldBe "URL content" - schema.getAttributes.head.getType shouldBe AttributeType.ANY + schema.getAttributes.head.getType shouldBe AttributeType.BINARY } it should "fail loudly when decodingMethod is left unset rather than silently defaulting to ANY" in {
