(texera) branch main updated: test(amber): add unit tests for ArrowUtils (#4758)

yiconghuang Sun, 03 May 2026 00:25:31 -0700

This is an automated email from the ASF dual-hosted git repository.

Yicong-Huang pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/texera.git



The following commit(s) were added to refs/heads/main by this push:
     new 9bcbbc062f test(amber): add unit tests for ArrowUtils (#4758)
9bcbbc062f is described below

commit 9bcbbc062f12bfebac8194f7165f3064602b53da
Author: Yicong Huang <[email protected]>
AuthorDate: Sun May 3 00:25:19 2026 -0700

    test(amber): add unit tests for ArrowUtils (#4758)
    
    ### What changes were proposed in this PR?
    
    Adds ScalaTest coverage for
    `common/workflow-core/src/main/scala/org/apache/texera/amber/util/ArrowUtils.scala`.
    The Arrow ↔ Texera type / schema converters had no dedicated spec.
    
    The `getTexeraTuple` / `setTexeraTuple` / `appendTexeraTuple` row-level
    helpers are intentionally out of scope for this PR — they need a real
    `VectorSchemaRoot` and substantial Arrow scaffolding, and warrant a
    dedicated PR.
    
    ### Any related issues, documentation, discussions?
    
    Closes #4751.
    
    Two contract divergences are pinned in the spec with explanatory
    comments and filed separately as Bug issues: (1) `toAttributeType`'s
    catch-all `case 64 | _ => LONG` collapses every non-{16,32}-bit width
    onto LONG (including widths like 8 that are clearly not LONG), and (2)
    `fromAttributeType` collapses `STRING`, `LARGE_BINARY`, and `ANY` onto
    the same `Utf8` Arrow type — `LARGE_BINARY` is recovered via field
    metadata, but `ANY` loses its distinction entirely on round-trip and
    surfaces as `STRING`.
    
    ### How was this PR tested?
    
    ```
    sbt scalafmtCheckAll
    sbt "WorkflowCore/testOnly org.apache.texera.amber.util.ArrowUtilsSpec"
    ```
    
    ### Was this PR authored or co-authored using generative AI tooling?
    
    Generated-by: Claude Code (claude-opus-4-7)
    
    Co-authored-by: Claude Opus 4.7 (1M context) <[email protected]>
---
 .../apache/texera/amber/util/ArrowUtilsSpec.scala  | 271 +++++++++++++++++++++
 1 file changed, 271 insertions(+)

diff --git a/common/workflow-core/src/test/scala/org/apache/texera/amber/util/ArrowUtilsSpec.scala b/common/workflow-core/src/test/scala/org/apache/texera/amber/util/ArrowUtilsSpec.scala
new file mode 100644
index 0000000000..69323acbba
--- /dev/null
+++ b/common/workflow-core/src/test/scala/org/apache/texera/amber/util/ArrowUtilsSpec.scala
@@ -0,0 +1,271 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.texera.amber.util
+
+import org.apache.arrow.vector.types.{FloatingPointPrecision, TimeUnit}
+import org.apache.arrow.vector.types.pojo.{ArrowType, Field, FieldType}
+import org.apache.texera.amber.core.tuple.{Attribute, AttributeType, Schema}
+import 
org.apache.texera.amber.core.tuple.AttributeTypeUtils.AttributeTypeException
+import org.scalatest.flatspec.AnyFlatSpec
+import org.scalatest.matchers.should.Matchers
+
+import java.util
+import scala.jdk.CollectionConverters.CollectionHasAsScala
+
+/**
+ * Unit tests for the Arrow <-> Texera type and schema converters in `ArrowUtils`.
+ *
+ * Sections, in order: `toAttributeType`, `fromAttributeType`, the `bool2int` implicit,
+ * `toTexeraSchema`, `fromTexeraSchema`, and schema round-trips. Comments starting with
+ * "Pin:" mark assertions that deliberately record current behavior, so that a later
+ * change to that behavior fails here instead of going unnoticed.
+ */
+class ArrowUtilsSpec extends AnyFlatSpec with Matchers {
+
+  // ----- helpers -----
+
+  /**
+   * Builds an Arrow `Field` called `name` with Arrow type `t`.
+   *
+   * @param metadata custom field metadata passed to `FieldType` as-is; `null` when the
+   *                 caller supplies none
+   */
+  private def arrowField(
+      name: String,
+      t: ArrowType,
+      metadata: util.Map[String, String] = null
+  ): Field =
+    new Field(name, new FieldType(true, t, null, metadata), null)
+
+  /** Wraps the given fields, in order, in an Arrow schema (empty when no field is given). */
+  private def arrowSchema(fields: Field*) =
+    new org.apache.arrow.vector.types.pojo.Schema(util.Arrays.asList(fields: _*))
+
+  /** Projects a Texera schema onto its ordered (name, type) pairs for comparison. */
+  private def namedTypes(schema: Schema) =
+    schema.getAttributes.toList.map(a => (a.getName, a.getType))
+
+  /** Converts a Texera schema to Arrow and straight back again. */
+  private def roundTrip(schema: Schema) =
+    ArrowUtils.toTexeraSchema(ArrowUtils.fromTexeraSchema(schema))
+
+  // ----- toAttributeType -----
+
+  "toAttributeType" should "map Int(16) to INTEGER" in {
+    ArrowUtils.toAttributeType(new ArrowType.Int(16, true)) shouldBe AttributeType.INTEGER
+  }
+
+  it should "map Int(32) to INTEGER" in {
+    ArrowUtils.toAttributeType(new ArrowType.Int(32, true)) shouldBe AttributeType.INTEGER
+  }
+
+  it should "map Int(64) to LONG" in {
+    ArrowUtils.toAttributeType(new ArrowType.Int(64, true)) shouldBe AttributeType.LONG
+  }
+
+  it should "map non-standard Int bit-widths to LONG (current behavior)" in {
+    // Pin: the source code's match is `case 16 | 32 => INTEGER` then
+    // `case 64 | _ => LONG`. The trailing `_` makes the second arm a
+    // catch-all, so Int(8), Int(128) and any other width all surface as
+    // LONG. A future fix that distinguishes those widths will deliberately
+    // break this spec.
+    ArrowUtils.toAttributeType(new ArrowType.Int(8, true)) shouldBe AttributeType.LONG
+    ArrowUtils.toAttributeType(new ArrowType.Int(128, true)) shouldBe AttributeType.LONG
+  }
+
+  it should "map Bool to BOOLEAN" in {
+    ArrowUtils.toAttributeType(ArrowType.Bool.INSTANCE) shouldBe AttributeType.BOOLEAN
+  }
+
+  it should "map FloatingPoint to DOUBLE" in {
+    ArrowUtils.toAttributeType(
+      new ArrowType.FloatingPoint(FloatingPointPrecision.DOUBLE)
+    ) shouldBe AttributeType.DOUBLE
+  }
+
+  it should "map Timestamp to TIMESTAMP" in {
+    ArrowUtils.toAttributeType(
+      new ArrowType.Timestamp(TimeUnit.MILLISECOND, "UTC")
+    ) shouldBe AttributeType.TIMESTAMP
+  }
+
+  it should "map Utf8 to STRING" in {
+    ArrowUtils.toAttributeType(ArrowType.Utf8.INSTANCE) shouldBe AttributeType.STRING
+  }
+
+  it should "map Binary to BINARY" in {
+    ArrowUtils.toAttributeType(new ArrowType.Binary) shouldBe AttributeType.BINARY
+  }
+
+  it should "throw AttributeTypeException for unsupported Arrow types" in {
+    // ArrowType.Null is a real Arrow type that this method doesn't handle.
+    assertThrows[AttributeTypeException] {
+      ArrowUtils.toAttributeType(ArrowType.Null.INSTANCE)
+    }
+  }
+
+  // ----- fromAttributeType -----
+
+  "fromAttributeType" should "map INTEGER to Int(32, signed)" in {
+    ArrowUtils.fromAttributeType(AttributeType.INTEGER) shouldBe new ArrowType.Int(32, true)
+  }
+
+  it should "map LONG to Int(64, signed)" in {
+    ArrowUtils.fromAttributeType(AttributeType.LONG) shouldBe new ArrowType.Int(64, true)
+  }
+
+  it should "map DOUBLE to FloatingPoint(DOUBLE)" in {
+    val arrow = ArrowUtils.fromAttributeType(AttributeType.DOUBLE)
+    arrow shouldBe new ArrowType.FloatingPoint(FloatingPointPrecision.DOUBLE)
+  }
+
+  it should "map BOOLEAN to Bool.INSTANCE" in {
+    ArrowUtils.fromAttributeType(AttributeType.BOOLEAN) shouldBe ArrowType.Bool.INSTANCE
+  }
+
+  it should "map TIMESTAMP to Timestamp(MILLISECOND, UTC)" in {
+    val arrow = ArrowUtils.fromAttributeType(AttributeType.TIMESTAMP)
+    arrow shouldBe new ArrowType.Timestamp(TimeUnit.MILLISECOND, "UTC")
+  }
+
+  it should "map BINARY to ArrowType.Binary" in {
+    ArrowUtils.fromAttributeType(AttributeType.BINARY) shouldBe new ArrowType.Binary
+  }
+
+  it should "map STRING, LARGE_BINARY, and ANY all to Utf8.INSTANCE" in {
+    // Pin: STRING / LARGE_BINARY / ANY collapse onto the same Arrow type
+    // (Utf8). LARGE_BINARY is recovered via field metadata, ANY loses its
+    // distinction entirely. Documenting the collision so a future change
+    // that splits them apart will surface here.
+    ArrowUtils.fromAttributeType(AttributeType.STRING) shouldBe ArrowType.Utf8.INSTANCE
+    ArrowUtils.fromAttributeType(AttributeType.LARGE_BINARY) shouldBe ArrowType.Utf8.INSTANCE
+    ArrowUtils.fromAttributeType(AttributeType.ANY) shouldBe ArrowType.Utf8.INSTANCE
+  }
+
+  // ----- bool2int implicit -----
+
+  "bool2int implicit" should "map true to 1 and false to 0" in {
+    import ArrowUtils.bool2int
+    // The Int type ascriptions are what trigger the implicit conversion.
+    val one: Int = true
+    val zero: Int = false
+    one shouldBe 1
+    zero shouldBe 0
+  }
+
+  // ----- toTexeraSchema -----
+
+  "toTexeraSchema" should "produce an empty Texera Schema for an empty Arrow schema" in {
+    ArrowUtils.toTexeraSchema(arrowSchema()).getAttributes shouldBe empty
+  }
+
+  it should "translate each Arrow field to a Texera Attribute by primitive type" in {
+    val arrow = arrowSchema(
+      arrowField("a", new ArrowType.Int(32, true)),
+      arrowField("b", new ArrowType.Int(64, true)),
+      arrowField("c", ArrowType.Bool.INSTANCE),
+      arrowField("d", ArrowType.Utf8.INSTANCE)
+    )
+    val attrs = ArrowUtils.toTexeraSchema(arrow).getAttributes.toList
+    attrs.map(_.getName) shouldBe List("a", "b", "c", "d")
+    attrs.map(_.getType) shouldBe List(
+      AttributeType.INTEGER,
+      AttributeType.LONG,
+      AttributeType.BOOLEAN,
+      AttributeType.STRING
+    )
+  }
+
+  it should "promote Utf8 fields to LARGE_BINARY when texera_type metadata says so" in {
+    val md = new util.HashMap[String, String]()
+    md.put("texera_type", "LARGE_BINARY")
+    val arrow = arrowSchema(
+      arrowField("blob", ArrowType.Utf8.INSTANCE, md),
+      arrowField("plain", ArrowType.Utf8.INSTANCE) // no metadata
+    )
+    val attrs = ArrowUtils.toTexeraSchema(arrow).getAttributes.toList
+    attrs.map(_.getName) shouldBe List("blob", "plain")
+    attrs.map(_.getType) shouldBe List(AttributeType.LARGE_BINARY, AttributeType.STRING)
+  }
+
+  // ----- fromTexeraSchema -----
+
+  "fromTexeraSchema" should "translate each Texera Attribute to an Arrow field with primitive types" in {
+    val schema = Schema(
+      List(
+        new Attribute("i", AttributeType.INTEGER),
+        new Attribute("l", AttributeType.LONG),
+        new Attribute("d", AttributeType.DOUBLE),
+        new Attribute("b", AttributeType.BOOLEAN)
+      )
+    )
+    val fields = ArrowUtils.fromTexeraSchema(schema).getFields.asScala.toList
+    fields.map(_.getName) shouldBe List("i", "l", "d", "b")
+    fields.map(_.getFieldType.getType) shouldBe List(
+      new ArrowType.Int(32, true),
+      new ArrowType.Int(64, true),
+      new ArrowType.FloatingPoint(FloatingPointPrecision.DOUBLE),
+      ArrowType.Bool.INSTANCE
+    )
+  }
+
+  it should "attach texera_type=LARGE_BINARY metadata to LARGE_BINARY fields and only those" in {
+    val schema = Schema(
+      List(
+        new Attribute("blob", AttributeType.LARGE_BINARY),
+        new Attribute("name", AttributeType.STRING)
+      )
+    )
+    val fields = ArrowUtils.fromTexeraSchema(schema).getFields.asScala.toList
+    // Fail with a readable message, not a bare NoSuchElementException, if a field is missing.
+    def fieldNamed(wanted: String): Field =
+      fields
+        .find(_.getName == wanted)
+        .getOrElse(fail(s"Arrow schema has no field named '$wanted'"))
+
+    fieldNamed("blob").getMetadata.get("texera_type") shouldBe "LARGE_BINARY"
+    // STRING fields do not get the texera_type metadata.
+    Option(fieldNamed("name").getMetadata).exists(_.containsKey("texera_type")) shouldBe false
+  }
+
+  // ----- round-trip -----
+
+  "schema round-trip" should "preserve primitive AttributeTypes through fromTexeraSchema and back" in {
+    val original = Schema(
+      List(
+        new Attribute("i", AttributeType.INTEGER),
+        new Attribute("l", AttributeType.LONG),
+        new Attribute("d", AttributeType.DOUBLE),
+        new Attribute("b", AttributeType.BOOLEAN),
+        new Attribute("t", AttributeType.TIMESTAMP),
+        new Attribute("s", AttributeType.STRING),
+        new Attribute("y", AttributeType.BINARY)
+      )
+    )
+    namedTypes(roundTrip(original)) shouldBe namedTypes(original)
+  }
+
+  it should "preserve LARGE_BINARY through the metadata-based path" in {
+    val original = Schema(
+      List(
+        new Attribute("blob", AttributeType.LARGE_BINARY),
+        new Attribute("name", AttributeType.STRING)
+      )
+    )
+    namedTypes(roundTrip(original)) shouldBe List(
+      ("blob", AttributeType.LARGE_BINARY),
+      ("name", AttributeType.STRING)
+    )
+  }
+
+  it should "lose the ANY distinction (round-trips as STRING)" in {
+    // Pin: fromAttributeType(ANY) produces Utf8 with no metadata. toAttributeType
+    // then can only see Utf8, so the recovered type is STRING. Documenting this
+    // information loss so a future fix that round-trips ANY can break the spec.
+    val original = Schema(List(new Attribute("v", AttributeType.ANY)))
+    namedTypes(roundTrip(original)) shouldBe List(
+      ("v", AttributeType.STRING)
+    )
+  }
+}

(texera) branch main updated: test(amber): add unit tests for ArrowUtils (#4758)

Reply via email to