This is an automated email from the ASF dual-hosted git repository.
Yicong-Huang pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/texera.git
The following commit(s) were added to refs/heads/main by this push:
new 9bcbbc062f test(amber): add unit tests for ArrowUtils (#4758)
9bcbbc062f is described below
commit 9bcbbc062f12bfebac8194f7165f3064602b53da
Author: Yicong Huang <[email protected]>
AuthorDate: Sun May 3 00:25:19 2026 -0700
test(amber): add unit tests for ArrowUtils (#4758)
### What changes were proposed in this PR?
Adds scalatest coverage for
`common/workflow-core/src/main/scala/org/apache/texera/amber/util/ArrowUtils.scala`.
The Arrow ↔ Texera type / schema converters had no dedicated spec.
The `getTexeraTuple` / `setTexeraTuple` / `appendTexeraTuple` row-level
helpers are intentionally out of scope for this PR — they need a real
`VectorSchemaRoot` and substantial Arrow scaffolding, and warrant a
dedicated PR.
### Any related issues, documentation, discussions?
Closes #4751.
Two contract divergences are pinned in the spec with explanatory
comments and filed separately as Bug issues: (1) `toAttributeType`'s
catch-all `case 64 | _ => LONG` collapses every non-{16,32}-bit width
onto LONG (including widths like 8 that are clearly not LONG), and (2)
`fromAttributeType` collapses `STRING`, `LARGE_BINARY`, and `ANY` onto
the same `Utf8` Arrow type — `LARGE_BINARY` is recovered via field
metadata, but `ANY` loses its distinction entirely on round-trip and
surfaces as `STRING`.
### How was this PR tested?
```
sbt scalafmtCheckAll
sbt "WorkflowCore/testOnly org.apache.texera.amber.util.ArrowUtilsSpec"
```
### Was this PR authored or co-authored using generative AI tooling?
Generated-by: Claude Code (claude-opus-4-7)
Co-authored-by: Claude Opus 4.7 (1M context) <[email protected]>
---
.../apache/texera/amber/util/ArrowUtilsSpec.scala | 271 +++++++++++++++++++++
1 file changed, 271 insertions(+)
diff --git
a/common/workflow-core/src/test/scala/org/apache/texera/amber/util/ArrowUtilsSpec.scala
b/common/workflow-core/src/test/scala/org/apache/texera/amber/util/ArrowUtilsSpec.scala
new file mode 100644
index 0000000000..69323acbba
--- /dev/null
+++
b/common/workflow-core/src/test/scala/org/apache/texera/amber/util/ArrowUtilsSpec.scala
@@ -0,0 +1,271 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.texera.amber.util
+
+import org.apache.arrow.vector.types.{FloatingPointPrecision, TimeUnit}
+import org.apache.arrow.vector.types.pojo.{ArrowType, Field, FieldType}
+import org.apache.texera.amber.core.tuple.{Attribute, AttributeType, Schema}
+import
org.apache.texera.amber.core.tuple.AttributeTypeUtils.AttributeTypeException
+import org.scalatest.flatspec.AnyFlatSpec
+import org.scalatest.matchers.should.Matchers
+
+import java.util
+import scala.jdk.CollectionConverters.CollectionHasAsScala
+
+class ArrowUtilsSpec extends AnyFlatSpec with Matchers {
+
+ // ----- toAttributeType -----
+
+ "toAttributeType" should "map Int(16) to INTEGER" in {
+ ArrowUtils.toAttributeType(new ArrowType.Int(16, true)) shouldBe
AttributeType.INTEGER
+ }
+
+ it should "map Int(32) to INTEGER" in {
+ ArrowUtils.toAttributeType(new ArrowType.Int(32, true)) shouldBe
AttributeType.INTEGER
+ }
+
+ it should "map Int(64) to LONG" in {
+ ArrowUtils.toAttributeType(new ArrowType.Int(64, true)) shouldBe
AttributeType.LONG
+ }
+
+ it should "map non-standard Int bit-widths to LONG (current behavior)" in {
+ // Pin: the source code's match is `case 16 | 32 => INTEGER` then
+ // `case 64 | _ => LONG`. The trailing `_` makes the second arm a
+ // catch-all, so Int(8), Int(128) and any other width all surface as
+ // LONG. A future fix that distinguishes those widths will deliberately
+ // break this spec.
+ ArrowUtils.toAttributeType(new ArrowType.Int(8, true)) shouldBe
AttributeType.LONG
+ ArrowUtils.toAttributeType(new ArrowType.Int(128, true)) shouldBe
AttributeType.LONG
+ }
+
+ it should "map Bool to BOOLEAN" in {
+ ArrowUtils.toAttributeType(ArrowType.Bool.INSTANCE) shouldBe
AttributeType.BOOLEAN
+ }
+
+ it should "map FloatingPoint to DOUBLE" in {
+ ArrowUtils.toAttributeType(
+ new ArrowType.FloatingPoint(FloatingPointPrecision.DOUBLE)
+ ) shouldBe AttributeType.DOUBLE
+ }
+
+ it should "map Timestamp to TIMESTAMP" in {
+ ArrowUtils.toAttributeType(
+ new ArrowType.Timestamp(TimeUnit.MILLISECOND, "UTC")
+ ) shouldBe AttributeType.TIMESTAMP
+ }
+
+ it should "map Utf8 to STRING" in {
+ ArrowUtils.toAttributeType(ArrowType.Utf8.INSTANCE) shouldBe
AttributeType.STRING
+ }
+
+ it should "map Binary to BINARY" in {
+ ArrowUtils.toAttributeType(new ArrowType.Binary) shouldBe
AttributeType.BINARY
+ }
+
+ it should "throw AttributeTypeException for unsupported Arrow types" in {
+ // ArrowType.Null is a real Arrow type that this method doesn't handle.
+ assertThrows[AttributeTypeException] {
+ ArrowUtils.toAttributeType(ArrowType.Null.INSTANCE)
+ }
+ }
+
+ // ----- fromAttributeType -----
+
+ "fromAttributeType" should "map INTEGER to Int(32, signed)" in {
+ val arrow = ArrowUtils.fromAttributeType(AttributeType.INTEGER)
+ arrow shouldBe new ArrowType.Int(32, true)
+ }
+
+ it should "map LONG to Int(64, signed)" in {
+ val arrow = ArrowUtils.fromAttributeType(AttributeType.LONG)
+ arrow shouldBe new ArrowType.Int(64, true)
+ }
+
+ it should "map DOUBLE to FloatingPoint(DOUBLE)" in {
+ val arrow = ArrowUtils.fromAttributeType(AttributeType.DOUBLE)
+ arrow shouldBe new ArrowType.FloatingPoint(FloatingPointPrecision.DOUBLE)
+ }
+
+ it should "map BOOLEAN to Bool.INSTANCE" in {
+ ArrowUtils.fromAttributeType(AttributeType.BOOLEAN) shouldBe
ArrowType.Bool.INSTANCE
+ }
+
+ it should "map TIMESTAMP to Timestamp(MILLISECOND, UTC)" in {
+ val arrow = ArrowUtils.fromAttributeType(AttributeType.TIMESTAMP)
+ arrow shouldBe new ArrowType.Timestamp(TimeUnit.MILLISECOND, "UTC")
+ }
+
+ it should "map BINARY to ArrowType.Binary" in {
+ ArrowUtils.fromAttributeType(AttributeType.BINARY) shouldBe new
ArrowType.Binary
+ }
+
+ it should "map STRING, LARGE_BINARY, and ANY all to Utf8.INSTANCE" in {
+ // Pin: STRING / LARGE_BINARY / ANY collapse onto the same Arrow type
+ // (Utf8). LARGE_BINARY is recovered via field metadata, ANY loses its
+ // distinction entirely. Documenting the collision so a future change
+ // that splits them apart will surface here.
+ ArrowUtils.fromAttributeType(AttributeType.STRING) shouldBe
ArrowType.Utf8.INSTANCE
+ ArrowUtils.fromAttributeType(
+ AttributeType.LARGE_BINARY
+ ) shouldBe ArrowType.Utf8.INSTANCE
+ ArrowUtils.fromAttributeType(AttributeType.ANY) shouldBe
ArrowType.Utf8.INSTANCE
+ }
+
+ // ----- bool2int implicit -----
+
+ "bool2int implicit" should "map true to 1 and false to 0" in {
+ import ArrowUtils.bool2int
+ val one: Int = true
+ val zero: Int = false
+ one shouldBe 1
+ zero shouldBe 0
+ }
+
+ // ----- toTexeraSchema -----
+
+ private def arrowField(
+ name: String,
+ t: ArrowType,
+ metadata: util.Map[String, String] = null
+ ): Field =
+ new Field(name, new FieldType(true, t, null, metadata), null)
+
+ "toTexeraSchema" should "produce an empty Texera Schema for an empty Arrow
schema" in {
+ val arrow = new
org.apache.arrow.vector.types.pojo.Schema(util.Arrays.asList[Field]())
+ ArrowUtils.toTexeraSchema(arrow).getAttributes shouldBe empty
+ }
+
+ it should "translate each Arrow field to a Texera Attribute by primitive
type" in {
+ val arrow = new org.apache.arrow.vector.types.pojo.Schema(
+ util.Arrays.asList(
+ arrowField("a", new ArrowType.Int(32, true)),
+ arrowField("b", new ArrowType.Int(64, true)),
+ arrowField("c", ArrowType.Bool.INSTANCE),
+ arrowField("d", ArrowType.Utf8.INSTANCE)
+ )
+ )
+ val schema = ArrowUtils.toTexeraSchema(arrow)
+ val attrs = schema.getAttributes.toList
+ attrs.map(_.getName) shouldBe List("a", "b", "c", "d")
+ attrs.map(_.getType) shouldBe List(
+ AttributeType.INTEGER,
+ AttributeType.LONG,
+ AttributeType.BOOLEAN,
+ AttributeType.STRING
+ )
+ }
+
+ it should "promote Utf8 fields to LARGE_BINARY when texera_type metadata
says so" in {
+ val md = new util.HashMap[String, String]()
+ md.put("texera_type", "LARGE_BINARY")
+ val arrow = new org.apache.arrow.vector.types.pojo.Schema(
+ util.Arrays.asList(
+ arrowField("blob", ArrowType.Utf8.INSTANCE, md),
+ arrowField("plain", ArrowType.Utf8.INSTANCE) // no metadata
+ )
+ )
+ val schema = ArrowUtils.toTexeraSchema(arrow)
+ val attrs = schema.getAttributes.toList
+ attrs.map(_.getName) shouldBe List("blob", "plain")
+ attrs.map(_.getType) shouldBe List(AttributeType.LARGE_BINARY,
AttributeType.STRING)
+ }
+
+ // ----- fromTexeraSchema -----
+
+ "fromTexeraSchema" should "translate each Texera Attribute to an Arrow field
with primitive types" in {
+ val schema = Schema(
+ List(
+ new Attribute("i", AttributeType.INTEGER),
+ new Attribute("l", AttributeType.LONG),
+ new Attribute("d", AttributeType.DOUBLE),
+ new Attribute("b", AttributeType.BOOLEAN)
+ )
+ )
+ val arrow = ArrowUtils.fromTexeraSchema(schema)
+ val fields = arrow.getFields.asScala.toList
+ fields.map(_.getName) shouldBe List("i", "l", "d", "b")
+ fields.map(_.getFieldType.getType) shouldBe List(
+ new ArrowType.Int(32, true),
+ new ArrowType.Int(64, true),
+ new ArrowType.FloatingPoint(FloatingPointPrecision.DOUBLE),
+ ArrowType.Bool.INSTANCE
+ )
+ }
+
+ it should "attach texera_type=LARGE_BINARY metadata to LARGE_BINARY fields
and only those" in {
+ val schema = Schema(
+ List(
+ new Attribute("blob", AttributeType.LARGE_BINARY),
+ new Attribute("name", AttributeType.STRING)
+ )
+ )
+ val arrow = ArrowUtils.fromTexeraSchema(schema)
+ val fields = arrow.getFields.asScala.toList
+ val blob = fields.find(_.getName == "blob").get
+ val name = fields.find(_.getName == "name").get
+ blob.getMetadata.get("texera_type") shouldBe "LARGE_BINARY"
+ // STRING fields do not get the texera_type metadata.
+
Option(name.getMetadata).map(_.containsKey("texera_type")).getOrElse(false)
shouldBe false
+ }
+
+ // ----- round-trip -----
+
+ "schema round-trip" should "preserve primitive AttributeTypes through
fromTexeraSchema and back" in {
+ val original = Schema(
+ List(
+ new Attribute("i", AttributeType.INTEGER),
+ new Attribute("l", AttributeType.LONG),
+ new Attribute("d", AttributeType.DOUBLE),
+ new Attribute("b", AttributeType.BOOLEAN),
+ new Attribute("t", AttributeType.TIMESTAMP),
+ new Attribute("s", AttributeType.STRING),
+ new Attribute("y", AttributeType.BINARY)
+ )
+ )
+ val recovered =
ArrowUtils.toTexeraSchema(ArrowUtils.fromTexeraSchema(original))
+ recovered.getAttributes.toList.map(a => (a.getName, a.getType)) shouldBe
+ original.getAttributes.toList.map(a => (a.getName, a.getType))
+ }
+
+ it should "preserve LARGE_BINARY through the metadata-based path" in {
+ val original = Schema(
+ List(
+ new Attribute("blob", AttributeType.LARGE_BINARY),
+ new Attribute("name", AttributeType.STRING)
+ )
+ )
+ val recovered =
ArrowUtils.toTexeraSchema(ArrowUtils.fromTexeraSchema(original))
+ recovered.getAttributes.toList.map(a => (a.getName, a.getType)) shouldBe
List(
+ ("blob", AttributeType.LARGE_BINARY),
+ ("name", AttributeType.STRING)
+ )
+ }
+
+ it should "lose the ANY distinction (round-trips as STRING)" in {
+ // Pin: ANY fromAttributeType produces Utf8 with no metadata.
toAttributeType
+ // then can only see Utf8, so the recovered type is STRING. Documenting
this
+ // information loss so a future fix that round-trips ANY can break the
spec.
+ val original = Schema(List(new Attribute("v", AttributeType.ANY)))
+ val recovered =
ArrowUtils.toTexeraSchema(ArrowUtils.fromTexeraSchema(original))
+ recovered.getAttributes.toList.map(a => (a.getName, a.getType)) shouldBe
List(
+ ("v", AttributeType.STRING)
+ )
+ }
+}