This is an automated email from the ASF dual-hosted git repository.

lzljs3620320 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/paimon.git


The following commit(s) were added to refs/heads/master by this push:
     new 06fbb5e1f [orc] Add type id to orc files (#4523)
06fbb5e1f is described below

commit 06fbb5e1f1d9162c87dac6e707c38315beb4f6a8
Author: yuzelin <[email protected]>
AuthorDate: Tue Nov 19 20:20:57 2024 +0800

    [orc] Add type id to orc files (#4523)
---
 .../apache/paimon/format/orc/OrcFileFormat.java    |   8 +-
 .../apache/paimon/format/orc/OrcReaderFactory.java |   4 +-
 .../org/apache/paimon/format/orc/OrcTypeUtil.java  | 147 +++++++++++++++
 .../format/orc/reader/OrcSplitReaderUtil.java      |  99 ----------
 .../format/orc/writer/RowDataVectorizer.java       |   3 +-
 .../paimon/format/orc/writer/Vectorizer.java       |   4 +-
 .../paimon/format/orc/OrcSplitReaderUtilTest.java  |  68 -------
 .../apache/paimon/format/orc/OrcTypeUtilTest.java  | 206 +++++++++++++++++++++
 .../paimon/format/orc/OrcWriterFactoryTest.java    |   3 +-
 9 files changed, 364 insertions(+), 178 deletions(-)

diff --git a/paimon-format/src/main/java/org/apache/paimon/format/orc/OrcFileFormat.java b/paimon-format/src/main/java/org/apache/paimon/format/orc/OrcFileFormat.java
index c564b6940..c3521c6f1 100644
--- a/paimon-format/src/main/java/org/apache/paimon/format/orc/OrcFileFormat.java
+++ b/paimon-format/src/main/java/org/apache/paimon/format/orc/OrcFileFormat.java
@@ -28,7 +28,6 @@ import org.apache.paimon.format.SimpleStatsExtractor;
 import org.apache.paimon.format.orc.filter.OrcFilters;
 import org.apache.paimon.format.orc.filter.OrcPredicateFunctionVisitor;
 import org.apache.paimon.format.orc.filter.OrcSimpleStatsExtractor;
-import org.apache.paimon.format.orc.reader.OrcSplitReaderUtil;
 import org.apache.paimon.format.orc.writer.RowDataVectorizer;
 import org.apache.paimon.format.orc.writer.Vectorizer;
 import org.apache.paimon.options.MemorySize;
@@ -123,7 +122,7 @@ public class OrcFileFormat extends FileFormat {
     @Override
     public void validateDataFields(RowType rowType) {
         DataType refinedType = refineDataType(rowType);
-        OrcSplitReaderUtil.toOrcType(refinedType);
+        OrcTypeUtil.convertToOrcSchema((RowType) refinedType);
     }
 
     /**
@@ -141,9 +140,8 @@ public class OrcFileFormat extends FileFormat {
         DataType refinedType = refineDataType(type);
         DataType[] orcTypes = getFieldTypes(refinedType).toArray(new DataType[0]);
 
-        TypeDescription typeDescription = OrcSplitReaderUtil.toOrcType(refinedType);
-        Vectorizer<InternalRow> vectorizer =
-                new RowDataVectorizer(typeDescription.toString(), orcTypes);
+        TypeDescription typeDescription = OrcTypeUtil.convertToOrcSchema((RowType) refinedType);
+        Vectorizer<InternalRow> vectorizer = new RowDataVectorizer(typeDescription, orcTypes);
 
         return new OrcWriterFactory(vectorizer, orcProperties, writerConf, writeBatchSize);
     }
diff --git a/paimon-format/src/main/java/org/apache/paimon/format/orc/OrcReaderFactory.java b/paimon-format/src/main/java/org/apache/paimon/format/orc/OrcReaderFactory.java
index 05f3dd785..ee0f8a55c 100644
--- a/paimon-format/src/main/java/org/apache/paimon/format/orc/OrcReaderFactory.java
+++ b/paimon-format/src/main/java/org/apache/paimon/format/orc/OrcReaderFactory.java
@@ -55,8 +55,8 @@ import javax.annotation.Nullable;
 import java.io.IOException;
 import java.util.List;
 
+import static org.apache.paimon.format.orc.OrcTypeUtil.convertToOrcSchema;
 import static org.apache.paimon.format.orc.reader.AbstractOrcColumnVector.createPaimonVector;
-import static org.apache.paimon.format.orc.reader.OrcSplitReaderUtil.toOrcType;
 import static org.apache.paimon.utils.Preconditions.checkNotNull;
 
 /** An ORC reader that produces a stream of {@link ColumnarRow} records. */
@@ -81,7 +81,7 @@ public class OrcReaderFactory implements FormatReaderFactory {
             final int batchSize,
             final boolean deletionVectorsEnabled) {
         this.hadoopConfig = checkNotNull(hadoopConfig);
-        this.schema = toOrcType(readType);
+        this.schema = convertToOrcSchema(readType);
         this.tableType = readType;
         this.conjunctPredicates = checkNotNull(conjunctPredicates);
         this.batchSize = batchSize;
diff --git a/paimon-format/src/main/java/org/apache/paimon/format/orc/OrcTypeUtil.java b/paimon-format/src/main/java/org/apache/paimon/format/orc/OrcTypeUtil.java
new file mode 100644
index 000000000..f7d3d626d
--- /dev/null
+++ b/paimon-format/src/main/java/org/apache/paimon/format/orc/OrcTypeUtil.java
@@ -0,0 +1,147 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.format.orc;
+
+import org.apache.paimon.annotation.VisibleForTesting;
+import org.apache.paimon.table.SpecialFields;
+import org.apache.paimon.types.ArrayType;
+import org.apache.paimon.types.CharType;
+import org.apache.paimon.types.DataField;
+import org.apache.paimon.types.DataType;
+import org.apache.paimon.types.DataTypes;
+import org.apache.paimon.types.DecimalType;
+import org.apache.paimon.types.MapType;
+import org.apache.paimon.types.RowType;
+import org.apache.paimon.types.VarCharType;
+
+import org.apache.orc.TypeDescription;
+
+/** Util for orc types. */
+public class OrcTypeUtil {
+
+    public static final String PAIMON_ORC_FIELD_ID_KEY = "paimon.id";
+
+    public static TypeDescription convertToOrcSchema(RowType rowType) {
+        TypeDescription struct = TypeDescription.createStruct();
+        for (DataField dataField : rowType.getFields()) {
+            TypeDescription child = convertToOrcType(dataField.type(), dataField.id(), 0);
+            struct.addField(dataField.name(), child);
+        }
+        return struct;
+    }
+
+    @VisibleForTesting
+    static TypeDescription convertToOrcType(DataType type, int fieldId, int depth) {
+        type = type.copy(true);
+        switch (type.getTypeRoot()) {
+            case CHAR:
+                return TypeDescription.createChar()
+                        .withMaxLength(((CharType) type).getLength())
+                        .setAttribute(PAIMON_ORC_FIELD_ID_KEY, String.valueOf(fieldId));
+            case VARCHAR:
+                int len = ((VarCharType) type).getLength();
+                if (len == VarCharType.MAX_LENGTH) {
+                    return TypeDescription.createString()
+                            .setAttribute(PAIMON_ORC_FIELD_ID_KEY, String.valueOf(fieldId));
+                } else {
+                    return TypeDescription.createVarchar()
+                            .withMaxLength(len)
+                            .setAttribute(PAIMON_ORC_FIELD_ID_KEY, String.valueOf(fieldId));
+                }
+            case BOOLEAN:
+                return TypeDescription.createBoolean()
+                        .setAttribute(PAIMON_ORC_FIELD_ID_KEY, String.valueOf(fieldId));
+            case VARBINARY:
+                if (type.equals(DataTypes.BYTES())) {
+                    return TypeDescription.createBinary()
+                            .setAttribute(PAIMON_ORC_FIELD_ID_KEY, String.valueOf(fieldId));
+                } else {
+                    throw new UnsupportedOperationException(
+                            "Not support other binary type: " + type);
+                }
+            case DECIMAL:
+                DecimalType decimalType = (DecimalType) type;
+                return TypeDescription.createDecimal()
+                        .withScale(decimalType.getScale())
+                        .withPrecision(decimalType.getPrecision())
+                        .setAttribute(PAIMON_ORC_FIELD_ID_KEY, String.valueOf(fieldId));
+            case TINYINT:
+                return TypeDescription.createByte()
+                        .setAttribute(PAIMON_ORC_FIELD_ID_KEY, String.valueOf(fieldId));
+            case SMALLINT:
+                return TypeDescription.createShort()
+                        .setAttribute(PAIMON_ORC_FIELD_ID_KEY, String.valueOf(fieldId));
+            case INTEGER:
+            case TIME_WITHOUT_TIME_ZONE:
+                return TypeDescription.createInt()
+                        .setAttribute(PAIMON_ORC_FIELD_ID_KEY, String.valueOf(fieldId));
+            case BIGINT:
+                return TypeDescription.createLong()
+                        .setAttribute(PAIMON_ORC_FIELD_ID_KEY, String.valueOf(fieldId));
+            case FLOAT:
+                return TypeDescription.createFloat()
+                        .setAttribute(PAIMON_ORC_FIELD_ID_KEY, String.valueOf(fieldId));
+            case DOUBLE:
+                return TypeDescription.createDouble()
+                        .setAttribute(PAIMON_ORC_FIELD_ID_KEY, String.valueOf(fieldId));
+            case DATE:
+                return TypeDescription.createDate()
+                        .setAttribute(PAIMON_ORC_FIELD_ID_KEY, String.valueOf(fieldId));
+            case TIMESTAMP_WITHOUT_TIME_ZONE:
+                return TypeDescription.createTimestamp()
+                        .setAttribute(PAIMON_ORC_FIELD_ID_KEY, String.valueOf(fieldId));
+            case TIMESTAMP_WITH_LOCAL_TIME_ZONE:
+                return TypeDescription.createTimestampInstant()
+                        .setAttribute(PAIMON_ORC_FIELD_ID_KEY, String.valueOf(fieldId));
+            case ARRAY:
+                ArrayType arrayType = (ArrayType) type;
+
+                String elementFieldId =
+                        String.valueOf(SpecialFields.getArrayElementFieldId(fieldId, depth + 1));
+                TypeDescription elementOrcType =
+                        convertToOrcType(arrayType.getElementType(), fieldId, depth + 1)
+                                .setAttribute(PAIMON_ORC_FIELD_ID_KEY, elementFieldId);
+
+                return TypeDescription.createList(elementOrcType)
+                        .setAttribute(PAIMON_ORC_FIELD_ID_KEY, String.valueOf(fieldId));
+            case MAP:
+                MapType mapType = (MapType) type;
+
+                String mapKeyFieldId =
+                        String.valueOf(SpecialFields.getMapKeyFieldId(fieldId, depth + 1));
+                TypeDescription mapKeyOrcType =
+                        convertToOrcType(mapType.getKeyType(), fieldId, depth + 1)
+                                .setAttribute(PAIMON_ORC_FIELD_ID_KEY, mapKeyFieldId);
+
+                String mapValueFieldId =
+                        String.valueOf(SpecialFields.getMapValueFieldId(fieldId, depth + 1));
+                TypeDescription mapValueOrcType =
+                        convertToOrcType(mapType.getValueType(), fieldId, depth + 1)
+                                .setAttribute(PAIMON_ORC_FIELD_ID_KEY, mapValueFieldId);
+
+                return TypeDescription.createMap(mapKeyOrcType, mapValueOrcType)
+                        .setAttribute(PAIMON_ORC_FIELD_ID_KEY, String.valueOf(fieldId));
+            case ROW:
+                return convertToOrcSchema((RowType) type)
+                        .setAttribute(PAIMON_ORC_FIELD_ID_KEY, String.valueOf(fieldId));
+            default:
+                throw new UnsupportedOperationException("Unsupported type: " + type);
+        }
+    }
+}
diff --git a/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcSplitReaderUtil.java b/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcSplitReaderUtil.java
deleted file mode 100644
index 882f1c753..000000000
--- a/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcSplitReaderUtil.java
+++ /dev/null
@@ -1,99 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.paimon.format.orc.reader;
-
-import org.apache.paimon.types.ArrayType;
-import org.apache.paimon.types.CharType;
-import org.apache.paimon.types.DataType;
-import org.apache.paimon.types.DataTypes;
-import org.apache.paimon.types.DecimalType;
-import org.apache.paimon.types.MapType;
-import org.apache.paimon.types.RowType;
-import org.apache.paimon.types.VarCharType;
-
-import org.apache.orc.TypeDescription;
-
-/** Util for orc types. */
-public class OrcSplitReaderUtil {
-
-    public static TypeDescription toOrcType(DataType type) {
-        type = type.copy(true);
-        switch (type.getTypeRoot()) {
-            case CHAR:
-                return TypeDescription.createChar().withMaxLength(((CharType) type).getLength());
-            case VARCHAR:
-                int len = ((VarCharType) type).getLength();
-                if (len == VarCharType.MAX_LENGTH) {
-                    return TypeDescription.createString();
-                } else {
-                    return TypeDescription.createVarchar().withMaxLength(len);
-                }
-            case BOOLEAN:
-                return TypeDescription.createBoolean();
-            case VARBINARY:
-                if (type.equals(DataTypes.BYTES())) {
-                    return TypeDescription.createBinary();
-                } else {
-                    throw new UnsupportedOperationException(
-                            "Not support other binary type: " + type);
-                }
-            case DECIMAL:
-                DecimalType decimalType = (DecimalType) type;
-                return TypeDescription.createDecimal()
-                        .withScale(decimalType.getScale())
-                        .withPrecision(decimalType.getPrecision());
-            case TINYINT:
-                return TypeDescription.createByte();
-            case SMALLINT:
-                return TypeDescription.createShort();
-            case INTEGER:
-            case TIME_WITHOUT_TIME_ZONE:
-                return TypeDescription.createInt();
-            case BIGINT:
-                return TypeDescription.createLong();
-            case FLOAT:
-                return TypeDescription.createFloat();
-            case DOUBLE:
-                return TypeDescription.createDouble();
-            case DATE:
-                return TypeDescription.createDate();
-            case TIMESTAMP_WITHOUT_TIME_ZONE:
-                return TypeDescription.createTimestamp();
-            case TIMESTAMP_WITH_LOCAL_TIME_ZONE:
-                return TypeDescription.createTimestampInstant();
-            case ARRAY:
-                ArrayType arrayType = (ArrayType) type;
-                return TypeDescription.createList(toOrcType(arrayType.getElementType()));
-            case MAP:
-                MapType mapType = (MapType) type;
-                return TypeDescription.createMap(
-                        toOrcType(mapType.getKeyType()), toOrcType(mapType.getValueType()));
-            case ROW:
-                RowType rowType = (RowType) type;
-                TypeDescription struct = TypeDescription.createStruct();
-                for (int i = 0; i < rowType.getFieldCount(); i++) {
-                    struct.addField(
-                            rowType.getFieldNames().get(i), toOrcType(rowType.getTypeAt(i)));
-                }
-                return struct;
-            default:
-                throw new UnsupportedOperationException("Unsupported type: " + type);
-        }
-    }
-}
diff --git a/paimon-format/src/main/java/org/apache/paimon/format/orc/writer/RowDataVectorizer.java b/paimon-format/src/main/java/org/apache/paimon/format/orc/writer/RowDataVectorizer.java
index 21443cdf9..46c936a02 100644
--- a/paimon-format/src/main/java/org/apache/paimon/format/orc/writer/RowDataVectorizer.java
+++ b/paimon-format/src/main/java/org/apache/paimon/format/orc/writer/RowDataVectorizer.java
@@ -23,6 +23,7 @@ import org.apache.paimon.types.DataType;
 
 import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
 import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.orc.TypeDescription;
 
 import java.util.Arrays;
 import java.util.List;
@@ -35,7 +36,7 @@ public class RowDataVectorizer extends Vectorizer<InternalRow> {
 
     private final List<FieldWriter> fieldWriters;
 
-    public RowDataVectorizer(String schema, DataType[] fieldTypes) {
+    public RowDataVectorizer(TypeDescription schema, DataType[] fieldTypes) {
         super(schema);
         this.fieldWriters =
                 Arrays.stream(fieldTypes)
diff --git a/paimon-format/src/main/java/org/apache/paimon/format/orc/writer/Vectorizer.java b/paimon-format/src/main/java/org/apache/paimon/format/orc/writer/Vectorizer.java
index 0f0e6bba7..2add46531 100644
--- a/paimon-format/src/main/java/org/apache/paimon/format/orc/writer/Vectorizer.java
+++ b/paimon-format/src/main/java/org/apache/paimon/format/orc/writer/Vectorizer.java
@@ -39,9 +39,9 @@ public abstract class Vectorizer<T> implements Serializable {
 
     private final TypeDescription schema;
 
-    public Vectorizer(final String schema) {
+    public Vectorizer(final TypeDescription schema) {
         checkNotNull(schema);
-        this.schema = TypeDescription.fromString(schema);
+        this.schema = schema;
     }
 
     /**
diff --git a/paimon-format/src/test/java/org/apache/paimon/format/orc/OrcSplitReaderUtilTest.java b/paimon-format/src/test/java/org/apache/paimon/format/orc/OrcSplitReaderUtilTest.java
deleted file mode 100644
index c07838dfa..000000000
--- a/paimon-format/src/test/java/org/apache/paimon/format/orc/OrcSplitReaderUtilTest.java
+++ /dev/null
@@ -1,68 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.paimon.format.orc;
-
-import org.apache.paimon.format.orc.reader.OrcSplitReaderUtil;
-import org.apache.paimon.types.DataType;
-import org.apache.paimon.types.DataTypes;
-
-import org.junit.jupiter.api.Test;
-
-import static org.apache.paimon.format.orc.reader.OrcSplitReaderUtil.toOrcType;
-import static org.assertj.core.api.Assertions.assertThat;
-
-/** Test for {@link OrcSplitReaderUtil}. */
-class OrcSplitReaderUtilTest {
-
-    @Test
-    void testDataTypeToOrcType() {
-        test("boolean", DataTypes.BOOLEAN());
-        test("char(123)", DataTypes.CHAR(123));
-        test("varchar(123)", DataTypes.VARCHAR(123));
-        test("string", DataTypes.STRING());
-        test("binary", DataTypes.BYTES());
-        test("tinyint", DataTypes.TINYINT());
-        test("smallint", DataTypes.SMALLINT());
-        test("int", DataTypes.INT());
-        test("bigint", DataTypes.BIGINT());
-        test("float", DataTypes.FLOAT());
-        test("double", DataTypes.DOUBLE());
-        test("date", DataTypes.DATE());
-        test("timestamp", DataTypes.TIMESTAMP());
-        test("array<float>", DataTypes.ARRAY(DataTypes.FLOAT()));
-        test("map<float,bigint>", DataTypes.MAP(DataTypes.FLOAT(), DataTypes.BIGINT()));
-        test(
-                "struct<int0:int,str1:string,double2:double,row3:struct<int0:int,int1:int>>",
-                DataTypes.ROW(
-                        DataTypes.FIELD(0, "int0", DataTypes.INT()),
-                        DataTypes.FIELD(1, "str1", DataTypes.STRING()),
-                        DataTypes.FIELD(2, "double2", DataTypes.DOUBLE()),
-                        DataTypes.FIELD(
-                                3,
-                                "row3",
-                                DataTypes.ROW(
-                                        DataTypes.FIELD(4, "int0", DataTypes.INT()),
-                                        DataTypes.FIELD(5, "int1", DataTypes.INT())))));
-        test("decimal(4,2)", DataTypes.DECIMAL(4, 2));
-    }
-
-    private void test(String expected, DataType type) {
-        assertThat(toOrcType(type)).hasToString(expected);
-    }
-}
diff --git a/paimon-format/src/test/java/org/apache/paimon/format/orc/OrcTypeUtilTest.java b/paimon-format/src/test/java/org/apache/paimon/format/orc/OrcTypeUtilTest.java
new file mode 100644
index 000000000..5669ac33d
--- /dev/null
+++ b/paimon-format/src/test/java/org/apache/paimon/format/orc/OrcTypeUtilTest.java
@@ -0,0 +1,206 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.format.orc;
+
+import org.apache.paimon.format.FileFormatFactory;
+import org.apache.paimon.format.FormatWriter;
+import org.apache.paimon.fs.FileIO;
+import org.apache.paimon.fs.Path;
+import org.apache.paimon.fs.PositionOutputStream;
+import org.apache.paimon.fs.local.LocalFileIO;
+import org.apache.paimon.options.Options;
+import org.apache.paimon.types.DataType;
+import org.apache.paimon.types.DataTypes;
+import org.apache.paimon.types.RowType;
+
+import org.apache.paimon.shade.guava30.com.google.common.base.Objects;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.orc.Reader;
+import org.apache.orc.TypeDescription;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.io.TempDir;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.UUID;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import static org.apache.paimon.format.orc.OrcFileFormat.refineDataType;
+import static org.apache.paimon.format.orc.OrcTypeUtil.PAIMON_ORC_FIELD_ID_KEY;
+import static org.apache.paimon.format.orc.OrcTypeUtil.convertToOrcSchema;
+import static org.apache.paimon.format.orc.OrcTypeUtil.convertToOrcType;
+import static org.apache.paimon.utils.Preconditions.checkArgument;
+import static org.assertj.core.api.Assertions.assertThat;
+import static org.assertj.core.api.Assertions.assertThatNoException;
+
+/** Test for {@link OrcTypeUtil}. */
+class OrcTypeUtilTest {
+
+    @Test
+    void testDataTypeToOrcType() {
+        test("boolean", DataTypes.BOOLEAN());
+        test("char(123)", DataTypes.CHAR(123));
+        test("varchar(123)", DataTypes.VARCHAR(123));
+        test("string", DataTypes.STRING());
+        test("binary", DataTypes.BYTES());
+        test("tinyint", DataTypes.TINYINT());
+        test("smallint", DataTypes.SMALLINT());
+        test("int", DataTypes.INT());
+        test("bigint", DataTypes.BIGINT());
+        test("float", DataTypes.FLOAT());
+        test("double", DataTypes.DOUBLE());
+        test("date", DataTypes.DATE());
+        test("timestamp", DataTypes.TIMESTAMP());
+        test("array<float>", DataTypes.ARRAY(DataTypes.FLOAT()));
+        test("map<float,bigint>", DataTypes.MAP(DataTypes.FLOAT(), DataTypes.BIGINT()));
+        test(
+                "struct<int0:int,str1:string,double2:double,row3:struct<int0:int,int1:int>>",
+                DataTypes.ROW(
+                        DataTypes.FIELD(0, "int0", DataTypes.INT()),
+                        DataTypes.FIELD(1, "str1", DataTypes.STRING()),
+                        DataTypes.FIELD(2, "double2", DataTypes.DOUBLE()),
+                        DataTypes.FIELD(
+                                3,
+                                "row3",
+                                DataTypes.ROW(
+                                        DataTypes.FIELD(4, "int0", DataTypes.INT()),
+                                        DataTypes.FIELD(5, "int1", DataTypes.INT())))));
+        test("decimal(4,2)", DataTypes.DECIMAL(4, 2));
+    }
+
+    private void test(String expected, DataType type) {
+        assertThat(convertToOrcType(type, -1, -1)).hasToString(expected);
+    }
+
+    @Test
+    void testFieldIdAttribute(@TempDir java.nio.file.Path tempPath) throws IOException {
+        RowType rowType =
+                RowType.builder()
+                        .field("a", DataTypes.INT())
+                        .field(
+                                "b",
+                                RowType.builder(true, new AtomicInteger(10))
+                                        .field("f0", DataTypes.STRING())
+                                        .field("f1", DataTypes.INT())
+                                        .build())
+                        .field("c", DataTypes.ARRAY(DataTypes.INT()))
+                        .field("d", DataTypes.MAP(DataTypes.INT(), DataTypes.STRING()))
+                        .field(
+                                "e",
+                                DataTypes.ARRAY(
+                                        RowType.builder(true, new AtomicInteger(20))
+                                                .field("f0", DataTypes.STRING())
+                                                .field("f1", DataTypes.INT())
+                                                .build()))
+                        .field(
+                                "f",
+                                RowType.builder(true, new AtomicInteger(30))
+                                        .field("f0", DataTypes.ARRAY(DataTypes.INT()))
+                                        .build())
+                        .build();
+
+        // write schema to orc file then get
+        FileIO fileIO = LocalFileIO.create();
+        Path tempFile = new Path(new Path(tempPath.toUri()), UUID.randomUUID().toString());
+
+        OrcFileFormat format =
+                new OrcFileFormat(new FileFormatFactory.FormatContext(new Options(), 1024, 1024));
+        PositionOutputStream out = fileIO.newOutputStream(tempFile, false);
+        FormatWriter writer = format.createWriterFactory(rowType).create(out, "zstd");
+        writer.close();
+        out.close();
+
+        Reader orcReader =
+                OrcReaderFactory.createReader(new Configuration(), fileIO, tempFile, null);
+        TypeDescription orcSchema = orcReader.getSchema();
+
+        RowType refined = (RowType) refineDataType(rowType);
+
+        assertThatNoException()
+                .isThrownBy(() -> checkStruct(convertToOrcSchema(refined), orcSchema));
+
+        assertThatNoException()
+                .isThrownBy(
+                        () ->
+                                checkStruct(
+                                        convertToOrcSchema(refined.project("c", "b", "d")),
+                                        orcSchema));
+
+        assertThatNoException()
+                .isThrownBy(
+                        () ->
+                                checkStruct(
+                                        convertToOrcSchema(refined.project("a", "e", "f")),
+                                        orcSchema));
+    }
+
+    private void checkStruct(TypeDescription requiredStruct, TypeDescription orcStruct) {
+        List<String> requiredFields = requiredStruct.getFieldNames();
+        List<TypeDescription> requiredTypes = requiredStruct.getChildren();
+        List<String> orcFields = orcStruct.getFieldNames();
+        List<TypeDescription> orcTypes = orcStruct.getChildren();
+
+        for (int i = 0; i < requiredFields.size(); i++) {
+            String field = requiredFields.get(i);
+            int orcIndex = orcFields.indexOf(field);
+            checkArgument(orcIndex != -1, "Cannot find field %s in orc file meta.", field);
+            TypeDescription requiredType = requiredTypes.get(i);
+            TypeDescription orcType = orcTypes.get(orcIndex);
+            checkField(field, requiredType, orcType);
+        }
+    }
+
+    private void checkField(
+            String fieldName, TypeDescription requiredType, TypeDescription orcType) {
+        checkFieldIdAttribute(fieldName, requiredType, orcType);
+        if (requiredType.getCategory().isPrimitive()) {
+            return;
+        }
+
+        switch (requiredType.getCategory()) {
+            case LIST:
+                checkField(
+                        "_elem", requiredType.getChildren().get(0), orcType.getChildren().get(0));
+                return;
+            case MAP:
+                checkField("_key", requiredType.getChildren().get(0), orcType.getChildren().get(0));
+                checkField(
+                        "_value", requiredType.getChildren().get(1), orcType.getChildren().get(1));
+                return;
+            case STRUCT:
+                checkStruct(requiredType, orcType);
+                return;
+            default:
+                throw new UnsupportedOperationException("Unsupported orc type: " + requiredType);
+        }
+    }
+
+    private void checkFieldIdAttribute(
+            String fieldName, TypeDescription requiredType, TypeDescription orcType) {
+        String requiredId = requiredType.getAttributeValue(PAIMON_ORC_FIELD_ID_KEY);
+        String orcId = orcType.getAttributeValue(PAIMON_ORC_FIELD_ID_KEY);
+        checkArgument(
+                Objects.equal(requiredId, orcId),
+                "Field %s has different id: read type id is %s but orc type id is %s. This is unexpected.",
+                fieldName,
+                requiredId,
+                orcId);
+    }
+}
diff --git a/paimon-format/src/test/java/org/apache/paimon/format/orc/OrcWriterFactoryTest.java b/paimon-format/src/test/java/org/apache/paimon/format/orc/OrcWriterFactoryTest.java
index 2511d7ed7..52df5afb4 100644
--- a/paimon-format/src/test/java/org/apache/paimon/format/orc/OrcWriterFactoryTest.java
+++ b/paimon-format/src/test/java/org/apache/paimon/format/orc/OrcWriterFactoryTest.java
@@ -28,6 +28,7 @@ import org.apache.paimon.types.DataTypes;
 import org.apache.hadoop.fs.Path;
 import org.apache.orc.MemoryManager;
 import org.apache.orc.OrcFile;
+import org.apache.orc.TypeDescription;
 import org.junit.jupiter.api.Test;
 import org.junit.jupiter.api.io.TempDir;
 
@@ -47,7 +48,7 @@ class OrcWriterFactoryTest {
         OrcWriterFactory factory =
                 new TestOrcWriterFactory(
                         new RowDataVectorizer(
-                                "struct<_col0:string,_col1:int>",
+                                TypeDescription.fromString("struct<_col0:string,_col1:int>"),
                                 new DataType[] {DataTypes.STRING(), DataTypes.INT()}),
                         memoryManager);
         factory.create(new LocalPositionOutputStream(tmpDir.resolve("file1").toFile()), "LZ4");

Reply via email to