This is an automated email from the ASF dual-hosted git repository.
lzljs3620320 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/paimon.git
The following commit(s) were added to refs/heads/master by this push:
new 06fbb5e1f [orc] Add type id to orc files (#4523)
06fbb5e1f is described below
commit 06fbb5e1f1d9162c87dac6e707c38315beb4f6a8
Author: yuzelin <[email protected]>
AuthorDate: Tue Nov 19 20:20:57 2024 +0800
[orc] Add type id to orc files (#4523)
---
.../apache/paimon/format/orc/OrcFileFormat.java | 8 +-
.../apache/paimon/format/orc/OrcReaderFactory.java | 4 +-
.../org/apache/paimon/format/orc/OrcTypeUtil.java | 147 +++++++++++++++
.../format/orc/reader/OrcSplitReaderUtil.java | 99 ----------
.../format/orc/writer/RowDataVectorizer.java | 3 +-
.../paimon/format/orc/writer/Vectorizer.java | 4 +-
.../paimon/format/orc/OrcSplitReaderUtilTest.java | 68 -------
.../apache/paimon/format/orc/OrcTypeUtilTest.java | 206 +++++++++++++++++++++
.../paimon/format/orc/OrcWriterFactoryTest.java | 3 +-
9 files changed, 364 insertions(+), 178 deletions(-)
diff --git a/paimon-format/src/main/java/org/apache/paimon/format/orc/OrcFileFormat.java b/paimon-format/src/main/java/org/apache/paimon/format/orc/OrcFileFormat.java
index c564b6940..c3521c6f1 100644
--- a/paimon-format/src/main/java/org/apache/paimon/format/orc/OrcFileFormat.java
+++ b/paimon-format/src/main/java/org/apache/paimon/format/orc/OrcFileFormat.java
@@ -28,7 +28,6 @@ import org.apache.paimon.format.SimpleStatsExtractor;
import org.apache.paimon.format.orc.filter.OrcFilters;
import org.apache.paimon.format.orc.filter.OrcPredicateFunctionVisitor;
import org.apache.paimon.format.orc.filter.OrcSimpleStatsExtractor;
-import org.apache.paimon.format.orc.reader.OrcSplitReaderUtil;
import org.apache.paimon.format.orc.writer.RowDataVectorizer;
import org.apache.paimon.format.orc.writer.Vectorizer;
import org.apache.paimon.options.MemorySize;
@@ -123,7 +122,7 @@ public class OrcFileFormat extends FileFormat {
@Override
public void validateDataFields(RowType rowType) {
DataType refinedType = refineDataType(rowType);
- OrcSplitReaderUtil.toOrcType(refinedType);
+ OrcTypeUtil.convertToOrcSchema((RowType) refinedType);
}
/**
@@ -141,9 +140,8 @@ public class OrcFileFormat extends FileFormat {
DataType refinedType = refineDataType(type);
DataType[] orcTypes = getFieldTypes(refinedType).toArray(new
DataType[0]);
- TypeDescription typeDescription =
OrcSplitReaderUtil.toOrcType(refinedType);
- Vectorizer<InternalRow> vectorizer =
- new RowDataVectorizer(typeDescription.toString(), orcTypes);
+ TypeDescription typeDescription =
OrcTypeUtil.convertToOrcSchema((RowType) refinedType);
+ Vectorizer<InternalRow> vectorizer = new
RowDataVectorizer(typeDescription, orcTypes);
return new OrcWriterFactory(vectorizer, orcProperties, writerConf,
writeBatchSize);
}
diff --git a/paimon-format/src/main/java/org/apache/paimon/format/orc/OrcReaderFactory.java b/paimon-format/src/main/java/org/apache/paimon/format/orc/OrcReaderFactory.java
index 05f3dd785..ee0f8a55c 100644
--- a/paimon-format/src/main/java/org/apache/paimon/format/orc/OrcReaderFactory.java
+++ b/paimon-format/src/main/java/org/apache/paimon/format/orc/OrcReaderFactory.java
@@ -55,8 +55,8 @@ import javax.annotation.Nullable;
import java.io.IOException;
import java.util.List;
+import static org.apache.paimon.format.orc.OrcTypeUtil.convertToOrcSchema;
import static
org.apache.paimon.format.orc.reader.AbstractOrcColumnVector.createPaimonVector;
-import static org.apache.paimon.format.orc.reader.OrcSplitReaderUtil.toOrcType;
import static org.apache.paimon.utils.Preconditions.checkNotNull;
/** An ORC reader that produces a stream of {@link ColumnarRow} records. */
@@ -81,7 +81,7 @@ public class OrcReaderFactory implements FormatReaderFactory {
final int batchSize,
final boolean deletionVectorsEnabled) {
this.hadoopConfig = checkNotNull(hadoopConfig);
- this.schema = toOrcType(readType);
+ this.schema = convertToOrcSchema(readType);
this.tableType = readType;
this.conjunctPredicates = checkNotNull(conjunctPredicates);
this.batchSize = batchSize;
diff --git a/paimon-format/src/main/java/org/apache/paimon/format/orc/OrcTypeUtil.java b/paimon-format/src/main/java/org/apache/paimon/format/orc/OrcTypeUtil.java
new file mode 100644
index 000000000..f7d3d626d
--- /dev/null
+++ b/paimon-format/src/main/java/org/apache/paimon/format/orc/OrcTypeUtil.java
@@ -0,0 +1,147 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.format.orc;
+
+import org.apache.paimon.annotation.VisibleForTesting;
+import org.apache.paimon.table.SpecialFields;
+import org.apache.paimon.types.ArrayType;
+import org.apache.paimon.types.CharType;
+import org.apache.paimon.types.DataField;
+import org.apache.paimon.types.DataType;
+import org.apache.paimon.types.DataTypes;
+import org.apache.paimon.types.DecimalType;
+import org.apache.paimon.types.MapType;
+import org.apache.paimon.types.RowType;
+import org.apache.paimon.types.VarCharType;
+
+import org.apache.orc.TypeDescription;
+
+/** Util for orc types. */
+public class OrcTypeUtil {
+
+ public static final String PAIMON_ORC_FIELD_ID_KEY = "paimon.id";
+
+ public static TypeDescription convertToOrcSchema(RowType rowType) {
+ TypeDescription struct = TypeDescription.createStruct();
+ for (DataField dataField : rowType.getFields()) {
+ TypeDescription child = convertToOrcType(dataField.type(),
dataField.id(), 0);
+ struct.addField(dataField.name(), child);
+ }
+ return struct;
+ }
+
+ @VisibleForTesting
+ static TypeDescription convertToOrcType(DataType type, int fieldId, int
depth) {
+ type = type.copy(true);
+ switch (type.getTypeRoot()) {
+ case CHAR:
+ return TypeDescription.createChar()
+ .withMaxLength(((CharType) type).getLength())
+ .setAttribute(PAIMON_ORC_FIELD_ID_KEY,
String.valueOf(fieldId));
+ case VARCHAR:
+ int len = ((VarCharType) type).getLength();
+ if (len == VarCharType.MAX_LENGTH) {
+ return TypeDescription.createString()
+ .setAttribute(PAIMON_ORC_FIELD_ID_KEY,
String.valueOf(fieldId));
+ } else {
+ return TypeDescription.createVarchar()
+ .withMaxLength(len)
+ .setAttribute(PAIMON_ORC_FIELD_ID_KEY,
String.valueOf(fieldId));
+ }
+ case BOOLEAN:
+ return TypeDescription.createBoolean()
+ .setAttribute(PAIMON_ORC_FIELD_ID_KEY,
String.valueOf(fieldId));
+ case VARBINARY:
+ if (type.equals(DataTypes.BYTES())) {
+ return TypeDescription.createBinary()
+ .setAttribute(PAIMON_ORC_FIELD_ID_KEY,
String.valueOf(fieldId));
+ } else {
+ throw new UnsupportedOperationException(
+ "Not support other binary type: " + type);
+ }
+ case DECIMAL:
+ DecimalType decimalType = (DecimalType) type;
+ return TypeDescription.createDecimal()
+ .withScale(decimalType.getScale())
+ .withPrecision(decimalType.getPrecision())
+ .setAttribute(PAIMON_ORC_FIELD_ID_KEY,
String.valueOf(fieldId));
+ case TINYINT:
+ return TypeDescription.createByte()
+ .setAttribute(PAIMON_ORC_FIELD_ID_KEY,
String.valueOf(fieldId));
+ case SMALLINT:
+ return TypeDescription.createShort()
+ .setAttribute(PAIMON_ORC_FIELD_ID_KEY,
String.valueOf(fieldId));
+ case INTEGER:
+ case TIME_WITHOUT_TIME_ZONE:
+ return TypeDescription.createInt()
+ .setAttribute(PAIMON_ORC_FIELD_ID_KEY,
String.valueOf(fieldId));
+ case BIGINT:
+ return TypeDescription.createLong()
+ .setAttribute(PAIMON_ORC_FIELD_ID_KEY,
String.valueOf(fieldId));
+ case FLOAT:
+ return TypeDescription.createFloat()
+ .setAttribute(PAIMON_ORC_FIELD_ID_KEY,
String.valueOf(fieldId));
+ case DOUBLE:
+ return TypeDescription.createDouble()
+ .setAttribute(PAIMON_ORC_FIELD_ID_KEY,
String.valueOf(fieldId));
+ case DATE:
+ return TypeDescription.createDate()
+ .setAttribute(PAIMON_ORC_FIELD_ID_KEY,
String.valueOf(fieldId));
+ case TIMESTAMP_WITHOUT_TIME_ZONE:
+ return TypeDescription.createTimestamp()
+ .setAttribute(PAIMON_ORC_FIELD_ID_KEY,
String.valueOf(fieldId));
+ case TIMESTAMP_WITH_LOCAL_TIME_ZONE:
+ return TypeDescription.createTimestampInstant()
+ .setAttribute(PAIMON_ORC_FIELD_ID_KEY,
String.valueOf(fieldId));
+ case ARRAY:
+ ArrayType arrayType = (ArrayType) type;
+
+ String elementFieldId =
+
String.valueOf(SpecialFields.getArrayElementFieldId(fieldId, depth + 1));
+ TypeDescription elementOrcType =
+ convertToOrcType(arrayType.getElementType(), fieldId,
depth + 1)
+ .setAttribute(PAIMON_ORC_FIELD_ID_KEY,
elementFieldId);
+
+ return TypeDescription.createList(elementOrcType)
+ .setAttribute(PAIMON_ORC_FIELD_ID_KEY,
String.valueOf(fieldId));
+ case MAP:
+ MapType mapType = (MapType) type;
+
+ String mapKeyFieldId =
+ String.valueOf(SpecialFields.getMapKeyFieldId(fieldId,
depth + 1));
+ TypeDescription mapKeyOrcType =
+ convertToOrcType(mapType.getKeyType(), fieldId, depth
+ 1)
+ .setAttribute(PAIMON_ORC_FIELD_ID_KEY,
mapKeyFieldId);
+
+ String mapValueFieldId =
+
String.valueOf(SpecialFields.getMapValueFieldId(fieldId, depth + 1));
+ TypeDescription mapValueOrcType =
+ convertToOrcType(mapType.getValueType(), fieldId,
depth + 1)
+ .setAttribute(PAIMON_ORC_FIELD_ID_KEY,
mapValueFieldId);
+
+ return TypeDescription.createMap(mapKeyOrcType,
mapValueOrcType)
+ .setAttribute(PAIMON_ORC_FIELD_ID_KEY,
String.valueOf(fieldId));
+ case ROW:
+ return convertToOrcSchema((RowType) type)
+ .setAttribute(PAIMON_ORC_FIELD_ID_KEY,
String.valueOf(fieldId));
+ default:
+ throw new UnsupportedOperationException("Unsupported type: " +
type);
+ }
+ }
+}
diff --git a/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcSplitReaderUtil.java b/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcSplitReaderUtil.java
deleted file mode 100644
index 882f1c753..000000000
--- a/paimon-format/src/main/java/org/apache/paimon/format/orc/reader/OrcSplitReaderUtil.java
+++ /dev/null
@@ -1,99 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.paimon.format.orc.reader;
-
-import org.apache.paimon.types.ArrayType;
-import org.apache.paimon.types.CharType;
-import org.apache.paimon.types.DataType;
-import org.apache.paimon.types.DataTypes;
-import org.apache.paimon.types.DecimalType;
-import org.apache.paimon.types.MapType;
-import org.apache.paimon.types.RowType;
-import org.apache.paimon.types.VarCharType;
-
-import org.apache.orc.TypeDescription;
-
-/** Util for orc types. */
-public class OrcSplitReaderUtil {
-
- public static TypeDescription toOrcType(DataType type) {
- type = type.copy(true);
- switch (type.getTypeRoot()) {
- case CHAR:
- return TypeDescription.createChar().withMaxLength(((CharType)
type).getLength());
- case VARCHAR:
- int len = ((VarCharType) type).getLength();
- if (len == VarCharType.MAX_LENGTH) {
- return TypeDescription.createString();
- } else {
- return TypeDescription.createVarchar().withMaxLength(len);
- }
- case BOOLEAN:
- return TypeDescription.createBoolean();
- case VARBINARY:
- if (type.equals(DataTypes.BYTES())) {
- return TypeDescription.createBinary();
- } else {
- throw new UnsupportedOperationException(
- "Not support other binary type: " + type);
- }
- case DECIMAL:
- DecimalType decimalType = (DecimalType) type;
- return TypeDescription.createDecimal()
- .withScale(decimalType.getScale())
- .withPrecision(decimalType.getPrecision());
- case TINYINT:
- return TypeDescription.createByte();
- case SMALLINT:
- return TypeDescription.createShort();
- case INTEGER:
- case TIME_WITHOUT_TIME_ZONE:
- return TypeDescription.createInt();
- case BIGINT:
- return TypeDescription.createLong();
- case FLOAT:
- return TypeDescription.createFloat();
- case DOUBLE:
- return TypeDescription.createDouble();
- case DATE:
- return TypeDescription.createDate();
- case TIMESTAMP_WITHOUT_TIME_ZONE:
- return TypeDescription.createTimestamp();
- case TIMESTAMP_WITH_LOCAL_TIME_ZONE:
- return TypeDescription.createTimestampInstant();
- case ARRAY:
- ArrayType arrayType = (ArrayType) type;
- return
TypeDescription.createList(toOrcType(arrayType.getElementType()));
- case MAP:
- MapType mapType = (MapType) type;
- return TypeDescription.createMap(
- toOrcType(mapType.getKeyType()),
toOrcType(mapType.getValueType()));
- case ROW:
- RowType rowType = (RowType) type;
- TypeDescription struct = TypeDescription.createStruct();
- for (int i = 0; i < rowType.getFieldCount(); i++) {
- struct.addField(
- rowType.getFieldNames().get(i),
toOrcType(rowType.getTypeAt(i)));
- }
- return struct;
- default:
- throw new UnsupportedOperationException("Unsupported type: " +
type);
- }
- }
-}
diff --git a/paimon-format/src/main/java/org/apache/paimon/format/orc/writer/RowDataVectorizer.java b/paimon-format/src/main/java/org/apache/paimon/format/orc/writer/RowDataVectorizer.java
index 21443cdf9..46c936a02 100644
--- a/paimon-format/src/main/java/org/apache/paimon/format/orc/writer/RowDataVectorizer.java
+++ b/paimon-format/src/main/java/org/apache/paimon/format/orc/writer/RowDataVectorizer.java
@@ -23,6 +23,7 @@ import org.apache.paimon.types.DataType;
import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.orc.TypeDescription;
import java.util.Arrays;
import java.util.List;
@@ -35,7 +36,7 @@ public class RowDataVectorizer extends Vectorizer<InternalRow> {
private final List<FieldWriter> fieldWriters;
- public RowDataVectorizer(String schema, DataType[] fieldTypes) {
+ public RowDataVectorizer(TypeDescription schema, DataType[] fieldTypes) {
super(schema);
this.fieldWriters =
Arrays.stream(fieldTypes)
diff --git a/paimon-format/src/main/java/org/apache/paimon/format/orc/writer/Vectorizer.java b/paimon-format/src/main/java/org/apache/paimon/format/orc/writer/Vectorizer.java
index 0f0e6bba7..2add46531 100644
--- a/paimon-format/src/main/java/org/apache/paimon/format/orc/writer/Vectorizer.java
+++ b/paimon-format/src/main/java/org/apache/paimon/format/orc/writer/Vectorizer.java
@@ -39,9 +39,9 @@ public abstract class Vectorizer<T> implements Serializable {
private final TypeDescription schema;
- public Vectorizer(final String schema) {
+ public Vectorizer(final TypeDescription schema) {
checkNotNull(schema);
- this.schema = TypeDescription.fromString(schema);
+ this.schema = schema;
}
/**
diff --git a/paimon-format/src/test/java/org/apache/paimon/format/orc/OrcSplitReaderUtilTest.java b/paimon-format/src/test/java/org/apache/paimon/format/orc/OrcSplitReaderUtilTest.java
deleted file mode 100644
index c07838dfa..000000000
--- a/paimon-format/src/test/java/org/apache/paimon/format/orc/OrcSplitReaderUtilTest.java
+++ /dev/null
@@ -1,68 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.paimon.format.orc;
-
-import org.apache.paimon.format.orc.reader.OrcSplitReaderUtil;
-import org.apache.paimon.types.DataType;
-import org.apache.paimon.types.DataTypes;
-
-import org.junit.jupiter.api.Test;
-
-import static org.apache.paimon.format.orc.reader.OrcSplitReaderUtil.toOrcType;
-import static org.assertj.core.api.Assertions.assertThat;
-
-/** Test for {@link OrcSplitReaderUtil}. */
-class OrcSplitReaderUtilTest {
-
- @Test
- void testDataTypeToOrcType() {
- test("boolean", DataTypes.BOOLEAN());
- test("char(123)", DataTypes.CHAR(123));
- test("varchar(123)", DataTypes.VARCHAR(123));
- test("string", DataTypes.STRING());
- test("binary", DataTypes.BYTES());
- test("tinyint", DataTypes.TINYINT());
- test("smallint", DataTypes.SMALLINT());
- test("int", DataTypes.INT());
- test("bigint", DataTypes.BIGINT());
- test("float", DataTypes.FLOAT());
- test("double", DataTypes.DOUBLE());
- test("date", DataTypes.DATE());
- test("timestamp", DataTypes.TIMESTAMP());
- test("array<float>", DataTypes.ARRAY(DataTypes.FLOAT()));
- test("map<float,bigint>", DataTypes.MAP(DataTypes.FLOAT(),
DataTypes.BIGINT()));
- test(
-
"struct<int0:int,str1:string,double2:double,row3:struct<int0:int,int1:int>>",
- DataTypes.ROW(
- DataTypes.FIELD(0, "int0", DataTypes.INT()),
- DataTypes.FIELD(1, "str1", DataTypes.STRING()),
- DataTypes.FIELD(2, "double2", DataTypes.DOUBLE()),
- DataTypes.FIELD(
- 3,
- "row3",
- DataTypes.ROW(
- DataTypes.FIELD(4, "int0",
DataTypes.INT()),
- DataTypes.FIELD(5, "int1",
DataTypes.INT())))));
- test("decimal(4,2)", DataTypes.DECIMAL(4, 2));
- }
-
- private void test(String expected, DataType type) {
- assertThat(toOrcType(type)).hasToString(expected);
- }
-}
diff --git a/paimon-format/src/test/java/org/apache/paimon/format/orc/OrcTypeUtilTest.java b/paimon-format/src/test/java/org/apache/paimon/format/orc/OrcTypeUtilTest.java
new file mode 100644
index 000000000..5669ac33d
--- /dev/null
+++ b/paimon-format/src/test/java/org/apache/paimon/format/orc/OrcTypeUtilTest.java
@@ -0,0 +1,206 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.format.orc;
+
+import org.apache.paimon.format.FileFormatFactory;
+import org.apache.paimon.format.FormatWriter;
+import org.apache.paimon.fs.FileIO;
+import org.apache.paimon.fs.Path;
+import org.apache.paimon.fs.PositionOutputStream;
+import org.apache.paimon.fs.local.LocalFileIO;
+import org.apache.paimon.options.Options;
+import org.apache.paimon.types.DataType;
+import org.apache.paimon.types.DataTypes;
+import org.apache.paimon.types.RowType;
+
+import org.apache.paimon.shade.guava30.com.google.common.base.Objects;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.orc.Reader;
+import org.apache.orc.TypeDescription;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.io.TempDir;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.UUID;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import static org.apache.paimon.format.orc.OrcFileFormat.refineDataType;
+import static org.apache.paimon.format.orc.OrcTypeUtil.PAIMON_ORC_FIELD_ID_KEY;
+import static org.apache.paimon.format.orc.OrcTypeUtil.convertToOrcSchema;
+import static org.apache.paimon.format.orc.OrcTypeUtil.convertToOrcType;
+import static org.apache.paimon.utils.Preconditions.checkArgument;
+import static org.assertj.core.api.Assertions.assertThat;
+import static org.assertj.core.api.Assertions.assertThatNoException;
+
+/** Test for {@link OrcTypeUtil}. */
+class OrcTypeUtilTest {
+
+ @Test
+ void testDataTypeToOrcType() {
+ test("boolean", DataTypes.BOOLEAN());
+ test("char(123)", DataTypes.CHAR(123));
+ test("varchar(123)", DataTypes.VARCHAR(123));
+ test("string", DataTypes.STRING());
+ test("binary", DataTypes.BYTES());
+ test("tinyint", DataTypes.TINYINT());
+ test("smallint", DataTypes.SMALLINT());
+ test("int", DataTypes.INT());
+ test("bigint", DataTypes.BIGINT());
+ test("float", DataTypes.FLOAT());
+ test("double", DataTypes.DOUBLE());
+ test("date", DataTypes.DATE());
+ test("timestamp", DataTypes.TIMESTAMP());
+ test("array<float>", DataTypes.ARRAY(DataTypes.FLOAT()));
+ test("map<float,bigint>", DataTypes.MAP(DataTypes.FLOAT(),
DataTypes.BIGINT()));
+ test(
+
"struct<int0:int,str1:string,double2:double,row3:struct<int0:int,int1:int>>",
+ DataTypes.ROW(
+ DataTypes.FIELD(0, "int0", DataTypes.INT()),
+ DataTypes.FIELD(1, "str1", DataTypes.STRING()),
+ DataTypes.FIELD(2, "double2", DataTypes.DOUBLE()),
+ DataTypes.FIELD(
+ 3,
+ "row3",
+ DataTypes.ROW(
+ DataTypes.FIELD(4, "int0",
DataTypes.INT()),
+ DataTypes.FIELD(5, "int1",
DataTypes.INT())))));
+ test("decimal(4,2)", DataTypes.DECIMAL(4, 2));
+ }
+
+ private void test(String expected, DataType type) {
+ assertThat(convertToOrcType(type, -1, -1)).hasToString(expected);
+ }
+
+ @Test
+ void testFieldIdAttribute(@TempDir java.nio.file.Path tempPath) throws
IOException {
+ RowType rowType =
+ RowType.builder()
+ .field("a", DataTypes.INT())
+ .field(
+ "b",
+ RowType.builder(true, new AtomicInteger(10))
+ .field("f0", DataTypes.STRING())
+ .field("f1", DataTypes.INT())
+ .build())
+ .field("c", DataTypes.ARRAY(DataTypes.INT()))
+ .field("d", DataTypes.MAP(DataTypes.INT(),
DataTypes.STRING()))
+ .field(
+ "e",
+ DataTypes.ARRAY(
+ RowType.builder(true, new
AtomicInteger(20))
+ .field("f0",
DataTypes.STRING())
+ .field("f1", DataTypes.INT())
+ .build()))
+ .field(
+ "f",
+ RowType.builder(true, new AtomicInteger(30))
+ .field("f0",
DataTypes.ARRAY(DataTypes.INT()))
+ .build())
+ .build();
+
+ // write schema to orc file then get
+ FileIO fileIO = LocalFileIO.create();
+ Path tempFile = new Path(new Path(tempPath.toUri()),
UUID.randomUUID().toString());
+
+ OrcFileFormat format =
+ new OrcFileFormat(new FileFormatFactory.FormatContext(new
Options(), 1024, 1024));
+ PositionOutputStream out = fileIO.newOutputStream(tempFile, false);
+ FormatWriter writer = format.createWriterFactory(rowType).create(out,
"zstd");
+ writer.close();
+ out.close();
+
+ Reader orcReader =
+ OrcReaderFactory.createReader(new Configuration(), fileIO,
tempFile, null);
+ TypeDescription orcSchema = orcReader.getSchema();
+
+ RowType refined = (RowType) refineDataType(rowType);
+
+ assertThatNoException()
+ .isThrownBy(() -> checkStruct(convertToOrcSchema(refined),
orcSchema));
+
+ assertThatNoException()
+ .isThrownBy(
+ () ->
+ checkStruct(
+
convertToOrcSchema(refined.project("c", "b", "d")),
+ orcSchema));
+
+ assertThatNoException()
+ .isThrownBy(
+ () ->
+ checkStruct(
+
convertToOrcSchema(refined.project("a", "e", "f")),
+ orcSchema));
+ }
+
+ private void checkStruct(TypeDescription requiredStruct, TypeDescription
orcStruct) {
+ List<String> requiredFields = requiredStruct.getFieldNames();
+ List<TypeDescription> requiredTypes = requiredStruct.getChildren();
+ List<String> orcFields = orcStruct.getFieldNames();
+ List<TypeDescription> orcTypes = orcStruct.getChildren();
+
+ for (int i = 0; i < requiredFields.size(); i++) {
+ String field = requiredFields.get(i);
+ int orcIndex = orcFields.indexOf(field);
+ checkArgument(orcIndex != -1, "Cannot find field %s in orc file
meta.", field);
+ TypeDescription requiredType = requiredTypes.get(i);
+ TypeDescription orcType = orcTypes.get(orcIndex);
+ checkField(field, requiredType, orcType);
+ }
+ }
+
+ private void checkField(
+ String fieldName, TypeDescription requiredType, TypeDescription
orcType) {
+ checkFieldIdAttribute(fieldName, requiredType, orcType);
+ if (requiredType.getCategory().isPrimitive()) {
+ return;
+ }
+
+ switch (requiredType.getCategory()) {
+ case LIST:
+ checkField(
+ "_elem", requiredType.getChildren().get(0),
orcType.getChildren().get(0));
+ return;
+ case MAP:
+ checkField("_key", requiredType.getChildren().get(0),
orcType.getChildren().get(0));
+ checkField(
+ "_value", requiredType.getChildren().get(1),
orcType.getChildren().get(1));
+ return;
+ case STRUCT:
+ checkStruct(requiredType, orcType);
+ return;
+ default:
+ throw new UnsupportedOperationException("Unsupported orc type:
" + requiredType);
+ }
+ }
+
+ private void checkFieldIdAttribute(
+ String fieldName, TypeDescription requiredType, TypeDescription
orcType) {
+ String requiredId =
requiredType.getAttributeValue(PAIMON_ORC_FIELD_ID_KEY);
+ String orcId = orcType.getAttributeValue(PAIMON_ORC_FIELD_ID_KEY);
+ checkArgument(
+ Objects.equal(requiredId, orcId),
+ "Field %s has different id: read type id is %s but orc type id
is %s. This is unexpected.",
+ fieldName,
+ requiredId,
+ orcId);
+ }
+}
diff --git a/paimon-format/src/test/java/org/apache/paimon/format/orc/OrcWriterFactoryTest.java b/paimon-format/src/test/java/org/apache/paimon/format/orc/OrcWriterFactoryTest.java
index 2511d7ed7..52df5afb4 100644
--- a/paimon-format/src/test/java/org/apache/paimon/format/orc/OrcWriterFactoryTest.java
+++ b/paimon-format/src/test/java/org/apache/paimon/format/orc/OrcWriterFactoryTest.java
@@ -28,6 +28,7 @@ import org.apache.paimon.types.DataTypes;
import org.apache.hadoop.fs.Path;
import org.apache.orc.MemoryManager;
import org.apache.orc.OrcFile;
+import org.apache.orc.TypeDescription;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.io.TempDir;
@@ -47,7 +48,7 @@ class OrcWriterFactoryTest {
OrcWriterFactory factory =
new TestOrcWriterFactory(
new RowDataVectorizer(
- "struct<_col0:string,_col1:int>",
+
TypeDescription.fromString("struct<_col0:string,_col1:int>"),
new DataType[] {DataTypes.STRING(),
DataTypes.INT()}),
memoryManager);
factory.create(new
LocalPositionOutputStream(tmpDir.resolve("file1").toFile()), "LZ4");