[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16714408#comment-16714408 ] ASF GitHub Bot commented on FLINK-7243: --- HuangZhenQiu commented on issue #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483#issuecomment-445726623 @fhueske Thanks for pointing out so many missing parts. As you pointed out, I added the SqlTypeInfo conversion and enforced the List and Map schema conventions in the diff. Please review it when you have time. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: users@infra.apache.org > Add ParquetInputFormat > -- > > Key: FLINK-7243 > URL: https://issues.apache.org/jira/browse/FLINK-7243 > Project: Flink > Issue Type: Sub-task > Components: Table API & SQL >Reporter: godfrey he >Assignee: Zhenqiu Huang >Priority: Major > Labels: pull-request-available > > Add a {{ParquetInputFormat}} to read data from an Apache Parquet file. -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16714232#comment-16714232 ] ASF GitHub Bot commented on FLINK-7243: --- HuangZhenQiu commented on a change in pull request #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483#discussion_r240074303 ## File path: flink-formats/flink-parquet/src/main/java/org/apache/flink/formats/parquet/utils/ParquetSchemaConverter.java ## @@ -0,0 +1,359 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.formats.parquet.utils; + +import org.apache.flink.api.common.typeinfo.BasicArrayTypeInfo; +import org.apache.flink.api.common.typeinfo.BasicTypeInfo; +import org.apache.flink.api.common.typeinfo.SqlTimeTypeInfo; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.java.typeutils.MapTypeInfo; +import org.apache.flink.api.java.typeutils.ObjectArrayTypeInfo; +import org.apache.flink.api.java.typeutils.RowTypeInfo; + +import org.apache.parquet.avro.AvroSchemaConverter; +import org.apache.parquet.schema.GroupType; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.OriginalType; +import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.Type; +import org.apache.parquet.schema.Types; + +import java.lang.reflect.Array; +import java.util.ArrayList; +import java.util.List; + +/** + * Schema converter converts Parquet schema to and from Flink internal types. + */ +public class ParquetSchemaConverter { + public static final String MAP_KEY = "key"; + public static final String MAP_VALUE = "value"; + public static final String LIST_ARRAY_TYPE = "array"; + public static final String LIST_ELEMENT = "element"; + public static final String LIST_GROUP_NAME = "list"; + public static final String MESSAGE_ROOT = "root"; + private static final AvroSchemaConverter SCHEMA_CONVERTER = new AvroSchemaConverter(); + + public static TypeInformation fromParquetType(MessageType type) { + return convertFields(type.getFields()); + } + + /** +* Converts Flink Internal Type to Parquet schema. 
+* +* @param typeInformation flink type information +* @param isStandard is standard LIST and MAP schema or back-compatible schema +* @return Parquet schema +*/ + public static MessageType toParquetType(TypeInformation typeInformation, boolean isStandard) { + return (MessageType) convertField(null, typeInformation, Type.Repetition.OPTIONAL, isStandard); + } + + private static TypeInformation convertFields(List parquetFields) { + List> types = new ArrayList<>(); + List names = new ArrayList<>(); + for (Type field : parquetFields) { + TypeInformation subType = convertField(field); + if (subType != null) { + types.add(subType); + names.add(field.getName()); + } + } + + return new RowTypeInfo(types.toArray(new TypeInformation[types.size()]), + names.toArray(new String[names.size()])); + } + + private static TypeInformation convertField(final Type fieldType) { + TypeInformation typeInfo = null; + if (fieldType.isPrimitive()) { + OriginalType originalType = fieldType.getOriginalType(); + PrimitiveType primitiveType = fieldType.asPrimitiveType(); + switch (primitiveType.getPrimitiveTypeName()) { + case BINARY: + if (originalType != null) { + switch (originalType) { + case DECIMAL: + typeInfo = BasicTypeInfo.BIG_DEC_TYPE_INFO; +
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16707845#comment-16707845 ] ASF GitHub Bot commented on FLINK-7243: --- fhueske commented on a change in pull request #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483#discussion_r238447232 ## File path: flink-formats/flink-parquet/src/main/java/org/apache/flink/formats/parquet/utils/ParquetSchemaConverter.java ## @@ -0,0 +1,359 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.formats.parquet.utils; + +import org.apache.flink.api.common.typeinfo.BasicArrayTypeInfo; +import org.apache.flink.api.common.typeinfo.BasicTypeInfo; +import org.apache.flink.api.common.typeinfo.SqlTimeTypeInfo; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.java.typeutils.MapTypeInfo; +import org.apache.flink.api.java.typeutils.ObjectArrayTypeInfo; +import org.apache.flink.api.java.typeutils.RowTypeInfo; + +import org.apache.parquet.avro.AvroSchemaConverter; +import org.apache.parquet.schema.GroupType; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.OriginalType; +import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.Type; +import org.apache.parquet.schema.Types; + +import java.lang.reflect.Array; +import java.util.ArrayList; +import java.util.List; + +/** + * Schema converter converts Parquet schema to and from Flink internal types. + */ +public class ParquetSchemaConverter { + public static final String MAP_KEY = "key"; + public static final String MAP_VALUE = "value"; + public static final String LIST_ARRAY_TYPE = "array"; + public static final String LIST_ELEMENT = "element"; + public static final String LIST_GROUP_NAME = "list"; + public static final String MESSAGE_ROOT = "root"; + private static final AvroSchemaConverter SCHEMA_CONVERTER = new AvroSchemaConverter(); + + public static TypeInformation fromParquetType(MessageType type) { + return convertFields(type.getFields()); + } + + /** +* Converts Flink Internal Type to Parquet schema. 
+* +* @param typeInformation flink type information +* @param isStandard is standard LIST and MAP schema or back-compatible schema +* @return Parquet schema +*/ + public static MessageType toParquetType(TypeInformation typeInformation, boolean isStandard) { + return (MessageType) convertField(null, typeInformation, Type.Repetition.OPTIONAL, isStandard); + } + + private static TypeInformation convertFields(List parquetFields) { + List> types = new ArrayList<>(); + List names = new ArrayList<>(); + for (Type field : parquetFields) { + TypeInformation subType = convertField(field); + if (subType != null) { + types.add(subType); + names.add(field.getName()); + } + } + + return new RowTypeInfo(types.toArray(new TypeInformation[types.size()]), + names.toArray(new String[names.size()])); + } + + private static TypeInformation convertField(final Type fieldType) { + TypeInformation typeInfo = null; + if (fieldType.isPrimitive()) { + OriginalType originalType = fieldType.getOriginalType(); + PrimitiveType primitiveType = fieldType.asPrimitiveType(); + switch (primitiveType.getPrimitiveTypeName()) { + case BINARY: + if (originalType != null) { + switch (originalType) { + case DECIMAL: + typeInfo = BasicTypeInfo.BIG_DEC_TYPE_INFO; +
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16707846#comment-16707846 ] ASF GitHub Bot commented on FLINK-7243: --- fhueske commented on a change in pull request #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483#discussion_r238438000 ## File path: flink-formats/flink-parquet/src/main/java/org/apache/flink/formats/parquet/utils/ParquetSchemaConverter.java ## @@ -0,0 +1,359 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.formats.parquet.utils; + +import org.apache.flink.api.common.typeinfo.BasicArrayTypeInfo; +import org.apache.flink.api.common.typeinfo.BasicTypeInfo; +import org.apache.flink.api.common.typeinfo.SqlTimeTypeInfo; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.java.typeutils.MapTypeInfo; +import org.apache.flink.api.java.typeutils.ObjectArrayTypeInfo; +import org.apache.flink.api.java.typeutils.RowTypeInfo; + +import org.apache.parquet.avro.AvroSchemaConverter; +import org.apache.parquet.schema.GroupType; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.OriginalType; +import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.Type; +import org.apache.parquet.schema.Types; + +import java.lang.reflect.Array; +import java.util.ArrayList; +import java.util.List; + +/** + * Schema converter converts Parquet schema to and from Flink internal types. + */ +public class ParquetSchemaConverter { + public static final String MAP_KEY = "key"; + public static final String MAP_VALUE = "value"; + public static final String LIST_ARRAY_TYPE = "array"; + public static final String LIST_ELEMENT = "element"; + public static final String LIST_GROUP_NAME = "list"; + public static final String MESSAGE_ROOT = "root"; + private static final AvroSchemaConverter SCHEMA_CONVERTER = new AvroSchemaConverter(); + + public static TypeInformation fromParquetType(MessageType type) { + return convertFields(type.getFields()); + } + + /** +* Converts Flink Internal Type to Parquet schema. 
+* +* @param typeInformation flink type information +* @param isStandard is standard LIST and MAP schema or back-compatible schema +* @return Parquet schema +*/ + public static MessageType toParquetType(TypeInformation typeInformation, boolean isStandard) { + return (MessageType) convertField(null, typeInformation, Type.Repetition.OPTIONAL, isStandard); + } + + private static TypeInformation convertFields(List parquetFields) { + List> types = new ArrayList<>(); + List names = new ArrayList<>(); + for (Type field : parquetFields) { + TypeInformation subType = convertField(field); + if (subType != null) { + types.add(subType); + names.add(field.getName()); + } + } + + return new RowTypeInfo(types.toArray(new TypeInformation[types.size()]), + names.toArray(new String[names.size()])); + } + + private static TypeInformation convertField(final Type fieldType) { + TypeInformation typeInfo = null; + if (fieldType.isPrimitive()) { + OriginalType originalType = fieldType.getOriginalType(); + PrimitiveType primitiveType = fieldType.asPrimitiveType(); + switch (primitiveType.getPrimitiveTypeName()) { + case BINARY: + if (originalType != null) { + switch (originalType) { + case DECIMAL: + typeInfo = BasicTypeInfo.BIG_DEC_TYPE_INFO; +
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16707849#comment-16707849 ] ASF GitHub Bot commented on FLINK-7243: --- fhueske commented on a change in pull request #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483#discussion_r238437634 ## File path: flink-formats/flink-parquet/src/main/java/org/apache/flink/formats/parquet/utils/ParquetSchemaConverter.java ## @@ -0,0 +1,359 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.formats.parquet.utils; + +import org.apache.flink.api.common.typeinfo.BasicArrayTypeInfo; +import org.apache.flink.api.common.typeinfo.BasicTypeInfo; +import org.apache.flink.api.common.typeinfo.SqlTimeTypeInfo; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.java.typeutils.MapTypeInfo; +import org.apache.flink.api.java.typeutils.ObjectArrayTypeInfo; +import org.apache.flink.api.java.typeutils.RowTypeInfo; + +import org.apache.parquet.avro.AvroSchemaConverter; +import org.apache.parquet.schema.GroupType; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.OriginalType; +import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.Type; +import org.apache.parquet.schema.Types; + +import java.lang.reflect.Array; +import java.util.ArrayList; +import java.util.List; + +/** + * Schema converter converts Parquet schema to and from Flink internal types. + */ +public class ParquetSchemaConverter { + public static final String MAP_KEY = "key"; + public static final String MAP_VALUE = "value"; + public static final String LIST_ARRAY_TYPE = "array"; + public static final String LIST_ELEMENT = "element"; + public static final String LIST_GROUP_NAME = "list"; + public static final String MESSAGE_ROOT = "root"; + private static final AvroSchemaConverter SCHEMA_CONVERTER = new AvroSchemaConverter(); + + public static TypeInformation fromParquetType(MessageType type) { + return convertFields(type.getFields()); + } + + /** +* Converts Flink Internal Type to Parquet schema. 
+* +* @param typeInformation flink type information +* @param isStandard is standard LIST and MAP schema or back-compatible schema +* @return Parquet schema +*/ + public static MessageType toParquetType(TypeInformation typeInformation, boolean isStandard) { + return (MessageType) convertField(null, typeInformation, Type.Repetition.OPTIONAL, isStandard); + } + + private static TypeInformation convertFields(List parquetFields) { + List> types = new ArrayList<>(); + List names = new ArrayList<>(); + for (Type field : parquetFields) { + TypeInformation subType = convertField(field); + if (subType != null) { + types.add(subType); + names.add(field.getName()); + } + } + + return new RowTypeInfo(types.toArray(new TypeInformation[types.size()]), + names.toArray(new String[names.size()])); + } + + private static TypeInformation convertField(final Type fieldType) { + TypeInformation typeInfo = null; + if (fieldType.isPrimitive()) { + OriginalType originalType = fieldType.getOriginalType(); + PrimitiveType primitiveType = fieldType.asPrimitiveType(); + switch (primitiveType.getPrimitiveTypeName()) { + case BINARY: + if (originalType != null) { + switch (originalType) { + case DECIMAL: + typeInfo = BasicTypeInfo.BIG_DEC_TYPE_INFO; +
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16707851#comment-16707851 ] ASF GitHub Bot commented on FLINK-7243: --- fhueske commented on a change in pull request #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483#discussion_r238439844 ## File path: flink-formats/flink-parquet/src/main/java/org/apache/flink/formats/parquet/utils/ParquetSchemaConverter.java ## @@ -0,0 +1,359 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.formats.parquet.utils; + +import org.apache.flink.api.common.typeinfo.BasicArrayTypeInfo; +import org.apache.flink.api.common.typeinfo.BasicTypeInfo; +import org.apache.flink.api.common.typeinfo.SqlTimeTypeInfo; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.java.typeutils.MapTypeInfo; +import org.apache.flink.api.java.typeutils.ObjectArrayTypeInfo; +import org.apache.flink.api.java.typeutils.RowTypeInfo; + +import org.apache.parquet.avro.AvroSchemaConverter; +import org.apache.parquet.schema.GroupType; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.OriginalType; +import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.Type; +import org.apache.parquet.schema.Types; + +import java.lang.reflect.Array; +import java.util.ArrayList; +import java.util.List; + +/** + * Schema converter converts Parquet schema to and from Flink internal types. + */ +public class ParquetSchemaConverter { + public static final String MAP_KEY = "key"; + public static final String MAP_VALUE = "value"; + public static final String LIST_ARRAY_TYPE = "array"; + public static final String LIST_ELEMENT = "element"; + public static final String LIST_GROUP_NAME = "list"; + public static final String MESSAGE_ROOT = "root"; + private static final AvroSchemaConverter SCHEMA_CONVERTER = new AvroSchemaConverter(); + + public static TypeInformation fromParquetType(MessageType type) { + return convertFields(type.getFields()); + } + + /** +* Converts Flink Internal Type to Parquet schema. 
+* +* @param typeInformation flink type information +* @param isStandard is standard LIST and MAP schema or back-compatible schema +* @return Parquet schema +*/ + public static MessageType toParquetType(TypeInformation typeInformation, boolean isStandard) { + return (MessageType) convertField(null, typeInformation, Type.Repetition.OPTIONAL, isStandard); + } + + private static TypeInformation convertFields(List parquetFields) { + List> types = new ArrayList<>(); + List names = new ArrayList<>(); + for (Type field : parquetFields) { + TypeInformation subType = convertField(field); + if (subType != null) { + types.add(subType); + names.add(field.getName()); + } + } + + return new RowTypeInfo(types.toArray(new TypeInformation[types.size()]), + names.toArray(new String[names.size()])); + } + + private static TypeInformation convertField(final Type fieldType) { + TypeInformation typeInfo = null; + if (fieldType.isPrimitive()) { + OriginalType originalType = fieldType.getOriginalType(); + PrimitiveType primitiveType = fieldType.asPrimitiveType(); + switch (primitiveType.getPrimitiveTypeName()) { + case BINARY: + if (originalType != null) { + switch (originalType) { + case DECIMAL: + typeInfo = BasicTypeInfo.BIG_DEC_TYPE_INFO; +
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16707841#comment-16707841 ] ASF GitHub Bot commented on FLINK-7243: --- fhueske commented on a change in pull request #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483#discussion_r238366692 ## File path: flink-formats/flink-parquet/src/main/java/org/apache/flink/formats/parquet/utils/ParquetSchemaConverter.java ## @@ -0,0 +1,359 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.formats.parquet.utils; + +import org.apache.flink.api.common.typeinfo.BasicArrayTypeInfo; +import org.apache.flink.api.common.typeinfo.BasicTypeInfo; +import org.apache.flink.api.common.typeinfo.SqlTimeTypeInfo; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.java.typeutils.MapTypeInfo; +import org.apache.flink.api.java.typeutils.ObjectArrayTypeInfo; +import org.apache.flink.api.java.typeutils.RowTypeInfo; + +import org.apache.parquet.avro.AvroSchemaConverter; +import org.apache.parquet.schema.GroupType; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.OriginalType; +import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.Type; +import org.apache.parquet.schema.Types; + +import java.lang.reflect.Array; +import java.util.ArrayList; +import java.util.List; + +/** + * Schema converter converts Parquet schema to and from Flink internal types. + */ +public class ParquetSchemaConverter { + public static final String MAP_KEY = "key"; + public static final String MAP_VALUE = "value"; + public static final String LIST_ARRAY_TYPE = "array"; + public static final String LIST_ELEMENT = "element"; + public static final String LIST_GROUP_NAME = "list"; + public static final String MESSAGE_ROOT = "root"; + private static final AvroSchemaConverter SCHEMA_CONVERTER = new AvroSchemaConverter(); + + public static TypeInformation fromParquetType(MessageType type) { + return convertFields(type.getFields()); + } + + /** +* Converts Flink Internal Type to Parquet schema. 
+* +* @param typeInformation flink type information +* @param isStandard is standard LIST and MAP schema or back-compatible schema +* @return Parquet schema +*/ + public static MessageType toParquetType(TypeInformation typeInformation, boolean isStandard) { + return (MessageType) convertField(null, typeInformation, Type.Repetition.OPTIONAL, isStandard); + } + + private static TypeInformation convertFields(List parquetFields) { + List> types = new ArrayList<>(); + List names = new ArrayList<>(); + for (Type field : parquetFields) { + TypeInformation subType = convertField(field); + if (subType != null) { + types.add(subType); + names.add(field.getName()); + } + } + + return new RowTypeInfo(types.toArray(new TypeInformation[types.size()]), + names.toArray(new String[names.size()])); + } + + private static TypeInformation convertField(final Type fieldType) { + TypeInformation typeInfo = null; + if (fieldType.isPrimitive()) { + OriginalType originalType = fieldType.getOriginalType(); + PrimitiveType primitiveType = fieldType.asPrimitiveType(); + switch (primitiveType.getPrimitiveTypeName()) { + case BINARY: + if (originalType != null) { + switch (originalType) { + case DECIMAL: + typeInfo = BasicTypeInfo.BIG_DEC_TYPE_INFO; +
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16707840#comment-16707840 ] ASF GitHub Bot commented on FLINK-7243: --- fhueske commented on a change in pull request #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483#discussion_r238350180 ## File path: flink-formats/flink-parquet/src/main/java/org/apache/flink/formats/parquet/utils/ParquetSchemaConverter.java ## @@ -0,0 +1,359 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.formats.parquet.utils; + +import org.apache.flink.api.common.typeinfo.BasicArrayTypeInfo; +import org.apache.flink.api.common.typeinfo.BasicTypeInfo; +import org.apache.flink.api.common.typeinfo.SqlTimeTypeInfo; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.java.typeutils.MapTypeInfo; +import org.apache.flink.api.java.typeutils.ObjectArrayTypeInfo; +import org.apache.flink.api.java.typeutils.RowTypeInfo; + +import org.apache.parquet.avro.AvroSchemaConverter; +import org.apache.parquet.schema.GroupType; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.OriginalType; +import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.Type; +import org.apache.parquet.schema.Types; + +import java.lang.reflect.Array; +import java.util.ArrayList; +import java.util.List; + +/** + * Schema converter converts Parquet schema to and from Flink internal types. + */ +public class ParquetSchemaConverter { + public static final String MAP_KEY = "key"; + public static final String MAP_VALUE = "value"; + public static final String LIST_ARRAY_TYPE = "array"; + public static final String LIST_ELEMENT = "element"; + public static final String LIST_GROUP_NAME = "list"; + public static final String MESSAGE_ROOT = "root"; + private static final AvroSchemaConverter SCHEMA_CONVERTER = new AvroSchemaConverter(); + + public static TypeInformation fromParquetType(MessageType type) { + return convertFields(type.getFields()); + } + + /** +* Converts Flink Internal Type to Parquet schema. 
+* +* @param typeInformation flink type information +* @param isStandard is standard LIST and MAP schema or back-compatible schema +* @return Parquet schema +*/ + public static MessageType toParquetType(TypeInformation typeInformation, boolean isStandard) { + return (MessageType) convertField(null, typeInformation, Type.Repetition.OPTIONAL, isStandard); + } + + private static TypeInformation convertFields(List parquetFields) { + List> types = new ArrayList<>(); + List names = new ArrayList<>(); + for (Type field : parquetFields) { + TypeInformation subType = convertField(field); + if (subType != null) { Review comment: what are valid cases of `subType == null`? This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > Add ParquetInputFormat > -- > > Key: FLINK-7243 > URL: https://issues.apache.org/jira/browse/FLINK-7243 > Project: Flink > Issue Type: Sub-task > Components: Table API SQL >Reporter: godfrey he >Assignee: Zhenqiu Huang >Priority: Major > Labels: pull-request-available > > Add a {{ParquetInputFormat}} to read data from a Apache Parquet file. -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16707835#comment-16707835 ] ASF GitHub Bot commented on FLINK-7243: --- fhueske commented on a change in pull request #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483#discussion_r238363492 ## File path: flink-formats/flink-parquet/src/main/java/org/apache/flink/formats/parquet/utils/ParquetSchemaConverter.java ## @@ -0,0 +1,359 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.formats.parquet.utils; + +import org.apache.flink.api.common.typeinfo.BasicArrayTypeInfo; +import org.apache.flink.api.common.typeinfo.BasicTypeInfo; +import org.apache.flink.api.common.typeinfo.SqlTimeTypeInfo; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.java.typeutils.MapTypeInfo; +import org.apache.flink.api.java.typeutils.ObjectArrayTypeInfo; +import org.apache.flink.api.java.typeutils.RowTypeInfo; + +import org.apache.parquet.avro.AvroSchemaConverter; +import org.apache.parquet.schema.GroupType; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.OriginalType; +import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.Type; +import org.apache.parquet.schema.Types; + +import java.lang.reflect.Array; +import java.util.ArrayList; +import java.util.List; + +/** + * Schema converter converts Parquet schema to and from Flink internal types. + */ +public class ParquetSchemaConverter { + public static final String MAP_KEY = "key"; + public static final String MAP_VALUE = "value"; + public static final String LIST_ARRAY_TYPE = "array"; + public static final String LIST_ELEMENT = "element"; + public static final String LIST_GROUP_NAME = "list"; + public static final String MESSAGE_ROOT = "root"; + private static final AvroSchemaConverter SCHEMA_CONVERTER = new AvroSchemaConverter(); + + public static TypeInformation fromParquetType(MessageType type) { + return convertFields(type.getFields()); + } + + /** +* Converts Flink Internal Type to Parquet schema. 
+* +* @param typeInformation flink type information +* @param isStandard is standard LIST and MAP schema or back-compatible schema +* @return Parquet schema +*/ + public static MessageType toParquetType(TypeInformation typeInformation, boolean isStandard) { + return (MessageType) convertField(null, typeInformation, Type.Repetition.OPTIONAL, isStandard); + } + + private static TypeInformation convertFields(List parquetFields) { + List> types = new ArrayList<>(); + List names = new ArrayList<>(); + for (Type field : parquetFields) { + TypeInformation subType = convertField(field); + if (subType != null) { + types.add(subType); + names.add(field.getName()); + } + } + + return new RowTypeInfo(types.toArray(new TypeInformation[types.size()]), + names.toArray(new String[names.size()])); + } + + private static TypeInformation convertField(final Type fieldType) { Review comment: Can you list which types are not supported yet? This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > Add ParquetInputFormat > -- > > Key: FLINK-7243 > URL: https://issues.apache.org/jira/browse/FLINK-7243 > Project: Flink > Issue Type: Sub-task > Components: Table API SQL >Reporter: godfrey he >Assignee: Zhenqiu Huang >
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16707829#comment-16707829 ] ASF GitHub Bot commented on FLINK-7243: --- fhueske commented on a change in pull request #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483#discussion_r238341448 ## File path: flink-formats/flink-parquet/src/main/java/org/apache/flink/formats/parquet/utils/ParquetSchemaConverter.java ## @@ -0,0 +1,359 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.formats.parquet.utils; + +import org.apache.flink.api.common.typeinfo.BasicArrayTypeInfo; +import org.apache.flink.api.common.typeinfo.BasicTypeInfo; +import org.apache.flink.api.common.typeinfo.SqlTimeTypeInfo; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.java.typeutils.MapTypeInfo; +import org.apache.flink.api.java.typeutils.ObjectArrayTypeInfo; +import org.apache.flink.api.java.typeutils.RowTypeInfo; + +import org.apache.parquet.avro.AvroSchemaConverter; +import org.apache.parquet.schema.GroupType; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.OriginalType; +import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.Type; +import org.apache.parquet.schema.Types; + +import java.lang.reflect.Array; +import java.util.ArrayList; +import java.util.List; + +/** + * Schema converter converts Parquet schema to and from Flink internal types. + */ +public class ParquetSchemaConverter { + public static final String MAP_KEY = "key"; Review comment: fields can be `private` This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > Add ParquetInputFormat > -- > > Key: FLINK-7243 > URL: https://issues.apache.org/jira/browse/FLINK-7243 > Project: Flink > Issue Type: Sub-task > Components: Table API SQL >Reporter: godfrey he >Assignee: Zhenqiu Huang >Priority: Major > Labels: pull-request-available > > Add a {{ParquetInputFormat}} to read data from a Apache Parquet file. -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16707847#comment-16707847 ] ASF GitHub Bot commented on FLINK-7243: --- fhueske commented on a change in pull request #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483#discussion_r238439462 ## File path: flink-formats/flink-parquet/src/main/java/org/apache/flink/formats/parquet/utils/ParquetSchemaConverter.java ## @@ -0,0 +1,359 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.formats.parquet.utils; + +import org.apache.flink.api.common.typeinfo.BasicArrayTypeInfo; +import org.apache.flink.api.common.typeinfo.BasicTypeInfo; +import org.apache.flink.api.common.typeinfo.SqlTimeTypeInfo; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.java.typeutils.MapTypeInfo; +import org.apache.flink.api.java.typeutils.ObjectArrayTypeInfo; +import org.apache.flink.api.java.typeutils.RowTypeInfo; + +import org.apache.parquet.avro.AvroSchemaConverter; +import org.apache.parquet.schema.GroupType; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.OriginalType; +import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.Type; +import org.apache.parquet.schema.Types; + +import java.lang.reflect.Array; +import java.util.ArrayList; +import java.util.List; + +/** + * Schema converter converts Parquet schema to and from Flink internal types. + */ +public class ParquetSchemaConverter { + public static final String MAP_KEY = "key"; + public static final String MAP_VALUE = "value"; + public static final String LIST_ARRAY_TYPE = "array"; + public static final String LIST_ELEMENT = "element"; + public static final String LIST_GROUP_NAME = "list"; + public static final String MESSAGE_ROOT = "root"; + private static final AvroSchemaConverter SCHEMA_CONVERTER = new AvroSchemaConverter(); + + public static TypeInformation fromParquetType(MessageType type) { + return convertFields(type.getFields()); + } + + /** +* Converts Flink Internal Type to Parquet schema. 
+* +* @param typeInformation flink type information +* @param isStandard is standard LIST and MAP schema or back-compatible schema +* @return Parquet schema +*/ + public static MessageType toParquetType(TypeInformation typeInformation, boolean isStandard) { + return (MessageType) convertField(null, typeInformation, Type.Repetition.OPTIONAL, isStandard); + } + + private static TypeInformation convertFields(List parquetFields) { + List> types = new ArrayList<>(); + List names = new ArrayList<>(); + for (Type field : parquetFields) { + TypeInformation subType = convertField(field); + if (subType != null) { + types.add(subType); + names.add(field.getName()); + } + } + + return new RowTypeInfo(types.toArray(new TypeInformation[types.size()]), + names.toArray(new String[names.size()])); + } + + private static TypeInformation convertField(final Type fieldType) { + TypeInformation typeInfo = null; + if (fieldType.isPrimitive()) { + OriginalType originalType = fieldType.getOriginalType(); + PrimitiveType primitiveType = fieldType.asPrimitiveType(); + switch (primitiveType.getPrimitiveTypeName()) { + case BINARY: + if (originalType != null) { + switch (originalType) { + case DECIMAL: + typeInfo = BasicTypeInfo.BIG_DEC_TYPE_INFO; +
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16707830#comment-16707830 ] ASF GitHub Bot commented on FLINK-7243: --- fhueske commented on a change in pull request #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483#discussion_r238348640 ## File path: flink-formats/flink-parquet/src/main/java/org/apache/flink/formats/parquet/utils/ParquetSchemaConverter.java ## @@ -0,0 +1,359 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.formats.parquet.utils; + +import org.apache.flink.api.common.typeinfo.BasicArrayTypeInfo; +import org.apache.flink.api.common.typeinfo.BasicTypeInfo; +import org.apache.flink.api.common.typeinfo.SqlTimeTypeInfo; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.java.typeutils.MapTypeInfo; +import org.apache.flink.api.java.typeutils.ObjectArrayTypeInfo; +import org.apache.flink.api.java.typeutils.RowTypeInfo; + +import org.apache.parquet.avro.AvroSchemaConverter; +import org.apache.parquet.schema.GroupType; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.OriginalType; +import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.Type; +import org.apache.parquet.schema.Types; + +import java.lang.reflect.Array; +import java.util.ArrayList; +import java.util.List; + +/** + * Schema converter converts Parquet schema to and from Flink internal types. + */ +public class ParquetSchemaConverter { + public static final String MAP_KEY = "key"; + public static final String MAP_VALUE = "value"; + public static final String LIST_ARRAY_TYPE = "array"; + public static final String LIST_ELEMENT = "element"; + public static final String LIST_GROUP_NAME = "list"; + public static final String MESSAGE_ROOT = "root"; + private static final AvroSchemaConverter SCHEMA_CONVERTER = new AvroSchemaConverter(); + + public static TypeInformation fromParquetType(MessageType type) { Review comment: Add JavaDocs for public method This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. 
For queries about this service, please contact Infrastructure at: us...@infra.apache.org > Add ParquetInputFormat > -- > > Key: FLINK-7243 > URL: https://issues.apache.org/jira/browse/FLINK-7243 > Project: Flink > Issue Type: Sub-task > Components: Table API SQL >Reporter: godfrey he >Assignee: Zhenqiu Huang >Priority: Major > Labels: pull-request-available > > Add a {{ParquetInputFormat}} to read data from an Apache Parquet file. -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16707834#comment-16707834 ] ASF GitHub Bot commented on FLINK-7243: --- fhueske commented on a change in pull request #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483#discussion_r238352516 ## File path: flink-formats/flink-parquet/src/main/java/org/apache/flink/formats/parquet/utils/ParquetSchemaConverter.java ## @@ -0,0 +1,359 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.formats.parquet.utils; + +import org.apache.flink.api.common.typeinfo.BasicArrayTypeInfo; +import org.apache.flink.api.common.typeinfo.BasicTypeInfo; +import org.apache.flink.api.common.typeinfo.SqlTimeTypeInfo; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.java.typeutils.MapTypeInfo; +import org.apache.flink.api.java.typeutils.ObjectArrayTypeInfo; +import org.apache.flink.api.java.typeutils.RowTypeInfo; + +import org.apache.parquet.avro.AvroSchemaConverter; +import org.apache.parquet.schema.GroupType; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.OriginalType; +import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.Type; +import org.apache.parquet.schema.Types; + +import java.lang.reflect.Array; +import java.util.ArrayList; +import java.util.List; + +/** + * Schema converter converts Parquet schema to and from Flink internal types. + */ +public class ParquetSchemaConverter { + public static final String MAP_KEY = "key"; + public static final String MAP_VALUE = "value"; + public static final String LIST_ARRAY_TYPE = "array"; + public static final String LIST_ELEMENT = "element"; + public static final String LIST_GROUP_NAME = "list"; + public static final String MESSAGE_ROOT = "root"; + private static final AvroSchemaConverter SCHEMA_CONVERTER = new AvroSchemaConverter(); + + public static TypeInformation fromParquetType(MessageType type) { + return convertFields(type.getFields()); + } + + /** +* Converts Flink Internal Type to Parquet schema. 
+* +* @param typeInformation flink type information +* @param isStandard is standard LIST and MAP schema or back-compatible schema +* @return Parquet schema +*/ + public static MessageType toParquetType(TypeInformation typeInformation, boolean isStandard) { + return (MessageType) convertField(null, typeInformation, Type.Repetition.OPTIONAL, isStandard); + } + + private static TypeInformation convertFields(List parquetFields) { + List> types = new ArrayList<>(); + List names = new ArrayList<>(); + for (Type field : parquetFields) { + TypeInformation subType = convertField(field); + if (subType != null) { + types.add(subType); + names.add(field.getName()); + } + } + + return new RowTypeInfo(types.toArray(new TypeInformation[types.size()]), Review comment: use empty array as parameter for `toArray()`, i.e., `types.toArray(new TypeInformation[0])`. The parameter is only used for type inference and is not used to store the list elements. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > Add ParquetInputFormat > -- > > Key: FLINK-7243 > URL: https://issues.apache.org/jira/browse/FLINK-7243 > Project: Flink > Issue Type: Sub-task > Components: Table API SQL >Reporter: godfrey he >Assignee: Zhenqiu Huang >Priority:
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16707850#comment-16707850 ] ASF GitHub Bot commented on FLINK-7243: --- fhueske commented on a change in pull request #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483#discussion_r238446494 ## File path: flink-formats/flink-parquet/src/main/java/org/apache/flink/formats/parquet/utils/ParquetSchemaConverter.java ## @@ -0,0 +1,359 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.formats.parquet.utils; + +import org.apache.flink.api.common.typeinfo.BasicArrayTypeInfo; +import org.apache.flink.api.common.typeinfo.BasicTypeInfo; +import org.apache.flink.api.common.typeinfo.SqlTimeTypeInfo; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.java.typeutils.MapTypeInfo; +import org.apache.flink.api.java.typeutils.ObjectArrayTypeInfo; +import org.apache.flink.api.java.typeutils.RowTypeInfo; + +import org.apache.parquet.avro.AvroSchemaConverter; +import org.apache.parquet.schema.GroupType; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.OriginalType; +import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.Type; +import org.apache.parquet.schema.Types; + +import java.lang.reflect.Array; +import java.util.ArrayList; +import java.util.List; + +/** + * Schema converter converts Parquet schema to and from Flink internal types. + */ +public class ParquetSchemaConverter { + public static final String MAP_KEY = "key"; + public static final String MAP_VALUE = "value"; + public static final String LIST_ARRAY_TYPE = "array"; + public static final String LIST_ELEMENT = "element"; + public static final String LIST_GROUP_NAME = "list"; + public static final String MESSAGE_ROOT = "root"; + private static final AvroSchemaConverter SCHEMA_CONVERTER = new AvroSchemaConverter(); + + public static TypeInformation fromParquetType(MessageType type) { + return convertFields(type.getFields()); + } + + /** +* Converts Flink Internal Type to Parquet schema. 
+* +* @param typeInformation flink type information +* @param isStandard is standard LIST and MAP schema or back-compatible schema +* @return Parquet schema +*/ + public static MessageType toParquetType(TypeInformation typeInformation, boolean isStandard) { + return (MessageType) convertField(null, typeInformation, Type.Repetition.OPTIONAL, isStandard); + } + + private static TypeInformation convertFields(List parquetFields) { + List> types = new ArrayList<>(); + List names = new ArrayList<>(); + for (Type field : parquetFields) { + TypeInformation subType = convertField(field); + if (subType != null) { + types.add(subType); + names.add(field.getName()); + } + } + + return new RowTypeInfo(types.toArray(new TypeInformation[types.size()]), + names.toArray(new String[names.size()])); + } + + private static TypeInformation convertField(final Type fieldType) { + TypeInformation typeInfo = null; + if (fieldType.isPrimitive()) { + OriginalType originalType = fieldType.getOriginalType(); + PrimitiveType primitiveType = fieldType.asPrimitiveType(); + switch (primitiveType.getPrimitiveTypeName()) { + case BINARY: + if (originalType != null) { + switch (originalType) { + case DECIMAL: + typeInfo = BasicTypeInfo.BIG_DEC_TYPE_INFO; +
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16707848#comment-16707848 ] ASF GitHub Bot commented on FLINK-7243: --- fhueske commented on a change in pull request #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483#discussion_r238365934 ## File path: flink-formats/flink-parquet/src/main/java/org/apache/flink/formats/parquet/utils/ParquetSchemaConverter.java ## @@ -0,0 +1,359 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.formats.parquet.utils; + +import org.apache.flink.api.common.typeinfo.BasicArrayTypeInfo; +import org.apache.flink.api.common.typeinfo.BasicTypeInfo; +import org.apache.flink.api.common.typeinfo.SqlTimeTypeInfo; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.java.typeutils.MapTypeInfo; +import org.apache.flink.api.java.typeutils.ObjectArrayTypeInfo; +import org.apache.flink.api.java.typeutils.RowTypeInfo; + +import org.apache.parquet.avro.AvroSchemaConverter; +import org.apache.parquet.schema.GroupType; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.OriginalType; +import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.Type; +import org.apache.parquet.schema.Types; + +import java.lang.reflect.Array; +import java.util.ArrayList; +import java.util.List; + +/** + * Schema converter converts Parquet schema to and from Flink internal types. + */ +public class ParquetSchemaConverter { + public static final String MAP_KEY = "key"; + public static final String MAP_VALUE = "value"; + public static final String LIST_ARRAY_TYPE = "array"; + public static final String LIST_ELEMENT = "element"; + public static final String LIST_GROUP_NAME = "list"; + public static final String MESSAGE_ROOT = "root"; + private static final AvroSchemaConverter SCHEMA_CONVERTER = new AvroSchemaConverter(); + + public static TypeInformation fromParquetType(MessageType type) { + return convertFields(type.getFields()); + } + + /** +* Converts Flink Internal Type to Parquet schema. 
+* +* @param typeInformation flink type information +* @param isStandard is standard LIST and MAP schema or back-compatible schema +* @return Parquet schema +*/ + public static MessageType toParquetType(TypeInformation typeInformation, boolean isStandard) { + return (MessageType) convertField(null, typeInformation, Type.Repetition.OPTIONAL, isStandard); + } + + private static TypeInformation convertFields(List parquetFields) { + List> types = new ArrayList<>(); + List names = new ArrayList<>(); + for (Type field : parquetFields) { + TypeInformation subType = convertField(field); + if (subType != null) { + types.add(subType); + names.add(field.getName()); + } + } + + return new RowTypeInfo(types.toArray(new TypeInformation[types.size()]), + names.toArray(new String[names.size()])); + } + + private static TypeInformation convertField(final Type fieldType) { + TypeInformation typeInfo = null; + if (fieldType.isPrimitive()) { + OriginalType originalType = fieldType.getOriginalType(); + PrimitiveType primitiveType = fieldType.asPrimitiveType(); + switch (primitiveType.getPrimitiveTypeName()) { + case BINARY: + if (originalType != null) { + switch (originalType) { + case DECIMAL: + typeInfo = BasicTypeInfo.BIG_DEC_TYPE_INFO; +
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16707844#comment-16707844 ] ASF GitHub Bot commented on FLINK-7243: --- fhueske commented on a change in pull request #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483#discussion_r238440527 ## File path: flink-formats/flink-parquet/src/main/java/org/apache/flink/formats/parquet/utils/ParquetSchemaConverter.java ## @@ -0,0 +1,359 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.formats.parquet.utils; + +import org.apache.flink.api.common.typeinfo.BasicArrayTypeInfo; +import org.apache.flink.api.common.typeinfo.BasicTypeInfo; +import org.apache.flink.api.common.typeinfo.SqlTimeTypeInfo; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.java.typeutils.MapTypeInfo; +import org.apache.flink.api.java.typeutils.ObjectArrayTypeInfo; +import org.apache.flink.api.java.typeutils.RowTypeInfo; + +import org.apache.parquet.avro.AvroSchemaConverter; +import org.apache.parquet.schema.GroupType; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.OriginalType; +import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.Type; +import org.apache.parquet.schema.Types; + +import java.lang.reflect.Array; +import java.util.ArrayList; +import java.util.List; + +/** + * Schema converter converts Parquet schema to and from Flink internal types. + */ +public class ParquetSchemaConverter { + public static final String MAP_KEY = "key"; + public static final String MAP_VALUE = "value"; + public static final String LIST_ARRAY_TYPE = "array"; + public static final String LIST_ELEMENT = "element"; + public static final String LIST_GROUP_NAME = "list"; + public static final String MESSAGE_ROOT = "root"; + private static final AvroSchemaConverter SCHEMA_CONVERTER = new AvroSchemaConverter(); + + public static TypeInformation fromParquetType(MessageType type) { + return convertFields(type.getFields()); + } + + /** +* Converts Flink Internal Type to Parquet schema. 
+* +* @param typeInformation flink type information +* @param isStandard is standard LIST and MAP schema or back-compatible schema +* @return Parquet schema +*/ + public static MessageType toParquetType(TypeInformation typeInformation, boolean isStandard) { + return (MessageType) convertField(null, typeInformation, Type.Repetition.OPTIONAL, isStandard); + } + + private static TypeInformation convertFields(List parquetFields) { + List> types = new ArrayList<>(); + List names = new ArrayList<>(); + for (Type field : parquetFields) { + TypeInformation subType = convertField(field); + if (subType != null) { + types.add(subType); + names.add(field.getName()); + } + } + + return new RowTypeInfo(types.toArray(new TypeInformation[types.size()]), + names.toArray(new String[names.size()])); + } + + private static TypeInformation convertField(final Type fieldType) { + TypeInformation typeInfo = null; + if (fieldType.isPrimitive()) { + OriginalType originalType = fieldType.getOriginalType(); + PrimitiveType primitiveType = fieldType.asPrimitiveType(); + switch (primitiveType.getPrimitiveTypeName()) { + case BINARY: + if (originalType != null) { + switch (originalType) { + case DECIMAL: + typeInfo = BasicTypeInfo.BIG_DEC_TYPE_INFO; +
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16707843#comment-16707843 ] ASF GitHub Bot commented on FLINK-7243: --- fhueske commented on a change in pull request #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483#discussion_r238367855 ## File path: flink-formats/flink-parquet/src/main/java/org/apache/flink/formats/parquet/utils/ParquetSchemaConverter.java ## @@ -0,0 +1,359 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.formats.parquet.utils; + +import org.apache.flink.api.common.typeinfo.BasicArrayTypeInfo; +import org.apache.flink.api.common.typeinfo.BasicTypeInfo; +import org.apache.flink.api.common.typeinfo.SqlTimeTypeInfo; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.java.typeutils.MapTypeInfo; +import org.apache.flink.api.java.typeutils.ObjectArrayTypeInfo; +import org.apache.flink.api.java.typeutils.RowTypeInfo; + +import org.apache.parquet.avro.AvroSchemaConverter; +import org.apache.parquet.schema.GroupType; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.OriginalType; +import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.Type; +import org.apache.parquet.schema.Types; + +import java.lang.reflect.Array; +import java.util.ArrayList; +import java.util.List; + +/** + * Schema converter converts Parquet schema to and from Flink internal types. + */ +public class ParquetSchemaConverter { + public static final String MAP_KEY = "key"; + public static final String MAP_VALUE = "value"; + public static final String LIST_ARRAY_TYPE = "array"; + public static final String LIST_ELEMENT = "element"; + public static final String LIST_GROUP_NAME = "list"; + public static final String MESSAGE_ROOT = "root"; + private static final AvroSchemaConverter SCHEMA_CONVERTER = new AvroSchemaConverter(); + + public static TypeInformation fromParquetType(MessageType type) { + return convertFields(type.getFields()); + } + + /** +* Converts Flink Internal Type to Parquet schema. 
+* +* @param typeInformation flink type information +* @param isStandard is standard LIST and MAP schema or back-compatible schema +* @return Parquet schema +*/ + public static MessageType toParquetType(TypeInformation typeInformation, boolean isStandard) { + return (MessageType) convertField(null, typeInformation, Type.Repetition.OPTIONAL, isStandard); + } + + private static TypeInformation convertFields(List parquetFields) { + List> types = new ArrayList<>(); + List names = new ArrayList<>(); + for (Type field : parquetFields) { + TypeInformation subType = convertField(field); + if (subType != null) { + types.add(subType); + names.add(field.getName()); + } + } + + return new RowTypeInfo(types.toArray(new TypeInformation[types.size()]), + names.toArray(new String[names.size()])); + } + + private static TypeInformation convertField(final Type fieldType) { + TypeInformation typeInfo = null; + if (fieldType.isPrimitive()) { + OriginalType originalType = fieldType.getOriginalType(); + PrimitiveType primitiveType = fieldType.asPrimitiveType(); + switch (primitiveType.getPrimitiveTypeName()) { + case BINARY: + if (originalType != null) { + switch (originalType) { + case DECIMAL: + typeInfo = BasicTypeInfo.BIG_DEC_TYPE_INFO; +
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16707838#comment-16707838 ] ASF GitHub Bot commented on FLINK-7243: --- fhueske commented on a change in pull request #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483#discussion_r238438418 ## File path: flink-formats/flink-parquet/src/main/java/org/apache/flink/formats/parquet/utils/ParquetSchemaConverter.java ## @@ -0,0 +1,359 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.formats.parquet.utils; + +import org.apache.flink.api.common.typeinfo.BasicArrayTypeInfo; +import org.apache.flink.api.common.typeinfo.BasicTypeInfo; +import org.apache.flink.api.common.typeinfo.SqlTimeTypeInfo; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.java.typeutils.MapTypeInfo; +import org.apache.flink.api.java.typeutils.ObjectArrayTypeInfo; +import org.apache.flink.api.java.typeutils.RowTypeInfo; + +import org.apache.parquet.avro.AvroSchemaConverter; +import org.apache.parquet.schema.GroupType; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.OriginalType; +import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.Type; +import org.apache.parquet.schema.Types; + +import java.lang.reflect.Array; +import java.util.ArrayList; +import java.util.List; + +/** + * Schema converter converts Parquet schema to and from Flink internal types. + */ +public class ParquetSchemaConverter { + public static final String MAP_KEY = "key"; + public static final String MAP_VALUE = "value"; + public static final String LIST_ARRAY_TYPE = "array"; + public static final String LIST_ELEMENT = "element"; + public static final String LIST_GROUP_NAME = "list"; + public static final String MESSAGE_ROOT = "root"; + private static final AvroSchemaConverter SCHEMA_CONVERTER = new AvroSchemaConverter(); + + public static TypeInformation fromParquetType(MessageType type) { + return convertFields(type.getFields()); + } + + /** +* Converts Flink Internal Type to Parquet schema. 
+* +* @param typeInformation flink type information +* @param isStandard is standard LIST and MAP schema or back-compatible schema +* @return Parquet schema +*/ + public static MessageType toParquetType(TypeInformation typeInformation, boolean isStandard) { + return (MessageType) convertField(null, typeInformation, Type.Repetition.OPTIONAL, isStandard); + } + + private static TypeInformation convertFields(List parquetFields) { + List> types = new ArrayList<>(); + List names = new ArrayList<>(); + for (Type field : parquetFields) { + TypeInformation subType = convertField(field); + if (subType != null) { + types.add(subType); + names.add(field.getName()); + } + } + + return new RowTypeInfo(types.toArray(new TypeInformation[types.size()]), + names.toArray(new String[names.size()])); + } + + private static TypeInformation convertField(final Type fieldType) { + TypeInformation typeInfo = null; + if (fieldType.isPrimitive()) { + OriginalType originalType = fieldType.getOriginalType(); + PrimitiveType primitiveType = fieldType.asPrimitiveType(); + switch (primitiveType.getPrimitiveTypeName()) { + case BINARY: + if (originalType != null) { + switch (originalType) { + case DECIMAL: + typeInfo = BasicTypeInfo.BIG_DEC_TYPE_INFO; +
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16707836#comment-16707836 ] ASF GitHub Bot commented on FLINK-7243: --- fhueske commented on a change in pull request #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483#discussion_r238359839 ## File path: flink-formats/flink-parquet/src/main/java/org/apache/flink/formats/parquet/utils/ParquetSchemaConverter.java ## @@ -0,0 +1,359 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.formats.parquet.utils; + +import org.apache.flink.api.common.typeinfo.BasicArrayTypeInfo; +import org.apache.flink.api.common.typeinfo.BasicTypeInfo; +import org.apache.flink.api.common.typeinfo.SqlTimeTypeInfo; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.java.typeutils.MapTypeInfo; +import org.apache.flink.api.java.typeutils.ObjectArrayTypeInfo; +import org.apache.flink.api.java.typeutils.RowTypeInfo; + +import org.apache.parquet.avro.AvroSchemaConverter; +import org.apache.parquet.schema.GroupType; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.OriginalType; +import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.Type; +import org.apache.parquet.schema.Types; + +import java.lang.reflect.Array; +import java.util.ArrayList; +import java.util.List; + +/** + * Schema converter converts Parquet schema to and from Flink internal types. + */ +public class ParquetSchemaConverter { + public static final String MAP_KEY = "key"; + public static final String MAP_VALUE = "value"; + public static final String LIST_ARRAY_TYPE = "array"; + public static final String LIST_ELEMENT = "element"; + public static final String LIST_GROUP_NAME = "list"; + public static final String MESSAGE_ROOT = "root"; + private static final AvroSchemaConverter SCHEMA_CONVERTER = new AvroSchemaConverter(); + + public static TypeInformation fromParquetType(MessageType type) { + return convertFields(type.getFields()); + } + + /** +* Converts Flink Internal Type to Parquet schema. 
+* +* @param typeInformation flink type information +* @param isStandard is standard LIST and MAP schema or back-compatible schema +* @return Parquet schema +*/ + public static MessageType toParquetType(TypeInformation typeInformation, boolean isStandard) { + return (MessageType) convertField(null, typeInformation, Type.Repetition.OPTIONAL, isStandard); + } + + private static TypeInformation convertFields(List parquetFields) { + List> types = new ArrayList<>(); + List names = new ArrayList<>(); + for (Type field : parquetFields) { + TypeInformation subType = convertField(field); + if (subType != null) { + types.add(subType); + names.add(field.getName()); + } + } + + return new RowTypeInfo(types.toArray(new TypeInformation[types.size()]), + names.toArray(new String[names.size()])); + } + + private static TypeInformation convertField(final Type fieldType) { + TypeInformation typeInfo = null; + if (fieldType.isPrimitive()) { + OriginalType originalType = fieldType.getOriginalType(); + PrimitiveType primitiveType = fieldType.asPrimitiveType(); + switch (primitiveType.getPrimitiveTypeName()) { + case BINARY: + if (originalType != null) { + switch (originalType) { + case DECIMAL: + typeInfo = BasicTypeInfo.BIG_DEC_TYPE_INFO; +
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16707833#comment-16707833 ] ASF GitHub Bot commented on FLINK-7243: --- fhueske commented on a change in pull request #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483#discussion_r238438892 ## File path: flink-formats/flink-parquet/src/main/java/org/apache/flink/formats/parquet/utils/ParquetSchemaConverter.java ## @@ -0,0 +1,359 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.formats.parquet.utils; + +import org.apache.flink.api.common.typeinfo.BasicArrayTypeInfo; +import org.apache.flink.api.common.typeinfo.BasicTypeInfo; +import org.apache.flink.api.common.typeinfo.SqlTimeTypeInfo; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.java.typeutils.MapTypeInfo; +import org.apache.flink.api.java.typeutils.ObjectArrayTypeInfo; +import org.apache.flink.api.java.typeutils.RowTypeInfo; + +import org.apache.parquet.avro.AvroSchemaConverter; +import org.apache.parquet.schema.GroupType; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.OriginalType; +import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.Type; +import org.apache.parquet.schema.Types; + +import java.lang.reflect.Array; +import java.util.ArrayList; +import java.util.List; + +/** + * Schema converter converts Parquet schema to and from Flink internal types. + */ +public class ParquetSchemaConverter { + public static final String MAP_KEY = "key"; + public static final String MAP_VALUE = "value"; + public static final String LIST_ARRAY_TYPE = "array"; + public static final String LIST_ELEMENT = "element"; + public static final String LIST_GROUP_NAME = "list"; + public static final String MESSAGE_ROOT = "root"; + private static final AvroSchemaConverter SCHEMA_CONVERTER = new AvroSchemaConverter(); + + public static TypeInformation fromParquetType(MessageType type) { + return convertFields(type.getFields()); + } + + /** +* Converts Flink Internal Type to Parquet schema. 
+* +* @param typeInformation flink type information +* @param isStandard is standard LIST and MAP schema or back-compatible schema +* @return Parquet schema +*/ + public static MessageType toParquetType(TypeInformation typeInformation, boolean isStandard) { + return (MessageType) convertField(null, typeInformation, Type.Repetition.OPTIONAL, isStandard); + } + + private static TypeInformation convertFields(List parquetFields) { + List> types = new ArrayList<>(); + List names = new ArrayList<>(); + for (Type field : parquetFields) { + TypeInformation subType = convertField(field); + if (subType != null) { + types.add(subType); + names.add(field.getName()); + } + } + + return new RowTypeInfo(types.toArray(new TypeInformation[types.size()]), + names.toArray(new String[names.size()])); + } + + private static TypeInformation convertField(final Type fieldType) { + TypeInformation typeInfo = null; + if (fieldType.isPrimitive()) { + OriginalType originalType = fieldType.getOriginalType(); + PrimitiveType primitiveType = fieldType.asPrimitiveType(); + switch (primitiveType.getPrimitiveTypeName()) { + case BINARY: + if (originalType != null) { + switch (originalType) { + case DECIMAL: + typeInfo = BasicTypeInfo.BIG_DEC_TYPE_INFO; +
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16707839#comment-16707839 ] ASF GitHub Bot commented on FLINK-7243: --- fhueske commented on a change in pull request #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483#discussion_r238359907 ## File path: flink-formats/flink-parquet/src/main/java/org/apache/flink/formats/parquet/utils/ParquetSchemaConverter.java ## @@ -0,0 +1,359 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.formats.parquet.utils; + +import org.apache.flink.api.common.typeinfo.BasicArrayTypeInfo; +import org.apache.flink.api.common.typeinfo.BasicTypeInfo; +import org.apache.flink.api.common.typeinfo.SqlTimeTypeInfo; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.java.typeutils.MapTypeInfo; +import org.apache.flink.api.java.typeutils.ObjectArrayTypeInfo; +import org.apache.flink.api.java.typeutils.RowTypeInfo; + +import org.apache.parquet.avro.AvroSchemaConverter; +import org.apache.parquet.schema.GroupType; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.OriginalType; +import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.Type; +import org.apache.parquet.schema.Types; + +import java.lang.reflect.Array; +import java.util.ArrayList; +import java.util.List; + +/** + * Schema converter converts Parquet schema to and from Flink internal types. + */ +public class ParquetSchemaConverter { + public static final String MAP_KEY = "key"; + public static final String MAP_VALUE = "value"; + public static final String LIST_ARRAY_TYPE = "array"; + public static final String LIST_ELEMENT = "element"; + public static final String LIST_GROUP_NAME = "list"; + public static final String MESSAGE_ROOT = "root"; + private static final AvroSchemaConverter SCHEMA_CONVERTER = new AvroSchemaConverter(); + + public static TypeInformation fromParquetType(MessageType type) { + return convertFields(type.getFields()); + } + + /** +* Converts Flink Internal Type to Parquet schema. 
+* +* @param typeInformation flink type information +* @param isStandard is standard LIST and MAP schema or back-compatible schema +* @return Parquet schema +*/ + public static MessageType toParquetType(TypeInformation typeInformation, boolean isStandard) { + return (MessageType) convertField(null, typeInformation, Type.Repetition.OPTIONAL, isStandard); + } + + private static TypeInformation convertFields(List parquetFields) { + List> types = new ArrayList<>(); + List names = new ArrayList<>(); + for (Type field : parquetFields) { + TypeInformation subType = convertField(field); + if (subType != null) { + types.add(subType); + names.add(field.getName()); + } + } + + return new RowTypeInfo(types.toArray(new TypeInformation[types.size()]), + names.toArray(new String[names.size()])); + } + + private static TypeInformation convertField(final Type fieldType) { + TypeInformation typeInfo = null; + if (fieldType.isPrimitive()) { + OriginalType originalType = fieldType.getOriginalType(); + PrimitiveType primitiveType = fieldType.asPrimitiveType(); + switch (primitiveType.getPrimitiveTypeName()) { + case BINARY: + if (originalType != null) { + switch (originalType) { + case DECIMAL: + typeInfo = BasicTypeInfo.BIG_DEC_TYPE_INFO; +
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16707832#comment-16707832 ] ASF GitHub Bot commented on FLINK-7243: --- fhueske commented on a change in pull request #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483#discussion_r238349486 ## File path: flink-formats/flink-parquet/src/main/java/org/apache/flink/formats/parquet/utils/ParquetSchemaConverter.java ## @@ -0,0 +1,359 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.formats.parquet.utils; + +import org.apache.flink.api.common.typeinfo.BasicArrayTypeInfo; +import org.apache.flink.api.common.typeinfo.BasicTypeInfo; +import org.apache.flink.api.common.typeinfo.SqlTimeTypeInfo; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.java.typeutils.MapTypeInfo; +import org.apache.flink.api.java.typeutils.ObjectArrayTypeInfo; +import org.apache.flink.api.java.typeutils.RowTypeInfo; + +import org.apache.parquet.avro.AvroSchemaConverter; +import org.apache.parquet.schema.GroupType; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.OriginalType; +import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.Type; +import org.apache.parquet.schema.Types; + +import java.lang.reflect.Array; +import java.util.ArrayList; +import java.util.List; + +/** + * Schema converter converts Parquet schema to and from Flink internal types. + */ +public class ParquetSchemaConverter { + public static final String MAP_KEY = "key"; + public static final String MAP_VALUE = "value"; + public static final String LIST_ARRAY_TYPE = "array"; + public static final String LIST_ELEMENT = "element"; + public static final String LIST_GROUP_NAME = "list"; + public static final String MESSAGE_ROOT = "root"; + private static final AvroSchemaConverter SCHEMA_CONVERTER = new AvroSchemaConverter(); + + public static TypeInformation fromParquetType(MessageType type) { + return convertFields(type.getFields()); + } + + /** +* Converts Flink Internal Type to Parquet schema. 
+* +* @param typeInformation flink type information +* @param isStandard is standard LIST and MAP schema or back-compatible schema +* @return Parquet schema +*/ + public static MessageType toParquetType(TypeInformation typeInformation, boolean isStandard) { + return (MessageType) convertField(null, typeInformation, Type.Repetition.OPTIONAL, isStandard); + } + + private static TypeInformation convertFields(List parquetFields) { + List> types = new ArrayList<>(); + List names = new ArrayList<>(); + for (Type field : parquetFields) { + TypeInformation subType = convertField(field); + if (subType != null) { + types.add(subType); + names.add(field.getName()); + } + } + + return new RowTypeInfo(types.toArray(new TypeInformation[types.size()]), + names.toArray(new String[names.size()])); + } + + private static TypeInformation convertField(final Type fieldType) { + TypeInformation typeInfo = null; + if (fieldType.isPrimitive()) { + OriginalType originalType = fieldType.getOriginalType(); + PrimitiveType primitiveType = fieldType.asPrimitiveType(); + switch (primitiveType.getPrimitiveTypeName()) { + case BINARY: + if (originalType != null) { + switch (originalType) { + case DECIMAL: + typeInfo = BasicTypeInfo.BIG_DEC_TYPE_INFO; +
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16707842#comment-16707842 ] ASF GitHub Bot commented on FLINK-7243: --- fhueske commented on a change in pull request #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483#discussion_r238446869 ## File path: flink-formats/flink-parquet/src/main/java/org/apache/flink/formats/parquet/utils/ParquetSchemaConverter.java ## @@ -0,0 +1,359 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.formats.parquet.utils; + +import org.apache.flink.api.common.typeinfo.BasicArrayTypeInfo; +import org.apache.flink.api.common.typeinfo.BasicTypeInfo; +import org.apache.flink.api.common.typeinfo.SqlTimeTypeInfo; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.java.typeutils.MapTypeInfo; +import org.apache.flink.api.java.typeutils.ObjectArrayTypeInfo; +import org.apache.flink.api.java.typeutils.RowTypeInfo; + +import org.apache.parquet.avro.AvroSchemaConverter; +import org.apache.parquet.schema.GroupType; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.OriginalType; +import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.Type; +import org.apache.parquet.schema.Types; + +import java.lang.reflect.Array; +import java.util.ArrayList; +import java.util.List; + +/** + * Schema converter converts Parquet schema to and from Flink internal types. + */ +public class ParquetSchemaConverter { + public static final String MAP_KEY = "key"; + public static final String MAP_VALUE = "value"; + public static final String LIST_ARRAY_TYPE = "array"; + public static final String LIST_ELEMENT = "element"; + public static final String LIST_GROUP_NAME = "list"; + public static final String MESSAGE_ROOT = "root"; + private static final AvroSchemaConverter SCHEMA_CONVERTER = new AvroSchemaConverter(); + + public static TypeInformation fromParquetType(MessageType type) { + return convertFields(type.getFields()); + } + + /** +* Converts Flink Internal Type to Parquet schema. 
+* +* @param typeInformation flink type information +* @param isStandard is standard LIST and MAP schema or back-compatible schema +* @return Parquet schema +*/ + public static MessageType toParquetType(TypeInformation typeInformation, boolean isStandard) { + return (MessageType) convertField(null, typeInformation, Type.Repetition.OPTIONAL, isStandard); + } + + private static TypeInformation convertFields(List parquetFields) { + List> types = new ArrayList<>(); + List names = new ArrayList<>(); + for (Type field : parquetFields) { + TypeInformation subType = convertField(field); + if (subType != null) { + types.add(subType); + names.add(field.getName()); + } + } + + return new RowTypeInfo(types.toArray(new TypeInformation[types.size()]), + names.toArray(new String[names.size()])); + } + + private static TypeInformation convertField(final Type fieldType) { + TypeInformation typeInfo = null; + if (fieldType.isPrimitive()) { + OriginalType originalType = fieldType.getOriginalType(); + PrimitiveType primitiveType = fieldType.asPrimitiveType(); + switch (primitiveType.getPrimitiveTypeName()) { + case BINARY: + if (originalType != null) { + switch (originalType) { + case DECIMAL: + typeInfo = BasicTypeInfo.BIG_DEC_TYPE_INFO; +
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16707837#comment-16707837 ] ASF GitHub Bot commented on FLINK-7243: --- fhueske commented on a change in pull request #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483#discussion_r238367667 ## File path: flink-formats/flink-parquet/src/main/java/org/apache/flink/formats/parquet/utils/ParquetSchemaConverter.java ## @@ -0,0 +1,359 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.formats.parquet.utils; + +import org.apache.flink.api.common.typeinfo.BasicArrayTypeInfo; +import org.apache.flink.api.common.typeinfo.BasicTypeInfo; +import org.apache.flink.api.common.typeinfo.SqlTimeTypeInfo; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.java.typeutils.MapTypeInfo; +import org.apache.flink.api.java.typeutils.ObjectArrayTypeInfo; +import org.apache.flink.api.java.typeutils.RowTypeInfo; + +import org.apache.parquet.avro.AvroSchemaConverter; +import org.apache.parquet.schema.GroupType; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.OriginalType; +import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.Type; +import org.apache.parquet.schema.Types; + +import java.lang.reflect.Array; +import java.util.ArrayList; +import java.util.List; + +/** + * Schema converter converts Parquet schema to and from Flink internal types. + */ +public class ParquetSchemaConverter { + public static final String MAP_KEY = "key"; + public static final String MAP_VALUE = "value"; + public static final String LIST_ARRAY_TYPE = "array"; + public static final String LIST_ELEMENT = "element"; + public static final String LIST_GROUP_NAME = "list"; + public static final String MESSAGE_ROOT = "root"; + private static final AvroSchemaConverter SCHEMA_CONVERTER = new AvroSchemaConverter(); + + public static TypeInformation fromParquetType(MessageType type) { + return convertFields(type.getFields()); + } + + /** +* Converts Flink Internal Type to Parquet schema. 
+* +* @param typeInformation flink type information +* @param isStandard is standard LIST and MAP schema or back-compatible schema +* @return Parquet schema +*/ + public static MessageType toParquetType(TypeInformation typeInformation, boolean isStandard) { + return (MessageType) convertField(null, typeInformation, Type.Repetition.OPTIONAL, isStandard); + } + + private static TypeInformation convertFields(List parquetFields) { + List> types = new ArrayList<>(); + List names = new ArrayList<>(); + for (Type field : parquetFields) { + TypeInformation subType = convertField(field); + if (subType != null) { + types.add(subType); + names.add(field.getName()); + } + } + + return new RowTypeInfo(types.toArray(new TypeInformation[types.size()]), + names.toArray(new String[names.size()])); + } + + private static TypeInformation convertField(final Type fieldType) { + TypeInformation typeInfo = null; + if (fieldType.isPrimitive()) { + OriginalType originalType = fieldType.getOriginalType(); + PrimitiveType primitiveType = fieldType.asPrimitiveType(); + switch (primitiveType.getPrimitiveTypeName()) { + case BINARY: + if (originalType != null) { + switch (originalType) { + case DECIMAL: + typeInfo = BasicTypeInfo.BIG_DEC_TYPE_INFO; +
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16707831#comment-16707831 ] ASF GitHub Bot commented on FLINK-7243: --- fhueske commented on a change in pull request #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483#discussion_r238341353 ## File path: flink-formats/flink-parquet/src/main/java/org/apache/flink/formats/parquet/utils/ParquetSchemaConverter.java ## @@ -0,0 +1,359 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.formats.parquet.utils; + +import org.apache.flink.api.common.typeinfo.BasicArrayTypeInfo; +import org.apache.flink.api.common.typeinfo.BasicTypeInfo; +import org.apache.flink.api.common.typeinfo.SqlTimeTypeInfo; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.java.typeutils.MapTypeInfo; +import org.apache.flink.api.java.typeutils.ObjectArrayTypeInfo; +import org.apache.flink.api.java.typeutils.RowTypeInfo; + +import org.apache.parquet.avro.AvroSchemaConverter; +import org.apache.parquet.schema.GroupType; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.OriginalType; +import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.Type; +import org.apache.parquet.schema.Types; + +import java.lang.reflect.Array; +import java.util.ArrayList; +import java.util.List; + +/** + * Schema converter converts Parquet schema to and from Flink internal types. + */ +public class ParquetSchemaConverter { + public static final String MAP_KEY = "key"; + public static final String MAP_VALUE = "value"; + public static final String LIST_ARRAY_TYPE = "array"; + public static final String LIST_ELEMENT = "element"; + public static final String LIST_GROUP_NAME = "list"; + public static final String MESSAGE_ROOT = "root"; + private static final AvroSchemaConverter SCHEMA_CONVERTER = new AvroSchemaConverter(); Review comment: unused This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. 
For queries about this service, please contact Infrastructure at: us...@infra.apache.org > Add ParquetInputFormat > -- > > Key: FLINK-7243 > URL: https://issues.apache.org/jira/browse/FLINK-7243 > Project: Flink > Issue Type: Sub-task > Components: Table API SQL >Reporter: godfrey he >Assignee: Zhenqiu Huang >Priority: Major > Labels: pull-request-available > > Add a {{ParquetInputFormat}} to read data from a Apache Parquet file. -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16699934#comment-16699934 ] ASF GitHub Bot commented on FLINK-7243: --- HuangZhenQiu commented on issue #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483#issuecomment-441920259 @fhueske I removed the timestamp override, and also updated the failure recovery test case to test recovery when reading a file with 10 row groups. Please review it at your most convenient time. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > Add ParquetInputFormat > -- > > Key: FLINK-7243 > URL: https://issues.apache.org/jira/browse/FLINK-7243 > Project: Flink > Issue Type: Sub-task > Components: Table API SQL >Reporter: godfrey he >Assignee: Zhenqiu Huang >Priority: Major > Labels: pull-request-available > > Add a {{ParquetInputFormat}} to read data from an Apache Parquet file. -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16697431#comment-16697431 ] ASF GitHub Bot commented on FLINK-7243: --- HuangZhenQiu commented on issue #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483#issuecomment-441301540 @fhueske Resolved all of the comments except the one for the timestamp rewrite. It is needed for the time field of window functions. Do you prefer to use a timestamp UDF in SQL directly in this case? This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > Add ParquetInputFormat > -- > > Key: FLINK-7243 > URL: https://issues.apache.org/jira/browse/FLINK-7243 > Project: Flink > Issue Type: Sub-task > Components: Table API SQL >Reporter: godfrey he >Assignee: Zhenqiu Huang >Priority: Major > Labels: pull-request-available > > Add a {{ParquetInputFormat}} to read data from an Apache Parquet file. -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16692776#comment-16692776 ] ASF GitHub Bot commented on FLINK-7243: --- HuangZhenQiu commented on a change in pull request #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483#discussion_r234892484 ## File path: flink-formats/flink-parquet/src/main/java/org/apache/flink/formats/parquet/ParquetRowInputFormat.java ## @@ -0,0 +1,87 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.formats.parquet; + +import org.apache.flink.api.common.typeinfo.SqlTimeTypeInfo; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.java.typeutils.ResultTypeQueryable; +import org.apache.flink.api.java.typeutils.RowTypeInfo; +import org.apache.flink.core.fs.Path; +import org.apache.flink.types.Row; + +import org.apache.parquet.schema.MessageType; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.sql.Timestamp; + + +/** + * A subclass of {@link ParquetInputFormat} to read from Parquet files and convert to {@link Row}. 
+ * It is mainly used to integrate with table API and batch SQL. + */ +public class ParquetRowInputFormat extends ParquetInputFormat implements ResultTypeQueryable { + private static final long serialVersionUID = 11L; + private static final Logger LOG = LoggerFactory.getLogger(ParquetRowInputFormat.class); + private boolean timeStampRewrite; + private RowTypeInfo returnType; + private int tsIndex; + + public ParquetRowInputFormat(Path path, MessageType messageType) { + super(path, messageType); + this.returnType = new RowTypeInfo(getFieldTypes(), getFieldNames()); + this.timeStampRewrite = false; + } + + @Override + public TypeInformation getProducedType() { + return new RowTypeInfo(getFieldTypes(), getFieldNames()); Review comment: Yes, you are right. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > Add ParquetInputFormat > -- > > Key: FLINK-7243 > URL: https://issues.apache.org/jira/browse/FLINK-7243 > Project: Flink > Issue Type: Sub-task > Components: Table API SQL >Reporter: godfrey he >Assignee: Zhenqiu Huang >Priority: Major > Labels: pull-request-available > > Add a {{ParquetInputFormat}} to read data from a Apache Parquet file. -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16692772#comment-16692772 ] ASF GitHub Bot commented on FLINK-7243: --- HuangZhenQiu commented on a change in pull request #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483#discussion_r234892171 ## File path: flink-formats/flink-parquet/src/main/java/org/apache/flink/formats/parquet/ParquetRowInputFormat.java ## @@ -0,0 +1,87 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.formats.parquet; + +import org.apache.flink.api.common.typeinfo.SqlTimeTypeInfo; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.java.typeutils.ResultTypeQueryable; +import org.apache.flink.api.java.typeutils.RowTypeInfo; +import org.apache.flink.core.fs.Path; +import org.apache.flink.types.Row; + +import org.apache.parquet.schema.MessageType; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.sql.Timestamp; + + +/** + * A subclass of {@link ParquetInputFormat} to read from Parquet files and convert to {@link Row}. 
+ * It is mainly used to integrate with table API and batch SQL. + */ +public class ParquetRowInputFormat extends ParquetInputFormat implements ResultTypeQueryable { + private static final long serialVersionUID = 11L; + private static final Logger LOG = LoggerFactory.getLogger(ParquetRowInputFormat.class); + private boolean timeStampRewrite; + private RowTypeInfo returnType; + private int tsIndex; + + public ParquetRowInputFormat(Path path, MessageType messageType) { + super(path, messageType); + this.returnType = new RowTypeInfo(getFieldTypes(), getFieldNames()); + this.timeStampRewrite = false; + } + + @Override + public TypeInformation getProducedType() { + return new RowTypeInfo(getFieldTypes(), getFieldNames()); + } + + @Override + protected Row convert(Row row) { + if (timeStampRewrite) { + row.setField(tsIndex, new Timestamp((long) row.getField(tsIndex))); Review comment: Agree. When I work on HiveTableSource internally, it is 1 year ago on Flink 1.4. As I remember, when I use window functions for group by, for example TUMBLE(time_attr, interval). The attribute time_attr has to be the type of Timestamp. It means we need to convert to the field to Timestamp type somewhere. Do you have any preference in the implementation? Or you just want to leave it in ParquetTableSource? This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > Add ParquetInputFormat > -- > > Key: FLINK-7243 > URL: https://issues.apache.org/jira/browse/FLINK-7243 > Project: Flink > Issue Type: Sub-task > Components: Table API SQL >Reporter: godfrey he >Assignee: Zhenqiu Huang >Priority: Major > Labels: pull-request-available > > Add a {{ParquetInputFormat}} to read data from a Apache Parquet file. -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16692004#comment-16692004 ] ASF GitHub Bot commented on FLINK-7243: --- fhueske commented on a change in pull request #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483#discussion_r234617856 ## File path: flink-formats/flink-parquet/src/main/java/org/apache/flink/formats/parquet/utils/ParquetRecordReader.java ## @@ -0,0 +1,221 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.formats.parquet.utils; + +import org.apache.hadoop.conf.Configuration; +import org.apache.parquet.column.page.PageReadStore; +import org.apache.parquet.filter2.compat.FilterCompat; +import org.apache.parquet.filter2.compat.FilterCompat.Filter; +import org.apache.parquet.hadoop.ParquetFileReader; +import org.apache.parquet.hadoop.api.InitContext; +import org.apache.parquet.hadoop.api.ReadSupport; +import org.apache.parquet.hadoop.metadata.BlockMetaData; +import org.apache.parquet.hadoop.metadata.FileMetaData; +import org.apache.parquet.io.ColumnIOFactory; +import org.apache.parquet.io.MessageColumnIO; +import org.apache.parquet.io.RecordReader; +import org.apache.parquet.io.api.RecordMaterializer; +import org.apache.parquet.io.api.RecordMaterializer.RecordMaterializationException; +import org.apache.parquet.schema.MessageType; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.annotation.CheckReturnValue; +import javax.annotation.meta.When; + +import java.io.IOException; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; + + +import static org.apache.flink.util.Preconditions.checkNotNull; + +/** + * Customized {@link org.apache.parquet.hadoop.ParquetRecordReader} that support start read from particular position. + */ +public class ParquetRecordReader { + private static final Logger LOG = LoggerFactory.getLogger(ParquetRecordReader.class); + + private ColumnIOFactory columnIOFactory; + private Filter filter; + private MessageType readSchema; + private MessageType fileSchema; + private ReadSupport readSupport; + + // Parquet Materializer convert record to T + private RecordMaterializer recordMaterializer; + private T currentValue; + private long total; + private long current = 0; Review comment: rename to `numReturnedRows`? This is an automated message from the Apache Git Service. 
To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > Add ParquetInputFormat > -- > > Key: FLINK-7243 > URL: https://issues.apache.org/jira/browse/FLINK-7243 > Project: Flink > Issue Type: Sub-task > Components: Table API SQL >Reporter: godfrey he >Assignee: Zhenqiu Huang >Priority: Major > Labels: pull-request-available > > Add a {{ParquetInputFormat}} to read data from a Apache Parquet file. -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16691997#comment-16691997 ] ASF GitHub Bot commented on FLINK-7243: --- fhueske commented on a change in pull request #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483#discussion_r234640295 ## File path: flink-formats/flink-parquet/src/main/java/org/apache/flink/formats/parquet/utils/ParquetRecordReader.java ## @@ -0,0 +1,221 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.formats.parquet.utils; + +import org.apache.hadoop.conf.Configuration; +import org.apache.parquet.column.page.PageReadStore; +import org.apache.parquet.filter2.compat.FilterCompat; +import org.apache.parquet.filter2.compat.FilterCompat.Filter; +import org.apache.parquet.hadoop.ParquetFileReader; +import org.apache.parquet.hadoop.api.InitContext; +import org.apache.parquet.hadoop.api.ReadSupport; +import org.apache.parquet.hadoop.metadata.BlockMetaData; +import org.apache.parquet.hadoop.metadata.FileMetaData; +import org.apache.parquet.io.ColumnIOFactory; +import org.apache.parquet.io.MessageColumnIO; +import org.apache.parquet.io.RecordReader; +import org.apache.parquet.io.api.RecordMaterializer; +import org.apache.parquet.io.api.RecordMaterializer.RecordMaterializationException; +import org.apache.parquet.schema.MessageType; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.annotation.CheckReturnValue; +import javax.annotation.meta.When; + +import java.io.IOException; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; + + +import static org.apache.flink.util.Preconditions.checkNotNull; + +/** + * Customized {@link org.apache.parquet.hadoop.ParquetRecordReader} that support start read from particular position. 
+ */ +public class ParquetRecordReader { + private static final Logger LOG = LoggerFactory.getLogger(ParquetRecordReader.class); + + private ColumnIOFactory columnIOFactory; + private Filter filter; + private MessageType readSchema; + private MessageType fileSchema; + private ReadSupport readSupport; + + // Parquet Materializer convert record to T + private RecordMaterializer recordMaterializer; + private T currentValue; + private long total; + private long current = 0; + private int currentBlock = -1; + private ParquetFileReader reader; + private RecordReader recordReader; + private boolean strictTypeChecking = true; + private boolean skipCorruptedRecord = true; + private long countLoadUntilLastGroup = 0; Review comment: rename to `numRowsUpToPreviousBlock` This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > Add ParquetInputFormat > -- > > Key: FLINK-7243 > URL: https://issues.apache.org/jira/browse/FLINK-7243 > Project: Flink > Issue Type: Sub-task > Components: Table API SQL >Reporter: godfrey he >Assignee: Zhenqiu Huang >Priority: Major > Labels: pull-request-available > > Add a {{ParquetInputFormat}} to read data from a Apache Parquet file. -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16691990#comment-16691990 ] ASF GitHub Bot commented on FLINK-7243: --- fhueske commented on a change in pull request #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483#discussion_r234617674 ## File path: flink-formats/flink-parquet/src/main/java/org/apache/flink/formats/parquet/utils/ParquetRecordReader.java ## @@ -0,0 +1,221 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.formats.parquet.utils; + +import org.apache.hadoop.conf.Configuration; +import org.apache.parquet.column.page.PageReadStore; +import org.apache.parquet.filter2.compat.FilterCompat; +import org.apache.parquet.filter2.compat.FilterCompat.Filter; +import org.apache.parquet.hadoop.ParquetFileReader; +import org.apache.parquet.hadoop.api.InitContext; +import org.apache.parquet.hadoop.api.ReadSupport; +import org.apache.parquet.hadoop.metadata.BlockMetaData; +import org.apache.parquet.hadoop.metadata.FileMetaData; +import org.apache.parquet.io.ColumnIOFactory; +import org.apache.parquet.io.MessageColumnIO; +import org.apache.parquet.io.RecordReader; +import org.apache.parquet.io.api.RecordMaterializer; +import org.apache.parquet.io.api.RecordMaterializer.RecordMaterializationException; +import org.apache.parquet.schema.MessageType; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.annotation.CheckReturnValue; +import javax.annotation.meta.When; + +import java.io.IOException; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; + + +import static org.apache.flink.util.Preconditions.checkNotNull; + +/** + * Customized {@link org.apache.parquet.hadoop.ParquetRecordReader} that support start read from particular position. + */ +public class ParquetRecordReader { + private static final Logger LOG = LoggerFactory.getLogger(ParquetRecordReader.class); + + private ColumnIOFactory columnIOFactory; + private Filter filter; + private MessageType readSchema; + private MessageType fileSchema; + private ReadSupport readSupport; + + // Parquet Materializer convert record to T + private RecordMaterializer recordMaterializer; + private T currentValue; + private long total; Review comment: rename to `numTotalRows`? This is an automated message from the Apache Git Service. 
To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > Add ParquetInputFormat > -- > > Key: FLINK-7243 > URL: https://issues.apache.org/jira/browse/FLINK-7243 > Project: Flink > Issue Type: Sub-task > Components: Table API SQL >Reporter: godfrey he >Assignee: Zhenqiu Huang >Priority: Major > Labels: pull-request-available > > Add a {{ParquetInputFormat}} to read data from a Apache Parquet file. -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16691993#comment-16691993 ] ASF GitHub Bot commented on FLINK-7243: --- fhueske commented on a change in pull request #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483#discussion_r234684049 ## File path: flink-formats/flink-parquet/src/main/java/org/apache/flink/formats/parquet/utils/ParquetRecordReader.java ## @@ -0,0 +1,221 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.formats.parquet.utils; + +import org.apache.hadoop.conf.Configuration; +import org.apache.parquet.column.page.PageReadStore; +import org.apache.parquet.filter2.compat.FilterCompat; +import org.apache.parquet.filter2.compat.FilterCompat.Filter; +import org.apache.parquet.hadoop.ParquetFileReader; +import org.apache.parquet.hadoop.api.InitContext; +import org.apache.parquet.hadoop.api.ReadSupport; +import org.apache.parquet.hadoop.metadata.BlockMetaData; +import org.apache.parquet.hadoop.metadata.FileMetaData; +import org.apache.parquet.io.ColumnIOFactory; +import org.apache.parquet.io.MessageColumnIO; +import org.apache.parquet.io.RecordReader; +import org.apache.parquet.io.api.RecordMaterializer; +import org.apache.parquet.io.api.RecordMaterializer.RecordMaterializationException; +import org.apache.parquet.schema.MessageType; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.annotation.CheckReturnValue; +import javax.annotation.meta.When; + +import java.io.IOException; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; + + +import static org.apache.flink.util.Preconditions.checkNotNull; + +/** + * Customized {@link org.apache.parquet.hadoop.ParquetRecordReader} that support start read from particular position. 
+ */ +public class ParquetRecordReader { + private static final Logger LOG = LoggerFactory.getLogger(ParquetRecordReader.class); + + private ColumnIOFactory columnIOFactory; + private Filter filter; + private MessageType readSchema; + private MessageType fileSchema; + private ReadSupport readSupport; + + // Parquet Materializer convert record to T + private RecordMaterializer recordMaterializer; + private T currentValue; + private long total; + private long current = 0; + private int currentBlock = -1; + private ParquetFileReader reader; + private RecordReader recordReader; + private boolean strictTypeChecking = true; + private boolean skipCorruptedRecord = true; + private long countLoadUntilLastGroup = 0; + private long totalCountLoadedSoFar = 0; + + public ParquetRecordReader(ReadSupport readSupport, MessageType readSchema, Filter filter) { + this.filter = checkNotNull(filter, "readSupport"); + this.readSupport = checkNotNull(readSupport, "readSchema"); + this.readSchema = checkNotNull(readSchema, "filter"); + } + + public ParquetRecordReader(ReadSupport readSupport, MessageType readSchema) { + this(readSupport, readSchema, FilterCompat.NOOP); + } + + public void initialize(ParquetFileReader reader, Configuration configuration) { + this.reader = reader; + FileMetaData parquetFileMetadata = reader.getFooter().getFileMetaData(); + // real schema of parquet file + this.fileSchema = parquetFileMetadata.getSchema(); + Map fileMetadata = parquetFileMetadata.getKeyValueMetaData(); + ReadSupport.ReadContext readContext = readSupport.init(new InitContext( + configuration, toSetMultiMap(fileMetadata), readSchema)); + + this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy()); + this.recordMaterializer = readSupport.prepareForRead( + configuration, fileMetadata, readSchema, readContext); + this.total =
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16691999#comment-16691999 ] ASF GitHub Bot commented on FLINK-7243: --- fhueske commented on a change in pull request #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483#discussion_r234690193 ## File path: flink-formats/flink-parquet/src/main/java/org/apache/flink/formats/parquet/utils/ParquetRecordReader.java ## @@ -0,0 +1,196 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.formats.parquet.utils; + +import org.apache.hadoop.conf.Configuration; +import org.apache.parquet.column.page.PageReadStore; +import org.apache.parquet.filter2.compat.FilterCompat; +import org.apache.parquet.filter2.compat.FilterCompat.Filter; +import org.apache.parquet.hadoop.ParquetFileReader; +import org.apache.parquet.hadoop.api.InitContext; +import org.apache.parquet.hadoop.api.ReadSupport; +import org.apache.parquet.hadoop.metadata.FileMetaData; +import org.apache.parquet.io.ColumnIOFactory; +import org.apache.parquet.io.MessageColumnIO; +import org.apache.parquet.io.RecordReader; +import org.apache.parquet.io.api.RecordMaterializer; +import org.apache.parquet.io.api.RecordMaterializer.RecordMaterializationException; +import org.apache.parquet.schema.MessageType; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.annotation.CheckReturnValue; +import javax.annotation.meta.When; + +import java.io.IOException; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; + +import static org.apache.parquet.Preconditions.checkNotNull; + +/** + * Customized {@link org.apache.parquet.hadoop.ParquetRecordReader} that support start read from particular position. 
+ */ +public class ParquetRecordReader { + private static final Logger LOG = LoggerFactory.getLogger(ParquetRecordReader.class); + + private ColumnIOFactory columnIOFactory; + private Filter filter; + + private MessageType readSchema; + private MessageType fileSchema; + private ReadSupport readSupport; + + private RecordMaterializer recordMaterializer; + private T currentValue; + private long total; + private long current = 0; + private int currentBlock = -1; + private ParquetFileReader reader; + private RecordReader recordReader; + private boolean strictTypeChecking = true; + private long totalCountLoadedSoFar = 0; + + public ParquetRecordReader(ReadSupport readSupport, MessageType readSchema, Filter filter) { + this.readSupport = readSupport; + this.readSchema = readSchema; + this.filter = checkNotNull(filter, "filter"); + } + + public ParquetRecordReader(ReadSupport readSupport, MessageType readSchema) { + this(readSupport, readSchema, FilterCompat.NOOP); + } + + public void initialize(ParquetFileReader reader, Configuration configuration) { + this.reader = reader; + FileMetaData parquetFileMetadata = reader.getFooter().getFileMetaData(); + this.fileSchema = parquetFileMetadata.getSchema(); + Map fileMetadata = parquetFileMetadata.getKeyValueMetaData(); + ReadSupport.ReadContext readContext = readSupport.init(new InitContext( + configuration, toSetMultiMap(fileMetadata), readSchema)); + + this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy()); + this.readSchema = readContext.getRequestedSchema(); + this.recordMaterializer = readSupport.prepareForRead( + configuration, fileMetadata, readSchema, readContext); + this.total = reader.getRecordCount(); + reader.setRequestedSchema(readSchema); + } + + private void checkRead() throws IOException { + if (current == totalCountLoadedSoFar) { + PageReadStore pages =
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16692003#comment-16692003 ] ASF GitHub Bot commented on FLINK-7243: --- fhueske commented on a change in pull request #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483#discussion_r234686319 ## File path: flink-formats/flink-parquet/src/main/java/org/apache/flink/formats/parquet/utils/ParquetRecordReader.java ## @@ -0,0 +1,221 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.formats.parquet.utils; + +import org.apache.hadoop.conf.Configuration; +import org.apache.parquet.column.page.PageReadStore; +import org.apache.parquet.filter2.compat.FilterCompat; +import org.apache.parquet.filter2.compat.FilterCompat.Filter; +import org.apache.parquet.hadoop.ParquetFileReader; +import org.apache.parquet.hadoop.api.InitContext; +import org.apache.parquet.hadoop.api.ReadSupport; +import org.apache.parquet.hadoop.metadata.BlockMetaData; +import org.apache.parquet.hadoop.metadata.FileMetaData; +import org.apache.parquet.io.ColumnIOFactory; +import org.apache.parquet.io.MessageColumnIO; +import org.apache.parquet.io.RecordReader; +import org.apache.parquet.io.api.RecordMaterializer; +import org.apache.parquet.io.api.RecordMaterializer.RecordMaterializationException; +import org.apache.parquet.schema.MessageType; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.annotation.CheckReturnValue; +import javax.annotation.meta.When; + +import java.io.IOException; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; + + +import static org.apache.flink.util.Preconditions.checkNotNull; + +/** + * Customized {@link org.apache.parquet.hadoop.ParquetRecordReader} that support start read from particular position. 
+ */ +public class ParquetRecordReader { + private static final Logger LOG = LoggerFactory.getLogger(ParquetRecordReader.class); + + private ColumnIOFactory columnIOFactory; + private Filter filter; + private MessageType readSchema; + private MessageType fileSchema; + private ReadSupport readSupport; + + // Parquet Materializer convert record to T + private RecordMaterializer recordMaterializer; + private T currentValue; + private long total; + private long current = 0; + private int currentBlock = -1; + private ParquetFileReader reader; + private RecordReader recordReader; + private boolean strictTypeChecking = true; + private boolean skipCorruptedRecord = true; + private long countLoadUntilLastGroup = 0; + private long totalCountLoadedSoFar = 0; + + public ParquetRecordReader(ReadSupport readSupport, MessageType readSchema, Filter filter) { + this.filter = checkNotNull(filter, "readSupport"); + this.readSupport = checkNotNull(readSupport, "readSchema"); + this.readSchema = checkNotNull(readSchema, "filter"); + } + + public ParquetRecordReader(ReadSupport readSupport, MessageType readSchema) { + this(readSupport, readSchema, FilterCompat.NOOP); + } + + public void initialize(ParquetFileReader reader, Configuration configuration) { + this.reader = reader; + FileMetaData parquetFileMetadata = reader.getFooter().getFileMetaData(); + // real schema of parquet file + this.fileSchema = parquetFileMetadata.getSchema(); + Map fileMetadata = parquetFileMetadata.getKeyValueMetaData(); + ReadSupport.ReadContext readContext = readSupport.init(new InitContext( + configuration, toSetMultiMap(fileMetadata), readSchema)); + + this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy()); + this.recordMaterializer = readSupport.prepareForRead( + configuration, fileMetadata, readSchema, readContext); + this.total =
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16692000#comment-16692000 ] ASF GitHub Bot commented on FLINK-7243: --- fhueske commented on a change in pull request #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483#discussion_r234689645 ## File path: flink-formats/flink-parquet/src/main/java/org/apache/flink/formats/parquet/utils/ParquetRecordReader.java ## @@ -0,0 +1,221 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.formats.parquet.utils; + +import org.apache.hadoop.conf.Configuration; +import org.apache.parquet.column.page.PageReadStore; +import org.apache.parquet.filter2.compat.FilterCompat; +import org.apache.parquet.filter2.compat.FilterCompat.Filter; +import org.apache.parquet.hadoop.ParquetFileReader; +import org.apache.parquet.hadoop.api.InitContext; +import org.apache.parquet.hadoop.api.ReadSupport; +import org.apache.parquet.hadoop.metadata.BlockMetaData; +import org.apache.parquet.hadoop.metadata.FileMetaData; +import org.apache.parquet.io.ColumnIOFactory; +import org.apache.parquet.io.MessageColumnIO; +import org.apache.parquet.io.RecordReader; +import org.apache.parquet.io.api.RecordMaterializer; +import org.apache.parquet.io.api.RecordMaterializer.RecordMaterializationException; +import org.apache.parquet.schema.MessageType; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.annotation.CheckReturnValue; +import javax.annotation.meta.When; + +import java.io.IOException; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; + + +import static org.apache.flink.util.Preconditions.checkNotNull; + +/** + * Customized {@link org.apache.parquet.hadoop.ParquetRecordReader} that support start read from particular position. 
+ */ +public class ParquetRecordReader { + private static final Logger LOG = LoggerFactory.getLogger(ParquetRecordReader.class); + + private ColumnIOFactory columnIOFactory; + private Filter filter; + private MessageType readSchema; + private MessageType fileSchema; + private ReadSupport readSupport; + + // Parquet Materializer convert record to T + private RecordMaterializer recordMaterializer; + private T currentValue; + private long total; + private long current = 0; + private int currentBlock = -1; + private ParquetFileReader reader; + private RecordReader recordReader; + private boolean strictTypeChecking = true; + private boolean skipCorruptedRecord = true; + private long countLoadUntilLastGroup = 0; + private long totalCountLoadedSoFar = 0; + + public ParquetRecordReader(ReadSupport readSupport, MessageType readSchema, Filter filter) { + this.filter = checkNotNull(filter, "readSupport"); + this.readSupport = checkNotNull(readSupport, "readSchema"); + this.readSchema = checkNotNull(readSchema, "filter"); + } + + public ParquetRecordReader(ReadSupport readSupport, MessageType readSchema) { + this(readSupport, readSchema, FilterCompat.NOOP); + } + + public void initialize(ParquetFileReader reader, Configuration configuration) { + this.reader = reader; + FileMetaData parquetFileMetadata = reader.getFooter().getFileMetaData(); + // real schema of parquet file + this.fileSchema = parquetFileMetadata.getSchema(); + Map fileMetadata = parquetFileMetadata.getKeyValueMetaData(); + ReadSupport.ReadContext readContext = readSupport.init(new InitContext( + configuration, toSetMultiMap(fileMetadata), readSchema)); + + this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy()); + this.recordMaterializer = readSupport.prepareForRead( + configuration, fileMetadata, readSchema, readContext); + this.total =
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16691980#comment-16691980 ] ASF GitHub Bot commented on FLINK-7243: --- fhueske commented on a change in pull request #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483#discussion_r234586909 ## File path: flink-formats/flink-parquet/src/main/java/org/apache/flink/formats/parquet/ParquetInputFormat.java ## @@ -0,0 +1,283 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.formats.parquet; + +import org.apache.flink.api.common.io.CheckpointableInputFormat; +import org.apache.flink.api.common.io.FileInputFormat; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.java.tuple.Tuple2; +import org.apache.flink.api.java.typeutils.RowTypeInfo; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.core.fs.FileInputSplit; +import org.apache.flink.core.fs.Path; +import org.apache.flink.formats.parquet.utils.ParquetRecordReader; +import org.apache.flink.formats.parquet.utils.ParquetSchemaConverter; +import org.apache.flink.formats.parquet.utils.RowReadSupport; +import org.apache.flink.metrics.Counter; +import org.apache.flink.types.Row; +import org.apache.flink.util.Preconditions; + +import org.apache.parquet.ParquetReadOptions; +import org.apache.parquet.filter2.compat.FilterCompat; +import org.apache.parquet.hadoop.ParquetFileReader; +import org.apache.parquet.hadoop.util.HadoopInputFile; +import org.apache.parquet.io.InputFile; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.Type; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import static org.apache.flink.util.Preconditions.checkNotNull; + +/** + * The base InputFormat class to read from Parquet files. + * For specific return types the {@link #convert(Row)} method need to be implemented. + * + * Using {@link ParquetRecordReader} to read files instead of {@link org.apache.flink.core.fs.FSDataInputStream}, + * we override {@link #open(FileInputSplit)} and {@link #close()} to change the behaviors. + * + * @param The type of record to read. 
+ */ +public abstract class ParquetInputFormat + extends FileInputFormat + implements CheckpointableInputFormat> { + + private static final long serialVersionUID = 1L; + + private static final Logger LOG = LoggerFactory.getLogger(ParquetInputFormat.class); + + /** +* The flag to specify whether to skip file splits with wrong schema. +*/ + private boolean skipWrongSchemaFileSplit = false; + + /** +* The flag to specify whether to skip corrupted record. +*/ + private boolean skipCorruptedRecord = false; + + private transient Counter recordConsumed; + + private transient MessageType expectedFileSchema; + + private TypeInformation[] fieldTypes; + + private String[] fieldNames; + + private boolean skipThisSplit = false; + + private transient ParquetRecordReader parquetRecordReader; + + + /** +* Read parquet files with given parquet file schema. +* +* @param path The path of the file to read. +* @param messageType schema of parquet file +*/ + + protected ParquetInputFormat(Path path, MessageType messageType) { + super(path); + this.expectedFileSchema = checkNotNull(messageType, "messageType"); + RowTypeInfo rowTypeInfo = (RowTypeInfo) ParquetSchemaConverter.fromParquetType(expectedFileSchema); + this.fieldTypes = rowTypeInfo.getFieldTypes(); + this.fieldNames = rowTypeInfo.getFieldNames(); + // read whole parquet file as one file split + this.unsplittable = true; + } + + @Override + public void configure(Configuration parameters) { + super.configure(parameters); + + if
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16691994#comment-16691994 ] ASF GitHub Bot commented on FLINK-7243: --- fhueske commented on a change in pull request #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483#discussion_r234636743 ## File path: flink-formats/flink-parquet/src/main/java/org/apache/flink/formats/parquet/utils/ParquetRecordReader.java ## @@ -0,0 +1,221 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.formats.parquet.utils; + +import org.apache.hadoop.conf.Configuration; +import org.apache.parquet.column.page.PageReadStore; +import org.apache.parquet.filter2.compat.FilterCompat; +import org.apache.parquet.filter2.compat.FilterCompat.Filter; +import org.apache.parquet.hadoop.ParquetFileReader; +import org.apache.parquet.hadoop.api.InitContext; +import org.apache.parquet.hadoop.api.ReadSupport; +import org.apache.parquet.hadoop.metadata.BlockMetaData; +import org.apache.parquet.hadoop.metadata.FileMetaData; +import org.apache.parquet.io.ColumnIOFactory; +import org.apache.parquet.io.MessageColumnIO; +import org.apache.parquet.io.RecordReader; +import org.apache.parquet.io.api.RecordMaterializer; +import org.apache.parquet.io.api.RecordMaterializer.RecordMaterializationException; +import org.apache.parquet.schema.MessageType; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.annotation.CheckReturnValue; +import javax.annotation.meta.When; + +import java.io.IOException; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; + + +import static org.apache.flink.util.Preconditions.checkNotNull; + +/** + * Customized {@link org.apache.parquet.hadoop.ParquetRecordReader} that support start read from particular position. 
+ */ +public class ParquetRecordReader { + private static final Logger LOG = LoggerFactory.getLogger(ParquetRecordReader.class); + + private ColumnIOFactory columnIOFactory; + private Filter filter; + private MessageType readSchema; + private MessageType fileSchema; + private ReadSupport readSupport; + + // Parquet Materializer convert record to T + private RecordMaterializer recordMaterializer; + private T currentValue; + private long total; + private long current = 0; + private int currentBlock = -1; + private ParquetFileReader reader; + private RecordReader recordReader; + private boolean strictTypeChecking = true; + private boolean skipCorruptedRecord = true; + private long countLoadUntilLastGroup = 0; + private long totalCountLoadedSoFar = 0; + + public ParquetRecordReader(ReadSupport readSupport, MessageType readSchema, Filter filter) { + this.filter = checkNotNull(filter, "readSupport"); + this.readSupport = checkNotNull(readSupport, "readSchema"); + this.readSchema = checkNotNull(readSchema, "filter"); + } + + public ParquetRecordReader(ReadSupport readSupport, MessageType readSchema) { + this(readSupport, readSchema, FilterCompat.NOOP); + } + + public void initialize(ParquetFileReader reader, Configuration configuration) { + this.reader = reader; + FileMetaData parquetFileMetadata = reader.getFooter().getFileMetaData(); + // real schema of parquet file + this.fileSchema = parquetFileMetadata.getSchema(); + Map fileMetadata = parquetFileMetadata.getKeyValueMetaData(); + ReadSupport.ReadContext readContext = readSupport.init(new InitContext( + configuration, toSetMultiMap(fileMetadata), readSchema)); + + this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy()); + this.recordMaterializer = readSupport.prepareForRead( + configuration, fileMetadata, readSchema, readContext); + this.total =
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16691976#comment-16691976 ] ASF GitHub Bot commented on FLINK-7243: --- fhueske commented on a change in pull request #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483#discussion_r234577116 ## File path: flink-formats/flink-parquet/src/main/java/org/apache/flink/formats/parquet/ParquetInputFormat.java ## @@ -0,0 +1,283 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.formats.parquet; + +import org.apache.flink.api.common.io.CheckpointableInputFormat; +import org.apache.flink.api.common.io.FileInputFormat; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.java.tuple.Tuple2; +import org.apache.flink.api.java.typeutils.RowTypeInfo; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.core.fs.FileInputSplit; +import org.apache.flink.core.fs.Path; +import org.apache.flink.formats.parquet.utils.ParquetRecordReader; +import org.apache.flink.formats.parquet.utils.ParquetSchemaConverter; +import org.apache.flink.formats.parquet.utils.RowReadSupport; +import org.apache.flink.metrics.Counter; +import org.apache.flink.types.Row; +import org.apache.flink.util.Preconditions; + +import org.apache.parquet.ParquetReadOptions; +import org.apache.parquet.filter2.compat.FilterCompat; +import org.apache.parquet.hadoop.ParquetFileReader; +import org.apache.parquet.hadoop.util.HadoopInputFile; +import org.apache.parquet.io.InputFile; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.Type; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import static org.apache.flink.util.Preconditions.checkNotNull; + +/** + * The base InputFormat class to read from Parquet files. + * For specific return types the {@link #convert(Row)} method need to be implemented. + * + * Using {@link ParquetRecordReader} to read files instead of {@link org.apache.flink.core.fs.FSDataInputStream}, + * we override {@link #open(FileInputSplit)} and {@link #close()} to change the behaviors. + * + * @param The type of record to read. 
+ */ +public abstract class ParquetInputFormat + extends FileInputFormat + implements CheckpointableInputFormat> { + + private static final long serialVersionUID = 1L; + + private static final Logger LOG = LoggerFactory.getLogger(ParquetInputFormat.class); + + /** +* The flag to specify whether to skip file splits with wrong schema. +*/ + private boolean skipWrongSchemaFileSplit = false; + + /** +* The flag to specify whether to skip corrupted record. +*/ + private boolean skipCorruptedRecord = false; + + private transient Counter recordConsumed; + + private transient MessageType expectedFileSchema; + + private TypeInformation[] fieldTypes; + + private String[] fieldNames; + + private boolean skipThisSplit = false; + + private transient ParquetRecordReader parquetRecordReader; + + + /** +* Read parquet files with given parquet file schema. +* +* @param path The path of the file to read. +* @param messageType schema of parquet file +*/ + + protected ParquetInputFormat(Path path, MessageType messageType) { + super(path); + this.expectedFileSchema = checkNotNull(messageType, "messageType"); + RowTypeInfo rowTypeInfo = (RowTypeInfo) ParquetSchemaConverter.fromParquetType(expectedFileSchema); + this.fieldTypes = rowTypeInfo.getFieldTypes(); + this.fieldNames = rowTypeInfo.getFieldNames(); + // read whole parquet file as one file split + this.unsplittable = true; + } + + @Override + public void configure(Configuration parameters) { + super.configure(parameters); + + if
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16691989#comment-16691989 ] ASF GitHub Bot commented on FLINK-7243: --- fhueske commented on a change in pull request #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483#discussion_r234624331 ## File path: flink-formats/flink-parquet/src/main/java/org/apache/flink/formats/parquet/utils/ParquetRecordReader.java ## @@ -0,0 +1,221 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.formats.parquet.utils; + +import org.apache.hadoop.conf.Configuration; +import org.apache.parquet.column.page.PageReadStore; +import org.apache.parquet.filter2.compat.FilterCompat; +import org.apache.parquet.filter2.compat.FilterCompat.Filter; +import org.apache.parquet.hadoop.ParquetFileReader; +import org.apache.parquet.hadoop.api.InitContext; +import org.apache.parquet.hadoop.api.ReadSupport; +import org.apache.parquet.hadoop.metadata.BlockMetaData; +import org.apache.parquet.hadoop.metadata.FileMetaData; +import org.apache.parquet.io.ColumnIOFactory; +import org.apache.parquet.io.MessageColumnIO; +import org.apache.parquet.io.RecordReader; +import org.apache.parquet.io.api.RecordMaterializer; +import org.apache.parquet.io.api.RecordMaterializer.RecordMaterializationException; +import org.apache.parquet.schema.MessageType; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.annotation.CheckReturnValue; +import javax.annotation.meta.When; + +import java.io.IOException; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; + + +import static org.apache.flink.util.Preconditions.checkNotNull; + +/** + * Customized {@link org.apache.parquet.hadoop.ParquetRecordReader} that support start read from particular position. 
 + */ +public class ParquetRecordReader { + private static final Logger LOG = LoggerFactory.getLogger(ParquetRecordReader.class); + + private ColumnIOFactory columnIOFactory; + private Filter filter; + private MessageType readSchema; + private MessageType fileSchema; + private ReadSupport readSupport; + + // Parquet Materializer convert record to T + private RecordMaterializer recordMaterializer; + private T currentValue; + private long total; + private long current = 0; + private int currentBlock = -1; + private ParquetFileReader reader; + private RecordReader recordReader; + private boolean strictTypeChecking = true; + private boolean skipCorruptedRecord = true; + private long countLoadUntilLastGroup = 0; + private long totalCountLoadedSoFar = 0; + + public ParquetRecordReader(ReadSupport readSupport, MessageType readSchema, Filter filter) { Review comment: Can you order the public methods (roughly) according to the lifecycle of the object? * setSkipCorruptedRecord * initialize * seek * reachEnd * hasNextRecord * nextRecord * getCurrentBlock * getRecordInCurrentBlock * close This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > Add ParquetInputFormat > -- > > Key: FLINK-7243 > URL: https://issues.apache.org/jira/browse/FLINK-7243 > Project: Flink > Issue Type: Sub-task > Components: Table API SQL >Reporter: godfrey he >Assignee: Zhenqiu Huang >Priority: Major > Labels: pull-request-available > > Add a {{ParquetInputFormat}} to read data from an Apache Parquet file. -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16691979#comment-16691979 ] ASF GitHub Bot commented on FLINK-7243: --- fhueske commented on a change in pull request #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483#discussion_r234584466 ## File path: flink-formats/flink-parquet/src/main/java/org/apache/flink/formats/parquet/ParquetInputFormat.java ## @@ -0,0 +1,283 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.formats.parquet; + +import org.apache.flink.api.common.io.CheckpointableInputFormat; +import org.apache.flink.api.common.io.FileInputFormat; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.java.tuple.Tuple2; +import org.apache.flink.api.java.typeutils.RowTypeInfo; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.core.fs.FileInputSplit; +import org.apache.flink.core.fs.Path; +import org.apache.flink.formats.parquet.utils.ParquetRecordReader; +import org.apache.flink.formats.parquet.utils.ParquetSchemaConverter; +import org.apache.flink.formats.parquet.utils.RowReadSupport; +import org.apache.flink.metrics.Counter; +import org.apache.flink.types.Row; +import org.apache.flink.util.Preconditions; + +import org.apache.parquet.ParquetReadOptions; +import org.apache.parquet.filter2.compat.FilterCompat; +import org.apache.parquet.hadoop.ParquetFileReader; +import org.apache.parquet.hadoop.util.HadoopInputFile; +import org.apache.parquet.io.InputFile; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.Type; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import static org.apache.flink.util.Preconditions.checkNotNull; + +/** + * The base InputFormat class to read from Parquet files. + * For specific return types the {@link #convert(Row)} method need to be implemented. + * + * Using {@link ParquetRecordReader} to read files instead of {@link org.apache.flink.core.fs.FSDataInputStream}, + * we override {@link #open(FileInputSplit)} and {@link #close()} to change the behaviors. + * + * @param The type of record to read. 
+ */ +public abstract class ParquetInputFormat + extends FileInputFormat + implements CheckpointableInputFormat> { + + private static final long serialVersionUID = 1L; + + private static final Logger LOG = LoggerFactory.getLogger(ParquetInputFormat.class); + + /** +* The flag to specify whether to skip file splits with wrong schema. +*/ + private boolean skipWrongSchemaFileSplit = false; + + /** +* The flag to specify whether to skip corrupted record. +*/ + private boolean skipCorruptedRecord = false; + + private transient Counter recordConsumed; + + private transient MessageType expectedFileSchema; + + private TypeInformation[] fieldTypes; + + private String[] fieldNames; + + private boolean skipThisSplit = false; + + private transient ParquetRecordReader parquetRecordReader; + + + /** +* Read parquet files with given parquet file schema. +* +* @param path The path of the file to read. +* @param messageType schema of parquet file +*/ + + protected ParquetInputFormat(Path path, MessageType messageType) { + super(path); + this.expectedFileSchema = checkNotNull(messageType, "messageType"); + RowTypeInfo rowTypeInfo = (RowTypeInfo) ParquetSchemaConverter.fromParquetType(expectedFileSchema); + this.fieldTypes = rowTypeInfo.getFieldTypes(); + this.fieldNames = rowTypeInfo.getFieldNames(); + // read whole parquet file as one file split + this.unsplittable = true; + } + + @Override + public void configure(Configuration parameters) { + super.configure(parameters); + + if
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16691983#comment-16691983 ] ASF GitHub Bot commented on FLINK-7243: --- fhueske commented on a change in pull request #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483#discussion_r234579748 ## File path: flink-formats/flink-parquet/src/main/java/org/apache/flink/formats/parquet/ParquetInputFormat.java ## @@ -0,0 +1,283 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.formats.parquet; + +import org.apache.flink.api.common.io.CheckpointableInputFormat; +import org.apache.flink.api.common.io.FileInputFormat; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.java.tuple.Tuple2; +import org.apache.flink.api.java.typeutils.RowTypeInfo; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.core.fs.FileInputSplit; +import org.apache.flink.core.fs.Path; +import org.apache.flink.formats.parquet.utils.ParquetRecordReader; +import org.apache.flink.formats.parquet.utils.ParquetSchemaConverter; +import org.apache.flink.formats.parquet.utils.RowReadSupport; +import org.apache.flink.metrics.Counter; +import org.apache.flink.types.Row; +import org.apache.flink.util.Preconditions; + +import org.apache.parquet.ParquetReadOptions; +import org.apache.parquet.filter2.compat.FilterCompat; +import org.apache.parquet.hadoop.ParquetFileReader; +import org.apache.parquet.hadoop.util.HadoopInputFile; +import org.apache.parquet.io.InputFile; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.Type; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import static org.apache.flink.util.Preconditions.checkNotNull; + +/** + * The base InputFormat class to read from Parquet files. + * For specific return types the {@link #convert(Row)} method need to be implemented. + * + * Using {@link ParquetRecordReader} to read files instead of {@link org.apache.flink.core.fs.FSDataInputStream}, + * we override {@link #open(FileInputSplit)} and {@link #close()} to change the behaviors. + * + * @param The type of record to read. 
+ */ +public abstract class ParquetInputFormat + extends FileInputFormat + implements CheckpointableInputFormat> { + + private static final long serialVersionUID = 1L; + + private static final Logger LOG = LoggerFactory.getLogger(ParquetInputFormat.class); + + /** +* The flag to specify whether to skip file splits with wrong schema. +*/ + private boolean skipWrongSchemaFileSplit = false; + + /** +* The flag to specify whether to skip corrupted record. +*/ + private boolean skipCorruptedRecord = false; + + private transient Counter recordConsumed; + + private transient MessageType expectedFileSchema; + + private TypeInformation[] fieldTypes; + + private String[] fieldNames; + + private boolean skipThisSplit = false; + + private transient ParquetRecordReader parquetRecordReader; + + + /** +* Read parquet files with given parquet file schema. +* +* @param path The path of the file to read. +* @param messageType schema of parquet file +*/ + + protected ParquetInputFormat(Path path, MessageType messageType) { + super(path); + this.expectedFileSchema = checkNotNull(messageType, "messageType"); + RowTypeInfo rowTypeInfo = (RowTypeInfo) ParquetSchemaConverter.fromParquetType(expectedFileSchema); + this.fieldTypes = rowTypeInfo.getFieldTypes(); + this.fieldNames = rowTypeInfo.getFieldNames(); + // read whole parquet file as one file split + this.unsplittable = true; + } + + @Override + public void configure(Configuration parameters) { + super.configure(parameters); + + if
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16691988#comment-16691988 ] ASF GitHub Bot commented on FLINK-7243: --- fhueske commented on a change in pull request #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483#discussion_r234584526 ## File path: flink-formats/flink-parquet/src/main/java/org/apache/flink/formats/parquet/ParquetInputFormat.java ## @@ -0,0 +1,283 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.formats.parquet; + +import org.apache.flink.api.common.io.CheckpointableInputFormat; +import org.apache.flink.api.common.io.FileInputFormat; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.java.tuple.Tuple2; +import org.apache.flink.api.java.typeutils.RowTypeInfo; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.core.fs.FileInputSplit; +import org.apache.flink.core.fs.Path; +import org.apache.flink.formats.parquet.utils.ParquetRecordReader; +import org.apache.flink.formats.parquet.utils.ParquetSchemaConverter; +import org.apache.flink.formats.parquet.utils.RowReadSupport; +import org.apache.flink.metrics.Counter; +import org.apache.flink.types.Row; +import org.apache.flink.util.Preconditions; + +import org.apache.parquet.ParquetReadOptions; +import org.apache.parquet.filter2.compat.FilterCompat; +import org.apache.parquet.hadoop.ParquetFileReader; +import org.apache.parquet.hadoop.util.HadoopInputFile; +import org.apache.parquet.io.InputFile; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.Type; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import static org.apache.flink.util.Preconditions.checkNotNull; + +/** + * The base InputFormat class to read from Parquet files. + * For specific return types the {@link #convert(Row)} method need to be implemented. + * + * Using {@link ParquetRecordReader} to read files instead of {@link org.apache.flink.core.fs.FSDataInputStream}, + * we override {@link #open(FileInputSplit)} and {@link #close()} to change the behaviors. + * + * @param The type of record to read. 
+ */ +public abstract class ParquetInputFormat + extends FileInputFormat + implements CheckpointableInputFormat> { + + private static final long serialVersionUID = 1L; + + private static final Logger LOG = LoggerFactory.getLogger(ParquetInputFormat.class); + + /** +* The flag to specify whether to skip file splits with wrong schema. +*/ + private boolean skipWrongSchemaFileSplit = false; + + /** +* The flag to specify whether to skip corrupted record. +*/ + private boolean skipCorruptedRecord = false; + + private transient Counter recordConsumed; + + private transient MessageType expectedFileSchema; + + private TypeInformation[] fieldTypes; + + private String[] fieldNames; + + private boolean skipThisSplit = false; + + private transient ParquetRecordReader parquetRecordReader; + + + /** +* Read parquet files with given parquet file schema. +* +* @param path The path of the file to read. +* @param messageType schema of parquet file +*/ + + protected ParquetInputFormat(Path path, MessageType messageType) { + super(path); + this.expectedFileSchema = checkNotNull(messageType, "messageType"); + RowTypeInfo rowTypeInfo = (RowTypeInfo) ParquetSchemaConverter.fromParquetType(expectedFileSchema); + this.fieldTypes = rowTypeInfo.getFieldTypes(); + this.fieldNames = rowTypeInfo.getFieldNames(); + // read whole parquet file as one file split + this.unsplittable = true; + } + + @Override + public void configure(Configuration parameters) { + super.configure(parameters); + + if
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16691977#comment-16691977 ] ASF GitHub Bot commented on FLINK-7243: --- fhueske commented on a change in pull request #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483#discussion_r234578727 ## File path: flink-formats/flink-parquet/src/main/java/org/apache/flink/formats/parquet/ParquetInputFormat.java ## @@ -0,0 +1,283 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.formats.parquet; + +import org.apache.flink.api.common.io.CheckpointableInputFormat; +import org.apache.flink.api.common.io.FileInputFormat; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.java.tuple.Tuple2; +import org.apache.flink.api.java.typeutils.RowTypeInfo; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.core.fs.FileInputSplit; +import org.apache.flink.core.fs.Path; +import org.apache.flink.formats.parquet.utils.ParquetRecordReader; +import org.apache.flink.formats.parquet.utils.ParquetSchemaConverter; +import org.apache.flink.formats.parquet.utils.RowReadSupport; +import org.apache.flink.metrics.Counter; +import org.apache.flink.types.Row; +import org.apache.flink.util.Preconditions; + +import org.apache.parquet.ParquetReadOptions; +import org.apache.parquet.filter2.compat.FilterCompat; +import org.apache.parquet.hadoop.ParquetFileReader; +import org.apache.parquet.hadoop.util.HadoopInputFile; +import org.apache.parquet.io.InputFile; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.Type; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import static org.apache.flink.util.Preconditions.checkNotNull; + +/** + * The base InputFormat class to read from Parquet files. + * For specific return types the {@link #convert(Row)} method need to be implemented. + * + * Using {@link ParquetRecordReader} to read files instead of {@link org.apache.flink.core.fs.FSDataInputStream}, + * we override {@link #open(FileInputSplit)} and {@link #close()} to change the behaviors. + * + * @param The type of record to read. 
+ */ +public abstract class ParquetInputFormat + extends FileInputFormat + implements CheckpointableInputFormat> { + + private static final long serialVersionUID = 1L; + + private static final Logger LOG = LoggerFactory.getLogger(ParquetInputFormat.class); + + /** +* The flag to specify whether to skip file splits with wrong schema. +*/ + private boolean skipWrongSchemaFileSplit = false; + + /** +* The flag to specify whether to skip corrupted record. +*/ + private boolean skipCorruptedRecord = false; + + private transient Counter recordConsumed; + + private transient MessageType expectedFileSchema; + + private TypeInformation[] fieldTypes; + + private String[] fieldNames; + + private boolean skipThisSplit = false; + + private transient ParquetRecordReader parquetRecordReader; + + + /** +* Read parquet files with given parquet file schema. +* +* @param path The path of the file to read. +* @param messageType schema of parquet file +*/ + + protected ParquetInputFormat(Path path, MessageType messageType) { + super(path); + this.expectedFileSchema = checkNotNull(messageType, "messageType"); + RowTypeInfo rowTypeInfo = (RowTypeInfo) ParquetSchemaConverter.fromParquetType(expectedFileSchema); + this.fieldTypes = rowTypeInfo.getFieldTypes(); + this.fieldNames = rowTypeInfo.getFieldNames(); + // read whole parquet file as one file split + this.unsplittable = true; + } + + @Override + public void configure(Configuration parameters) { + super.configure(parameters); + + if
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16691985#comment-16691985 ] ASF GitHub Bot commented on FLINK-7243: --- fhueske commented on a change in pull request #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483#discussion_r234644042 ## File path: flink-formats/flink-parquet/src/test/java/org/apache/flink/formats/parquet/ParquetInputFormatTest.java ## @@ -0,0 +1,250 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.formats.parquet; + +import org.apache.flink.api.common.functions.RuntimeContext; +import org.apache.flink.api.common.typeinfo.BasicArrayTypeInfo; +import org.apache.flink.api.common.typeinfo.BasicTypeInfo; +import org.apache.flink.api.java.tuple.Tuple2; +import org.apache.flink.api.java.tuple.Tuple3; +import org.apache.flink.api.java.typeutils.PojoField; +import org.apache.flink.api.java.typeutils.PojoTypeInfo; +import org.apache.flink.core.fs.FileInputSplit; +import org.apache.flink.core.fs.Path; +import org.apache.flink.formats.parquet.generated.SimpleRecord; +import org.apache.flink.formats.parquet.pojo.PojoSimpleRecord; +import org.apache.flink.formats.parquet.utils.TestUtil; +import org.apache.flink.runtime.metrics.groups.UnregisteredMetricGroups; +import org.apache.flink.types.Row; +import org.apache.flink.util.InstantiationUtil; + +import org.apache.avro.generic.IndexedRecord; +import org.apache.avro.specific.SpecificRecord; +import org.apache.parquet.avro.AvroSchemaConverter; +import org.apache.parquet.schema.MessageType; +import org.junit.ClassRule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; +import org.mockito.Mockito; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.Map; + +import static org.junit.Assert.assertArrayEquals; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; + +/** + * Simple test case for reading {@link org.apache.flink.types.Row}, Map and Pojo from Parquet files. 
+ */ +public class ParquetInputFormatTest { + private static final AvroSchemaConverter SCHEMA_CONVERTER = new AvroSchemaConverter(); + + @ClassRule + public static TemporaryFolder temp = new TemporaryFolder(); + + @Test + public void testReadRowFromSimpleRecord() throws IOException { + temp.create(); + Tuple3, SpecificRecord, Row> simple = TestUtil.getSimpleRecordTestData(); + Path path = TestUtil.createTempParquetFile(temp, TestUtil.SIMPLE_SCHEMA, Collections.singletonList(simple.f1)); + MessageType simpleType = SCHEMA_CONVERTER.convert(TestUtil.SIMPLE_SCHEMA); + + ParquetRowInputFormat rowInputFormat = new ParquetRowInputFormat(path, simpleType); + + RuntimeContext mockContext = Mockito.mock(RuntimeContext.class); + Mockito.doReturn(UnregisteredMetricGroups.createUnregisteredOperatorMetricGroup()) + .when(mockContext).getMetricGroup(); + rowInputFormat.setRuntimeContext(mockContext); + + FileInputSplit[] splits = rowInputFormat.createInputSplits(1); + assertEquals(1, splits.length); + rowInputFormat.open(splits[0]); + + Row row = rowInputFormat.nextRecord(null); + assertNotNull(row); + assertEquals(simple.f2, row); + } + + @Test + public void testFailureRecoverySimpleRecord() throws IOException { Review comment: The recovery point is in the first group. Therefore, the test will never enter the while loop in `ParquetRecordReader.seek()`. We should also add a test case that covers that code. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16691992#comment-16691992 ] ASF GitHub Bot commented on FLINK-7243: --- fhueske commented on a change in pull request #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483#discussion_r234646978 ## File path: flink-formats/flink-parquet/src/main/java/org/apache/flink/formats/parquet/utils/ParquetRecordReader.java ## @@ -0,0 +1,221 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.formats.parquet.utils; + +import org.apache.hadoop.conf.Configuration; +import org.apache.parquet.column.page.PageReadStore; +import org.apache.parquet.filter2.compat.FilterCompat; +import org.apache.parquet.filter2.compat.FilterCompat.Filter; +import org.apache.parquet.hadoop.ParquetFileReader; +import org.apache.parquet.hadoop.api.InitContext; +import org.apache.parquet.hadoop.api.ReadSupport; +import org.apache.parquet.hadoop.metadata.BlockMetaData; +import org.apache.parquet.hadoop.metadata.FileMetaData; +import org.apache.parquet.io.ColumnIOFactory; +import org.apache.parquet.io.MessageColumnIO; +import org.apache.parquet.io.RecordReader; +import org.apache.parquet.io.api.RecordMaterializer; +import org.apache.parquet.io.api.RecordMaterializer.RecordMaterializationException; +import org.apache.parquet.schema.MessageType; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.annotation.CheckReturnValue; +import javax.annotation.meta.When; + +import java.io.IOException; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; + + +import static org.apache.flink.util.Preconditions.checkNotNull; + +/** + * Customized {@link org.apache.parquet.hadoop.ParquetRecordReader} that support start read from particular position. 
+ */ +public class ParquetRecordReader { + private static final Logger LOG = LoggerFactory.getLogger(ParquetRecordReader.class); + + private ColumnIOFactory columnIOFactory; + private Filter filter; + private MessageType readSchema; + private MessageType fileSchema; + private ReadSupport readSupport; + + // Parquet Materializer convert record to T + private RecordMaterializer recordMaterializer; + private T currentValue; + private long total; + private long current = 0; + private int currentBlock = -1; + private ParquetFileReader reader; + private RecordReader recordReader; + private boolean strictTypeChecking = true; + private boolean skipCorruptedRecord = true; + private long countLoadUntilLastGroup = 0; + private long totalCountLoadedSoFar = 0; Review comment: Do we need these four different row counters? Wouldn't it be sufficient to have the number of rows returned for the current block and the total number of rows of the current block? `reachEnd` seems to be the only place that needs the total number of rows. It could be changed to check if we are in the last block and read all its records. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > Add ParquetInputFormat > -- > > Key: FLINK-7243 > URL: https://issues.apache.org/jira/browse/FLINK-7243 > Project: Flink > Issue Type: Sub-task > Components: Table API SQL >Reporter: godfrey he >Assignee: Zhenqiu Huang >Priority: Major > Labels: pull-request-available > > Add a {{ParquetInputFormat}} to read data from a Apache Parquet file. -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16692002#comment-16692002 ] ASF GitHub Bot commented on FLINK-7243: --- fhueske commented on a change in pull request #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483#discussion_r234610226 ## File path: flink-formats/flink-parquet/src/main/java/org/apache/flink/formats/parquet/ParquetRowInputFormat.java ## @@ -0,0 +1,87 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.formats.parquet; + +import org.apache.flink.api.common.typeinfo.SqlTimeTypeInfo; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.java.typeutils.ResultTypeQueryable; +import org.apache.flink.api.java.typeutils.RowTypeInfo; +import org.apache.flink.core.fs.Path; +import org.apache.flink.types.Row; + +import org.apache.parquet.schema.MessageType; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.sql.Timestamp; + + +/** + * A subclass of {@link ParquetInputFormat} to read from Parquet files and convert to {@link Row}. + * It is mainly used to integrate with table API and batch SQL. 
+ */ +public class ParquetRowInputFormat extends ParquetInputFormat implements ResultTypeQueryable { + private static final long serialVersionUID = 11L; + private static final Logger LOG = LoggerFactory.getLogger(ParquetRowInputFormat.class); + private boolean timeStampRewrite; + private RowTypeInfo returnType; + private int tsIndex; + + public ParquetRowInputFormat(Path path, MessageType messageType) { + super(path, messageType); + this.returnType = new RowTypeInfo(getFieldTypes(), getFieldNames()); + this.timeStampRewrite = false; + } + + @Override + public TypeInformation getProducedType() { + return new RowTypeInfo(getFieldTypes(), getFieldNames()); Review comment: The return type needs to be adjusted if timestamp rewriting is enabled This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > Add ParquetInputFormat > -- > > Key: FLINK-7243 > URL: https://issues.apache.org/jira/browse/FLINK-7243 > Project: Flink > Issue Type: Sub-task > Components: Table API SQL >Reporter: godfrey he >Assignee: Zhenqiu Huang >Priority: Major > Labels: pull-request-available > > Add a {{ParquetInputFormat}} to read data from a Apache Parquet file. -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16692001#comment-16692001 ] ASF GitHub Bot commented on FLINK-7243: --- fhueske commented on a change in pull request #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483#discussion_r234687019 ## File path: flink-formats/flink-parquet/src/main/java/org/apache/flink/formats/parquet/utils/ParquetRecordReader.java ## @@ -0,0 +1,221 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.formats.parquet.utils; + +import org.apache.hadoop.conf.Configuration; +import org.apache.parquet.column.page.PageReadStore; +import org.apache.parquet.filter2.compat.FilterCompat; +import org.apache.parquet.filter2.compat.FilterCompat.Filter; +import org.apache.parquet.hadoop.ParquetFileReader; +import org.apache.parquet.hadoop.api.InitContext; +import org.apache.parquet.hadoop.api.ReadSupport; +import org.apache.parquet.hadoop.metadata.BlockMetaData; +import org.apache.parquet.hadoop.metadata.FileMetaData; +import org.apache.parquet.io.ColumnIOFactory; +import org.apache.parquet.io.MessageColumnIO; +import org.apache.parquet.io.RecordReader; +import org.apache.parquet.io.api.RecordMaterializer; +import org.apache.parquet.io.api.RecordMaterializer.RecordMaterializationException; +import org.apache.parquet.schema.MessageType; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.annotation.CheckReturnValue; +import javax.annotation.meta.When; + +import java.io.IOException; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; + + +import static org.apache.flink.util.Preconditions.checkNotNull; + +/** + * Customized {@link org.apache.parquet.hadoop.ParquetRecordReader} that support start read from particular position. 
+ */ +public class ParquetRecordReader { + private static final Logger LOG = LoggerFactory.getLogger(ParquetRecordReader.class); + + private ColumnIOFactory columnIOFactory; + private Filter filter; + private MessageType readSchema; + private MessageType fileSchema; + private ReadSupport readSupport; + + // Parquet Materializer convert record to T + private RecordMaterializer recordMaterializer; + private T currentValue; + private long total; + private long current = 0; + private int currentBlock = -1; + private ParquetFileReader reader; + private RecordReader recordReader; + private boolean strictTypeChecking = true; + private boolean skipCorruptedRecord = true; + private long countLoadUntilLastGroup = 0; + private long totalCountLoadedSoFar = 0; + + public ParquetRecordReader(ReadSupport readSupport, MessageType readSchema, Filter filter) { + this.filter = checkNotNull(filter, "readSupport"); + this.readSupport = checkNotNull(readSupport, "readSchema"); + this.readSchema = checkNotNull(readSchema, "filter"); + } + + public ParquetRecordReader(ReadSupport readSupport, MessageType readSchema) { + this(readSupport, readSchema, FilterCompat.NOOP); + } + + public void initialize(ParquetFileReader reader, Configuration configuration) { + this.reader = reader; + FileMetaData parquetFileMetadata = reader.getFooter().getFileMetaData(); + // real schema of parquet file + this.fileSchema = parquetFileMetadata.getSchema(); + Map fileMetadata = parquetFileMetadata.getKeyValueMetaData(); + ReadSupport.ReadContext readContext = readSupport.init(new InitContext( + configuration, toSetMultiMap(fileMetadata), readSchema)); + + this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy()); + this.recordMaterializer = readSupport.prepareForRead( + configuration, fileMetadata, readSchema, readContext); + this.total =
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16691995#comment-16691995 ] ASF GitHub Bot commented on FLINK-7243: --- fhueske commented on a change in pull request #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483#discussion_r234685198 ## File path: flink-formats/flink-parquet/src/main/java/org/apache/flink/formats/parquet/utils/ParquetRecordReader.java ## @@ -0,0 +1,221 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.formats.parquet.utils; + +import org.apache.hadoop.conf.Configuration; +import org.apache.parquet.column.page.PageReadStore; +import org.apache.parquet.filter2.compat.FilterCompat; +import org.apache.parquet.filter2.compat.FilterCompat.Filter; +import org.apache.parquet.hadoop.ParquetFileReader; +import org.apache.parquet.hadoop.api.InitContext; +import org.apache.parquet.hadoop.api.ReadSupport; +import org.apache.parquet.hadoop.metadata.BlockMetaData; +import org.apache.parquet.hadoop.metadata.FileMetaData; +import org.apache.parquet.io.ColumnIOFactory; +import org.apache.parquet.io.MessageColumnIO; +import org.apache.parquet.io.RecordReader; +import org.apache.parquet.io.api.RecordMaterializer; +import org.apache.parquet.io.api.RecordMaterializer.RecordMaterializationException; +import org.apache.parquet.schema.MessageType; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.annotation.CheckReturnValue; +import javax.annotation.meta.When; + +import java.io.IOException; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; + + +import static org.apache.flink.util.Preconditions.checkNotNull; + +/** + * Customized {@link org.apache.parquet.hadoop.ParquetRecordReader} that support start read from particular position. 
+ */ +public class ParquetRecordReader { + private static final Logger LOG = LoggerFactory.getLogger(ParquetRecordReader.class); + + private ColumnIOFactory columnIOFactory; + private Filter filter; + private MessageType readSchema; + private MessageType fileSchema; + private ReadSupport readSupport; + + // Parquet Materializer convert record to T + private RecordMaterializer recordMaterializer; + private T currentValue; + private long total; + private long current = 0; + private int currentBlock = -1; + private ParquetFileReader reader; + private RecordReader recordReader; + private boolean strictTypeChecking = true; + private boolean skipCorruptedRecord = true; + private long countLoadUntilLastGroup = 0; + private long totalCountLoadedSoFar = 0; + + public ParquetRecordReader(ReadSupport readSupport, MessageType readSchema, Filter filter) { + this.filter = checkNotNull(filter, "readSupport"); + this.readSupport = checkNotNull(readSupport, "readSchema"); + this.readSchema = checkNotNull(readSchema, "filter"); + } + + public ParquetRecordReader(ReadSupport readSupport, MessageType readSchema) { + this(readSupport, readSchema, FilterCompat.NOOP); + } + + public void initialize(ParquetFileReader reader, Configuration configuration) { + this.reader = reader; + FileMetaData parquetFileMetadata = reader.getFooter().getFileMetaData(); + // real schema of parquet file + this.fileSchema = parquetFileMetadata.getSchema(); + Map fileMetadata = parquetFileMetadata.getKeyValueMetaData(); + ReadSupport.ReadContext readContext = readSupport.init(new InitContext( + configuration, toSetMultiMap(fileMetadata), readSchema)); + + this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy()); + this.recordMaterializer = readSupport.prepareForRead( + configuration, fileMetadata, readSchema, readContext); + this.total =
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16691998#comment-16691998 ] ASF GitHub Bot commented on FLINK-7243: --- fhueske commented on a change in pull request #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483#discussion_r234639399 ## File path: flink-formats/flink-parquet/src/main/java/org/apache/flink/formats/parquet/utils/ParquetRecordReader.java ## @@ -0,0 +1,221 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.formats.parquet.utils; + +import org.apache.hadoop.conf.Configuration; +import org.apache.parquet.column.page.PageReadStore; +import org.apache.parquet.filter2.compat.FilterCompat; +import org.apache.parquet.filter2.compat.FilterCompat.Filter; +import org.apache.parquet.hadoop.ParquetFileReader; +import org.apache.parquet.hadoop.api.InitContext; +import org.apache.parquet.hadoop.api.ReadSupport; +import org.apache.parquet.hadoop.metadata.BlockMetaData; +import org.apache.parquet.hadoop.metadata.FileMetaData; +import org.apache.parquet.io.ColumnIOFactory; +import org.apache.parquet.io.MessageColumnIO; +import org.apache.parquet.io.RecordReader; +import org.apache.parquet.io.api.RecordMaterializer; +import org.apache.parquet.io.api.RecordMaterializer.RecordMaterializationException; +import org.apache.parquet.schema.MessageType; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.annotation.CheckReturnValue; +import javax.annotation.meta.When; + +import java.io.IOException; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; + + +import static org.apache.flink.util.Preconditions.checkNotNull; + +/** + * Customized {@link org.apache.parquet.hadoop.ParquetRecordReader} that support start read from particular position. 
+ */ +public class ParquetRecordReader { + private static final Logger LOG = LoggerFactory.getLogger(ParquetRecordReader.class); + + private ColumnIOFactory columnIOFactory; + private Filter filter; + private MessageType readSchema; + private MessageType fileSchema; + private ReadSupport readSupport; + + // Parquet Materializer convert record to T + private RecordMaterializer recordMaterializer; + private T currentValue; + private long total; + private long current = 0; + private int currentBlock = -1; + private ParquetFileReader reader; + private RecordReader recordReader; + private boolean strictTypeChecking = true; + private boolean skipCorruptedRecord = true; + private long countLoadUntilLastGroup = 0; + private long totalCountLoadedSoFar = 0; Review comment: rename to `numRowsUpToCurrentBlock` This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > Add ParquetInputFormat > -- > > Key: FLINK-7243 > URL: https://issues.apache.org/jira/browse/FLINK-7243 > Project: Flink > Issue Type: Sub-task > Components: Table API SQL >Reporter: godfrey he >Assignee: Zhenqiu Huang >Priority: Major > Labels: pull-request-available > > Add a {{ParquetInputFormat}} to read data from a Apache Parquet file. -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16691996#comment-16691996 ] ASF GitHub Bot commented on FLINK-7243: --- fhueske commented on a change in pull request #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483#discussion_r234637283 ## File path: flink-formats/flink-parquet/src/main/java/org/apache/flink/formats/parquet/utils/ParquetRecordReader.java ## @@ -0,0 +1,221 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.formats.parquet.utils; + +import org.apache.hadoop.conf.Configuration; +import org.apache.parquet.column.page.PageReadStore; +import org.apache.parquet.filter2.compat.FilterCompat; +import org.apache.parquet.filter2.compat.FilterCompat.Filter; +import org.apache.parquet.hadoop.ParquetFileReader; +import org.apache.parquet.hadoop.api.InitContext; +import org.apache.parquet.hadoop.api.ReadSupport; +import org.apache.parquet.hadoop.metadata.BlockMetaData; +import org.apache.parquet.hadoop.metadata.FileMetaData; +import org.apache.parquet.io.ColumnIOFactory; +import org.apache.parquet.io.MessageColumnIO; +import org.apache.parquet.io.RecordReader; +import org.apache.parquet.io.api.RecordMaterializer; +import org.apache.parquet.io.api.RecordMaterializer.RecordMaterializationException; +import org.apache.parquet.schema.MessageType; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.annotation.CheckReturnValue; +import javax.annotation.meta.When; + +import java.io.IOException; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; + + +import static org.apache.flink.util.Preconditions.checkNotNull; + +/** + * Customized {@link org.apache.parquet.hadoop.ParquetRecordReader} that support start read from particular position. 
+ */ +public class ParquetRecordReader { + private static final Logger LOG = LoggerFactory.getLogger(ParquetRecordReader.class); + + private ColumnIOFactory columnIOFactory; + private Filter filter; + private MessageType readSchema; + private MessageType fileSchema; + private ReadSupport readSupport; + + // Parquet Materializer convert record to T + private RecordMaterializer recordMaterializer; + private T currentValue; + private long total; + private long current = 0; + private int currentBlock = -1; + private ParquetFileReader reader; + private RecordReader recordReader; + private boolean strictTypeChecking = true; + private boolean skipCorruptedRecord = true; + private long countLoadUntilLastGroup = 0; + private long totalCountLoadedSoFar = 0; + + public ParquetRecordReader(ReadSupport readSupport, MessageType readSchema, Filter filter) { + this.filter = checkNotNull(filter, "readSupport"); + this.readSupport = checkNotNull(readSupport, "readSchema"); + this.readSchema = checkNotNull(readSchema, "filter"); + } + + public ParquetRecordReader(ReadSupport readSupport, MessageType readSchema) { + this(readSupport, readSchema, FilterCompat.NOOP); + } + + public void initialize(ParquetFileReader reader, Configuration configuration) { + this.reader = reader; + FileMetaData parquetFileMetadata = reader.getFooter().getFileMetaData(); + // real schema of parquet file + this.fileSchema = parquetFileMetadata.getSchema(); + Map fileMetadata = parquetFileMetadata.getKeyValueMetaData(); + ReadSupport.ReadContext readContext = readSupport.init(new InitContext( + configuration, toSetMultiMap(fileMetadata), readSchema)); + + this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy()); + this.recordMaterializer = readSupport.prepareForRead( + configuration, fileMetadata, readSchema, readContext); + this.total =
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16691975#comment-16691975 ] ASF GitHub Bot commented on FLINK-7243: --- fhueske commented on a change in pull request #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483#discussion_r234577085 ## File path: flink-formats/flink-parquet/src/main/java/org/apache/flink/formats/parquet/ParquetInputFormat.java ## @@ -0,0 +1,283 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.formats.parquet; + +import org.apache.flink.api.common.io.CheckpointableInputFormat; +import org.apache.flink.api.common.io.FileInputFormat; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.java.tuple.Tuple2; +import org.apache.flink.api.java.typeutils.RowTypeInfo; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.core.fs.FileInputSplit; +import org.apache.flink.core.fs.Path; +import org.apache.flink.formats.parquet.utils.ParquetRecordReader; +import org.apache.flink.formats.parquet.utils.ParquetSchemaConverter; +import org.apache.flink.formats.parquet.utils.RowReadSupport; +import org.apache.flink.metrics.Counter; +import org.apache.flink.types.Row; +import org.apache.flink.util.Preconditions; + +import org.apache.parquet.ParquetReadOptions; +import org.apache.parquet.filter2.compat.FilterCompat; +import org.apache.parquet.hadoop.ParquetFileReader; +import org.apache.parquet.hadoop.util.HadoopInputFile; +import org.apache.parquet.io.InputFile; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.Type; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import static org.apache.flink.util.Preconditions.checkNotNull; + +/** + * The base InputFormat class to read from Parquet files. + * For specific return types the {@link #convert(Row)} method need to be implemented. + * + * Using {@link ParquetRecordReader} to read files instead of {@link org.apache.flink.core.fs.FSDataInputStream}, + * we override {@link #open(FileInputSplit)} and {@link #close()} to change the behaviors. + * + * @param The type of record to read. 
+ */ +public abstract class ParquetInputFormat + extends FileInputFormat + implements CheckpointableInputFormat> { + + private static final long serialVersionUID = 1L; + + private static final Logger LOG = LoggerFactory.getLogger(ParquetInputFormat.class); + + /** +* The flag to specify whether to skip file splits with wrong schema. +*/ + private boolean skipWrongSchemaFileSplit = false; + + /** +* The flag to specify whether to skip corrupted record. +*/ + private boolean skipCorruptedRecord = false; + + private transient Counter recordConsumed; + + private transient MessageType expectedFileSchema; + + private TypeInformation[] fieldTypes; + + private String[] fieldNames; + + private boolean skipThisSplit = false; + + private transient ParquetRecordReader parquetRecordReader; + + + /** +* Read parquet files with given parquet file schema. +* +* @param path The path of the file to read. +* @param messageType schema of parquet file +*/ + + protected ParquetInputFormat(Path path, MessageType messageType) { + super(path); + this.expectedFileSchema = checkNotNull(messageType, "messageType"); + RowTypeInfo rowTypeInfo = (RowTypeInfo) ParquetSchemaConverter.fromParquetType(expectedFileSchema); + this.fieldTypes = rowTypeInfo.getFieldTypes(); + this.fieldNames = rowTypeInfo.getFieldNames(); + // read whole parquet file as one file split + this.unsplittable = true; + } + + @Override + public void configure(Configuration parameters) { + super.configure(parameters); + + if
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16691984#comment-16691984 ] ASF GitHub Bot commented on FLINK-7243: --- fhueske commented on a change in pull request #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483#discussion_r234599870 ## File path: flink-formats/flink-parquet/src/main/java/org/apache/flink/formats/parquet/ParquetInputFormat.java ## @@ -0,0 +1,283 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.formats.parquet; + +import org.apache.flink.api.common.io.CheckpointableInputFormat; +import org.apache.flink.api.common.io.FileInputFormat; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.java.tuple.Tuple2; +import org.apache.flink.api.java.typeutils.RowTypeInfo; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.core.fs.FileInputSplit; +import org.apache.flink.core.fs.Path; +import org.apache.flink.formats.parquet.utils.ParquetRecordReader; +import org.apache.flink.formats.parquet.utils.ParquetSchemaConverter; +import org.apache.flink.formats.parquet.utils.RowReadSupport; +import org.apache.flink.metrics.Counter; +import org.apache.flink.types.Row; +import org.apache.flink.util.Preconditions; + +import org.apache.parquet.ParquetReadOptions; +import org.apache.parquet.filter2.compat.FilterCompat; +import org.apache.parquet.hadoop.ParquetFileReader; +import org.apache.parquet.hadoop.util.HadoopInputFile; +import org.apache.parquet.io.InputFile; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.Type; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import static org.apache.flink.util.Preconditions.checkNotNull; + +/** + * The base InputFormat class to read from Parquet files. + * For specific return types the {@link #convert(Row)} method need to be implemented. + * + * Using {@link ParquetRecordReader} to read files instead of {@link org.apache.flink.core.fs.FSDataInputStream}, + * we override {@link #open(FileInputSplit)} and {@link #close()} to change the behaviors. + * + * @param The type of record to read. 
+ */ +public abstract class ParquetInputFormat + extends FileInputFormat + implements CheckpointableInputFormat> { + + private static final long serialVersionUID = 1L; + + private static final Logger LOG = LoggerFactory.getLogger(ParquetInputFormat.class); + + /** +* The flag to specify whether to skip file splits with wrong schema. +*/ + private boolean skipWrongSchemaFileSplit = false; + + /** +* The flag to specify whether to skip corrupted record. +*/ + private boolean skipCorruptedRecord = false; + + private transient Counter recordConsumed; + + private transient MessageType expectedFileSchema; + + private TypeInformation[] fieldTypes; + + private String[] fieldNames; + + private boolean skipThisSplit = false; + + private transient ParquetRecordReader parquetRecordReader; + + + /** +* Read parquet files with given parquet file schema. +* +* @param path The path of the file to read. +* @param messageType schema of parquet file +*/ + + protected ParquetInputFormat(Path path, MessageType messageType) { + super(path); + this.expectedFileSchema = checkNotNull(messageType, "messageType"); + RowTypeInfo rowTypeInfo = (RowTypeInfo) ParquetSchemaConverter.fromParquetType(expectedFileSchema); + this.fieldTypes = rowTypeInfo.getFieldTypes(); + this.fieldNames = rowTypeInfo.getFieldNames(); + // read whole parquet file as one file split + this.unsplittable = true; + } + + @Override + public void configure(Configuration parameters) { + super.configure(parameters); + + if
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16691981#comment-16691981 ] ASF GitHub Bot commented on FLINK-7243: --- fhueske commented on a change in pull request #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483#discussion_r234602375 ## File path: flink-formats/flink-parquet/src/main/java/org/apache/flink/formats/parquet/ParquetInputFormat.java ## @@ -0,0 +1,191 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.formats.parquet; + +import org.apache.flink.api.common.io.CheckpointableInputFormat; +import org.apache.flink.api.common.io.FileInputFormat; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.java.tuple.Tuple2; +import org.apache.flink.api.java.typeutils.RowTypeInfo; +import org.apache.flink.core.fs.FileInputSplit; +import org.apache.flink.core.fs.Path; +import org.apache.flink.formats.parquet.utils.ParquetRecordReader; +import org.apache.flink.formats.parquet.utils.ParquetSchemaConverter; +import org.apache.flink.formats.parquet.utils.RowReadSupport; +import org.apache.flink.metrics.Counter; +import org.apache.flink.types.Row; +import org.apache.flink.util.Preconditions; + +import org.apache.parquet.ParquetReadOptions; +import org.apache.parquet.filter2.compat.FilterCompat; +import org.apache.parquet.hadoop.ParquetFileReader; +import org.apache.parquet.hadoop.util.HadoopInputFile; +import org.apache.parquet.io.InputFile; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.Type; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +/** + * The base InputFormat class to read from Parquet files. + * For specific return types the {@link #convert(Row)} method need to be implemented. + * + * Using {@link ParquetRecordReader} to Read files instead of {@link org.apache.flink.core.fs.FSDataInputStream}, + * we override {@link #open(FileInputSplit)} and {@link #close()} to change the behaviors. + * + * @param The type of record to read. 
+ */ +public abstract class ParquetInputFormat extends FileInputFormat implements + CheckpointableInputFormat> { + + private static final long serialVersionUID = 1L; + + private static final Logger LOG = LoggerFactory.getLogger(ParquetInputFormat.class); + + private transient Counter recordConsumed; + + protected RowTypeInfo readType; + + protected boolean isStandard; + + protected final TypeInformation[] fieldTypes; + + protected final String[] fieldNames; + + protected transient ParquetRecordReader parquetRecordReader; + + protected transient long recordsReadSinceLastSync; + + protected long lastSyncedBlock = -1L; + + protected ParquetInputFormat(Path path, TypeInformation[] fieldTypes, String[] fieldNames, boolean isStandard) { + super(path); + this.readType = new RowTypeInfo(fieldTypes, fieldNames); + this.fieldTypes = readType.getFieldTypes(); + this.fieldNames = readType.getFieldNames(); + this.unsplittable = true; + this.isStandard = isStandard; + } + + @Override + public Tuple2 getCurrentState() { + return new Tuple2<>(this.lastSyncedBlock, this.recordsReadSinceLastSync); + } + + @Override + public void open(FileInputSplit split) throws IOException { + org.apache.hadoop.conf.Configuration configuration = new org.apache.hadoop.conf.Configuration(); + InputFile inputFile = + HadoopInputFile.fromPath(new org.apache.hadoop.fs.Path(split.getPath().toUri()), configuration); + ParquetReadOptions options = ParquetReadOptions.builder().build(); + ParquetFileReader fileReader = new ParquetFileReader(inputFile, options); + MessageType schema = fileReader.getFileMetaData().getSchema(); + MessageType readSchema
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16691991#comment-16691991 ] ASF GitHub Bot commented on FLINK-7243: --- fhueske commented on a change in pull request #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483#discussion_r234637498 ## File path: flink-formats/flink-parquet/src/main/java/org/apache/flink/formats/parquet/utils/ParquetRecordReader.java ## @@ -0,0 +1,221 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.formats.parquet.utils; + +import org.apache.hadoop.conf.Configuration; +import org.apache.parquet.column.page.PageReadStore; +import org.apache.parquet.filter2.compat.FilterCompat; +import org.apache.parquet.filter2.compat.FilterCompat.Filter; +import org.apache.parquet.hadoop.ParquetFileReader; +import org.apache.parquet.hadoop.api.InitContext; +import org.apache.parquet.hadoop.api.ReadSupport; +import org.apache.parquet.hadoop.metadata.BlockMetaData; +import org.apache.parquet.hadoop.metadata.FileMetaData; +import org.apache.parquet.io.ColumnIOFactory; +import org.apache.parquet.io.MessageColumnIO; +import org.apache.parquet.io.RecordReader; +import org.apache.parquet.io.api.RecordMaterializer; +import org.apache.parquet.io.api.RecordMaterializer.RecordMaterializationException; +import org.apache.parquet.schema.MessageType; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.annotation.CheckReturnValue; +import javax.annotation.meta.When; + +import java.io.IOException; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; + + +import static org.apache.flink.util.Preconditions.checkNotNull; + +/** + * Customized {@link org.apache.parquet.hadoop.ParquetRecordReader} that support start read from particular position. 
+ */ +public class ParquetRecordReader { + private static final Logger LOG = LoggerFactory.getLogger(ParquetRecordReader.class); + + private ColumnIOFactory columnIOFactory; + private Filter filter; + private MessageType readSchema; + private MessageType fileSchema; + private ReadSupport readSupport; + + // Parquet Materializer convert record to T + private RecordMaterializer recordMaterializer; + private T currentValue; + private long total; + private long current = 0; + private int currentBlock = -1; + private ParquetFileReader reader; + private RecordReader recordReader; + private boolean strictTypeChecking = true; + private boolean skipCorruptedRecord = true; + private long countLoadUntilLastGroup = 0; + private long totalCountLoadedSoFar = 0; + + public ParquetRecordReader(ReadSupport readSupport, MessageType readSchema, Filter filter) { + this.filter = checkNotNull(filter, "readSupport"); + this.readSupport = checkNotNull(readSupport, "readSchema"); + this.readSchema = checkNotNull(readSchema, "filter"); + } + + public ParquetRecordReader(ReadSupport readSupport, MessageType readSchema) { + this(readSupport, readSchema, FilterCompat.NOOP); + } + + public void initialize(ParquetFileReader reader, Configuration configuration) { + this.reader = reader; + FileMetaData parquetFileMetadata = reader.getFooter().getFileMetaData(); + // real schema of parquet file + this.fileSchema = parquetFileMetadata.getSchema(); + Map fileMetadata = parquetFileMetadata.getKeyValueMetaData(); + ReadSupport.ReadContext readContext = readSupport.init(new InitContext( + configuration, toSetMultiMap(fileMetadata), readSchema)); + + this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy()); + this.recordMaterializer = readSupport.prepareForRead( + configuration, fileMetadata, readSchema, readContext); + this.total =
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16691982#comment-16691982 ] ASF GitHub Bot commented on FLINK-7243: --- fhueske commented on a change in pull request #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483#discussion_r234584698 ## File path: flink-formats/flink-parquet/src/main/java/org/apache/flink/formats/parquet/ParquetInputFormat.java ## @@ -0,0 +1,283 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.formats.parquet; + +import org.apache.flink.api.common.io.CheckpointableInputFormat; +import org.apache.flink.api.common.io.FileInputFormat; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.java.tuple.Tuple2; +import org.apache.flink.api.java.typeutils.RowTypeInfo; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.core.fs.FileInputSplit; +import org.apache.flink.core.fs.Path; +import org.apache.flink.formats.parquet.utils.ParquetRecordReader; +import org.apache.flink.formats.parquet.utils.ParquetSchemaConverter; +import org.apache.flink.formats.parquet.utils.RowReadSupport; +import org.apache.flink.metrics.Counter; +import org.apache.flink.types.Row; +import org.apache.flink.util.Preconditions; + +import org.apache.parquet.ParquetReadOptions; +import org.apache.parquet.filter2.compat.FilterCompat; +import org.apache.parquet.hadoop.ParquetFileReader; +import org.apache.parquet.hadoop.util.HadoopInputFile; +import org.apache.parquet.io.InputFile; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.Type; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import static org.apache.flink.util.Preconditions.checkNotNull; + +/** + * The base InputFormat class to read from Parquet files. + * For specific return types the {@link #convert(Row)} method need to be implemented. + * + * Using {@link ParquetRecordReader} to read files instead of {@link org.apache.flink.core.fs.FSDataInputStream}, + * we override {@link #open(FileInputSplit)} and {@link #close()} to change the behaviors. + * + * @param The type of record to read. 
+ */ +public abstract class ParquetInputFormat + extends FileInputFormat + implements CheckpointableInputFormat> { + + private static final long serialVersionUID = 1L; + + private static final Logger LOG = LoggerFactory.getLogger(ParquetInputFormat.class); + + /** +* The flag to specify whether to skip file splits with wrong schema. +*/ + private boolean skipWrongSchemaFileSplit = false; + + /** +* The flag to specify whether to skip corrupted record. +*/ + private boolean skipCorruptedRecord = false; + + private transient Counter recordConsumed; + + private transient MessageType expectedFileSchema; + + private TypeInformation[] fieldTypes; + + private String[] fieldNames; + + private boolean skipThisSplit = false; + + private transient ParquetRecordReader parquetRecordReader; + + + /** +* Read parquet files with given parquet file schema. +* +* @param path The path of the file to read. +* @param messageType schema of parquet file +*/ + + protected ParquetInputFormat(Path path, MessageType messageType) { + super(path); + this.expectedFileSchema = checkNotNull(messageType, "messageType"); + RowTypeInfo rowTypeInfo = (RowTypeInfo) ParquetSchemaConverter.fromParquetType(expectedFileSchema); + this.fieldTypes = rowTypeInfo.getFieldTypes(); + this.fieldNames = rowTypeInfo.getFieldNames(); + // read whole parquet file as one file split + this.unsplittable = true; + } + + @Override + public void configure(Configuration parameters) { + super.configure(parameters); + + if
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16691978#comment-16691978 ] ASF GitHub Bot commented on FLINK-7243: --- fhueske commented on a change in pull request #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483#discussion_r234608920 ## File path: flink-formats/flink-parquet/src/main/java/org/apache/flink/formats/parquet/ParquetRowInputFormat.java ## @@ -0,0 +1,87 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.formats.parquet; + +import org.apache.flink.api.common.typeinfo.SqlTimeTypeInfo; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.java.typeutils.ResultTypeQueryable; +import org.apache.flink.api.java.typeutils.RowTypeInfo; +import org.apache.flink.core.fs.Path; +import org.apache.flink.types.Row; + +import org.apache.parquet.schema.MessageType; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.sql.Timestamp; + + +/** + * A subclass of {@link ParquetInputFormat} to read from Parquet files and convert to {@link Row}. + * It is mainly used to integrate with table API and batch SQL. 
+ */ +public class ParquetRowInputFormat extends ParquetInputFormat implements ResultTypeQueryable { + private static final long serialVersionUID = 11L; + private static final Logger LOG = LoggerFactory.getLogger(ParquetRowInputFormat.class); + private boolean timeStampRewrite; + private RowTypeInfo returnType; + private int tsIndex; + + public ParquetRowInputFormat(Path path, MessageType messageType) { + super(path, messageType); + this.returnType = new RowTypeInfo(getFieldTypes(), getFieldNames()); + this.timeStampRewrite = false; + } + + @Override + public TypeInformation getProducedType() { + return new RowTypeInfo(getFieldTypes(), getFieldNames()); + } + + @Override + protected Row convert(Row row) { + if (timeStampRewrite) { + row.setField(tsIndex, new Timestamp((long) row.getField(tsIndex))); + } + return row; + } + + /** +* Convert a long or double field in the parquet schema to SqlTimeTypeInfo.TIMESTAMP, so that the returned row can +* be directly used for window aggregation in the Table API. Rewriting the timestamp field requires +* overriding the convert function. + +* @param fieldName the field that needs to be changed to TIMESTAMP type +*/ + public void rewriteTimeStampField(String fieldName) { + this.tsIndex = returnType.getFieldIndex(fieldName); + if (tsIndex == -1) { + throw new RuntimeException(String.format("Fail to extract timestamp field for row schema: [%s]", + returnType.toString())); + } + Review comment: Add a check that the found field is actually of type `long` or `double` This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. 
For queries about this service, please contact Infrastructure at: us...@infra.apache.org > Add ParquetInputFormat > -- > > Key: FLINK-7243 > URL: https://issues.apache.org/jira/browse/FLINK-7243 > Project: Flink > Issue Type: Sub-task > Components: Table API SQL >Reporter: godfrey he >Assignee: Zhenqiu Huang >Priority: Major > Labels: pull-request-available > > Add a {{ParquetInputFormat}} to read data from a Apache Parquet file. -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16691987#comment-16691987 ] ASF GitHub Bot commented on FLINK-7243: --- fhueske commented on a change in pull request #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483#discussion_r234610106 ## File path: flink-formats/flink-parquet/src/main/java/org/apache/flink/formats/parquet/ParquetRowInputFormat.java ## @@ -0,0 +1,87 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.formats.parquet; + +import org.apache.flink.api.common.typeinfo.SqlTimeTypeInfo; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.java.typeutils.ResultTypeQueryable; +import org.apache.flink.api.java.typeutils.RowTypeInfo; +import org.apache.flink.core.fs.Path; +import org.apache.flink.types.Row; + +import org.apache.parquet.schema.MessageType; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.sql.Timestamp; + + +/** + * A subclass of {@link ParquetInputFormat} to read from Parquet files and convert to {@link Row}. + * It is mainly used to integrate with table API and batch SQL. 
+ */ +public class ParquetRowInputFormat extends ParquetInputFormat implements ResultTypeQueryable { + private static final long serialVersionUID = 11L; + private static final Logger LOG = LoggerFactory.getLogger(ParquetRowInputFormat.class); + private boolean timeStampRewrite; + private RowTypeInfo returnType; + private int tsIndex; + + public ParquetRowInputFormat(Path path, MessageType messageType) { + super(path, messageType); + this.returnType = new RowTypeInfo(getFieldTypes(), getFieldNames()); + this.timeStampRewrite = false; + } + + @Override + public TypeInformation getProducedType() { + return new RowTypeInfo(getFieldTypes(), getFieldNames()); + } + + @Override + protected Row convert(Row row) { + if (timeStampRewrite) { + row.setField(tsIndex, new Timestamp((long) row.getField(tsIndex))); Review comment: I don't think this conversion is correct, because it is timezone dependent. Internally, all timestamps are assumed to be UTC and when converting them to other types or strings, the timezone is added. Flink always uses Calcite's logic for the conversion. To be honest, I'm not sure if we should add this logic here. It seems to solve a common case, that should be addresses by `flink-table`. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > Add ParquetInputFormat > -- > > Key: FLINK-7243 > URL: https://issues.apache.org/jira/browse/FLINK-7243 > Project: Flink > Issue Type: Sub-task > Components: Table API SQL >Reporter: godfrey he >Assignee: Zhenqiu Huang >Priority: Major > Labels: pull-request-available > > Add a {{ParquetInputFormat}} to read data from a Apache Parquet file. -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16691986#comment-16691986 ] ASF GitHub Bot commented on FLINK-7243: --- fhueske commented on a change in pull request #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483#discussion_r234606696 ## File path: flink-formats/flink-parquet/src/main/java/org/apache/flink/formats/parquet/ParquetRowInputFormat.java ## @@ -0,0 +1,87 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.formats.parquet; + +import org.apache.flink.api.common.typeinfo.SqlTimeTypeInfo; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.java.typeutils.ResultTypeQueryable; +import org.apache.flink.api.java.typeutils.RowTypeInfo; +import org.apache.flink.core.fs.Path; +import org.apache.flink.types.Row; + +import org.apache.parquet.schema.MessageType; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.sql.Timestamp; + + +/** + * A subclass of {@link ParquetInputFormat} to read from Parquet files and convert to {@link Row}. + * It is mainly used to integrate with table API and batch SQL. 
Review comment: remove "batch". Since the IF implements `CheckpointableInputFormat`, also the monitoring file source can use this IF properly. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > Add ParquetInputFormat > -- > > Key: FLINK-7243 > URL: https://issues.apache.org/jira/browse/FLINK-7243 > Project: Flink > Issue Type: Sub-task > Components: Table API SQL >Reporter: godfrey he >Assignee: Zhenqiu Huang >Priority: Major > Labels: pull-request-available > > Add a {{ParquetInputFormat}} to read data from a Apache Parquet file. -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16691567#comment-16691567 ] ASF GitHub Bot commented on FLINK-7243: --- fhueske commented on a change in pull request #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483#discussion_r234577028 ## File path: flink-formats/flink-parquet/src/main/java/org/apache/flink/formats/parquet/ParquetInputFormat.java ## @@ -0,0 +1,283 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.formats.parquet; + +import org.apache.flink.api.common.io.CheckpointableInputFormat; +import org.apache.flink.api.common.io.FileInputFormat; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.java.tuple.Tuple2; +import org.apache.flink.api.java.typeutils.RowTypeInfo; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.core.fs.FileInputSplit; +import org.apache.flink.core.fs.Path; +import org.apache.flink.formats.parquet.utils.ParquetRecordReader; +import org.apache.flink.formats.parquet.utils.ParquetSchemaConverter; +import org.apache.flink.formats.parquet.utils.RowReadSupport; +import org.apache.flink.metrics.Counter; +import org.apache.flink.types.Row; +import org.apache.flink.util.Preconditions; + +import org.apache.parquet.ParquetReadOptions; +import org.apache.parquet.filter2.compat.FilterCompat; +import org.apache.parquet.hadoop.ParquetFileReader; +import org.apache.parquet.hadoop.util.HadoopInputFile; +import org.apache.parquet.io.InputFile; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.Type; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import static org.apache.flink.util.Preconditions.checkNotNull; + +/** + * The base InputFormat class to read from Parquet files. + * For specific return types the {@link #convert(Row)} method need to be implemented. + * + * Using {@link ParquetRecordReader} to read files instead of {@link org.apache.flink.core.fs.FSDataInputStream}, + * we override {@link #open(FileInputSplit)} and {@link #close()} to change the behaviors. + * + * @param The type of record to read. 
+ */ +public abstract class ParquetInputFormat + extends FileInputFormat + implements CheckpointableInputFormat> { + + private static final long serialVersionUID = 1L; + + private static final Logger LOG = LoggerFactory.getLogger(ParquetInputFormat.class); + + /** +* The flag to specify whether to skip file splits with wrong schema. +*/ + private boolean skipWrongSchemaFileSplit = false; + + /** +* The flag to specify whether to skip corrupted record. +*/ + private boolean skipCorruptedRecord = false; + + private transient Counter recordConsumed; + + private transient MessageType expectedFileSchema; + + private TypeInformation[] fieldTypes; + + private String[] fieldNames; + + private boolean skipThisSplit = false; + + private transient ParquetRecordReader parquetRecordReader; + + + /** +* Read parquet files with given parquet file schema. +* +* @param path The path of the file to read. +* @param messageType schema of parquet file +*/ + + protected ParquetInputFormat(Path path, MessageType messageType) { + super(path); + this.expectedFileSchema = checkNotNull(messageType, "messageType"); + RowTypeInfo rowTypeInfo = (RowTypeInfo) ParquetSchemaConverter.fromParquetType(expectedFileSchema); + this.fieldTypes = rowTypeInfo.getFieldTypes(); + this.fieldNames = rowTypeInfo.getFieldNames(); + // read whole parquet file as one file split + this.unsplittable = true; + } + + @Override + public void configure(Configuration parameters) { + super.configure(parameters); + + if
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16691566#comment-16691566 ] ASF GitHub Bot commented on FLINK-7243: --- fhueske commented on a change in pull request #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483#discussion_r234577028 ## File path: flink-formats/flink-parquet/src/main/java/org/apache/flink/formats/parquet/ParquetInputFormat.java ## @@ -0,0 +1,283 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.formats.parquet; + +import org.apache.flink.api.common.io.CheckpointableInputFormat; +import org.apache.flink.api.common.io.FileInputFormat; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.java.tuple.Tuple2; +import org.apache.flink.api.java.typeutils.RowTypeInfo; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.core.fs.FileInputSplit; +import org.apache.flink.core.fs.Path; +import org.apache.flink.formats.parquet.utils.ParquetRecordReader; +import org.apache.flink.formats.parquet.utils.ParquetSchemaConverter; +import org.apache.flink.formats.parquet.utils.RowReadSupport; +import org.apache.flink.metrics.Counter; +import org.apache.flink.types.Row; +import org.apache.flink.util.Preconditions; + +import org.apache.parquet.ParquetReadOptions; +import org.apache.parquet.filter2.compat.FilterCompat; +import org.apache.parquet.hadoop.ParquetFileReader; +import org.apache.parquet.hadoop.util.HadoopInputFile; +import org.apache.parquet.io.InputFile; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.Type; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import static org.apache.flink.util.Preconditions.checkNotNull; + +/** + * The base InputFormat class to read from Parquet files. + * For specific return types the {@link #convert(Row)} method need to be implemented. + * + * Using {@link ParquetRecordReader} to read files instead of {@link org.apache.flink.core.fs.FSDataInputStream}, + * we override {@link #open(FileInputSplit)} and {@link #close()} to change the behaviors. + * + * @param The type of record to read. 
+ */ +public abstract class ParquetInputFormat + extends FileInputFormat + implements CheckpointableInputFormat> { + + private static final long serialVersionUID = 1L; + + private static final Logger LOG = LoggerFactory.getLogger(ParquetInputFormat.class); + + /** +* The flag to specify whether to skip file splits with wrong schema. +*/ + private boolean skipWrongSchemaFileSplit = false; + + /** +* The flag to specify whether to skip corrupted record. +*/ + private boolean skipCorruptedRecord = false; + + private transient Counter recordConsumed; + + private transient MessageType expectedFileSchema; + + private TypeInformation[] fieldTypes; + + private String[] fieldNames; + + private boolean skipThisSplit = false; + + private transient ParquetRecordReader parquetRecordReader; + + + /** +* Read parquet files with given parquet file schema. +* +* @param path The path of the file to read. +* @param messageType schema of parquet file +*/ + + protected ParquetInputFormat(Path path, MessageType messageType) { + super(path); + this.expectedFileSchema = checkNotNull(messageType, "messageType"); + RowTypeInfo rowTypeInfo = (RowTypeInfo) ParquetSchemaConverter.fromParquetType(expectedFileSchema); + this.fieldTypes = rowTypeInfo.getFieldTypes(); + this.fieldNames = rowTypeInfo.getFieldNames(); + // read whole parquet file as one file split + this.unsplittable = true; + } + + @Override + public void configure(Configuration parameters) { + super.configure(parameters); + + if
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16658564#comment-16658564 ] ASF GitHub Bot commented on FLINK-7243: --- HuangZhenQiu opened a new pull request #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483 ## What is the purpose of the change This pull request is to create a ParquetInputFormat, so that Flink can process files in Parquet schema. Parquet record can be read as Row, Pojo and Generic Map. ## Brief change log - *Add schema converter for Parquet types to Flink internal types* - *Add ParquetRecordRead to read Parquet record as Row* - *Add ParquetInputFormat that can be exteneded to convert Row to Pojo or Generic Map* ## Verifying this change This change is already covered by existing tests, such as ParquetRecordReaderTest, ParquetSchemaConverterTest and ParquetInputFormatTest. ## Does this pull request potentially affect one of the following parts: - Dependencies (does it add or upgrade a dependency): (no) - The public API, i.e., is any changed class annotated with `@Public(Evolving)`: (no) - The serializers: (no) - The runtime per-record code paths (performance sensitive): (no) - Anything that affects deployment or recovery: JobManager (and its components), Checkpointing, Yarn/Mesos, ZooKeeper: (no) - The S3 file system connector: (no) ## Documentation - Does this pull request introduce a new feature? (yes) - If yes, how is the feature documented? (JavaDocs) This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. 
For queries about this service, please contact Infrastructure at: us...@infra.apache.org > Add ParquetInputFormat > -- > > Key: FLINK-7243 > URL: https://issues.apache.org/jira/browse/FLINK-7243 > Project: Flink > Issue Type: Sub-task > Components: Table API SQL >Reporter: godfrey he >Assignee: Zhenqiu Huang >Priority: Major > Labels: pull-request-available > > Add a {{ParquetInputFormat}} to read data from a Apache Parquet file. -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16658563#comment-16658563 ] ASF GitHub Bot commented on FLINK-7243: --- HuangZhenQiu commented on issue #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483#issuecomment-431737854 @fhueske Thanks for your patient review. It is pretty helpful to make the PR more readable and flawless. Resolved your comments. Please read one more round at your most convenient time. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > Add ParquetInputFormat > -- > > Key: FLINK-7243 > URL: https://issues.apache.org/jira/browse/FLINK-7243 > Project: Flink > Issue Type: Sub-task > Components: Table API SQL >Reporter: godfrey he >Assignee: Zhenqiu Huang >Priority: Major > Labels: pull-request-available > > Add a {{ParquetInputFormat}} to read data from a Apache Parquet file. -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16658562#comment-16658562 ] ASF GitHub Bot commented on FLINK-7243: --- HuangZhenQiu removed a comment on issue #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483#issuecomment-431737413 @fhueske Thanks for your patient review. It is pretty helpful to make the PR more readable and flawless. Resolved your comments. Please read one more round at your most convenient time. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > Add ParquetInputFormat > -- > > Key: FLINK-7243 > URL: https://issues.apache.org/jira/browse/FLINK-7243 > Project: Flink > Issue Type: Sub-task > Components: Table API SQL >Reporter: godfrey he >Assignee: Zhenqiu Huang >Priority: Major > Labels: pull-request-available > > Add a {{ParquetInputFormat}} to read data from a Apache Parquet file. -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16658561#comment-16658561 ] ASF GitHub Bot commented on FLINK-7243: --- HuangZhenQiu removed a comment on issue #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483#issuecomment-431737590 @fhueske Thanks for your patient review. It is pretty helpful to make the PR more readable and flawless. Resolved your comments. Please read one more round at your most convenient time. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > Add ParquetInputFormat > -- > > Key: FLINK-7243 > URL: https://issues.apache.org/jira/browse/FLINK-7243 > Project: Flink > Issue Type: Sub-task > Components: Table API SQL >Reporter: godfrey he >Assignee: Zhenqiu Huang >Priority: Major > Labels: pull-request-available > > Add a {{ParquetInputFormat}} to read data from a Apache Parquet file. -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16658559#comment-16658559 ] ASF GitHub Bot commented on FLINK-7243: --- HuangZhenQiu removed a comment on issue #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483#issuecomment-431737435 @fhueske Thanks for your patient review. It is pretty helpful to make the PR more readable and flawless. Resolved your comments. Please read one more round at your most convenient time. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > Add ParquetInputFormat > -- > > Key: FLINK-7243 > URL: https://issues.apache.org/jira/browse/FLINK-7243 > Project: Flink > Issue Type: Sub-task > Components: Table API SQL >Reporter: godfrey he >Assignee: Zhenqiu Huang >Priority: Major > Labels: pull-request-available > > Add a {{ParquetInputFormat}} to read data from a Apache Parquet file. -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16658560#comment-16658560 ] ASF GitHub Bot commented on FLINK-7243: --- HuangZhenQiu removed a comment on issue #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483#issuecomment-431737481 @fhueske Thanks for your patient review. It is pretty helpful to make the PR more readable and flawless. Resolved your comments. Please read one more round at your most convenient time. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > Add ParquetInputFormat > -- > > Key: FLINK-7243 > URL: https://issues.apache.org/jira/browse/FLINK-7243 > Project: Flink > Issue Type: Sub-task > Components: Table API SQL >Reporter: godfrey he >Assignee: Zhenqiu Huang >Priority: Major > Labels: pull-request-available > > Add a {{ParquetInputFormat}} to read data from a Apache Parquet file. -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16658555#comment-16658555 ] ASF GitHub Bot commented on FLINK-7243: --- HuangZhenQiu commented on issue #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483#issuecomment-431737590 @fhueske Thanks for your patient review. It is pretty helpful to make the PR more readable and flawless. Resolved your comments. Please read one more round at your most convenient time. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > Add ParquetInputFormat > -- > > Key: FLINK-7243 > URL: https://issues.apache.org/jira/browse/FLINK-7243 > Project: Flink > Issue Type: Sub-task > Components: Table API SQL >Reporter: godfrey he >Assignee: Zhenqiu Huang >Priority: Major > Labels: pull-request-available > > Add a {{ParquetInputFormat}} to read data from a Apache Parquet file. -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16658558#comment-16658558 ] ASF GitHub Bot commented on FLINK-7243: --- HuangZhenQiu commented on issue #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483#issuecomment-431737642 @fhueske Thanks for your patient review. It is pretty helpful to make the PR more readable and flawless. Resolved your comments. Please read one more round at your most convenient time. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > Add ParquetInputFormat > -- > > Key: FLINK-7243 > URL: https://issues.apache.org/jira/browse/FLINK-7243 > Project: Flink > Issue Type: Sub-task > Components: Table API SQL >Reporter: godfrey he >Assignee: Zhenqiu Huang >Priority: Major > Labels: pull-request-available > > Add a {{ParquetInputFormat}} to read data from a Apache Parquet file. -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16658557#comment-16658557 ] ASF GitHub Bot commented on FLINK-7243: --- HuangZhenQiu closed pull request #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483 This is a PR merged from a forked repository. As GitHub hides the original diff on merge, it is displayed below for the sake of provenance: As this is a foreign pull request (from a fork), the diff is supplied below (as it won't show otherwise due to GitHub magic): diff --git a/.gitignore b/.gitignore index 20749c24242..fdf7bedfb26 100644 --- a/.gitignore +++ b/.gitignore @@ -19,6 +19,7 @@ tmp build-target flink-end-to-end-tests/flink-datastream-allround-test/src/main/java/org/apache/flink/streaming/tests/avro/ flink-formats/flink-avro/src/test/java/org/apache/flink/formats/avro/generated/ +flink-formats/flink-parquet/src/test/java/org/apache/flink/formats/parquet/generated/ flink-runtime-web/web-dashboard/assets/fonts/ flink-runtime-web/web-dashboard/node_modules/ flink-runtime-web/web-dashboard/bower_components/ diff --git a/flink-core/src/main/java/org/apache/flink/api/common/io/FileInputFormat.java b/flink-core/src/main/java/org/apache/flink/api/common/io/FileInputFormat.java index 14cf647cd24..4177af72318 100644 --- a/flink-core/src/main/java/org/apache/flink/api/common/io/FileInputFormat.java +++ b/flink-core/src/main/java/org/apache/flink/api/common/io/FileInputFormat.java @@ -233,6 +233,16 @@ protected static String extractFileExtension(String fileName) { */ protected boolean enumerateNestedFiles = false; + /** +* The flag to specify whether to skip file splits with wrong schema. +*/ + protected boolean skipWrongSchemaFileSplit = false; + + /** +* The flag to specify whether to skip corrupted record. +*/ + protected boolean skipCorruptedRecord = true; + /** * Files filter for determining what files/directories should be included. 
*/ @@ -463,6 +473,14 @@ public void configure(Configuration parameters) { if (!this.enumerateNestedFiles) { this.enumerateNestedFiles = parameters.getBoolean(ENUMERATE_NESTED_FILES_FLAG, false); } + + if (!this.skipWrongSchemaFileSplit) { + this.skipWrongSchemaFileSplit = parameters.getBoolean(SKIP_WRONG_SCHEMA_SPLITS, false); + } + + if (this.skipCorruptedRecord) { + this.skipCorruptedRecord = parameters.getBoolean(SKIP_CORRUPTED_RECORD, true); + } } /** @@ -1077,4 +1095,14 @@ private void abortWait() { * The config parameter which defines whether input directories are recursively traversed. */ public static final String ENUMERATE_NESTED_FILES_FLAG = "recursive.file.enumeration"; + + /** +* The config parameter which defines whether to skip file split with wrong schema. +*/ + public static final String SKIP_WRONG_SCHEMA_SPLITS = "skip.splits.wrong.schema"; + + /** +* The config parameter which defines whether to skip corrupted record. +*/ + public static final String SKIP_CORRUPTED_RECORD = "skip.corrupted.record"; } diff --git a/flink-formats/flink-parquet/src/main/java/org/apache/flink/formats/parquet/ParquetInputFormat.java b/flink-formats/flink-parquet/src/main/java/org/apache/flink/formats/parquet/ParquetInputFormat.java new file mode 100644 index 000..dab1899a1ce --- /dev/null +++ b/flink-formats/flink-parquet/src/main/java/org/apache/flink/formats/parquet/ParquetInputFormat.java @@ -0,0 +1,248 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.formats.parquet; + +import org.apache.flink.api.common.io.CheckpointableInputFormat; +import org.apache.flink.api.common.io.FileInputFormat; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.java.tuple.Tuple2; +import
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16658554#comment-16658554 ] ASF GitHub Bot commented on FLINK-7243: --- HuangZhenQiu commented on issue #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483#issuecomment-431737558 @fhueske Thanks for your patient review. It is pretty helpful to make the PR more readable and flawless. Resolved your comments. Please read one more round at your most convenient time. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > Add ParquetInputFormat > -- > > Key: FLINK-7243 > URL: https://issues.apache.org/jira/browse/FLINK-7243 > Project: Flink > Issue Type: Sub-task > Components: Table API SQL >Reporter: godfrey he >Assignee: Zhenqiu Huang >Priority: Major > Labels: pull-request-available > > Add a {{ParquetInputFormat}} to read data from a Apache Parquet file. -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16658556#comment-16658556 ] ASF GitHub Bot commented on FLINK-7243: --- HuangZhenQiu commented on issue #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483#issuecomment-431737615 @fhueske Thanks for your patient review. It is pretty helpful to make the PR more readable and flawless. Resolved your comments. Please read one more round at your most convenient time. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > Add ParquetInputFormat > -- > > Key: FLINK-7243 > URL: https://issues.apache.org/jira/browse/FLINK-7243 > Project: Flink > Issue Type: Sub-task > Components: Table API SQL >Reporter: godfrey he >Assignee: Zhenqiu Huang >Priority: Major > Labels: pull-request-available > > Add a {{ParquetInputFormat}} to read data from a Apache Parquet file. -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16658553#comment-16658553 ] ASF GitHub Bot commented on FLINK-7243: --- HuangZhenQiu commented on issue #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483#issuecomment-431737523 @fhueske Thanks for your patient review. It is pretty helpful to make the PR more readable and flawless. Resolved your comments. Please read one more round at your most convenient time. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > Add ParquetInputFormat > -- > > Key: FLINK-7243 > URL: https://issues.apache.org/jira/browse/FLINK-7243 > Project: Flink > Issue Type: Sub-task > Components: Table API SQL >Reporter: godfrey he >Assignee: Zhenqiu Huang >Priority: Major > Labels: pull-request-available > > Add a {{ParquetInputFormat}} to read data from a Apache Parquet file. -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16658552#comment-16658552 ] ASF GitHub Bot commented on FLINK-7243: --- HuangZhenQiu commented on issue #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483#issuecomment-431737481 @fhueske Thanks for your patient review. It is pretty helpful to make the PR more readable and flawless. Resolved your comments. Please read one more round at your most convenient time. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > Add ParquetInputFormat > -- > > Key: FLINK-7243 > URL: https://issues.apache.org/jira/browse/FLINK-7243 > Project: Flink > Issue Type: Sub-task > Components: Table API SQL >Reporter: godfrey he >Assignee: Zhenqiu Huang >Priority: Major > Labels: pull-request-available > > Add a {{ParquetInputFormat}} to read data from a Apache Parquet file. -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16658551#comment-16658551 ] ASF GitHub Bot commented on FLINK-7243: --- HuangZhenQiu commented on issue #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483#issuecomment-431737461 @fhueske Thanks for your patient review. It is pretty helpful to make the PR more readable and flawless. Resolved your comments. Please read one more round at your most convenient time. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > Add ParquetInputFormat > -- > > Key: FLINK-7243 > URL: https://issues.apache.org/jira/browse/FLINK-7243 > Project: Flink > Issue Type: Sub-task > Components: Table API SQL >Reporter: godfrey he >Assignee: Zhenqiu Huang >Priority: Major > Labels: pull-request-available > > Add a {{ParquetInputFormat}} to read data from a Apache Parquet file. -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16658550#comment-16658550 ] ASF GitHub Bot commented on FLINK-7243: --- HuangZhenQiu commented on issue #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483#issuecomment-431737435 @fhueske Thanks for your patient review. It is pretty helpful to make the PR more readable and flawless. Resolved your comments. Please read one more round at your most convenient time. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > Add ParquetInputFormat > -- > > Key: FLINK-7243 > URL: https://issues.apache.org/jira/browse/FLINK-7243 > Project: Flink > Issue Type: Sub-task > Components: Table API SQL >Reporter: godfrey he >Assignee: Zhenqiu Huang >Priority: Major > Labels: pull-request-available > > Add a {{ParquetInputFormat}} to read data from a Apache Parquet file. -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16658549#comment-16658549 ] ASF GitHub Bot commented on FLINK-7243: --- HuangZhenQiu commented on issue #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483#issuecomment-431737413 @fhueske Thanks for your patient review. It is pretty helpful to make the PR more readable and flawless. Resolved your comments. Please read one more round at your most convenient time. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > Add ParquetInputFormat > -- > > Key: FLINK-7243 > URL: https://issues.apache.org/jira/browse/FLINK-7243 > Project: Flink > Issue Type: Sub-task > Components: Table API SQL >Reporter: godfrey he >Assignee: Zhenqiu Huang >Priority: Major > Labels: pull-request-available > > Add a {{ParquetInputFormat}} to read data from a Apache Parquet file. -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16656539#comment-16656539 ] ASF GitHub Bot commented on FLINK-7243: --- fhueske commented on a change in pull request #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483#discussion_r226383417 ## File path: flink-formats/flink-parquet/src/main/java/org/apache/flink/formats/parquet/ParquetInputFormat.java ## @@ -0,0 +1,248 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.formats.parquet; + +import org.apache.flink.api.common.io.CheckpointableInputFormat; +import org.apache.flink.api.common.io.FileInputFormat; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.java.tuple.Tuple2; +import org.apache.flink.api.java.typeutils.RowTypeInfo; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.core.fs.FileInputSplit; +import org.apache.flink.core.fs.Path; +import org.apache.flink.formats.parquet.utils.ParquetRecordReader; +import org.apache.flink.formats.parquet.utils.ParquetSchemaConverter; +import org.apache.flink.formats.parquet.utils.RowReadSupport; +import org.apache.flink.metrics.Counter; +import org.apache.flink.types.Row; +import org.apache.flink.util.Preconditions; + +import org.apache.parquet.ParquetReadOptions; +import org.apache.parquet.filter2.compat.FilterCompat; +import org.apache.parquet.hadoop.ParquetFileReader; +import org.apache.parquet.hadoop.util.HadoopInputFile; +import org.apache.parquet.io.InputFile; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.Type; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import static org.apache.flink.util.Preconditions.checkNotNull; + +/** + * The base InputFormat class to read from Parquet files. + * For specific return types the {@link #convert(Row)} method need to be implemented. + * + * Using {@link ParquetRecordReader} to read files instead of {@link org.apache.flink.core.fs.FSDataInputStream}, + * we override {@link #open(FileInputSplit)} and {@link #close()} to change the behaviors. + * + * @param The type of record to read. 
+ */ +public abstract class ParquetInputFormat + extends FileInputFormat + implements CheckpointableInputFormat> { + + private static final long serialVersionUID = 1L; + + private static final Logger LOG = LoggerFactory.getLogger(ParquetInputFormat.class); + + private transient Counter recordConsumed; + + private transient MessageType expectedFileSchema; + + private TypeInformation[] fieldTypes; + + private String[] fieldNames; + + private boolean skipThisSplit = false; + + private transient ParquetRecordReader parquetRecordReader; + + + /** +* Read parquet files with given parquet file schema. +* +* @param path The path of the file to read. +* @param messageType schema of parquet file +*/ + + protected ParquetInputFormat(Path path, MessageType messageType) { + super(path); + this.expectedFileSchema = checkNotNull(messageType, "messageType"); + RowTypeInfo rowTypeInfo = (RowTypeInfo) ParquetSchemaConverter.fromParquetType(expectedFileSchema); + this.fieldTypes = rowTypeInfo.getFieldTypes(); + this.fieldNames = rowTypeInfo.getFieldNames(); + // read whole parquet file as one file split + this.unsplittable = true; + } + + @Override + public void configure(Configuration parameters) { + super.configure(parameters); + parquetRecordReader.setSkipCorruptedRecord(this.skipCorruptedRecord); Review comment: we can read the Parquet specific configuration that you put into `FileInputFormat` at this place. This is an automated message from the Apache Git Service. To respond to the
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16656543#comment-16656543 ] ASF GitHub Bot commented on FLINK-7243: --- fhueske commented on a change in pull request #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483#discussion_r226385139 ## File path: flink-formats/flink-parquet/src/main/java/org/apache/flink/formats/parquet/ParquetInputFormat.java ## @@ -0,0 +1,248 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.formats.parquet; + +import org.apache.flink.api.common.io.CheckpointableInputFormat; +import org.apache.flink.api.common.io.FileInputFormat; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.java.tuple.Tuple2; +import org.apache.flink.api.java.typeutils.RowTypeInfo; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.core.fs.FileInputSplit; +import org.apache.flink.core.fs.Path; +import org.apache.flink.formats.parquet.utils.ParquetRecordReader; +import org.apache.flink.formats.parquet.utils.ParquetSchemaConverter; +import org.apache.flink.formats.parquet.utils.RowReadSupport; +import org.apache.flink.metrics.Counter; +import org.apache.flink.types.Row; +import org.apache.flink.util.Preconditions; + +import org.apache.parquet.ParquetReadOptions; +import org.apache.parquet.filter2.compat.FilterCompat; +import org.apache.parquet.hadoop.ParquetFileReader; +import org.apache.parquet.hadoop.util.HadoopInputFile; +import org.apache.parquet.io.InputFile; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.Type; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import static org.apache.flink.util.Preconditions.checkNotNull; + +/** + * The base InputFormat class to read from Parquet files. + * For specific return types the {@link #convert(Row)} method need to be implemented. + * + * Using {@link ParquetRecordReader} to read files instead of {@link org.apache.flink.core.fs.FSDataInputStream}, + * we override {@link #open(FileInputSplit)} and {@link #close()} to change the behaviors. + * + * @param The type of record to read. 
+ */ +public abstract class ParquetInputFormat + extends FileInputFormat + implements CheckpointableInputFormat> { + + private static final long serialVersionUID = 1L; + + private static final Logger LOG = LoggerFactory.getLogger(ParquetInputFormat.class); + + private transient Counter recordConsumed; + + private transient MessageType expectedFileSchema; + + private TypeInformation[] fieldTypes; + + private String[] fieldNames; + + private boolean skipThisSplit = false; + + private transient ParquetRecordReader parquetRecordReader; + + + /** +* Read parquet files with given parquet file schema. +* +* @param path The path of the file to read. +* @param messageType schema of parquet file +*/ + + protected ParquetInputFormat(Path path, MessageType messageType) { + super(path); + this.expectedFileSchema = checkNotNull(messageType, "messageType"); + RowTypeInfo rowTypeInfo = (RowTypeInfo) ParquetSchemaConverter.fromParquetType(expectedFileSchema); + this.fieldTypes = rowTypeInfo.getFieldTypes(); + this.fieldNames = rowTypeInfo.getFieldNames(); + // read whole parquet file as one file split + this.unsplittable = true; + } + + @Override + public void configure(Configuration parameters) { + super.configure(parameters); + parquetRecordReader.setSkipCorruptedRecord(this.skipCorruptedRecord); + } + + public void selectFields(String[] fieldNames) { + checkNotNull(fieldNames, "fieldNames"); + this.fieldNames = fieldNames; + RowTypeInfo rowTypeInfo = (RowTypeInfo)
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16656542#comment-16656542 ] ASF GitHub Bot commented on FLINK-7243: --- fhueske commented on a change in pull request #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483#discussion_r226591696 ## File path: flink-formats/flink-parquet/src/main/java/org/apache/flink/formats/parquet/ParquetInputFormat.java ## @@ -0,0 +1,248 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.formats.parquet; + +import org.apache.flink.api.common.io.CheckpointableInputFormat; +import org.apache.flink.api.common.io.FileInputFormat; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.java.tuple.Tuple2; +import org.apache.flink.api.java.typeutils.RowTypeInfo; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.core.fs.FileInputSplit; +import org.apache.flink.core.fs.Path; +import org.apache.flink.formats.parquet.utils.ParquetRecordReader; +import org.apache.flink.formats.parquet.utils.ParquetSchemaConverter; +import org.apache.flink.formats.parquet.utils.RowReadSupport; +import org.apache.flink.metrics.Counter; +import org.apache.flink.types.Row; +import org.apache.flink.util.Preconditions; + +import org.apache.parquet.ParquetReadOptions; +import org.apache.parquet.filter2.compat.FilterCompat; +import org.apache.parquet.hadoop.ParquetFileReader; +import org.apache.parquet.hadoop.util.HadoopInputFile; +import org.apache.parquet.io.InputFile; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.Type; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import static org.apache.flink.util.Preconditions.checkNotNull; + +/** + * The base InputFormat class to read from Parquet files. + * For specific return types the {@link #convert(Row)} method need to be implemented. + * + * Using {@link ParquetRecordReader} to read files instead of {@link org.apache.flink.core.fs.FSDataInputStream}, + * we override {@link #open(FileInputSplit)} and {@link #close()} to change the behaviors. + * + * @param The type of record to read. 
+ */ +public abstract class ParquetInputFormat + extends FileInputFormat + implements CheckpointableInputFormat> { + + private static final long serialVersionUID = 1L; + + private static final Logger LOG = LoggerFactory.getLogger(ParquetInputFormat.class); + + private transient Counter recordConsumed; + + private transient MessageType expectedFileSchema; + + private TypeInformation[] fieldTypes; + + private String[] fieldNames; + + private boolean skipThisSplit = false; + + private transient ParquetRecordReader parquetRecordReader; + + + /** +* Read parquet files with given parquet file schema. +* +* @param path The path of the file to read. +* @param messageType schema of parquet file +*/ + + protected ParquetInputFormat(Path path, MessageType messageType) { + super(path); + this.expectedFileSchema = checkNotNull(messageType, "messageType"); + RowTypeInfo rowTypeInfo = (RowTypeInfo) ParquetSchemaConverter.fromParquetType(expectedFileSchema); + this.fieldTypes = rowTypeInfo.getFieldTypes(); + this.fieldNames = rowTypeInfo.getFieldNames(); + // read whole parquet file as one file split + this.unsplittable = true; + } + + @Override + public void configure(Configuration parameters) { + super.configure(parameters); + parquetRecordReader.setSkipCorruptedRecord(this.skipCorruptedRecord); + } + + public void selectFields(String[] fieldNames) { + checkNotNull(fieldNames, "fieldNames"); + this.fieldNames = fieldNames; + RowTypeInfo rowTypeInfo = (RowTypeInfo)
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16656540#comment-16656540 ] ASF GitHub Bot commented on FLINK-7243: --- fhueske commented on a change in pull request #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483#discussion_r226380623 ## File path: flink-core/src/main/java/org/apache/flink/api/common/io/FileInputFormat.java ## @@ -233,6 +233,16 @@ protected static String extractFileExtension(String fileName) { */ protected boolean enumerateNestedFiles = false; + /** +* The flag to specify whether to skip file splits with wrong schema. +*/ + protected boolean skipWrongSchemaFileSplit = false; Review comment: I would move these options to `ParquetInputFormat`. `TextInputFormat` does not have a notion of schema or corrupted records. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > Add ParquetInputFormat > -- > > Key: FLINK-7243 > URL: https://issues.apache.org/jira/browse/FLINK-7243 > Project: Flink > Issue Type: Sub-task > Components: Table API SQL >Reporter: godfrey he >Assignee: Zhenqiu Huang >Priority: Major > Labels: pull-request-available > > Add a {{ParquetInputFormat}} to read data from a Apache Parquet file. -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16656544#comment-16656544 ] ASF GitHub Bot commented on FLINK-7243: --- fhueske commented on a change in pull request #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483#discussion_r226380357 ## File path: flink-core/src/main/java/org/apache/flink/api/common/io/FileInputFormat.java ## @@ -233,6 +233,16 @@ protected static String extractFileExtension(String fileName) { */ protected boolean enumerateNestedFiles = false; + /** +* The flag to specify whether to skip file splits with wrong schema. +*/ + protected boolean skipWrongSchemaFileSplit = false; + + /** +* The flag to specify whether to skip corrupted record. +*/ + protected boolean skipCorruptedRecord = true; Review comment: Move this field as well to `ParquetInputFormat`. Also, I'd be conservative and initialize with `false`. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > Add ParquetInputFormat > -- > > Key: FLINK-7243 > URL: https://issues.apache.org/jira/browse/FLINK-7243 > Project: Flink > Issue Type: Sub-task > Components: Table API SQL >Reporter: godfrey he >Assignee: Zhenqiu Huang >Priority: Major > Labels: pull-request-available > > Add a {{ParquetInputFormat}} to read data from a Apache Parquet file. -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16656545#comment-16656545 ] ASF GitHub Bot commented on FLINK-7243: --- fhueske commented on a change in pull request #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483#discussion_r226586847 ## File path: flink-formats/flink-parquet/src/main/java/org/apache/flink/formats/parquet/ParquetInputFormat.java ## @@ -0,0 +1,248 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.formats.parquet; + +import org.apache.flink.api.common.io.CheckpointableInputFormat; +import org.apache.flink.api.common.io.FileInputFormat; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.java.tuple.Tuple2; +import org.apache.flink.api.java.typeutils.RowTypeInfo; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.core.fs.FileInputSplit; +import org.apache.flink.core.fs.Path; +import org.apache.flink.formats.parquet.utils.ParquetRecordReader; +import org.apache.flink.formats.parquet.utils.ParquetSchemaConverter; +import org.apache.flink.formats.parquet.utils.RowReadSupport; +import org.apache.flink.metrics.Counter; +import org.apache.flink.types.Row; +import org.apache.flink.util.Preconditions; + +import org.apache.parquet.ParquetReadOptions; +import org.apache.parquet.filter2.compat.FilterCompat; +import org.apache.parquet.hadoop.ParquetFileReader; +import org.apache.parquet.hadoop.util.HadoopInputFile; +import org.apache.parquet.io.InputFile; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.Type; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import static org.apache.flink.util.Preconditions.checkNotNull; + +/** + * The base InputFormat class to read from Parquet files. + * For specific return types the {@link #convert(Row)} method need to be implemented. + * + * Using {@link ParquetRecordReader} to read files instead of {@link org.apache.flink.core.fs.FSDataInputStream}, + * we override {@link #open(FileInputSplit)} and {@link #close()} to change the behaviors. + * + * @param The type of record to read. 
+ */ +public abstract class ParquetInputFormat + extends FileInputFormat + implements CheckpointableInputFormat> { + + private static final long serialVersionUID = 1L; + + private static final Logger LOG = LoggerFactory.getLogger(ParquetInputFormat.class); + + private transient Counter recordConsumed; + + private transient MessageType expectedFileSchema; + + private TypeInformation[] fieldTypes; + + private String[] fieldNames; + + private boolean skipThisSplit = false; + + private transient ParquetRecordReader parquetRecordReader; + + + /** +* Read parquet files with given parquet file schema. +* +* @param path The path of the file to read. +* @param messageType schema of parquet file +*/ + + protected ParquetInputFormat(Path path, MessageType messageType) { + super(path); + this.expectedFileSchema = checkNotNull(messageType, "messageType"); + RowTypeInfo rowTypeInfo = (RowTypeInfo) ParquetSchemaConverter.fromParquetType(expectedFileSchema); + this.fieldTypes = rowTypeInfo.getFieldTypes(); + this.fieldNames = rowTypeInfo.getFieldNames(); + // read whole parquet file as one file split + this.unsplittable = true; + } + + @Override + public void configure(Configuration parameters) { + super.configure(parameters); + parquetRecordReader.setSkipCorruptedRecord(this.skipCorruptedRecord); + } + + public void selectFields(String[] fieldNames) { + checkNotNull(fieldNames, "fieldNames"); + this.fieldNames = fieldNames; + RowTypeInfo rowTypeInfo = (RowTypeInfo)
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16656546#comment-16656546 ] ASF GitHub Bot commented on FLINK-7243: --- fhueske commented on a change in pull request #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483#discussion_r226590288 ## File path: flink-formats/flink-parquet/src/main/java/org/apache/flink/formats/parquet/ParquetInputFormat.java ## @@ -0,0 +1,248 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.formats.parquet; + +import org.apache.flink.api.common.io.CheckpointableInputFormat; +import org.apache.flink.api.common.io.FileInputFormat; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.java.tuple.Tuple2; +import org.apache.flink.api.java.typeutils.RowTypeInfo; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.core.fs.FileInputSplit; +import org.apache.flink.core.fs.Path; +import org.apache.flink.formats.parquet.utils.ParquetRecordReader; +import org.apache.flink.formats.parquet.utils.ParquetSchemaConverter; +import org.apache.flink.formats.parquet.utils.RowReadSupport; +import org.apache.flink.metrics.Counter; +import org.apache.flink.types.Row; +import org.apache.flink.util.Preconditions; + +import org.apache.parquet.ParquetReadOptions; +import org.apache.parquet.filter2.compat.FilterCompat; +import org.apache.parquet.hadoop.ParquetFileReader; +import org.apache.parquet.hadoop.util.HadoopInputFile; +import org.apache.parquet.io.InputFile; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.Type; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import static org.apache.flink.util.Preconditions.checkNotNull; + +/** + * The base InputFormat class to read from Parquet files. + * For specific return types the {@link #convert(Row)} method need to be implemented. + * + * Using {@link ParquetRecordReader} to read files instead of {@link org.apache.flink.core.fs.FSDataInputStream}, + * we override {@link #open(FileInputSplit)} and {@link #close()} to change the behaviors. + * + * @param The type of record to read. 
+ */ +public abstract class ParquetInputFormat + extends FileInputFormat + implements CheckpointableInputFormat> { + + private static final long serialVersionUID = 1L; + + private static final Logger LOG = LoggerFactory.getLogger(ParquetInputFormat.class); + + private transient Counter recordConsumed; + + private transient MessageType expectedFileSchema; + + private TypeInformation[] fieldTypes; + + private String[] fieldNames; + + private boolean skipThisSplit = false; + + private transient ParquetRecordReader parquetRecordReader; + + + /** +* Read parquet files with given parquet file schema. +* +* @param path The path of the file to read. +* @param messageType schema of parquet file +*/ + + protected ParquetInputFormat(Path path, MessageType messageType) { + super(path); + this.expectedFileSchema = checkNotNull(messageType, "messageType"); + RowTypeInfo rowTypeInfo = (RowTypeInfo) ParquetSchemaConverter.fromParquetType(expectedFileSchema); + this.fieldTypes = rowTypeInfo.getFieldTypes(); + this.fieldNames = rowTypeInfo.getFieldNames(); + // read whole parquet file as one file split + this.unsplittable = true; + } + + @Override + public void configure(Configuration parameters) { + super.configure(parameters); + parquetRecordReader.setSkipCorruptedRecord(this.skipCorruptedRecord); + } + + public void selectFields(String[] fieldNames) { + checkNotNull(fieldNames, "fieldNames"); + this.fieldNames = fieldNames; + RowTypeInfo rowTypeInfo = (RowTypeInfo)
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16656541#comment-16656541 ] ASF GitHub Bot commented on FLINK-7243: --- fhueske commented on a change in pull request #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483#discussion_r226585824 ## File path: flink-formats/flink-parquet/src/main/java/org/apache/flink/formats/parquet/ParquetInputFormat.java ## @@ -0,0 +1,248 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.formats.parquet; + +import org.apache.flink.api.common.io.CheckpointableInputFormat; +import org.apache.flink.api.common.io.FileInputFormat; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.java.tuple.Tuple2; +import org.apache.flink.api.java.typeutils.RowTypeInfo; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.core.fs.FileInputSplit; +import org.apache.flink.core.fs.Path; +import org.apache.flink.formats.parquet.utils.ParquetRecordReader; +import org.apache.flink.formats.parquet.utils.ParquetSchemaConverter; +import org.apache.flink.formats.parquet.utils.RowReadSupport; +import org.apache.flink.metrics.Counter; +import org.apache.flink.types.Row; +import org.apache.flink.util.Preconditions; + +import org.apache.parquet.ParquetReadOptions; +import org.apache.parquet.filter2.compat.FilterCompat; +import org.apache.parquet.hadoop.ParquetFileReader; +import org.apache.parquet.hadoop.util.HadoopInputFile; +import org.apache.parquet.io.InputFile; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.Type; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import static org.apache.flink.util.Preconditions.checkNotNull; + +/** + * The base InputFormat class to read from Parquet files. + * For specific return types the {@link #convert(Row)} method need to be implemented. + * + * Using {@link ParquetRecordReader} to read files instead of {@link org.apache.flink.core.fs.FSDataInputStream}, + * we override {@link #open(FileInputSplit)} and {@link #close()} to change the behaviors. + * + * @param The type of record to read. 
+ */ +public abstract class ParquetInputFormat + extends FileInputFormat + implements CheckpointableInputFormat> { + + private static final long serialVersionUID = 1L; + + private static final Logger LOG = LoggerFactory.getLogger(ParquetInputFormat.class); + + private transient Counter recordConsumed; + + private transient MessageType expectedFileSchema; + + private TypeInformation[] fieldTypes; + + private String[] fieldNames; + + private boolean skipThisSplit = false; + + private transient ParquetRecordReader parquetRecordReader; + + + /** +* Read parquet files with given parquet file schema. +* +* @param path The path of the file to read. +* @param messageType schema of parquet file +*/ + + protected ParquetInputFormat(Path path, MessageType messageType) { + super(path); + this.expectedFileSchema = checkNotNull(messageType, "messageType"); + RowTypeInfo rowTypeInfo = (RowTypeInfo) ParquetSchemaConverter.fromParquetType(expectedFileSchema); + this.fieldTypes = rowTypeInfo.getFieldTypes(); + this.fieldNames = rowTypeInfo.getFieldNames(); + // read whole parquet file as one file split + this.unsplittable = true; + } + + @Override + public void configure(Configuration parameters) { + super.configure(parameters); + parquetRecordReader.setSkipCorruptedRecord(this.skipCorruptedRecord); + } + + public void selectFields(String[] fieldNames) { + checkNotNull(fieldNames, "fieldNames"); + this.fieldNames = fieldNames; + RowTypeInfo rowTypeInfo = (RowTypeInfo)
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16651999#comment-16651999 ] ASF GitHub Bot commented on FLINK-7243: --- HuangZhenQiu commented on a change in pull request #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483#discussion_r225605353 ## File path: flink-formats/flink-parquet/src/main/java/org/apache/flink/formats/parquet/utils/ParquetRecordReader.java ## @@ -0,0 +1,196 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.formats.parquet.utils; + +import org.apache.hadoop.conf.Configuration; +import org.apache.parquet.column.page.PageReadStore; +import org.apache.parquet.filter2.compat.FilterCompat; +import org.apache.parquet.filter2.compat.FilterCompat.Filter; +import org.apache.parquet.hadoop.ParquetFileReader; +import org.apache.parquet.hadoop.api.InitContext; +import org.apache.parquet.hadoop.api.ReadSupport; +import org.apache.parquet.hadoop.metadata.FileMetaData; +import org.apache.parquet.io.ColumnIOFactory; +import org.apache.parquet.io.MessageColumnIO; +import org.apache.parquet.io.RecordReader; +import org.apache.parquet.io.api.RecordMaterializer; +import org.apache.parquet.io.api.RecordMaterializer.RecordMaterializationException; +import org.apache.parquet.schema.MessageType; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.annotation.CheckReturnValue; +import javax.annotation.meta.When; + +import java.io.IOException; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; + +import static org.apache.parquet.Preconditions.checkNotNull; + +/** + * Customized {@link org.apache.parquet.hadoop.ParquetRecordReader} that support start read from particular position. + */ +public class ParquetRecordReader { + private static final Logger LOG = LoggerFactory.getLogger(ParquetRecordReader.class); + + private ColumnIOFactory columnIOFactory; + private Filter filter; + + private MessageType readSchema; + private MessageType fileSchema; + private ReadSupport readSupport; + + private RecordMaterializer recordMaterializer; Review comment: Done. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. 
For queries about this service, please contact Infrastructure at: us...@infra.apache.org > Add ParquetInputFormat > -- > > Key: FLINK-7243 > URL: https://issues.apache.org/jira/browse/FLINK-7243 > Project: Flink > Issue Type: Sub-task > Components: Table API SQL >Reporter: godfrey he >Assignee: Zhenqiu Huang >Priority: Major > Labels: pull-request-available > > Add a {{ParquetInputFormat}} to read data from a Apache Parquet file. -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16651997#comment-16651997 ] ASF GitHub Bot commented on FLINK-7243: --- HuangZhenQiu commented on a change in pull request #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483#discussion_r225604644 ## File path: flink-formats/flink-parquet/src/main/java/org/apache/flink/formats/parquet/ParquetInputFormat.java ## @@ -0,0 +1,241 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.formats.parquet; + +import org.apache.flink.api.common.io.CheckpointableInputFormat; +import org.apache.flink.api.common.io.FileInputFormat; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.java.tuple.Tuple2; +import org.apache.flink.api.java.typeutils.RowTypeInfo; +import org.apache.flink.core.fs.FileInputSplit; +import org.apache.flink.core.fs.Path; +import org.apache.flink.formats.parquet.utils.ParquetRecordReader; +import org.apache.flink.formats.parquet.utils.ParquetSchemaConverter; +import org.apache.flink.formats.parquet.utils.RowReadSupport; +import org.apache.flink.metrics.Counter; +import org.apache.flink.types.Row; +import org.apache.flink.util.Preconditions; + +import org.apache.parquet.ParquetReadOptions; +import org.apache.parquet.filter2.compat.FilterCompat; +import org.apache.parquet.hadoop.ParquetFileReader; +import org.apache.parquet.hadoop.util.HadoopInputFile; +import org.apache.parquet.io.InputFile; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.Type; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +/** + * The base InputFormat class to read from Parquet files. + * For specific return types the {@link #convert(Row)} method need to be implemented. + * + * Using {@link ParquetRecordReader} to read files instead of {@link org.apache.flink.core.fs.FSDataInputStream}, + * we override {@link #open(FileInputSplit)} and {@link #close()} to change the behaviors. + * + * @param The type of record to read. 
+ */ +public abstract class ParquetInputFormat + extends FileInputFormat + implements CheckpointableInputFormat> { + + private static final long serialVersionUID = 1L; + + private static final Logger LOG = LoggerFactory.getLogger(ParquetInputFormat.class); + + private transient Counter recordConsumed; + + private final TypeInformation[] fieldTypes; + + private final String[] fieldNames; + + private boolean skipThisSplit = false; + + private transient ParquetRecordReader parquetRecordReader; + + private transient long recordsReadSinceLastSync; + + private long lastSyncedBlock = -1L; + + /** +* Read parquet files with given result parquet schema. +* +* @param path The path of the file to read. +* @param messageType schema of read result +*/ + + protected ParquetInputFormat(Path path, MessageType messageType) { + super(path); + RowTypeInfo readType = (RowTypeInfo) ParquetSchemaConverter.fromParquetType(messageType); + this.fieldTypes = readType.getFieldTypes(); + this.fieldNames = readType.getFieldNames(); + // read whole parquet file as one file split + this.unsplittable = true; + } + + /** +* Read parquet files with given result field names and types. +* +* @param path The path of the file to read. +* @param fieldTypes field types of read result of fields +* @param fieldNames field names to read, which can be subset of the parquet schema +*/ + protected ParquetInputFormat(Path path, TypeInformation[] fieldTypes, String[] fieldNames) { + super(path); + this.fieldTypes = fieldTypes; + this.fieldNames = fieldNames; + // read whole parquet file as one file split + this.unsplittable =
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16651998#comment-16651998 ] ASF GitHub Bot commented on FLINK-7243: --- HuangZhenQiu commented on a change in pull request #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483#discussion_r225604086 ## File path: flink-formats/flink-parquet/src/main/java/org/apache/flink/formats/parquet/utils/ParquetSchemaConverter.java ## @@ -0,0 +1,242 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.formats.parquet.utils; + +import org.apache.flink.api.common.typeinfo.BasicArrayTypeInfo; +import org.apache.flink.api.common.typeinfo.BasicTypeInfo; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.java.typeutils.MapTypeInfo; +import org.apache.flink.api.java.typeutils.ObjectArrayTypeInfo; +import org.apache.flink.api.java.typeutils.RowTypeInfo; + +import org.apache.parquet.avro.AvroSchemaConverter; +import org.apache.parquet.schema.GroupType; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.OriginalType; +import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.Type; +import org.apache.parquet.schema.Types; + +import java.lang.reflect.Array; +import java.util.ArrayList; +import java.util.List; + +/** + * Schema converter converts Parquet schema to and from Flink internal types. + */ +public class ParquetSchemaConverter { + public static final String MAP_KEY = "key"; + public static final String MAP_VALUE = "value"; + public static final String LIST_ELEMENT = "array"; + public static final String MESSAGE_ROOT = "root"; + private static final AvroSchemaConverter SCHEMA_CONVERTER = new AvroSchemaConverter(); + + public static TypeInformation fromParquetType(MessageType type) { + return convertFields(type.getFields()); + } + + public static MessageType toParquetType(TypeInformation typeInformation) { + return (MessageType) convertField(null, typeInformation, Type.Repetition.OPTIONAL); + } + + private static TypeInformation convertFields(List parquetFields) { + List> types = new ArrayList<>(); + List names = new ArrayList<>(); + for (Type field : parquetFields) { + TypeInformation subType = convertField(field); + if (subType != null) { + types.add(subType); + names.add(field.getName()); + } + } + + return new RowTypeInfo(types.toArray(new TypeInformation[types.size()]), + names.toArray(new String[names.size()])); + } + + private static 
TypeInformation convertField(final Type fieldType) { + TypeInformation typeInfo = null; + if (fieldType.isPrimitive()) { + PrimitiveType primitiveType = fieldType.asPrimitiveType(); Review comment: @twalthr Agree. It is needed. Added explicit type conversion for all Parquet types to Flink types. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > Add ParquetInputFormat > -- > > Key: FLINK-7243 > URL: https://issues.apache.org/jira/browse/FLINK-7243 > Project: Flink > Issue Type: Sub-task > Components: Table API SQL >Reporter: godfrey he >Assignee: Zhenqiu Huang >Priority: Major > Labels: pull-request-available > > Add a {{ParquetInputFormat}} to read data from a Apache Parquet file. -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16651981#comment-16651981 ] ASF GitHub Bot commented on FLINK-7243: --- HuangZhenQiu commented on a change in pull request #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483#discussion_r225603693 ## File path: flink-formats/flink-parquet/src/main/java/org/apache/flink/formats/parquet/utils/ParquetRecordReader.java ## @@ -0,0 +1,196 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.formats.parquet.utils; + +import org.apache.hadoop.conf.Configuration; +import org.apache.parquet.column.page.PageReadStore; +import org.apache.parquet.filter2.compat.FilterCompat; +import org.apache.parquet.filter2.compat.FilterCompat.Filter; +import org.apache.parquet.hadoop.ParquetFileReader; +import org.apache.parquet.hadoop.api.InitContext; +import org.apache.parquet.hadoop.api.ReadSupport; +import org.apache.parquet.hadoop.metadata.FileMetaData; +import org.apache.parquet.io.ColumnIOFactory; +import org.apache.parquet.io.MessageColumnIO; +import org.apache.parquet.io.RecordReader; +import org.apache.parquet.io.api.RecordMaterializer; +import org.apache.parquet.io.api.RecordMaterializer.RecordMaterializationException; +import org.apache.parquet.schema.MessageType; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.annotation.CheckReturnValue; +import javax.annotation.meta.When; + +import java.io.IOException; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; + +import static org.apache.parquet.Preconditions.checkNotNull; + +/** + * Customized {@link org.apache.parquet.hadoop.ParquetRecordReader} that support start read from particular position. 
+ */ +public class ParquetRecordReader { + private static final Logger LOG = LoggerFactory.getLogger(ParquetRecordReader.class); + + private ColumnIOFactory columnIOFactory; + private Filter filter; + + private MessageType readSchema; + private MessageType fileSchema; + private ReadSupport readSupport; + + private RecordMaterializer recordMaterializer; + private T currentValue; + private long total; + private long current = 0; + private int currentBlock = -1; + private ParquetFileReader reader; + private RecordReader recordReader; + private boolean strictTypeChecking = true; + private long totalCountLoadedSoFar = 0; + + public ParquetRecordReader(ReadSupport readSupport, MessageType readSchema, Filter filter) { + this.readSupport = readSupport; + this.readSchema = readSchema; + this.filter = checkNotNull(filter, "filter"); + } + + public ParquetRecordReader(ReadSupport readSupport, MessageType readSchema) { + this(readSupport, readSchema, FilterCompat.NOOP); + } + + public void initialize(ParquetFileReader reader, Configuration configuration) { + this.reader = reader; + FileMetaData parquetFileMetadata = reader.getFooter().getFileMetaData(); + this.fileSchema = parquetFileMetadata.getSchema(); + Map fileMetadata = parquetFileMetadata.getKeyValueMetaData(); + ReadSupport.ReadContext readContext = readSupport.init(new InitContext( + configuration, toSetMultiMap(fileMetadata), readSchema)); + + this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy()); + this.readSchema = readContext.getRequestedSchema(); + this.recordMaterializer = readSupport.prepareForRead( + configuration, fileMetadata, readSchema, readContext); + this.total = reader.getRecordCount(); + reader.setRequestedSchema(readSchema); + } + + private void checkRead() throws IOException { + if (current == totalCountLoadedSoFar) { + PageReadStore pages =
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16651980#comment-16651980 ] ASF GitHub Bot commented on FLINK-7243: --- HuangZhenQiu commented on a change in pull request #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483#discussion_r225603575 ## File path: flink-formats/flink-parquet/src/main/java/org/apache/flink/formats/parquet/utils/ParquetRecordReader.java ## @@ -0,0 +1,196 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.formats.parquet.utils; + +import org.apache.hadoop.conf.Configuration; +import org.apache.parquet.column.page.PageReadStore; +import org.apache.parquet.filter2.compat.FilterCompat; +import org.apache.parquet.filter2.compat.FilterCompat.Filter; +import org.apache.parquet.hadoop.ParquetFileReader; +import org.apache.parquet.hadoop.api.InitContext; +import org.apache.parquet.hadoop.api.ReadSupport; +import org.apache.parquet.hadoop.metadata.FileMetaData; +import org.apache.parquet.io.ColumnIOFactory; +import org.apache.parquet.io.MessageColumnIO; +import org.apache.parquet.io.RecordReader; +import org.apache.parquet.io.api.RecordMaterializer; +import org.apache.parquet.io.api.RecordMaterializer.RecordMaterializationException; +import org.apache.parquet.schema.MessageType; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.annotation.CheckReturnValue; +import javax.annotation.meta.When; + +import java.io.IOException; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; + +import static org.apache.parquet.Preconditions.checkNotNull; + +/** + * Customized {@link org.apache.parquet.hadoop.ParquetRecordReader} that support start read from particular position. 
+ */ +public class ParquetRecordReader { + private static final Logger LOG = LoggerFactory.getLogger(ParquetRecordReader.class); + + private ColumnIOFactory columnIOFactory; + private Filter filter; + + private MessageType readSchema; + private MessageType fileSchema; + private ReadSupport readSupport; + + private RecordMaterializer recordMaterializer; + private T currentValue; + private long total; + private long current = 0; + private int currentBlock = -1; + private ParquetFileReader reader; + private RecordReader recordReader; + private boolean strictTypeChecking = true; + private long totalCountLoadedSoFar = 0; + + public ParquetRecordReader(ReadSupport readSupport, MessageType readSchema, Filter filter) { + this.readSupport = readSupport; + this.readSchema = readSchema; + this.filter = checkNotNull(filter, "filter"); + } + + public ParquetRecordReader(ReadSupport readSupport, MessageType readSchema) { + this(readSupport, readSchema, FilterCompat.NOOP); + } + + public void initialize(ParquetFileReader reader, Configuration configuration) { + this.reader = reader; + FileMetaData parquetFileMetadata = reader.getFooter().getFileMetaData(); + this.fileSchema = parquetFileMetadata.getSchema(); + Map fileMetadata = parquetFileMetadata.getKeyValueMetaData(); + ReadSupport.ReadContext readContext = readSupport.init(new InitContext( + configuration, toSetMultiMap(fileMetadata), readSchema)); + + this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy()); + this.readSchema = readContext.getRequestedSchema(); + this.recordMaterializer = readSupport.prepareForRead( + configuration, fileMetadata, readSchema, readContext); + this.total = reader.getRecordCount(); + reader.setRequestedSchema(readSchema); + } + + private void checkRead() throws IOException { + if (current == totalCountLoadedSoFar) { + PageReadStore pages =
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16651975#comment-16651975 ] ASF GitHub Bot commented on FLINK-7243: --- HuangZhenQiu commented on a change in pull request #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483#discussion_r225602641 ## File path: flink-formats/flink-parquet/src/main/java/org/apache/flink/formats/parquet/utils/ParquetRecordReader.java ## @@ -0,0 +1,196 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.formats.parquet.utils; + +import org.apache.hadoop.conf.Configuration; +import org.apache.parquet.column.page.PageReadStore; +import org.apache.parquet.filter2.compat.FilterCompat; +import org.apache.parquet.filter2.compat.FilterCompat.Filter; +import org.apache.parquet.hadoop.ParquetFileReader; +import org.apache.parquet.hadoop.api.InitContext; +import org.apache.parquet.hadoop.api.ReadSupport; +import org.apache.parquet.hadoop.metadata.FileMetaData; +import org.apache.parquet.io.ColumnIOFactory; +import org.apache.parquet.io.MessageColumnIO; +import org.apache.parquet.io.RecordReader; +import org.apache.parquet.io.api.RecordMaterializer; +import org.apache.parquet.io.api.RecordMaterializer.RecordMaterializationException; +import org.apache.parquet.schema.MessageType; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.annotation.CheckReturnValue; +import javax.annotation.meta.When; + +import java.io.IOException; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; + +import static org.apache.parquet.Preconditions.checkNotNull; + +/** + * Customized {@link org.apache.parquet.hadoop.ParquetRecordReader} that support start read from particular position. 
+ */ +public class ParquetRecordReader { + private static final Logger LOG = LoggerFactory.getLogger(ParquetRecordReader.class); + + private ColumnIOFactory columnIOFactory; + private Filter filter; + + private MessageType readSchema; + private MessageType fileSchema; + private ReadSupport readSupport; + + private RecordMaterializer recordMaterializer; + private T currentValue; + private long total; + private long current = 0; + private int currentBlock = -1; + private ParquetFileReader reader; + private RecordReader recordReader; + private boolean strictTypeChecking = true; + private long totalCountLoadedSoFar = 0; + + public ParquetRecordReader(ReadSupport readSupport, MessageType readSchema, Filter filter) { + this.readSupport = readSupport; + this.readSchema = readSchema; + this.filter = checkNotNull(filter, "filter"); + } + + public ParquetRecordReader(ReadSupport readSupport, MessageType readSchema) { + this(readSupport, readSchema, FilterCompat.NOOP); + } + + public void initialize(ParquetFileReader reader, Configuration configuration) { + this.reader = reader; + FileMetaData parquetFileMetadata = reader.getFooter().getFileMetaData(); + this.fileSchema = parquetFileMetadata.getSchema(); + Map fileMetadata = parquetFileMetadata.getKeyValueMetaData(); + ReadSupport.ReadContext readContext = readSupport.init(new InitContext( + configuration, toSetMultiMap(fileMetadata), readSchema)); + + this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy()); + this.readSchema = readContext.getRequestedSchema(); + this.recordMaterializer = readSupport.prepareForRead( + configuration, fileMetadata, readSchema, readContext); + this.total = reader.getRecordCount(); + reader.setRequestedSchema(readSchema); + } + + private void checkRead() throws IOException { + if (current == totalCountLoadedSoFar) { + PageReadStore pages =
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16651966#comment-16651966 ] ASF GitHub Bot commented on FLINK-7243: --- HuangZhenQiu commented on a change in pull request #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483#discussion_r225601083 ## File path: flink-formats/flink-parquet/src/main/java/org/apache/flink/formats/parquet/utils/ParquetRecordReader.java ## @@ -0,0 +1,196 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.formats.parquet.utils; + +import org.apache.hadoop.conf.Configuration; +import org.apache.parquet.column.page.PageReadStore; +import org.apache.parquet.filter2.compat.FilterCompat; +import org.apache.parquet.filter2.compat.FilterCompat.Filter; +import org.apache.parquet.hadoop.ParquetFileReader; +import org.apache.parquet.hadoop.api.InitContext; +import org.apache.parquet.hadoop.api.ReadSupport; +import org.apache.parquet.hadoop.metadata.FileMetaData; +import org.apache.parquet.io.ColumnIOFactory; +import org.apache.parquet.io.MessageColumnIO; +import org.apache.parquet.io.RecordReader; +import org.apache.parquet.io.api.RecordMaterializer; +import org.apache.parquet.io.api.RecordMaterializer.RecordMaterializationException; +import org.apache.parquet.schema.MessageType; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.annotation.CheckReturnValue; +import javax.annotation.meta.When; + +import java.io.IOException; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; + +import static org.apache.parquet.Preconditions.checkNotNull; + +/** + * Customized {@link org.apache.parquet.hadoop.ParquetRecordReader} that support start read from particular position. 
+ */ +public class ParquetRecordReader { + private static final Logger LOG = LoggerFactory.getLogger(ParquetRecordReader.class); + + private ColumnIOFactory columnIOFactory; + private Filter filter; + + private MessageType readSchema; + private MessageType fileSchema; + private ReadSupport readSupport; + + private RecordMaterializer recordMaterializer; + private T currentValue; + private long total; + private long current = 0; + private int currentBlock = -1; + private ParquetFileReader reader; + private RecordReader recordReader; + private boolean strictTypeChecking = true; + private long totalCountLoadedSoFar = 0; + + public ParquetRecordReader(ReadSupport readSupport, MessageType readSchema, Filter filter) { + this.readSupport = readSupport; + this.readSchema = readSchema; + this.filter = checkNotNull(filter, "filter"); + } + + public ParquetRecordReader(ReadSupport readSupport, MessageType readSchema) { + this(readSupport, readSchema, FilterCompat.NOOP); + } + + public void initialize(ParquetFileReader reader, Configuration configuration) { + this.reader = reader; + FileMetaData parquetFileMetadata = reader.getFooter().getFileMetaData(); + this.fileSchema = parquetFileMetadata.getSchema(); Review comment: There are some duplication. I simplified it, This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > Add ParquetInputFormat > -- > > Key: FLINK-7243 > URL: https://issues.apache.org/jira/browse/FLINK-7243 > Project: Flink > Issue Type: Sub-task > Components: Table API SQL >Reporter: godfrey he >Assignee: Zhenqiu Huang >Priority: Major > Labels: pull-request-available > > Add a {{ParquetInputFormat}} to read data from a Apache
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16648787#comment-16648787 ] ASF GitHub Bot commented on FLINK-7243: --- HuangZhenQiu commented on issue #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483#issuecomment-429517418 @fhueske Added a unit test for the failure recovery logic. Please review it again after the Travis check turns green. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > Add ParquetInputFormat > -- > > Key: FLINK-7243 > URL: https://issues.apache.org/jira/browse/FLINK-7243 > Project: Flink > Issue Type: Sub-task > Components: Table API SQL >Reporter: godfrey he >Assignee: Zhenqiu Huang >Priority: Major > Labels: pull-request-available > > Add a {{ParquetInputFormat}} to read data from an Apache Parquet file. -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16647573#comment-16647573 ] ASF GitHub Bot commented on FLINK-7243: --- HuangZhenQiu commented on issue #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483#issuecomment-429234463 @fhueske Thanks for the review. Resolved all of the comments except unit tests for the checkpointing logic. 1) For the question of "instead of always reading as Row and from there converting to the other types?" In Parquet's interface, a converter is needed for each type of result. A record can be converted to a row by recursively putting children at a particular index, but a Map has to do it by key. To reduce code duplication, I use the row as an intermediate representation. So type conversion can be put in a subclass of ParquetInputFormat. 2) I will add a unit test for the checkpoint logic tomorrow night. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > Add ParquetInputFormat > -- > > Key: FLINK-7243 > URL: https://issues.apache.org/jira/browse/FLINK-7243 > Project: Flink > Issue Type: Sub-task > Components: Table API SQL >Reporter: godfrey he >Assignee: Zhenqiu Huang >Priority: Major > Labels: pull-request-available > > Add a {{ParquetInputFormat}} to read data from an Apache Parquet file. -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16646085#comment-16646085 ] ASF GitHub Bot commented on FLINK-7243: --- twalthr commented on a change in pull request #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483#discussion_r224349110 ## File path: flink-formats/flink-parquet/src/main/java/org/apache/flink/formats/parquet/utils/ParquetSchemaConverter.java ## @@ -0,0 +1,242 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.formats.parquet.utils; + +import org.apache.flink.api.common.typeinfo.BasicArrayTypeInfo; +import org.apache.flink.api.common.typeinfo.BasicTypeInfo; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.java.typeutils.MapTypeInfo; +import org.apache.flink.api.java.typeutils.ObjectArrayTypeInfo; +import org.apache.flink.api.java.typeutils.RowTypeInfo; + +import org.apache.parquet.avro.AvroSchemaConverter; +import org.apache.parquet.schema.GroupType; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.OriginalType; +import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.Type; +import org.apache.parquet.schema.Types; + +import java.lang.reflect.Array; +import java.util.ArrayList; +import java.util.List; + +/** + * Schema converter converts Parquet schema to and from Flink internal types. + */ +public class ParquetSchemaConverter { + public static final String MAP_KEY = "key"; + public static final String MAP_VALUE = "value"; + public static final String LIST_ELEMENT = "array"; + public static final String MESSAGE_ROOT = "root"; + private static final AvroSchemaConverter SCHEMA_CONVERTER = new AvroSchemaConverter(); + + public static TypeInformation fromParquetType(MessageType type) { + return convertFields(type.getFields()); + } + + public static MessageType toParquetType(TypeInformation typeInformation) { + return (MessageType) convertField(null, typeInformation, Type.Repetition.OPTIONAL); + } + + private static TypeInformation convertFields(List parquetFields) { + List> types = new ArrayList<>(); + List names = new ArrayList<>(); + for (Type field : parquetFields) { + TypeInformation subType = convertField(field); + if (subType != null) { + types.add(subType); + names.add(field.getName()); + } + } + + return new RowTypeInfo(types.toArray(new TypeInformation[types.size()]), + names.toArray(new String[names.size()])); + } + + private static 
TypeInformation convertField(final Type fieldType) { + TypeInformation typeInfo = null; + if (fieldType.isPrimitive()) { + PrimitiveType primitiveType = fieldType.asPrimitiveType(); Review comment: IMHO I think we should aim to map all Parquet types to Flink types. Otherwise the integration is not end-to-end and requires a lot of customization nevertheless. New supported formats such as ORC, Avro, or JSON support all types. Which makes the connector robust. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > Add ParquetInputFormat > -- > > Key: FLINK-7243 > URL: https://issues.apache.org/jira/browse/FLINK-7243 > Project: Flink > Issue Type: Sub-task > Components: Table API SQL >Reporter: godfrey he >Assignee: Zhenqiu Huang >Priority: Major > Labels: pull-request-available > > Add a
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16645678#comment-16645678 ] ASF GitHub Bot commented on FLINK-7243: --- fhueske commented on a change in pull request #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483#discussion_r224259148 ## File path: flink-formats/flink-parquet/src/main/java/org/apache/flink/formats/parquet/ParquetInputFormat.java ## @@ -0,0 +1,241 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.formats.parquet; + +import org.apache.flink.api.common.io.CheckpointableInputFormat; +import org.apache.flink.api.common.io.FileInputFormat; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.java.tuple.Tuple2; +import org.apache.flink.api.java.typeutils.RowTypeInfo; +import org.apache.flink.core.fs.FileInputSplit; +import org.apache.flink.core.fs.Path; +import org.apache.flink.formats.parquet.utils.ParquetRecordReader; +import org.apache.flink.formats.parquet.utils.ParquetSchemaConverter; +import org.apache.flink.formats.parquet.utils.RowReadSupport; +import org.apache.flink.metrics.Counter; +import org.apache.flink.types.Row; +import org.apache.flink.util.Preconditions; + +import org.apache.parquet.ParquetReadOptions; +import org.apache.parquet.filter2.compat.FilterCompat; +import org.apache.parquet.hadoop.ParquetFileReader; +import org.apache.parquet.hadoop.util.HadoopInputFile; +import org.apache.parquet.io.InputFile; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.Type; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +/** + * The base InputFormat class to read from Parquet files. + * For specific return types the {@link #convert(Row)} method need to be implemented. + * + * Using {@link ParquetRecordReader} to read files instead of {@link org.apache.flink.core.fs.FSDataInputStream}, + * we override {@link #open(FileInputSplit)} and {@link #close()} to change the behaviors. + * + * @param The type of record to read. 
+ */ +public abstract class ParquetInputFormat + extends FileInputFormat + implements CheckpointableInputFormat> { + + private static final long serialVersionUID = 1L; + + private static final Logger LOG = LoggerFactory.getLogger(ParquetInputFormat.class); + + private transient Counter recordConsumed; + + private final TypeInformation[] fieldTypes; + + private final String[] fieldNames; + + private boolean skipThisSplit = false; + + private transient ParquetRecordReader parquetRecordReader; + + private transient long recordsReadSinceLastSync; + + private long lastSyncedBlock = -1L; + + /** +* Read parquet files with given result parquet schema. +* +* @param path The path of the file to read. +* @param messageType schema of read result +*/ + + protected ParquetInputFormat(Path path, MessageType messageType) { + super(path); + RowTypeInfo readType = (RowTypeInfo) ParquetSchemaConverter.fromParquetType(messageType); + this.fieldTypes = readType.getFieldTypes(); + this.fieldNames = readType.getFieldNames(); + // read whole parquet file as one file split + this.unsplittable = true; + } + + /** +* Read parquet files with given result field names and types. +* +* @param path The path of the file to read. +* @param fieldTypes field types of read result of fields +* @param fieldNames field names to read, which can be subset of the parquet schema +*/ + protected ParquetInputFormat(Path path, TypeInformation[] fieldTypes, String[] fieldNames) { + super(path); + this.fieldTypes = fieldTypes; + this.fieldNames = fieldNames; + // read whole parquet file as one file split + this.unsplittable = true;
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16645670#comment-16645670 ] ASF GitHub Bot commented on FLINK-7243: --- fhueske commented on a change in pull request #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483#discussion_r224251082 ## File path: flink-formats/flink-parquet/src/main/java/org/apache/flink/formats/parquet/utils/ParquetRecordReader.java ## @@ -0,0 +1,196 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.formats.parquet.utils; + +import org.apache.hadoop.conf.Configuration; +import org.apache.parquet.column.page.PageReadStore; +import org.apache.parquet.filter2.compat.FilterCompat; +import org.apache.parquet.filter2.compat.FilterCompat.Filter; +import org.apache.parquet.hadoop.ParquetFileReader; +import org.apache.parquet.hadoop.api.InitContext; +import org.apache.parquet.hadoop.api.ReadSupport; +import org.apache.parquet.hadoop.metadata.FileMetaData; +import org.apache.parquet.io.ColumnIOFactory; +import org.apache.parquet.io.MessageColumnIO; +import org.apache.parquet.io.RecordReader; +import org.apache.parquet.io.api.RecordMaterializer; +import org.apache.parquet.io.api.RecordMaterializer.RecordMaterializationException; +import org.apache.parquet.schema.MessageType; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.annotation.CheckReturnValue; +import javax.annotation.meta.When; + +import java.io.IOException; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; + +import static org.apache.parquet.Preconditions.checkNotNull; + +/** + * Customized {@link org.apache.parquet.hadoop.ParquetRecordReader} that support start read from particular position. 
+ */ +public class ParquetRecordReader { + private static final Logger LOG = LoggerFactory.getLogger(ParquetRecordReader.class); + + private ColumnIOFactory columnIOFactory; + private Filter filter; + + private MessageType readSchema; + private MessageType fileSchema; + private ReadSupport readSupport; + + private RecordMaterializer recordMaterializer; + private T currentValue; + private long total; + private long current = 0; + private int currentBlock = -1; + private ParquetFileReader reader; + private RecordReader recordReader; + private boolean strictTypeChecking = true; + private long totalCountLoadedSoFar = 0; + + public ParquetRecordReader(ReadSupport readSupport, MessageType readSchema, Filter filter) { + this.readSupport = readSupport; + this.readSchema = readSchema; + this.filter = checkNotNull(filter, "filter"); + } + + public ParquetRecordReader(ReadSupport readSupport, MessageType readSchema) { + this(readSupport, readSchema, FilterCompat.NOOP); + } + + public void initialize(ParquetFileReader reader, Configuration configuration) { + this.reader = reader; + FileMetaData parquetFileMetadata = reader.getFooter().getFileMetaData(); + this.fileSchema = parquetFileMetadata.getSchema(); + Map fileMetadata = parquetFileMetadata.getKeyValueMetaData(); + ReadSupport.ReadContext readContext = readSupport.init(new InitContext( + configuration, toSetMultiMap(fileMetadata), readSchema)); + + this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy()); + this.readSchema = readContext.getRequestedSchema(); + this.recordMaterializer = readSupport.prepareForRead( + configuration, fileMetadata, readSchema, readContext); + this.total = reader.getRecordCount(); + reader.setRequestedSchema(readSchema); + } + + private void checkRead() throws IOException { + if (current == totalCountLoadedSoFar) { + PageReadStore pages =
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16645674#comment-16645674 ] ASF GitHub Bot commented on FLINK-7243: --- fhueske commented on a change in pull request #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483#discussion_r224244074 ## File path: flink-formats/flink-parquet/src/main/java/org/apache/flink/formats/parquet/utils/ParquetRecordReader.java ## @@ -0,0 +1,196 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.formats.parquet.utils; + +import org.apache.hadoop.conf.Configuration; +import org.apache.parquet.column.page.PageReadStore; +import org.apache.parquet.filter2.compat.FilterCompat; +import org.apache.parquet.filter2.compat.FilterCompat.Filter; +import org.apache.parquet.hadoop.ParquetFileReader; +import org.apache.parquet.hadoop.api.InitContext; +import org.apache.parquet.hadoop.api.ReadSupport; +import org.apache.parquet.hadoop.metadata.FileMetaData; +import org.apache.parquet.io.ColumnIOFactory; +import org.apache.parquet.io.MessageColumnIO; +import org.apache.parquet.io.RecordReader; +import org.apache.parquet.io.api.RecordMaterializer; +import org.apache.parquet.io.api.RecordMaterializer.RecordMaterializationException; +import org.apache.parquet.schema.MessageType; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.annotation.CheckReturnValue; +import javax.annotation.meta.When; + +import java.io.IOException; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; + +import static org.apache.parquet.Preconditions.checkNotNull; + +/** + * Customized {@link org.apache.parquet.hadoop.ParquetRecordReader} that support start read from particular position. 
+ */ +public class ParquetRecordReader { + private static final Logger LOG = LoggerFactory.getLogger(ParquetRecordReader.class); + + private ColumnIOFactory columnIOFactory; + private Filter filter; + + private MessageType readSchema; + private MessageType fileSchema; + private ReadSupport readSupport; + + private RecordMaterializer recordMaterializer; + private T currentValue; + private long total; + private long current = 0; + private int currentBlock = -1; + private ParquetFileReader reader; + private RecordReader recordReader; + private boolean strictTypeChecking = true; + private long totalCountLoadedSoFar = 0; + + public ParquetRecordReader(ReadSupport readSupport, MessageType readSchema, Filter filter) { + this.readSupport = readSupport; + this.readSchema = readSchema; + this.filter = checkNotNull(filter, "filter"); Review comment: Flink has its own `org.apache.flink.util.Preconditions` class that could be used here This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > Add ParquetInputFormat > -- > > Key: FLINK-7243 > URL: https://issues.apache.org/jira/browse/FLINK-7243 > Project: Flink > Issue Type: Sub-task > Components: Table API SQL >Reporter: godfrey he >Assignee: Zhenqiu Huang >Priority: Major > Labels: pull-request-available > > Add a {{ParquetInputFormat}} to read data from a Apache Parquet file. -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16645673#comment-16645673 ] ASF GitHub Bot commented on FLINK-7243: --- fhueske commented on a change in pull request #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483#discussion_r224262756 ## File path: flink-formats/flink-parquet/src/main/java/org/apache/flink/formats/parquet/utils/ParquetRecordReader.java ## @@ -0,0 +1,196 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.formats.parquet.utils; + +import org.apache.hadoop.conf.Configuration; +import org.apache.parquet.column.page.PageReadStore; +import org.apache.parquet.filter2.compat.FilterCompat; +import org.apache.parquet.filter2.compat.FilterCompat.Filter; +import org.apache.parquet.hadoop.ParquetFileReader; +import org.apache.parquet.hadoop.api.InitContext; +import org.apache.parquet.hadoop.api.ReadSupport; +import org.apache.parquet.hadoop.metadata.FileMetaData; +import org.apache.parquet.io.ColumnIOFactory; +import org.apache.parquet.io.MessageColumnIO; +import org.apache.parquet.io.RecordReader; +import org.apache.parquet.io.api.RecordMaterializer; +import org.apache.parquet.io.api.RecordMaterializer.RecordMaterializationException; +import org.apache.parquet.schema.MessageType; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.annotation.CheckReturnValue; +import javax.annotation.meta.When; + +import java.io.IOException; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; + +import static org.apache.parquet.Preconditions.checkNotNull; + +/** + * Customized {@link org.apache.parquet.hadoop.ParquetRecordReader} that support start read from particular position. 
+ */ +public class ParquetRecordReader { + private static final Logger LOG = LoggerFactory.getLogger(ParquetRecordReader.class); + + private ColumnIOFactory columnIOFactory; + private Filter filter; + + private MessageType readSchema; + private MessageType fileSchema; + private ReadSupport readSupport; + + private RecordMaterializer recordMaterializer; + private T currentValue; + private long total; + private long current = 0; + private int currentBlock = -1; + private ParquetFileReader reader; + private RecordReader recordReader; + private boolean strictTypeChecking = true; + private long totalCountLoadedSoFar = 0; + + public ParquetRecordReader(ReadSupport readSupport, MessageType readSchema, Filter filter) { + this.readSupport = readSupport; + this.readSchema = readSchema; + this.filter = checkNotNull(filter, "filter"); + } + + public ParquetRecordReader(ReadSupport readSupport, MessageType readSchema) { + this(readSupport, readSchema, FilterCompat.NOOP); + } + + public void initialize(ParquetFileReader reader, Configuration configuration) { + this.reader = reader; + FileMetaData parquetFileMetadata = reader.getFooter().getFileMetaData(); + this.fileSchema = parquetFileMetadata.getSchema(); + Map fileMetadata = parquetFileMetadata.getKeyValueMetaData(); + ReadSupport.ReadContext readContext = readSupport.init(new InitContext( + configuration, toSetMultiMap(fileMetadata), readSchema)); + + this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy()); + this.readSchema = readContext.getRequestedSchema(); + this.recordMaterializer = readSupport.prepareForRead( + configuration, fileMetadata, readSchema, readContext); + this.total = reader.getRecordCount(); + reader.setRequestedSchema(readSchema); + } + + private void checkRead() throws IOException { + if (current == totalCountLoadedSoFar) { + PageReadStore pages =
[jira] [Commented] (FLINK-7243) Add ParquetInputFormat
[ https://issues.apache.org/jira/browse/FLINK-7243?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16645677#comment-16645677 ] ASF GitHub Bot commented on FLINK-7243: --- fhueske commented on a change in pull request #6483: [FLINK-7243][flink-formats] Add parquet input format URL: https://github.com/apache/flink/pull/6483#discussion_r224265721 ## File path: flink-formats/flink-parquet/src/main/java/org/apache/flink/formats/parquet/utils/ParquetRecordReader.java ## @@ -0,0 +1,196 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.formats.parquet.utils; + +import org.apache.hadoop.conf.Configuration; +import org.apache.parquet.column.page.PageReadStore; +import org.apache.parquet.filter2.compat.FilterCompat; +import org.apache.parquet.filter2.compat.FilterCompat.Filter; +import org.apache.parquet.hadoop.ParquetFileReader; +import org.apache.parquet.hadoop.api.InitContext; +import org.apache.parquet.hadoop.api.ReadSupport; +import org.apache.parquet.hadoop.metadata.FileMetaData; +import org.apache.parquet.io.ColumnIOFactory; +import org.apache.parquet.io.MessageColumnIO; +import org.apache.parquet.io.RecordReader; +import org.apache.parquet.io.api.RecordMaterializer; +import org.apache.parquet.io.api.RecordMaterializer.RecordMaterializationException; +import org.apache.parquet.schema.MessageType; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.annotation.CheckReturnValue; +import javax.annotation.meta.When; + +import java.io.IOException; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; + +import static org.apache.parquet.Preconditions.checkNotNull; + +/** + * Customized {@link org.apache.parquet.hadoop.ParquetRecordReader} that support start read from particular position. 
+ */ +public class ParquetRecordReader { + private static final Logger LOG = LoggerFactory.getLogger(ParquetRecordReader.class); + + private ColumnIOFactory columnIOFactory; + private Filter filter; + + private MessageType readSchema; + private MessageType fileSchema; + private ReadSupport readSupport; + + private RecordMaterializer recordMaterializer; + private T currentValue; + private long total; + private long current = 0; + private int currentBlock = -1; + private ParquetFileReader reader; + private RecordReader recordReader; + private boolean strictTypeChecking = true; + private long totalCountLoadedSoFar = 0; + + public ParquetRecordReader(ReadSupport readSupport, MessageType readSchema, Filter filter) { + this.readSupport = readSupport; + this.readSchema = readSchema; + this.filter = checkNotNull(filter, "filter"); + } + + public ParquetRecordReader(ReadSupport readSupport, MessageType readSchema) { + this(readSupport, readSchema, FilterCompat.NOOP); + } + + public void initialize(ParquetFileReader reader, Configuration configuration) { + this.reader = reader; + FileMetaData parquetFileMetadata = reader.getFooter().getFileMetaData(); + this.fileSchema = parquetFileMetadata.getSchema(); + Map fileMetadata = parquetFileMetadata.getKeyValueMetaData(); + ReadSupport.ReadContext readContext = readSupport.init(new InitContext( + configuration, toSetMultiMap(fileMetadata), readSchema)); + + this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy()); + this.readSchema = readContext.getRequestedSchema(); + this.recordMaterializer = readSupport.prepareForRead( + configuration, fileMetadata, readSchema, readContext); + this.total = reader.getRecordCount(); + reader.setRequestedSchema(readSchema); + } + + private void checkRead() throws IOException { + if (current == totalCountLoadedSoFar) { + PageReadStore pages =