paul-rogers commented on code in PR #2937: URL: https://github.com/apache/drill/pull/2937#discussion_r1746149745
########## exec/java-exec/src/test/java/org/apache/drill/TestParquetPartiallyMissingColumns.java: ########## @@ -0,0 +1,128 @@ +package org.apache.drill; + +import org.apache.drill.common.types.TypeProtos; +import org.apache.drill.common.types.Types; +import org.apache.drill.exec.record.BatchSchema; +import org.apache.drill.exec.record.BatchSchemaBuilder; +import org.apache.drill.exec.record.metadata.SchemaBuilder; +import org.apache.drill.test.ClusterFixture; +import org.apache.drill.test.ClusterTest; +import org.junit.BeforeClass; +import org.junit.Test; + +import java.nio.file.Paths; + +/** + * Covers querying a table in which some parquet files do contain selected columns, and + * others do not (or have them as OPTIONALs). + * + * Expected behavior for the missing columns is following: + * 1) If at least 1 parquet file to be read has the column, take the minor type from there. + * Otherwise, default to INT. + * 2) If at least 1 parquet file to be read doesn't have the column, or has it as OPTIONAL, + * enforce the overall scan output schema to have it as OPTIONAL + * + * We need to control ordering of scanning batches to cover different erroneous cases, and we assume + * that parquet files in a table would be read in alphabetic order (not a real use case though). So + * we name our files 0.parquet and 1.parquet expecting that they would be scanned in that order + * (not guaranteed though, but seems to work). 
We use the following tables for these scenarios: + * + * - parquet/partially_missing/o_m -- optional, then missing + * - parquet/partially_missing/m_o -- missing, then optional + * - parquet/partially_missing/r_m -- required, then missing + * - parquet/partially_missing/r_o -- required, then optional + * + * These tables have these parquet files with such schemas: + * + * - parquet/partially_missing/o_m/0.parquet: id<INT(REQUIRED)> | name<VARCHAR(OPTIONAL)> | age<INT(OPTIONAL)> + * - parquet/partially_missing/o_m/1.parquet: id<INT(REQUIRED)> + * + * - parquet/partially_missing/m_o/0.parquet: id<INT(REQUIRED)> + * - parquet/partially_missing/m_o/1.parquet: id<INT(REQUIRED)> | name<VARCHAR(OPTIONAL)> | age<INT(OPTIONAL)> + * + * - parquet/partially_missing/r_m/0.parquet: id<INT(REQUIRED)> | name<VARCHAR(REQUIRED)> | age<INT(REQUIRED)> + * - parquet/partially_missing/r_m/1.parquet: id<INT(REQUIRED)> + * + * - parquet/partially_missing/r_o/0.parquet: id<INT(REQUIRED)> | name<VARCHAR(REQUIRED)> | age<INT(REQUIRED)> + * - parquet/partially_missing/r_o/1.parquet: id<INT(REQUIRED)> | name<VARCHAR(OPTIONAL)> | age<INT(OPTIONAL)> + * + * So, by querying "age" or "name" columns we would trigger both 0.parquet reader to read the data and + * 1.parquet reader to create the missing column vector. 
+ */ +public class TestParquetPartiallyMissingColumns extends ClusterTest { + + private static final SchemaBuilder ageSchema = + new SchemaBuilder().add("age", Types.optional(TypeProtos.MinorType.INT)); + private static final SchemaBuilder nameSchema = + new SchemaBuilder().add("name", Types.optional(TypeProtos.MinorType.VARCHAR)); + + @BeforeClass + public static void setup() throws Exception { + startCluster(ClusterFixture.builder(dirTestWatcher)); + dirTestWatcher.copyResourceToRoot(Paths.get("parquet", "partially_missing")); + } + + /* + Field name for the missing column MUST NOT be quoted with back-ticks, so we should have ONLY ONE + column for that field (unquoted) + */ Review Comment: Nit: The preferred style for comments is the way the class header comment is formatted: ```java /* * Field name for the missing column MUST NOT be quoted with back-ticks, so we should have ONLY ONE * column for that field (unquoted) */ ``` ########## exec/java-exec/src/test/java/org/apache/drill/TestParquetMissingColumns.java: ########## @@ -0,0 +1,91 @@ +package org.apache.drill; + +import org.apache.drill.common.types.TypeProtos; +import org.apache.drill.common.types.Types; +import org.apache.drill.exec.record.BatchSchema; +import org.apache.drill.exec.record.BatchSchemaBuilder; +import org.apache.drill.exec.record.metadata.SchemaBuilder; +import org.apache.drill.test.ClusterFixture; +import org.apache.drill.test.ClusterTest; +import org.junit.BeforeClass; +import org.junit.Test; + +/** + * Covers selecting completely missing columns from a parquet table. Should create Nullable Int + * ValueVector in that case since there is no chance to guess the correct data type here. + */ Review Comment: The "default type" for the other readers is part of the reader definition. We could create yet another system/session option for the default Parquet type. After this change, the default type will only apply if the user requests a column that appears in none of the files. 
Maybe the column appeared in older files, but not newer ones, or vice versa. Suppose the user has a tool that sends the SQL. That tool expects `comments` to be `VARCHAR`, but this was something added recently. When using that query against old files, the column will suddenly become `INT`, perhaps breaking code that expected the `VARCHAR` type. We cannot fix this case: only a full metastore that tracks all files could solve this case. Having an overall default column type can't fix per-column issues. So, I guess we can set this issue aside for now. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: dev-unsubscr...@drill.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org