[jira] [Commented] (DRILL-4184) Drill does not support Parquet DECIMAL values in variable length BINARY fields
[ https://issues.apache.org/jira/browse/DRILL-4184?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16465534#comment-16465534 ] ASF GitHub Bot commented on DRILL-4184: --- vvysotskyi closed pull request #372: DRILL-4184: support variable length decimal fields in parquet URL: https://github.com/apache/drill/pull/372 This is a PR merged from a forked repository. As GitHub hides the original diff on merge, it is displayed below for the sake of provenance: As this is a foreign pull request (from a fork), the diff is supplied below (as it won't show otherwise due to GitHub magic): diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/columnreaders/NullableVarLengthValuesColumn.java b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/columnreaders/NullableVarLengthValuesColumn.java index b18a81c606..bcfc812f0b 100644 --- a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/columnreaders/NullableVarLengthValuesColumn.java +++ b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/columnreaders/NullableVarLengthValuesColumn.java @@ -20,10 +20,14 @@ import io.netty.buffer.DrillBuf; import java.io.IOException; +import java.math.BigDecimal; +import java.nio.ByteBuffer; import org.apache.drill.common.exceptions.ExecutionSetupException; import org.apache.drill.exec.vector.ValueVector; - +import org.apache.drill.exec.vector.VariableWidthVector; +import org.apache.drill.exec.util.DecimalUtility; +import org.apache.drill.exec.vector.FixedWidthVector; import org.apache.parquet.column.ColumnDescriptor; import org.apache.parquet.format.SchemaElement; import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; @@ -69,11 +73,16 @@ protected boolean readAndStoreValueSizeInformation() throws IOException { if ( currDefLevel == -1 ) { currDefLevel = pageReader.definitionLevels.readInteger(); } -if ( columnDescriptor.getMaxDefinitionLevel() > currDefLevel) { + +if 
(columnDescriptor.getMaxDefinitionLevel() > currDefLevel) { nullsRead++; - // set length of zero, each index in the vector defaults to null so no need to set the nullability - variableWidthVector.getMutator().setValueLengthSafe( - valuesReadInCurrentPass + pageReader.valuesReadyToRead, 0); + // set length of zero, each index in the vector defaults to null so no + // need to set the nullability + if (variableWidthVector == null) { +addDecimalLength(null); // store null length in BYTES for null value + } else { + variableWidthVector.getMutator().setValueLengthSafe(valuesReadInCurrentPass + pageReader.valuesReadyToRead, 0); + } currentValNull = true; return false;// field is null, no length to add to data vector } @@ -83,18 +92,26 @@ protected boolean readAndStoreValueSizeInformation() throws IOException { currLengthDeterminingDictVal = pageReader.dictionaryLengthDeterminingReader.readBytes(); } currDictValToWrite = currLengthDeterminingDictVal; - // re-purposing this field here for length in BYTES to prevent repetitive multiplication/division + + // re-purposing this field here for length in BYTES to prevent + // repetitive multiplication/division dataTypeLengthInBits = currLengthDeterminingDictVal.length(); } else { // re-purposing this field here for length in BYTES to prevent repetitive multiplication/division dataTypeLengthInBits = pageReader.pageData.getInt((int) pageReader.readyToReadPosInBytes); } -// I think this also needs to happen if it is null for the random access -boolean success = setSafe(valuesReadInCurrentPass + pageReader.valuesReadyToRead, pageReader.pageData, -(int) pageReader.readyToReadPosInBytes + 4, dataTypeLengthInBits); -if ( ! 
success ) { - return true; + +if (variableWidthVector == null) { + addDecimalLength(dataTypeLengthInBits); // store decimal length variable length decimal field +} +else { + // I think this also needs to happen if it is null for the random access + boolean success = setSafe(valuesReadInCurrentPass + pageReader.valuesReadyToRead, pageReader.pageData, + (int) pageReader.readyToReadPosInBytes + 4, dataTypeLengthInBits); + if ( ! success ) { +return true; + } } return false; } @@ -122,19 +139,34 @@ public void updatePosition() { protected void readField(long recordsToRead) { // TODO - unlike most implementations of this method, the recordsReadInThisIteration field is not set here // should verify that this is not breaking anything -currentValNull = variableWidthVector.getAccessor().getObject(valuesReadInCurrentPass) == null; +if (variableWidthVector == null) { + currentValNull = getDecimalLength(valuesReadInCurrentPass) == null; +
[jira] [Commented] (DRILL-4184) Drill does not support Parquet DECIMAL values in variable length BINARY fields
[ https://issues.apache.org/jira/browse/DRILL-4184?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16465533#comment-16465533 ] ASF GitHub Bot commented on DRILL-4184: --- vvysotskyi commented on issue #372: DRILL-4184: support variable length decimal fields in parquet URL: https://github.com/apache/drill/pull/372#issuecomment-386978734 Closing this PR since it was fixed in the scope of DRILL-6094 This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > Drill does not support Parquet DECIMAL values in variable length BINARY fields > -- > > Key: DRILL-4184 > URL: https://issues.apache.org/jira/browse/DRILL-4184 > Project: Apache Drill > Issue Type: Bug > Components: Storage - Parquet >Affects Versions: 1.4.0 > Environment: Windows 7 Professional, Java 1.8.0_66 >Reporter: Dave Oshinsky >Priority: Major > > Encoding a DECIMAL logical type in Parquet using the variable length BINARY > primitive type is not supported by Drill as of versions 1.3.0 and 1.4.0. The > problem first surfaces with the ClassCastException shown below, but fixing > the immediate cause of the exception is not sufficient to support this > combination (DECIMAL, BINARY) in a Parquet file. > In Drill, DECIMAL is currently assumed to be INT32, INT64, INT96, or > FIXED_LEN_BINARY_ARRAY. Are there any plans to support DECIMAL with variable > length BINARY? Avro definitely supports encoding DECIMAL in variable length > bytes (see https://avro.apache.org/docs/current/spec.html#Decimal), but this > support in Parquet is less clear. > Selecting on a BINARY DECIMAL field in a parquet file throws an exception as > shown below (java.lang.ClassCastException: > org.apache.drill.exec.vector.Decimal28SparseVector cannot be cast to > org.apache.drill.exec.vector.VariableWidthVector). 
The successful query at > bottom selected on a string field in the same file. > 0: jdbc:drill:zk=local> select count(*) from > dfs.`c:/dao/DBArchivePredictor/tenrows.parquet` where acct_no=7020; > org.apache.drill.common.exceptions.DrillRuntimeException: Error in parquet > recor > d reader. > Message: Failure in setting up reader > Parquet Metadata: ParquetMetaData{FileMetaData{schema: message sbi.acct_mstr { > required binary ACCT_NO (DECIMAL(20,0)); > optional binary SF_NO (UTF8); > optional binary LF_NO (UTF8); > optional binary BRANCH_NO (DECIMAL(20,0)); > optional binary INTRO_CUST_NO (DECIMAL(20,0)); > optional binary INTRO_ACCT_NO (DECIMAL(20,0)); > optional binary INTRO_SIGN (UTF8); > optional binary TYPE (UTF8); > optional binary OPR_MODE (UTF8); > optional binary CUR_ACCT_TYPE (UTF8); > optional binary TITLE (UTF8); > optional binary CORP_CUST_NO (DECIMAL(20,0)); > optional binary APLNDT (UTF8); > optional binary OPNDT (UTF8); > optional binary VERI_EMP_NO (DECIMAL(20,0)); > optional binary VERI_SIGN (UTF8); > optional binary MANAGER_SIGN (UTF8); > optional binary CURBAL (DECIMAL(8,2)); > optional binary STATUS (UTF8); > } > , metadata: > {parquet.avro.schema={"type":"record","name":"acct_mstr","namespace" > :"sbi","fields":[{"name":"ACCT_NO","type":{"type":"bytes","logicalType":"decimal > ","precision":20,"scale":0,"cv_auto_incr":false,"cv_case_sensitive":false,"cv_co > lumn_class":"java.math.BigDecimal","cv_connection":"oracle.jdbc.driver.T4CConnec > tion","cv_currency":true,"cv_def_writable":false,"cv_nullable":0,"cv_precision": > 20,"cv_read_only":false,"cv_scale":0,"cv_searchable":true,"cv_signed":true,"cv_s > ubscript":1,"cv_type":2,"cv_typename":"NUMBER","cv_writable":true}},{"name":"SF_ > NO","type":["null",{"type":"string","cv_auto_incr":false,"cv_case_sensitive":tru > e,"cv_column_class":"java.lang.String","cv_currency":false,"cv_def_writable":fal > se,"cv_nullable":1,"cv_precision":10,"cv_read_only":false,"cv_scale":0,"cv_searc > 
hable":true,"cv_signed":true,"cv_subscript":2,"cv_type":12,"cv_typename":"VARCHA > R2","cv_writable":true}]},{"name":"LF_NO","type":["null",{"type":"string","cv_au > to_incr":false,"cv_case_sensitive":true,"cv_column_class":"java.lang.String","cv > _currency":false,"cv_def_writable":false,"cv_nullable":1,"cv_precision":10,"cv_r > ead_only":false,"cv_scale":0,"cv_searchable":true,"cv_signed":true,"cv_subscript > ":3,"cv_type":12,"cv_typename":"VARCHAR2","cv_writable":true}]},{"name":"BRANCH_ > NO","type":["null",{"type":"bytes","logicalType":"decimal","precision":20,"scale > ":0,"cv_auto_incr":false,"cv_case_sensitive":false,"cv_column_class":"java.math.
[jira] [Commented] (DRILL-4184) Drill does not support Parquet DECIMAL values in variable length BINARY fields
[ https://issues.apache.org/jira/browse/DRILL-4184?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16465532#comment-16465532 ] ASF GitHub Bot commented on DRILL-4184: --- vvysotskyi commented on issue #372: DRILL-4184: support variable length decimal fields in parquet URL: https://github.com/apache/drill/pull/372#issuecomment-386975850 @daveoshinsky, could you please close this PR, since it was fixed in the scope of DRILL-6094 This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > Drill does not support Parquet DECIMAL values in variable length BINARY fields > -- > > Key: DRILL-4184 > URL: https://issues.apache.org/jira/browse/DRILL-4184 > Project: Apache Drill > Issue Type: Bug > Components: Storage - Parquet >Affects Versions: 1.4.0 > Environment: Windows 7 Professional, Java 1.8.0_66 >Reporter: Dave Oshinsky >Priority: Major > > Encoding a DECIMAL logical type in Parquet using the variable length BINARY > primitive type is not supported by Drill as of versions 1.3.0 and 1.4.0. The > problem first surfaces with the ClassCastException shown below, but fixing > the immediate cause of the exception is not sufficient to support this > combination (DECIMAL, BINARY) in a Parquet file. > In Drill, DECIMAL is currently assumed to be INT32, INT64, INT96, or > FIXED_LEN_BINARY_ARRAY. Are there any plans to support DECIMAL with variable > length BINARY? Avro definitely supports encoding DECIMAL in variable length > bytes (see https://avro.apache.org/docs/current/spec.html#Decimal), but this > support in Parquet is less clear. 
> Selecting on a BINARY DECIMAL field in a parquet file throws an exception as > shown below (java.lang.ClassCastException: > org.apache.drill.exec.vector.Decimal28SparseVector cannot be cast to > org.apache.drill.exec.vector.VariableWidthVector). The successful query at > bottom selected on a string field in the same file. > 0: jdbc:drill:zk=local> select count(*) from > dfs.`c:/dao/DBArchivePredictor/tenrows.parquet` where acct_no=7020; > org.apache.drill.common.exceptions.DrillRuntimeException: Error in parquet > recor > d reader. > Message: Failure in setting up reader > Parquet Metadata: ParquetMetaData{FileMetaData{schema: message sbi.acct_mstr { > required binary ACCT_NO (DECIMAL(20,0)); > optional binary SF_NO (UTF8); > optional binary LF_NO (UTF8); > optional binary BRANCH_NO (DECIMAL(20,0)); > optional binary INTRO_CUST_NO (DECIMAL(20,0)); > optional binary INTRO_ACCT_NO (DECIMAL(20,0)); > optional binary INTRO_SIGN (UTF8); > optional binary TYPE (UTF8); > optional binary OPR_MODE (UTF8); > optional binary CUR_ACCT_TYPE (UTF8); > optional binary TITLE (UTF8); > optional binary CORP_CUST_NO (DECIMAL(20,0)); > optional binary APLNDT (UTF8); > optional binary OPNDT (UTF8); > optional binary VERI_EMP_NO (DECIMAL(20,0)); > optional binary VERI_SIGN (UTF8); > optional binary MANAGER_SIGN (UTF8); > optional binary CURBAL (DECIMAL(8,2)); > optional binary STATUS (UTF8); > } > , metadata: > {parquet.avro.schema={"type":"record","name":"acct_mstr","namespace" > :"sbi","fields":[{"name":"ACCT_NO","type":{"type":"bytes","logicalType":"decimal > ","precision":20,"scale":0,"cv_auto_incr":false,"cv_case_sensitive":false,"cv_co > lumn_class":"java.math.BigDecimal","cv_connection":"oracle.jdbc.driver.T4CConnec > tion","cv_currency":true,"cv_def_writable":false,"cv_nullable":0,"cv_precision": > 20,"cv_read_only":false,"cv_scale":0,"cv_searchable":true,"cv_signed":true,"cv_s > ubscript":1,"cv_type":2,"cv_typename":"NUMBER","cv_writable":true}},{"name":"SF_ > 
NO","type":["null",{"type":"string","cv_auto_incr":false,"cv_case_sensitive":tru > e,"cv_column_class":"java.lang.String","cv_currency":false,"cv_def_writable":fal > se,"cv_nullable":1,"cv_precision":10,"cv_read_only":false,"cv_scale":0,"cv_searc > hable":true,"cv_signed":true,"cv_subscript":2,"cv_type":12,"cv_typename":"VARCHA > R2","cv_writable":true}]},{"name":"LF_NO","type":["null",{"type":"string","cv_au > to_incr":false,"cv_case_sensitive":true,"cv_column_class":"java.lang.String","cv > _currency":false,"cv_def_writable":false,"cv_nullable":1,"cv_precision":10,"cv_r > ead_only":false,"cv_scale":0,"cv_searchable":true,"cv_signed":true,"cv_subscript > ":3,"cv_type":12,"cv_typename":"VARCHAR2","cv_writable":true}]},{"name":"BRANCH_ > NO","type":["null",{"type":"bytes","logicalType":"decimal","precision":20,"scale > ":0,"cv_auto_incr":false,"cv_case_sensitive":fal
[jira] [Commented] (DRILL-4184) Drill does not support Parquet DECIMAL values in variable length BINARY fields
[ https://issues.apache.org/jira/browse/DRILL-4184?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16465517#comment-16465517 ] ASF GitHub Bot commented on DRILL-4184: --- vvysotskyi commented on issue #372: DRILL-4184: support variable length decimal fields in parquet URL: https://github.com/apache/drill/pull/372#issuecomment-386975850 @daveoshinsky, could you please close this PR, since it was fixed in the scope of DRILL-6094 This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > Drill does not support Parquet DECIMAL values in variable length BINARY fields > -- > > Key: DRILL-4184 > URL: https://issues.apache.org/jira/browse/DRILL-4184 > Project: Apache Drill > Issue Type: Bug > Components: Storage - Parquet >Affects Versions: 1.4.0 > Environment: Windows 7 Professional, Java 1.8.0_66 >Reporter: Dave Oshinsky >Priority: Major > > Encoding a DECIMAL logical type in Parquet using the variable length BINARY > primitive type is not supported by Drill as of versions 1.3.0 and 1.4.0. The > problem first surfaces with the ClassCastException shown below, but fixing > the immediate cause of the exception is not sufficient to support this > combination (DECIMAL, BINARY) in a Parquet file. > In Drill, DECIMAL is currently assumed to be INT32, INT64, INT96, or > FIXED_LEN_BINARY_ARRAY. Are there any plans to support DECIMAL with variable > length BINARY? Avro definitely supports encoding DECIMAL in variable length > bytes (see https://avro.apache.org/docs/current/spec.html#Decimal), but this > support in Parquet is less clear. 
> Selecting on a BINARY DECIMAL field in a parquet file throws an exception as > shown below (java.lang.ClassCastException: > org.apache.drill.exec.vector.Decimal28SparseVector cannot be cast to > org.apache.drill.exec.vector.VariableWidthVector). The successful query at > bottom selected on a string field in the same file. > 0: jdbc:drill:zk=local> select count(*) from > dfs.`c:/dao/DBArchivePredictor/tenrows.parquet` where acct_no=7020; > org.apache.drill.common.exceptions.DrillRuntimeException: Error in parquet > recor > d reader. > Message: Failure in setting up reader > Parquet Metadata: ParquetMetaData{FileMetaData{schema: message sbi.acct_mstr { > required binary ACCT_NO (DECIMAL(20,0)); > optional binary SF_NO (UTF8); > optional binary LF_NO (UTF8); > optional binary BRANCH_NO (DECIMAL(20,0)); > optional binary INTRO_CUST_NO (DECIMAL(20,0)); > optional binary INTRO_ACCT_NO (DECIMAL(20,0)); > optional binary INTRO_SIGN (UTF8); > optional binary TYPE (UTF8); > optional binary OPR_MODE (UTF8); > optional binary CUR_ACCT_TYPE (UTF8); > optional binary TITLE (UTF8); > optional binary CORP_CUST_NO (DECIMAL(20,0)); > optional binary APLNDT (UTF8); > optional binary OPNDT (UTF8); > optional binary VERI_EMP_NO (DECIMAL(20,0)); > optional binary VERI_SIGN (UTF8); > optional binary MANAGER_SIGN (UTF8); > optional binary CURBAL (DECIMAL(8,2)); > optional binary STATUS (UTF8); > } > , metadata: > {parquet.avro.schema={"type":"record","name":"acct_mstr","namespace" > :"sbi","fields":[{"name":"ACCT_NO","type":{"type":"bytes","logicalType":"decimal > ","precision":20,"scale":0,"cv_auto_incr":false,"cv_case_sensitive":false,"cv_co > lumn_class":"java.math.BigDecimal","cv_connection":"oracle.jdbc.driver.T4CConnec > tion","cv_currency":true,"cv_def_writable":false,"cv_nullable":0,"cv_precision": > 20,"cv_read_only":false,"cv_scale":0,"cv_searchable":true,"cv_signed":true,"cv_s > ubscript":1,"cv_type":2,"cv_typename":"NUMBER","cv_writable":true}},{"name":"SF_ > 
NO","type":["null",{"type":"string","cv_auto_incr":false,"cv_case_sensitive":tru > e,"cv_column_class":"java.lang.String","cv_currency":false,"cv_def_writable":fal > se,"cv_nullable":1,"cv_precision":10,"cv_read_only":false,"cv_scale":0,"cv_searc > hable":true,"cv_signed":true,"cv_subscript":2,"cv_type":12,"cv_typename":"VARCHA > R2","cv_writable":true}]},{"name":"LF_NO","type":["null",{"type":"string","cv_au > to_incr":false,"cv_case_sensitive":true,"cv_column_class":"java.lang.String","cv > _currency":false,"cv_def_writable":false,"cv_nullable":1,"cv_precision":10,"cv_r > ead_only":false,"cv_scale":0,"cv_searchable":true,"cv_signed":true,"cv_subscript > ":3,"cv_type":12,"cv_typename":"VARCHAR2","cv_writable":true}]},{"name":"BRANCH_ > NO","type":["null",{"type":"bytes","logicalType":"decimal","precision":20,"scale > ":0,"cv_auto_incr":false,"cv_case_sensitive":fal
[jira] [Commented] (DRILL-4184) Drill does not support Parquet DECIMAL values in variable length BINARY fields
[ https://issues.apache.org/jira/browse/DRILL-4184?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15185648#comment-15185648 ] ASF GitHub Bot commented on DRILL-4184: --- Github user daveoshinsky commented on a diff in the pull request: https://github.com/apache/drill/pull/372#discussion_r55417098 --- Diff: exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/columnreaders/NullableVarLengthValuesColumn.java --- @@ -69,11 +73,16 @@ protected boolean readAndStoreValueSizeInformation() throws IOException { if ( currDefLevel == -1 ) { currDefLevel = pageReader.definitionLevels.readInteger(); } -if ( columnDescriptor.getMaxDefinitionLevel() > currDefLevel) { + +if (columnDescriptor.getMaxDefinitionLevel() > currDefLevel) { nullsRead++; - // set length of zero, each index in the vector defaults to null so no need to set the nullability - variableWidthVector.getMutator().setValueLengthSafe( - valuesReadInCurrentPass + pageReader.valuesReadyToRead, 0); + // set length of zero, each index in the vector defaults to null so no + // need to set the nullability + if (variableWidthVector == null) { --- End diff -- Regarding the two variables variableWidthVector and fixedWidthVector that I added, here is my reasoning. Either variableWidthVector is set if we have a VariableWidthVector, or fixedWidthVector is set if we have a FixedWidthVector (i.e., decimal). Hence, variableWidthVector is non-null if and only if we are to invoke the pre-existing logic, that assumed a variable width vector. When variableWidthVector is null (fixedWidthVector is non-null, but not currently used), we invoke the new logic to save the length information in decimalLengths. If this is no good, please tell me why, and suggest an alternative. 
> Drill does not support Parquet DECIMAL values in variable length BINARY fields > -- > > Key: DRILL-4184 > URL: https://issues.apache.org/jira/browse/DRILL-4184 > Project: Apache Drill > Issue Type: Bug > Components: Storage - Parquet >Affects Versions: 1.4.0 > Environment: Windows 7 Professional, Java 1.8.0_66 >Reporter: Dave Oshinsky > > Encoding a DECIMAL logical type in Parquet using the variable length BINARY > primitive type is not supported by Drill as of versions 1.3.0 and 1.4.0. The > problem first surfaces with the ClassCastException shown below, but fixing > the immediate cause of the exception is not sufficient to support this > combination (DECIMAL, BINARY) in a Parquet file. > In Drill, DECIMAL is currently assumed to be INT32, INT64, INT96, or > FIXED_LEN_BINARY_ARRAY. Are there any plans to support DECIMAL with variable > length BINARY? Avro definitely supports encoding DECIMAL in variable length > bytes (see https://avro.apache.org/docs/current/spec.html#Decimal), but this > support in Parquet is less clear. > Selecting on a BINARY DECIMAL field in a parquet file throws an exception as > shown below (java.lang.ClassCastException: > org.apache.drill.exec.vector.Decimal28SparseVector cannot be cast to > org.apache.drill.exec.vector.VariableWidthVector). The successful query at > bottom selected on a string field in the same file. > 0: jdbc:drill:zk=local> select count(*) from > dfs.`c:/dao/DBArchivePredictor/tenrows.parquet` where acct_no=7020; > org.apache.drill.common.exceptions.DrillRuntimeException: Error in parquet > recor > d reader. 
> Message: Failure in setting up reader > Parquet Metadata: ParquetMetaData{FileMetaData{schema: message sbi.acct_mstr { > required binary ACCT_NO (DECIMAL(20,0)); > optional binary SF_NO (UTF8); > optional binary LF_NO (UTF8); > optional binary BRANCH_NO (DECIMAL(20,0)); > optional binary INTRO_CUST_NO (DECIMAL(20,0)); > optional binary INTRO_ACCT_NO (DECIMAL(20,0)); > optional binary INTRO_SIGN (UTF8); > optional binary TYPE (UTF8); > optional binary OPR_MODE (UTF8); > optional binary CUR_ACCT_TYPE (UTF8); > optional binary TITLE (UTF8); > optional binary CORP_CUST_NO (DECIMAL(20,0)); > optional binary APLNDT (UTF8); > optional binary OPNDT (UTF8); > optional binary VERI_EMP_NO (DECIMAL(20,0)); > optional binary VERI_SIGN (UTF8); > optional binary MANAGER_SIGN (UTF8); > optional binary CURBAL (DECIMAL(8,2)); > optional binary STATUS (UTF8); > } > , metadata: > {parquet.avro.schema={"type":"record","name":"acct_mstr","namespace" > :"sbi","fields":[{"name":"ACCT_NO","type":{"type":"bytes","logicalType":"decimal > ","precision":20,"scale":0,"cv_auto_incr":false,"cv_case_sensitive":false,"cv_co > lumn_class":"java.math.BigDecimal","cv_connection":"oracle.jdbc.driver.T4CConnec > tion","
[jira] [Commented] (DRILL-4184) Drill does not support Parquet DECIMAL values in variable length BINARY fields
[ https://issues.apache.org/jira/browse/DRILL-4184?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15183775#comment-15183775 ] ASF GitHub Bot commented on DRILL-4184: --- Github user daveoshinsky commented on a diff in the pull request: https://github.com/apache/drill/pull/372#discussion_r55277730 --- Diff: exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/columnreaders/VarLengthValuesColumn.java --- @@ -81,6 +112,14 @@ public boolean skipReadyToReadPositionUpdate() { return false; } + // decimalLengths list is part of a near-term fix for DRILL-4184. + // Decimal[23]8SparseVector classes are fixed width vectors, without ability to "remember" offsets of + // (variable width) field sizes. so, we "remember" the array sizes in decimalLengths (also used to + // "remember" whether a value was null, for nullable decimal columns). + // TODO: storage of decimal values should support variable length values in a much cleaner way than this, + // perhaps with a new variable width Decimal vector class. + protected ArrayList decimalLengths = new ArrayList(); --- End diff -- I have moved decimalLengths class member to top of the class. I found that the list is NOT always accessed at the end, hence left it as a list. > Drill does not support Parquet DECIMAL values in variable length BINARY fields > -- > > Key: DRILL-4184 > URL: https://issues.apache.org/jira/browse/DRILL-4184 > Project: Apache Drill > Issue Type: Bug > Components: Storage - Parquet >Affects Versions: 1.4.0 > Environment: Windows 7 Professional, Java 1.8.0_66 >Reporter: Dave Oshinsky > > Encoding a DECIMAL logical type in Parquet using the variable length BINARY > primitive type is not supported by Drill as of versions 1.3.0 and 1.4.0. The > problem first surfaces with the ClassCastException shown below, but fixing > the immediate cause of the exception is not sufficient to support this > combination (DECIMAL, BINARY) in a Parquet file. 
> In Drill, DECIMAL is currently assumed to be INT32, INT64, INT96, or > FIXED_LEN_BINARY_ARRAY. Are there any plans to support DECIMAL with variable > length BINARY? Avro definitely supports encoding DECIMAL in variable length > bytes (see https://avro.apache.org/docs/current/spec.html#Decimal), but this > support in Parquet is less clear. > Selecting on a BINARY DECIMAL field in a parquet file throws an exception as > shown below (java.lang.ClassCastException: > org.apache.drill.exec.vector.Decimal28SparseVector cannot be cast to > org.apache.drill.exec.vector.VariableWidthVector). The successful query at > bottom selected on a string field in the same file. > 0: jdbc:drill:zk=local> select count(*) from > dfs.`c:/dao/DBArchivePredictor/tenrows.parquet` where acct_no=7020; > org.apache.drill.common.exceptions.DrillRuntimeException: Error in parquet > recor > d reader. > Message: Failure in setting up reader > Parquet Metadata: ParquetMetaData{FileMetaData{schema: message sbi.acct_mstr { > required binary ACCT_NO (DECIMAL(20,0)); > optional binary SF_NO (UTF8); > optional binary LF_NO (UTF8); > optional binary BRANCH_NO (DECIMAL(20,0)); > optional binary INTRO_CUST_NO (DECIMAL(20,0)); > optional binary INTRO_ACCT_NO (DECIMAL(20,0)); > optional binary INTRO_SIGN (UTF8); > optional binary TYPE (UTF8); > optional binary OPR_MODE (UTF8); > optional binary CUR_ACCT_TYPE (UTF8); > optional binary TITLE (UTF8); > optional binary CORP_CUST_NO (DECIMAL(20,0)); > optional binary APLNDT (UTF8); > optional binary OPNDT (UTF8); > optional binary VERI_EMP_NO (DECIMAL(20,0)); > optional binary VERI_SIGN (UTF8); > optional binary MANAGER_SIGN (UTF8); > optional binary CURBAL (DECIMAL(8,2)); > optional binary STATUS (UTF8); > } > , metadata: > {parquet.avro.schema={"type":"record","name":"acct_mstr","namespace" > :"sbi","fields":[{"name":"ACCT_NO","type":{"type":"bytes","logicalType":"decimal > ","precision":20,"scale":0,"cv_auto_incr":false,"cv_case_sensitive":false,"cv_co > 
lumn_class":"java.math.BigDecimal","cv_connection":"oracle.jdbc.driver.T4CConnec > tion","cv_currency":true,"cv_def_writable":false,"cv_nullable":0,"cv_precision": > 20,"cv_read_only":false,"cv_scale":0,"cv_searchable":true,"cv_signed":true,"cv_s > ubscript":1,"cv_type":2,"cv_typename":"NUMBER","cv_writable":true}},{"name":"SF_ > NO","type":["null",{"type":"string","cv_auto_incr":false,"cv_case_sensitive":tru > e,"cv_column_class":"java.lang.String","cv_currency":false,"cv_def_writable":fal > se,"cv_nullable":1,"cv_precision":10,"cv_read_only":false,"cv_scale":0,"cv_searc > hable":true,"cv_signed":true,"cv_subscript":2,"cv_type":12,"cv_typename":"VARCHA > R2
[jira] [Commented] (DRILL-4184) Drill does not support Parquet DECIMAL values in variable length BINARY fields
[ https://issues.apache.org/jira/browse/DRILL-4184?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15182339#comment-15182339 ] ASF GitHub Bot commented on DRILL-4184: --- Github user daveoshinsky commented on the pull request: https://github.com/apache/drill/pull/372#issuecomment-19251 Regarding the overall intent of the fix, as the "TODO" comment on decimalLengths implies, it's intended only as a short-term fix. More long-term, I would suggest that decimal values should be stored in a VariableWidthVector (which was assumed by VarLengthValuesColumn, hence the class cast exception). This would use memory more efficiently when most values are far smaller than full precision, as is often the case (think java.math.BigDecimal, which operates this way). Moreover, there would be no need to have a whole bunch of separate (generated) classes for different decimal precisions. Just one class, variable width, handling any precision. I also suggest that some other "special cases" could be combined. Fixed width is a special case of variable width, where there's no need to store a separate length for each value. Non-nullable is a special case of nullable, where there's no need to store a nullable boolean (or equivalent) for each value. One last bit of feedback - it would be much easier to maintain the code if it did not involve generation of code (freemarker). An old-fashioned class hierarchy, with no generated code, would probably work just fine for the vectoring mechanisms. 
> Drill does not support Parquet DECIMAL values in variable length BINARY fields > -- > > Key: DRILL-4184 > URL: https://issues.apache.org/jira/browse/DRILL-4184 > Project: Apache Drill > Issue Type: Bug > Components: Storage - Parquet >Affects Versions: 1.4.0 > Environment: Windows 7 Professional, Java 1.8.0_66 >Reporter: Dave Oshinsky > > Encoding a DECIMAL logical type in Parquet using the variable length BINARY > primitive type is not supported by Drill as of versions 1.3.0 and 1.4.0. The > problem first surfaces with the ClassCastException shown below, but fixing > the immediate cause of the exception is not sufficient to support this > combination (DECIMAL, BINARY) in a Parquet file. > In Drill, DECIMAL is currently assumed to be INT32, INT64, INT96, or > FIXED_LEN_BINARY_ARRAY. Are there any plans to support DECIMAL with variable > length BINARY? Avro definitely supports encoding DECIMAL in variable length > bytes (see https://avro.apache.org/docs/current/spec.html#Decimal), but this > support in Parquet is less clear. > Selecting on a BINARY DECIMAL field in a parquet file throws an exception as > shown below (java.lang.ClassCastException: > org.apache.drill.exec.vector.Decimal28SparseVector cannot be cast to > org.apache.drill.exec.vector.VariableWidthVector). The successful query at > bottom selected on a string field in the same file. > 0: jdbc:drill:zk=local> select count(*) from > dfs.`c:/dao/DBArchivePredictor/tenrows.parquet` where acct_no=7020; > org.apache.drill.common.exceptions.DrillRuntimeException: Error in parquet > recor > d reader. 
> Message: Failure in setting up reader > Parquet Metadata: ParquetMetaData{FileMetaData{schema: message sbi.acct_mstr { > required binary ACCT_NO (DECIMAL(20,0)); > optional binary SF_NO (UTF8); > optional binary LF_NO (UTF8); > optional binary BRANCH_NO (DECIMAL(20,0)); > optional binary INTRO_CUST_NO (DECIMAL(20,0)); > optional binary INTRO_ACCT_NO (DECIMAL(20,0)); > optional binary INTRO_SIGN (UTF8); > optional binary TYPE (UTF8); > optional binary OPR_MODE (UTF8); > optional binary CUR_ACCT_TYPE (UTF8); > optional binary TITLE (UTF8); > optional binary CORP_CUST_NO (DECIMAL(20,0)); > optional binary APLNDT (UTF8); > optional binary OPNDT (UTF8); > optional binary VERI_EMP_NO (DECIMAL(20,0)); > optional binary VERI_SIGN (UTF8); > optional binary MANAGER_SIGN (UTF8); > optional binary CURBAL (DECIMAL(8,2)); > optional binary STATUS (UTF8); > } > , metadata: > {parquet.avro.schema={"type":"record","name":"acct_mstr","namespace" > :"sbi","fields":[{"name":"ACCT_NO","type":{"type":"bytes","logicalType":"decimal > ","precision":20,"scale":0,"cv_auto_incr":false,"cv_case_sensitive":false,"cv_co > lumn_class":"java.math.BigDecimal","cv_connection":"oracle.jdbc.driver.T4CConnec > tion","cv_currency":true,"cv_def_writable":false,"cv_nullable":0,"cv_precision": > 20,"cv_read_only":false,"cv_scale":0,"cv_searchable":true,"cv_signed":true,"cv_s > ubscript":1,"cv_type":2,"cv_typename":"NUMBER","cv_writable":true}},{"name":"SF_ > NO","type":["null",{"type":"string","cv_auto_incr":false,"cv_case_sensitive":tru > e,"cv_column_class":"java.lang.String","cv_currency":f
[jira] [Commented] (DRILL-4184) Drill does not support Parquet DECIMAL values in variable length BINARY fields
[ https://issues.apache.org/jira/browse/DRILL-4184?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15182335#comment-15182335 ] ASF GitHub Bot commented on DRILL-4184: --- Github user daveoshinsky commented on a diff in the pull request: https://github.com/apache/drill/pull/372#discussion_r55148335 --- Diff: exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/columnreaders/VarLengthValuesColumn.java --- @@ -81,6 +112,14 @@ public boolean skipReadyToReadPositionUpdate() { return false; } + // decimalLengths list is part of a near-term fix for DRILL-4184. + // Decimal[23]8SparseVector classes are fixed width vectors, without ability to "remember" offsets of + // (variable width) field sizes. so, we "remember" the array sizes in decimalLengths (also used to + // "remember" whether a value was null, for nullable decimal columns). + // TODO: storage of decimal values should support variable length values in a much cleaner way than this, + // perhaps with a new variable width Decimal vector class. + protected ArrayList decimalLengths = new ArrayList(); --- End diff -- Once I'm sure whether a list, or a single value, is needed, I will move the value. > Drill does not support Parquet DECIMAL values in variable length BINARY fields > -- > > Key: DRILL-4184 > URL: https://issues.apache.org/jira/browse/DRILL-4184 > Project: Apache Drill > Issue Type: Bug > Components: Storage - Parquet >Affects Versions: 1.4.0 > Environment: Windows 7 Professional, Java 1.8.0_66 >Reporter: Dave Oshinsky > > Encoding a DECIMAL logical type in Parquet using the variable length BINARY > primitive type is not supported by Drill as of versions 1.3.0 and 1.4.0. The > problem first surfaces with the ClassCastException shown below, but fixing > the immediate cause of the exception is not sufficient to support this > combination (DECIMAL, BINARY) in a Parquet file. 
> In Drill, DECIMAL is currently assumed to be INT32, INT64, INT96, or > FIXED_LEN_BINARY_ARRAY. Are there any plans to support DECIMAL with variable > length BINARY? Avro definitely supports encoding DECIMAL in variable length > bytes (see https://avro.apache.org/docs/current/spec.html#Decimal), but this > support in Parquet is less clear. > Selecting on a BINARY DECIMAL field in a parquet file throws an exception as > shown below (java.lang.ClassCastException: > org.apache.drill.exec.vector.Decimal28SparseVector cannot be cast to > org.apache.drill.exec.vector.VariableWidthVector). The successful query at > bottom selected on a string field in the same file. > 0: jdbc:drill:zk=local> select count(*) from > dfs.`c:/dao/DBArchivePredictor/tenrows.parquet` where acct_no=7020; > org.apache.drill.common.exceptions.DrillRuntimeException: Error in parquet > recor > d reader. > Message: Failure in setting up reader > Parquet Metadata: ParquetMetaData{FileMetaData{schema: message sbi.acct_mstr { > required binary ACCT_NO (DECIMAL(20,0)); > optional binary SF_NO (UTF8); > optional binary LF_NO (UTF8); > optional binary BRANCH_NO (DECIMAL(20,0)); > optional binary INTRO_CUST_NO (DECIMAL(20,0)); > optional binary INTRO_ACCT_NO (DECIMAL(20,0)); > optional binary INTRO_SIGN (UTF8); > optional binary TYPE (UTF8); > optional binary OPR_MODE (UTF8); > optional binary CUR_ACCT_TYPE (UTF8); > optional binary TITLE (UTF8); > optional binary CORP_CUST_NO (DECIMAL(20,0)); > optional binary APLNDT (UTF8); > optional binary OPNDT (UTF8); > optional binary VERI_EMP_NO (DECIMAL(20,0)); > optional binary VERI_SIGN (UTF8); > optional binary MANAGER_SIGN (UTF8); > optional binary CURBAL (DECIMAL(8,2)); > optional binary STATUS (UTF8); > } > , metadata: > {parquet.avro.schema={"type":"record","name":"acct_mstr","namespace" > :"sbi","fields":[{"name":"ACCT_NO","type":{"type":"bytes","logicalType":"decimal > ","precision":20,"scale":0,"cv_auto_incr":false,"cv_case_sensitive":false,"cv_co > 
lumn_class":"java.math.BigDecimal","cv_connection":"oracle.jdbc.driver.T4CConnec > tion","cv_currency":true,"cv_def_writable":false,"cv_nullable":0,"cv_precision": > 20,"cv_read_only":false,"cv_scale":0,"cv_searchable":true,"cv_signed":true,"cv_s > ubscript":1,"cv_type":2,"cv_typename":"NUMBER","cv_writable":true}},{"name":"SF_ > NO","type":["null",{"type":"string","cv_auto_incr":false,"cv_case_sensitive":tru > e,"cv_column_class":"java.lang.String","cv_currency":false,"cv_def_writable":fal > se,"cv_nullable":1,"cv_precision":10,"cv_read_only":false,"cv_scale":0,"cv_searc > hable":true,"cv_signed":true,"cv_subscript":2,"cv_type":12,"cv_typename":"VARCHA > R2","cv_writable":true}]},{"name":"LF_NO","type":["null",{"type"
[jira] [Commented] (DRILL-4184) Drill does not support Parquet DECIMAL values in variable length BINARY fields
[ https://issues.apache.org/jira/browse/DRILL-4184?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15182333#comment-15182333 ] ASF GitHub Bot commented on DRILL-4184: --- Github user daveoshinsky commented on a diff in the pull request: https://github.com/apache/drill/pull/372#discussion_r55148322 --- Diff: exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/columnreaders/NullableVarLengthValuesColumn.java --- @@ -69,11 +73,16 @@ protected boolean readAndStoreValueSizeInformation() throws IOException { if ( currDefLevel == -1 ) { currDefLevel = pageReader.definitionLevels.readInteger(); } -if ( columnDescriptor.getMaxDefinitionLevel() > currDefLevel) { + +if (columnDescriptor.getMaxDefinitionLevel() > currDefLevel) { nullsRead++; - // set length of zero, each index in the vector defaults to null so no need to set the nullability - variableWidthVector.getMutator().setValueLengthSafe( - valuesReadInCurrentPass + pageReader.valuesReadyToRead, 0); + // set length of zero, each index in the vector defaults to null so no + // need to set the nullability + if (variableWidthVector == null) { --- End diff -- It took me a while to get some idea how this code is supposed to work. It was not clear to me that only the last item in the list was accessed. If you are certain that is the case, we don't need a list, but instead just a single value. > Drill does not support Parquet DECIMAL values in variable length BINARY fields > -- > > Key: DRILL-4184 > URL: https://issues.apache.org/jira/browse/DRILL-4184 > Project: Apache Drill > Issue Type: Bug > Components: Storage - Parquet >Affects Versions: 1.4.0 > Environment: Windows 7 Professional, Java 1.8.0_66 >Reporter: Dave Oshinsky > > Encoding a DECIMAL logical type in Parquet using the variable length BINARY > primitive type is not supported by Drill as of versions 1.3.0 and 1.4.0. 
The > problem first surfaces with the ClassCastException shown below, but fixing > the immediate cause of the exception is not sufficient to support this > combination (DECIMAL, BINARY) in a Parquet file. > In Drill, DECIMAL is currently assumed to be INT32, INT64, INT96, or > FIXED_LEN_BINARY_ARRAY. Are there any plans to support DECIMAL with variable > length BINARY? Avro definitely supports encoding DECIMAL in variable length > bytes (see https://avro.apache.org/docs/current/spec.html#Decimal), but this > support in Parquet is less clear. > Selecting on a BINARY DECIMAL field in a parquet file throws an exception as > shown below (java.lang.ClassCastException: > org.apache.drill.exec.vector.Decimal28SparseVector cannot be cast to > org.apache.drill.exec.vector.VariableWidthVector). The successful query at > bottom selected on a string field in the same file. > 0: jdbc:drill:zk=local> select count(*) from > dfs.`c:/dao/DBArchivePredictor/tenrows.parquet` where acct_no=7020; > org.apache.drill.common.exceptions.DrillRuntimeException: Error in parquet > recor > d reader. 
> Message: Failure in setting up reader > Parquet Metadata: ParquetMetaData{FileMetaData{schema: message sbi.acct_mstr { > required binary ACCT_NO (DECIMAL(20,0)); > optional binary SF_NO (UTF8); > optional binary LF_NO (UTF8); > optional binary BRANCH_NO (DECIMAL(20,0)); > optional binary INTRO_CUST_NO (DECIMAL(20,0)); > optional binary INTRO_ACCT_NO (DECIMAL(20,0)); > optional binary INTRO_SIGN (UTF8); > optional binary TYPE (UTF8); > optional binary OPR_MODE (UTF8); > optional binary CUR_ACCT_TYPE (UTF8); > optional binary TITLE (UTF8); > optional binary CORP_CUST_NO (DECIMAL(20,0)); > optional binary APLNDT (UTF8); > optional binary OPNDT (UTF8); > optional binary VERI_EMP_NO (DECIMAL(20,0)); > optional binary VERI_SIGN (UTF8); > optional binary MANAGER_SIGN (UTF8); > optional binary CURBAL (DECIMAL(8,2)); > optional binary STATUS (UTF8); > } > , metadata: > {parquet.avro.schema={"type":"record","name":"acct_mstr","namespace" > :"sbi","fields":[{"name":"ACCT_NO","type":{"type":"bytes","logicalType":"decimal > ","precision":20,"scale":0,"cv_auto_incr":false,"cv_case_sensitive":false,"cv_co > lumn_class":"java.math.BigDecimal","cv_connection":"oracle.jdbc.driver.T4CConnec > tion","cv_currency":true,"cv_def_writable":false,"cv_nullable":0,"cv_precision": > 20,"cv_read_only":false,"cv_scale":0,"cv_searchable":true,"cv_signed":true,"cv_s > ubscript":1,"cv_type":2,"cv_typename":"NUMBER","cv_writable":true}},{"name":"SF_ > NO","type":["null",{"type":"string","cv_auto_incr":false,"cv_case_sensitive":tru > e,"cv_column_class":"java.lang.String","cv_currency":fals
[jira] [Commented] (DRILL-4184) Drill does not support Parquet DECIMAL values in variable length BINARY fields
[ https://issues.apache.org/jira/browse/DRILL-4184?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15181796#comment-15181796 ] ASF GitHub Bot commented on DRILL-4184: --- Github user jaltekruse commented on the pull request: https://github.com/apache/drill/pull/372#issuecomment-192701211 @daveoshinsky I have a few questions about your design here, I have a proposed alternative but I might be wrong about what you are trying to accomplish. > Drill does not support Parquet DECIMAL values in variable length BINARY fields > -- > > Key: DRILL-4184 > URL: https://issues.apache.org/jira/browse/DRILL-4184 > Project: Apache Drill > Issue Type: Bug > Components: Storage - Parquet >Affects Versions: 1.4.0 > Environment: Windows 7 Professional, Java 1.8.0_66 >Reporter: Dave Oshinsky > > Encoding a DECIMAL logical type in Parquet using the variable length BINARY > primitive type is not supported by Drill as of versions 1.3.0 and 1.4.0. The > problem first surfaces with the ClassCastException shown below, but fixing > the immediate cause of the exception is not sufficient to support this > combination (DECIMAL, BINARY) in a Parquet file. > In Drill, DECIMAL is currently assumed to be INT32, INT64, INT96, or > FIXED_LEN_BINARY_ARRAY. Are there any plans to support DECIMAL with variable > length BINARY? Avro definitely supports encoding DECIMAL in variable length > bytes (see https://avro.apache.org/docs/current/spec.html#Decimal), but this > support in Parquet is less clear. > Selecting on a BINARY DECIMAL field in a parquet file throws an exception as > shown below (java.lang.ClassCastException: > org.apache.drill.exec.vector.Decimal28SparseVector cannot be cast to > org.apache.drill.exec.vector.VariableWidthVector). The successful query at > bottom selected on a string field in the same file. 
> 0: jdbc:drill:zk=local> select count(*) from > dfs.`c:/dao/DBArchivePredictor/tenrows.parquet` where acct_no=7020; > org.apache.drill.common.exceptions.DrillRuntimeException: Error in parquet > recor > d reader. > Message: Failure in setting up reader > Parquet Metadata: ParquetMetaData{FileMetaData{schema: message sbi.acct_mstr { > required binary ACCT_NO (DECIMAL(20,0)); > optional binary SF_NO (UTF8); > optional binary LF_NO (UTF8); > optional binary BRANCH_NO (DECIMAL(20,0)); > optional binary INTRO_CUST_NO (DECIMAL(20,0)); > optional binary INTRO_ACCT_NO (DECIMAL(20,0)); > optional binary INTRO_SIGN (UTF8); > optional binary TYPE (UTF8); > optional binary OPR_MODE (UTF8); > optional binary CUR_ACCT_TYPE (UTF8); > optional binary TITLE (UTF8); > optional binary CORP_CUST_NO (DECIMAL(20,0)); > optional binary APLNDT (UTF8); > optional binary OPNDT (UTF8); > optional binary VERI_EMP_NO (DECIMAL(20,0)); > optional binary VERI_SIGN (UTF8); > optional binary MANAGER_SIGN (UTF8); > optional binary CURBAL (DECIMAL(8,2)); > optional binary STATUS (UTF8); > } > , metadata: > {parquet.avro.schema={"type":"record","name":"acct_mstr","namespace" > :"sbi","fields":[{"name":"ACCT_NO","type":{"type":"bytes","logicalType":"decimal > ","precision":20,"scale":0,"cv_auto_incr":false,"cv_case_sensitive":false,"cv_co > lumn_class":"java.math.BigDecimal","cv_connection":"oracle.jdbc.driver.T4CConnec > tion","cv_currency":true,"cv_def_writable":false,"cv_nullable":0,"cv_precision": > 20,"cv_read_only":false,"cv_scale":0,"cv_searchable":true,"cv_signed":true,"cv_s > ubscript":1,"cv_type":2,"cv_typename":"NUMBER","cv_writable":true}},{"name":"SF_ > NO","type":["null",{"type":"string","cv_auto_incr":false,"cv_case_sensitive":tru > e,"cv_column_class":"java.lang.String","cv_currency":false,"cv_def_writable":fal > se,"cv_nullable":1,"cv_precision":10,"cv_read_only":false,"cv_scale":0,"cv_searc > hable":true,"cv_signed":true,"cv_subscript":2,"cv_type":12,"cv_typename":"VARCHA > 
R2","cv_writable":true}]},{"name":"LF_NO","type":["null",{"type":"string","cv_au > to_incr":false,"cv_case_sensitive":true,"cv_column_class":"java.lang.String","cv > _currency":false,"cv_def_writable":false,"cv_nullable":1,"cv_precision":10,"cv_r > ead_only":false,"cv_scale":0,"cv_searchable":true,"cv_signed":true,"cv_subscript > ":3,"cv_type":12,"cv_typename":"VARCHAR2","cv_writable":true}]},{"name":"BRANCH_ > NO","type":["null",{"type":"bytes","logicalType":"decimal","precision":20,"scale > ":0,"cv_auto_incr":false,"cv_case_sensitive":false,"cv_column_class":"java.math. > BigDecimal","cv_currency":true,"cv_def_writable":false,"cv_nullable":1,"cv_preci > sion":20,"cv_read_only":false,"cv_scale":0,"cv_searchable":true,"cv_signed":true > ,"cv_subscript":4,"cv_type":2,"cv_typename":"NUMBER","cv_writable":true}]},{"nam > e":"INTRO_CUST_NO","type":["null",{"type":"by
[jira] [Commented] (DRILL-4184) Drill does not support Parquet DECIMAL values in variable length BINARY fields
[ https://issues.apache.org/jira/browse/DRILL-4184?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15181795#comment-15181795 ] ASF GitHub Bot commented on DRILL-4184: --- Github user jaltekruse commented on a diff in the pull request: https://github.com/apache/drill/pull/372#discussion_r55125267 --- Diff: exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/columnreaders/NullableVarLengthValuesColumn.java --- @@ -69,11 +73,16 @@ protected boolean readAndStoreValueSizeInformation() throws IOException { if ( currDefLevel == -1 ) { currDefLevel = pageReader.definitionLevels.readInteger(); } -if ( columnDescriptor.getMaxDefinitionLevel() > currDefLevel) { + +if (columnDescriptor.getMaxDefinitionLevel() > currDefLevel) { nullsRead++; - // set length of zero, each index in the vector defaults to null so no need to set the nullability - variableWidthVector.getMutator().setValueLengthSafe( - valuesReadInCurrentPass + pageReader.valuesReadyToRead, 0); + // set length of zero, each index in the vector defaults to null so no + // need to set the nullability + if (variableWidthVector == null) { --- End diff -- I know this class hierarchy is a bit messy as is ( I am the original author of most if it, I should go back to clean it up). But we shouldn't be using the presence or absence of this field as a flag to know which class we are in. I'm > Drill does not support Parquet DECIMAL values in variable length BINARY fields > -- > > Key: DRILL-4184 > URL: https://issues.apache.org/jira/browse/DRILL-4184 > Project: Apache Drill > Issue Type: Bug > Components: Storage - Parquet >Affects Versions: 1.4.0 > Environment: Windows 7 Professional, Java 1.8.0_66 >Reporter: Dave Oshinsky > > Encoding a DECIMAL logical type in Parquet using the variable length BINARY > primitive type is not supported by Drill as of versions 1.3.0 and 1.4.0. 
The > problem first surfaces with the ClassCastException shown below, but fixing > the immediate cause of the exception is not sufficient to support this > combination (DECIMAL, BINARY) in a Parquet file. > In Drill, DECIMAL is currently assumed to be INT32, INT64, INT96, or > FIXED_LEN_BINARY_ARRAY. Are there any plans to support DECIMAL with variable > length BINARY? Avro definitely supports encoding DECIMAL in variable length > bytes (see https://avro.apache.org/docs/current/spec.html#Decimal), but this > support in Parquet is less clear. > Selecting on a BINARY DECIMAL field in a parquet file throws an exception as > shown below (java.lang.ClassCastException: > org.apache.drill.exec.vector.Decimal28SparseVector cannot be cast to > org.apache.drill.exec.vector.VariableWidthVector). The successful query at > bottom selected on a string field in the same file. > 0: jdbc:drill:zk=local> select count(*) from > dfs.`c:/dao/DBArchivePredictor/tenrows.parquet` where acct_no=7020; > org.apache.drill.common.exceptions.DrillRuntimeException: Error in parquet > recor > d reader. 
> Message: Failure in setting up reader > Parquet Metadata: ParquetMetaData{FileMetaData{schema: message sbi.acct_mstr { > required binary ACCT_NO (DECIMAL(20,0)); > optional binary SF_NO (UTF8); > optional binary LF_NO (UTF8); > optional binary BRANCH_NO (DECIMAL(20,0)); > optional binary INTRO_CUST_NO (DECIMAL(20,0)); > optional binary INTRO_ACCT_NO (DECIMAL(20,0)); > optional binary INTRO_SIGN (UTF8); > optional binary TYPE (UTF8); > optional binary OPR_MODE (UTF8); > optional binary CUR_ACCT_TYPE (UTF8); > optional binary TITLE (UTF8); > optional binary CORP_CUST_NO (DECIMAL(20,0)); > optional binary APLNDT (UTF8); > optional binary OPNDT (UTF8); > optional binary VERI_EMP_NO (DECIMAL(20,0)); > optional binary VERI_SIGN (UTF8); > optional binary MANAGER_SIGN (UTF8); > optional binary CURBAL (DECIMAL(8,2)); > optional binary STATUS (UTF8); > } > , metadata: > {parquet.avro.schema={"type":"record","name":"acct_mstr","namespace" > :"sbi","fields":[{"name":"ACCT_NO","type":{"type":"bytes","logicalType":"decimal > ","precision":20,"scale":0,"cv_auto_incr":false,"cv_case_sensitive":false,"cv_co > lumn_class":"java.math.BigDecimal","cv_connection":"oracle.jdbc.driver.T4CConnec > tion","cv_currency":true,"cv_def_writable":false,"cv_nullable":0,"cv_precision": > 20,"cv_read_only":false,"cv_scale":0,"cv_searchable":true,"cv_signed":true,"cv_s > ubscript":1,"cv_type":2,"cv_typename":"NUMBER","cv_writable":true}},{"name":"SF_ > NO","type":["null",{"type":"string","cv_auto_incr":false,"cv_case_sensitive":tru > e,"cv_column_class":"java.lang.String","cv_currency":f
[jira] [Commented] (DRILL-4184) Drill does not support Parquet DECIMAL values in variable length BINARY fields
[ https://issues.apache.org/jira/browse/DRILL-4184?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15181794#comment-15181794 ] ASF GitHub Bot commented on DRILL-4184: --- Github user jaltekruse commented on a diff in the pull request: https://github.com/apache/drill/pull/372#discussion_r55125265 --- Diff: exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/columnreaders/VarLengthValuesColumn.java --- @@ -81,6 +112,14 @@ public boolean skipReadyToReadPositionUpdate() { return false; } + // decimalLengths list is part of a near-term fix for DRILL-4184. + // Decimal[23]8SparseVector classes are fixed width vectors, without ability to "remember" offsets of + // (variable width) field sizes. so, we "remember" the array sizes in decimalLengths (also used to + // "remember" whether a value was null, for nullable decimal columns). + // TODO: storage of decimal values should support variable length values in a much cleaner way than this, + // perhaps with a new variable width Decimal vector class. + protected ArrayList decimalLengths = new ArrayList(); --- End diff -- please put class members at the top of the class. > Drill does not support Parquet DECIMAL values in variable length BINARY fields > -- > > Key: DRILL-4184 > URL: https://issues.apache.org/jira/browse/DRILL-4184 > Project: Apache Drill > Issue Type: Bug > Components: Storage - Parquet >Affects Versions: 1.4.0 > Environment: Windows 7 Professional, Java 1.8.0_66 >Reporter: Dave Oshinsky > > Encoding a DECIMAL logical type in Parquet using the variable length BINARY > primitive type is not supported by Drill as of versions 1.3.0 and 1.4.0. The > problem first surfaces with the ClassCastException shown below, but fixing > the immediate cause of the exception is not sufficient to support this > combination (DECIMAL, BINARY) in a Parquet file. > In Drill, DECIMAL is currently assumed to be INT32, INT64, INT96, or > FIXED_LEN_BINARY_ARRAY. 
Are there any plans to support DECIMAL with variable > length BINARY? Avro definitely supports encoding DECIMAL in variable length > bytes (see https://avro.apache.org/docs/current/spec.html#Decimal), but this > support in Parquet is less clear. > Selecting on a BINARY DECIMAL field in a parquet file throws an exception as > shown below (java.lang.ClassCastException: > org.apache.drill.exec.vector.Decimal28SparseVector cannot be cast to > org.apache.drill.exec.vector.VariableWidthVector). The successful query at > bottom selected on a string field in the same file. > 0: jdbc:drill:zk=local> select count(*) from > dfs.`c:/dao/DBArchivePredictor/tenrows.parquet` where acct_no=7020; > org.apache.drill.common.exceptions.DrillRuntimeException: Error in parquet > recor > d reader. > Message: Failure in setting up reader > Parquet Metadata: ParquetMetaData{FileMetaData{schema: message sbi.acct_mstr { > required binary ACCT_NO (DECIMAL(20,0)); > optional binary SF_NO (UTF8); > optional binary LF_NO (UTF8); > optional binary BRANCH_NO (DECIMAL(20,0)); > optional binary INTRO_CUST_NO (DECIMAL(20,0)); > optional binary INTRO_ACCT_NO (DECIMAL(20,0)); > optional binary INTRO_SIGN (UTF8); > optional binary TYPE (UTF8); > optional binary OPR_MODE (UTF8); > optional binary CUR_ACCT_TYPE (UTF8); > optional binary TITLE (UTF8); > optional binary CORP_CUST_NO (DECIMAL(20,0)); > optional binary APLNDT (UTF8); > optional binary OPNDT (UTF8); > optional binary VERI_EMP_NO (DECIMAL(20,0)); > optional binary VERI_SIGN (UTF8); > optional binary MANAGER_SIGN (UTF8); > optional binary CURBAL (DECIMAL(8,2)); > optional binary STATUS (UTF8); > } > , metadata: > {parquet.avro.schema={"type":"record","name":"acct_mstr","namespace" > :"sbi","fields":[{"name":"ACCT_NO","type":{"type":"bytes","logicalType":"decimal > ","precision":20,"scale":0,"cv_auto_incr":false,"cv_case_sensitive":false,"cv_co > lumn_class":"java.math.BigDecimal","cv_connection":"oracle.jdbc.driver.T4CConnec > 
tion","cv_currency":true,"cv_def_writable":false,"cv_nullable":0,"cv_precision": > 20,"cv_read_only":false,"cv_scale":0,"cv_searchable":true,"cv_signed":true,"cv_s > ubscript":1,"cv_type":2,"cv_typename":"NUMBER","cv_writable":true}},{"name":"SF_ > NO","type":["null",{"type":"string","cv_auto_incr":false,"cv_case_sensitive":tru > e,"cv_column_class":"java.lang.String","cv_currency":false,"cv_def_writable":fal > se,"cv_nullable":1,"cv_precision":10,"cv_read_only":false,"cv_scale":0,"cv_searc > hable":true,"cv_signed":true,"cv_subscript":2,"cv_type":12,"cv_typename":"VARCHA > R2","cv_writable":true}]},{"name":"LF_NO","type":["null",{"type":"string","cv_au > to_incr":false,"c
[jira] [Commented] (DRILL-4184) Drill does not support Parquet DECIMAL values in variable length BINARY fields
[ https://issues.apache.org/jira/browse/DRILL-4184?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15139970#comment-15139970 ] ASF GitHub Bot commented on DRILL-4184: --- GitHub user daveoshinsky opened a pull request: https://github.com/apache/drill/pull/372 DRILL-4184: support variable length decimal fields in parquet Support decimal fields in parquet that are stored as variable length BINARY. Parquet files that store decimal values this way are often significantly smaller than ones storing decimal values as FIXED_LEN_BYTE_ARRAY's (full precision). You can merge this pull request into a Git repository by running: $ git pull https://github.com/daveoshinsky/drill master Alternatively you can review and apply these changes as the patch at: https://github.com/apache/drill/pull/372.patch To close this pull request, make a commit to your master/trunk branch with (at least) the following in the commit message: This closes #372 commit 9a47ca52125139d88adf39b5d894a02f870f37d9 Author: U-COMMVAULT-NJ\doshinsky Date: 2016-02-09T22:37:47Z DRILL-4184: support variable length decimal fields in parquet commit dec00a808c99554f008e23fd21b944b858aa9ae0 Author: daveoshinsky Date: 2016-02-09T22:56:28Z DRILL-4184: changes to support variable length decimal fields in parquet > Drill does not support Parquet DECIMAL values in variable length BINARY fields > -- > > Key: DRILL-4184 > URL: https://issues.apache.org/jira/browse/DRILL-4184 > Project: Apache Drill > Issue Type: Bug > Components: Storage - Parquet >Affects Versions: 1.4.0 > Environment: Windows 7 Professional, Java 1.8.0_66 >Reporter: Dave Oshinsky > > Encoding a DECIMAL logical type in Parquet using the variable length BINARY > primitive type is not supported by Drill as of versions 1.3.0 and 1.4.0. 
The > problem first surfaces with the ClassCastException shown below, but fixing > the immediate cause of the exception is not sufficient to support this > combination (DECIMAL, BINARY) in a Parquet file. > In Drill, DECIMAL is currently assumed to be INT32, INT64, INT96, or > FIXED_LEN_BINARY_ARRAY. Are there any plans to support DECIMAL with variable > length BINARY? Avro definitely supports encoding DECIMAL in variable length > bytes (see https://avro.apache.org/docs/current/spec.html#Decimal), but this > support in Parquet is less clear. > Selecting on a BINARY DECIMAL field in a parquet file throws an exception as > shown below (java.lang.ClassCastException: > org.apache.drill.exec.vector.Decimal28SparseVector cannot be cast to > org.apache.drill.exec.vector.VariableWidthVector). The successful query at > bottom selected on a string field in the same file. > 0: jdbc:drill:zk=local> select count(*) from > dfs.`c:/dao/DBArchivePredictor/tenrows.parquet` where acct_no=7020; > org.apache.drill.common.exceptions.DrillRuntimeException: Error in parquet > recor > d reader. 
> Message: Failure in setting up reader > Parquet Metadata: ParquetMetaData{FileMetaData{schema: message sbi.acct_mstr { > required binary ACCT_NO (DECIMAL(20,0)); > optional binary SF_NO (UTF8); > optional binary LF_NO (UTF8); > optional binary BRANCH_NO (DECIMAL(20,0)); > optional binary INTRO_CUST_NO (DECIMAL(20,0)); > optional binary INTRO_ACCT_NO (DECIMAL(20,0)); > optional binary INTRO_SIGN (UTF8); > optional binary TYPE (UTF8); > optional binary OPR_MODE (UTF8); > optional binary CUR_ACCT_TYPE (UTF8); > optional binary TITLE (UTF8); > optional binary CORP_CUST_NO (DECIMAL(20,0)); > optional binary APLNDT (UTF8); > optional binary OPNDT (UTF8); > optional binary VERI_EMP_NO (DECIMAL(20,0)); > optional binary VERI_SIGN (UTF8); > optional binary MANAGER_SIGN (UTF8); > optional binary CURBAL (DECIMAL(8,2)); > optional binary STATUS (UTF8); > } > , metadata: > {parquet.avro.schema={"type":"record","name":"acct_mstr","namespace" > :"sbi","fields":[{"name":"ACCT_NO","type":{"type":"bytes","logicalType":"decimal > ","precision":20,"scale":0,"cv_auto_incr":false,"cv_case_sensitive":false,"cv_co > lumn_class":"java.math.BigDecimal","cv_connection":"oracle.jdbc.driver.T4CConnec > tion","cv_currency":true,"cv_def_writable":false,"cv_nullable":0,"cv_precision": > 20,"cv_read_only":false,"cv_scale":0,"cv_searchable":true,"cv_signed":true,"cv_s > ubscript":1,"cv_type":2,"cv_typename":"NUMBER","cv_writable":true}},{"name":"SF_ > NO","type":["null",{"type":"string","cv_auto_incr":false,"cv_case_sensitive":tru > e,"cv_column_class":"java.lang.String","cv_currency":false,"cv_def_writable":fal > se,"cv_nullable":1,"cv_precision":10,"cv_read_only":false,"cv_scale":0,"cv_searc > hable":true,"cv_signed":true,"cv_subscript":2,"cv_type":12,"cv_typena