[ https://issues.apache.org/jira/browse/DRILL-6744?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16686354#comment-16686354 ]

ASF GitHub Bot commented on DRILL-6744:
---------------------------------------

vvysotskyi commented on a change in pull request #1537: DRILL-6744: Support varchar and decimal push down
URL: https://github.com/apache/drill/pull/1537#discussion_r233394516
 
 

 ##########
 File path: exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/stat/ParquetMetaStatCollector.java
 ##########
 @@ -132,62 +129,163 @@ public ParquetMetaStatCollector(ParquetTableMetadataBase parquetTableMetadata,
   }
 
   /**
-   * Builds column statistics using given primitiveType, originalType, scale,
-   * precision, numNull, min and max values.
+   * Helper class that creates parquet {@link ColumnStatistics} based on given
+   * min and max values, type, number of nulls, precision and scale.
    *
-   * @param min             min value for statistics
-   * @param max             max value for statistics
-   * @param numNulls        num_nulls for statistics
-   * @param primitiveType   type that determines statistics class
-   * @param originalType    type that determines statistics class
-   * @param scale           scale value (used for DECIMAL type)
-   * @param precision       precision value (used for DECIMAL type)
-   * @return column statistics
    */
-  private ColumnStatistics getStat(Object min, Object max, long numNulls,
-                                   PrimitiveType.PrimitiveTypeName primitiveType, OriginalType originalType,
-                                   int scale, int precision) {
-    Statistics stat = Statistics.getStatsBasedOnType(primitiveType);
-    Statistics convertedStat = stat;
-
-    TypeProtos.MajorType type = ParquetReaderUtility.getType(primitiveType, originalType, scale, precision);
-    stat.setNumNulls(numNulls);
-
-    if (min != null && max != null ) {
-      switch (type.getMinorType()) {
-      case INT :
-      case TIME:
-        ((IntStatistics) stat).setMinMax(Integer.parseInt(min.toString()), Integer.parseInt(max.toString()));
-        break;
-      case BIGINT:
-      case TIMESTAMP:
-        ((LongStatistics) stat).setMinMax(Long.parseLong(min.toString()), Long.parseLong(max.toString()));
-        break;
-      case FLOAT4:
-        ((FloatStatistics) stat).setMinMax(Float.parseFloat(min.toString()), Float.parseFloat(max.toString()));
-        break;
-      case FLOAT8:
-        ((DoubleStatistics) stat).setMinMax(Double.parseDouble(min.toString()), Double.parseDouble(max.toString()));
-        break;
-      case DATE:
-        convertedStat = new LongStatistics();
-        convertedStat.setNumNulls(stat.getNumNulls());
-        final long minMS = convertToDrillDateValue(Integer.parseInt(min.toString()));
-        final long maxMS = convertToDrillDateValue(Integer.parseInt(max.toString()));
-        ((LongStatistics) convertedStat ).setMinMax(minMS, maxMS);
-        break;
-      case BIT:
-        ((BooleanStatistics) stat).setMinMax(Boolean.parseBoolean(min.toString()), Boolean.parseBoolean(max.toString()));
-        break;
-      default:
-      }
+  private static class ColumnStatisticsBuilder {
+
+    private Object min;
+    private Object max;
+    private long numNulls;
+    private PrimitiveType.PrimitiveTypeName primitiveType;
+    private OriginalType originalType;
+    private int scale;
+    private int precision;
+
+    static ColumnStatisticsBuilder builder() {
+      return new ColumnStatisticsBuilder();
+    }
 
-    return new ColumnStatistics(convertedStat, type);
-  }
+    ColumnStatisticsBuilder setMin(Object min) {
+      this.min = min;
+      return this;
+    }
+
+    ColumnStatisticsBuilder setMax(Object max) {
+      this.max = max;
+      return this;
+    }
+
+    ColumnStatisticsBuilder setNumNulls(long numNulls) {
+      this.numNulls = numNulls;
+      return this;
+    }
+
+    ColumnStatisticsBuilder setPrimitiveType(PrimitiveType.PrimitiveTypeName primitiveType) {
+      this.primitiveType = primitiveType;
+      return this;
+    }
+
+    ColumnStatisticsBuilder setOriginalType(OriginalType originalType) {
+      this.originalType = originalType;
+      return this;
+    }
 
-  private static long convertToDrillDateValue(int dateValue) {
+    ColumnStatisticsBuilder setScale(int scale) {
+      this.scale = scale;
+      return this;
+    }
+
+    ColumnStatisticsBuilder setPrecision(int precision) {
+      this.precision = precision;
+      return this;
+    }
+
+
+    /**
+     * Builds column statistics using given primitive and original types,
+     * scale, precision, number of nulls, min and max values.
+     * Min and max values for binary statistics are set only if allowed.
+     *
+     * @return column statistics
+     */
+    ColumnStatistics build() {
+      Statistics stat = Statistics.getStatsBasedOnType(primitiveType);
+      Statistics convertedStat = stat;
+
+      TypeProtos.MajorType type = ParquetReaderUtility.getType(primitiveType, originalType, scale, precision);
+      stat.setNumNulls(numNulls);
+
+      if (min != null && max != null) {
+        switch (type.getMinorType()) {
+          case INT :
+          case TIME:
+            ((IntStatistics) stat).setMinMax(Integer.parseInt(min.toString()), Integer.parseInt(max.toString()));
+            break;
+          case BIGINT:
+          case TIMESTAMP:
+            ((LongStatistics) stat).setMinMax(Long.parseLong(min.toString()), Long.parseLong(max.toString()));
+            break;
+          case FLOAT4:
+            ((FloatStatistics) stat).setMinMax(Float.parseFloat(min.toString()), Float.parseFloat(max.toString()));
+            break;
+          case FLOAT8:
+            ((DoubleStatistics) stat).setMinMax(Double.parseDouble(min.toString()), Double.parseDouble(max.toString()));
+            break;
+          case DATE:
+            convertedStat = new LongStatistics();
+            convertedStat.setNumNulls(stat.getNumNulls());
+            long minMS = convertToDrillDateValue(Integer.parseInt(min.toString()));
+            long maxMS = convertToDrillDateValue(Integer.parseInt(max.toString()));
+            ((LongStatistics) convertedStat ).setMinMax(minMS, maxMS);
+            break;
+          case BIT:
+            ((BooleanStatistics) stat).setMinMax(Boolean.parseBoolean(min.toString()), Boolean.parseBoolean(max.toString()));
+            break;
+          case VARCHAR:
+            if (min instanceof Binary && max instanceof Binary) { // when read directly from parquet footer
+              ((BinaryStatistics) stat).setMinMaxFromBytes(((Binary) min).getBytes(), ((Binary) max).getBytes());
+            } else if (min instanceof byte[] && max instanceof byte[]) { // when deserialized from Drill metadata file
+              ((BinaryStatistics) stat).setMinMaxFromBytes((byte[]) min, (byte[]) max);
+            }
+            break;
+          case VARDECIMAL:
+            byte[] minBytes = null;
+            byte[] maxBytes = null;
+            boolean setLength = false;
+
+            switch (primitiveType) {
+              case INT32:
+              case INT64:
+                minBytes = new BigInteger(min.toString()).toByteArray();
+                maxBytes = new BigInteger(max.toString()).toByteArray();
+                break;
+              case FIXED_LEN_BYTE_ARRAY:
+                setLength = true;
+                // fall through
+              case BINARY:
+                // wrap up into BigInteger to avoid PARQUET-1417
+                if (min instanceof Binary && max instanceof Binary) { // when read directly from parquet footer
+                  minBytes = new BigInteger(((Binary) min).getBytes()).toByteArray();
+                  maxBytes = new BigInteger(((Binary) max).getBytes()).toByteArray();
+                } else if (min instanceof byte[] && max instanceof byte[]) { // when deserialized from Drill metadata file
+                  minBytes = new BigInteger((byte[]) min).toByteArray();
+                  maxBytes = new BigInteger((byte[]) max).toByteArray();
+                }
+                break;

 Review comment:
   `break` and `default` here may be removed.
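
For illustration, a minimal, self-contained Java sketch of the suggested simplification (hypothetical class, enum, and method names; the real code uses parquet's PrimitiveType.PrimitiveTypeName and Binary). When the last case of a switch is followed only by an empty default, the trailing `break` and the `default` label can both be dropped without changing behavior:

{noformat}
import java.math.BigInteger;

// Hypothetical standalone rework of the tail of the nested switch above, with
// the trailing `break` and the empty `default` removed. Binary handling is
// reduced to byte[] so the sketch compiles without parquet on the classpath.
public class VardecimalBytesSketch {

  enum PrimitiveTypeName { INT32, INT64, FIXED_LEN_BYTE_ARRAY, BINARY }

  static byte[] toUnscaledBytes(Object value, PrimitiveTypeName primitiveType) {
    byte[] result = null;
    switch (primitiveType) {
      case INT32:
      case INT64:
        result = new BigInteger(value.toString()).toByteArray();
        break;
      case FIXED_LEN_BYTE_ARRAY:
      case BINARY:
        // wrap into BigInteger, mirroring the PARQUET-1417 workaround above
        if (value instanceof byte[]) { // e.g. deserialized from a Drill metadata file
          result = new BigInteger((byte[]) value).toByteArray();
        }
        // no trailing `break`: BINARY is the last case, and the empty
        // `default` is omitted entirely
    }
    return result;
  }

  public static void main(String[] args) {
    // prints 42
    System.out.println(new BigInteger(toUnscaledBytes(42L, PrimitiveTypeName.INT64)));
  }
}
{noformat}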

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


> Support filter push down for varchar / decimal data types
> ---------------------------------------------------------
>
>                 Key: DRILL-6744
>                 URL: https://issues.apache.org/jira/browse/DRILL-6744
>             Project: Apache Drill
>          Issue Type: Improvement
>    Affects Versions: 1.14.0
>            Reporter: Arina Ielchiieva
>            Assignee: Arina Ielchiieva
>            Priority: Major
>              Labels: doc-impacting
>             Fix For: 1.15.0
>
>
> Now that Drill uses Apache Parquet 1.10.0, where the issue with incorrectly 
> stored varchar / decimal min / max statistics is resolved, we should add 
> support for varchar / decimal filter push down. Only files created with 
> parquet lib 1.9.1 (1.10.0) and later will be subject to push down. If the 
> user knows that previously created files have correct min / max statistics 
> (i.e. the user knows for sure that the data in binary columns is in ASCII, 
> not UTF-8), then parquet.strings.signed-min-max.enabled can be set to true 
> to enable filter push down.
> *Description*
> _Note: Drill is using Parquet 1.10.0 library since 1.13.0 version._
> *Varchar Partition Pruning*
> Varchar pruning will work for files generated both before and after Parquet 
> 1.10.0, since partition pruning requires the min and max values to be the 
> same, and there is no issue with incorrectly stored binary statistics when 
> the min and max values are equal. Partition pruning using Drill metadata 
> files will also work, no matter when the metadata file was created (before 
> or after Drill 1.15.0).
> Partition pruning won't work for files where the partition value is null due 
> to PARQUET-1341; the issue will be fixed in Parquet 1.11.0.
> *Varchar Filter Push Down*
> Varchar filter push down will work for parquet files created with Parquet 
> 1.10.0 and later.
> There are two options to enable push down for files generated with earlier 
> Parquet versions, when the user knows for sure that the binary data is in 
> ASCII (not UTF-8):
> 1. Set the configuration {{enableStringsSignedMinMax}} to true (false by 
> default) for the parquet format plugin:
> {noformat}
>         "parquet" : {
>           type: "parquet",
>           enableStringsSignedMinMax: true 
>         }
> {noformat}
> This would apply to all parquet files of a given file plugin, including all 
> workspaces.
> 2. To enable / disable reading binary statistics for old parquet files per 
> session, the session option {{store.parquet.reader.strings_signed_min_max}} 
> can be used; an example is shown below. By default, it has an empty string 
> value. Setting this option takes priority over the config in the parquet 
> format plugin. The option allows three values: 'true', 'false', and '' 
> (empty string).
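> For example, to set the option for the current session (a minimal sketch 
> using Drill's standard ALTER SESSION syntax):
> {noformat}
> ALTER SESSION SET `store.parquet.reader.strings_signed_min_max` = 'true';
> {noformat}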
> _Note: store.parquet.reader.strings_signed_min_max can also be set at the 
> system level, in which case it applies to all parquet files in the system._
> The same config / session option applies to reading binary statistics from 
> Drill metadata files generated prior to Drill 1.15.0. If a Drill metadata 
> file was created prior to Drill 1.15.0, but for parquet files created with 
> Parquet library 1.10.0 or later, the user has to enable the config / session 
> option or regenerate the Drill metadata file with Drill 1.15.0 or later, 
> because the metadata file alone does not tell whether the statistics are 
> stored correctly (previously Drill was writing and reading binary statistics 
> by default, though it did not use them).
> When creating a Drill metadata file with Drill 1.15.0 or later for old 
> parquet files, the user should mind the config / session option. If 
> strings_signed_min_max is enabled, Drill will store binary statistics in the 
> Drill metadata file, and since the metadata file was created with Drill 
> 1.15.0 or later, Drill will read them back regardless of the option 
> (assuming that statistics present in the Drill metadata file are correct). 
> If the user mistakenly enabled strings_signed_min_max, they need to disable 
> it and regenerate the Drill metadata file. The same holds in the opposite 
> direction: if the user created the metadata file while 
> strings_signed_min_max was disabled, no min / max values for binary 
> statistics will be written, and thus none read back, even if 
> strings_signed_min_max is enabled when the metadata is read.
> *Decimal Partition Pruning*
> Decimal values can be represented by four physical types: int_32, int_64, 
> fixed_len_byte_array and binary; the mapping is sketched below.
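> For example (an illustrative Java sketch, not Drill code: parquet stores a 
> decimal's unscaled integer, while scale and precision live in the schema):
> {noformat}
> import java.math.BigDecimal;
> import java.math.BigInteger;
>
> public class DecimalRepresentationSketch {
>   public static void main(String[] args) {
>     BigDecimal value = new BigDecimal("12.34");        // DECIMAL(4, 2)
>     BigInteger unscaled = value.unscaledValue();       // 1234: stored as int_32 / int_64
>     byte[] bytes = unscaled.toByteArray();             // {0x04, 0xD2}: stored as
>     System.out.println(unscaled + " " + bytes.length); // fixed_len_byte_array / binary
>   }
> }
> {noformat}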
> Partition pruning will work for all four types for both old and new decimal 
> files, i.e. files created before and after Parquet 1.10.0. Partition pruning 
> won't work for files with a null partition due to PARQUET-1341, which will 
> be fixed in Parquet 1.11.0.
> Partition pruning with a Drill metadata file will work for old and new 
> decimal files regardless of which Drill version created the metadata file.
> *Decimal Filter Push Down*
> For int_32 / int_64 decimals, push down will work only for new files (i.e. 
> generated by Parquet 1.10.0 and later); for old files, push down won't work 
> due to PARQUET-1322.
> For old int_32 / int_64 decimals, push down will work with an old Drill 
> metadata file, i.e. one created prior to Drill 1.14.0; for a Drill metadata 
> file generated after Drill 1.14.0, push down won't work, since it was 
> generated after the upgrade to Parquet 1.10.0 (due to PARQUET-1322). For new 
> int_32 / int_64 decimals, push down will work with a new Drill metadata 
> file.
> For old fixed_len_byte_array / binary decimal files generated prior to 
> Parquet 1.10.0, filter push down won't work. Push down with an old Drill 
> metadata file works only if the strings_signed_min_max config / session 
> option is set to true. Push down with a new Drill metadata file won't work.
> For new fixed_len_byte_array / binary files, filter push down will work with 
> and without a metadata file (only if the Drill metadata file was generated 
> by Drill 1.15.0). If the Drill metadata file was generated prior to Drill 
> 1.15.0, the user needs to enable the strings_signed_min_max config / session 
> option or re-generate the Drill metadata file to enable reading such 
> statistics.
> *Hive Varchar Filter Push Down using Drill native reader*
> Hive 2.3 parquet files are generated with a Parquet library version prior to 
> 1.10.0, where statistics for binary UTF-8 data can be stored incorrectly. If 
> the user knows for sure that the data in the binary columns is in ASCII (not 
> in UTF-8), the session option store.parquet.reader.strings_signed_min_max 
> can be set to 'true' to enable varchar filter push down.
> *Hive Decimal Filter Push Down using Drill native reader*
> Hive 2.3 parquet files are generated with a Parquet library version prior to 
> 1.10.0; decimal statistics for such files are not available, so push down 
> won't work with Hive parquet decimal files.



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)
