vvysotskyi commented on a change in pull request #1537: DRILL-6744: Support varchar and decimal push down
URL: https://github.com/apache/drill/pull/1537#discussion_r233394516
########## File path: exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/stat/ParquetMetaStatCollector.java ########## @@ -132,62 +129,163 @@ public ParquetMetaStatCollector(ParquetTableMetadataBase parquetTableMetadata, } /** - * Builds column statistics using given primitiveType, originalType, scale, - * precision, numNull, min and max values. + * Helper class that creates parquet {@link ColumnStatistics} based on given + * min and max values, type, number of nulls, precision and scale. * - * @param min min value for statistics - * @param max max value for statistics - * @param numNulls num_nulls for statistics - * @param primitiveType type that determines statistics class - * @param originalType type that determines statistics class - * @param scale scale value (used for DECIMAL type) - * @param precision precision value (used for DECIMAL type) - * @return column statistics */ - private ColumnStatistics getStat(Object min, Object max, long numNulls, - PrimitiveType.PrimitiveTypeName primitiveType, OriginalType originalType, - int scale, int precision) { - Statistics stat = Statistics.getStatsBasedOnType(primitiveType); - Statistics convertedStat = stat; - - TypeProtos.MajorType type = ParquetReaderUtility.getType(primitiveType, originalType, scale, precision); - stat.setNumNulls(numNulls); - - if (min != null && max != null ) { - switch (type.getMinorType()) { - case INT : - case TIME: - ((IntStatistics) stat).setMinMax(Integer.parseInt(min.toString()), Integer.parseInt(max.toString())); - break; - case BIGINT: - case TIMESTAMP: - ((LongStatistics) stat).setMinMax(Long.parseLong(min.toString()), Long.parseLong(max.toString())); - break; - case FLOAT4: - ((FloatStatistics) stat).setMinMax(Float.parseFloat(min.toString()), Float.parseFloat(max.toString())); - break; - case FLOAT8: - ((DoubleStatistics) stat).setMinMax(Double.parseDouble(min.toString()), Double.parseDouble(max.toString())); - break; - case DATE: - convertedStat = new LongStatistics(); 
- convertedStat.setNumNulls(stat.getNumNulls()); - final long minMS = convertToDrillDateValue(Integer.parseInt(min.toString())); - final long maxMS = convertToDrillDateValue(Integer.parseInt(max.toString())); - ((LongStatistics) convertedStat ).setMinMax(minMS, maxMS); - break; - case BIT: - ((BooleanStatistics) stat).setMinMax(Boolean.parseBoolean(min.toString()), Boolean.parseBoolean(max.toString())); - break; - default: - } + private static class ColumnStatisticsBuilder { + + private Object min; + private Object max; + private long numNulls; + private PrimitiveType.PrimitiveTypeName primitiveType; + private OriginalType originalType; + private int scale; + private int precision; + + static ColumnStatisticsBuilder builder() { + return new ColumnStatisticsBuilder(); } - return new ColumnStatistics(convertedStat, type); - } + ColumnStatisticsBuilder setMin(Object min) { + this.min = min; + return this; + } + + ColumnStatisticsBuilder setMax(Object max) { + this.max = max; + return this; + } + + ColumnStatisticsBuilder setNumNulls(long numNulls) { + this.numNulls = numNulls; + return this; + } + + ColumnStatisticsBuilder setPrimitiveType(PrimitiveType.PrimitiveTypeName primitiveType) { + this.primitiveType = primitiveType; + return this; + } + + ColumnStatisticsBuilder setOriginalType(OriginalType originalType) { + this.originalType = originalType; + return this; + } - private static long convertToDrillDateValue(int dateValue) { + ColumnStatisticsBuilder setScale(int scale) { + this.scale = scale; + return this; + } + + ColumnStatisticsBuilder setPrecision(int precision) { + this.precision = precision; + return this; + } + + + /** + * Builds column statistics using given primitive and original types, + * scale, precision, number of nulls, min and max values. + * Min and max values for binary statistics are set only if allowed. 
+ * + * @return column statistics + */ + ColumnStatistics build() { + Statistics stat = Statistics.getStatsBasedOnType(primitiveType); + Statistics convertedStat = stat; + + TypeProtos.MajorType type = ParquetReaderUtility.getType(primitiveType, originalType, scale, precision); + stat.setNumNulls(numNulls); + + if (min != null && max != null) { + switch (type.getMinorType()) { + case INT : + case TIME: + ((IntStatistics) stat).setMinMax(Integer.parseInt(min.toString()), Integer.parseInt(max.toString())); + break; + case BIGINT: + case TIMESTAMP: + ((LongStatistics) stat).setMinMax(Long.parseLong(min.toString()), Long.parseLong(max.toString())); + break; + case FLOAT4: + ((FloatStatistics) stat).setMinMax(Float.parseFloat(min.toString()), Float.parseFloat(max.toString())); + break; + case FLOAT8: + ((DoubleStatistics) stat).setMinMax(Double.parseDouble(min.toString()), Double.parseDouble(max.toString())); + break; + case DATE: + convertedStat = new LongStatistics(); + convertedStat.setNumNulls(stat.getNumNulls()); + long minMS = convertToDrillDateValue(Integer.parseInt(min.toString())); + long maxMS = convertToDrillDateValue(Integer.parseInt(max.toString())); + ((LongStatistics) convertedStat ).setMinMax(minMS, maxMS); + break; + case BIT: + ((BooleanStatistics) stat).setMinMax(Boolean.parseBoolean(min.toString()), Boolean.parseBoolean(max.toString())); + break; + case VARCHAR: + if (min instanceof Binary && max instanceof Binary) { // when read directly from parquet footer + ((BinaryStatistics) stat).setMinMaxFromBytes(((Binary) min).getBytes(), ((Binary) max).getBytes()); + } else if (min instanceof byte[] && max instanceof byte[]) { // when deserialized from Drill metadata file + ((BinaryStatistics) stat).setMinMaxFromBytes((byte[]) min, (byte[]) max); + } + break; + case VARDECIMAL: + byte[] minBytes = null; + byte[] maxBytes = null; + boolean setLength = false; + + switch (primitiveType) { + case INT32: + case INT64: + minBytes = new 
BigInteger(min.toString()).toByteArray(); + maxBytes = new BigInteger(max.toString()).toByteArray(); + break; + case FIXED_LEN_BYTE_ARRAY: + setLength = true; + // fall through + case BINARY: + // wrap up into BigInteger to avoid PARQUET-1417 + if (min instanceof Binary && max instanceof Binary) { // when read directly from parquet footer + minBytes = new BigInteger(((Binary) min).getBytes()).toByteArray(); + maxBytes = new BigInteger(((Binary) max).getBytes()).toByteArray(); + } else if (min instanceof byte[] && max instanceof byte[]) { // when deserialized from Drill metadata file + minBytes = new BigInteger((byte[]) min).toByteArray(); + maxBytes = new BigInteger((byte[]) max).toByteArray(); + } + break; Review comment: `break` and `default` here may be removed. ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services