spark git commit: [SPARK-4176] [SQL] [MINOR] Should use unscaled Long to write decimals for precision <= 18 rather than 8

lian Sat, 08 Aug 2015 03:11:45 -0700

Repository: spark
Updated Branches:
  refs/heads/branch-1.5 2ad75d99f -> 2cd96329f



[SPARK-4176] [SQL] [MINOR] Should use unscaled Long to write decimals for 
precision <= 18 rather than 8

This PR fixes a minor bug introduced in #7455: when writing decimals, we should 
use the unscaled Long for better performance when the precision <= 18 rather 
than 8 (should be a typo). This bug doesn't affect correctness, but hurts 
Parquet decimal writing performance.

This PR also replaced similar magic numbers with newly defined constants.

Author: Cheng Lian <l...@databricks.com>

Closes #8031 from liancheng/spark-4176/minor-fix-for-writing-decimals and 
squashes the following commits:

10d4ea3 [Cheng Lian] Should use unscaled Long to write decimals for precision 
<= 18 rather than 8

(cherry picked from commit 11caf1ce290b6931647c2f71268f847d1d48930e)
Signed-off-by: Cheng Lian <l...@databricks.com>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2cd96329
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2cd96329
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2cd96329

Branch: refs/heads/branch-1.5
Commit: 2cd96329f0214db8d20b3c56ad41a46a32036ae8
Parents: 2ad75d9
Author: Cheng Lian <l...@databricks.com>
Authored: Sat Aug 8 18:09:48 2015 +0800
Committer: Cheng Lian <l...@databricks.com>
Committed: Sat Aug 8 18:10:48 2015 +0800

----------------------------------------------------------------------
 .../sql/parquet/CatalystRowConverter.scala      |  2 +-
 .../sql/parquet/CatalystSchemaConverter.scala   | 29 ++++++++++++--------
 2 files changed, 18 insertions(+), 13 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/2cd96329/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystRowConverter.scala
----------------------------------------------------------------------
diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystRowConverter.scala
 
b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystRowConverter.scala
index 6938b07..4fe8a39 100644
--- 
a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystRowConverter.scala
+++ 
b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystRowConverter.scala
@@ -264,7 +264,7 @@ private[parquet] class CatalystRowConverter(
       val scale = decimalType.scale
       val bytes = value.getBytes
 
-      if (precision <= 8) {
+      if (precision <= CatalystSchemaConverter.MAX_PRECISION_FOR_INT64) {
         // Constructs a `Decimal` with an unscaled `Long` value if possible.
         var unscaled = 0L
         var i = 0

http://git-wip-us.apache.org/repos/asf/spark/blob/2cd96329/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystSchemaConverter.scala
----------------------------------------------------------------------
diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystSchemaConverter.scala
 
b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystSchemaConverter.scala
index d43ca95..b12149d 100644
--- 
a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystSchemaConverter.scala
+++ 
b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystSchemaConverter.scala
@@ -25,6 +25,7 @@ import 
org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName._
 import org.apache.parquet.schema.Type.Repetition._
 import org.apache.parquet.schema._
 
+import 
org.apache.spark.sql.parquet.CatalystSchemaConverter.{MAX_PRECISION_FOR_INT32, 
MAX_PRECISION_FOR_INT64, maxPrecisionForBytes}
 import org.apache.spark.sql.types._
 import org.apache.spark.sql.{AnalysisException, SQLConf}
 
@@ -155,7 +156,7 @@ private[parquet] class CatalystSchemaConverter(
           case INT_16 => ShortType
           case INT_32 | null => IntegerType
           case DATE => DateType
-          case DECIMAL => makeDecimalType(maxPrecisionForBytes(4))
+          case DECIMAL => makeDecimalType(MAX_PRECISION_FOR_INT32)
           case TIME_MILLIS => typeNotImplemented()
           case _ => illegalType()
         }
@@ -163,7 +164,7 @@ private[parquet] class CatalystSchemaConverter(
       case INT64 =>
         originalType match {
           case INT_64 | null => LongType
-          case DECIMAL => makeDecimalType(maxPrecisionForBytes(8))
+          case DECIMAL => makeDecimalType(MAX_PRECISION_FOR_INT64)
           case TIMESTAMP_MILLIS => typeNotImplemented()
           case _ => illegalType()
         }
@@ -405,7 +406,7 @@ private[parquet] class CatalystSchemaConverter(
 
       // Uses INT32 for 1 <= precision <= 9
       case DecimalType.Fixed(precision, scale)
-        if precision <= maxPrecisionForBytes(4) && followParquetFormatSpec =>
+          if precision <= MAX_PRECISION_FOR_INT32 && followParquetFormatSpec =>
         Types
           .primitive(INT32, repetition)
           .as(DECIMAL)
@@ -415,7 +416,7 @@ private[parquet] class CatalystSchemaConverter(
 
       // Uses INT64 for 1 <= precision <= 18
       case DecimalType.Fixed(precision, scale)
-        if precision <= maxPrecisionForBytes(8) && followParquetFormatSpec =>
+          if precision <= MAX_PRECISION_FOR_INT64 && followParquetFormatSpec =>
         Types
           .primitive(INT64, repetition)
           .as(DECIMAL)
@@ -534,14 +535,6 @@ private[parquet] class CatalystSchemaConverter(
         throw new AnalysisException(s"Unsupported data type $field.dataType")
     }
   }
-
-  // Max precision of a decimal value stored in `numBytes` bytes
-  private def maxPrecisionForBytes(numBytes: Int): Int = {
-    Math.round(                               // convert double to long
-      Math.floor(Math.log10(                  // number of base-10 digits
-        Math.pow(2, 8 * numBytes - 1) - 1)))  // max value stored in numBytes
-      .asInstanceOf[Int]
-  }
 }
 
 
@@ -584,4 +577,16 @@ private[parquet] object CatalystSchemaConverter {
       computeMinBytesForPrecision(precision)
     }
   }
+
+  val MAX_PRECISION_FOR_INT32 = maxPrecisionForBytes(4)
+
+  val MAX_PRECISION_FOR_INT64 = maxPrecisionForBytes(8)
+
+  // Max precision of a decimal value stored in `numBytes` bytes
+  def maxPrecisionForBytes(numBytes: Int): Int = {
+    Math.round(                               // convert double to long
+      Math.floor(Math.log10(                  // number of base-10 digits
+        Math.pow(2, 8 * numBytes - 1) - 1)))  // max value stored in numBytes
+      .asInstanceOf[Int]
+  }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

spark git commit: [SPARK-4176] [SQL] [MINOR] Should use unscaled Long to write decimals for precision <= 18 rather than 8

Reply via email to