Repository: spark
Updated Branches:
  refs/heads/branch-2.0 4148a9c2c -> 6871deb93

[SPARK-15280][Input/Output] Refactored OrcOutputWriter and moved serialization to a new class.

## What changes were proposed in this pull request?
Refactoring: separated the ORC serialization logic from OrcOutputWriter and moved it into a new class called OrcSerializer.

## How was this patch tested?
Manual tests and existing tests.

Author: Ergin Seyfe <ese...@fb.com>

Closes #13066 from seyfe/orc_serializer.

(cherry picked from commit c18fa464f404ed2612f8c4d355cb0544b355975b)
Signed-off-by: Yin Huai <yh...@databricks.com>

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6871deb9
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6871deb9
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6871deb9

Branch: refs/heads/branch-2.0
Commit: 6871deb937fd6d6185b1d2a7a2ea36535ce303ea
Parents: 4148a9c
Author: Ergin Seyfe <ese...@fb.com>
Authored: Sat May 21 16:08:31 2016 -0700
Committer: Yin Huai <yh...@databricks.com>
Committed: Sat May 21 16:08:51 2016 -0700

----------------------------------------------------------------------
 .../apache/spark/sql/hive/orc/OrcRelation.scala | 84 +++++++++++---------
 1 file changed, 45 insertions(+), 39 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/6871deb9/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala
----------------------------------------------------------------------
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala
index 6e55137..38f50c1 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala
@@ -149,39 +149,70 @@ private[sql] class DefaultSource
   }
 }
 
-private[orc] class OrcOutputWriter(
-    path: String,
-    bucketId: Option[Int],
-    dataSchema: StructType,
-    context: TaskAttemptContext)
-  extends OutputWriter with HiveInspectors {
+private[orc] class OrcSerializer(dataSchema: StructType, conf: Configuration)
+  extends HiveInspectors {
+
+  def serialize(row: InternalRow): Writable = {
+    wrapOrcStruct(cachedOrcStruct, structOI, row)
+    serializer.serialize(cachedOrcStruct, structOI)
+  }
 
-  private val serializer = {
+  private[this] val serializer = {
     val table = new Properties()
     table.setProperty("columns", dataSchema.fieldNames.mkString(","))
     table.setProperty("columns.types", dataSchema.map(_.dataType.catalogString).mkString(":"))
 
     val serde = new OrcSerde
-    val configuration = context.getConfiguration
-    serde.initialize(configuration, table)
+    serde.initialize(conf, table)
     serde
   }
 
-  // Object inspector converted from the schema of the relation to be written.
-  private val structOI = {
+  // Object inspector converted from the schema of the relation to be serialized.
+  private[this] val structOI = {
     val typeInfo = TypeInfoUtils.getTypeInfoFromTypeString(dataSchema.catalogString)
     OrcStruct.createObjectInspector(typeInfo.asInstanceOf[StructTypeInfo])
       .asInstanceOf[SettableStructObjectInspector]
   }
 
+  private[this] val cachedOrcStruct = structOI.create().asInstanceOf[OrcStruct]
+
+  private[this] def wrapOrcStruct(
+      struct: OrcStruct,
+      oi: SettableStructObjectInspector,
+      row: InternalRow): Unit = {
+    val fieldRefs = oi.getAllStructFieldRefs
+    var i = 0
+    while (i < fieldRefs.size) {
+
+      oi.setStructFieldData(
+        struct,
+        fieldRefs.get(i),
+        wrap(
+          row.get(i, dataSchema(i).dataType),
+          fieldRefs.get(i).getFieldObjectInspector,
+          dataSchema(i).dataType))
+      i += 1
+    }
+  }
+}
+
+private[orc] class OrcOutputWriter(
+    path: String,
+    bucketId: Option[Int],
+    dataSchema: StructType,
+    context: TaskAttemptContext)
+  extends OutputWriter {
+
+  private[this] val conf = context.getConfiguration
+
+  private[this] val serializer = new OrcSerializer(dataSchema, conf)
+
   // `OrcRecordWriter.close()` creates an empty file if no rows are written at all. We use this
   // flag to decide whether `OrcRecordWriter.close()` needs to be called.
   private var recordWriterInstantiated = false
 
   private lazy val recordWriter: RecordWriter[NullWritable, Writable] = {
     recordWriterInstantiated = true
-
-    val conf = context.getConfiguration
     val uniqueWriteJobId = conf.get("spark.sql.sources.writeJobUUID")
     val taskAttemptId = context.getTaskAttemptID
     val partition = taskAttemptId.getTaskID.getId
@@ -206,33 +237,8 @@ private[orc] class OrcOutputWriter(
   override def write(row: Row): Unit =
     throw new UnsupportedOperationException("call writeInternal")
 
-  private def wrapOrcStruct(
-      struct: OrcStruct,
-      oi: SettableStructObjectInspector,
-      row: InternalRow): Unit = {
-    val fieldRefs = oi.getAllStructFieldRefs
-    var i = 0
-    while (i < fieldRefs.size) {
-
-      oi.setStructFieldData(
-        struct,
-        fieldRefs.get(i),
-        wrap(
-          row.get(i, dataSchema(i).dataType),
-          fieldRefs.get(i).getFieldObjectInspector,
-          dataSchema(i).dataType))
-      i += 1
-    }
-  }
-
-  val cachedOrcStruct = structOI.create().asInstanceOf[OrcStruct]
-
   override protected[sql] def writeInternal(row: InternalRow): Unit = {
-    wrapOrcStruct(cachedOrcStruct, structOI, row)
-
-    recordWriter.write(
-      NullWritable.get(),
-      serializer.serialize(cachedOrcStruct, structOI))
+    recordWriter.write(NullWritable.get(), serializer.serialize(row))
  }
 
   override def close(): Unit = {
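
One consequence of the split is that row serialization no longer needs a TaskAttemptContext: OrcSerializer only takes the data schema and a Hadoop Configuration, while OrcOutputWriter keeps the RecordWriter lifecycle and delegates each row to serializer.serialize(row). The snippet below is a hypothetical, illustrative sketch of that standalone use; it is not part of this commit, and it assumes the code sits inside the org.apache.spark.sql.hive.orc package (OrcSerializer is private[orc]), for example in a test there.

import org.apache.hadoop.conf.Configuration

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType}
import org.apache.spark.unsafe.types.UTF8String

// Schema of the rows to serialize.
val schema = StructType(Seq(
  StructField("id", LongType),
  StructField("name", StringType)))

// Only a schema and a Hadoop Configuration are needed; no task context.
val serializer = new OrcSerializer(schema, new Configuration())

// Catalyst's internal row representation uses UTF8String for string columns.
val row = InternalRow(1L, UTF8String.fromString("alice"))

// Produces the Writable that OrcOutputWriter.writeInternal hands to its ORC RecordWriter.
val orcRow = serializer.serialize(row)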