[GitHub] [spark] HeartSaVioR commented on a change in pull request #25670: [SPARK-28869][CORE] Roll over event log files

GitBox Wed, 25 Sep 2019 18:18:40 -0700

HeartSaVioR commented on a change in pull request #25670: [SPARK-28869][CORE] 
Roll over event log files
URL: https://github.com/apache/spark/pull/25670#discussion_r328400815


 ##########
 File path: 
core/src/main/scala/org/apache/spark/deploy/history/EventLogFileWriters.scala
 ##########
 @@ -0,0 +1,422 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.deploy.history
+
+import java.io._
+import java.net.URI
+import java.nio.charset.StandardCharsets
+
+import org.apache.commons.compress.utils.CountingOutputStream
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.fs.{FileStatus, FileSystem, FSDataOutputStream, Path}
+import org.apache.hadoop.fs.permission.FsPermission
+
+import org.apache.spark.SparkConf
+import org.apache.spark.deploy.SparkHadoopUtil
+import org.apache.spark.internal.Logging
+import org.apache.spark.internal.config._
+import org.apache.spark.io.CompressionCodec
+import org.apache.spark.util.Utils
+
+/**
+ * The base class of writer which will write event logs into file.
+ *
+ * The following configurable parameters are available to tune the behavior of 
writing:
+ *   spark.eventLog.compress - Whether to compress logged events
+ *   spark.eventLog.compression.codec - The codec to compress logged events
+ *   spark.eventLog.overwrite - Whether to overwrite any existing files
+ *   spark.eventLog.buffer.kb - Buffer size to use when writing to output 
streams
+ *
+ * Note that descendant classes can maintain its own parameters: refer the 
javadoc of each class
+ * for more details.
+ *
+ * NOTE: CountingOutputStream being returned by "initLogFile" counts 
"non-compressed" bytes.
+ */
+abstract class EventLogFileWriter(
+    appId: String,
+    appAttemptId : Option[String],
+    logBaseDir: URI,
+    sparkConf: SparkConf,
+    hadoopConf: Configuration) extends Logging {
+
+  protected val shouldCompress = sparkConf.get(EVENT_LOG_COMPRESS)
+  protected val shouldOverwrite = sparkConf.get(EVENT_LOG_OVERWRITE)
+  protected val shouldAllowECLogs = sparkConf.get(EVENT_LOG_ALLOW_EC)
+  protected val outputBufferSize = 
sparkConf.get(EVENT_LOG_OUTPUT_BUFFER_SIZE).toInt
+  protected val fileSystem = Utils.getHadoopFileSystem(logBaseDir, hadoopConf)
+  protected val compressionCodec =
+    if (shouldCompress) {
+      Some(CompressionCodec.createCodec(sparkConf, 
sparkConf.get(EVENT_LOG_COMPRESSION_CODEC)))
+    } else {
+      None
+    }
+
+  private[history] val compressionCodecName = compressionCodec.map { c =>
+    CompressionCodec.getShortName(c.getClass.getName)
+  }
+
+  // Only defined if the file system scheme is not local
+  protected var hadoopDataStream: Option[FSDataOutputStream] = None
+  protected var writer: Option[PrintWriter] = None
+
+  protected def requireLogBaseDirAsDirectory(): Unit = {
+    if (!fileSystem.getFileStatus(new Path(logBaseDir)).isDirectory) {
+      throw new IllegalArgumentException(s"Log directory $logBaseDir is not a 
directory.")
+    }
+  }
+
+  protected def initLogFile(path: Path, fnSetupWriter: OutputStream => 
PrintWriter): Unit = {
+    if (shouldOverwrite && fileSystem.delete(path, true)) {
+      logWarning(s"Event log $path already exists. Overwriting...")
+    }
+
+    val defaultFs = FileSystem.getDefaultUri(hadoopConf).getScheme
+    val isDefaultLocal = defaultFs == null || defaultFs == "file"
+    val uri = path.toUri
+
+    /* The Hadoop LocalFileSystem (r1.0.4) has known issues with syncing 
(HADOOP-7844).
+     * Therefore, for local files, use FileOutputStream instead. */
+    val dstream =
+      if ((isDefaultLocal && uri.getScheme == null) || uri.getScheme == 
"file") {
+        new FileOutputStream(uri.getPath)
+      } else {
+        hadoopDataStream = Some(
+          SparkHadoopUtil.createFile(fileSystem, path, 
sparkConf.get(EVENT_LOG_ALLOW_EC)))
+        hadoopDataStream.get
+      }
+
+    try {
+      val cstream = 
compressionCodec.map(_.compressedOutputStream(dstream)).getOrElse(dstream)
+      val bstream = new BufferedOutputStream(cstream, outputBufferSize)
+      fileSystem.setPermission(path, EventLogFileWriter.LOG_FILE_PERMISSIONS)
+      logInfo(s"Logging events to $path")
+      writer = Some(fnSetupWriter(bstream))
+    } catch {
+      case e: Exception =>
+        dstream.close()
+        throw e
+    }
+  }
+
+  protected def writeJson(json: String, flushLogger: Boolean = false): Unit = {
+    // scalastyle:off println
+    writer.foreach(_.println(json))
+    // scalastyle:on println
+    if (flushLogger) {
+      writer.foreach(_.flush())
+      hadoopDataStream.foreach(_.hflush())
+    }
+  }
+
+  protected def closeWriter(): Unit = {
+    writer.foreach(_.close())
+  }
+
+  protected def renameFile(src: Path, dest: Path, overwrite: Boolean): Unit = {
+    if (fileSystem.exists(dest)) {
+      if (overwrite) {
+        logWarning(s"Event log $dest already exists. Overwriting...")
+        if (!fileSystem.delete(dest, true)) {
+          logWarning(s"Error deleting $dest")
+        }
+      } else {
+        throw new IOException(s"Target log file already exists ($dest)")
+      }
+    }
+    fileSystem.rename(src, dest)
+    // touch file to ensure modtime is current across those filesystems where 
rename()
+    // does not set it, -and which support setTimes(); it's a no-op on most 
object stores
 
 Review comment:
   Will fix it. I guess the comment was already there and I'm unsure I made a 
mistake.

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org

[GitHub] [spark] HeartSaVioR commented on a change in pull request #25670: [SPARK-28869][CORE] Roll over event log files

Reply via email to