Github user squito commented on a diff in the pull request:

    https://github.com/apache/spark/pull/22881#discussion_r229172448

    --- Diff: core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala ---
    @@ -471,4 +473,42 @@ object SparkHadoopUtil {
           hadoopConf.set(key.substring("spark.hadoop.".length), value)
         }
       }
    +
    +
    +  lazy val builderReflection: Option[(Class[_], Method, Method)] = Try {
    +    val cls = Utils.classForName(
    +      "org.apache.hadoop.hdfs.DistributedFileSystem$HdfsDataOutputStreamBuilder")
    +    (cls, cls.getMethod("replicate"), cls.getMethod("build"))
    +  }.toOption
    +
    +  // scalastyle:off line.size.limit
    +  /**
    +   * Create a file at the given path that uses replication instead of erasure coding,
    +   * regardless of the default HDFS configuration for that path. This can be helpful since
    +   * HDFS erasure coding doesn't support hflush(), hsync(), or append():
    +   * https://hadoop.apache.org/docs/r3.0.0/hadoop-project-dist/hadoop-hdfs/HDFSErasureCoding.html#Limitations
    +   */
    +  // scalastyle:on line.size.limit
    +  def createNonECFile(fs: FileSystem, path: Path): FSDataOutputStream = {
    +    try {
    +      // Use reflection, as this uses APIs only available in Hadoop 3
    +      val builderMethod = fs.getClass().getMethod("createFile", classOf[Path])
    +      val builder = builderMethod.invoke(fs, path)
    +      builderReflection match {
    --- End diff --

Good point on the reflection -- I was trying something else in earlier experiments and didn't clean it up. On poking into `DistributedFileSystem`: @xiao-chen had similar concerns, but also said there didn't seem to be another option and that it looked like an oversight in the HDFS API. @steveloughran, maybe you have thoughts here as well?
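For readers without the full file in front of them, here is a minimal sketch of how the truncated `builderReflection match {` could be completed. This is an illustration, not the PR's actual code: the `isAssignableFrom` guard, the fallback to `fs.create(path)`, the `NoSuchMethodException` handling, and the method name `createNonECFileSketch` are all my assumptions; only `builderReflection` is taken from the diff above.

```scala
import org.apache.hadoop.fs.{FSDataOutputStream, FileSystem, Path}

// Hypothetical completion of createNonECFile (a sketch, not necessarily what
// the PR does), assuming builderReflection as defined in the diff above.
def createNonECFileSketch(fs: FileSystem, path: Path): FSDataOutputStream = {
  try {
    // createFile(Path) returns a stream builder on newer Hadoop clients;
    // look it up reflectively so this still compiles against older Hadoop.
    val builderMethod = fs.getClass.getMethod("createFile", classOf[Path])
    val builder = builderMethod.invoke(fs, path)
    builderReflection match {
      // Only call replicate() if the runtime builder really is HDFS's
      // HdfsDataOutputStreamBuilder; other filesystems don't have it.
      case Some((cls, replicate, build)) if cls.isAssignableFrom(builder.getClass) =>
        // Reflective equivalent of builder.replicate().build()
        build.invoke(replicate.invoke(builder)).asInstanceOf[FSDataOutputStream]
      case _ =>
        fs.create(path)
    }
  } catch {
    // createFile isn't available on older Hadoop clients: fall back to a
    // plain create(), which on such clusters can't produce an EC file anyway.
    case _: NoSuchMethodException => fs.create(path)
  }
}
```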