GitHub user viirya commented on a diff in the pull request:

    https://github.com/apache/spark/pull/19571#discussion_r146751242
  
    --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala ---
    @@ -39,4 +45,33 @@ private[sql] object OrcFileFormat {
         schema.fieldNames.foreach(checkFieldName)
         schema
       }
    +
    +  def getSchemaString(schema: StructType): String = {
    +    schema.fields.map(f => s"${f.name}:${f.dataType.catalogString}").mkString("struct<", ",", ">")
    +  }
    +
    +  private def readSchema(file: Path, conf: ReaderOptions): Option[TypeDescription] = {
    +    try {
    +      val reader = OrcFile.createReader(file, conf)
    +      val schema = reader.getSchema
    +      if (schema.getFieldNames.size == 0) {
    +        None
    +      } else {
    +        Some(schema)
    +      }
    +    } catch {
    +      case _: IOException => None
    +    }
    +  }
    +
    +  def readSchema(sparkSession: SparkSession, files: Seq[FileStatus]): Option[StructType] = {
    +    val conf = sparkSession.sparkContext.hadoopConfiguration
    +    val fs = FileSystem.get(conf)
    +    val options = OrcFile.readerOptions(conf).filesystem(fs)
    +    files.map(_.getPath).flatMap(readSchema(_, options))
    +      .headOption.map { schema =>
    --- End diff --
    
    It seems you just take the first available schema here. If so, we don't need to read the remaining files once the first available schema has been found; with a plain `Seq`, the `flatMap` is eager and opens every file before `headOption` picks one.
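    
    For illustration only (a sketch, not necessarily how this PR should do it): switching to an iterator would give the early exit, reusing the private `readSchema(file, options)` helper above. The trailing `.map { schema => ... }` that converts to `StructType` is truncated in the quoted diff and would attach here unchanged.
    
        // Sketch: an Iterator is lazy, so collectFirst stops opening
        // files as soon as the first non-empty schema is found.
        val conf = sparkSession.sparkContext.hadoopConfiguration
        val fs = FileSystem.get(conf)
        val options = OrcFile.readerOptions(conf).filesystem(fs)
        files.iterator
          .map(file => readSchema(file.getPath, options))  // Option[TypeDescription] per file
          .collectFirst { case Some(schema) => schema }    // short-circuits on the first hit
    
    A lazy view (`files.view`) would work as well; the point is just that the per-file reads stop after the first defined schema.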

