Github user viirya commented on a diff in the pull request: https://github.com/apache/spark/pull/19571#discussion_r146751242 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala --- @@ -39,4 +45,33 @@ private[sql] object OrcFileFormat { schema.fieldNames.foreach(checkFieldName) schema } + + def getSchemaString(schema: StructType): String = { + schema.fields.map(f => s"${f.name}:${f.dataType.catalogString}").mkString("struct<", ",", ">") + } + + private def readSchema(file: Path, conf: ReaderOptions): Option[TypeDescription] = { + try { + val reader = OrcFile.createReader(file, conf) + val schema = reader.getSchema + if (schema.getFieldNames.size == 0) { + None + } else { + Some(schema) + } + } catch { + case _: IOException => None + } + } + + def readSchema(sparkSession: SparkSession, files: Seq[FileStatus]): Option[StructType] = { + val conf = sparkSession.sparkContext.hadoopConfiguration + val fs = FileSystem.get(conf) + val options = OrcFile.readerOptions(conf).filesystem(fs) + files.map(_.getPath).flatMap(readSchema(_, options)) + .headOption.map { schema => --- End diff -- It seems that you just take the first available schema. In that case, we don't need to keep reading the remaining files once we find the first available schema.
--- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org For additional commands, e-mail: reviews-help@spark.apache.org