c21 commented on a change in pull request #31958: URL: https://github.com/apache/spark/pull/31958#discussion_r603014404
########## File path: sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala ########## @@ -838,6 +838,13 @@ object SQLConf { .intConf .createWithDefault(4096) + val ORC_VECTORIZED_READER_NESTED_COLUMN_ENABLED = + buildConf("spark.sql.orc.enableNestedColumnVectorizedReader") + .doc("Enables vectorized orc decoding for nested column.") + .version("3.2.0") + .booleanConf + .createWithDefault(true) Review comment: @dongjoon-hyun - makes sense to me. Updated. For all reviewers, https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/136587/testReport shows the unit tests passing with the nested column vectorized reader enabled by default. ########## File path: sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala ########## @@ -131,11 +131,27 @@ class OrcFileFormat } } + private def supportBatchForNestedColumn( + sparkSession: SparkSession, + schema: StructType): Boolean = { + val hasNestedColumn = schema.map(_.dataType).exists { + case _: ArrayType | _: MapType | _: StructType => true + case _ => false + } + if (hasNestedColumn) { + sparkSession.sessionState.conf.orcVectorizedReaderNestedColumnEnabled + } else { + true + } + } + override def supportBatch(sparkSession: SparkSession, schema: StructType): Boolean = { val conf = sparkSession.sessionState.conf conf.orcVectorizedReaderEnabled && conf.wholeStageEnabled && schema.length <= conf.wholeStageMaxNumFields && - schema.forall(_.dataType.isInstanceOf[AtomicType]) + schema.forall(s => supportDataType(s.dataType) && + !s.dataType.isInstanceOf[UserDefinedType[_]]) && + supportBatchForNestedColumn(sparkSession, schema) Review comment: @dongjoon-hyun - do you mean implementing the Parquet vectorized reader for nested columns? I created https://issues.apache.org/jira/browse/SPARK-34863 and plan to work on it after this one, thanks. 
########## File path: project/MimaExcludes.scala ########## @@ -417,6 +417,21 @@ object MimaExcludes { case _ => true }, + // [SPARK-34862][SQL] Support nested column in ORC vectorized reader + ProblemFilters.exclude[DirectAbstractMethodProblem]("org.apache.spark.sql.vectorized.ColumnVector.getBoolean"), + ProblemFilters.exclude[DirectAbstractMethodProblem]("org.apache.spark.sql.vectorized.ColumnVector.getByte"), + ProblemFilters.exclude[DirectAbstractMethodProblem]("org.apache.spark.sql.vectorized.ColumnVector.getShort"), + ProblemFilters.exclude[DirectAbstractMethodProblem]("org.apache.spark.sql.vectorized.ColumnVector.getInt"), + ProblemFilters.exclude[DirectAbstractMethodProblem]("org.apache.spark.sql.vectorized.ColumnVector.getLong"), + ProblemFilters.exclude[DirectAbstractMethodProblem]("org.apache.spark.sql.vectorized.ColumnVector.getFloat"), + ProblemFilters.exclude[DirectAbstractMethodProblem]("org.apache.spark.sql.vectorized.ColumnVector.getDouble"), + ProblemFilters.exclude[DirectAbstractMethodProblem]("org.apache.spark.sql.vectorized.ColumnVector.getDecimal"), + ProblemFilters.exclude[DirectAbstractMethodProblem]("org.apache.spark.sql.vectorized.ColumnVector.getUTF8String"), + ProblemFilters.exclude[DirectAbstractMethodProblem]("org.apache.spark.sql.vectorized.ColumnVector.getBinary"), + ProblemFilters.exclude[DirectAbstractMethodProblem]("org.apache.spark.sql.vectorized.ColumnVector.getArray"), + ProblemFilters.exclude[DirectAbstractMethodProblem]("org.apache.spark.sql.vectorized.ColumnVector.getMap"), + ProblemFilters.exclude[DirectAbstractMethodProblem]("org.apache.spark.sql.vectorized.ColumnVector.getChild"), Review comment: @dongjoon-hyun - updated, thanks. Sorry I was not looking at this file very closely. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. 
For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org