Github user MaxGekk commented on a diff in the pull request:

    https://github.com/apache/spark/pull/21296#discussion_r187610192

    --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala ---
    @@ -73,11 +64,24 @@ class UnivocityParser(
       // Each input token is placed in each output row's position by mapping these. In this case,
       //
       //   output row - ["A", 2]
    -  private val valueConverters: Array[ValueConverter] =
    -    schema.map(f => makeConverter(f.name, f.dataType, f.nullable, options)).toArray
    +  private val valueConverters: Array[ValueConverter] = {
    +    requiredSchema.map(f => makeConverter(f.name, f.dataType, f.nullable, options)).toArray
    +  }

    -  private val tokenIndexArr: Array[Int] = {
    -    requiredSchema.map(f => schema.indexOf(f)).toArray
    +  private val tokenizer = {
    +    val parserSetting = options.asParserSettings
    +    if (requiredSchema.length < schema.length) {
    +      val tokenIndexArr = requiredSchema.map(f => java.lang.Integer.valueOf(schema.indexOf(f)))
    +      parserSetting.selectIndexes(tokenIndexArr: _*)
    --- End diff --

    I tried the changes as an experiment because some of our clients have many columns (more than 200) in their input CSV files. The experiment shows that these improvements can significantly reduce total execution time.
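    For readers unfamiliar with the Univocity API used in the diff: the speedup comes from `selectIndexes`, which tells the tokenizer to materialize only the requested columns instead of parsing every field and discarding the unneeded ones afterwards. Below is a minimal standalone sketch of that behavior, independent of Spark; the object name and sample input are invented for illustration, and only the univocity-parsers library is assumed.

        import com.univocity.parsers.csv.{CsvParser, CsvParserSettings}

        object SelectIndexesDemo {
          def main(args: Array[String]): Unit = {
            val settings = new CsvParserSettings()
            // Keep only columns 0 and 2; the parser avoids fully processing
            // the other fields. Column reordering is enabled by default, so
            // the output row contains just the selected columns, in order.
            val indexes = Seq(0, 2).map(i => java.lang.Integer.valueOf(i))
            settings.selectIndexes(indexes: _*)

            val parser = new CsvParser(settings)
            val tokens = parser.parseLine("a,b,c,d")
            println(tokens.mkString(","))  // prints: a,c
          }
        }

    With a wide schema but a narrow required schema (the >200-column case mentioned above), pruning at tokenization time avoids converting values that the query never reads.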