Github user HyukjinKwon commented on a diff in the pull request: https://github.com/apache/spark/pull/22676#discussion_r223593838 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala --- @@ -273,44 +274,47 @@ private[csv] object UnivocityParser { inputStream: InputStream, shouldDropHeader: Boolean, tokenizer: CsvParser): Iterator[Array[String]] = { - convertStream(inputStream, shouldDropHeader, tokenizer)(tokens => tokens) + val handleHeader: () => Unit = + () => if (shouldDropHeader) tokenizer.parseNext + + convertStream(inputStream, tokenizer, handleHeader)(tokens => tokens) } /** * Parses a stream that contains CSV strings and turns it into an iterator of rows. */ def parseStream( inputStream: InputStream, - shouldDropHeader: Boolean, parser: UnivocityParser, - schema: StructType, - checkHeader: Array[String] => Unit): Iterator[InternalRow] = { + headerChecker: CSVHeaderChecker, + schema: StructType): Iterator[InternalRow] = { val tokenizer = parser.tokenizer val safeParser = new FailureSafeParser[Array[String]]( input => Seq(parser.convert(input)), parser.options.parseMode, schema, parser.options.columnNameOfCorruptRecord, parser.options.multiLine) - convertStream(inputStream, shouldDropHeader, tokenizer, checkHeader) { tokens => + + val handleHeader: () => Unit = + () => headerChecker.checkHeaderColumnNames(tokenizer) --- End diff -- This matches the code structure of `parseStream` and `parseIterator`, which are used in multiLine mode and non-multiLine mode, respectively.
--- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org For additional commands, e-mail: reviews-help@spark.apache.org