HyukjinKwon commented on a change in pull request #33706: URL: https://github.com/apache/spark/pull/33706#discussion_r687388095
########## File path: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JsonInferSchema.scala ########## @@ -68,21 +73,27 @@ private[sql] class JsonInferSchema(options: JSONOptions) extends Serializable { Some(inferField(parser)) } } catch { - case e @ (_: RuntimeException | _: JsonProcessingException) => parseMode match { - case PermissiveMode => - Some(StructType(Seq(StructField(columnNameOfCorruptRecord, StringType)))) - case DropMalformedMode => - None - case FailFastMode => - throw QueryExecutionErrors.malformedRecordsDetectedInSchemaInferenceError(e) + case e @ (_: JsonProcessingException | _: CharConversionException) + if ignoreCorruptFiles => + logWarning("Skipped inferring json schema from the corrupted data: " + + s"${row.asInstanceOf[InternalRow].getString(0)}", e) + None + case e @ (_: RuntimeException | _: JsonProcessingException | + _: CharConversionException) => Review comment: @yaooqinn can we just don't respect `ignoreCorruptFiles` here for now, and match here with `JacksonParser`: https://github.com/apache/spark/blob/186815be1c0832e2f976faf6f2d5c31e42eebf1a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala#L495 https://github.com/apache/spark/blob/186815be1c0832e2f976faf6f2d5c31e42eebf1a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala#L490 meaning that here we will catch two more exceptions of `CharConversionException` (if encoding is not specified) and `MalformedInputException`. I think JSON datasource itself doesn't really support `ignoreCorruptFiles` (not only JSON schema inference). For example, if other exceptions happen or `encoding` is specified, I think the exception will be thrown. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org