GitHub user srowen commented on a diff in the pull request:

    https://github.com/apache/spark/pull/22975#discussion_r231977614

    --- Diff: mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala ---
    @@ -37,7 +39,7 @@ class Tokenizer @Since("1.4.0") (@Since("1.4.0") override val uid: String)
       override protected def createTransformFunc: String => Seq[String] = {
         // scalastyle:off caselocale
    -    _.toLowerCase.split("\\s")
    +    _.toLowerCase(Locale.ROOT).split("\\s")
    --- End diff --

    Same here, this needs to be locale-aware. It's modifying user data.
---
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org