This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new ded1a74 [SPARK-28365][ML] Fallback locale to en_US in StopWordsRemover if system default locale isn't in available locales in JVM ded1a74 is described below commit ded1a7495b443f4735057eb5520f31df5b9860d2 Author: Liang-Chi Hsieh <vii...@gmail.com> AuthorDate: Fri Jul 26 12:13:10 2019 +0900 [SPARK-28365][ML] Fallback locale to en_US in StopWordsRemover if system default locale isn't in available locales in JVM ## What changes were proposed in this pull request? Because the local default locale isn't among the available locales in `Locale`, when I ran some tests locally with Python code, the `StopWordsRemover`-related Python test hit errors like: ``` Traceback (most recent call last): File "/spark-1/python/pyspark/ml/tests/test_feature.py", line 87, in test_stopwordsremover stopWordRemover = StopWordsRemover(inputCol="input", outputCol="output") File "/spark-1/python/pyspark/__init__.py", line 111, in wrapper return func(self, **kwargs) File "/spark-1/python/pyspark/ml/feature.py", line 2646, in __init__ self.uid) File "/spark-1/python/pyspark/ml/wrapper.py", line 67, in _new_java_obj return java_obj(*java_args) File "/spark-1/python/lib/py4j-0.10.8.1-src.zip/py4j/java_gateway.py", line 1554, in __call__ answer, self._gateway_client, None, self._fqn) File "/spark-1/python/pyspark/sql/utils.py", line 93, in deco raise converted pyspark.sql.utils.IllegalArgumentException: 'StopWordsRemover_4598673ee802 parameter locale given invalid value en_TW.' ``` As per HyukjinKwon's advice, instead of setting up the locale just to pass the test, it is better to fall back to a workable locale if the system default locale can't be found among the available locales in the JVM. Otherwise, users have to manually change the system locale or access the private property `_jvm` in PySpark. ## How was this patch tested? Added a test and tested manually. 
``` scala> val remover = new StopWordsRemover().setInputCol("raw").setOutputCol("filtered") 19/07/14 19:20:03 WARN StopWordsRemover: Default locale set was [en_TW]; however, it was not found in available locales in JVM, falling back to en_US locale. Set param `locale` in order to respect another locale. ``` Closes #25133 from viirya/pytest-default-locale. Authored-by: Liang-Chi Hsieh <vii...@gmail.com> Signed-off-by: HyukjinKwon <gurwls...@apache.org> --- .../apache/spark/ml/feature/StopWordsRemover.scala | 20 ++++++++++++++++++-- .../spark/ml/feature/StopWordsRemoverSuite.scala | 17 +++++++++++++++++ 2 files changed, 35 insertions(+), 2 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala index 6669d40..f95e03a 100755 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala @@ -89,7 +89,8 @@ class StopWordsRemover @Since("1.5.0") (@Since("1.5.0") override val uid: String /** * Locale of the input for case insensitive matching. Ignored when [[caseSensitive]] * is true. - * Default: Locale.getDefault.toString + * Default: the string of default locale (`Locale.getDefault`), or `Locale.US` if default locale + * is not in available locales in JVM. * @group param */ @Since("2.4.0") @@ -105,8 +106,23 @@ class StopWordsRemover @Since("1.5.0") (@Since("1.5.0") override val uid: String @Since("2.4.0") def getLocale: String = $(locale) + /** + * Returns system default locale, or `Locale.US` if the default locale is not in available locales + * in JVM. + */ + private val getDefaultOrUS: Locale = { + if (Locale.getAvailableLocales.contains(Locale.getDefault)) { + Locale.getDefault + } else { + logWarning(s"Default locale set was [${Locale.getDefault.toString}]; however, it was " + + "not found in available locales in JVM, falling back to en_US locale. 
Set param `locale` " + + "in order to respect another locale.") + Locale.US + } + } + setDefault(stopWords -> StopWordsRemover.loadDefaultStopWords("english"), - caseSensitive -> false, locale -> Locale.getDefault.toString) + caseSensitive -> false, locale -> getDefaultOrUS.toString) @Since("2.0.0") override def transform(dataset: Dataset[_]): DataFrame = { diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala index 20972d1..6d0b83e 100755 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala @@ -17,6 +17,8 @@ package org.apache.spark.ml.feature +import java.util.Locale + import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest} import org.apache.spark.sql.{DataFrame, Row} @@ -200,4 +202,19 @@ class StopWordsRemoverSuite extends MLTest with DefaultReadWriteTest { s"requirement failed: Column $outputCol already exists.", "expected") } + + test("SPARK-28365: Fallback to en_US if default locale isn't in available locales") { + val oldDefault = Locale.getDefault() + try { + val dummyLocale = Locale.forLanguageTag("test") + Locale.setDefault(dummyLocale) + + val remover = new StopWordsRemover() + .setInputCol("raw") + .setOutputCol("filtered") + assert(remover.getLocale == "en_US") + } finally { + Locale.setDefault(oldDefault) + } + } } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org