[ https://issues.apache.org/jira/browse/SPARK-21227?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16076897#comment-16076897 ]
Seydou Dia edited comment on SPARK-21227 at 7/6/17 4:59 PM:
------------------------------------------------------------

Hi [~srowen], I had time to dig further into the issue and came up with a solution. As [~hyukjin.kwon] first pointed out, the problem is the use of the method {code:java}equalsIgnoreCase{code}: the Java implementation upper-cases the characters being compared, and both the dotted i (U+0069) and the dotless ı (U+0131) upper-case to 'I', so the two evaluate as equal. The solution I suggest is to switch to a {code:java}toLowerCase{code}-based comparison, under which the two characters stay distinct. Neither call takes an explicit Locale here. Please find below my diff, already tested on my local machine:

{code:diff}
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/package.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/package.scala
index 7731336..d6e1e9a 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/package.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/package.scala
@@ -33,7 +33,7 @@ package object analysis {
    */
   type Resolver = (String, String) => Boolean
 
-  val caseInsensitiveResolution = (a: String, b: String) => a.equalsIgnoreCase(b)
+  val caseInsensitiveResolution = (a: String, b: String) => a.toLowerCase() == b.toLowerCase()
   val caseSensitiveResolution = (a: String, b: String) => a == b
 
   implicit class AnalysisErrorAt(t: TreeNode[_]) {
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala
index 7311dc3..aae108c 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala
@@ -579,4 +579,23 @@ class AnalysisErrorSuite extends AnalysisTest {
     assertAnalysisError(plan5, "Accessing outer query column is not allowed in" :: Nil)
   }
 
+  test("SPARK-21227: i and dotless i should not be considered the same characters in column name.") {
+    // scalastyle:off nonascii
+    val relation = LocalRelation(
+      AttributeReference("city_name", StringType)(),
+      AttributeReference("c\u0131ty_name", StringType)()
+    )
+
+    val plan1 = relation.select($"city_name")
+    assertAnalysisSuccess(plan1, caseSensitive = false)
+    assertAnalysisSuccess(plan1, caseSensitive = true)
+
+    val plan2 = relation.select($"c\u0131ty_name")
+    assertAnalysisSuccess(plan2, caseSensitive = false)
+    assertAnalysisSuccess(plan2, caseSensitive = true)
+    // scalastyle:on nonascii
+  }
 }
{code}
Ref: http://grepcode.com/file/repository.grepcode.com/java/root/jdk/openjdk/6-b14/java/lang/String.java#String.equalsIgnoreCase%28java.lang.String%29
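As a quick illustration of the character behaviour behind the fix, here is a minimal Scala sketch (not part of the patch; runnable in any plain Scala REPL):

{code:scala}
// 'i' is U+0069; 'ı' (LATIN SMALL LETTER DOTLESS I) is U+0131.
val dotted  = "city_name"
val dotless = "c\u0131ty_name"

// equalsIgnoreCase upper-cases each pair of characters before comparing,
// and both 'i' and '\u0131' upper-case to 'I', so the two names collide:
println(dotted.equalsIgnoreCase(dotless))           // true

// Lower-casing leaves both 'i' and '\u0131' unchanged, so the comparison
// distinguishes the two columns as expected:
println(dotted.toLowerCase == dotless.toLowerCase)  // false
{code}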
> Unicode in Json field causes AnalysisException when selecting from Dataframe
> ----------------------------------------------------------------------------
>
>                 Key: SPARK-21227
>                 URL: https://issues.apache.org/jira/browse/SPARK-21227
>             Project: Spark
>          Issue Type: Bug
>          Components: PySpark
>    Affects Versions: 2.1.0
>            Reporter: Seydou Dia
>
> Hi,
> please find below the steps to reproduce the issue I am facing.
> First I create a JSON with 2 fields:
> * city_name
> * cıty_name
> The first one is valid ASCII, while the second contains a Unicode character (ı, i without dot).
> When I try to select from the dataframe I get an {noformat}AnalysisException{noformat}.
> {code:python}
> $ pyspark
> Python 3.4.3 (default, Sep  1 2016, 23:33:38)
> [GCC 4.8.3 20140911 (Red Hat 4.8.3-9)] on linux
> Type "help", "copyright", "credits" or "license" for more information.
> Setting default log level to "WARN".
> To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
> 17/06/27 12:29:05 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
> 17/06/27 12:29:05 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
> 17/06/27 12:29:08 WARN Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.
> Welcome to
>       ____              __
>      / __/__  ___ _____/ /__
>     _\ \/ _ \/ _ `/ __/ '_/
>    /__ / .__/\_,_/_/ /_/\_\   version 2.1.0
>       /_/
>
> Using Python version 3.4.3 (default, Sep  1 2016 23:33:38)
> SparkSession available as 'spark'.
> >>> sc=spark.sparkContext
> >>> js = ['{"city_name": "paris"}'
> ... , '{"city_name": "rome"}'
> ... , '{"city_name": "berlin"}'
> ... , '{"cıty_name": "new-york"}'
> ... , '{"cıty_name": "toronto"}'
> ... , '{"cıty_name": "chicago"}'
> ... , '{"cıty_name": "dubai"}']
> >>> myRDD = sc.parallelize(js)
> >>> myDF = spark.read.json(myRDD)
> >>> myDF.printSchema()
> >>>
> root
>  |-- city_name: string (nullable = true)
>  |-- cıty_name: string (nullable = true)
> >>> myDF.select(myDF['city_name'])
> Traceback (most recent call last):
>   File "/usr/lib/spark/python/pyspark/sql/utils.py", line 63, in deco
>     return f(*a, **kw)
>   File "/usr/lib/spark/python/lib/py4j-0.10.4-src.zip/py4j/protocol.py", line 319, in get_return_value
> py4j.protocol.Py4JJavaError: An error occurred while calling o226.apply.
> : org.apache.spark.sql.AnalysisException: Reference 'city_name' is ambiguous, could be: city_name#29, city_name#30.;
> 	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.resolve(LogicalPlan.scala:264)
> 	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.resolveQuoted(LogicalPlan.scala:168)
> 	at org.apache.spark.sql.Dataset.resolve(Dataset.scala:217)
> 	at org.apache.spark.sql.Dataset.col(Dataset.scala:1073)
> 	at org.apache.spark.sql.Dataset.apply(Dataset.scala:1059)
> 	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
> 	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
> 	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
> 	at java.lang.reflect.Method.invoke(Method.java:498)
> 	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
> 	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
> 	at py4j.Gateway.invoke(Gateway.java:280)
> 	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
> 	at py4j.commands.CallCommand.execute(CallCommand.java:79)
> 	at py4j.GatewayConnection.run(GatewayConnection.java:214)
> 	at java.lang.Thread.run(Thread.java:745)
> During handling of the above exception, another exception occurred:
> Traceback (most recent call last):
>   File "<stdin>", line 1, in <module>
>   File "/usr/lib/spark/python/pyspark/sql/dataframe.py", line 943, in __getitem__
>     jc = self._jdf.apply(item)
>   File "/usr/lib/spark/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 1133, in __call__
>   File "/usr/lib/spark/python/pyspark/sql/utils.py", line 69, in deco
>     raise AnalysisException(s.split(': ', 1)[1], stackTrace)
> pyspark.sql.utils.AnalysisException: "Reference 'city_name' is ambiguous, could be: city_name#29, city_name#30.;"
> {code}