[ https://issues.apache.org/jira/browse/SPARK-21227?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Seydou Dia updated SPARK-21227: ------------------------------- Description: Hi, please find below the step to reproduce the issue I am facing, {code:python} $ pyspark Python 3.4.3 (default, Sep 1 2016, 23:33:38) [GCC 4.8.3 20140911 (Red Hat 4.8.3-9)] on linux Type "help", "copyright", "credits" or "license" for more information. Setting default log level to "WARN". To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel). 17/06/27 12:29:05 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041. 17/06/27 12:29:05 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042. 17/06/27 12:29:08 WARN Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME. Welcome to ____ __ / __/__ ___ _____/ /__ _\ \/ _ \/ _ `/ __/ '_/ /__ / .__/\_,_/_/ /_/\_\ version 2.1.0 /_/ Using Python version 3.4.3 (default, Sep 1 2016 23:33:38) SparkSession available as 'spark'. >>> sc=spark.sparkContext >>> js = ['{"city_name": "paris"}' ... , '{"city_name": "rome"}' ... , '{"city_name": "berlin"}' ... , '{"cıty_name": "new-york"}' ... , '{"cıty_name": "toronto"}' ... , '{"cıty_name": "chicago"}' ... , '{"cıty_name": "dubai"}'] >>> myRDD = sc.parallelize(js) >>> myDF = spark.read.json(myRDD) >>> myDF.printSchema() root |-- city_name: string (nullable = true) |-- cıty_name: string (nullable = true) >>> myDF.select(myDF['city_name']) Traceback (most recent call last): File "/usr/lib/spark/python/pyspark/sql/utils.py", line 63, in deco return f(*a, **kw) File "/usr/lib/spark/python/lib/py4j-0.10.4-src.zip/py4j/protocol.py", line 319, in get_return_value py4j.protocol.Py4JJavaError: An error occurred while calling o226.apply. 
: org.apache.spark.sql.AnalysisException: Reference 'city_name' is ambiguous, could be: city_name#29, city_name#30.; at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.resolve(LogicalPlan.scala:264) at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.resolveQuoted(LogicalPlan.scala:168) at org.apache.spark.sql.Dataset.resolve(Dataset.scala:217) at org.apache.spark.sql.Dataset.col(Dataset.scala:1073) at org.apache.spark.sql.Dataset.apply(Dataset.scala:1059) at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) at java.lang.reflect.Method.invoke(Method.java:498) at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244) at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357) at py4j.Gateway.invoke(Gateway.java:280) at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132) at py4j.commands.CallCommand.execute(CallCommand.java:79) at py4j.GatewayConnection.run(GatewayConnection.java:214) at java.lang.Thread.run(Thread.java:745) During handling of the above exception, another exception occurred: Traceback (most recent call last): File "<stdin>", line 1, in <module> File "/usr/lib/spark/python/pyspark/sql/dataframe.py", line 943, in __getitem__ jc = self._jdf.apply(item) File "/usr/lib/spark/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 1133, in __call__ File "/usr/lib/spark/python/pyspark/sql/utils.py", line 69, in deco raise AnalysisException(s.split(': ', 1)[1], stackTrace) pyspark.sql.utils.AnalysisException: "Reference 'city_name' is ambiguous, could be: city_name#29, city_name#30.;" {code} was: Hi, please find below the step to reproduce the issue I am facing, $ pyspark {code:python} Python 3.4.3 (default, Sep 1 2016, 23:33:38) [GCC 4.8.3 20140911 (Red Hat 4.8.3-9)] on linux Type "help", "copyright", "credits" or "license" for 
more information. Setting default log level to "WARN". To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel). 17/06/27 12:29:05 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041. 17/06/27 12:29:05 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042. 17/06/27 12:29:08 WARN Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME. Welcome to ____ __ / __/__ ___ _____/ /__ _\ \/ _ \/ _ `/ __/ '_/ /__ / .__/\_,_/_/ /_/\_\ version 2.1.0 /_/ Using Python version 3.4.3 (default, Sep 1 2016 23:33:38) SparkSession available as 'spark'. >>> sc=spark.sparkContext >>> js = ['{"city_name": "paris"}' ... , '{"city_name": "rome"}' ... , '{"city_name": "berlin"}' ... , '{"cıty_name": "new-york"}' ... , '{"cıty_name": "toronto"}' ... , '{"cıty_name": "chicago"}' ... , '{"cıty_name": "dubai"}'] >>> myRDD = sc.parallelize(js) >>> myDF = spark.read.json(myRDD) >>> myDF.printSchema() root |-- city_name: string (nullable = true) |-- cıty_name: string (nullable = true) >>> myDF.select(myDF['city_name']) Traceback (most recent call last): File "/usr/lib/spark/python/pyspark/sql/utils.py", line 63, in deco return f(*a, **kw) File "/usr/lib/spark/python/lib/py4j-0.10.4-src.zip/py4j/protocol.py", line 319, in get_return_value py4j.protocol.Py4JJavaError: An error occurred while calling o226.apply. 
: org.apache.spark.sql.AnalysisException: Reference 'city_name' is ambiguous, could be: city_name#29, city_name#30.; at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.resolve(LogicalPlan.scala:264) at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.resolveQuoted(LogicalPlan.scala:168) at org.apache.spark.sql.Dataset.resolve(Dataset.scala:217) at org.apache.spark.sql.Dataset.col(Dataset.scala:1073) at org.apache.spark.sql.Dataset.apply(Dataset.scala:1059) at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) at java.lang.reflect.Method.invoke(Method.java:498) at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244) at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357) at py4j.Gateway.invoke(Gateway.java:280) at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132) at py4j.commands.CallCommand.execute(CallCommand.java:79) at py4j.GatewayConnection.run(GatewayConnection.java:214) at java.lang.Thread.run(Thread.java:745) During handling of the above exception, another exception occurred: Traceback (most recent call last): File "<stdin>", line 1, in <module> File "/usr/lib/spark/python/pyspark/sql/dataframe.py", line 943, in __getitem__ jc = self._jdf.apply(item) File "/usr/lib/spark/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 1133, in __call__ File "/usr/lib/spark/python/pyspark/sql/utils.py", line 69, in deco raise AnalysisException(s.split(': ', 1)[1], stackTrace) pyspark.sql.utils.AnalysisException: "Reference 'city_name' is ambiguous, could be: city_name#29, city_name#30.;" {code} > Unicode in Json field causes AnalysisException when selecting from Dataframe > ---------------------------------------------------------------------------- > > Key: SPARK-21227 > URL: https://issues.apache.org/jira/browse/SPARK-21227 > 
Project: Spark > Issue Type: Bug > Components: PySpark > Affects Versions: 2.1.0 > Reporter: Seydou Dia > > Hi, > please find below the step to reproduce the issue I am facing, > {code:python} > $ pyspark > Python 3.4.3 (default, Sep 1 2016, 23:33:38) > [GCC 4.8.3 20140911 (Red Hat 4.8.3-9)] on linux > Type "help", "copyright", "credits" or "license" for more information. > Setting default log level to "WARN". > To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use > setLogLevel(newLevel). > 17/06/27 12:29:05 WARN Utils: Service 'SparkUI' could not bind on port 4040. > Attempting port 4041. > 17/06/27 12:29:05 WARN Utils: Service 'SparkUI' could not bind on port 4041. > Attempting port 4042. > 17/06/27 12:29:08 WARN Client: Neither spark.yarn.jars nor spark.yarn.archive > is set, falling back to uploading libraries under SPARK_HOME. > Welcome to > ____ __ > / __/__ ___ _____/ /__ > _\ \/ _ \/ _ `/ __/ '_/ > /__ / .__/\_,_/_/ /_/\_\ version 2.1.0 > /_/ > Using Python version 3.4.3 (default, Sep 1 2016 23:33:38) > SparkSession available as 'spark'. > >>> sc=spark.sparkContext > >>> js = ['{"city_name": "paris"}' > ... , '{"city_name": "rome"}' > ... , '{"city_name": "berlin"}' > ... , '{"cıty_name": "new-york"}' > ... , '{"cıty_name": "toronto"}' > ... , '{"cıty_name": "chicago"}' > ... , '{"cıty_name": "dubai"}'] > >>> myRDD = sc.parallelize(js) > >>> myDF = spark.read.json(myRDD) > >>> myDF.printSchema() > >>> > root > |-- city_name: string (nullable = true) > |-- cıty_name: string (nullable = true) > >>> myDF.select(myDF['city_name']) > Traceback (most recent call last): > File "/usr/lib/spark/python/pyspark/sql/utils.py", line 63, in deco > return f(*a, **kw) > File "/usr/lib/spark/python/lib/py4j-0.10.4-src.zip/py4j/protocol.py", line > 319, in get_return_value > py4j.protocol.Py4JJavaError: An error occurred while calling o226.apply. 
> : org.apache.spark.sql.AnalysisException: Reference 'city_name' is ambiguous, > could be: city_name#29, city_name#30.; > at > org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.resolve(LogicalPlan.scala:264) > at > org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.resolveQuoted(LogicalPlan.scala:168) > at org.apache.spark.sql.Dataset.resolve(Dataset.scala:217) > at org.apache.spark.sql.Dataset.col(Dataset.scala:1073) > at org.apache.spark.sql.Dataset.apply(Dataset.scala:1059) > at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) > at > sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) > at > sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) > at java.lang.reflect.Method.invoke(Method.java:498) > at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244) > at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357) > at py4j.Gateway.invoke(Gateway.java:280) > at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132) > at py4j.commands.CallCommand.execute(CallCommand.java:79) > at py4j.GatewayConnection.run(GatewayConnection.java:214) > at java.lang.Thread.run(Thread.java:745) > During handling of the above exception, another exception occurred: > Traceback (most recent call last): > File "<stdin>", line 1, in <module> > File "/usr/lib/spark/python/pyspark/sql/dataframe.py", line 943, in > __getitem__ > jc = self._jdf.apply(item) > File "/usr/lib/spark/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py", > line 1133, in __call__ > File "/usr/lib/spark/python/pyspark/sql/utils.py", line 69, in deco > raise AnalysisException(s.split(': ', 1)[1], stackTrace) > pyspark.sql.utils.AnalysisException: "Reference 'city_name' is ambiguous, > could be: city_name#29, city_name#30.;" > {code} -- This message was sent by Atlassian JIRA (v6.4.14#64029) --------------------------------------------------------------------- To unsubscribe, e-mail: 
issues-unsubscribe@spark.apache.org For additional commands, e-mail: issues-help@spark.apache.org