Repository: spark Updated Branches: refs/heads/branch-1.4 228aabe24 -> 17def3957
[SPARK-8766] support non-ascii character in column names Use UTF-8 to encode the name of column in Python 2, or it may fail to encode with the default encoding ('ascii'). This PR also fixes a bug when there is a Java exception without an error message. Author: Davies Liu <dav...@databricks.com> Closes #7165 from davies/non_ascii and squashes the following commits: 02cb61a [Davies Liu] fix tests 3b09d31 [Davies Liu] add encoding in header 867754a [Davies Liu] support non-ascii character in column names (cherry picked from commit f958f27e2056f9e380373c2807d8bb5977ecf269) Signed-off-by: Davies Liu <dav...@databricks.com> Conflicts: python/pyspark/sql/utils.py Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/17def395 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/17def395 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/17def395 Branch: refs/heads/branch-1.4 Commit: 17def395798dfc3af962d34b9a0260fa8880fe7d Parents: 228aabe Author: Davies Liu <dav...@databricks.com> Authored: Wed Jul 1 16:43:18 2015 -0700 Committer: Davies Liu <dav...@databricks.com> Committed: Wed Jul 1 17:18:04 2015 -0700 ---------------------------------------------------------------------- python/pyspark/sql/dataframe.py | 3 +-- python/pyspark/sql/tests.py | 9 +++++++++ python/pyspark/sql/types.py | 2 ++ 3 files changed, 12 insertions(+), 2 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/17def395/python/pyspark/sql/dataframe.py ---------------------------------------------------------------------- diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index 2d8c595..e9dd00e 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -476,13 +476,12 @@ class DataFrame(object): return [(str(f.name), f.dataType.simpleString()) for f in self.schema.fields] @property - @ignore_unicode_prefix 
@since(1.3) def columns(self): """Returns all column names as a list. >>> df.columns - [u'age', u'name'] + ['age', 'name'] """ return [f.name for f in self.schema.fields] http://git-wip-us.apache.org/repos/asf/spark/blob/17def395/python/pyspark/sql/tests.py ---------------------------------------------------------------------- diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index f902776..27c2ad1 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -1,3 +1,4 @@ +# -*- encoding: utf-8 -*- # # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with @@ -583,6 +584,14 @@ class SQLTests(ReusedPySparkTestCase): self.assertRaises(IndexError, lambda: df["bad_key"]) self.assertRaises(TypeError, lambda: df[{}]) + def test_column_name_with_non_ascii(self): + df = self.sqlCtx.createDataFrame([(1,)], ["数量"]) + self.assertEqual(StructType([StructField("数量", LongType(), True)]), df.schema) + self.assertEqual("DataFrame[数量: bigint]", str(df)) + self.assertEqual([("数量", 'bigint')], df.dtypes) + self.assertEqual(1, df.select("数量").first()[0]) + self.assertEqual(1, df.select(df["数量"]).first()[0]) + def test_access_nested_types(self): df = self.sc.parallelize([Row(l=[1], r=Row(a=1, b="b"), d={"k": "v"})]).toDF() self.assertEqual(1, df.select(df.l[0]).first()[0]) http://git-wip-us.apache.org/repos/asf/spark/blob/17def395/python/pyspark/sql/types.py ---------------------------------------------------------------------- diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py index b6ec613..e4cb006 100644 --- a/python/pyspark/sql/types.py +++ b/python/pyspark/sql/types.py @@ -323,6 +323,8 @@ class StructField(DataType): False """ assert isinstance(dataType, DataType), "dataType should be DataType" + if not isinstance(name, str): + name = name.encode('utf-8') self.name = name self.dataType = dataType self.nullable = nullable 
--------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org