Repository: spark
Updated Branches:
  refs/heads/master 3a5962f0f -> d1d0ee41c


[SPARK-3103] [PySpark] fix saveAsTextFile() with utf-8

bugfix: saveAsTextFile() raised an exception when it tried to encode
non-ASCII byte strings as "utf-8". It should only encode unicode objects as
"utf-8"; strings that are already byte strings are now written out unchanged.

Author: Davies Liu <davies....@gmail.com>

Closes #2018 from davies/fix_utf8 and squashes the following commits:

4db7967 [Davies Liu] fix saveAsTextFile() with utf-8


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d1d0ee41
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d1d0ee41
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d1d0ee41

Branch: refs/heads/master
Commit: d1d0ee41c27f1d07fed0c5d56ba26c723cc3dc26
Parents: 3a5962f
Author: Davies Liu <davies....@gmail.com>
Authored: Mon Aug 18 13:58:35 2014 -0700
Committer: Josh Rosen <joshro...@apache.org>
Committed: Mon Aug 18 13:58:35 2014 -0700

----------------------------------------------------------------------
 python/pyspark/rdd.py   | 4 +++-
 python/pyspark/tests.py | 9 +++++++++
 2 files changed, 12 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/d1d0ee41/python/pyspark/rdd.py
----------------------------------------------------------------------
diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py
index 240381e..c708b69 100644
--- a/python/pyspark/rdd.py
+++ b/python/pyspark/rdd.py
@@ -1191,7 +1191,9 @@ class RDD(object):
             for x in iterator:
                 if not isinstance(x, basestring):
                     x = unicode(x)
-                yield x.encode("utf-8")
+                if isinstance(x, unicode):
+                    x = x.encode("utf-8")
+                yield x
         keyed = self.mapPartitionsWithIndex(func)
         keyed._bypass_serializer = True
         keyed._jrdd.map(self.ctx._jvm.BytesToString()).saveAsTextFile(path)

http://git-wip-us.apache.org/repos/asf/spark/blob/d1d0ee41/python/pyspark/tests.py
----------------------------------------------------------------------
diff --git a/python/pyspark/tests.py b/python/pyspark/tests.py
index f1fece9..69d543d 100644
--- a/python/pyspark/tests.py
+++ b/python/pyspark/tests.py
@@ -256,6 +256,15 @@ class TestRDDFunctions(PySparkTestCase):
         raw_contents = ''.join(input(glob(tempFile.name + "/part-0000*")))
         self.assertEqual(x, unicode(raw_contents.strip(), "utf-8"))
 
+    def test_save_as_textfile_with_utf8(self):
+        x = u"\u00A1Hola, mundo!"
+        data = self.sc.parallelize([x.encode("utf-8")])
+        tempFile = tempfile.NamedTemporaryFile(delete=True)
+        tempFile.close()
+        data.saveAsTextFile(tempFile.name)
+        raw_contents = ''.join(input(glob(tempFile.name + "/part-0000*")))
+        self.assertEqual(x, unicode(raw_contents.strip(), "utf-8"))
+
     def test_transforming_cartesian_result(self):
         # Regression test for SPARK-1034
         rdd1 = self.sc.parallelize([1, 2])


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

Reply via email to