GitHub user HyukjinKwon commented on a diff in the pull request: https://github.com/apache/spark/pull/19339#discussion_r141231147 --- Diff: python/pyspark/sql/readwriter.py --- @@ -420,7 +425,29 @@ def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=Non columnNameOfCorruptRecord=columnNameOfCorruptRecord, multiLine=multiLine) if isinstance(path, basestring): path = [path] - return self._df(self._jreader.csv(self._spark._sc._jvm.PythonUtils.toSeq(path))) + if type(path) == list: + return self._df(self._jreader.csv(self._spark._sc._jvm.PythonUtils.toSeq(path))) + elif isinstance(path, RDD): + def func(iterator): + for x in iterator: + if not isinstance(x, basestring): + x = unicode(x) + if isinstance(x, unicode): + x = x.encode("utf-8") + yield x + keyed = path.mapPartitions(func) + keyed._bypass_serializer = True + jrdd = keyed._jrdd.map(self._spark._jvm.BytesToString()) + # see SPARK-22112 + # There aren't any jvm api for creating a dataframe from rdd storing csv. --- End diff -- Let's rewrite these comments along the lines of ``` SPARK-22112: There aren't any jvm api for creating a dataframe from rdd storing csv. ... ``` or ``` There aren't any jvm api ... ... for creating a dataframe from dataset storing csv. See SPARK-22112. ``` whenever we happen to fix code around here, or review other PRs that fix code around here, in the future.
--- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org