Github user davies commented on a diff in the pull request: https://github.com/apache/spark/pull/1351#discussion_r17200250 --- Diff: python/pyspark/sql.py --- @@ -187,6 +187,56 @@ def func(split, iterator): jschema_rdd = self._ssql_ctx.jsonRDD(jrdd.rdd()) return SchemaRDD(jschema_rdd, self) + def csvFile(self, path, delimiter = ",", quote = "\"", header = False): + """ + Loads a CSV file (according to RFC 4180) and returns the result as a L{SchemaRDD}. + header flag specified if first line of each file should be treated as header. + + NOTE: If there are new line characters inside quoted fields this method may fail to + parse correctly, because the two lines may be in different partitions. Use + L{SQLContext#csvRDD} to parse such files. + + >>> import tempfile, shutil + >>> csvFile = tempfile.mkdtemp() + >>> shutil.rmtree(csvFile) + >>> ofn = open(csvFile, 'w') + >>> for csvStr in csvStrings: + ... print>>ofn, csvStr + >>> ofn.close() + >>> csv = sqlCtx.csvFile(csvFile, delimiter = ", ", header = True) + >>> sqlCtx.registerRDDAsTable(csv, "csvTable") + >>> csvRes = sqlCtx.sql("SELECT Year FROM csvTable WHERE Make = 'Ford'") + >>> csvRes.collect() + [{u'Year': u'1997'}] + """ + jschema_rdd = self._ssql_ctx.csvFile(path, delimiter, quote, header) + return SchemaRDD(jschema_rdd, self) + + def csvRDD(self, rdd, delimiter = ",", quote = "\"", header = False): + """ + Parses an RDD of String as a CSV (according to RFC 4180) and returns the result as a + L{SchemaRDD}. + + NOTE: If there are new line characters inside quoted fields, use wholeTextFile to + read each file into a single partition. 
+ + >>> csvrdd = sqlCtx.csvRDD(csv, delimiter = ", ", header = True) + >>> sqlCtx.registerRDDAsTable(csvrdd, "csvTable2") + >>> csvRes = sqlCtx.sql("SELECT count(*) FROM csvTable2") + >>> csvRes.collect() == [{"c0": 3}] + True + """ + def func(split, iterator): + for x in iterator: + if not isinstance(x, basestring): + x = unicode(x) + yield x.encode("utf-8") --- End diff -- `x` could already be a `str` (bytes encoded in UTF-8), so `.encode("utf-8")` should only be applied when `x` is `unicode`; see how saveAsTextFile() handles this.
--- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org