Github user davies commented on a diff in the pull request:

    https://github.com/apache/spark/pull/1351#discussion_r17200139
  
    --- Diff: python/pyspark/sql.py ---
    @@ -187,6 +187,56 @@ def func(split, iterator):
             jschema_rdd = self._ssql_ctx.jsonRDD(jrdd.rdd())
             return SchemaRDD(jschema_rdd, self)
     
    +    def csvFile(self, path, delimiter = ",", quote = "\"", header = False):
    +        """
    +        Loads a CSV file (according to RFC 4180) and returns the result as 
a L{SchemaRDD}.
    +        header flag specified if first line of each file should be treated 
as header.
    +
    +        NOTE: If there are new line characters inside quoted fields this 
method may fail to
    +        parse correctly, because the two lines may be in different 
partitions. Use
    +        L{SQLContext#csvRDD} to parse such files.
    +
    +        >>> import tempfile, shutil
    +        >>> csvFile = tempfile.mkdtemp()
    +        >>> shutil.rmtree(csvFile)
    +        >>> ofn = open(csvFile, 'w')
    +        >>> for csvStr in csvStrings:
    +        ...   print>>ofn, csvStr
    +        >>> ofn.close()
    +        >>> csv = sqlCtx.csvFile(csvFile, delimiter = ", ", header = True)
    +        >>> sqlCtx.registerRDDAsTable(csv, "csvTable")
    +        >>> csvRes = sqlCtx.sql("SELECT Year FROM csvTable WHERE Make = 
'Ford'")
    +        >>> csvRes.collect()
    +        [{u'Year': u'1997'}]
    +        """
    +        jschema_rdd = self._ssql_ctx.csvFile(path, delimiter, quote, 
header)
    +        return SchemaRDD(jschema_rdd, self)
    +
    +    def csvRDD(self, rdd, delimiter = ",", quote = "\"", header = False):
    +        """
    +        Parses an RDD of String as a CSV (according to RFC 4180) and 
returns the result as a
    +        L{SchemaRDD}.
    +
    +        NOTE: If there are new line characters inside quoted fields, use 
wholeTextFile to
    +        read each file into a single partition.
    --- End diff --
    
    PS: wholeTextFile() will read a file as a single string; maybe it's not 
what you want.


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastruct...@apache.org or file a JIRA ticket
with INFRA.
---

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org

Reply via email to