Repository: spark
Updated Branches:
  refs/heads/master b1310425b -> 4e14199ff
[MINOR][PYSPARK][DOC] Fix wrongly formatted examples in PySpark documentation

## What changes were proposed in this pull request?

This PR fixes wrongly formatted examples in PySpark documentation as below:

- **`SparkSession`**

  - **Before**

    ![2016-07-06 11 34 41](https://cloud.githubusercontent.com/assets/6477701/16605847/ae939526-436d-11e6-8ab8-6ad578362425.png)

  - **After**

    ![2016-07-06 11 33 56](https://cloud.githubusercontent.com/assets/6477701/16605845/ace9ee78-436d-11e6-8923-b76d4fc3e7c3.png)

- **`Builder`**

  - **Before**

    ![2016-07-06 11 34 44](https://cloud.githubusercontent.com/assets/6477701/16605844/aba60dbc-436d-11e6-990a-c87bc0281c6b.png)

  - **After**

    ![2016-07-06 1 26 37](https://cloud.githubusercontent.com/assets/6477701/16607562/586704c0-437d-11e6-9483-e0af93d8f74e.png)

This PR also fixes several similar instances across the documentation in the `sql` PySpark module.

## How was this patch tested?

N/A

Author: hyukjinkwon <gurwls...@gmail.com>

Closes #14063 from HyukjinKwon/minor-pyspark-builder.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4e14199f
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4e14199f
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4e14199f

Branch: refs/heads/master
Commit: 4e14199ff740ea186eb2cec2e5cf901b58c5f90e
Parents: b131042
Author: hyukjinkwon <gurwls...@gmail.com>
Authored: Wed Jul 6 10:45:51 2016 -0700
Committer: Reynold Xin <r...@databricks.com>
Committed: Wed Jul 6 10:45:51 2016 -0700

----------------------------------------------------------------------
 python/pyspark/mllib/clustering.py | 14 +++++++-------
 python/pyspark/sql/dataframe.py    |  8 ++++----
 python/pyspark/sql/functions.py    |  8 ++++----
 python/pyspark/sql/group.py        |  2 ++
 python/pyspark/sql/session.py      | 13 +++++++------
 python/pyspark/sql/types.py        |  4 ++--
 6 files changed, 26 insertions(+), 23 deletions(-)
----------------------------------------------------------------------
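
For context on the fix itself: inside a regular (non-raw) docstring a single trailing `\` is consumed by the string literal, so wrapped doctest lines are silently joined into one long line before doctest or Sphinx ever sees them. Writing `\\` leaves a real backslash in the rendered example, and each continued line then needs the `... ` prompt so doctest reads it as part of the statement rather than as expected output. Below is a minimal standalone sketch of the corrected style (plain Python, no Spark required; the `make_pairs` helper is invented purely for illustration):

```python
import doctest


def make_pairs():
    """Build a small list of tuples using the continuation style this patch applies.

    >>> pairs = [ \\
    ...     (1, 'a'), \\
    ...     (2, 'b')]
    >>> pairs
    [(1, 'a'), (2, 'b')]
    """
    return [(1, 'a'), (2, 'b')]


if __name__ == "__main__":
    # failed=0 means the continued statement parsed and ran as a single example.
    print(doctest.testmod())
```
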
http://git-wip-us.apache.org/repos/asf/spark/blob/4e14199f/python/pyspark/mllib/clustering.py
----------------------------------------------------------------------
diff --git a/python/pyspark/mllib/clustering.py b/python/pyspark/mllib/clustering.py
index 93a0b64..c38c543 100644
--- a/python/pyspark/mllib/clustering.py
+++ b/python/pyspark/mllib/clustering.py
@@ -571,14 +571,14 @@ class PowerIterationClusteringModel(JavaModelWrapper, JavaSaveable, JavaLoader):
 
     >>> import math
     >>> def genCircle(r, n):
-    ...   points = []
-    ...   for i in range(0, n):
-    ...     theta = 2.0 * math.pi * i / n
-    ...     points.append((r * math.cos(theta), r * math.sin(theta)))
-    ...   return points
+    ...     points = []
+    ...     for i in range(0, n):
+    ...         theta = 2.0 * math.pi * i / n
+    ...         points.append((r * math.cos(theta), r * math.sin(theta)))
+    ...     return points
     >>> def sim(x, y):
-    ...   dist2 = (x[0] - y[0]) * (x[0] - y[0]) + (x[1] - y[1]) * (x[1] - y[1])
-    ...   return math.exp(-dist2 / 2.0)
+    ...     dist2 = (x[0] - y[0]) * (x[0] - y[0]) + (x[1] - y[1]) * (x[1] - y[1])
+    ...     return math.exp(-dist2 / 2.0)
     >>> r1 = 1.0
     >>> n1 = 10
     >>> r2 = 4.0

http://git-wip-us.apache.org/repos/asf/spark/blob/4e14199f/python/pyspark/sql/dataframe.py
----------------------------------------------------------------------
diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index e44b01b..a0ac7a9 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -1045,10 +1045,10 @@ class DataFrame(object):
         :func:`drop_duplicates` is an alias for :func:`dropDuplicates`.
 
         >>> from pyspark.sql import Row
-        >>> df = sc.parallelize([ \
-            Row(name='Alice', age=5, height=80), \
-            Row(name='Alice', age=5, height=80), \
-            Row(name='Alice', age=10, height=80)]).toDF()
+        >>> df = sc.parallelize([ \\
+        ...     Row(name='Alice', age=5, height=80), \\
+        ...     Row(name='Alice', age=5, height=80), \\
+        ...     Row(name='Alice', age=10, height=80)]).toDF()
         >>> df.dropDuplicates().show()
         +---+------+-----+
         |age|height| name|

http://git-wip-us.apache.org/repos/asf/spark/blob/4e14199f/python/pyspark/sql/functions.py
----------------------------------------------------------------------
diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index 7a73451..92d709e 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -1550,8 +1550,8 @@ def translate(srcCol, matching, replace):
     The translate will happen when any character in the string matching with the character
     in the `matching`.
 
-    >>> spark.createDataFrame([('translate',)], ['a']).select(translate('a', "rnlt", "123")\
-        .alias('r')).collect()
+    >>> spark.createDataFrame([('translate',)], ['a']).select(translate('a', "rnlt", "123") \\
+    ...     .alias('r')).collect()
     [Row(r=u'1a2s3ae')]
     """
     sc = SparkContext._active_spark_context
@@ -1670,8 +1670,8 @@ def get_json_object(col, path):
 
     >>> data = [("1", '''{"f1": "value1", "f2": "value2"}'''), ("2", '''{"f1": "value12"}''')]
     >>> df = spark.createDataFrame(data, ("key", "jstring"))
-    >>> df.select(df.key, get_json_object(df.jstring, '$.f1').alias("c0"), \
-                          get_json_object(df.jstring, '$.f2').alias("c1") ).collect()
+    >>> df.select(df.key, get_json_object(df.jstring, '$.f1').alias("c0"), \\
+    ...                   get_json_object(df.jstring, '$.f2').alias("c1") ).collect()
     [Row(key=u'1', c0=u'value1', c1=u'value2'), Row(key=u'2', c0=u'value12', c1=None)]
     """
     sc = SparkContext._active_spark_context
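
The two `functions.py` examples above chain `select(...).alias(...)` across lines; outside a docstring the same calls need no backslashes at all, because the still-open parenthesis already continues the statement. A rough standalone sketch of the same calls (assuming only a local PySpark installation; the session settings are illustrative, not part of this commit):

```python
from pyspark.sql import SparkSession
from pyspark.sql.functions import translate, get_json_object

# Local session purely for trying the examples; any existing session would do.
spark = SparkSession.builder.master("local[1]").appName("doc-example-check").getOrCreate()

# Same call as the translate() doctest; the docstring shows [Row(r=u'1a2s3ae')]
# (the u'...' prefix appears on Python 2 only).
df = spark.createDataFrame([('translate',)], ['a'])
print(df.select(translate('a', "rnlt", "123")
                .alias('r')).collect())

# Same call as the get_json_object() doctest.
data = [("1", '{"f1": "value1", "f2": "value2"}'), ("2", '{"f1": "value12"}')]
jdf = spark.createDataFrame(data, ("key", "jstring"))
print(jdf.select(jdf.key,
                 get_json_object(jdf.jstring, '$.f1').alias("c0"),
                 get_json_object(jdf.jstring, '$.f2').alias("c1")).collect())

spark.stop()
```
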
http://git-wip-us.apache.org/repos/asf/spark/blob/4e14199f/python/pyspark/sql/group.py
----------------------------------------------------------------------
diff --git a/python/pyspark/sql/group.py b/python/pyspark/sql/group.py
index a423206..f2092f9 100644
--- a/python/pyspark/sql/group.py
+++ b/python/pyspark/sql/group.py
@@ -179,10 +179,12 @@ class GroupedData(object):
         :param values: List of values that will be translated to columns in the output DataFrame.
 
         # Compute the sum of earnings for each year by course with each course as a separate column
+
         >>> df4.groupBy("year").pivot("course", ["dotNET", "Java"]).sum("earnings").collect()
         [Row(year=2012, dotNET=15000, Java=20000), Row(year=2013, dotNET=48000, Java=30000)]
 
         # Or without specifying column values (less efficient)
+
         >>> df4.groupBy("year").pivot("course").sum("earnings").collect()
         [Row(year=2012, Java=20000, dotNET=15000), Row(year=2013, Java=30000, dotNET=48000)]
         """

http://git-wip-us.apache.org/repos/asf/spark/blob/4e14199f/python/pyspark/sql/session.py
----------------------------------------------------------------------
diff --git a/python/pyspark/sql/session.py b/python/pyspark/sql/session.py
index 55f86a1..a360fbe 100644
--- a/python/pyspark/sql/session.py
+++ b/python/pyspark/sql/session.py
@@ -66,12 +66,11 @@ class SparkSession(object):
     tables, execute SQL over tables, cache tables, and read parquet files.
     To create a SparkSession, use the following builder pattern:
 
-    >>> spark = SparkSession.builder \
-        .master("local") \
-        .appName("Word Count") \
-        .config("spark.some.config.option", "some-value") \
-        .getOrCreate()
-
+    >>> spark = SparkSession.builder \\
+    ...     .master("local") \\
+    ...     .appName("Word Count") \\
+    ...     .config("spark.some.config.option", "some-value") \\
+    ...     .getOrCreate()
     """
 
     class Builder(object):
@@ -87,11 +86,13 @@ class SparkSession(object):
             both :class:`SparkConf` and :class:`SparkSession`'s own configuration.
 
             For an existing SparkConf, use `conf` parameter.
+
             >>> from pyspark.conf import SparkConf
             >>> SparkSession.builder.config(conf=SparkConf())
             <pyspark.sql.session...
 
             For a (key, value) pair, you can omit parameter names.
+
            >>> SparkSession.builder.config("spark.some.config.option", "some-value")
            <pyspark.sql.session...

http://git-wip-us.apache.org/repos/asf/spark/blob/4e14199f/python/pyspark/sql/types.py
----------------------------------------------------------------------
diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py
index a367987..eea8068 100644
--- a/python/pyspark/sql/types.py
+++ b/python/pyspark/sql/types.py
@@ -486,8 +486,8 @@ class StructType(DataType):
         DataType object.
 
         >>> struct1 = StructType().add("f1", StringType(), True).add("f2", StringType(), True, None)
-        >>> struct2 = StructType([StructField("f1", StringType(), True),\
-            StructField("f2", StringType(), True, None)])
+        >>> struct2 = StructType([StructField("f1", StringType(), True), \\
+        ...     StructField("f2", StringType(), True, None)])
         >>> struct1 == struct2
         True
         >>> struct1 = StructType().add(StructField("f1", StringType(), True))
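
The `types.py` hunk above only reflows the doctest; the two construction styles it shows remain equivalent. As a quick sanity check, a standalone sketch (assuming a local `pyspark` install; no running cluster is needed for type objects) comparing the incremental `add()` chain with the list-of-`StructField` constructor:

```python
from pyspark.sql.types import StructField, StringType, StructType

# Build the schema incrementally, as in the first doctest line.
struct1 = StructType().add("f1", StringType(), True).add("f2", StringType(), True, None)

# Build the same schema by passing the StructField list to the constructor,
# as in the reformatted second doctest line.
struct2 = StructType([StructField("f1", StringType(), True),
                      StructField("f2", StringType(), True, None)])

print(struct1 == struct2)  # True, matching the docstring's expected output
```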