Repository: spark
Updated Branches:
  refs/heads/master 4f28291f8 -> 81da3bee6
[SPARK-5865][API DOC] Add doc warnings for methods that return local data structures

rxin srowen
I worked out the note message for the rdd.take function; please help to review.
If it looks fine, I can apply it to all the other functions later.

Author: Tommy YU <tumm...@163.com>

Closes #10874 from Wenpei/spark-5865-add-warning-for-localdatastructure.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/81da3bee
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/81da3bee
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/81da3bee

Branch: refs/heads/master
Commit: 81da3bee669aaeb79ec68baaf7c99bff6e5d14fe
Parents: 4f28291
Author: Tommy YU <tumm...@163.com>
Authored: Sat Feb 6 17:29:09 2016 +0000
Committer: Sean Owen <so...@cloudera.com>
Committed: Sat Feb 6 17:29:09 2016 +0000

----------------------------------------------------------------------
 .../org/apache/spark/api/java/JavaPairRDD.scala |  3 +++
 .../org/apache/spark/api/java/JavaRDDLike.scala | 24 ++++++++++++++++++++
 .../org/apache/spark/rdd/PairRDDFunctions.scala |  3 +++
 .../main/scala/org/apache/spark/rdd/RDD.scala   | 15 ++++++++++++
 python/pyspark/rdd.py                           | 17 ++++++++++++++
 python/pyspark/sql/dataframe.py                 |  6 +++++
 .../scala/org/apache/spark/sql/DataFrame.scala  |  4 ++++
 7 files changed, 72 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/81da3bee/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala
----------------------------------------------------------------------
diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala b/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala
index fb04472..94d1035 100644
--- a/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala
+++ b/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala
@@ -636,6 +636,9 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)])

   /**
    * Return the key-value pairs in this RDD to the master as a Map.
+   *
+   * @note this method should only be used if the resulting data is expected to be small, as
+   * all the data is loaded into the driver's memory.
    */
   def collectAsMap(): java.util.Map[K, V] = mapAsSerializableJavaMap(rdd.collectAsMap())
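For reviewers, a minimal pyspark sketch of the behavior this note guards against
(illustrative only, not part of the patch; assumes a pyspark shell where `sc` is
already defined):

    # collectAsMap() materializes every pair as a dict in the driver's memory,
    # so it is only safe when the result is known to be small.
    pairs = sc.parallelize([(1, 2), (3, 4)])
    local_map = pairs.collectAsMap()
    print(local_map[1])                       # 2

    # For a large pair RDD, fetch individual keys on the cluster side instead:
    big = sc.parallelize(range(1000000)).map(lambda x: (x, x * x))
    print(big.lookup(42))                     # [1764] -- only matching values return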
http://git-wip-us.apache.org/repos/asf/spark/blob/81da3bee/core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala
----------------------------------------------------------------------
diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala b/core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala
index 7340def..37c211f 100644
--- a/core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala
+++ b/core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala
@@ -327,6 +327,9 @@ trait JavaRDDLike[T, This <: JavaRDDLike[T, This]] extends Serializable {

   /**
    * Return an array that contains all of the elements in this RDD.
+   *
+   * @note this method should only be used if the resulting array is expected to be small, as
+   * all the data is loaded into the driver's memory.
    */
   def collect(): JList[T] = rdd.collect().toSeq.asJava

@@ -465,6 +468,9 @@ trait JavaRDDLike[T, This <: JavaRDDLike[T, This]] extends Serializable {
    * Take the first num elements of the RDD. This currently scans the partitions *one by one*, so
    * it will be slow if a lot of partitions are required. In that case, use collect() to get the
    * whole RDD instead.
+   *
+   * @note this method should only be used if the resulting array is expected to be small, as
+   * all the data is loaded into the driver's memory.
    */
   def take(num: Int): JList[T] = rdd.take(num).toSeq.asJava

@@ -548,6 +554,9 @@ trait JavaRDDLike[T, This <: JavaRDDLike[T, This]] extends Serializable {
  /**
   * Returns the top k (largest) elements from this RDD as defined by
   * the specified Comparator[T] and maintains the order.
+  *
+  * @note this method should only be used if the resulting array is expected to be small, as
+  * all the data is loaded into the driver's memory.
   * @param num k, the number of top elements to return
   * @param comp the comparator that defines the order
   * @return an array of top elements
@@ -559,6 +568,9 @@ trait JavaRDDLike[T, This <: JavaRDDLike[T, This]] extends Serializable {
  /**
   * Returns the top k (largest) elements from this RDD using the
   * natural ordering for T and maintains the order.
+  *
+  * @note this method should only be used if the resulting array is expected to be small, as
+  * all the data is loaded into the driver's memory.
   * @param num k, the number of top elements to return
   * @return an array of top elements
   */
@@ -570,6 +582,9 @@ trait JavaRDDLike[T, This <: JavaRDDLike[T, This]] extends Serializable {
  /**
   * Returns the first k (smallest) elements from this RDD as defined by
   * the specified Comparator[T] and maintains the order.
+  *
+  * @note this method should only be used if the resulting array is expected to be small, as
+  * all the data is loaded into the driver's memory.
   * @param num k, the number of elements to return
   * @param comp the comparator that defines the order
   * @return an array of top elements
@@ -601,6 +616,9 @@ trait JavaRDDLike[T, This <: JavaRDDLike[T, This]] extends Serializable {
  /**
   * Returns the first k (smallest) elements from this RDD using the
   * natural ordering for T while maintain the order.
+  *
+  * @note this method should only be used if the resulting array is expected to be small, as
+  * all the data is loaded into the driver's memory.
   * @param num k, the number of top elements to return
   * @return an array of top elements
   */
@@ -634,6 +652,9 @@ trait JavaRDDLike[T, This <: JavaRDDLike[T, This]] extends Serializable {
  /**
   * The asynchronous version of `collect`, which returns a future for
   * retrieving an array containing all of the elements in this RDD.
+  *
+  * @note this method should only be used if the resulting array is expected to be small, as
+  * all the data is loaded into the driver's memory.
   */
  def collectAsync(): JavaFutureAction[JList[T]] = {
    new JavaFutureActionWrapper(rdd.collectAsync(), (x: Seq[T]) => x.asJava)
@@ -642,6 +663,9 @@ trait JavaRDDLike[T, This <: JavaRDDLike[T, This]] extends Serializable {
  /**
   * The asynchronous version of the `take` action, which returns a
   * future for retrieving the first `num` elements of this RDD.
+  *
+  * @note this method should only be used if the resulting array is expected to be small, as
+  * all the data is loaded into the driver's memory.
   */
  def takeAsync(num: Int): JavaFutureAction[JList[T]] = {
    new JavaFutureActionWrapper(rdd.takeAsync(num), (x: Seq[T]) => x.asJava)
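The same caveat, shown for take() versus collect() (again an illustrative pyspark
sketch with `sc` predefined; the Java methods above behave the same way):

    rdd = sc.parallelize(range(1000000))
    print(rdd.take(5))        # [0, 1, 2, 3, 4] -- only 5 elements reach the driver

    # rdd.collect() would build a million-element list in the driver's memory;
    # use it only when the RDD is known to be small.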
http://git-wip-us.apache.org/repos/asf/spark/blob/81da3bee/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala
----------------------------------------------------------------------
diff --git a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala
index 33f2f0b..61905a8 100644
--- a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala
@@ -726,6 +726,9 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)])
   *
   * Warning: this doesn't return a multimap (so if you have multiple values to the same key, only
   * one value per key is preserved in the map returned)
+  *
+  * @note this method should only be used if the resulting data is expected to be small, as
+  * all the data is loaded into the driver's memory.
   */
  def collectAsMap(): Map[K, V] = self.withScope {
    val data = self.collect()

http://git-wip-us.apache.org/repos/asf/spark/blob/81da3bee/core/src/main/scala/org/apache/spark/rdd/RDD.scala
----------------------------------------------------------------------
diff --git a/core/src/main/scala/org/apache/spark/rdd/RDD.scala b/core/src/main/scala/org/apache/spark/rdd/RDD.scala
index e8157cf..a81a98b 100644
--- a/core/src/main/scala/org/apache/spark/rdd/RDD.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/RDD.scala
@@ -481,6 +481,9 @@ abstract class RDD[T: ClassTag](
  /**
   * Return a fixed-size sampled subset of this RDD in an array
   *
+  * @note this method should only be used if the resulting array is expected to be small, as
+  * all the data is loaded into the driver's memory.
+  *
   * @param withReplacement whether sampling is done with replacement
   * @param num size of the returned sample
   * @param seed seed for the random number generator
@@ -836,6 +839,9 @@ abstract class RDD[T: ClassTag](

  /**
   * Return an array that contains all of the elements in this RDD.
+  *
+  * @note this method should only be used if the resulting array is expected to be small, as
+  * all the data is loaded into the driver's memory.
   */
  def collect(): Array[T] = withScope {
    val results = sc.runJob(this, (iter: Iterator[T]) => iter.toArray)
@@ -1202,6 +1208,9 @@ abstract class RDD[T: ClassTag](
   * results from that partition to estimate the number of additional partitions needed to satisfy
   * the limit.
   *
+  * @note this method should only be used if the resulting array is expected to be small, as
+  * all the data is loaded into the driver's memory.
+  *
   * @note due to complications in the internal implementation, this method will raise
   * an exception if called on an RDD of `Nothing` or `Null`.
   */
@@ -1263,6 +1272,9 @@ abstract class RDD[T: ClassTag](
   * // returns Array(6, 5)
   * }}}
   *
+  * @note this method should only be used if the resulting array is expected to be small, as
+  * all the data is loaded into the driver's memory.
+  *
   * @param num k, the number of top elements to return
   * @param ord the implicit ordering for T
   * @return an array of top elements
@@ -1283,6 +1295,9 @@ abstract class RDD[T: ClassTag](
   * // returns Array(2, 3)
   * }}}
   *
+  * @note this method should only be used if the resulting array is expected to be small, as
+  * all the data is loaded into the driver's memory.
+  *
   * @param num k, the number of elements to return
   * @param ord the implicit ordering for T
   * @return an array of top elements
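top() and takeOrdered() do the ordering distributedly and return only `num`
elements, so what lands in the driver stays bounded; a short illustrative sketch
(pyspark shell, `sc` predefined):

    rdd = sc.parallelize([10, 4, 2, 12, 3])
    print(rdd.top(2))                  # [12, 10]
    print(rdd.takeOrdered(2))          # [2, 3]

    # By contrast, sorting and then collecting ships the whole dataset back:
    # rdd.sortBy(lambda x: x).collect()   # safe only for small RDDs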
http://git-wip-us.apache.org/repos/asf/spark/blob/81da3bee/python/pyspark/rdd.py
----------------------------------------------------------------------
diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py
index c285946..fe2264a 100644
--- a/python/pyspark/rdd.py
+++ b/python/pyspark/rdd.py
@@ -426,6 +426,9 @@ class RDD(object):
         """
         Return a fixed-size sampled subset of this RDD.

+        Note that this method should only be used if the resulting array is expected
+        to be small, as all the data is loaded into the driver's memory.
+
         >>> rdd = sc.parallelize(range(0, 10))
         >>> len(rdd.takeSample(True, 20, 1))
         20
@@ -766,6 +769,8 @@ class RDD(object):
     def collect(self):
         """
         Return a list that contains all of the elements in this RDD.
+        Note that this method should only be used if the resulting array is expected
+        to be small, as all the data is loaded into the driver's memory.
         """
         with SCCallSiteSync(self.context) as css:
             port = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
@@ -1213,6 +1218,9 @@ class RDD(object):
         """
         Get the top N elements from a RDD.

+        Note that this method should only be used if the resulting array is expected
+        to be small, as all the data is loaded into the driver's memory.
+
         Note: It returns the list sorted in descending order.

         >>> sc.parallelize([10, 4, 2, 12, 3]).top(1)
@@ -1235,6 +1243,9 @@ class RDD(object):
         Get the N elements from a RDD ordered in ascending order or as
         specified by the optional key function.

+        Note that this method should only be used if the resulting array is expected
+        to be small, as all the data is loaded into the driver's memory.
+
         >>> sc.parallelize([10, 1, 2, 9, 3, 4, 5, 6, 7]).takeOrdered(6)
         [1, 2, 3, 4, 5, 6]
         >>> sc.parallelize([10, 1, 2, 9, 3, 4, 5, 6, 7], 2).takeOrdered(6, key=lambda x: -x)
@@ -1254,6 +1265,9 @@ class RDD(object):
         that partition to estimate the number of additional partitions needed
         to satisfy the limit.

+        Note that this method should only be used if the resulting array is expected
+        to be small, as all the data is loaded into the driver's memory.
+
         Translated from the Scala implementation in RDD#take().

         >>> sc.parallelize([2, 3, 4, 5, 6]).cache().take(2)
@@ -1511,6 +1525,9 @@ class RDD(object):
         """
         Return the key-value pairs in this RDD to the master as a dictionary.

+        Note that this method should only be used if the resulting data is expected
+        to be small, as all the data is loaded into the driver's memory.
+
         >>> m = sc.parallelize([(1, 2), (3, 4)]).collectAsMap()
         >>> m[1]
         2

http://git-wip-us.apache.org/repos/asf/spark/blob/81da3bee/python/pyspark/sql/dataframe.py
----------------------------------------------------------------------
diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index 90a6b5d..3a8c830 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -739,6 +739,9 @@ class DataFrame(object):
     def head(self, n=None):
         """Returns the first ``n`` rows.

+        Note that this method should only be used if the resulting array is expected
+        to be small, as all the data is loaded into the driver's memory.
+
         :param n: int, default 1. Number of rows to return.
         :return: If n is greater than 1, return a list of :class:`Row`.
             If n is 1, return a single Row.
@@ -1330,6 +1333,9 @@ class DataFrame(object):
     def toPandas(self):
         """Returns the contents of this :class:`DataFrame` as Pandas ``pandas.DataFrame``.

+        Note that this method should only be used if the resulting Pandas's DataFrame is expected
+        to be small, as all the data is loaded into the driver's memory.
+
         This is only available if Pandas is installed and available.

         >>> df.toPandas()  # doctest: +SKIP
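On the DataFrame side, the usual pattern is to bound the result explicitly before
bringing it to the driver; an illustrative sketch for the 1.6-era Python API
(assumes `sc` from a pyspark shell; pandas must be installed for toPandas()):

    from pyspark.sql import SQLContext

    sqlContext = SQLContext(sc)
    df = sqlContext.createDataFrame([(1, "a"), (2, "b"), (3, "c")], ["id", "val"])

    print(df.head(2))                    # first two Rows only

    # Cap the size before converting to a local pandas DataFrame:
    small_pdf = df.limit(2).toPandas()
    print(small_pdf)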
http://git-wip-us.apache.org/repos/asf/spark/blob/81da3bee/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
index f15b926..7aa08fb 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
@@ -1384,6 +1384,10 @@ class DataFrame private[sql](

  /**
   * Returns the first `n` rows.
+  *
+  * @note this method should only be used if the resulting array is expected to be small, as
+  * all the data is loaded into the driver's memory.
+  *
   * @group action
   * @since 1.3.0
   */

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org