Repository: spark
Updated Branches:
  refs/heads/master f07e71406 -> b80a030e9

[SPARK-6623][SQL] Alias DataFrame.na.drop and DataFrame.na.fill in Python.

To maintain consistency with the Scala API.

Author: Reynold Xin <r...@databricks.com>

Closes #5284 from rxin/df-na-alias and squashes the following commits:

19f46b7 [Reynold Xin] Show DataFrameNaFunctions in docs.
6618118 [Reynold Xin] [SPARK-6623][SQL] Alias DataFrame.na.drop and DataFrame.na.fill in Python.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b80a030e
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b80a030e
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b80a030e

Branch: refs/heads/master
Commit: b80a030e90d790e27e89b26f536565c582dbf3d5
Parents: f07e714
Author: Reynold Xin <r...@databricks.com>
Authored: Tue Mar 31 00:25:23 2015 -0700
Committer: Reynold Xin <r...@databricks.com>
Committed: Tue Mar 31 00:25:23 2015 -0700

----------------------------------------------------------------------
 python/pyspark/sql/__init__.py  | 10 +++++----
 python/pyspark/sql/dataframe.py | 41 ++++++++++++++++++++++++++++++++++--
 2 files changed, 45 insertions(+), 6 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/b80a030e/python/pyspark/sql/__init__.py
----------------------------------------------------------------------
diff --git a/python/pyspark/sql/__init__.py b/python/pyspark/sql/__init__.py
index 54a0163..9d39e5d 100644
--- a/python/pyspark/sql/__init__.py
+++ b/python/pyspark/sql/__init__.py
@@ -22,22 +22,24 @@ public classes of Spark SQL:
       Main entry point for :class:`DataFrame` and SQL functionality.
     - L{DataFrame}
       A distributed collection of data grouped into named columns.
-    - L{GroupedData}
-      Aggregation methods, returned by :func:`DataFrame.groupBy`.
     - L{Column}
       A column expression in a :class:`DataFrame`.
     - L{Row}
       A row of data in a :class:`DataFrame`.
     - L{HiveContext}
       Main entry point for accessing data stored in Apache Hive.
+    - L{GroupedData}
+      Aggregation methods, returned by :func:`DataFrame.groupBy`.
+    - L{DataFrameNaFunctions}
+      Methods for handling missing data (null values).
     - L{functions}
       List of built-in functions available for :class:`DataFrame`.
 """

 from pyspark.sql.context import SQLContext, HiveContext
 from pyspark.sql.types import Row
-from pyspark.sql.dataframe import DataFrame, GroupedData, Column, SchemaRDD
+from pyspark.sql.dataframe import DataFrame, GroupedData, Column, SchemaRDD, DataFrameNaFunctions

 __all__ = [
-    'SQLContext', 'HiveContext', 'DataFrame', 'GroupedData', 'Column', 'Row',
+    'SQLContext', 'HiveContext', 'DataFrame', 'GroupedData', 'Column', 'Row', 'DataFrameNaFunctions'
 ]


http://git-wip-us.apache.org/repos/asf/spark/blob/b80a030e/python/pyspark/sql/dataframe.py
----------------------------------------------------------------------
diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index 4f174de..1550802 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -31,7 +31,7 @@ from pyspark.sql.types import *
 from pyspark.sql.types import _create_cls, _parse_datatype_json_string


-__all__ = ["DataFrame", "GroupedData", "Column", "SchemaRDD"]
+__all__ = ["DataFrame", "GroupedData", "Column", "SchemaRDD", "DataFrameNaFunctions"]


 class DataFrame(object):
@@ -86,6 +86,12 @@ class DataFrame(object):

         return self._lazy_rdd

+    @property
+    def na(self):
+        """Returns a :class:`DataFrameNaFunctions` for handling missing values.
+ """ + return DataFrameNaFunctions(self) + def toJSON(self, use_unicode=False): """Convert a :class:`DataFrame` into a MappedRDD of JSON documents; one document per row. @@ -693,6 +699,8 @@ class DataFrame(object): def dropna(self, how='any', thresh=None, subset=None): """Returns a new :class:`DataFrame` omitting rows with null values. + This is an alias for `na.drop`. + :param how: 'any' or 'all'. If 'any', drop a row if it contains any nulls. If 'all', drop a row only if all its values are null. @@ -704,6 +712,10 @@ class DataFrame(object): >>> df4.dropna().show() age height name 10 80 Alice + + >>> df4.na.drop().show() + age height name + 10 80 Alice """ if how is not None and how not in ['any', 'all']: raise ValueError("how ('" + how + "') should be 'any' or 'all'") @@ -723,7 +735,7 @@ class DataFrame(object): return DataFrame(self._jdf.na().drop(thresh, cols), self.sql_ctx) def fillna(self, value, subset=None): - """Replace null values. + """Replace null values, alias for `na.fill`. :param value: int, long, float, string, or dict. Value to replace null values with. @@ -748,6 +760,13 @@ class DataFrame(object): 5 null Bob 50 null Tom 50 null unknown + + >>> df4.na.fill({'age': 50, 'name': 'unknown'}).show() + age height name + 10 80 Alice + 5 null Bob + 50 null Tom + 50 null unknown """ if not isinstance(value, (float, int, long, basestring, dict)): raise ValueError("value should be a float, int, long, string, or dict") @@ -1134,6 +1153,24 @@ class Column(object): return 'Column<%s>' % self._jc.toString().encode('utf8') +class DataFrameNaFunctions(object): + """Functionality for working with missing data in :class:`DataFrame`. + """ + + def __init__(self, df): + self.df = df + + def drop(self, how='any', thresh=None, subset=None): + return self.df.dropna(how=how, thresh=thresh, subset=subset) + + drop.__doc__ = DataFrame.dropna.__doc__ + + def fill(self, value, subset=None): + return self.df.fillna(value=value, subset=subset) + + fill.__doc__ = DataFrame.fillna.__doc__ + + def _test(): import doctest from pyspark.context import SparkContext --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org