Repository: spark
Updated Branches:
  refs/heads/master d009fb369 -> bccc33019
[SPARK-20196][PYTHON][SQL] update doc for catalog functions for all languages, add pyspark refreshByPath API

## What changes were proposed in this pull request?

Update doc to remove external for createTable, add refreshByPath in python

## How was this patch tested?

manual

Author: Felix Cheung <felixcheun...@hotmail.com>

Closes #17512 from felixcheung/catalogdoc.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/bccc3301
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/bccc3301
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/bccc3301

Branch: refs/heads/master
Commit: bccc330193217b2ec9660e06f1db6dd58f7af5d8
Parents: d009fb3
Author: Felix Cheung <felixcheun...@hotmail.com>
Authored: Thu Apr 6 09:09:43 2017 -0700
Committer: Felix Cheung <felixche...@apache.org>
Committed: Thu Apr 6 09:09:43 2017 -0700

----------------------------------------------------------------------
 R/pkg/R/SQLContext.R                            | 11 +++--
 R/pkg/R/catalog.R                               | 52 +++++++++++---------
 python/pyspark/sql/catalog.py                   | 27 +++++++---
 python/pyspark/sql/context.py                   |  2 +-
 .../org/apache/spark/sql/catalog/Catalog.scala  | 17 ++++---
 .../apache/spark/sql/internal/CatalogImpl.scala | 22 ++++++---
 6 files changed, 79 insertions(+), 52 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/bccc3301/R/pkg/R/SQLContext.R
----------------------------------------------------------------------
diff --git a/R/pkg/R/SQLContext.R b/R/pkg/R/SQLContext.R
index a1edef7..c2a1e24 100644
--- a/R/pkg/R/SQLContext.R
+++ b/R/pkg/R/SQLContext.R
@@ -544,12 +544,15 @@ sql <- function(x, ...) {
   dispatchFunc("sql(sqlQuery)", x, ...)
 }
 
-#' Create a SparkDataFrame from a SparkSQL Table
+#' Create a SparkDataFrame from a SparkSQL table or view
 #'
-#' Returns the specified Table as a SparkDataFrame. The Table must have already been registered
-#' in the SparkSession.
+#' Returns the specified table or view as a SparkDataFrame. The table or view must already exist or
+#' have already been registered in the SparkSession.
 #'
-#' @param tableName The SparkSQL Table to convert to a SparkDataFrame.
+#' @param tableName the qualified or unqualified name that designates a table or view. If a database
+#'                  is specified, it identifies the table/view from the database.
+#'                  Otherwise, it first attempts to find a temporary view with the given name
+#'                  and then match the table/view from the current database.
 #' @return SparkDataFrame
 #' @rdname tableToDF
 #' @name tableToDF

http://git-wip-us.apache.org/repos/asf/spark/blob/bccc3301/R/pkg/R/catalog.R
----------------------------------------------------------------------
diff --git a/R/pkg/R/catalog.R b/R/pkg/R/catalog.R
index 07a89f7..4b7f841 100644
--- a/R/pkg/R/catalog.R
+++ b/R/pkg/R/catalog.R
@@ -65,7 +65,8 @@ createExternalTable <- function(x, ...) {
 #'
 #' Caches the specified table in-memory.
 #'
-#' @param tableName The name of the table being cached
+#' @param tableName the qualified or unqualified name that designates a table. If no database
+#'                  identifier is provided, it refers to a table in the current database.
 #' @return SparkDataFrame
 #' @rdname cacheTable
 #' @export
@@ -94,7 +95,8 @@ cacheTable <- function(x, ...) {
 #'
 #' Removes the specified table from the in-memory cache.
 #'
-#' @param tableName The name of the table being uncached
+#' @param tableName the qualified or unqualified name that designates a table. If no database
+#'                  identifier is provided, it refers to a table in the current database.
 #' @return SparkDataFrame
 #' @rdname uncacheTable
 #' @export
@@ -162,6 +164,7 @@ clearCache <- function() {
 #' @method dropTempTable default
 #' @note dropTempTable since 1.4.0
 dropTempTable.default <- function(tableName) {
+  .Deprecated("dropTempView", old = "dropTempTable")
   if (class(tableName) != "character") {
     stop("tableName must be a string.")
   }
@@ -169,7 +172,6 @@ dropTempTable.default <- function(tableName) {
 }
 
 dropTempTable <- function(x, ...) {
-  .Deprecated("dropTempView")
   dispatchFunc("dropTempView(viewName)", x, ...)
 }
 
@@ -178,7 +180,7 @@ dropTempTable <- function(x, ...) {
 #' Drops the temporary view with the given view name in the catalog.
 #' If the view has been cached before, then it will also be uncached.
 #'
-#' @param viewName the name of the view to be dropped.
+#' @param viewName the name of the temporary view to be dropped.
 #' @return TRUE if the view is dropped successfully, FALSE otherwise.
 #' @rdname dropTempView
 #' @name dropTempView
@@ -317,10 +319,10 @@ listDatabases <- function() {
   dataFrame(callJMethod(callJMethod(catalog, "listDatabases"), "toDF"))
 }
 
-#' Returns a list of tables in the specified database
+#' Returns a list of tables or views in the specified database
 #'
-#' Returns a list of tables in the specified database.
-#' This includes all temporary tables.
+#' Returns a list of tables or views in the specified database.
+#' This includes all temporary views.
 #'
 #' @param databaseName (optional) name of the database
 #' @return a SparkDataFrame of the list of tables.
@@ -349,11 +351,13 @@ listTables <- function(databaseName = NULL) {
   dataFrame(callJMethod(jdst, "toDF"))
 }
 
-#' Returns a list of columns for the given table in the specified database
+#' Returns a list of columns for the given table/view in the specified database
 #'
-#' Returns a list of columns for the given table in the specified database.
+#' Returns a list of columns for the given table/view in the specified database.
 #'
-#' @param tableName a name of the table.
+#' @param tableName the qualified or unqualified name that designates a table/view. If no database
+#'                  identifier is provided, it refers to a table/view in the current database.
+#'                  If \code{databaseName} parameter is specified, this must be an unqualified name.
 #' @param databaseName (optional) name of the database
 #' @return a SparkDataFrame of the list of column descriptions.
 #' @rdname listColumns
@@ -409,12 +413,13 @@ listFunctions <- function(databaseName = NULL) {
   dataFrame(callJMethod(jdst, "toDF"))
 }
 
-#' Recover all the partitions in the directory of a table and update the catalog
+#' Recovers all the partitions in the directory of a table and update the catalog
 #'
-#' Recover all the partitions in the directory of a table and update the catalog. The name should
-#' reference a partitioned table, and not a temporary view.
+#' Recovers all the partitions in the directory of a table and update the catalog. The name should
+#' reference a partitioned table, and not a view.
 #'
-#' @param tableName a name of the table.
+#' @param tableName the qualified or unqualified name that designates a table. If no database
+#'                  identifier is provided, it refers to a table in the current database.
 #' @rdname recoverPartitions
 #' @name recoverPartitions
 #' @export
@@ -430,17 +435,18 @@ recoverPartitions <- function(tableName) {
   invisible(handledCallJMethod(catalog, "recoverPartitions", tableName))
 }
 
-#' Invalidate and refresh all the cached metadata of the given table
+#' Invalidates and refreshes all the cached data and metadata of the given table
 #'
-#' Invalidate and refresh all the cached metadata of the given table. For performance reasons,
-#' Spark SQL or the external data source library it uses might cache certain metadata about a
-#' table, such as the location of blocks. When those change outside of Spark SQL, users should
+#' Invalidates and refreshes all the cached data and metadata of the given table. For performance
+#' reasons, Spark SQL or the external data source library it uses might cache certain metadata about
+#' a table, such as the location of blocks. When those change outside of Spark SQL, users should
 #' call this function to invalidate the cache.
 #'
 #' If this table is cached as an InMemoryRelation, drop the original cached version and make the
 #' new version cached lazily.
 #'
-#' @param tableName a name of the table.
+#' @param tableName the qualified or unqualified name that designates a table. If no database
+#'                  identifier is provided, it refers to a table in the current database.
 #' @rdname refreshTable
 #' @name refreshTable
 #' @export
@@ -456,11 +462,11 @@ refreshTable <- function(tableName) {
   invisible(handledCallJMethod(catalog, "refreshTable", tableName))
 }
 
-#' Invalidate and refresh all the cached data and metadata for SparkDataFrame containing path
+#' Invalidates and refreshes all the cached data and metadata for SparkDataFrame containing path
 #'
-#' Invalidate and refresh all the cached data (and the associated metadata) for any SparkDataFrame
-#' that contains the given data source path. Path matching is by prefix, i.e. "/" would invalidate
-#' everything that is cached.
+#' Invalidates and refreshes all the cached data (and the associated metadata) for any
+#' SparkDataFrame that contains the given data source path. Path matching is by prefix, i.e. "/"
+#' would invalidate everything that is cached.
 #'
 #' @param path the path of the data source.
 #' @rdname refreshByPath

http://git-wip-us.apache.org/repos/asf/spark/blob/bccc3301/python/pyspark/sql/catalog.py
----------------------------------------------------------------------
diff --git a/python/pyspark/sql/catalog.py b/python/pyspark/sql/catalog.py
index 253a750..41e68a4 100644
--- a/python/pyspark/sql/catalog.py
+++ b/python/pyspark/sql/catalog.py
@@ -72,10 +72,10 @@ class Catalog(object):
     @ignore_unicode_prefix
     @since(2.0)
     def listTables(self, dbName=None):
-        """Returns a list of tables in the specified database.
+        """Returns a list of tables/views in the specified database.
 
         If no database is specified, the current database is used.
-        This includes all temporary tables.
+        This includes all temporary views.
         """
         if dbName is None:
             dbName = self.currentDatabase()
@@ -115,7 +115,7 @@ class Catalog(object):
     @ignore_unicode_prefix
     @since(2.0)
     def listColumns(self, tableName, dbName=None):
-        """Returns a list of columns for the given table in the specified database.
+        """Returns a list of columns for the given table/view in the specified database.
 
         If no database is specified, the current database is used.
 
@@ -161,14 +161,15 @@ class Catalog(object):
     def createTable(self, tableName, path=None, source=None, schema=None, **options):
         """Creates a table based on the dataset in a data source.
 
-        It returns the DataFrame associated with the external table.
+        It returns the DataFrame associated with the table.
 
         The data source is specified by the ``source`` and a set of ``options``.
         If ``source`` is not specified, the default data source configured by
-        ``spark.sql.sources.default`` will be used.
+        ``spark.sql.sources.default`` will be used. When ``path`` is specified, an external table is
+        created from the data at the given path. Otherwise a managed table is created.
 
         Optionally, a schema can be provided as the schema of the returned :class:`DataFrame` and
-        created external table.
+        created table.
 
         :return: :class:`DataFrame`
         """
@@ -276,14 +277,24 @@ class Catalog(object):
 
     @since(2.0)
     def refreshTable(self, tableName):
-        """Invalidate and refresh all the cached metadata of the given table."""
+        """Invalidates and refreshes all the cached data and metadata of the given table."""
         self._jcatalog.refreshTable(tableName)
 
     @since('2.1.1')
     def recoverPartitions(self, tableName):
-        """Recover all the partitions of the given table and update the catalog."""
+        """Recovers all the partitions of the given table and update the catalog.
+
+        Only works with a partitioned table, and not a view.
+        """
         self._jcatalog.recoverPartitions(tableName)
 
+    @since('2.2.0')
+    def refreshByPath(self, path):
+        """Invalidates and refreshes all the cached data (and the associated metadata) for any
+        DataFrame that contains the given data source path.
+        """
+        self._jcatalog.refreshByPath(path)
+
     def _reset(self):
         """(Internal use only) Drop all existing databases (except "default"), tables,
         partitions and functions, and set the current database to "default".

http://git-wip-us.apache.org/repos/asf/spark/blob/bccc3301/python/pyspark/sql/context.py
----------------------------------------------------------------------
diff --git a/python/pyspark/sql/context.py b/python/pyspark/sql/context.py
index c22f4b8..fdb7abb 100644
--- a/python/pyspark/sql/context.py
+++ b/python/pyspark/sql/context.py
@@ -385,7 +385,7 @@ class SQLContext(object):
 
     @since(1.0)
     def table(self, tableName):
-        """Returns the specified table as a :class:`DataFrame`.
+        """Returns the specified table or view as a :class:`DataFrame`.
 
         :return: :class:`DataFrame`

http://git-wip-us.apache.org/repos/asf/spark/blob/bccc3301/sql/core/src/main/scala/org/apache/spark/sql/catalog/Catalog.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/catalog/Catalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/catalog/Catalog.scala
index 137b0cb..074952f 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/catalog/Catalog.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/catalog/Catalog.scala
@@ -283,7 +283,7 @@ abstract class Catalog {
 
   /**
    * :: Experimental ::
-   * Creates a table from the given path based on a data source and a set of options.
+   * Creates a table based on the dataset in a data source and a set of options.
    * Then, returns the corresponding DataFrame.
    *
    * @param tableName is either a qualified or unqualified name that designates a table.
@@ -321,7 +321,7 @@ abstract class Catalog {
   /**
    * :: Experimental ::
    * (Scala-specific)
-   * Creates a table from the given path based on a data source and a set of options.
+   * Creates a table based on the dataset in a data source and a set of options.
    * Then, returns the corresponding DataFrame.
    *
    * @param tableName is either a qualified or unqualified name that designates a table.
@@ -357,7 +357,7 @@ abstract class Catalog {
 
   /**
    * :: Experimental ::
-   * Create a table from the given path based on a data source, a schema and a set of options.
+   * Create a table based on the dataset in a data source, a schema and a set of options.
    * Then, returns the corresponding DataFrame.
    *
    * @param tableName is either a qualified or unqualified name that designates a table.
@@ -397,7 +397,7 @@ abstract class Catalog {
   /**
    * :: Experimental ::
    * (Scala-specific)
-   * Create a table from the given path based on a data source, a schema and a set of options.
+   * Create a table based on the dataset in a data source, a schema and a set of options.
    * Then, returns the corresponding DataFrame.
    *
    * @param tableName is either a qualified or unqualified name that designates a table.
@@ -447,6 +447,7 @@ abstract class Catalog {
 
   /**
    * Recovers all the partitions in the directory of a table and update the catalog.
+   * Only works with a partitioned table, and not a view.
    *
    * @param tableName is either a qualified or unqualified name that designates a table.
    *                  If no database identifier is provided, it refers to a table in the
@@ -493,10 +494,10 @@ abstract class Catalog {
   def clearCache(): Unit
 
   /**
-   * Invalidates and refreshes all the cached metadata of the given table. For performance reasons,
-   * Spark SQL or the external data source library it uses might cache certain metadata about a
-   * table, such as the location of blocks. When those change outside of Spark SQL, users should
-   * call this function to invalidate the cache.
+   * Invalidates and refreshes all the cached data and metadata of the given table. For performance
+   * reasons, Spark SQL or the external data source library it uses might cache certain metadata
+   * about a table, such as the location of blocks. When those change outside of Spark SQL, users
+   * should call this function to invalidate the cache.
    *
    * If this table is cached as an InMemoryRelation, drop the original cached version and make the
    * new version cached lazily.

http://git-wip-us.apache.org/repos/asf/spark/blob/bccc3301/sql/core/src/main/scala/org/apache/spark/sql/internal/CatalogImpl.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/CatalogImpl.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/CatalogImpl.scala
index 5d1c35a..aebb663 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/internal/CatalogImpl.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/CatalogImpl.scala
@@ -141,7 +141,7 @@ class CatalogImpl(sparkSession: SparkSession) extends Catalog {
   }
 
   /**
-   * Returns a list of columns for the given table temporary view.
+   * Returns a list of columns for the given table/view or temporary view.
    */
   @throws[AnalysisException]("table does not exist")
   override def listColumns(tableName: String): Dataset[Column] = {
@@ -150,7 +150,7 @@ class CatalogImpl(sparkSession: SparkSession) extends Catalog {
   }
 
   /**
-   * Returns a list of columns for the given table in the specified database.
+   * Returns a list of columns for the given table/view or temporary view in the specified database.
    */
   @throws[AnalysisException]("database or table does not exist")
   override def listColumns(dbName: String, tableName: String): Dataset[Column] = {
@@ -273,7 +273,7 @@ class CatalogImpl(sparkSession: SparkSession) extends Catalog {
 
   /**
    * :: Experimental ::
-   * Creates a table from the given path based on a data source and returns the corresponding
+   * Creates a table from the given path and returns the corresponding
    * DataFrame.
    *
    * @group ddl_ops
@@ -287,7 +287,7 @@ class CatalogImpl(sparkSession: SparkSession) extends Catalog {
   /**
    * :: Experimental ::
    * (Scala-specific)
-   * Creates a table from the given path based on a data source and a set of options.
+   * Creates a table based on the dataset in a data source and a set of options.
    * Then, returns the corresponding DataFrame.
    *
    * @group ddl_ops
@@ -304,7 +304,7 @@ class CatalogImpl(sparkSession: SparkSession) extends Catalog {
   /**
    * :: Experimental ::
    * (Scala-specific)
-   * Creates a table from the given path based on a data source, a schema and a set of options.
+   * Creates a table based on the dataset in a data source, a schema and a set of options.
    * Then, returns the corresponding DataFrame.
    *
    * @group ddl_ops
@@ -367,6 +367,7 @@ class CatalogImpl(sparkSession: SparkSession) extends Catalog {
 
   /**
    * Recovers all the partitions in the directory of a table and update the catalog.
+   * Only works with a partitioned table, and not a temporary view.
    *
    * @param tableName is either a qualified or unqualified name that designates a table.
    *                  If no database identifier is provided, it refers to a table in the
@@ -431,8 +432,12 @@ class CatalogImpl(sparkSession: SparkSession) extends Catalog {
   }
 
   /**
-   * Refreshes the cache entry for a table or view, if any. For Hive metastore table, the metadata
-   * is refreshed. For data source tables, the schema will not be inferred and refreshed.
+   * Invalidates and refreshes all the cached data and metadata of the given table or view.
+   * For Hive metastore table, the metadata is refreshed. For data source tables, the schema will
+   * not be inferred and refreshed.
+   *
+   * If this table is cached as an InMemoryRelation, drop the original cached version and make the
+   * new version cached lazily.
    *
    * @group cachemgmt
    * @since 2.0.0
@@ -456,7 +461,8 @@ class CatalogImpl(sparkSession: SparkSession) extends Catalog {
 
   /**
    * Refreshes the cache entry and the associated metadata for all Dataset (if any), that contain
-   * the given data source path.
+   * the given data source path. Path matching is by prefix, i.e. "/" would invalidate
+   * everything that is cached.
    *
    * @group cachemgmt
    * @since 2.0.0
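A minimal PySpark sketch of the catalog behavior this patch documents. The session setup, table name, and path are illustrative assumptions, not part of the commit:

    from pyspark.sql import SparkSession

    spark = SparkSession.builder.appName("catalog-doc-example").getOrCreate()

    # With `path` given, createTable creates an external table over the existing
    # data; without `path`, a managed table would be created instead.
    # "/tmp/events" is a hypothetical location.
    df = spark.catalog.createTable("events", path="/tmp/events", source="parquet")

    # Only works with a partitioned table (not a view); assumes the data at the
    # path is laid out in partition subdirectories added outside of Spark SQL.
    spark.catalog.recoverPartitions("events")

    # Invalidates and refreshes all the cached data and metadata of the table.
    spark.catalog.refreshTable("events")

    # New in 2.2.0: invalidates cached data/metadata for every DataFrame whose
    # data source path matches by prefix ("/" would invalidate everything).
    spark.catalog.refreshByPath("/tmp/events")

    # listTables/listColumns now document that views and temporary views are
    # covered as well.
    spark.catalog.listColumns("events")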