(spark) branch master updated: [SPARK-46544][SQL] Support v2 DESCRIBE TABLE EXTENDED with table stats

maxgekk Tue, 02 Jan 2024 03:11:40 -0800

This is an automated email from the ASF dual-hosted git repository.

maxgekk pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git



The following commit(s) were added to refs/heads/master by this push:
     new 7d09991610f9 [SPARK-46544][SQL] Support v2 DESCRIBE TABLE EXTENDED with table stats
7d09991610f9 is described below

commit 7d09991610f966fa42707ba7c8bb2575ec20d233
Author: zouxxyy <zouxinyu....@alibaba-inc.com>
AuthorDate: Tue Jan 2 14:11:11 2024 +0300

    [SPARK-46544][SQL] Support v2 DESCRIBE TABLE EXTENDED with table stats
    
    ### What changes were proposed in this pull request?
    
    Support v2 DESCRIBE TABLE EXTENDED with table stats
    
    ### Why are the changes needed?
    
    Similar to #40058, this improves parity between the DS v1 and v2 commands, e.g.
    
    DESC EXTENDED table
    
    | col_name          | data_type                 | comment    |
    |-------------------|---------------------------|------------|
    | ...               | ...                       | ...        |
    | Statistics        | 864 bytes, 2 rows         |            |
    | ...               | ...                       | ...        |
    
    ### Does this PR introduce _any_ user-facing change?
    
    No
    
    ### How was this patch tested?
    
    Added the test `describe extended table with stats` and updated the expected output of the existing DESCRIBE TABLE EXTENDED tests.
    
    ### Was this patch authored or co-authored using generative AI tooling?
    
    No
    
    Closes #44535 from Zouxxyy/dev/desc-table-stats.
    
    Lead-authored-by: zouxxyy <zouxinyu....@alibaba-inc.com>
    Co-authored-by: Zouxxyy <zoux...@qq.com>
    Signed-off-by: Max Gekk <max.g...@gmail.com>
---
 .../datasources/v2/DescribeTableExec.scala         | 22 +++++++++++++++++++++-
 .../spark/sql/connector/DataSourceV2SQLSuite.scala |  3 ++-
 .../execution/command/v2/DescribeTableSuite.scala  | 19 ++++++++++++++++++-
 3 files changed, 41 insertions(+), 3 deletions(-)

diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DescribeTableExec.scala
 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DescribeTableExec.scala
index 3d79a7113e0d..a225dffb075b 100644
--- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DescribeTableExec.scala
+++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DescribeTableExec.scala
@@ -24,8 +24,10 @@ import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.catalog.CatalogTableType
 import org.apache.spark.sql.catalyst.expressions.Attribute
 import org.apache.spark.sql.catalyst.util.{quoteIfNeeded, 
ResolveDefaultColumns}
-import org.apache.spark.sql.connector.catalog.{CatalogV2Util, 
SupportsMetadataColumns, Table, TableCatalog}
+import org.apache.spark.sql.connector.catalog.{CatalogV2Util, 
SupportsMetadataColumns, SupportsRead, Table, TableCatalog}
 import org.apache.spark.sql.connector.expressions.IdentityTransform
+import org.apache.spark.sql.connector.read.SupportsReportStatistics
+import org.apache.spark.sql.util.CaseInsensitiveStringMap
 import org.apache.spark.util.ArrayImplicits._
 
 case class DescribeTableExec(
@@ -40,6 +42,7 @@ case class DescribeTableExec(
     if (isExtended) {
       addMetadataColumns(rows)
       addTableDetails(rows)
+      addTableStats(rows)
     }
     rows.toSeq
   }
@@ -96,6 +99,23 @@ case class DescribeTableExec(
     case _ =>
   }
 
+  private def addTableStats(rows: ArrayBuffer[InternalRow]): Unit = table 
match {
+    case read: SupportsRead =>
+      read.newScanBuilder(CaseInsensitiveStringMap.empty()).build() match {
+        case s: SupportsReportStatistics =>
+          val stats = s.estimateStatistics()
+          val statsComponents = Seq(
+            
Option.when(stats.sizeInBytes().isPresent)(s"${stats.sizeInBytes().getAsLong} 
bytes"),
+            
Option.when(stats.numRows().isPresent)(s"${stats.numRows().getAsLong} rows")
+          ).flatten
+          if (statsComponents.nonEmpty) {
+            rows += toCatalystRow("Statistics", statsComponents.mkString(", 
"), null)
+          }
+        case _ =>
+      }
+    case _ =>
+  }
+
   private def addPartitioning(rows: ArrayBuffer[InternalRow]): Unit = {
     if (table.partitioning.nonEmpty) {
       val partitionColumnsOnly = table.partitioning.forall(t => 
t.isInstanceOf[IdentityTransform])
diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala
 
b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala
index 589283a29b85..f92a9a827b1c 100644
--- 
a/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala
+++ 
b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala
@@ -3342,7 +3342,8 @@ class DataSourceV2SQLSuiteV1Filter
             Row("# Column Default Values", "", ""),
             Row("# Metadata Columns", "", ""),
             Row("id", "bigint", "42"),
-            Row("id", "bigint", null)
+            Row("id", "bigint", null),
+            Row("Statistics", "0 bytes, 0 rows", null)
           ))
       }
     }
diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/DescribeTableSuite.scala
 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/DescribeTableSuite.scala
index a21baebe24d8..cfd26c09bf3e 100644
--- 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/DescribeTableSuite.scala
+++ 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/DescribeTableSuite.scala
@@ -90,7 +90,8 @@ class DescribeTableSuite extends 
command.DescribeTableSuiteBase
           Row("Location", "file:/tmp/testcat/table_name", ""),
           Row("Provider", "_", ""),
           Row(TableCatalog.PROP_OWNER.capitalize, Utils.getCurrentUserName(), 
""),
-          Row("Table Properties", "[bar=baz]", "")))
+          Row("Table Properties", "[bar=baz]", ""),
+          Row("Statistics", "0 bytes, 0 rows", null)))
     }
   }
 
@@ -196,4 +197,20 @@ class DescribeTableSuite extends 
command.DescribeTableSuiteBase
           Row("comment", "column_comment")))
     }
   }
+
+  test("describe extended table with stats") {
+    withNamespaceAndTable("ns", "tbl") { tbl =>
+      sql(
+        s"""
+           |CREATE TABLE $tbl
+           |(key INT, col STRING)
+           |$defaultUsing""".stripMargin)
+
+      sql(s"INSERT INTO $tbl values (1, 'aaa'), (2, 'bbb'), (3, 'ccc'), (null, 
'ddd')")
+      val descriptionDf = sql(s"DESCRIBE TABLE EXTENDED $tbl")
+      val stats = descriptionDf.filter("col_name == 'Statistics'").head()
+        .getAs[String]("data_type")
+      assert("""\d+\s+bytes,\s+4\s+rows""".r.matches(stats))
+    }
+  }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

(spark) branch master updated: [SPARK-46544][SQL] Support v2 DESCRIBE TABLE EXTENDED with table stats

Reply via email to