This is an automated email from the ASF dual-hosted git repository.

yao pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new 8da2cc38c96 [SPARK-42943][SQL] Use LONGTEXT instead of TEXT for 
StringType for effective length
8da2cc38c96 is described below

commit 8da2cc38c9608c7c2d97a6c6aadc5408367efdd9
Author: Kent Yao <[email protected]>
AuthorDate: Tue Mar 28 20:05:23 2023 +0800

    [SPARK-42943][SQL] Use LONGTEXT instead of TEXT for StringType for 
effective length
    
    ### What changes were proposed in this pull request?
    
    Referring to 
https://dev.mysql.com/doc/refman/8.0/en/string-type-syntax.html, A 
[TEXT](https://dev.mysql.com/doc/refman/8.0/en/blob.html) column with a maximum 
length of 65,535 (2^16 − 1) characters.
    
    We currently convert our string to MySQL's `text` and jdbc's `CLOB`. The 
`text` here is insufficient, and `CLOB` is incorrect; it should be replaced 
with `LONGVARCHAR` instead.
    
    ### Why are the changes needed?
    
    better compatibility with MySQL and bugfix
    
    ### Does this PR introduce _any_ user-facing change?
    
    Yes, you won't see MysqlDataTruncation if you store a string exceeding 
65,535 characters into a column defined by Spark's string type with the MySQL catalog.
    
    ```java
    Job aborted due to stage failure: Task 0 in stage 1.0 failed 1 times, most 
recent failure: Lost task 0.0 in stage 1.0 (TID 1) (10.221.102.180 executor 
driver): java.sql.BatchUpdateException: Data truncation: Data too long for 
column 'c1' at row 1
            at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native 
Method)
            at 
sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
            at 
sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
            at java.lang.reflect.Constructor.newInstance(Constructor.java:423)
            at com.mysql.cj.util.Util.handleNewInstance(Util.java:192)
            at com.mysql.cj.util.Util.getInstance(Util.java:167)
            at com.mysql.cj.util.Util.getInstance(Util.java:174)
            at 
com.mysql.cj.jdbc.exceptions.SQLError.createBatchUpdateException(SQLError.java:224)
            at 
com.mysql.cj.jdbc.ClientPreparedStatement.executeBatchSerially(ClientPreparedStatement.java:816)
            at 
com.mysql.cj.jdbc.ClientPreparedStatement.executeBatchInternal(ClientPreparedStatement.java:418)
            at 
com.mysql.cj.jdbc.StatementImpl.executeBatch(StatementImpl.java:795)
            at 
org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$.savePartition(JdbcUtils.scala:742)
            at 
org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$.$anonfun$saveTable$1(JdbcUtils.scala:893)
            at 
org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$.$anonfun$saveTable$1$adapted(JdbcUtils.scala:892)
            at 
org.apache.spark.rdd.RDD.$anonfun$foreachPartition$2(RDD.scala:1011)
            at 
org.apache.spark.rdd.RDD.$anonfun$foreachPartition$2$adapted(RDD.scala:1011)
    ```
    
    ### How was this patch tested?
    
    new tests
    
    Closes #40573 from yaooqinn/SPARK-42943.
    
    Authored-by: Kent Yao <[email protected]>
    Signed-off-by: Kent Yao <[email protected]>
---
 .../org/apache/spark/sql/jdbc/v2/MySQLIntegrationSuite.scala     | 9 +++++++++
 .../src/main/scala/org/apache/spark/sql/jdbc/MySQLDialect.scala  | 3 ++-
 .../src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala     | 2 +-
 3 files changed, 12 insertions(+), 2 deletions(-)

diff --git 
a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/MySQLIntegrationSuite.scala
 
b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/MySQLIntegrationSuite.scala
index 789dfeddc21..7ee73a5a978 100644
--- 
a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/MySQLIntegrationSuite.scala
+++ 
b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/MySQLIntegrationSuite.scala
@@ -145,4 +145,13 @@ class MySQLIntegrationSuite extends 
DockerJDBCIntegrationV2Suite with V2JDBCTest
   override def supportListIndexes: Boolean = true
 
   override def indexOptions: String = "KEY_BLOCK_SIZE=10"
+
+  test("SPARK-42943: Use LONGTEXT instead of TEXT for StringType for effective 
length") {
+    val tableName = catalogName + ".t1"
+    withTable(tableName) {
+      sql(s"CREATE TABLE $tableName(c1 string)")
+      sql(s"INSERT INTO $tableName SELECT rpad('hi', 65536, 'spark')")
+      assert(sql(s"SELECT char_length(c1) from $tableName").head().get(0) === 
65536)
+    }
+  }
 }
diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/MySQLDialect.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/MySQLDialect.scala
index e688af561c4..11305dbde42 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/MySQLDialect.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/MySQLDialect.scala
@@ -32,7 +32,7 @@ import org.apache.spark.sql.connector.catalog.index.TableIndex
 import org.apache.spark.sql.connector.expressions.{Expression, FieldReference, 
NamedReference, NullOrdering, SortDirection}
 import org.apache.spark.sql.errors.QueryExecutionErrors
 import org.apache.spark.sql.execution.datasources.jdbc.{JDBCOptions, JdbcUtils}
-import org.apache.spark.sql.types.{BooleanType, DataType, FloatType, LongType, 
MetadataBuilder}
+import org.apache.spark.sql.types.{BooleanType, DataType, FloatType, LongType, 
MetadataBuilder, StringType}
 
 private case object MySQLDialect extends JdbcDialect with SQLConfHelper {
 
@@ -176,6 +176,7 @@ private case object MySQLDialect extends JdbcDialect with 
SQLConfHelper {
     // See SPARK-35446: MySQL treats REAL as a synonym to DOUBLE by default
     // We override getJDBCType so that FloatType is mapped to FLOAT instead
     case FloatType => Option(JdbcType("FLOAT", java.sql.Types.FLOAT))
+    case StringType => Option(JdbcType("LONGTEXT", java.sql.Types.LONGVARCHAR))
     case _ => JdbcUtils.getCommonJDBCType(dt)
   }
 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala
index 0d102e1632a..8b1fd359123 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala
@@ -1324,7 +1324,7 @@ class JDBCSuite extends QueryTest with SharedSparkSession 
{
       df.schema,
       df.sqlContext.conf.caseSensitiveAnalysis,
       "jdbc:mysql://localhost:3306/temp")
-    assert(schema.contains("`order` TEXT"))
+    assert(schema.contains("`order` LONGTEXT"))
   }
 
   test("SPARK-18141: Predicates on quoted column names in the jdbc data 
source") {


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to