This is an automated email from the ASF dual-hosted git repository.
yao pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 8da2cc38c96 [SPARK-42943][SQL] Use LONGTEXT instead of TEXT for
StringType for effective length
8da2cc38c96 is described below
commit 8da2cc38c9608c7c2d97a6c6aadc5408367efdd9
Author: Kent Yao <[email protected]>
AuthorDate: Tue Mar 28 20:05:23 2023 +0800
[SPARK-42943][SQL] Use LONGTEXT instead of TEXT for StringType for
effective length
### What changes were proposed in this pull request?
Referring to
https://dev.mysql.com/doc/refman/8.0/en/string-type-syntax.html, a
[TEXT](https://dev.mysql.com/doc/refman/8.0/en/blob.html) column has a maximum
length of 65,535 (2^16 − 1) characters.
We currently convert our string to MySQL's `text` and JDBC's `CLOB`. The
`text` here is insufficient, and `CLOB` is incorrect; it should be replaced
with `LONGVARCHAR` instead.
### Why are the changes needed?
better compatibility with MySQL and bugfix
### Does this PR introduce _any_ user-facing change?
Yes, you won't see MysqlDataTruncation if you store a string exceeding
65,535 characters into a column defined by Spark's string type with the MySQL catalog.
```java
Job aborted due to stage failure: Task 0 in stage 1.0 failed 1 times, most
recent failure: Lost task 0.0 in stage 1.0 (TID 1) (10.221.102.180 executor
driver): java.sql.BatchUpdateException: Data truncation: Data too long for
column 'c1' at row 1
at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native
Method)
at
sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
at
sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
at java.lang.reflect.Constructor.newInstance(Constructor.java:423)
at com.mysql.cj.util.Util.handleNewInstance(Util.java:192)
at com.mysql.cj.util.Util.getInstance(Util.java:167)
at com.mysql.cj.util.Util.getInstance(Util.java:174)
at
com.mysql.cj.jdbc.exceptions.SQLError.createBatchUpdateException(SQLError.java:224)
at
com.mysql.cj.jdbc.ClientPreparedStatement.executeBatchSerially(ClientPreparedStatement.java:816)
at
com.mysql.cj.jdbc.ClientPreparedStatement.executeBatchInternal(ClientPreparedStatement.java:418)
at
com.mysql.cj.jdbc.StatementImpl.executeBatch(StatementImpl.java:795)
at
org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$.savePartition(JdbcUtils.scala:742)
at
org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$.$anonfun$saveTable$1(JdbcUtils.scala:893)
at
org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$.$anonfun$saveTable$1$adapted(JdbcUtils.scala:892)
at
org.apache.spark.rdd.RDD.$anonfun$foreachPartition$2(RDD.scala:1011)
at
org.apache.spark.rdd.RDD.$anonfun$foreachPartition$2$adapted(RDD.scala:1011)
```
### How was this patch tested?
new tests
Closes #40573 from yaooqinn/SPARK-42943.
Authored-by: Kent Yao <[email protected]>
Signed-off-by: Kent Yao <[email protected]>
---
.../org/apache/spark/sql/jdbc/v2/MySQLIntegrationSuite.scala | 9 +++++++++
.../src/main/scala/org/apache/spark/sql/jdbc/MySQLDialect.scala | 3 ++-
.../src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala | 2 +-
3 files changed, 12 insertions(+), 2 deletions(-)
diff --git
a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/MySQLIntegrationSuite.scala
b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/MySQLIntegrationSuite.scala
index 789dfeddc21..7ee73a5a978 100644
---
a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/MySQLIntegrationSuite.scala
+++
b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/MySQLIntegrationSuite.scala
@@ -145,4 +145,13 @@ class MySQLIntegrationSuite extends
DockerJDBCIntegrationV2Suite with V2JDBCTest
override def supportListIndexes: Boolean = true
override def indexOptions: String = "KEY_BLOCK_SIZE=10"
+
+ test("SPARK-42943: Use LONGTEXT instead of TEXT for StringType for effective
length") {
+ val tableName = catalogName + ".t1"
+ withTable(tableName) {
+ sql(s"CREATE TABLE $tableName(c1 string)")
+ sql(s"INSERT INTO $tableName SELECT rpad('hi', 65536, 'spark')")
+ assert(sql(s"SELECT char_length(c1) from $tableName").head().get(0) ===
65536)
+ }
+ }
}
diff --git
a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/MySQLDialect.scala
b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/MySQLDialect.scala
index e688af561c4..11305dbde42 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/MySQLDialect.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/MySQLDialect.scala
@@ -32,7 +32,7 @@ import org.apache.spark.sql.connector.catalog.index.TableIndex
import org.apache.spark.sql.connector.expressions.{Expression, FieldReference,
NamedReference, NullOrdering, SortDirection}
import org.apache.spark.sql.errors.QueryExecutionErrors
import org.apache.spark.sql.execution.datasources.jdbc.{JDBCOptions, JdbcUtils}
-import org.apache.spark.sql.types.{BooleanType, DataType, FloatType, LongType,
MetadataBuilder}
+import org.apache.spark.sql.types.{BooleanType, DataType, FloatType, LongType,
MetadataBuilder, StringType}
private case object MySQLDialect extends JdbcDialect with SQLConfHelper {
@@ -176,6 +176,7 @@ private case object MySQLDialect extends JdbcDialect with
SQLConfHelper {
// See SPARK-35446: MySQL treats REAL as a synonym to DOUBLE by default
// We override getJDBCType so that FloatType is mapped to FLOAT instead
case FloatType => Option(JdbcType("FLOAT", java.sql.Types.FLOAT))
+ case StringType => Option(JdbcType("LONGTEXT", java.sql.Types.LONGVARCHAR))
case _ => JdbcUtils.getCommonJDBCType(dt)
}
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala
b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala
index 0d102e1632a..8b1fd359123 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala
@@ -1324,7 +1324,7 @@ class JDBCSuite extends QueryTest with SharedSparkSession
{
df.schema,
df.sqlContext.conf.caseSensitiveAnalysis,
"jdbc:mysql://localhost:3306/temp")
- assert(schema.contains("`order` TEXT"))
+ assert(schema.contains("`order` LONGTEXT"))
}
test("SPARK-18141: Predicates on quoted column names in the jdbc data
source") {
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]