This is an automated email from the ASF dual-hosted git repository. lwz9103 pushed a commit to branch liquid in repository https://gitbox.apache.org/repos/asf/incubator-gluten.git
commit 813729c81a3852e0b73f554ad8c1826bd5691004 Author: Chang Chen <[email protected]> AuthorDate: Thu Aug 22 23:17:00 2024 +0800 [KY-SPARK] Better way to support kyspark - 250306 Fix conflict due to https://github.com/apache/incubator-gluten/pull/8904 - 250123 Fix conflict due to https://github.com/apache/incubator-gluten/pull/8418 - 250119 Fix conflict due to https://github.com/apache/incubator-gluten/pull/7789 - 241218 Fix conflict due to https://github.com/apache/incubator-gluten/pull/8263 - 241218 Fix conflict due to https://github.com/apache/incubator-gluten/pull/8250 - 241129 Fix conflict due to https://github.com/apache/incubator-gluten/pull/8106 - 241023 Fix conflict due to https://github.com/apache/incubator-gluten/pull/7593 - 241012 Fix conflict due to https://github.com/apache/incubator-gluten/pull/7466 - 240910 Fix conflict due to https://github.com/apache/incubator-gluten/pull/7175 - 240909 Fix conflict due to https://github.com/apache/incubator-gluten/pull/7027 - 240906 Fix conflict due to https://github.com/apache/incubator-gluten/pull/7140 - 240906 Fix conflict due to https://github.com/apache/incubator-gluten/pull/7115 - [KY-SPARK] Bump to 3.3.0-kylin-4.6.27.0-SNAPSHOT - [POM] Minor fix to avoid downloading spark-sql-columnar-shims-spark34 (#24) - Rename spark-3.3 to kyspark - Fix GlutenMetadataColumnSuite by introducing fake MetadataColumnSuite - Fix following UTs by throwing IllegalStateException instead of SparkException - GlutenBucketedReadWithoutHiveSupportSuite [error if there exists any malformed bucket files] - ClickHouseAdaptiveQueryExecSuite [SPARK-30291: AQE should catch the exceptions when doing materialize] (cherry picked from commit d5fa296fc47383448833399d9623ef327b96a767) --- ep/build-clickhouse/src/package.sh | 42 ++++++------------ gluten-ut/pom.xml | 2 +- .../sql/connector/GlutenMetadataColumnSuite.scala | 2 + package/pom.xml | 1 + pom.xml | 50 +++++++++++++++++++--- shims/pom.xml | 2 +- shims/spark33/pom.xml | 2 +- 
.../gluten/sql/shims/spark33/Spark33Shims.scala | 9 +--- .../catalyst/expressions/PromotePrecision.scala | 19 ++++++-- .../sql/execution/AbstractFileSourceScanExec.scala | 3 +- .../InsertIntoHadoopFsRelationCommand.scala | 2 +- 11 files changed, 80 insertions(+), 54 deletions(-) diff --git a/ep/build-clickhouse/src/package.sh b/ep/build-clickhouse/src/package.sh index 06ca63c5d4..2ceab18dd9 100755 --- a/ep/build-clickhouse/src/package.sh +++ b/ep/build-clickhouse/src/package.sh @@ -33,7 +33,7 @@ function detect_os_version() { } detect_os_version -DEFAULT_SPARK_PROFILE="spark-3.3" +DEFAULT_SPARK_PROFILE="kyspark" function get_project_version() { cd "${GLUTEN_SOURCE}" # use mvn command to get project version @@ -48,8 +48,6 @@ OS_ARCH=$(uname -m) PACKAGE_NAME=gluten-${BUILD_VERSION}-${OS_VERSION}-${OS_ARCH} PACKAGE_DIR_PATH="${GLUTEN_SOURCE}"/dist/"${PACKAGE_NAME}" -spark_scala_versions=("3.2_2.12" "3.3_2.12" "3.5_2.13") - # cleanup working directory [[ -d "${GLUTEN_SOURCE}"/dist/"${PACKAGE_NAME}" ]] && rm -rf "${GLUTEN_SOURCE}"/dist/"${PACKAGE_NAME}" [[ -d "${GLUTEN_SOURCE}"/dist/"${PACKAGE_NAME}".tar.gz ]] && rm -f "${GLUTEN_SOURCE}"/dist/"${PACKAGE_NAME}".tar.gz @@ -63,15 +61,11 @@ mkdir -p "${GLUTEN_SOURCE}"/dist/"${PACKAGE_NAME}" mkdir "${GLUTEN_SOURCE}"/dist/"${PACKAGE_NAME}"/bin mkdir "${GLUTEN_SOURCE}"/dist/"${PACKAGE_NAME}"/conf mkdir "${GLUTEN_SOURCE}"/dist/"${PACKAGE_NAME}"/jars +mkdir "${GLUTEN_SOURCE}"/dist/"${PACKAGE_NAME}"/jars/spark32 +mkdir "${GLUTEN_SOURCE}"/dist/"${PACKAGE_NAME}"/jars/spark33 mkdir "${GLUTEN_SOURCE}"/dist/"${PACKAGE_NAME}"/libs mkdir "${GLUTEN_SOURCE}"/dist/"${PACKAGE_NAME}"/logs -for ssv in "${spark_scala_versions[@]}" -do - spark_version=$(echo ${ssv%_*} | tr -d '.') - mkdir "${GLUTEN_SOURCE}"/dist/"${PACKAGE_NAME}"/jars/spark"$spark_version" -done - # create BUILD_INFO { echo "BUILD_VERSION=${BUILD_VERSION}" @@ -84,28 +78,16 @@ done cp "${GLUTEN_SOURCE}"/LICENSE "${GLUTEN_SOURCE}"/dist/"${PACKAGE_NAME}" cp 
"${GLUTEN_SOURCE}"/README.md "${GLUTEN_SOURCE}"/dist/"${PACKAGE_NAME}" -function build_gluten_by_spark_version() { - spark_profile=$1 - scala_version=$2 - sv=$(echo "$spark_profile" | tr -d '.') - echo "build gluten with spark ${spark_profile}, scala ${scala_version}" - - mvn clean install -Pbackends-clickhouse -Pspark-"${spark_profile}" -Pscala-"${scala_version}" -Pceleborn -Piceberg -Pdelta -DskipTests -Dcheckstyle.skip - cp "${GLUTEN_SOURCE}"/backends-clickhouse/target/gluten-*-spark-"${spark_profile}"-jar-with-dependencies.jar "${PACKAGE_DIR_PATH}"/jars/spark"${sv}"/gluten.jar - delta_version=$(mvn -q -Dexec.executable="echo" -Dexec.args='${delta.version}' -Pspark-"${spark_profile}" --non-recursive exec:exec) - delta_package_name=$(mvn -q -Dexec.executable="echo" -Dexec.args='${delta.package.name}' -Pspark-"${spark_profile}" --non-recursive exec:exec) - wget https://repo1.maven.org/maven2/io/delta/"${delta_package_name}"_${scala_version}/"${delta_version}"/"${delta_package_name}"_${scala_version}-"${delta_version}".jar -P "${PACKAGE_DIR_PATH}"/jars/spark"${sv}" - wget https://repo1.maven.org/maven2/io/delta/delta-storage/"${delta_version}"/delta-storage-"${delta_version}".jar -P "${PACKAGE_DIR_PATH}"/jars/spark"${sv}" - celeborn_version=$(mvn -q -P${DEFAULT_SPARK_PROFILE} -Dexec.executable="echo" -Dexec.args='${celeborn.version}' --non-recursive exec:exec) - wget https://repo1.maven.org/maven2/org/apache/celeborn/celeborn-client-spark-3-shaded_${scala_version}/${celeborn_version}/celeborn-client-spark-3-shaded_${scala_version}-${celeborn_version}.jar -P "${PACKAGE_DIR_PATH}"/jars/spark"${sv}" -} +# build gluten with kyspark +mvn clean install -Pbackends-clickhouse -Pkyspark -Pceleborn -Piceberg -Pdelta -DskipTests -Dcheckstyle.skip +cp "${GLUTEN_SOURCE}"/backends-clickhouse/target/gluten-*-spark-3.3-jar-with-dependencies.jar "${PACKAGE_DIR_PATH}"/jars/spark33/gluten.jar +delta_version_33=$(mvn -q -Dexec.executable="echo" -Dexec.args='${delta.version}' -Pkyspark 
--non-recursive exec:exec) +wget https://repo1.maven.org/maven2/io/delta/delta-core_2.12/${delta_version_33}/delta-core_2.12-${delta_version_33}.jar -P "${PACKAGE_DIR_PATH}"/jars/spark33 +wget https://repo1.maven.org/maven2/io/delta/delta-storage/${delta_version_33}/delta-storage-${delta_version_33}.jar -P "${PACKAGE_DIR_PATH}"/jars/spark33 -for ssv in "${spark_scala_versions[@]}" -do - spark_profile="${ssv%_*}" - scala_version="${ssv#*_}" - build_gluten_by_spark_version "$spark_profile" "$scala_version" -done +# download common 3rd party jars +celeborn_version=$(mvn -q -P${DEFAULT_SPARK_PROFILE} -Dexec.executable="echo" -Dexec.args='${celeborn.version}' --non-recursive exec:exec) +wget https://repo1.maven.org/maven2/org/apache/celeborn/celeborn-client-spark-3-shaded_2.12/${celeborn_version}/celeborn-client-spark-3-shaded_2.12-${celeborn_version}.jar -P "${PACKAGE_DIR_PATH}"/jars/spark33 # build libch.so bash "${GLUTEN_SOURCE}"/ep/build-clickhouse/src/build_clickhouse.sh diff --git a/gluten-ut/pom.xml b/gluten-ut/pom.xml index 6e8655cb12..41cfdd7d39 100644 --- a/gluten-ut/pom.xml +++ b/gluten-ut/pom.xml @@ -201,7 +201,7 @@ </modules> </profile> <profile> - <id>spark-3.3</id> + <id>kyspark</id> <modules> <module>spark33</module> </modules> diff --git a/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/connector/GlutenMetadataColumnSuite.scala b/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/connector/GlutenMetadataColumnSuite.scala index 59a14fb11c..5f5b69e80d 100644 --- a/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/connector/GlutenMetadataColumnSuite.scala +++ b/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/connector/GlutenMetadataColumnSuite.scala @@ -18,4 +18,6 @@ package org.apache.spark.sql.connector import org.apache.spark.sql.GlutenSQLTestsBaseTrait +// Make scala compiler happy, kyspark misses this class. 
+class MetadataColumnSuite extends DatasourceV2SQLBase {} class GlutenMetadataColumnSuite extends MetadataColumnSuite with GlutenSQLTestsBaseTrait {} diff --git a/package/pom.xml b/package/pom.xml index 3b1f61cd08..f830f1d1cf 100644 --- a/package/pom.xml +++ b/package/pom.xml @@ -352,6 +352,7 @@ <ignoreClass>org.apache.spark.util.collection.unsafe.sort.UnsafeExternalSorter$ChainedIterator</ignoreClass> <ignoreClass>org.apache.spark.memory.MemoryConsumer</ignoreClass> <ignoreClass>org.apache.spark.memory.TaskMemoryManager </ignoreClass> + <ignoreClass>com.google.thirdparty.publicsuffix.*</ignoreClass> </ignoreClasses> <scopes> <scope>compile</scope> diff --git a/pom.xml b/pom.xml index 03fe1f49e2..42be1bb291 100644 --- a/pom.xml +++ b/pom.xml @@ -69,9 +69,9 @@ <scala.binary.version>2.12</scala.binary.version> <scala.version>2.12.15</scala.version> <spark.major.version>3</spark.major.version> - <sparkbundle.version>3.4</sparkbundle.version> - <spark.version>3.4.4</spark.version> - <sparkshim.artifactId>spark-sql-columnar-shims-spark34</sparkshim.artifactId> + <sparkbundle.version>3.3</sparkbundle.version> + <spark.version>3.3.0-kylin-4.6.27.0-SNAPSHOT</spark.version> + <sparkshim.artifactId>spark-sql-columnar-shims-kyspark</sparkshim.artifactId> <iceberg.version>1.5.0</iceberg.version> <delta.package.name>delta-core</delta.package.name> <delta.version>2.4.0</delta.version> @@ -316,11 +316,11 @@ </properties> </profile> <profile> - <id>spark-3.3</id> + <id>kyspark</id> <properties> - <sparkbundle.version>3.3</sparkbundle.version> - <sparkshim.artifactId>spark-sql-columnar-shims-spark33</sparkshim.artifactId> - <spark.version>3.3.1</spark.version> + <sparkbundle.version>3.3</sparkbundle.version> + <sparkshim.artifactId>spark-sql-columnar-shims-kyspark</sparkshim.artifactId> + <spark.version>3.3.0-kylin-4.6.27.0-SNAPSHOT</spark.version> <iceberg.version>1.5.0</iceberg.version> <delta.package.name>delta-core</delta.package.name> <delta.version>2.3.0</delta.version> @@ 
-1071,10 +1071,42 @@ <groupId>org.apache.hadoop</groupId> <artifactId>hadoop-client-runtime</artifactId> </exclusion> + <exclusion> + <groupId>log4j</groupId> + <artifactId>log4j</artifactId> + </exclusion> + <exclusion> + <groupId>org.slf4j</groupId> + <artifactId>slf4j-log4j12</artifactId> + </exclusion> + <exclusion> + <groupId>commons-logging</groupId> + <artifactId>commons-logging</artifactId> + </exclusion> + <exclusion> + <groupId>org.apache.tomcat</groupId> + <artifactId>tomcat-annotation-api</artifactId> + </exclusion> + <exclusion> + <groupId>org.apache.tomcat.embed</groupId> + <artifactId>tomcat-embed-core</artifactId> + </exclusion> + <exclusion> + <groupId>com.google.guava</groupId> + <artifactId>guava</artifactId> + </exclusion> <exclusion> <artifactId>protobuf-java</artifactId> <groupId>com.google.protobuf</groupId> </exclusion> + <exclusion> + <groupId>javax.annotation</groupId> + <artifactId>javax.annotation-api</artifactId> + </exclusion> + <exclusion> + <groupId>org.apache.thrift</groupId> + <artifactId>libthrift</artifactId> + </exclusion> </exclusions> </dependency> <dependency> @@ -1111,6 +1143,10 @@ <groupId>org.apache.curator</groupId> <artifactId>curator-recipes</artifactId> </exclusion> + <exclusion> + <groupId>com.google.guava</groupId> + <artifactId>guava</artifactId> + </exclusion> <exclusion> <artifactId>protobuf-java</artifactId> <groupId>com.google.protobuf</groupId> diff --git a/shims/pom.xml b/shims/pom.xml index 38dfb04750..5bf788741e 100644 --- a/shims/pom.xml +++ b/shims/pom.xml @@ -61,7 +61,7 @@ </modules> </profile> <profile> - <id>spark-3.3</id> + <id>kyspark</id> <modules> <module>spark33</module> </modules> diff --git a/shims/spark33/pom.xml b/shims/spark33/pom.xml index 336ff34fba..35136dcecf 100644 --- a/shims/spark33/pom.xml +++ b/shims/spark33/pom.xml @@ -24,7 +24,7 @@ <relativePath>../pom.xml</relativePath> </parent> - <artifactId>spark-sql-columnar-shims-spark33</artifactId> + 
<artifactId>spark-sql-columnar-shims-kyspark</artifactId> <name>Gluten Shims for Spark 3.3</name> <packaging>jar</packaging> diff --git a/shims/spark33/src/main/scala/org/apache/gluten/sql/shims/spark33/Spark33Shims.scala b/shims/spark33/src/main/scala/org/apache/gluten/sql/shims/spark33/Spark33Shims.scala index e8a827fe79..6e3c883320 100644 --- a/shims/spark33/src/main/scala/org/apache/gluten/sql/shims/spark33/Spark33Shims.scala +++ b/shims/spark33/src/main/scala/org/apache/gluten/sql/shims/spark33/Spark33Shims.scala @@ -148,7 +148,7 @@ class Spark33Shims extends SparkShims { f => BucketingUtils .getBucketId(new Path(f.filePath).getName) - .getOrElse(throw invalidBucketFile(f.filePath)) + .getOrElse(throw new IllegalStateException(s"Invalid bucket file ${f.filePath}")) } } @@ -248,13 +248,6 @@ class Spark33Shims extends SparkShims { metadataColumn } - private def invalidBucketFile(path: String): Throwable = { - new SparkException( - errorClass = "INVALID_BUCKET_FILE", - messageParameters = Array(path), - cause = null) - } - override def getExtendedColumnarPostRules(): List[SparkSession => Rule[SparkPlan]] = { List(session => GlutenFormatFactory.getExtendedColumnarPostRule(session)) } diff --git a/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/connector/GlutenMetadataColumnSuite.scala b/shims/spark33/src/main/scala/org/apache/spark/sql/catalyst/expressions/PromotePrecision.scala similarity index 50% copy from gluten-ut/spark33/src/test/scala/org/apache/spark/sql/connector/GlutenMetadataColumnSuite.scala copy to shims/spark33/src/main/scala/org/apache/spark/sql/catalyst/expressions/PromotePrecision.scala index 59a14fb11c..b18a79b864 100644 --- a/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/connector/GlutenMetadataColumnSuite.scala +++ b/shims/spark33/src/main/scala/org/apache/spark/sql/catalyst/expressions/PromotePrecision.scala @@ -14,8 +14,21 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.apache.spark.sql.connector +package org.apache.spark.sql.catalyst.expressions -import org.apache.spark.sql.GlutenSQLTestsBaseTrait +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode} +import org.apache.spark.sql.types._ -class GlutenMetadataColumnSuite extends MetadataColumnSuite with GlutenSQLTestsBaseTrait {} +case class PromotePrecision(child: Expression) extends UnaryExpression { + override def dataType: DataType = child.dataType + override def eval(input: InternalRow): Any = child.eval(input) + override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = + child.genCode(ctx) + override def prettyName: String = "promote_precision" + override def sql: String = child.sql + override lazy val canonicalized: Expression = child.canonicalized + + override protected def withNewChildInternal(newChild: Expression): Expression = + copy(child = newChild) +} diff --git a/shims/spark33/src/main/scala/org/apache/spark/sql/execution/AbstractFileSourceScanExec.scala b/shims/spark33/src/main/scala/org/apache/spark/sql/execution/AbstractFileSourceScanExec.scala index 01df5ba621..b9a5e4f723 100644 --- a/shims/spark33/src/main/scala/org/apache/spark/sql/execution/AbstractFileSourceScanExec.scala +++ b/shims/spark33/src/main/scala/org/apache/spark/sql/execution/AbstractFileSourceScanExec.scala @@ -20,7 +20,6 @@ import org.apache.spark.sql.catalyst.{InternalRow, TableIdentifier} import org.apache.spark.sql.catalyst.catalog.BucketSpec import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.physical.{HashPartitioning, Partitioning, UnknownPartitioning} -import org.apache.spark.sql.errors.QueryExecutionErrors import org.apache.spark.sql.execution.datasources._ import org.apache.spark.sql.execution.datasources.parquet.{ParquetFileFormat => ParquetSource} import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics} @@ -494,7 
+493,7 @@ abstract class AbstractFileSourceScanExec( f => BucketingUtils .getBucketId(new Path(f.filePath).getName) - .getOrElse(throw QueryExecutionErrors.invalidBucketFile(f.filePath)) + .getOrElse(throw new IllegalStateException(s"Invalid bucket file ${f.filePath}")) } val prunedFilesGroupedToBuckets = if (optionalBucketSet.isDefined) { diff --git a/shims/spark33/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala b/shims/spark33/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala index b1e740284b..f5465ad364 100644 --- a/shims/spark33/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala +++ b/shims/spark33/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala @@ -135,7 +135,7 @@ case class InsertIntoHadoopFsRelationCommand( case (SaveMode.Ignore, exists) => !exists case (s, exists) => - throw QueryExecutionErrors.saveModeUnsupportedError(s, exists) + throw QueryExecutionErrors.unsupportedSaveModeError(s.name(), exists) } } --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
