This is an automated email from the ASF dual-hosted git repository.

yangjie01 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
     new 9d1e67cc200 [SPARK-43647][CONNECT][TESTS] Clean up hive classes dir when test `connect-client-jvm` without -Phive
9d1e67cc200 is described below

commit 9d1e67cc200b9315dccdc2f081549dbfe5d1ecd9
Author: yangjie01 <yangji...@baidu.com>
AuthorDate: Fri May 26 09:31:11 2023 +0800

    [SPARK-43647][CONNECT][TESTS] Clean up hive classes dir when test `connect-client-jvm` without -Phive

### What changes were proposed in this pull request?
This PR adds a cleanup step for the `$sparkHome/sql/hive/target/$scalaDir/classes` and `$sparkHome/sql/hive/target/$scalaDir/test-classes` directories before `SimpleSparkConnectService` starts, when test cases that inherit `RemoteSparkSession` run without `-Phive`, to avoid unexpected loading of `sql/hive/target/scala-2.12/classes/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister` by `ServiceLoader`.

### Why are the changes needed?
When we run test cases that inherit `RemoteSparkSession`, the classpath used to launch `SimpleSparkConnectService` includes at least the following directories, for both Maven and SBT:

```
$sparkHome/conf/
$sparkHome/common/kvstore/target/scala-2.12/classes/
$sparkHome/common/network-common/target/scala-2.12/classes/
$sparkHome/common/network-shuffle/target/scala-2.12/classes/
$sparkHome/common/network-yarn/target/scala-2.12/classes
$sparkHome/common/sketch/target/scala-2.12/classes/
$sparkHome/common/tags/target/scala-2.12/classes/
$sparkHome/common/unsafe/target/scala-2.12/classes/
$sparkHome/core/target/scala-2.12/classes/
$sparkHome/examples/target/scala-2.12/classes/
$sparkHome/graphx/target/scala-2.12/classes/
$sparkHome/launcher/target/scala-2.12/classes/
$sparkHome/mllib/target/scala-2.12/classes/
$sparkHome/repl/target/scala-2.12/classes/
$sparkHome/resource-managers/mesos/target/scala-2.12/classes
$sparkHome/resource-managers/yarn/target/scala-2.12/classes
$sparkHome/sql/catalyst/target/scala-2.12/classes/
$sparkHome/sql/core/target/scala-2.12/classes/
$sparkHome/sql/hive/target/scala-2.12/classes/
$sparkHome/sql/hive-thriftserver/target/scala-2.12/classes/
$sparkHome/streaming/target/scala-2.12/classes/
$sparkHome/common/kvstore/target/scala-2.12/test-classes
$sparkHome/common/network-common/target/scala-2.12/test-classes/
$sparkHome/common/network-shuffle/target/scala-2.12/test-classes/
$sparkHome/common/network-yarn/target/scala-2.12/test-classes
$sparkHome/common/sketch/target/scala-2.12/test-classes
$sparkHome/common/tags/target/scala-2.12/test-classes/
$sparkHome/common/unsafe/target/scala-2.12/test-classes
$sparkHome/core/target/scala-2.12/test-classes/
$sparkHome/examples/target/scala-2.12/test-classes
$sparkHome/graphx/target/scala-2.12/test-classes
$sparkHome/launcher/target/scala-2.12/test-classes/
$sparkHome/mllib/target/scala-2.12/test-classes
$sparkHome/repl/target/scala-2.12/test-classes
$sparkHome/resource-managers/mesos/target/scala-2.12/test-classes
$sparkHome/resource-managers/yarn/target/scala-2.12/test-classes
$sparkHome/sql/catalyst/target/scala-2.12/test-classes/
$sparkHome/sql/core/target/scala-2.12/test-classes
$sparkHome/sql/hive/target/scala-2.12/test-classes
$sparkHome/sql/hive-thriftserver/target/scala-2.12/test-classes
$sparkHome/streaming/target/scala-2.12/test-classes
$sparkHome/connector/connect/client/jvm/target/scala-2.12/test-classes/
$sparkHome/connector/connect/common/target/scala-2.12/test-classes/
...
```

So if a test case calls `DataSource#lookupDataSource` and the `hive` module has been compiled, `sql/hive/target/scala-2.12/classes/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister` will be loaded by `ServiceLoader`.
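To make the failure mode concrete, here is a minimal sketch (not Spark's actual `lookupDataSource` implementation; the object name is made up, and it only assumes `spark-sql` is on the classpath) of the kind of `ServiceLoader` scan involved:

```scala
import java.util.ServiceLoader

import scala.collection.JavaConverters._

import org.apache.spark.sql.sources.DataSourceRegister

object DataSourceLookupSketch {
  def main(args: Array[String]): Unit = {
    val loader = Thread.currentThread().getContextClassLoader
    // ServiceLoader reads every META-INF/services/org.apache.spark.sql.sources.DataSourceRegister
    // file visible to the class loader and instantiates each listed provider
    // lazily, as the iterator advances.
    val providers = ServiceLoader.load(classOf[DataSourceRegister], loader)
    providers.asScala.foreach { provider =>
      println(s"${provider.shortName()} -> ${provider.getClass.getName}")
    }
  }
}
```

Because every registered provider is instantiated as the iterator advances, a single provider whose constructor cannot link poisons the lookup for any format, which is why tests that never touch Hive still fail.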
After SPARK-43186 (https://github.com/apache/spark/pull/40848) was merged, `org.apache.spark.sql.hive.execution.HiveFileFormat` changed to use `org.apache.hadoop.hive.ql.plan.FileSinkDesc` instead of `org.apache.spark.sql.hive.HiveShim.ShimFileSinkDesc`, so it now has a hard dependency on `hive-exec`. When there are no Hive-related jars under `assembly/target/$scalaDir/jars/`, initialization of `org.apache.spark.sql.hive.execution.HiveFileFormat` fails, and the tests fail with it. For example, when we run the following commands to test `connect-client-jvm` without `-Phive`:

```
build/mvn clean install -DskipTests
build/mvn test -pl connector/connect/client/jvm
```

the Hive-related jars are not copied to `assembly/target/$scalaDir/jars/`, and we get test errors such as:

**Client side**

```
- read and write *** FAILED ***
  io.grpc.StatusRuntimeException: INTERNAL: org.apache.spark.sql.sources.DataSourceRegister: Provider org.apache.spark.sql.hive.execution.HiveFileFormat could not be instantiated
  at io.grpc.Status.asRuntimeException(Status.java:535)
  at io.grpc.stub.ClientCalls$BlockingResponseStream.hasNext(ClientCalls.java:660)
  at scala.collection.convert.Wrappers$JIteratorWrapper.hasNext(Wrappers.scala:45)
  at scala.collection.Iterator.toStream(Iterator.scala:1417)
  at scala.collection.Iterator.toStream$(Iterator.scala:1416)
  at scala.collection.AbstractIterator.toStream(Iterator.scala:1431)
  at scala.collection.TraversableOnce.toSeq(TraversableOnce.scala:354)
  at scala.collection.TraversableOnce.toSeq$(TraversableOnce.scala:354)
  at scala.collection.AbstractIterator.toSeq(Iterator.scala:1431)
  at org.apache.spark.sql.SparkSession.execute(SparkSession.scala:489)
  ...
```
**Server side**

```
java.util.ServiceConfigurationError: org.apache.spark.sql.sources.DataSourceRegister: Provider org.apache.spark.sql.hive.execution.HiveFileFormat could not be instantiated
  at java.util.ServiceLoader.fail(ServiceLoader.java:232)
  at java.util.ServiceLoader.access$100(ServiceLoader.java:185)
  at java.util.ServiceLoader$LazyIterator.nextService(ServiceLoader.java:384)
  at java.util.ServiceLoader$LazyIterator.next(ServiceLoader.java:404)
  at java.util.ServiceLoader$1.next(ServiceLoader.java:480)
  at scala.collection.convert.Wrappers$JIteratorWrapper.next(Wrappers.scala:46)
  at scala.collection.Iterator.foreach(Iterator.scala:943)
  at scala.collection.Iterator.foreach$(Iterator.scala:943)
  at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
  at scala.collection.IterableLike.foreach(IterableLike.scala:74)
  at scala.collection.IterableLike.foreach$(IterableLike.scala:73)
  at scala.collection.AbstractIterable.foreach(Iterable.scala:56)
  at scala.collection.TraversableLike.filterImpl(TraversableLike.scala:303)
  at scala.collection.TraversableLike.filterImpl$(TraversableLike.scala:297)
  at scala.collection.AbstractTraversable.filterImpl(Traversable.scala:108)
  at scala.collection.TraversableLike.filter(TraversableLike.scala:395)
  at scala.collection.TraversableLike.filter$(TraversableLike.scala:395)
  at scala.collection.AbstractTraversable.filter(Traversable.scala:108)
  at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:629)
  at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSourceV2(DataSource.scala:697)
  at org.apache.spark.sql.DataFrameWriter.lookupV2Provider(DataFrameWriter.scala:860)
  at org.apache.spark.sql.DataFrameWriter.saveAsTable(DataFrameWriter.scala:559)
  at org.apache.spark.sql.connect.planner.SparkConnectPlanner.handleWriteOperation(SparkConnectPlanner.scala:2326)
  at org.apache.spark.sql.connect.planner.SparkConnectPlanner.process(SparkConnectPlanner.scala:2091)
  at org.apache.spark.sql.connect.service.SparkConnectStreamHandler.handleCommand(SparkConnectStreamHandler.scala:120)
  at org.apache.spark.sql.connect.service.SparkConnectStreamHandler.$anonfun$handle$2(SparkConnectStreamHandler.scala:86)
  at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
  at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:825)
  at org.apache.spark.sql.connect.service.SparkConnectStreamHandler.$anonfun$handle$1(SparkConnectStreamHandler.scala:53)
  at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
  at org.apache.spark.util.Utils$.withContextClassLoader(Utils.scala:209)
  at org.apache.spark.sql.connect.artifact.SparkConnectArtifactManager$.withArtifactClassLoader(SparkConnectArtifactManager.scala:178)
  at org.apache.spark.sql.connect.service.SparkConnectStreamHandler.handle(SparkConnectStreamHandler.scala:48)
  at org.apache.spark.sql.connect.service.SparkConnectService.executePlan(SparkConnectService.scala:166)
  at org.apache.spark.connect.proto.SparkConnectServiceGrpc$MethodHandlers.invoke(SparkConnectServiceGrpc.java:611)
  at org.sparkproject.connect.grpc.io.grpc.stub.ServerCalls$UnaryServerCallHandler$UnaryServerCallListener.onHalfClose(ServerCalls.java:182)
  at org.sparkproject.connect.grpc.io.grpc.internal.ServerCallImpl$ServerStreamListenerImpl.halfClosed(ServerCallImpl.java:352)
  at org.sparkproject.connect.grpc.io.grpc.internal.ServerImpl$JumpToApplicationThreadServerStreamListener$1HalfClosed.runInContext(ServerImpl.java:866)
  at org.sparkproject.connect.grpc.io.grpc.internal.ContextRunnable.run(ContextRunnable.java:37)
  at org.sparkproject.connect.grpc.io.grpc.internal.SerializingExecutor.run(SerializingExecutor.java:133)
  at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
  at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
  at java.lang.Thread.run(Thread.java:750)
Caused by: java.lang.NoClassDefFoundError: org/apache/hadoop/hive/ql/plan/FileSinkDesc
  at java.lang.Class.getDeclaredConstructors0(Native Method)
  at java.lang.Class.privateGetDeclaredConstructors(Class.java:2671)
  at java.lang.Class.getConstructor0(Class.java:3075)
  at java.lang.Class.newInstance(Class.java:412)
  at java.util.ServiceLoader$LazyIterator.nextService(ServiceLoader.java:380)
  ... 40 more
Caused by: java.lang.ClassNotFoundException: org.apache.hadoop.hive.ql.plan.FileSinkDesc
  at java.net.URLClassLoader.findClass(URLClassLoader.java:387)
  at java.lang.ClassLoader.loadClass(ClassLoader.java:419)
  at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:352)
  at java.lang.ClassLoader.loadClass(ClassLoader.java:352)
  ... 45 more
```

So this PR proactively cleans up the `$sparkHome/sql/hive/target/$scalaDir/classes` and `$sparkHome/sql/hive/target/$scalaDir/test-classes` directories when `IntegrationTestUtils#isSparkHiveJarAvailable` is false, to guard against the scenario above.
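The guard, `IntegrationTestUtils#isSparkHiveJarAvailable`, is referenced here but its body is not shown in the diff below. A plausible sketch of such a check, with the assembly path and the `spark-hive_` jar-name prefix assumed rather than taken from the source:

```scala
import java.io.File

object HiveJarCheckSketch {
  // In IntegrationTestUtils these values would come from the build
  // environment; here they are stand-ins for illustration.
  private val sparkHome: String = sys.env.getOrElse("SPARK_HOME", ".")
  private val scalaDir: String = "scala-2.12"

  // True when a -Phive build has copied a spark-hive jar into the assembly
  // output, i.e. the Hive code paths can actually be exercised.
  lazy val isSparkHiveJarAvailable: Boolean = {
    val jarsDir = new File(s"$sparkHome/assembly/target/$scalaDir/jars")
    val jars = Option(jarsDir.listFiles()).getOrElse(Array.empty[File])
    jars.exists(_.getName.startsWith("spark-hive_"))
  }
}
```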
### Does this PR introduce _any_ user-facing change?
No, this is test-only.

### How was this patch tested?
- Pass GitHub Actions
- Manual test: the following commands reproduce the problem without this PR.

Maven
```
build/mvn clean install -DskipTests
build/mvn test -pl connector/connect/client/jvm
```

SBT
```
build/sbt package
build/sbt "connect-client-jvm/test"
```

**Before**

Maven: 13 test cases fail with similar errors:

```
- recoverPartitions *** FAILED ***
  io.grpc.StatusRuntimeException: INTERNAL: org.apache.spark.sql.sources.DataSourceRegister: Provider org.apache.spark.sql.hive.execution.HiveFileFormat could not be instantiated
  at io.grpc.Status.asRuntimeException(Status.java:535)
  at io.grpc.stub.ClientCalls$BlockingResponseStream.hasNext(ClientCalls.java:660)
  at scala.collection.convert.Wrappers$JIteratorWrapper.hasNext(Wrappers.scala:45)
  at scala.collection.Iterator.toStream(Iterator.scala:1417)
  at scala.collection.Iterator.toStream$(Iterator.scala:1416)
  at scala.collection.AbstractIterator.toStream(Iterator.scala:1431)
  at scala.collection.TraversableOnce.toSeq(TraversableOnce.scala:354)
  at scala.collection.TraversableOnce.toSeq$(TraversableOnce.scala:354)
  at scala.collection.AbstractIterator.toSeq(Iterator.scala:1431)
  at org.apache.spark.sql.SparkSession.execute(SparkSession.scala:489)
  ...
```

SBT: similar errors occur, and the test run aborts unexpectedly.

**After**

Neither Maven nor SBT shows these test failures anymore.

Closes #41282 from LuciferYang/SPARK-43647.

Authored-by: yangjie01 <yangji...@baidu.com>
Signed-off-by: yangjie01 <yangji...@baidu.com>
---
 .../spark/sql/connect/client/util/IntegrationTestUtils.scala | 12 ++++++++++++
 .../spark/sql/connect/client/util/RemoteSparkSession.scala   |  4 ++++
 2 files changed, 16 insertions(+)

diff --git a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/connect/client/util/IntegrationTestUtils.scala b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/connect/client/util/IntegrationTestUtils.scala
index 7e34726b48e..1c17a3fc36b 100644
--- a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/connect/client/util/IntegrationTestUtils.scala
+++ b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/connect/client/util/IntegrationTestUtils.scala
@@ -23,6 +23,8 @@ import scala.util.Properties.versionNumberString
 
 import org.scalatest.Assertions.fail
 
+import org.apache.spark.util.Utils
+
 object IntegrationTestUtils {
 
   // System properties used for testing and debugging
@@ -77,6 +79,16 @@ object IntegrationTestUtils {
     Files.exists(Paths.get(filePath))
   }
 
+  private[sql] def cleanUpHiveClassesDirIfNeeded(): Unit = {
+    def delete(f: File): Unit = {
+      if (f.exists()) {
+        Utils.deleteRecursively(f)
+      }
+    }
+    delete(new File(s"$sparkHome/sql/hive/target/$scalaDir/classes"))
+    delete(new File(s"$sparkHome/sql/hive/target/$scalaDir/test-classes"))
+  }
+
   /**
    * Find a jar in the Spark project artifacts. It requires a build first (e.g. build/sbt package,
    * build/mvn clean install -DskipTests) so that this method can find the jar in the target
diff --git a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/connect/client/util/RemoteSparkSession.scala b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/connect/client/util/RemoteSparkSession.scala
index b9edf9ac1a5..e05828606d0 100644
--- a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/connect/client/util/RemoteSparkSession.scala
+++ b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/connect/client/util/RemoteSparkSession.scala
@@ -117,6 +117,10 @@ object SparkConnectServerUtils {
           "1. Test with maven: run `build/mvn install -DskipTests -Phive` before testing\n" +
             "2. Test with sbt: run test with `-Phive` profile")
       // scalastyle:on println
+      // SPARK-43647: Proactively cleaning the `classes` and `test-classes` dir of hive
+      // module to avoid unexpected loading of `DataSourceRegister` in hive module during
+      // testing without `-Phive` profile.
+      IntegrationTestUtils.cleanUpHiveClassesDirIfNeeded()
       "in-memory"
     }
     Seq("--conf", s"spark.sql.catalogImplementation=$catalogImplementation")

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org