This is an automated email from the ASF dual-hosted git repository.

yangjie01 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new df63adf7343 [SPARK-43646][CONNECT][TESTS] Make both SBT and Maven use `spark-proto` uber jar to test the `connect` module
df63adf7343 is described below

commit df63adf734370f5c2d71a348f9d36658718b302c
Author: yangjie01 <yangji...@baidu.com>
AuthorDate: Tue Aug 29 11:15:23 2023 +0800

    [SPARK-43646][CONNECT][TESTS] Make both SBT and Maven use `spark-proto` uber jar to test the `connect` module
    
    ### What changes were proposed in this pull request?
    Before this PR, when testing the `connect` module, Maven used the shaded `spark-protobuf` jar while SBT used the original jar, which led to inconsistent test behavior: some tests passed with SBT but failed with Maven.
    
    Running
    
    ```
    build/mvn clean install -DskipTests
    build/mvn test -pl connector/connect/server
    ```
    
    fails two tests:
    
    ```
    - from_protobuf_messageClassName *** FAILED ***
      org.apache.spark.sql.AnalysisException: [CANNOT_LOAD_PROTOBUF_CLASS] Could not load Protobuf class with name org.apache.spark.connect.proto.StorageLevel. org.apache.spark.connect.proto.StorageLevel does not extend shaded Protobuf Message class org.sparkproject.spark_protobuf.protobuf.Message. The jar with Protobuf classes needs to be shaded (com.google.protobuf.* --> org.sparkproject.spark_protobuf.protobuf.*).
      at org.apache.spark.sql.errors.QueryCompilationErrors$.protobufClassLoadError(QueryCompilationErrors.scala:3417)
      at org.apache.spark.sql.protobuf.utils.ProtobufUtils$.buildDescriptorFromJavaClass(ProtobufUtils.scala:193)
      at org.apache.spark.sql.protobuf.utils.ProtobufUtils$.buildDescriptor(ProtobufUtils.scala:151)
      at org.apache.spark.sql.protobuf.ProtobufDataToCatalyst.messageDescriptor$lzycompute(ProtobufDataToCatalyst.scala:58)
      at org.apache.spark.sql.protobuf.ProtobufDataToCatalyst.messageDescriptor(ProtobufDataToCatalyst.scala:57)
      at org.apache.spark.sql.protobuf.ProtobufDataToCatalyst.dataType$lzycompute(ProtobufDataToCatalyst.scala:43)
      at org.apache.spark.sql.protobuf.ProtobufDataToCatalyst.dataType(ProtobufDataToCatalyst.scala:42)
      at org.apache.spark.sql.catalyst.expressions.Alias.toAttribute(namedExpressions.scala:194)
      at org.apache.spark.sql.catalyst.plans.logical.Project.$anonfun$output$1(basicLogicalOperators.scala:72)
      at scala.collection.TraversableLike.$anonfun$map$1(TraversableLike.scala:286)
    
    - from_protobuf_messageClassName_options *** FAILED ***
      org.apache.spark.sql.AnalysisException: [CANNOT_LOAD_PROTOBUF_CLASS] Could not load Protobuf class with name org.apache.spark.connect.proto.StorageLevel. org.apache.spark.connect.proto.StorageLevel does not extend shaded Protobuf Message class org.sparkproject.spark_protobuf.protobuf.Message. The jar with Protobuf classes needs to be shaded (com.google.protobuf.* --> org.sparkproject.spark_protobuf.protobuf.*).
      at org.apache.spark.sql.errors.QueryCompilationErrors$.protobufClassLoadError(QueryCompilationErrors.scala:3417)
      at org.apache.spark.sql.protobuf.utils.ProtobufUtils$.buildDescriptorFromJavaClass(ProtobufUtils.scala:193)
      at org.apache.spark.sql.protobuf.utils.ProtobufUtils$.buildDescriptor(ProtobufUtils.scala:151)
      at org.apache.spark.sql.protobuf.ProtobufDataToCatalyst.messageDescriptor$lzycompute(ProtobufDataToCatalyst.scala:58)
      at org.apache.spark.sql.protobuf.ProtobufDataToCatalyst.messageDescriptor(ProtobufDataToCatalyst.scala:57)
      at org.apache.spark.sql.protobuf.ProtobufDataToCatalyst.dataType$lzycompute(ProtobufDataToCatalyst.scala:43)
      at org.apache.spark.sql.protobuf.ProtobufDataToCatalyst.dataType(ProtobufDataToCatalyst.scala:42)
      at org.apache.spark.sql.catalyst.expressions.Alias.toAttribute(namedExpressions.scala:194)
      at org.apache.spark.sql.catalyst.plans.logical.Project.$anonfun$output$1(basicLogicalOperators.scala:72)
      at scala.collection.TraversableLike.$anonfun$map$1(TraversableLike.scala:286)
    ```
    
    So this PR makes SBT also use the `spark-proto` uber jar (`spark-protobuf-assembly-**-SNAPSHOT.jar`) for the above tests and refactors the test cases so that they pass with both SBT and Maven.
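    
    For reference, the package rewrite both builds apply to the generated test sources amounts to the following shell sketch (illustration only, assuming GNU sed; the actual replacement is performed by `maven-antrun-plugin` on the Maven side and a custom SBT task on the SBT side):
    
    ```
    # rewrite protoc-generated Java sources to match the shaded uber jar
    find target/generated-test-sources -name '*.java' \
      -exec sed -i 's/com\.google\.protobuf\./org.sparkproject.spark_protobuf.protobuf./g' {} +
    ```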
    
    ### Why are the changes needed?
    Make the `connect` server module tests pass with both SBT and Maven.
    
    ### Does this PR introduce _any_ user-facing change?
    No
    
    ### How was this patch tested?
    - Pass GitHub Actions
    - Manual check
    
    ```
    build/mvn clean install -DskipTests
    build/mvn test -pl connector/connect/server
    ```
    
    All tests passed after this PR.
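    
    The SBT side can be checked in a similar way. A sketch (the `protobuf/assembly` task matches the `LocalProject("protobuf") / assembly` reference in the diff below; the `connect/test` project id is an assumption):
    
    ```
    # build the spark-protobuf uber jar, then run the connect server tests against it
    build/sbt "protobuf/assembly"
    build/sbt "connect/test"
    ```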
    
    Closes #42236 from LuciferYang/protobuf-test.
    
    Authored-by: yangjie01 <yangji...@baidu.com>
    Signed-off-by: yangjie01 <yangji...@baidu.com>
---
 .../apache/spark/sql/PlanGenerationTestSuite.scala |   5 +-
 .../from_protobuf_messageClassName.explain         |   2 +-
 .../from_protobuf_messageClassName_options.explain |   2 +-
 .../queries/from_protobuf_messageClassName.json    |   2 +-
 .../from_protobuf_messageClassName.proto.bin       | Bin 125 -> 131 bytes
 .../from_protobuf_messageClassName_options.json    |   2 +-
 ...rom_protobuf_messageClassName_options.proto.bin | Bin 174 -> 182 bytes
 connector/connect/server/pom.xml                   |  88 +++++++++++++++++++++
 .../connect/server/src/test/protobuf/test.proto    |  27 +++++++
 project/SparkBuild.scala                           |  55 ++++++++++++-
 10 files changed, 175 insertions(+), 8 deletions(-)

diff --git a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala
index ccd68f75bda..df416ef93d8 100644
--- a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala
+++ b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala
@@ -3236,14 +3236,15 @@ class PlanGenerationTestSuite
     "connect/common/src/test/resources/protobuf-tests/common.desc"
 
   test("from_protobuf messageClassName") {
-    binary.select(pbFn.from_protobuf(fn.col("bytes"), classOf[StorageLevel].getName))
+    binary.select(
+      pbFn.from_protobuf(fn.col("bytes"), "org.apache.spark.sql.protobuf.protos.TestProtoObj"))
   }
 
   test("from_protobuf messageClassName options") {
     binary.select(
       pbFn.from_protobuf(
         fn.col("bytes"),
-        classOf[StorageLevel].getName,
+        "org.apache.spark.sql.protobuf.protos.TestProtoObj",
         Map("recursive.fields.max.depth" -> "2").asJava))
   }
 
diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/from_protobuf_messageClassName.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/from_protobuf_messageClassName.explain
index e7a1867fe90..6f48cb090cd 100644
--- a/connector/connect/common/src/test/resources/query-tests/explain-results/from_protobuf_messageClassName.explain
+++ b/connector/connect/common/src/test/resources/query-tests/explain-results/from_protobuf_messageClassName.explain
@@ -1,2 +1,2 @@
-Project [from_protobuf(bytes#0, org.apache.spark.connect.proto.StorageLevel, None) AS from_protobuf(bytes)#0]
+Project [from_protobuf(bytes#0, org.apache.spark.sql.protobuf.protos.TestProtoObj, None) AS from_protobuf(bytes)#0]
 +- LocalRelation <empty>, [id#0L, bytes#0]
diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/from_protobuf_messageClassName_options.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/from_protobuf_messageClassName_options.explain
index c02d829fcac..ba87e4774f1 100644
--- a/connector/connect/common/src/test/resources/query-tests/explain-results/from_protobuf_messageClassName_options.explain
+++ b/connector/connect/common/src/test/resources/query-tests/explain-results/from_protobuf_messageClassName_options.explain
@@ -1,2 +1,2 @@
-Project [from_protobuf(bytes#0, org.apache.spark.connect.proto.StorageLevel, None, (recursive.fields.max.depth,2)) AS from_protobuf(bytes)#0]
+Project [from_protobuf(bytes#0, org.apache.spark.sql.protobuf.protos.TestProtoObj, None, (recursive.fields.max.depth,2)) AS from_protobuf(bytes)#0]
 +- LocalRelation <empty>, [id#0L, bytes#0]
diff --git a/connector/connect/common/src/test/resources/query-tests/queries/from_protobuf_messageClassName.json b/connector/connect/common/src/test/resources/query-tests/queries/from_protobuf_messageClassName.json
index dc23ac2a117..6c5891e7016 100644
--- a/connector/connect/common/src/test/resources/query-tests/queries/from_protobuf_messageClassName.json
+++ b/connector/connect/common/src/test/resources/query-tests/queries/from_protobuf_messageClassName.json
@@ -20,7 +20,7 @@
           }
         }, {
           "literal": {
-            "string": "org.apache.spark.connect.proto.StorageLevel"
+            "string": "org.apache.spark.sql.protobuf.protos.TestProtoObj"
           }
         }]
       }
diff --git a/connector/connect/common/src/test/resources/query-tests/queries/from_protobuf_messageClassName.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/from_protobuf_messageClassName.proto.bin
index cc46234b747..9d7aeaf3089 100644
Binary files a/connector/connect/common/src/test/resources/query-tests/queries/from_protobuf_messageClassName.proto.bin and b/connector/connect/common/src/test/resources/query-tests/queries/from_protobuf_messageClassName.proto.bin differ
diff --git a/connector/connect/common/src/test/resources/query-tests/queries/from_protobuf_messageClassName_options.json b/connector/connect/common/src/test/resources/query-tests/queries/from_protobuf_messageClassName_options.json
index 36f69646ef8..691e144eabd 100644
--- a/connector/connect/common/src/test/resources/query-tests/queries/from_protobuf_messageClassName_options.json
+++ b/connector/connect/common/src/test/resources/query-tests/queries/from_protobuf_messageClassName_options.json
@@ -20,7 +20,7 @@
           }
         }, {
           "literal": {
-            "string": "org.apache.spark.connect.proto.StorageLevel"
+            "string": "org.apache.spark.sql.protobuf.protos.TestProtoObj"
           }
         }, {
           "unresolvedFunction": {
diff --git a/connector/connect/common/src/test/resources/query-tests/queries/from_protobuf_messageClassName_options.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/from_protobuf_messageClassName_options.proto.bin
index 72a1c6b8207..89000e473fa 100644
Binary files a/connector/connect/common/src/test/resources/query-tests/queries/from_protobuf_messageClassName_options.proto.bin and b/connector/connect/common/src/test/resources/query-tests/queries/from_protobuf_messageClassName_options.proto.bin differ
diff --git a/connector/connect/server/pom.xml b/connector/connect/server/pom.xml
index e98b8da8e5c..34c919bc614 100644
--- a/connector/connect/server/pom.xml
+++ b/connector/connect/server/pom.xml
@@ -248,6 +248,13 @@
     </dependency>
   </dependencies>
   <build>
+    <extensions>
+      <extension>
+        <groupId>kr.motd.maven</groupId>
+        <artifactId>os-maven-plugin</artifactId>
+        <version>1.6.2</version>
+      </extension>
+    </extensions>
     <outputDirectory>target/scala-${scala.binary.version}/classes</outputDirectory>
     <testOutputDirectory>target/scala-${scala.binary.version}/test-classes</testOutputDirectory>
     <plugins>
@@ -403,6 +410,87 @@
           </transformers>
         </configuration>
       </plugin>
+      <!--
+        SPARK-43646: In order for `ProtoToParsedPlanTestSuite` to successfully test `from_protobuf_messageClassName`
+        and `from_protobuf_messageClassName_options`, `maven-antrun-plugin` is used to replace all
+        `com.google.protobuf.` with `org.sparkproject.spark_protobuf.protobuf.` in the Java files
+        generated by `protobuf-maven-plugin`.
+      -->
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-antrun-plugin</artifactId>
+        <executions>
+          <execution>
+            <phase>process-test-sources</phase>
+            <configuration>
+              <target>
+                <replace dir="${project.basedir}/target/generated-test-sources"
+                         includes="**/*.java"
+                         token="com.google.protobuf."
+                         value="org.sparkproject.spark_protobuf.protobuf."/>
+              </target>
+            </configuration>
+            <goals>
+              <goal>run</goal>
+            </goals>
+          </execution>
+        </executions>
+      </plugin>
     </plugins>
   </build>
+  <profiles>
+    <profile>
+      <id>default-protoc</id>
+      <activation>
+        <activeByDefault>true</activeByDefault>
+      </activation>
+      <build>
+        <plugins>
+          <!-- Add protobuf-maven-plugin and provide ScalaPB as a code generation plugin -->
+          <plugin>
+            <groupId>org.xolstice.maven.plugins</groupId>
+            <artifactId>protobuf-maven-plugin</artifactId>
+            <version>0.6.1</version>
+            <configuration>
+              <protocArtifact>com.google.protobuf:protoc:${protobuf.version}:exe:${os.detected.classifier}</protocArtifact>
+              <protoTestSourceRoot>src/test/protobuf</protoTestSourceRoot>
+            </configuration>
+            <executions>
+              <execution>
+                <goals>
+                  <goal>test-compile</goal>
+                </goals>
+              </execution>
+            </executions>
+          </plugin>
+        </plugins>
+      </build>
+    </profile>
+    <profile>
+      <id>user-defined-protoc</id>
+      <properties>
+        <spark.protoc.executable.path>${env.SPARK_PROTOC_EXEC_PATH}</spark.protoc.executable.path>
+      </properties>
+      <build>
+        <plugins>
+          <plugin>
+            <groupId>org.xolstice.maven.plugins</groupId>
+            <artifactId>protobuf-maven-plugin</artifactId>
+            <version>0.6.1</version>
+            <configuration>
+              <protocExecutable>${spark.protoc.executable.path}</protocExecutable>
+              <protoTestSourceRoot>src/test/protobuf</protoTestSourceRoot>
+            </configuration>
+            <executions>
+              <execution>
+                <goals>
+                  <goal>test-compile</goal>
+                </goals>
+              </execution>
+            </executions>
+          </plugin>
+        </plugins>
+      </build>
+    </profile>
+  </profiles>
 </project>
diff --git a/connector/connect/server/src/test/protobuf/test.proto b/connector/connect/server/src/test/protobuf/test.proto
new file mode 100644
index 00000000000..844f89ba81f
--- /dev/null
+++ b/connector/connect/server/src/test/protobuf/test.proto
@@ -0,0 +1,27 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+syntax = "proto3";
+package org.apache.spark.sql.protobuf.protos;
+
+option java_multiple_files = true;
+option java_outer_classname = "TestProto";
+
+message TestProtoObj {
+  int64 v1 = 1;
+  int32 v2 = 2;
+}
diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala
index 2f437eeb75c..978165ba0da 100644
--- a/project/SparkBuild.scala
+++ b/project/SparkBuild.scala
@@ -17,7 +17,7 @@
 
 import java.io._
 import java.nio.charset.StandardCharsets.UTF_8
-import java.nio.file.Files
+import java.nio.file.{Files, StandardOpenOption}
 import java.util.Locale
 
 import scala.io.Source
@@ -765,7 +765,13 @@ object SparkConnectCommon {
 object SparkConnect {
   import BuildCommons.protoVersion
 
+  val rewriteJavaFile = TaskKey[Unit]("rewriteJavaFile",
+    "Rewrite the generated Java PB files.")
+  val genPBAndRewriteJavaFile = TaskKey[Unit]("genPBAndRewriteJavaFile",
+    "Generate Java PB files and overwrite their contents.")
+
   lazy val settings = Seq(
+    PB.protocVersion := BuildCommons.protoVersion,
    // For some reason the resolution from the imported Maven build does not work for some
     // of these dependendencies that we need to shade later on.
     libraryDependencies ++= {
@@ -796,6 +802,42 @@ object SparkConnect {
       )
     },
 
+    // SPARK-43646: The following 3 statements are used to make the connect module use the
+    // Spark-proto assembly jar when compiling and testing with SBT, which keeps the behavior
+    // consistent with Maven testing.
+    (Test / unmanagedJars) += (LocalProject("protobuf") / assembly).value,
+    (Test / fullClasspath) :=
+      (Test / fullClasspath).value.filterNot { f => f.toString.contains("spark-protobuf") },
+    (Test / fullClasspath) += (LocalProject("protobuf") / assembly).value,
+
+    (Test / PB.protoSources) += (Test / sourceDirectory).value / "resources" / "protobuf",
+
+    (Test / PB.targets) := Seq(
+      PB.gens.java -> target.value / "generated-test-sources",
+    ),
+
+    // SPARK-43646: Create a custom task to replace all `com.google.protobuf.` with
+    // `org.sparkproject.spark_protobuf.protobuf.` in the generated Java PB files.
+    // This is to generate Java files that can be used to test spark-protobuf functions
+    // in `ProtoToParsedPlanTestSuite`.
+    rewriteJavaFile := {
+      val protobufDir = target.value / "generated-test-sources"/"org"/"apache"/"spark"/"sql"/"protobuf"/"protos"
+      protobufDir.listFiles().foreach { f =>
+        if (f.getName.endsWith(".java")) {
+          val contents = Files.readAllLines(f.toPath, UTF_8)
+          val replaced = contents.asScala.map { line =>
+            line.replaceAll("com.google.protobuf.", "org.sparkproject.spark_protobuf.protobuf.")
+          }
+          Files.write(f.toPath, replaced.asJava, StandardOpenOption.TRUNCATE_EXISTING, StandardOpenOption.WRITE)
+        }
+      }
+    },
+    // SPARK-43646: `genPBAndRewriteJavaFile` is used to specify the execution order of `PB.generate`
+    // and `rewriteJavaFile`, and makes `Test / compile` dependent on `genPBAndRewriteJavaFile`
+    // being executed first.
+    genPBAndRewriteJavaFile := Def.sequential(Test / PB.generate, rewriteJavaFile).value,
+    (Test / compile) := (Test / compile).dependsOn(genPBAndRewriteJavaFile).value,
+
     (assembly / test) := { },
 
     (assembly / logLevel) := Level.Info,
@@ -841,7 +883,16 @@ object SparkConnect {
      case m if m.toLowerCase(Locale.ROOT).endsWith(".proto") => MergeStrategy.discard
       case _ => MergeStrategy.first
     }
-  )
+  ) ++ {
+    val sparkProtocExecPath = sys.props.get("spark.protoc.executable.path")
+    if (sparkProtocExecPath.isDefined) {
+      Seq(
+        PB.protocExecutable := file(sparkProtocExecPath.get)
+      )
+    } else {
+      Seq.empty
+    }
+  }
 }
 
 object SparkConnectClient {


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org
