This is an automated email from the ASF dual-hosted git repository.

srowen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new 5172190  [SPARK-27392][SQL] TestHive test tables should be placed in shared test state, not per session
5172190 is described below

commit 5172190da19a2750e00e1eac00cebfbca1f3c173
Author: Eric Liang <e...@databricks.com>
AuthorDate: Mon Apr 22 11:05:31 2019 -0700

    [SPARK-27392][SQL] TestHive test tables should be placed in shared test state, not per session
    
    ## What changes were proposed in this pull request?
    
    Otherwise, tests that use tables from multiple sessions will run into issues if they access the same table. The correct location is in shared state.
    
    A couple other minor test improvements.
    
    cc gatorsmile srinathshankar
    
    ## How was this patch tested?
    
    Existing unit tests.
    
    Closes #24302 from ericl/test-conflicts.
    
    Lead-authored-by: Eric Liang <e...@databricks.com>
    Co-authored-by: Eric Liang <ekhli...@gmail.com>
    Signed-off-by: Sean Owen <sean.o...@databricks.com>
---
 .../spark/mllib/regression/JavaRidgeRegressionSuite.java   |  7 +++++--
 .../scala/org/apache/spark/sql/hive/test/TestHive.scala    | 14 ++++++++------
 2 files changed, 13 insertions(+), 8 deletions(-)
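
As the commit description notes, the set of loaded test tables now lives in the shared state rather than in each session, so every session created on top of the same shared state agrees on which test tables already exist. The sketch below is a minimal, self-contained illustration of that pattern; `DemoSharedState` and `DemoSession` are illustrative stand-ins, not the real `TestHiveSharedState`/`TestHiveSparkSession` classes.

```scala
import scala.collection.mutable

// Stand-in for the shared state: one instance per warehouse/catalog,
// visible to every session created on top of it.
class DemoSharedState {
  val loadedTables = new mutable.HashSet[String]
}

// Stand-in for a session: it keeps no bookkeeping of its own and
// delegates to the shared state instead.
class DemoSession(val sharedState: DemoSharedState) {
  def loadTestTable(name: String): Unit = {
    // Mark the table as loaded first, so mutually recursive table
    // definitions cannot loop forever.
    if (sharedState.loadedTables.add(name)) {
      println(s"creating test table $name")
    }
  }
}

object SharedStateDemo extends App {
  val shared   = new DemoSharedState
  val session1 = new DemoSession(shared)
  val session2 = new DemoSession(shared)

  session1.loadTestTable("src")
  session2.loadTestTable("src") // no-op: already recorded in shared state

  assert(shared.loadedTables == mutable.HashSet("src"))
}
```

With a per-session set, the second session would have tried to re-create `src` and could conflict with the first; with a shared set, both sessions see the same loaded-table bookkeeping.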

diff --git a/mllib/src/test/java/org/apache/spark/mllib/regression/JavaRidgeRegressionSuite.java b/mllib/src/test/java/org/apache/spark/mllib/regression/JavaRidgeRegressionSuite.java
index cb00977..fb6c775 100644
--- a/mllib/src/test/java/org/apache/spark/mllib/regression/JavaRidgeRegressionSuite.java
+++ b/mllib/src/test/java/org/apache/spark/mllib/regression/JavaRidgeRegressionSuite.java
@@ -17,6 +17,7 @@
 
 package org.apache.spark.mllib.regression;
 
+import java.util.ArrayList;
 import java.util.List;
 import java.util.Random;
 
@@ -55,7 +56,8 @@ public class JavaRidgeRegressionSuite extends SharedSparkSession {
     int numFeatures = 20;
     List<LabeledPoint> data = generateRidgeData(2 * numExamples, numFeatures, 10.0);
 
-    JavaRDD<LabeledPoint> testRDD = jsc.parallelize(data.subList(0, numExamples));
+    JavaRDD<LabeledPoint> testRDD = jsc.parallelize(
+      new ArrayList<LabeledPoint>(data.subList(0, numExamples)));
     List<LabeledPoint> validationData = data.subList(numExamples, 2 * numExamples);
 
     RidgeRegressionWithSGD ridgeSGDImpl = new RidgeRegressionWithSGD();
@@ -79,7 +81,8 @@ public class JavaRidgeRegressionSuite extends SharedSparkSession {
     int numFeatures = 20;
     List<LabeledPoint> data = generateRidgeData(2 * numExamples, numFeatures, 10.0);
 
-    JavaRDD<LabeledPoint> testRDD = jsc.parallelize(data.subList(0, numExamples));
+    JavaRDD<LabeledPoint> testRDD = jsc.parallelize(
+      new ArrayList<LabeledPoint>(data.subList(0, numExamples)));
     List<LabeledPoint> validationData = data.subList(numExamples, 2 * numExamples);
 
     RidgeRegressionModel model = RidgeRegressionWithSGD.train(testRDD.rdd(), 200, 1.0, 0.0);
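
One of the "minor test improvements" mentioned above is copying `data.subList(...)` into a new `ArrayList` before calling `jsc.parallelize`. The likely reason, inferred from the change itself (the commit message does not spell it out), is that `java.util.List#subList` returns a view class that is not `Serializable`, while a fresh `ArrayList` is. A small sketch of that difference:

```scala
import java.util.{ArrayList => JArrayList}

object SubListDemo extends App {
  val data = new JArrayList[Integer]()
  (0 until 10).foreach(i => data.add(Integer.valueOf(i)))

  // subList returns a lightweight view backed by `data`; the view class
  // (e.g. ArrayList$SubList) does not implement java.io.Serializable.
  val view = data.subList(0, 5)

  // Copying into a new ArrayList produces an independent, serializable list,
  // which is what the test above now hands to jsc.parallelize.
  val copy = new JArrayList[Integer](data.subList(0, 5))

  println(view.isInstanceOf[java.io.Serializable]) // false
  println(copy.isInstanceOf[java.io.Serializable]) // true
}
```
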
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala
index 1515807..e8a749f 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala
@@ -92,6 +92,10 @@ private[hive] class TestHiveSharedState(
     hiveClient: Option[HiveClient] = None)
   extends SharedState(sc, initialConfigs = Map.empty[String, String]) {
 
+  // The set of loaded tables should be kept in shared state, since there may be multiple sessions
+  // created that want to use the same tables.
+  val loadedTables = new collection.mutable.HashSet[String]
+
   override lazy val externalCatalog: ExternalCatalogWithListener = {
     new ExternalCatalogWithListener(new TestHiveExternalCatalog(
       sc.conf,
@@ -491,14 +495,12 @@ private[hive] class TestHiveSparkSession(
     hiveQTestUtilTables.foreach(registerTestTable)
   }
 
-  private val loadedTables = new collection.mutable.HashSet[String]
-
-  def getLoadedTables: collection.mutable.HashSet[String] = loadedTables
+  def getLoadedTables: collection.mutable.HashSet[String] = sharedState.loadedTables
 
   def loadTestTable(name: String) {
-    if (!(loadedTables contains name)) {
+    if (!sharedState.loadedTables.contains(name)) {
       // Marks the table as loaded first to prevent infinite mutually recursive table loading.
-      loadedTables += name
+      sharedState.loadedTables += name
       logDebug(s"Loading test table $name")
       val createCmds =
         testTables.get(name).map(_.commands).getOrElse(sys.error(s"Unknown test table $name"))
@@ -545,7 +547,7 @@ private[hive] class TestHiveSparkSession(
       warehouseDir.mkdir()
 
       sharedState.cacheManager.clearCache()
-      loadedTables.clear()
+      sharedState.loadedTables.clear()
       sessionState.catalog.reset()
       metadataHive.reset()
 


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org
