spark git commit: [SPARK-12075][SQL] Speed up HiveComparisionTest by avoiding / speeding up TestHive.reset()

rxin Tue, 01 Dec 2015 15:41:37 -0800

Repository: spark
Updated Branches:
  refs/heads/branch-1.6 012de2ce5 -> d77bf0bd9



[SPARK-12075][SQL] Speed up HiveComparisionTest by avoiding / speeding up 
TestHive.reset()

When profiling HiveCompatibilitySuite, I noticed that most of the time seems to 
be spent in expensive `TestHive.reset()` calls. This patch speeds up suites 
based on HiveComparisionTest, such as HiveCompatibilitySuite, with the 
following changes:

- Avoid `TestHive.reset()` whenever possible:
  - Use a simple set of heuristics to guess whether we need to call `reset()` 
in between tests.
  - As a safety-net, automatically re-run failed tests by calling `reset()` 
before the re-attempt.
- Speed up the expensive parts of `TestHive.reset()`: loading the `src` and 
`srcpart` tables took roughly 600ms per test, so we now avoid this by using a 
simple heuristic which only loads those tables by tests that reference them. 
This is based on simple string matching over the test queries which errs on the 
side of loading in more situations than might be strictly necessary.

After these changes, HiveCompatibilitySuite seems to run in about 10 minutes.

This PR is a revival of #6663, an earlier experimental PR from June, where I 
played around with several possible speedups for this suite.

Author: Josh Rosen <joshro...@databricks.com>

Closes #10055 from JoshRosen/speculative-testhive-reset.

(cherry picked from commit ef6790fdc3b70b9d6184ec2b3d926e4b0e4b15f6)
Signed-off-by: Reynold Xin <r...@databricks.com>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d77bf0bd
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d77bf0bd
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d77bf0bd

Branch: refs/heads/branch-1.6
Commit: d77bf0bd922835b6a63bb1eeedf91e2a92d92ca9
Parents: 012de2c
Author: Josh Rosen <joshro...@databricks.com>
Authored: Wed Dec 2 07:29:45 2015 +0800
Committer: Reynold Xin <r...@databricks.com>
Committed: Wed Dec 2 07:30:07 2015 +0800

----------------------------------------------------------------------
 .../apache/spark/sql/hive/test/TestHive.scala   |  7 --
 .../sql/hive/execution/HiveComparisonTest.scala | 67 ++++++++++++++++++--
 .../sql/hive/execution/HiveQueryFileTest.scala  |  2 +-
 3 files changed, 62 insertions(+), 14 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/d77bf0bd/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala
----------------------------------------------------------------------
diff --git 
a/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala 
b/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala
index 6883d30..2e2d201 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala
@@ -443,13 +443,6 @@ class TestHiveContext(sc: SparkContext) extends 
HiveContext(sc) {
       defaultOverrides()
 
       runSqlHive("USE default")
-
-      // Just loading src makes a lot of tests pass.  This is because some 
tests do something like
-      // drop an index on src at the beginning.  Since we just pass DDL to 
hive this bypasses our
-      // Analyzer and thus the test table auto-loading mechanism.
-      // Remove after we handle more DDL operations natively.
-      loadTestTable("src")
-      loadTestTable("srcpart")
     } catch {
       case e: Exception =>
         logError("FATAL ERROR: Failed to reset TestDB state.", e)

http://git-wip-us.apache.org/repos/asf/spark/blob/d77bf0bd/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala
----------------------------------------------------------------------
diff --git 
a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala
 
b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala
index aa95ba9..4455430 100644
--- 
a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala
+++ 
b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala
@@ -209,7 +209,11 @@ abstract class HiveComparisonTest
   }
 
   val installHooksCommand = "(?i)SET.*hooks".r
-  def createQueryTest(testCaseName: String, sql: String, reset: Boolean = 
true) {
+  def createQueryTest(
+      testCaseName: String,
+      sql: String,
+      reset: Boolean = true,
+      tryWithoutResettingFirst: Boolean = false) {
     // testCaseName must not contain ':', which is not allowed to appear in a 
filename of Windows
     assert(!testCaseName.contains(":"))
 
@@ -240,9 +244,6 @@ abstract class HiveComparisonTest
     test(testCaseName) {
       logDebug(s"=== HIVE TEST: $testCaseName ===")
 
-      // Clear old output for this testcase.
-      outputDirectories.map(new File(_, 
testCaseName)).filter(_.exists()).foreach(_.delete())
-
       val sqlWithoutComment =
         sql.split("\n").filterNot(l => 
l.matches("--.*(?<=[^\\\\]);")).mkString("\n")
       val allQueries =
@@ -269,11 +270,32 @@ abstract class HiveComparisonTest
         }.mkString("\n== Console version of this test ==\n", "\n", "\n")
       }
 
-      try {
+      def doTest(reset: Boolean, isSpeculative: Boolean = false): Unit = {
+        // Clear old output for this testcase.
+        outputDirectories.map(new File(_, 
testCaseName)).filter(_.exists()).foreach(_.delete())
+
         if (reset) {
           TestHive.reset()
         }
 
+        // Many tests drop indexes on src and srcpart at the beginning, so we 
need to load those
+        // tables here. Since DROP INDEX DDL is just passed to Hive, it 
bypasses the analyzer and
+        // thus the tables referenced in those DDL commands cannot be 
extracted for use by our
+        // test table auto-loading mechanism. In addition, the tests which use 
the SHOW TABLES
+        // command expect these tables to exist.
+        val hasShowTableCommand = 
queryList.exists(_.toLowerCase.contains("show tables"))
+        for (table <- Seq("src", "srcpart")) {
+          val hasMatchingQuery = queryList.exists { query =>
+            val normalizedQuery = query.toLowerCase.stripSuffix(";")
+            normalizedQuery.endsWith(table) ||
+              normalizedQuery.contains(s"from $table") ||
+              normalizedQuery.contains(s"from default.$table")
+          }
+          if (hasShowTableCommand || hasMatchingQuery) {
+            TestHive.loadTestTable(table)
+          }
+        }
+
         val hiveCacheFiles = queryList.zipWithIndex.map {
           case (queryString, i) =>
             val cachedAnswerName = s"$testCaseName-$i-${getMd5(queryString)}"
@@ -430,12 +452,45 @@ abstract class HiveComparisonTest
                 """.stripMargin
 
               stringToFile(new File(wrongDirectory, testCaseName), 
errorMessage + consoleTestCase)
-              fail(errorMessage)
+              if (isSpeculative && !reset) {
+                fail("Failed on first run; retrying")
+              } else {
+                fail(errorMessage)
+              }
             }
         }
 
         // Touch passed file.
         new FileOutputStream(new File(passedDirectory, testCaseName)).close()
+      }
+
+      val canSpeculativelyTryWithoutReset: Boolean = {
+        val excludedSubstrings = Seq(
+          "into table",
+          "create table",
+          "drop index"
+        )
+        !queryList.map(_.toLowerCase).exists { query =>
+          excludedSubstrings.exists(s => query.contains(s))
+        }
+      }
+
+      try {
+        try {
+          if (tryWithoutResettingFirst && canSpeculativelyTryWithoutReset) {
+            doTest(reset = false, isSpeculative = true)
+          } else {
+            doTest(reset)
+          }
+        } catch {
+          case tf: org.scalatest.exceptions.TestFailedException =>
+            if (tryWithoutResettingFirst && canSpeculativelyTryWithoutReset) {
+              logWarning("Test failed without reset(); retrying with reset()")
+              doTest(reset = true)
+            } else {
+              throw tf
+            }
+        }
       } catch {
         case tf: org.scalatest.exceptions.TestFailedException => throw tf
         case originalException: Exception =>

http://git-wip-us.apache.org/repos/asf/spark/blob/d77bf0bd/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQueryFileTest.scala
----------------------------------------------------------------------
diff --git 
a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQueryFileTest.scala
 
b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQueryFileTest.scala
index f7b37da..f96c989 100644
--- 
a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQueryFileTest.scala
+++ 
b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQueryFileTest.scala
@@ -59,7 +59,7 @@ abstract class HiveQueryFileTest extends HiveComparisonTest {
         runAll) {
         // Build a test case and submit it to scala test framework...
         val queriesString = fileToString(testCaseFile)
-        createQueryTest(testCaseName, queriesString)
+        createQueryTest(testCaseName, queriesString, reset = true, 
tryWithoutResettingFirst = true)
       } else {
         // Only output warnings for the built in whitelist as this clutters 
the output when the user
         // trying to execute a single test from the commandline.


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

spark git commit: [SPARK-12075][SQL] Speed up HiveComparisionTest by avoiding / speeding up TestHive.reset()

Reply via email to