spark git commit: [SPARK-12075][SQL] Speed up HiveComparisonTest by avoiding / speeding up TestHive.reset()

2015-12-01 Thread rxin
Repository: spark
Updated Branches:
  refs/heads/branch-1.6 012de2ce5 -> d77bf0bd9


[SPARK-12075][SQL] Speed up HiveComparisonTest by avoiding / speeding up TestHive.reset()

When profiling HiveCompatibilitySuite, I noticed that most of the time seems to
be spent in expensive `TestHive.reset()` calls. This patch speeds up suites
based on HiveComparisonTest, such as HiveCompatibilitySuite, with the
following changes:

- Avoid `TestHive.reset()` whenever possible:
  - Use a simple set of heuristics to guess whether we need to call `reset()`
    between tests.
  - As a safety net, automatically re-run failed tests by calling `reset()`
    before the re-attempt.
- Speed up the expensive parts of `TestHive.reset()`: loading the `src` and
  `srcpart` tables took roughly 600ms per test, so we now avoid this with a
  simple heuristic that loads those tables only for tests that reference them.
  This is based on simple string matching over the test queries and errs on the
  side of loading tables in more situations than strictly necessary (a sketch
  of the idea follows this list).
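
As a rough illustration of the string-matching idea (a hedged sketch only, not
the code in this patch; the helper name `testTablesToLoad` is invented here):

  // Sketch: pick which test tables to pre-load by scanning the test's SQL text
  // for their names. Plain substring matching over-approximates on purpose: it
  // may load a table the query does not strictly need, but it never skips one
  // that the query mentions (note that "srcpart" also matches "src").
  def testTablesToLoad(sql: String): Seq[String] = {
    val candidateTables = Seq("src", "srcpart")
    val lowerCased = sql.toLowerCase
    candidateTables.filter(table => lowerCased.contains(table))
  }

Each returned name can then be passed to `loadTestTable` before the test's
queries run, instead of loading `src` and `srcpart` unconditionally in
`reset()`.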

After these changes, HiveCompatibilitySuite seems to run in about 10 minutes.

This PR is a revival of #6663, an earlier experimental PR from June, where I 
played around with several possible speedups for this suite.

Author: Josh Rosen 

Closes #10055 from JoshRosen/speculative-testhive-reset.

(cherry picked from commit ef6790fdc3b70b9d6184ec2b3d926e4b0e4b15f6)
Signed-off-by: Reynold Xin 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d77bf0bd
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d77bf0bd
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d77bf0bd

Branch: refs/heads/branch-1.6
Commit: d77bf0bd922835b6a63bb1eeedf91e2a92d92ca9
Parents: 012de2c
Author: Josh Rosen 
Authored: Wed Dec 2 07:29:45 2015 +0800
Committer: Reynold Xin 
Committed: Wed Dec 2 07:30:07 2015 +0800

--
 .../apache/spark/sql/hive/test/TestHive.scala   |  7 --
 .../sql/hive/execution/HiveComparisonTest.scala | 67 ++--
 .../sql/hive/execution/HiveQueryFileTest.scala  |  2 +-
 3 files changed, 62 insertions(+), 14 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/d77bf0bd/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala
--
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala
index 6883d30..2e2d201 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala
@@ -443,13 +443,6 @@ class TestHiveContext(sc: SparkContext) extends HiveContext(sc) {
   defaultOverrides()
 
   runSqlHive("USE default")
-
-  // Just loading src makes a lot of tests pass.  This is because some tests do something like
-  // drop an index on src at the beginning.  Since we just pass DDL to hive this bypasses our
-  // Analyzer and thus the test table auto-loading mechanism.
-  // Remove after we handle more DDL operations natively.
-  loadTestTable("src")
-  loadTestTable("srcpart")
 } catch {
   case e: Exception =>
 logError("FATAL ERROR: Failed to reset TestDB state.", e)

http://git-wip-us.apache.org/repos/asf/spark/blob/d77bf0bd/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala
--
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala
index aa95ba9..4455430 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala
@@ -209,7 +209,11 @@ abstract class HiveComparisonTest
   }
 
   val installHooksCommand = "(?i)SET.*hooks".r
-  def createQueryTest(testCaseName: String, sql: String, reset: Boolean = true) {
+  def createQueryTest(
+  testCaseName: String,
+  sql: String,
+  reset: Boolean = true,
+  tryWithoutResettingFirst: Boolean = false) {
// testCaseName must not contain ':', which is not allowed to appear in a filename of Windows
 assert(!testCaseName.contains(":"))
 
@@ -240,9 +244,6 @@ abstract class HiveComparisonTest
 test(testCaseName) {
   logDebug(s"=== HIVE TEST: $testCaseName ===")
 
-  // Clear old output for this testcase.
outputDirectories.map(new File(_, testCaseName)).filter(_.exists()).foreach(_.delete())
-
   val sqlWithoutComment =
sql.split("\n").filterNot(l => l.matches("--.*(?<=[^\\\\]);")).mkString("\n")

spark git commit: [SPARK-12075][SQL] Speed up HiveComparisonTest by avoiding / speeding up TestHive.reset()

2015-12-01 Thread rxin
Repository: spark
Updated Branches:
  refs/heads/master f292018f8 -> ef6790fdc


[SPARK-12075][SQL] Speed up HiveComparisonTest by avoiding / speeding up TestHive.reset()

When profiling HiveCompatibilitySuite, I noticed that most of the time seems to
be spent in expensive `TestHive.reset()` calls. This patch speeds up suites
based on HiveComparisonTest, such as HiveCompatibilitySuite, with the
following changes:

- Avoid `TestHive.reset()` whenever possible:
  - Use a simple set of heuristics to guess whether we need to call `reset()`
    between tests.
  - As a safety net, automatically re-run failed tests by calling `reset()`
    before the re-attempt (a sketch of this retry appears below the list).
- Speed up the expensive parts of `TestHive.reset()`: loading the `src` and
  `srcpart` tables took roughly 600ms per test, so we now avoid this with a
  simple heuristic that loads those tables only for tests that reference them.
  This is based on simple string matching over the test queries and errs on the
  side of loading tables in more situations than strictly necessary.
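
The safety-net retry can be pictured roughly as follows (a hedged sketch, not
the code in this patch; `runWithSpeculativeReset` is an invented name):

  import org.apache.spark.sql.hive.test.TestHive

  // Sketch: run the test body once against whatever state earlier tests left
  // behind; if it throws, reset TestHive to a clean state and try once more.
  // The second attempt decides whether the test passes or fails.
  def runWithSpeculativeReset(testBody: () => Unit): Unit = {
    try {
      testBody()
    } catch {
      case _: Throwable =>
        TestHive.reset()
        testBody()
    }
  }

Because the re-attempt always starts from a freshly reset state, a failure
caused only by leftover tables or settings from a previous test does not turn
into a spurious test failure.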

After these changes, HiveCompatibilitySuite seems to run in about 10 minutes.

This PR is a revival of #6663, an earlier experimental PR from June, where I 
played around with several possible speedups for this suite.

Author: Josh Rosen 

Closes #10055 from JoshRosen/speculative-testhive-reset.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ef6790fd
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ef6790fd
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ef6790fd

Branch: refs/heads/master
Commit: ef6790fdc3b70b9d6184ec2b3d926e4b0e4b15f6
Parents: f292018
Author: Josh Rosen 
Authored: Wed Dec 2 07:29:45 2015 +0800
Committer: Reynold Xin 
Committed: Wed Dec 2 07:29:45 2015 +0800

--
 .../apache/spark/sql/hive/test/TestHive.scala   |  7 --
 .../sql/hive/execution/HiveComparisonTest.scala | 67 ++--
 .../sql/hive/execution/HiveQueryFileTest.scala  |  2 +-
 3 files changed, 62 insertions(+), 14 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/ef6790fd/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala
--
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala
index 6883d30..2e2d201 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala
@@ -443,13 +443,6 @@ class TestHiveContext(sc: SparkContext) extends HiveContext(sc) {
   defaultOverrides()
 
   runSqlHive("USE default")
-
-  // Just loading src makes a lot of tests pass.  This is because some tests do something like
-  // drop an index on src at the beginning.  Since we just pass DDL to hive this bypasses our
-  // Analyzer and thus the test table auto-loading mechanism.
-  // Remove after we handle more DDL operations natively.
-  loadTestTable("src")
-  loadTestTable("srcpart")
 } catch {
   case e: Exception =>
 logError("FATAL ERROR: Failed to reset TestDB state.", e)

http://git-wip-us.apache.org/repos/asf/spark/blob/ef6790fd/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala
--
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala
index aa95ba9..4455430 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala
@@ -209,7 +209,11 @@ abstract class HiveComparisonTest
   }
 
   val installHooksCommand = "(?i)SET.*hooks".r
-  def createQueryTest(testCaseName: String, sql: String, reset: Boolean = true) {
+  def createQueryTest(
+  testCaseName: String,
+  sql: String,
+  reset: Boolean = true,
+  tryWithoutResettingFirst: Boolean = false) {
// testCaseName must not contain ':', which is not allowed to appear in a filename of Windows
 assert(!testCaseName.contains(":"))
 
@@ -240,9 +244,6 @@ abstract class HiveComparisonTest
 test(testCaseName) {
   logDebug(s"=== HIVE TEST: $testCaseName ===")
 
-  // Clear old output for this testcase.
-  outputDirectories.map(new File(_, testCaseName)).filter(_.exists()).foreach(_.delete())
-
   val sqlWithoutComment =
sql.split("\n").filterNot(l => l.matches("--.*(?<=[^\\\\]);")).mkString("\n")
   val allQueries =
@@ -269,11 +270,32 @@ abstract class HiveComparisonTest