svn commit: r30903 - in /dev/spark/3.0.0-SNAPSHOT-2018_11_14_20_33-03306a6-docs: ./ _site/ _site/api/ _site/api/R/ _site/api/java/ _site/api/java/lib/ _site/api/java/org/ _site/api/java/org/apache/ _s

2018-11-14 Thread pwendell
Author: pwendell
Date: Thu Nov 15 04:45:51 2018
New Revision: 30903

Log:
Apache Spark 3.0.0-SNAPSHOT-2018_11_14_20_33-03306a6 docs


[This commit notification would consist of 1755 parts,
which exceeds the limit of 50 parts, so it was shortened to a summary.]

-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[3/4] spark git commit: [SPARK-26036][PYTHON] Break large tests.py files into smaller files

2018-11-14 Thread gurwls223
http://git-wip-us.apache.org/repos/asf/spark/blob/03306a6d/python/pyspark/tests.py
--
diff --git a/python/pyspark/tests.py b/python/pyspark/tests.py
deleted file mode 100644
index 131c51e..000
--- a/python/pyspark/tests.py
+++ /dev/null
@@ -1,2502 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements.  See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License.  You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-"""
-Unit tests for PySpark; additional tests are implemented as doctests in
-individual modules.
-"""
-
-from array import array
-from glob import glob
-import os
-import re
-import shutil
-import subprocess
-import sys
-import tempfile
-import time
-import zipfile
-import random
-import threading
-import hashlib
-
-from py4j.protocol import Py4JJavaError
-try:
-    import xmlrunner
-except ImportError:
-    xmlrunner = None
-
-if sys.version_info[:2] <= (2, 6):
-    try:
-        import unittest2 as unittest
-    except ImportError:
-        sys.stderr.write('Please install unittest2 to test with Python 2.6 or earlier')
-        sys.exit(1)
-else:
-    import unittest
-    if sys.version_info[0] >= 3:
-        xrange = range
-        basestring = str
-
-if sys.version >= "3":
-    from io import StringIO
-else:
-    from StringIO import StringIO
-
-
-from pyspark import keyword_only
-from pyspark.conf import SparkConf
-from pyspark.context import SparkContext
-from pyspark.rdd import RDD
-from pyspark.files import SparkFiles
-from pyspark.serializers import read_int, BatchedSerializer, MarshalSerializer, PickleSerializer, \
-    CloudPickleSerializer, CompressedSerializer, UTF8Deserializer, NoOpSerializer, \
-    PairDeserializer, CartesianDeserializer, AutoBatchedSerializer, AutoSerializer, \
-    FlattenedValuesSerializer
-from pyspark.shuffle import Aggregator, ExternalMerger, ExternalSorter
-from pyspark import shuffle
-from pyspark.profiler import BasicProfiler
-from pyspark.taskcontext import BarrierTaskContext, TaskContext
-
-_have_scipy = False
-_have_numpy = False
-try:
-    import scipy.sparse
-    _have_scipy = True
-except:
-    # No SciPy, but that's okay, we'll skip those tests
-    pass
-try:
-    import numpy as np
-    _have_numpy = True
-except:
-    # No NumPy, but that's okay, we'll skip those tests
-    pass
-
-
-SPARK_HOME = os.environ["SPARK_HOME"]
-
-
-class MergerTests(unittest.TestCase):
-
-    def setUp(self):
-        self.N = 1 << 12
-        self.l = [i for i in xrange(self.N)]
-        self.data = list(zip(self.l, self.l))
-        self.agg = Aggregator(lambda x: [x],
-                              lambda x, y: x.append(y) or x,
-                              lambda x, y: x.extend(y) or x)
-
-    def test_small_dataset(self):
-        m = ExternalMerger(self.agg, 1000)
-        m.mergeValues(self.data)
-        self.assertEqual(m.spills, 0)
-        self.assertEqual(sum(sum(v) for k, v in m.items()),
-                         sum(xrange(self.N)))
-
-        m = ExternalMerger(self.agg, 1000)
-        m.mergeCombiners(map(lambda x_y1: (x_y1[0], [x_y1[1]]), self.data))
-        self.assertEqual(m.spills, 0)
-        self.assertEqual(sum(sum(v) for k, v in m.items()),
-                         sum(xrange(self.N)))
-
-    def test_medium_dataset(self):
-        m = ExternalMerger(self.agg, 20)
-        m.mergeValues(self.data)
-        self.assertTrue(m.spills >= 1)
-        self.assertEqual(sum(sum(v) for k, v in m.items()),
-                         sum(xrange(self.N)))
-
-        m = ExternalMerger(self.agg, 10)
-        m.mergeCombiners(map(lambda x_y2: (x_y2[0], [x_y2[1]]), self.data * 3))
-        self.assertTrue(m.spills >= 1)
-        self.assertEqual(sum(sum(v) for k, v in m.items()),
-                         sum(xrange(self.N)) * 3)
-
-    def test_huge_dataset(self):
-        m = ExternalMerger(self.agg, 5, partitions=3)
-        m.mergeCombiners(map(lambda k_v: (k_v[0], [str(k_v[1])]), self.data * 10))
-        self.assertTrue(m.spills >= 1)
-        self.assertEqual(sum(len(v) for k, v in m.items()),
-                         self.N * 10)
-        m._cleanup()
-
-    def test_group_by_key(self):
-
-        def gen_data(N, step):
-            for i in range(1, N + 1, step):
-                for j in range(i):
-                    yield (i, [j])
-
-   

[1/4] spark git commit: [SPARK-26036][PYTHON] Break large tests.py files into smaller files

2018-11-14 Thread gurwls223
Repository: spark
Updated Branches:
  refs/heads/master f6255d7b7 -> 03306a6df


http://git-wip-us.apache.org/repos/asf/spark/blob/03306a6d/python/pyspark/tests/test_readwrite.py
--
diff --git a/python/pyspark/tests/test_readwrite.py b/python/pyspark/tests/test_readwrite.py
new file mode 100644
index 000..e45f5b3
--- /dev/null
+++ b/python/pyspark/tests/test_readwrite.py
@@ -0,0 +1,499 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import os
+import shutil
+import sys
+import tempfile
+import unittest
+from array import array
+
+from pyspark.testing.utils import ReusedPySparkTestCase, SPARK_HOME
+
+
+class InputFormatTests(ReusedPySparkTestCase):
+
+    @classmethod
+    def setUpClass(cls):
+        ReusedPySparkTestCase.setUpClass()
+        cls.tempdir = tempfile.NamedTemporaryFile(delete=False)
+        os.unlink(cls.tempdir.name)
+        cls.sc._jvm.WriteInputFormatTestDataGenerator.generateData(cls.tempdir.name, cls.sc._jsc)
+
+    @classmethod
+    def tearDownClass(cls):
+        ReusedPySparkTestCase.tearDownClass()
+        shutil.rmtree(cls.tempdir.name)
+
+    @unittest.skipIf(sys.version >= "3", "serialize array of byte")
+    def test_sequencefiles(self):
+        basepath = self.tempdir.name
+        ints = sorted(self.sc.sequenceFile(basepath + "/sftestdata/sfint/",
+                                           "org.apache.hadoop.io.IntWritable",
+                                           "org.apache.hadoop.io.Text").collect())
+        ei = [(1, u'aa'), (1, u'aa'), (2, u'aa'), (2, u'bb'), (2, u'bb'), (3, u'cc')]
+        self.assertEqual(ints, ei)
+
+        doubles = sorted(self.sc.sequenceFile(basepath + "/sftestdata/sfdouble/",
+                                              "org.apache.hadoop.io.DoubleWritable",
+                                              "org.apache.hadoop.io.Text").collect())
+        ed = [(1.0, u'aa'), (1.0, u'aa'), (2.0, u'aa'), (2.0, u'bb'), (2.0, u'bb'), (3.0, u'cc')]
+        self.assertEqual(doubles, ed)
+
+        bytes = sorted(self.sc.sequenceFile(basepath + "/sftestdata/sfbytes/",
+                                            "org.apache.hadoop.io.IntWritable",
+                                            "org.apache.hadoop.io.BytesWritable").collect())
+        ebs = [(1, bytearray('aa', 'utf-8')),
+               (1, bytearray('aa', 'utf-8')),
+               (2, bytearray('aa', 'utf-8')),
+               (2, bytearray('bb', 'utf-8')),
+               (2, bytearray('bb', 'utf-8')),
+               (3, bytearray('cc', 'utf-8'))]
+        self.assertEqual(bytes, ebs)
+
+        text = sorted(self.sc.sequenceFile(basepath + "/sftestdata/sftext/",
+                                           "org.apache.hadoop.io.Text",
+                                           "org.apache.hadoop.io.Text").collect())
+        et = [(u'1', u'aa'),
+              (u'1', u'aa'),
+              (u'2', u'aa'),
+              (u'2', u'bb'),
+              (u'2', u'bb'),
+              (u'3', u'cc')]
+        self.assertEqual(text, et)
+
+        bools = sorted(self.sc.sequenceFile(basepath + "/sftestdata/sfbool/",
+                                            "org.apache.hadoop.io.IntWritable",
+                                            "org.apache.hadoop.io.BooleanWritable").collect())
+        eb = [(1, False), (1, True), (2, False), (2, False), (2, True), (3, True)]
+        self.assertEqual(bools, eb)
+
+        nulls = sorted(self.sc.sequenceFile(basepath + "/sftestdata/sfnull/",
+                                            "org.apache.hadoop.io.IntWritable",
+                                            "org.apache.hadoop.io.BooleanWritable").collect())
+        en = [(1, None), (1, None), (2, None), (2, None), (2, None), (3, None)]
+        self.assertEqual(nulls, en)
+
+        maps = self.sc.sequenceFile(basepath + "/sftestdata/sfmap/",
+                                    "org.apache.hadoop.io.IntWritable",
+                                    "org.apache.hadoop.io.MapWritable").collect()
+        em = [(1, {}),
+              (1, {3.0: u'bb'}),
+              (2, {1.0: u'aa'}),
+              (2, {1.0: u'cc'}),
+              (3, {2.0:

[2/4] spark git commit: [SPARK-26036][PYTHON] Break large tests.py files into smaller files

2018-11-14 Thread gurwls223
http://git-wip-us.apache.org/repos/asf/spark/blob/03306a6d/python/pyspark/tests/__init__.py
--
diff --git a/python/pyspark/tests/__init__.py b/python/pyspark/tests/__init__.py
new file mode 100644
index 000..12bdf0d
--- /dev/null
+++ b/python/pyspark/tests/__init__.py
@@ -0,0 +1,16 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#

http://git-wip-us.apache.org/repos/asf/spark/blob/03306a6d/python/pyspark/tests/test_appsubmit.py
--
diff --git a/python/pyspark/tests/test_appsubmit.py b/python/pyspark/tests/test_appsubmit.py
new file mode 100644
index 000..92bcb11
--- /dev/null
+++ b/python/pyspark/tests/test_appsubmit.py
@@ -0,0 +1,248 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import os
+import re
+import shutil
+import subprocess
+import tempfile
+import unittest
+import zipfile
+
+
+class SparkSubmitTests(unittest.TestCase):
+
+    def setUp(self):
+        self.programDir = tempfile.mkdtemp()
+        tmp_dir = tempfile.gettempdir()
+        self.sparkSubmit = [
+            os.path.join(os.environ.get("SPARK_HOME"), "bin", "spark-submit"),
+            "--conf", "spark.driver.extraJavaOptions=-Djava.io.tmpdir={0}".format(tmp_dir),
+            "--conf", "spark.executor.extraJavaOptions=-Djava.io.tmpdir={0}".format(tmp_dir),
+        ]
+
+    def tearDown(self):
+        shutil.rmtree(self.programDir)
+
+    def createTempFile(self, name, content, dir=None):
+        """
+        Create a temp file with the given name and content and return its path.
+        Strips leading spaces from content up to the first '|' in each line.
+        """
+        pattern = re.compile(r'^ *\|', re.MULTILINE)
+        content = re.sub(pattern, '', content.strip())
+        if dir is None:
+            path = os.path.join(self.programDir, name)
+        else:
+            os.makedirs(os.path.join(self.programDir, dir))
+            path = os.path.join(self.programDir, dir, name)
+        with open(path, "w") as f:
+            f.write(content)
+        return path
+
+    def createFileInZip(self, name, content, ext=".zip", dir=None, zip_name=None):
+        """
+        Create a zip archive containing a file with the given content and return its path.
+        Strips leading spaces from content up to the first '|' in each line.
+        """
+        pattern = re.compile(r'^ *\|', re.MULTILINE)
+        content = re.sub(pattern, '', content.strip())
+        if dir is None:
+            path = os.path.join(self.programDir, name + ext)
+        else:
+            path = os.path.join(self.programDir, dir, zip_name + ext)
+        zip = zipfile.ZipFile(path, 'w')
+        zip.writestr(name, content)
+        zip.close()
+        return path
+
+    def create_spark_package(self, artifact_name):
+        group_id, artifact_id, version = artifact_name.split(":")
+        self.createTempFile("%s-%s.pom" % (artifact_id, version), ("""
+            |<?xml version="1.0" encoding="UTF-8"?>
+            |<project xmlns="http://maven.apache.org/POM/4.0.0"
+            |   xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+            |   xsi:schemaLocation="http://maven.apache.org/POM/4.0.0
+            |   http://maven.apache.org/xsd/maven-4.0.0.xsd">
+            |   <modelVersion>4.0.0</modelVersion>
+            |   <groupId>%s</groupId>
+            |   <artifactId>%s</artifactId>
+            |   <version>%s</version>
+            |</project>
+            """ % (group_id, artifact_id, version)).lstrip(),
+

svn commit: r30902 - in /dev/spark/3.0.0-SNAPSHOT-2018_11_14_16_25-ad853c5-docs: ./ _site/ _site/api/ _site/api/R/ _site/api/java/ _site/api/java/lib/ _site/api/java/org/ _site/api/java/org/apache/ _s

2018-11-14 Thread pwendell
Author: pwendell
Date: Thu Nov 15 00:38:01 2018
New Revision: 30902

Log:
Apache Spark 3.0.0-SNAPSHOT-2018_11_14_16_25-ad853c5 docs


[This commit notification would consist of 1755 parts,
which exceeds the limit of 50 parts, so it was shortened to a summary.]

-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



spark git commit: [MINOR][SQL] Add disable bucketedRead workaround when throw RuntimeException

2018-11-14 Thread gurwls223
Repository: spark
Updated Branches:
  refs/heads/master ad853c567 -> f6255d7b7


[MINOR][SQL] Add disable bucketedRead workaround when throw RuntimeException

## What changes were proposed in this pull request?
It will throw `RuntimeException` when reading from a bucketed table (about 1.7 GB per bucket file):
![image](https://user-images.githubusercontent.com/5399861/48346889-8041ce00-e6b7-11e8-83b0-ead83fb15821.png)

Default (bucketed read enabled):
![image](https://user-images.githubusercontent.com/5399861/48347084-2c83b480-e6b8-11e8-913a-9cafc043e9e4.png)

Bucketed read disabled:
![image](https://user-images.githubusercontent.com/5399861/48347099-3a393a00-e6b8-11e8-94af-cb814e1ba277.png)

The reason is that each bucket file is too big; a workaround is to disable bucketed reads. This PR adds that workaround to Spark's error message.
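For reference, a hedged usage sketch of the three knobs the improved message points at, assuming an active `SparkSession` named `spark`. The literal key strings below are my reading of the `SQLConf` entries named in the diff, so double-check them against your Spark version:

```scala
// Workarounds suggested by the new error message (key strings are assumptions; verify in SQLConf):
spark.conf.set("spark.sql.sources.bucketing.enabled", "false")       // disable bucketed reads entirely
spark.conf.set("spark.sql.parquet.columnarReaderBatchSize", "1024")  // or shrink the vectorized batch size
spark.conf.set("spark.sql.parquet.enableVectorizedReader", "false")  // or disable the vectorized reader
```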

## How was this patch tested?

manual tests

Closes #23014 from wangyum/anotherWorkaround.

Authored-by: Yuming Wang 
Signed-off-by: hyukjinkwon 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f6255d7b
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f6255d7b
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f6255d7b

Branch: refs/heads/master
Commit: f6255d7b7cc4cc5d1f4fe0e5e493a1efee22f38f
Parents: ad853c5
Author: Yuming Wang 
Authored: Thu Nov 15 08:33:06 2018 +0800
Committer: hyukjinkwon 
Committed: Thu Nov 15 08:33:06 2018 +0800

--
 .../spark/sql/execution/vectorized/WritableColumnVector.java| 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/f6255d7b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java
--
diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java
index b0e119d..4f5e72c 100644
--- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java
+++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java
@@ -101,10 +101,11 @@ public abstract class WritableColumnVector extends ColumnVector {
     String message = "Cannot reserve additional contiguous bytes in the vectorized reader (" +
         (requiredCapacity >= 0 ? "requested " + requiredCapacity + " bytes" : "integer overflow") +
         "). As a workaround, you can reduce the vectorized reader batch size, or disable the " +
-        "vectorized reader. For parquet file format, refer to " +
+        "vectorized reader, or disable " + SQLConf.BUCKETING_ENABLED().key() + " if you read " +
+        "from bucket table. For Parquet file format, refer to " +
         SQLConf.PARQUET_VECTORIZED_READER_BATCH_SIZE().key() +
         " (default " + SQLConf.PARQUET_VECTORIZED_READER_BATCH_SIZE().defaultValueString() +
-        ") and " + SQLConf.PARQUET_VECTORIZED_READER_ENABLED().key() + "; for orc file format, " +
+        ") and " + SQLConf.PARQUET_VECTORIZED_READER_ENABLED().key() + "; for ORC file format, " +
         "refer to " + SQLConf.ORC_VECTORIZED_READER_BATCH_SIZE().key() +
         " (default " + SQLConf.ORC_VECTORIZED_READER_BATCH_SIZE().defaultValueString() +
         ") and " + SQLConf.ORC_VECTORIZED_READER_ENABLED().key() + ".";


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



spark git commit: [SPARK-25956] Make Scala 2.12 as default Scala version in Spark 3.0

2018-11-14 Thread dongjoon
Repository: spark
Updated Branches:
  refs/heads/master 2977e2312 -> ad853c567


[SPARK-25956] Make Scala 2.12 as default Scala version in Spark 3.0

## What changes were proposed in this pull request?

This PR makes Scala 2.12 Spark's default Scala version, with Scala 2.11 as the alternative version. This implies that Scala 2.12 will be used by our CI builds, including pull request builds.

We'll update Jenkins to include a new compile-only job for Scala 2.11 to ensure the code can still be compiled with Scala 2.11.

## How was this patch tested?

existing tests

Closes #22967 from dbtsai/scala2.12.

Authored-by: DB Tsai 
Signed-off-by: Dongjoon Hyun 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ad853c56
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ad853c56
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ad853c56

Branch: refs/heads/master
Commit: ad853c56788fd32e035369d1fe3d96aaf6c4ef16
Parents: 2977e23
Author: DB Tsai 
Authored: Wed Nov 14 16:22:23 2018 -0800
Committer: Dongjoon Hyun 
Committed: Wed Nov 14 16:22:23 2018 -0800

--
 assembly/pom.xml|  4 +--
 common/kvstore/pom.xml  |  4 +--
 common/network-common/pom.xml   |  4 +--
 common/network-shuffle/pom.xml  |  4 +--
 common/network-yarn/pom.xml |  4 +--
 common/sketch/pom.xml   |  4 +--
 common/tags/pom.xml |  4 +--
 common/unsafe/pom.xml   |  4 +--
 core/pom.xml|  4 +--
 dev/deps/spark-deps-hadoop-2.7  | 36 ++--
 dev/deps/spark-deps-hadoop-3.1  | 36 ++--
 docs/_config.yml|  4 +--
 docs/_plugins/copy_api_dirs.rb  |  2 +-
 docs/building-spark.md  | 18 +-
 docs/cloud-integration.md   |  2 +-
 docs/sparkr.md  |  2 +-
 examples/pom.xml|  4 +--
 external/avro/pom.xml   |  4 +--
 external/docker-integration-tests/pom.xml   |  4 +--
 external/kafka-0-10-assembly/pom.xml|  4 +--
 external/kafka-0-10-sql/pom.xml |  4 +--
 external/kafka-0-10/pom.xml |  4 +--
 external/kinesis-asl-assembly/pom.xml   |  4 +--
 external/kinesis-asl/pom.xml|  4 +--
 external/spark-ganglia-lgpl/pom.xml |  4 +--
 graphx/pom.xml  |  4 +--
 hadoop-cloud/pom.xml|  4 +--
 launcher/pom.xml|  4 +--
 mllib-local/pom.xml |  4 +--
 mllib/pom.xml   |  4 +--
 pom.xml | 20 ++-
 project/MimaBuild.scala |  2 +-
 project/SparkBuild.scala| 14 
 python/run-tests.py |  4 +--
 repl/pom.xml|  4 +--
 resource-managers/kubernetes/core/pom.xml   |  4 +--
 .../kubernetes/integration-tests/pom.xml|  4 +--
 resource-managers/mesos/pom.xml |  4 +--
 resource-managers/yarn/pom.xml  |  4 +--
 sql/catalyst/pom.xml|  4 +--
 sql/core/pom.xml|  4 +--
 sql/hive-thriftserver/pom.xml   |  4 +--
 sql/hive/pom.xml|  4 +--
 streaming/pom.xml   |  4 +--
 tools/pom.xml   |  4 +--
 45 files changed, 138 insertions(+), 138 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/ad853c56/assembly/pom.xml
--
diff --git a/assembly/pom.xml b/assembly/pom.xml
index b0337e5..68ebfad 100644
--- a/assembly/pom.xml
+++ b/assembly/pom.xml
@@ -20,12 +20,12 @@
   <modelVersion>4.0.0</modelVersion>
   <parent>
     <groupId>org.apache.spark</groupId>
-    <artifactId>spark-parent_2.11</artifactId>
+    <artifactId>spark-parent_2.12</artifactId>
     <version>3.0.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
-  <artifactId>spark-assembly_2.11</artifactId>
+  <artifactId>spark-assembly_2.12</artifactId>
   <name>Spark Project Assembly</name>
   <url>http://spark.apache.org/</url>
   <packaging>pom</packaging>

http://git-wip-us.apache.org/repos/asf/spark/blob/ad853c56/common/kvstore/pom.xml
--
diff --git a/common/kvstore/pom.xml b/common/kvstore/pom.xml
index 23a0f49..f042a12 100644
--- a/common/kvstore/pom.xml
+++ b/common/kvstore/pom.xml
@@ -21,12 +21,12 @@
   <modelVersion>4.0.0</modelVersion>
   <parent>
     <groupId>org.apache.spark</groupId>
-    <artifactId>spark-parent_2.11</artifactId>
+    <artifactId>spark-parent_2.12</artifactId>
     <version>3.0.0-SNAPSHOT</version>
     <relativePath>../../pom.xml</relativePath>
   </parent>
 
-  

svn commit: r30901 - in /dev/spark/2.4.1-SNAPSHOT-2018_11_14_14_19-ba638a7-docs: ./ _site/ _site/api/ _site/api/R/ _site/api/java/ _site/api/java/lib/ _site/api/java/org/ _site/api/java/org/apache/ _s

2018-11-14 Thread pwendell
Author: pwendell
Date: Wed Nov 14 22:33:54 2018
New Revision: 30901

Log:
Apache Spark 2.4.1-SNAPSHOT-2018_11_14_14_19-ba638a7 docs


[This commit notification would consist of 1476 parts,
which exceeds the limit of 50 parts, so it was shortened to a summary.]

-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



spark git commit: [SPARK-25986][BUILD] Add rules to ban throw Errors in application code

2018-11-14 Thread srowen
Repository: spark
Updated Branches:
  refs/heads/master 2b671e729 -> 2977e2312


[SPARK-25986][BUILD] Add rules to ban throw Errors in application code

## What changes were proposed in this pull request?

Add Scala and Java lint check rules to ban the usage of `throw new xxxErrors` and fix up all existing instances, following https://github.com/apache/spark/pull/22989#issuecomment-437939830. See more details in https://github.com/apache/spark/pull/22969.
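As an illustration only (my sketch, not code from the patch), this is the kind of pattern the new rules flag in application code and the kind of rewrite they push toward:

```scala
// Hypothetical example: throwing a JVM Error from application code is what the lint rule bans;
// raising an exception the application owns is the preferred rewrite.
def parseMode(mode: String): Int = mode match {
  case "fast" => 0
  case "safe" => 1
  // Banned by the new rule: throw new InternalError(s"unknown mode: $other")
  case other  => throw new IllegalArgumentException(s"unknown mode: $other")
}
```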

## How was this patch tested?

Local test with lint-scala and lint-java.

Closes #22989 from xuanyuanking/SPARK-25986.

Authored-by: Yuanjian Li 
Signed-off-by: Sean Owen 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2977e231
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2977e231
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2977e231

Branch: refs/heads/master
Commit: 2977e2312d9690c9ced3c86b0ce937819e957775
Parents: 2b671e7
Author: Yuanjian Li 
Authored: Wed Nov 14 13:05:18 2018 -0800
Committer: Sean Owen 
Committed: Wed Nov 14 13:05:18 2018 -0800

--
 .../spark/unsafe/UnsafeAlignedOffset.java   |  4 +++
 .../org/apache/spark/memory/MemoryConsumer.java |  2 ++
 .../apache/spark/memory/TaskMemoryManager.java  |  4 +++
 .../unsafe/sort/UnsafeInMemorySorter.java   |  2 ++
 .../spark/util/random/RandomSampler.scala   |  2 +-
 .../scala/org/apache/spark/FailureSuite.scala   |  2 ++
 .../apache/spark/executor/ExecutorSuite.scala   |  2 ++
 .../spark/scheduler/TaskResultGetterSuite.scala |  2 ++
 .../spark/storage/BlockManagerSuite.scala   |  2 +-
 dev/checkstyle.xml  | 13 +---
 .../spark/streaming/kafka010/KafkaUtils.scala   |  2 +-
 .../org/apache/spark/ml/linalg/Vectors.scala|  2 +-
 .../spark/ml/classification/NaiveBayes.scala|  8 ++---
 .../org/apache/spark/ml/param/params.scala  |  4 +--
 .../spark/ml/tuning/ValidatorParams.scala   |  4 +--
 .../spark/mllib/classification/NaiveBayes.scala |  2 +-
 .../org/apache/spark/mllib/linalg/Vectors.scala |  2 +-
 .../org/apache/spark/ml/PredictorSuite.scala|  6 ++--
 .../ml/classification/ClassifierSuite.scala | 11 ---
 .../ml/classification/NaiveBayesSuite.scala |  4 +--
 .../ml/classification/OneVsRestSuite.scala  | 16 +-
 .../spark/ml/feature/VectorIndexerSuite.scala   |  4 ++-
 .../spark/ml/tree/impl/RandomForestSuite.scala  |  6 ++--
 .../apache/spark/ml/tree/impl/TreeTests.scala   |  6 ++--
 .../spark/ml/tuning/CrossValidatorSuite.scala   | 32 ++--
 .../ml/tuning/TrainValidationSplitSuite.scala   | 12 
 .../ml/tuning/ValidatorParamsSuiteHelpers.scala |  3 +-
 .../mllib/classification/NaiveBayesSuite.scala  |  2 +-
 .../spark/mllib/clustering/KMeansSuite.scala|  2 +-
 .../spark/mllib/tree/DecisionTreeSuite.scala| 15 -
 scalastyle-config.xml   | 11 +++
 .../aggregate/TungstenAggregationIterator.scala |  2 ++
 .../spark/sql/FileBasedDataSourceSuite.scala|  9 +++---
 .../spark/sql/execution/PlannerSuite.scala  |  2 +-
 .../datasources/FileSourceStrategySuite.scala   |  2 +-
 .../vectorized/ColumnarBatchSuite.scala |  2 +-
 .../apache/spark/streaming/util/StateMap.scala  |  2 +-
 .../spark/streaming/InputStreamsSuite.scala |  5 +--
 .../apache/spark/streaming/StateMapSuite.scala  |  2 +-
 39 files changed, 128 insertions(+), 87 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/2977e231/common/unsafe/src/main/java/org/apache/spark/unsafe/UnsafeAlignedOffset.java
--
diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/UnsafeAlignedOffset.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/UnsafeAlignedOffset.java
index be62e40..546e878 100644
--- a/common/unsafe/src/main/java/org/apache/spark/unsafe/UnsafeAlignedOffset.java
+++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/UnsafeAlignedOffset.java
@@ -39,7 +39,9 @@ public class UnsafeAlignedOffset {
       case 8:
         return (int)Platform.getLong(object, offset);
       default:
+        // checkstyle.off: RegexpSinglelineJava
         throw new AssertionError("Illegal UAO_SIZE");
+        // checkstyle.on: RegexpSinglelineJava
     }
   }
 
@@ -52,7 +54,9 @@ public class UnsafeAlignedOffset {
         Platform.putLong(object, offset, value);
         break;
       default:
+        // checkstyle.off: RegexpSinglelineJava
         throw new AssertionError("Illegal UAO_SIZE");
+        // checkstyle.on: RegexpSinglelineJava
     }
   }
 }

http://git-wip-us.apache.org/repos/asf/spark/blob/2977e231/core/src/main/java/org/apache/spark/memory/MemoryConsumer.java
--
diff --git 

spark git commit: [SPARK-25778] WriteAheadLogBackedBlockRDD in YARN Cluster Mode Fails …

2018-11-14 Thread vanzin
Repository: spark
Updated Branches:
  refs/heads/branch-2.4 ca426bfa5 -> ba638a783


[SPARK-25778] WriteAheadLogBackedBlockRDD in YARN Cluster Mode Fails …

…due to lack of access to tmpDir from $PWD to HDFS

WriteAheadLogBackedBlockRDD's use of java.io.tmpdir will fail if $PWD resolves to a folder in HDFS and the Spark YARN cluster job does not have the correct access to this folder with regard to the dummy directory. So this patch provides an option, spark.streaming.receiver.blockStore.tmpdir, to override java.io.tmpdir, which is set to $PWD in YARN cluster mode.

## What changes were proposed in this pull request?
This change provides an option to override the java.io.tmpdir setting so that, when $PWD is resolved in YARN cluster mode, Spark does not attempt to use that folder and instead uses the folder provided via the following option: spark.streaming.receiver.blockStore.tmpdir
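Independent of the config option, the one-line change in the diff (getAbsolutePath to toURI.toString) matters because of how Hadoop resolves bare paths. A minimal sketch of the distinction, assuming nothing beyond the JDK:

```scala
// A minimal sketch (my illustration, not Spark code): a bare absolute path handed to Hadoop is
// resolved against fs.defaultFS (typically HDFS in YARN cluster mode), while a file: URI is
// unambiguously local, which is what the WAL's dummy directory needs to be.
import java.io.File
import java.util.UUID

object TmpDirUriDemo {
  def main(args: Array[String]): Unit = {
    val dir = new File(System.getProperty("java.io.tmpdir"), UUID.randomUUID().toString)
    println(dir.getAbsolutePath)  // e.g. /tmp/3f2c... -> ambiguous, resolved against the default FS
    println(dir.toURI.toString)   // e.g. file:/tmp/3f2c... -> always the local file system
  }
}
```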

## How was this patch tested?
Patch was manually tested on a Spark Streaming Job with Write Ahead logs in 
Cluster mode.

Closes #22867 from gss2002/SPARK-25778.

Authored-by: gss2002 
Signed-off-by: Marcelo Vanzin 
(cherry picked from commit 2b671e729250b980aa9e4ea2d483f44fa0e129cb)
Signed-off-by: Marcelo Vanzin 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ba638a78
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ba638a78
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ba638a78

Branch: refs/heads/branch-2.4
Commit: ba638a783442f6a5b7b8e0a363edfb398eb2b6c7
Parents: ca426bf
Author: gss2002 
Authored: Wed Nov 14 13:02:13 2018 -0800
Committer: Marcelo Vanzin 
Committed: Wed Nov 14 13:02:24 2018 -0800

--
 .../apache/spark/streaming/rdd/WriteAheadLogBackedBlockRDD.scala   | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/ba638a78/streaming/src/main/scala/org/apache/spark/streaming/rdd/WriteAheadLogBackedBlockRDD.scala
--
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/rdd/WriteAheadLogBackedBlockRDD.scala b/streaming/src/main/scala/org/apache/spark/streaming/rdd/WriteAheadLogBackedBlockRDD.scala
index 844760a..f677c49 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/rdd/WriteAheadLogBackedBlockRDD.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/rdd/WriteAheadLogBackedBlockRDD.scala
@@ -136,7 +136,7 @@ class WriteAheadLogBackedBlockRDD[T: ClassTag](
         // this dummy directory should not already exist otherwise the WAL will try to recover
         // past events from the directory and throw errors.
         val nonExistentDirectory = new File(
-          System.getProperty("java.io.tmpdir"), UUID.randomUUID().toString).getAbsolutePath
+          System.getProperty("java.io.tmpdir"), UUID.randomUUID().toString).toURI.toString
         writeAheadLog = WriteAheadLogUtils.createLogForReceiver(
           SparkEnv.get.conf, nonExistentDirectory, hadoopConf)
         dataRead = writeAheadLog.read(partition.walRecordHandle)


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



spark git commit: [SPARK-25778] WriteAheadLogBackedBlockRDD in YARN Cluster Mode Fails …

2018-11-14 Thread vanzin
Repository: spark
Updated Branches:
  refs/heads/master 722369ee5 -> 2b671e729


[SPARK-25778] WriteAheadLogBackedBlockRDD in YARN Cluster Mode Fails …

…due to lack of access to tmpDir from $PWD to HDFS

WriteAheadLogBackedBlockRDD's use of java.io.tmpdir will fail if $PWD resolves to a folder in HDFS and the Spark YARN cluster job does not have the correct access to this folder with regard to the dummy directory. So this patch provides an option, spark.streaming.receiver.blockStore.tmpdir, to override java.io.tmpdir, which is set to $PWD in YARN cluster mode.

## What changes were proposed in this pull request?
This change provides an option to override the java.io.tmpdir setting so that, when $PWD is resolved in YARN cluster mode, Spark does not attempt to use that folder and instead uses the folder provided via the following option: spark.streaming.receiver.blockStore.tmpdir

## How was this patch tested?
Patch was manually tested on a Spark Streaming Job with Write Ahead logs in 
Cluster mode.

Closes #22867 from gss2002/SPARK-25778.

Authored-by: gss2002 
Signed-off-by: Marcelo Vanzin 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2b671e72
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2b671e72
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2b671e72

Branch: refs/heads/master
Commit: 2b671e729250b980aa9e4ea2d483f44fa0e129cb
Parents: 722369e
Author: gss2002 
Authored: Wed Nov 14 13:02:13 2018 -0800
Committer: Marcelo Vanzin 
Committed: Wed Nov 14 13:02:13 2018 -0800

--
 .../apache/spark/streaming/rdd/WriteAheadLogBackedBlockRDD.scala   | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/2b671e72/streaming/src/main/scala/org/apache/spark/streaming/rdd/WriteAheadLogBackedBlockRDD.scala
--
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/rdd/WriteAheadLogBackedBlockRDD.scala b/streaming/src/main/scala/org/apache/spark/streaming/rdd/WriteAheadLogBackedBlockRDD.scala
index 844760a..f677c49 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/rdd/WriteAheadLogBackedBlockRDD.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/rdd/WriteAheadLogBackedBlockRDD.scala
@@ -136,7 +136,7 @@ class WriteAheadLogBackedBlockRDD[T: ClassTag](
         // this dummy directory should not already exist otherwise the WAL will try to recover
         // past events from the directory and throw errors.
         val nonExistentDirectory = new File(
-          System.getProperty("java.io.tmpdir"), UUID.randomUUID().toString).getAbsolutePath
+          System.getProperty("java.io.tmpdir"), UUID.randomUUID().toString).toURI.toString
         writeAheadLog = WriteAheadLogUtils.createLogForReceiver(
           SparkEnv.get.conf, nonExistentDirectory, hadoopConf)
         dataRead = writeAheadLog.read(partition.walRecordHandle)


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



spark git commit: [SPARK-24421][BUILD][CORE] Accessing sun.misc.Cleaner in JDK11

2018-11-14 Thread srowen
Repository: spark
Updated Branches:
  refs/heads/master 922dfe486 -> 722369ee5


[SPARK-24421][BUILD][CORE] Accessing sun.misc.Cleaner in JDK11

…. Other related changes to get JDK 11 working, to test

## What changes were proposed in this pull request?

- Access `sun.misc.Cleaner` (Java 8) and `jdk.internal.ref.Cleaner` (JDK 9+) by reflection (note: the latter only works if illegal reflective access is allowed)
- Access `sun.misc.Unsafe.invokeCleaner` in Java 9+ instead of `sun.misc.Cleaner` (Java 8)
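For context, a hedged sketch of the second bullet (mine, not code from the patch): freeing a direct buffer through `Unsafe.invokeCleaner`, which only exists from JDK 9 onward, so it is looked up reflectively here:

```scala
// Hedged sketch: frees a DirectByteBuffer via sun.misc.Unsafe.invokeCleaner (JDK 9+ only).
// The method is resolved reflectively so this compiles on JDK 8 too; there the lookup throws
// NoSuchMethodException and the sun.misc.Cleaner route would be used instead.
import java.nio.ByteBuffer

object InvokeCleanerDemo {
  def main(args: Array[String]): Unit = {
    val unsafeField = classOf[sun.misc.Unsafe].getDeclaredField("theUnsafe")
    unsafeField.setAccessible(true)
    val unsafe = unsafeField.get(null).asInstanceOf[sun.misc.Unsafe]

    val buf = ByteBuffer.allocateDirect(1024)
    val invokeCleaner = unsafe.getClass.getMethod("invokeCleaner", classOf[ByteBuffer])
    invokeCleaner.invoke(unsafe, buf)  // releases the buffer's off-heap memory immediately
  }
}
```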

In order to test anything on JDK 11, I also fixed a few small things, which I include here:

- Fix minor JDK 11 compile issues
- Update scala plugin, Jetty for JDK 11, to facilitate tests too

This doesn't mean JDK 11 tests all pass now, but lots do. Note also that the JDK 9+ solution for the Cleaner has a big caveat.

## How was this patch tested?

Existing tests. Manually tested JDK 11 build and tests, and tests covering this change appear to pass. All Java 8 tests should still pass, but this change alone does not achieve full JDK 11 compatibility.

Closes #22993 from srowen/SPARK-24421.

Authored-by: Sean Owen 
Signed-off-by: Sean Owen 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/722369ee
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/722369ee
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/722369ee

Branch: refs/heads/master
Commit: 722369ee5532c94eb7dd0cbda3b7365fc2f52026
Parents: 922dfe4
Author: Sean Owen 
Authored: Wed Nov 14 12:52:54 2018 -0800
Committer: Sean Owen 
Committed: Wed Nov 14 12:52:54 2018 -0800

--
 .../java/org/apache/spark/unsafe/Platform.java  | 74 +---
 .../org/apache/spark/storage/StorageUtils.scala | 36 --
 .../org/apache/spark/util/UtilsSuite.scala  |  7 +-
 dev/deps/spark-deps-hadoop-2.7  |  2 +-
 dev/deps/spark-deps-hadoop-3.1  |  6 +-
 .../org/apache/spark/examples/LogQuery.scala|  4 +-
 .../apache/spark/ml/linalg/MatricesSuite.scala  |  4 +-
 .../spark/mllib/linalg/MatricesSuite.scala  |  4 +-
 pom.xml | 22 --
 .../cli/thrift/ThriftHttpCLIService.java|  3 +-
 10 files changed, 123 insertions(+), 39 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/722369ee/common/unsafe/src/main/java/org/apache/spark/unsafe/Platform.java
--
diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/Platform.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/Platform.java
index aca6fca..076b693 100644
--- a/common/unsafe/src/main/java/org/apache/spark/unsafe/Platform.java
+++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/Platform.java
@@ -19,10 +19,10 @@ package org.apache.spark.unsafe;
 
 import java.lang.reflect.Constructor;
 import java.lang.reflect.Field;
+import java.lang.reflect.InvocationTargetException;
 import java.lang.reflect.Method;
 import java.nio.ByteBuffer;
 
-import sun.misc.Cleaner;
 import sun.misc.Unsafe;
 
 public final class Platform {
@@ -67,6 +67,60 @@ public final class Platform {
     unaligned = _unaligned;
   }
 
+  // Access fields and constructors once and store them, for performance:
+
+  private static final Constructor<?> DBB_CONSTRUCTOR;
+  private static final Field DBB_CLEANER_FIELD;
+  static {
+    try {
+      Class<?> cls = Class.forName("java.nio.DirectByteBuffer");
+      Constructor<?> constructor = cls.getDeclaredConstructor(Long.TYPE, Integer.TYPE);
+      constructor.setAccessible(true);
+      Field cleanerField = cls.getDeclaredField("cleaner");
+      cleanerField.setAccessible(true);
+      DBB_CONSTRUCTOR = constructor;
+      DBB_CLEANER_FIELD = cleanerField;
+    } catch (ClassNotFoundException | NoSuchMethodException | NoSuchFieldException e) {
+      throw new IllegalStateException(e);
+    }
+  }
+
+  private static final Method CLEANER_CREATE_METHOD;
+  static {
+    // The implementation of Cleaner changed from JDK 8 to 9
+    // Split java.version on non-digit chars:
+    int majorVersion = Integer.parseInt(System.getProperty("java.version").split("\\D+")[0]);
+    String cleanerClassName;
+    if (majorVersion < 9) {
+      cleanerClassName = "sun.misc.Cleaner";
+    } else {
+      cleanerClassName = "jdk.internal.ref.Cleaner";
+    }
+    try {
+      Class<?> cleanerClass = Class.forName(cleanerClassName);
+      Method createMethod = cleanerClass.getMethod("create", Object.class, Runnable.class);
+      // Accessing jdk.internal.ref.Cleaner should actually fail by default in JDK 9+,
+      // unfortunately, unless the user has allowed access with something like
+      // --add-opens java.base/java.lang=ALL-UNNAMED  If not, we can't really use the Cleaner
+      // hack below. It

svn commit: r30897 - in /dev/spark/3.0.0-SNAPSHOT-2018_11_14_12_12-922dfe4-docs: ./ _site/ _site/api/ _site/api/R/ _site/api/java/ _site/api/java/lib/ _site/api/java/org/ _site/api/java/org/apache/ _s

2018-11-14 Thread pwendell
Author: pwendell
Date: Wed Nov 14 20:27:01 2018
New Revision: 30897

Log:
Apache Spark 3.0.0-SNAPSHOT-2018_11_14_12_12-922dfe4 docs


[This commit notification would consist of 1471 parts,
which exceeds the limit of 50 parts, so it was shortened to a summary.]

-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



spark git commit: [SPARK-25965][SQL][TEST] Add avro read benchmark

2018-11-14 Thread dongjoon
Repository: spark
Updated Branches:
  refs/heads/master 4035c98a0 -> 922dfe486


[SPARK-25965][SQL][TEST] Add avro read benchmark

Add a read benchmark for Avro, which has been missing for a while.
The benchmark is similar to `DataSourceReadBenchmark` and `OrcReadBenchmark`.

Manually ran the benchmark.

Closes #22966 from gengliangwang/avroReadBenchmark.

Lead-authored-by: Gengliang Wang 
Co-authored-by: Gengliang Wang 
Co-authored-by: Dongjoon Hyun 
Signed-off-by: Dongjoon Hyun 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/922dfe48
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/922dfe48
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/922dfe48

Branch: refs/heads/master
Commit: 922dfe4865216987f9a92892b89dc3eaa9610b9b
Parents: 4035c98
Author: Gengliang Wang 
Authored: Wed Nov 14 11:25:11 2018 -0800
Committer: Dongjoon Hyun 
Committed: Wed Nov 14 11:26:26 2018 -0800

--
 .../benchmarks/AvroReadBenchmark-results.txt| 122 +++
 .../execution/benchmark/AvroReadBenchmark.scala | 216 +++
 2 files changed, 338 insertions(+)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/922dfe48/external/avro/benchmarks/AvroReadBenchmark-results.txt
--
diff --git a/external/avro/benchmarks/AvroReadBenchmark-results.txt b/external/avro/benchmarks/AvroReadBenchmark-results.txt
new file mode 100644
index 000..7900fea
--- /dev/null
+++ b/external/avro/benchmarks/AvroReadBenchmark-results.txt
@@ -0,0 +1,122 @@
+================================================================================================
+SQL Single Numeric Column Scan
+================================================================================================
+
+OpenJDK 64-Bit Server VM 1.8.0_191-b12 on Linux 3.10.0-862.3.2.el7.x86_64
+Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+SQL Single TINYINT Column Scan:          Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
+------------------------------------------------------------------------------------------------
+Sum                                           2774 / 2815          5.7         176.4       1.0X
+
+OpenJDK 64-Bit Server VM 1.8.0_191-b12 on Linux 3.10.0-862.3.2.el7.x86_64
+Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+SQL Single SMALLINT Column Scan:         Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
+------------------------------------------------------------------------------------------------
+Sum                                           2761 / 2777          5.7         175.5       1.0X
+
+OpenJDK 64-Bit Server VM 1.8.0_191-b12 on Linux 3.10.0-862.3.2.el7.x86_64
+Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+SQL Single INT Column Scan:              Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
+------------------------------------------------------------------------------------------------
+Sum                                           2783 / 2870          5.7         176.9       1.0X
+
+OpenJDK 64-Bit Server VM 1.8.0_191-b12 on Linux 3.10.0-862.3.2.el7.x86_64
+Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+SQL Single BIGINT Column Scan:           Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
+------------------------------------------------------------------------------------------------
+Sum                                           3256 / 3266          4.8         207.0       1.0X
+
+OpenJDK 64-Bit Server VM 1.8.0_191-b12 on Linux 3.10.0-862.3.2.el7.x86_64
+Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+SQL Single FLOAT Column Scan:            Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
+------------------------------------------------------------------------------------------------
+Sum                                           2841 / 2867          5.5         180.6       1.0X
+
+OpenJDK 64-Bit Server VM 1.8.0_191-b12 on Linux 3.10.0-862.3.2.el7.x86_64
+Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+SQL Single DOUBLE Column Scan:           Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
+------------------------------------------------------------------------------------------------
+Sum                                           2981 / 2996          5.3         189.5       1.0X
+
+
+================================================================================================
+Int and String Scan
+================================================================================================
+
+OpenJDK 64-Bit Server VM 1.8.0_191-b12 on Linux 3.10.0-862.3.2.el7.x86_64
+Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+Int and String Scan:                     Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
+------------------------------------------------------------------------------------------------
+Sum of

spark git commit: [SPARK-26042][SS][TESTS] Fix a potential hang in KafkaContinuousSourceTopicDeletionSuite

2018-11-14 Thread zsxwing
Repository: spark
Updated Branches:
  refs/heads/master 5f11e8c4c -> 4035c98a0


[SPARK-26042][SS][TESTS] Fix a potential hang in KafkaContinuousSourceTopicDeletionSuite

## What changes were proposed in this pull request?

Because initializing lazy vals shares the same lock, a thread that tries to initialize `executedPlan` while `isRDD` is running will hang forever.

This PR just materializes `executedPlan` so that accessing it while `toRdd` is running doesn't need to wait for a lock.
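A minimal sketch (mine, not Spark code) of the underlying Scala behaviour, assuming Scala 2.12: independent lazy vals on one object share that object's initialization lock, so forcing the cheap one up front keeps a second thread from blocking behind the slow one:

```scala
// Two unrelated lazy vals on the same instance: initializing `slow` holds the monitor on `h`,
// so a concurrent read of `cheap` blocks until `slow` finishes -- unless `cheap` was forced first.
class Holder {
  lazy val slow: Int = { Thread.sleep(5000); 1 }  // stands in for toRdd kicking off long work
  lazy val cheap: Int = 2                         // stands in for executedPlan
}

object LazyLockDemo {
  def main(args: Array[String]): Unit = {
    val h = new Holder
    // h.cheap                                    // uncommenting this line (the fix) avoids the wait
    new Thread(() => println(h.slow)).start()
    Thread.sleep(100)                             // let the other thread grab the lock first
    println(h.cheap)                              // blocks ~5s while `slow` initializes
  }
}
```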

## How was this patch tested?

Jenkins

Closes #23023 from zsxwing/SPARK-26042.

Authored-by: Shixiong Zhu 
Signed-off-by: Shixiong Zhu 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4035c98a
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4035c98a
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4035c98a

Branch: refs/heads/master
Commit: 4035c98a0c03cf61d1fb9a9916df513ab1081a9b
Parents: 5f11e8c
Author: Shixiong Zhu 
Authored: Wed Nov 14 10:19:20 2018 -0800
Committer: Shixiong Zhu 
Committed: Wed Nov 14 10:19:20 2018 -0800

--
 .../execution/streaming/continuous/ContinuousExecution.scala  | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/4035c98a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala
--
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala
index f009c52..4a7df73 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala
@@ -262,7 +262,12 @@ class ContinuousExecution(
 
       reportTimeTaken("runContinuous") {
         SQLExecution.withNewExecutionId(
-          sparkSessionForQuery, lastExecution)(lastExecution.toRdd)
+          sparkSessionForQuery, lastExecution) {
+          // Materialize `executedPlan` so that accessing it when `toRdd` is running doesn't need to
+          // wait for a lock
+          lastExecution.executedPlan
+          lastExecution.toRdd
+        }
       }
     } catch {
       case t: Throwable if StreamExecution.isInterruptionException(t, sparkSession.sparkContext) &&


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



spark git commit: [SPARK-26042][SS][TESTS] Fix a potential hang in KafkaContinuousSourceTopicDeletionSuite

2018-11-14 Thread zsxwing
Repository: spark
Updated Branches:
  refs/heads/branch-2.4 e2e1f0ad8 -> ca426bfa5


[SPARK-26042][SS][TESTS] Fix a potential hang in KafkaContinuousSourceTopicDeletionSuite

## What changes were proposed in this pull request?

Because initializing lazy vals shares the same lock, a thread that tries to initialize `executedPlan` while `isRDD` is running will hang forever.

This PR just materializes `executedPlan` so that accessing it while `toRdd` is running doesn't need to wait for a lock.

## How was this patch tested?

Jenkins

Closes #23023 from zsxwing/SPARK-26042.

Authored-by: Shixiong Zhu 
Signed-off-by: Shixiong Zhu 
(cherry picked from commit 4035c98a0c03cf61d1fb9a9916df513ab1081a9b)
Signed-off-by: Shixiong Zhu 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ca426bfa
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ca426bfa
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ca426bfa

Branch: refs/heads/branch-2.4
Commit: ca426bfa56045f01de0ea14480a375753073e025
Parents: e2e1f0a
Author: Shixiong Zhu 
Authored: Wed Nov 14 10:19:20 2018 -0800
Committer: Shixiong Zhu 
Committed: Wed Nov 14 10:19:37 2018 -0800

--
 .../execution/streaming/continuous/ContinuousExecution.scala  | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/ca426bfa/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala
--
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala
index f104422..2e24fa6 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala
@@ -259,7 +259,12 @@ class ContinuousExecution(
 
       reportTimeTaken("runContinuous") {
         SQLExecution.withNewExecutionId(
-          sparkSessionForQuery, lastExecution)(lastExecution.toRdd)
+          sparkSessionForQuery, lastExecution) {
+          // Materialize `executedPlan` so that accessing it when `toRdd` is running doesn't need to
+          // wait for a lock
+          lastExecution.executedPlan
+          lastExecution.toRdd
+        }
       }
     } catch {
       case t: Throwable


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



spark git commit: [SPARK-25118][CORE] Persist Driver Logs in Client mode to Hdfs

2018-11-14 Thread vanzin
Repository: spark
Updated Branches:
  refs/heads/master e503065fd -> 5f11e8c4c


[SPARK-25118][CORE] Persist Driver Logs in Client mode to Hdfs

Currently, we do not have a mechanism to collect driver logs if a user chooses to run their application in client mode. This is a big issue, as admin teams need to create their own mechanisms to capture driver logs.

This commit adds a logger which, if enabled, adds a local log appender to the root logger and asynchronously syncs it to an application-specific log file on HDFS (the Spark driver log dir).

Additionally, this collects spark-shell driver logs at INFO level by default. The change is that instead of setting the root logger level to WARN, we set the console appender threshold to WARN in the case of spark-shell. This ensures that only WARN logs are printed on the console, but other log appenders still capture INFO (or the default log level) logs.
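As a usage sketch only: the key names below are my reading of the configuration this change adds (see the docs/configuration.md entries in the patch), and the HDFS directory is a placeholder.

```scala
// Hedged sketch: enabling driver-log persistence for a client-mode application.
// "spark.driver.log.persistToDfs.enabled" and "spark.driver.log.dfsDir" are assumptions drawn
// from this change's description; the directory below is a placeholder.
import org.apache.spark.SparkConf

val conf = new SparkConf()
  .setAppName("driver-log-demo")
  .set("spark.driver.log.persistToDfs.enabled", "true")
  .set("spark.driver.log.dfsDir", "hdfs:///user/spark/driverLogs")
```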

1. Verified that logs are written to local and remote dir
2. Added a unit test case
3. Verified this for spark-shell, client mode and pyspark.
4. Verified in both non-kerberos and kerberos environment
5. Verified with the following unexpected termination conditions: Ctrl + C, driver OOM, large log files
6. Ran an application in spark-shell and ensured that driver logs were captured at INFO level
7. Started the application at WARN level, programmatically changed the level to INFO, and ensured that logs on the console were printed at INFO level

Closes #22504 from ankuriitg/ankurgupta/SPARK-25118.

Authored-by: ankurgupta 
Signed-off-by: Marcelo Vanzin 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5f11e8c4
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5f11e8c4
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5f11e8c4

Branch: refs/heads/master
Commit: 5f11e8c4cb9a5db037ac239b8fcc97f3a746e772
Parents: e503065
Author: ankurgupta 
Authored: Wed Nov 14 08:23:24 2018 -0800
Committer: Marcelo Vanzin 
Committed: Wed Nov 14 08:23:34 2018 -0800

--
 .../scala/org/apache/spark/SparkContext.scala   |   8 +
 .../deploy/history/FsHistoryProvider.scala  |  88 +++-
 .../org/apache/spark/internal/Logging.scala |  29 ++-
 .../apache/spark/internal/config/History.scala  |  17 ++
 .../apache/spark/internal/config/package.scala  |  13 ++
 .../spark/scheduler/EventLoggingListener.scala  |   8 +-
 .../scala/org/apache/spark/util/Utils.scala |  15 +-
 .../spark/util/logging/DriverLogger.scala   | 205 +++
 .../deploy/history/FsHistoryProviderSuite.scala |  70 ++-
 .../spark/util/logging/DriverLoggerSuite.scala  |  79 +++
 docs/configuration.md   |  33 +++
 docs/monitoring.md  |  22 ++
 docs/security.md|  11 +
 13 files changed, 576 insertions(+), 22 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/5f11e8c4/core/src/main/scala/org/apache/spark/SparkContext.scala
--
diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala
index b3c9c03..cb91717 100644
--- a/core/src/main/scala/org/apache/spark/SparkContext.scala
+++ b/core/src/main/scala/org/apache/spark/SparkContext.scala
@@ -58,6 +58,7 @@ import org.apache.spark.storage._
 import org.apache.spark.storage.BlockManagerMessages.TriggerThreadDump
 import org.apache.spark.ui.{ConsoleProgressBar, SparkUI}
 import org.apache.spark.util._
+import org.apache.spark.util.logging.DriverLogger
 
 /**
  * Main entry point for Spark functionality. A SparkContext represents the connection to a Spark
@@ -205,6 +206,7 @@ class SparkContext(config: SparkConf) extends Logging {
   private var _applicationId: String = _
   private var _applicationAttemptId: Option[String] = None
   private var _eventLogger: Option[EventLoggingListener] = None
+  private var _driverLogger: Option[DriverLogger] = None
   private var _executorAllocationManager: Option[ExecutorAllocationManager] = None
   private var _cleaner: Option[ContextCleaner] = None
   private var _listenerBusStarted: Boolean = false
@@ -371,6 +373,8 @@ class SparkContext(config: SparkConf) extends Logging {
       throw new SparkException("An application name must be set in your configuration")
     }
 
+    _driverLogger = DriverLogger(_conf)
+
     // log out spark.app.name in the Spark driver logs
     logInfo(s"Submitted application: $appName")
 
@@ -1867,6 +1871,9 @@ class SparkContext(config: SparkConf) extends Logging {
       postApplicationEnd()
     }
     Utils.tryLogNonFatalError {
+      _driverLogger.foreach(_.stop())
+    }
+    Utils.tryLogNonFatalError {
       _ui.foreach(_.stop())
     }
     if (env != null) {
@@ -2351,6 +2358,7 @@ class

svn commit: r30892 - in /dev/spark/3.0.0-SNAPSHOT-2018_11_14_07_44-e503065-docs: ./ _site/ _site/api/ _site/api/R/ _site/api/java/ _site/api/java/lib/ _site/api/java/org/ _site/api/java/org/apache/ _s

2018-11-14 Thread pwendell
Author: pwendell
Date: Wed Nov 14 15:58:48 2018
New Revision: 30892

Log:
Apache Spark 3.0.0-SNAPSHOT-2018_11_14_07_44-e503065 docs


[This commit notification would consist of 1471 parts,
which exceeds the limit of 50 parts, so it was shortened to a summary.]

-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



spark git commit: [SPARK-25868][MLLIB] One part of Spark MLlib Kmean Logic Performance problem

2018-11-14 Thread srowen
Repository: spark
Updated Branches:
  refs/heads/master a7a331df6 -> e503065fd


[SPARK-25868][MLLIB] One part of Spark MLlib Kmean Logic Performance problem

## What changes were proposed in this pull request?

Fix fastSquaredDistance to address a performance problem in the dense-dense calculation path and, at the same time, enhance the calculation accuracy.
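For context, a minimal sketch in my own notation (not the MLUtils code) of the identity the fast path relies on, and why two dense vectors are now sent straight to `Vectors.sqdist`:

```scala
// ||v1 - v2||^2 = ||v1||^2 + ||v2||^2 - 2 * (v1 dot v2)
// The dot-product form pays off when a sparse vector is involved, but for two dense vectors it is
// no cheaper than the direct squared distance and loses precision when the vectors are close
// relative to their norms -- hence the precision-bound checks in the sparse branches.
def sqDistViaDot(v1: Array[Double], v2: Array[Double]): Double = {
  val norm1Sq = v1.map(x => x * x).sum
  val norm2Sq = v2.map(x => x * x).sum
  val dot = v1.zip(v2).map { case (a, b) => a * b }.sum
  norm1Sq + norm2Sq - 2.0 * dot            // fast but numerically riskier
}

def sqDistDirect(v1: Array[Double], v2: Array[Double]): Double =
  v1.zip(v2).map { case (a, b) => (a - b) * (a - b) }.sum  // conceptually what Vectors.sqdist computes
```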

## How was this patch tested?
Tested from different angles after adding this patch: the dense-dense calculation performance is enhanced, and we also measured the influence on the other calculation situations (sparse-sparse, sparse-dense).

**For calculation logic test**
Here is my test for the sparse-sparse, dense-dense, and sparse-dense cases.

Test results follow. First we define some branch-path logic for the sparse-sparse and sparse-dense cases:
- if precisionBound1 is met, we call it LOGIC1
- if precisionBound1 is not met and precisionBound2 is not met, we call it LOGIC2
- if precisionBound1 is not met but precisionBound2 is met, we call it LOGIC3
(Trick: you can manually change the precision value to hit each of these situations.)

sparse-sparse case time cost situation (milliseconds)
LOGIC1
Before add patch: 7786, 7970, 8086
After add patch: 7729, 7653, 7903
LOGIC2
Before add patch: 8412, 9029, 8606
After add patch: 8603, 8724, 9024
LOGIC3
Before add patch: 19365, 19146, 19351
After add patch: 18917, 19007, 19074

sparse-dense case time cost situation (milliseconds)
LOGIC1
Before add patch: 4195, 4014, 4409
After add patch: 4081,3971, 4151
LOGIC2
Before add patch: 4968, 5579, 5080
After add patch: 4980, 5472, 5148
LOGIC3
Before add patch: 11848, 12077, 12168
After add patch: 11718, 11874, 11743

And for the dense-dense case, as already discussed in the comments, we only use sqdist to calculate the distance.

dense-dense case time cost situation (milliseconds)
Before add patch: 7340, 7816, 7672
After add patch: 5752, 5800, 5753

**For real world data test**
Here is my test data setup. I use the data from
http://archive.ics.uci.edu/ml/datasets/Condition+monitoring+of+hydraulic+systems
and extract the files (PS1, PS2, PS3, PS4, PS5, PS6) to form the test data.

Total instances: 13230
Attributes per line: 6000

Result for sparse-sparse situation time cost (milliseconds)
Before Enhance: 7670, 7704, 7652
After Enhance: 7634, 7729, 7645

Closes #22893 from KyleLi1985/updatekmeanpatch.

Authored-by: 李亮 
Signed-off-by: Sean Owen 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e503065f
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e503065f
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e503065f

Branch: refs/heads/master
Commit: e503065fd8855cd8810ca541db6c56d6fa0a5ff6
Parents: a7a331d
Author: 李亮 
Authored: Wed Nov 14 07:24:13 2018 -0800
Committer: Sean Owen 
Committed: Wed Nov 14 07:24:13 2018 -0800

--
 .../org/apache/spark/mllib/util/MLUtils.scala   | 30 +++-
 python/pyspark/ml/clustering.py |  6 ++--
 python/pyspark/mllib/clustering.py  |  2 +-
 3 files changed, 20 insertions(+), 18 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/e503065f/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala
index 14af8b5..6d15a6b 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala
@@ -506,8 +506,6 @@ object MLUtils extends Logging {
     val n = v1.size
     require(v2.size == n)
     require(norm1 >= 0.0 && norm2 >= 0.0)
-    val sumSquaredNorm = norm1 * norm1 + norm2 * norm2
-    val normDiff = norm1 - norm2
     var sqDist = 0.0
     /*
      * The relative error is
@@ -521,19 +519,23 @@ object MLUtils extends Logging {
      * The bound doesn't need the inner product, so we can use it as a sufficient condition to
      * check quickly whether the inner product approach is accurate.
      */
-    val precisionBound1 = 2.0 * EPSILON * sumSquaredNorm / (normDiff * normDiff + EPSILON)
-    if (precisionBound1 < precision) {
-      sqDist = sumSquaredNorm - 2.0 * dot(v1, v2)
-    } else if (v1.isInstanceOf[SparseVector] || v2.isInstanceOf[SparseVector]) {
-      val dotValue = dot(v1, v2)
-      sqDist = math.max(sumSquaredNorm - 2.0 * dotValue, 0.0)
-      val precisionBound2 = EPSILON * (sumSquaredNorm + 2.0 * math.abs(dotValue)) /
-        (sqDist + EPSILON)
-      if (precisionBound2 > precision) {
-        sqDist = Vectors.sqdist(v1, v2)
-      }
-    } else {
+    if (v1.isInstanceOf[DenseVector] && v2.isInstanceOf[DenseVector]) {
       sqDist = Vectors.sqdist(v1, v2)
+    } else