svn commit: r30903 - in /dev/spark/3.0.0-SNAPSHOT-2018_11_14_20_33-03306a6-docs: ./ _site/ _site/api/ _site/api/R/ _site/api/java/ _site/api/java/lib/ _site/api/java/org/ _site/api/java/org/apache/ _s
Author: pwendell Date: Thu Nov 15 04:45:51 2018 New Revision: 30903 Log: Apache Spark 3.0.0-SNAPSHOT-2018_11_14_20_33-03306a6 docs [This commit notification would consist of 1755 parts, which exceeds the limit of 50 ones, so it was shortened to the summary.] - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[3/4] spark git commit: [SPARK-26036][PYTHON] Break large tests.py files into smaller files
http://git-wip-us.apache.org/repos/asf/spark/blob/03306a6d/python/pyspark/tests.py -- diff --git a/python/pyspark/tests.py b/python/pyspark/tests.py deleted file mode 100644 index 131c51e..000 --- a/python/pyspark/tests.py +++ /dev/null @@ -1,2502 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -#http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -""" -Unit tests for PySpark; additional tests are implemented as doctests in -individual modules. -""" - -from array import array -from glob import glob -import os -import re -import shutil -import subprocess -import sys -import tempfile -import time -import zipfile -import random -import threading -import hashlib - -from py4j.protocol import Py4JJavaError -try: -import xmlrunner -except ImportError: -xmlrunner = None - -if sys.version_info[:2] <= (2, 6): -try: -import unittest2 as unittest -except ImportError: -sys.stderr.write('Please install unittest2 to test with Python 2.6 or earlier') -sys.exit(1) -else: -import unittest -if sys.version_info[0] >= 3: -xrange = range -basestring = str - -if sys.version >= "3": -from io import StringIO -else: -from StringIO import StringIO - - -from pyspark import keyword_only -from pyspark.conf import SparkConf -from pyspark.context import SparkContext -from pyspark.rdd import RDD -from pyspark.files import SparkFiles -from pyspark.serializers import read_int, BatchedSerializer, MarshalSerializer, PickleSerializer, \ -CloudPickleSerializer, CompressedSerializer, UTF8Deserializer, NoOpSerializer, \ -PairDeserializer, CartesianDeserializer, AutoBatchedSerializer, AutoSerializer, \ -FlattenedValuesSerializer -from pyspark.shuffle import Aggregator, ExternalMerger, ExternalSorter -from pyspark import shuffle -from pyspark.profiler import BasicProfiler -from pyspark.taskcontext import BarrierTaskContext, TaskContext - -_have_scipy = False -_have_numpy = False -try: -import scipy.sparse -_have_scipy = True -except: -# No SciPy, but that's okay, we'll skip those tests -pass -try: -import numpy as np -_have_numpy = True -except: -# No NumPy, but that's okay, we'll skip those tests -pass - - -SPARK_HOME = os.environ["SPARK_HOME"] - - -class MergerTests(unittest.TestCase): - -def setUp(self): -self.N = 1 << 12 -self.l = [i for i in xrange(self.N)] -self.data = list(zip(self.l, self.l)) -self.agg = Aggregator(lambda x: [x], - lambda x, y: x.append(y) or x, - lambda x, y: x.extend(y) or x) - -def test_small_dataset(self): -m = ExternalMerger(self.agg, 1000) -m.mergeValues(self.data) -self.assertEqual(m.spills, 0) -self.assertEqual(sum(sum(v) for k, v in m.items()), - sum(xrange(self.N))) - -m = ExternalMerger(self.agg, 1000) -m.mergeCombiners(map(lambda x_y1: (x_y1[0], [x_y1[1]]), self.data)) -self.assertEqual(m.spills, 0) -self.assertEqual(sum(sum(v) for k, v in m.items()), - sum(xrange(self.N))) - -def test_medium_dataset(self): 
-m = ExternalMerger(self.agg, 20) -m.mergeValues(self.data) -self.assertTrue(m.spills >= 1) -self.assertEqual(sum(sum(v) for k, v in m.items()), - sum(xrange(self.N))) - -m = ExternalMerger(self.agg, 10) -m.mergeCombiners(map(lambda x_y2: (x_y2[0], [x_y2[1]]), self.data * 3)) -self.assertTrue(m.spills >= 1) -self.assertEqual(sum(sum(v) for k, v in m.items()), - sum(xrange(self.N)) * 3) - -def test_huge_dataset(self): -m = ExternalMerger(self.agg, 5, partitions=3) -m.mergeCombiners(map(lambda k_v: (k_v[0], [str(k_v[1])]), self.data * 10)) -self.assertTrue(m.spills >= 1) -self.assertEqual(sum(len(v) for k, v in m.items()), - self.N * 10) -m._cleanup() - -def test_group_by_key(self): - -def gen_data(N, step): -for i in range(1, N + 1, step): -for j in range(i): -yield (i, [j]) - -
[1/4] spark git commit: [SPARK-26036][PYTHON] Break large tests.py files into smaller files
Repository: spark Updated Branches: refs/heads/master f6255d7b7 -> 03306a6df http://git-wip-us.apache.org/repos/asf/spark/blob/03306a6d/python/pyspark/tests/test_readwrite.py -- diff --git a/python/pyspark/tests/test_readwrite.py b/python/pyspark/tests/test_readwrite.py new file mode 100644 index 000..e45f5b3 --- /dev/null +++ b/python/pyspark/tests/test_readwrite.py @@ -0,0 +1,499 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +#http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import os +import shutil +import sys +import tempfile +import unittest +from array import array + +from pyspark.testing.utils import ReusedPySparkTestCase, SPARK_HOME + + +class InputFormatTests(ReusedPySparkTestCase): + +@classmethod +def setUpClass(cls): +ReusedPySparkTestCase.setUpClass() +cls.tempdir = tempfile.NamedTemporaryFile(delete=False) +os.unlink(cls.tempdir.name) + cls.sc._jvm.WriteInputFormatTestDataGenerator.generateData(cls.tempdir.name, cls.sc._jsc) + +@classmethod +def tearDownClass(cls): +ReusedPySparkTestCase.tearDownClass() +shutil.rmtree(cls.tempdir.name) + +@unittest.skipIf(sys.version >= "3", "serialize array of byte") +def test_sequencefiles(self): +basepath = self.tempdir.name +ints = sorted(self.sc.sequenceFile(basepath + "/sftestdata/sfint/", + "org.apache.hadoop.io.IntWritable", + "org.apache.hadoop.io.Text").collect()) +ei = [(1, u'aa'), (1, u'aa'), (2, u'aa'), (2, u'bb'), (2, u'bb'), (3, u'cc')] +self.assertEqual(ints, ei) + +doubles = sorted(self.sc.sequenceFile(basepath + "/sftestdata/sfdouble/", + "org.apache.hadoop.io.DoubleWritable", + "org.apache.hadoop.io.Text").collect()) +ed = [(1.0, u'aa'), (1.0, u'aa'), (2.0, u'aa'), (2.0, u'bb'), (2.0, u'bb'), (3.0, u'cc')] +self.assertEqual(doubles, ed) + +bytes = sorted(self.sc.sequenceFile(basepath + "/sftestdata/sfbytes/", +"org.apache.hadoop.io.IntWritable", + "org.apache.hadoop.io.BytesWritable").collect()) +ebs = [(1, bytearray('aa', 'utf-8')), + (1, bytearray('aa', 'utf-8')), + (2, bytearray('aa', 'utf-8')), + (2, bytearray('bb', 'utf-8')), + (2, bytearray('bb', 'utf-8')), + (3, bytearray('cc', 'utf-8'))] +self.assertEqual(bytes, ebs) + +text = sorted(self.sc.sequenceFile(basepath + "/sftestdata/sftext/", + "org.apache.hadoop.io.Text", + "org.apache.hadoop.io.Text").collect()) +et = [(u'1', u'aa'), + (u'1', u'aa'), + (u'2', u'aa'), + (u'2', u'bb'), + (u'2', u'bb'), + (u'3', u'cc')] +self.assertEqual(text, et) + +bools = sorted(self.sc.sequenceFile(basepath + "/sftestdata/sfbool/", +"org.apache.hadoop.io.IntWritable", + "org.apache.hadoop.io.BooleanWritable").collect()) +eb = [(1, False), (1, True), (2, False), (2, False), (2, True), (3, True)] +self.assertEqual(bools, eb) + +nulls = sorted(self.sc.sequenceFile(basepath + "/sftestdata/sfnull/", +"org.apache.hadoop.io.IntWritable", + "org.apache.hadoop.io.BooleanWritable").collect()) +en = [(1, 
None), (1, None), (2, None), (2, None), (2, None), (3, None)] +self.assertEqual(nulls, en) + +maps = self.sc.sequenceFile(basepath + "/sftestdata/sfmap/", +"org.apache.hadoop.io.IntWritable", + "org.apache.hadoop.io.MapWritable").collect() +em = [(1, {}), + (1, {3.0: u'bb'}), + (2, {1.0: u'aa'}), + (2, {1.0: u'cc'}), + (3, {2.0:
[2/4] spark git commit: [SPARK-26036][PYTHON] Break large tests.py files into smaller files
http://git-wip-us.apache.org/repos/asf/spark/blob/03306a6d/python/pyspark/tests/__init__.py -- diff --git a/python/pyspark/tests/__init__.py b/python/pyspark/tests/__init__.py new file mode 100644 index 000..12bdf0d --- /dev/null +++ b/python/pyspark/tests/__init__.py @@ -0,0 +1,16 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# http://git-wip-us.apache.org/repos/asf/spark/blob/03306a6d/python/pyspark/tests/test_appsubmit.py -- diff --git a/python/pyspark/tests/test_appsubmit.py b/python/pyspark/tests/test_appsubmit.py new file mode 100644 index 000..92bcb11 --- /dev/null +++ b/python/pyspark/tests/test_appsubmit.py @@ -0,0 +1,248 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +#http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import os +import re +import shutil +import subprocess +import tempfile +import unittest +import zipfile + + +class SparkSubmitTests(unittest.TestCase): + +def setUp(self): +self.programDir = tempfile.mkdtemp() +tmp_dir = tempfile.gettempdir() +self.sparkSubmit = [ +os.path.join(os.environ.get("SPARK_HOME"), "bin", "spark-submit"), +"--conf", "spark.driver.extraJavaOptions=-Djava.io.tmpdir={0}".format(tmp_dir), +"--conf", "spark.executor.extraJavaOptions=-Djava.io.tmpdir={0}".format(tmp_dir), +] + +def tearDown(self): +shutil.rmtree(self.programDir) + +def createTempFile(self, name, content, dir=None): +""" +Create a temp file with the given name and content and return its path. +Strips leading spaces from content up to the first '|' in each line. +""" +pattern = re.compile(r'^ *\|', re.MULTILINE) +content = re.sub(pattern, '', content.strip()) +if dir is None: +path = os.path.join(self.programDir, name) +else: +os.makedirs(os.path.join(self.programDir, dir)) +path = os.path.join(self.programDir, dir, name) +with open(path, "w") as f: +f.write(content) +return path + +def createFileInZip(self, name, content, ext=".zip", dir=None, zip_name=None): +""" +Create a zip archive containing a file with the given content and return its path. +Strips leading spaces from content up to the first '|' in each line. 
+""" +pattern = re.compile(r'^ *\|', re.MULTILINE) +content = re.sub(pattern, '', content.strip()) +if dir is None: +path = os.path.join(self.programDir, name + ext) +else: +path = os.path.join(self.programDir, dir, zip_name + ext) +zip = zipfile.ZipFile(path, 'w') +zip.writestr(name, content) +zip.close() +return path + +def create_spark_package(self, artifact_name): +group_id, artifact_id, version = artifact_name.split(":") +self.createTempFile("%s-%s.pom" % (artifact_id, version), (""" +| +|http://maven.apache.org/POM/4.0.0; +| xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance; +| xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 +| http://maven.apache.org/xsd/maven-4.0.0.xsd;> +| 4.0.0 +| %s +| %s +| %s +| +""" % (group_id, artifact_id, version)).lstrip(), +
svn commit: r30902 - in /dev/spark/3.0.0-SNAPSHOT-2018_11_14_16_25-ad853c5-docs: ./ _site/ _site/api/ _site/api/R/ _site/api/java/ _site/api/java/lib/ _site/api/java/org/ _site/api/java/org/apache/ _s
Author: pwendell Date: Thu Nov 15 00:38:01 2018 New Revision: 30902 Log: Apache Spark 3.0.0-SNAPSHOT-2018_11_14_16_25-ad853c5 docs [This commit notification would consist of 1755 parts, which exceeds the limit of 50 ones, so it was shortened to the summary.] - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [MINOR][SQL] Add disable bucketedRead workaround when throw RuntimeException
Repository: spark Updated Branches: refs/heads/master ad853c567 -> f6255d7b7 [MINOR][SQL] Add disable bucketedRead workaround when throw RuntimeException ## What changes were proposed in this pull request? It will throw a `RuntimeException` when reading from a bucketed table (about 1.7 GB per bucket file): ![image](https://user-images.githubusercontent.com/5399861/48346889-8041ce00-e6b7-11e8-83b0-ead83fb15821.png) Default (enable bucket read): ![image](https://user-images.githubusercontent.com/5399861/48347084-2c83b480-e6b8-11e8-913a-9cafc043e9e4.png) Disable bucket read: ![image](https://user-images.githubusercontent.com/5399861/48347099-3a393a00-e6b8-11e8-94af-cb814e1ba277.png) The reason is that each bucket file is too big; a workaround is to disable bucketed reads. This PR adds this workaround to Spark's error message. ## How was this patch tested? Manual tests. Closes #23014 from wangyum/anotherWorkaround. Authored-by: Yuming Wang Signed-off-by: hyukjinkwon Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f6255d7b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f6255d7b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f6255d7b Branch: refs/heads/master Commit: f6255d7b7cc4cc5d1f4fe0e5e493a1efee22f38f Parents: ad853c5 Author: Yuming Wang Authored: Thu Nov 15 08:33:06 2018 +0800 Committer: hyukjinkwon Committed: Thu Nov 15 08:33:06 2018 +0800 -- .../spark/sql/execution/vectorized/WritableColumnVector.java| 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/f6255d7b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java -- diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java index b0e119d..4f5e72c 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java @@ -101,10 +101,11 @@ public abstract class WritableColumnVector extends ColumnVector { String message = "Cannot reserve additional contiguous bytes in the vectorized reader (" + (requiredCapacity >= 0 ? "requested " + requiredCapacity + " bytes" : "integer overflow") + "). As a workaround, you can reduce the vectorized reader batch size, or disable the " + -"vectorized reader. For parquet file format, refer to " + +"vectorized reader, or disable " + SQLConf.BUCKETING_ENABLED().key() + " if you read " + +"from bucket table. For Parquet file format, refer to " + SQLConf.PARQUET_VECTORIZED_READER_BATCH_SIZE().key() + " (default " + SQLConf.PARQUET_VECTORIZED_READER_BATCH_SIZE().defaultValueString() + -") and " + SQLConf.PARQUET_VECTORIZED_READER_ENABLED().key() + "; for orc file format, " + +") and " + SQLConf.PARQUET_VECTORIZED_READER_ENABLED().key() + "; for ORC file format, " + "refer to " + SQLConf.ORC_VECTORIZED_READER_BATCH_SIZE().key() + " (default " + SQLConf.ORC_VECTORIZED_READER_BATCH_SIZE().defaultValueString() + ") and " + SQLConf.ORC_VECTORIZED_READER_ENABLED().key() + "."; - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
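The improved message points readers at three session-level switches. A minimal sketch follows (the table name is hypothetical; the config keys are the ones the message text references via SQLConf, so verify them against your Spark version):

```scala
import org.apache.spark.sql.SparkSession

object BucketedReadWorkaround {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("bucketed-read-workaround")
      .master("local[*]")
      .getOrCreate()

    // Option 1: disable bucketed reads for this session (SQLConf.BUCKETING_ENABLED).
    spark.conf.set("spark.sql.sources.bucketing.enabled", "false")

    // Option 2: shrink the vectorized reader batch size instead (Parquet shown here).
    spark.conf.set("spark.sql.parquet.columnarReaderBatchSize", "1024")

    // Option 3: fall back to the non-vectorized Parquet reader entirely.
    spark.conf.set("spark.sql.parquet.enableVectorizedReader", "false")

    // spark.table("big_bucketed_table").count()  // hypothetical bucketed table scan
    spark.stop()
  }
}
```

Disabling bucketing gives up its shuffle-avoidance benefits, so shrinking the batch size is usually the gentler first step.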
spark git commit: [SPARK-25956] Make Scala 2.12 as default Scala version in Spark 3.0
Repository: spark Updated Branches: refs/heads/master 2977e2312 -> ad853c567 [SPARK-25956] Make Scala 2.12 as default Scala version in Spark 3.0 ## What changes were proposed in this pull request? This PR makes Spark's default Scala version as 2.12, and Scala 2.11 will be the alternative version. This implies that Scala 2.12 will be used by our CI builds including pull request builds. We'll update the Jenkins to include a new compile-only jobs for Scala 2.11 to ensure the code can be still compiled with Scala 2.11. ## How was this patch tested? existing tests Closes #22967 from dbtsai/scala2.12. Authored-by: DB Tsai Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ad853c56 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ad853c56 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ad853c56 Branch: refs/heads/master Commit: ad853c56788fd32e035369d1fe3d96aaf6c4ef16 Parents: 2977e23 Author: DB Tsai Authored: Wed Nov 14 16:22:23 2018 -0800 Committer: Dongjoon Hyun Committed: Wed Nov 14 16:22:23 2018 -0800 -- assembly/pom.xml| 4 +-- common/kvstore/pom.xml | 4 +-- common/network-common/pom.xml | 4 +-- common/network-shuffle/pom.xml | 4 +-- common/network-yarn/pom.xml | 4 +-- common/sketch/pom.xml | 4 +-- common/tags/pom.xml | 4 +-- common/unsafe/pom.xml | 4 +-- core/pom.xml| 4 +-- dev/deps/spark-deps-hadoop-2.7 | 36 ++-- dev/deps/spark-deps-hadoop-3.1 | 36 ++-- docs/_config.yml| 4 +-- docs/_plugins/copy_api_dirs.rb | 2 +- docs/building-spark.md | 18 +- docs/cloud-integration.md | 2 +- docs/sparkr.md | 2 +- examples/pom.xml| 4 +-- external/avro/pom.xml | 4 +-- external/docker-integration-tests/pom.xml | 4 +-- external/kafka-0-10-assembly/pom.xml| 4 +-- external/kafka-0-10-sql/pom.xml | 4 +-- external/kafka-0-10/pom.xml | 4 +-- external/kinesis-asl-assembly/pom.xml | 4 +-- external/kinesis-asl/pom.xml| 4 +-- external/spark-ganglia-lgpl/pom.xml | 4 +-- graphx/pom.xml | 4 +-- hadoop-cloud/pom.xml| 4 +-- launcher/pom.xml| 4 +-- mllib-local/pom.xml | 4 +-- mllib/pom.xml | 4 +-- pom.xml | 20 ++- project/MimaBuild.scala | 2 +- project/SparkBuild.scala| 14 python/run-tests.py | 4 +-- repl/pom.xml| 4 +-- resource-managers/kubernetes/core/pom.xml | 4 +-- .../kubernetes/integration-tests/pom.xml| 4 +-- resource-managers/mesos/pom.xml | 4 +-- resource-managers/yarn/pom.xml | 4 +-- sql/catalyst/pom.xml| 4 +-- sql/core/pom.xml| 4 +-- sql/hive-thriftserver/pom.xml | 4 +-- sql/hive/pom.xml| 4 +-- streaming/pom.xml | 4 +-- tools/pom.xml | 4 +-- 45 files changed, 138 insertions(+), 138 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/ad853c56/assembly/pom.xml -- diff --git a/assembly/pom.xml b/assembly/pom.xml index b0337e5..68ebfad 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -20,12 +20,12 @@ 4.0.0 org.apache.spark -spark-parent_2.11 +spark-parent_2.12 3.0.0-SNAPSHOT ../pom.xml - spark-assembly_2.11 + spark-assembly_2.12 Spark Project Assembly http://spark.apache.org/ pom http://git-wip-us.apache.org/repos/asf/spark/blob/ad853c56/common/kvstore/pom.xml -- diff --git a/common/kvstore/pom.xml b/common/kvstore/pom.xml index 23a0f49..f042a12 100644 --- a/common/kvstore/pom.xml +++ b/common/kvstore/pom.xml @@ -21,12 +21,12 @@ 4.0.0 org.apache.spark -spark-parent_2.11 +spark-parent_2.12 3.0.0-SNAPSHOT ../../pom.xml -
svn commit: r30901 - in /dev/spark/2.4.1-SNAPSHOT-2018_11_14_14_19-ba638a7-docs: ./ _site/ _site/api/ _site/api/R/ _site/api/java/ _site/api/java/lib/ _site/api/java/org/ _site/api/java/org/apache/ _s
Author: pwendell Date: Wed Nov 14 22:33:54 2018 New Revision: 30901 Log: Apache Spark 2.4.1-SNAPSHOT-2018_11_14_14_19-ba638a7 docs [This commit notification would consist of 1476 parts, which exceeds the limit of 50 ones, so it was shortened to the summary.] - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-25986][BUILD] Add rules to ban throw Errors in application code
Repository: spark Updated Branches: refs/heads/master 2b671e729 -> 2977e2312 [SPARK-25986][BUILD] Add rules to ban throw Errors in application code ## What changes were proposed in this pull request? Add scala and java lint check rules to ban the usage of `throw new xxxErrors` and fix up all exists instance followed by https://github.com/apache/spark/pull/22989#issuecomment-437939830. See more details in https://github.com/apache/spark/pull/22969. ## How was this patch tested? Local test with lint-scala and lint-java. Closes #22989 from xuanyuanking/SPARK-25986. Authored-by: Yuanjian Li Signed-off-by: Sean Owen Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2977e231 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2977e231 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2977e231 Branch: refs/heads/master Commit: 2977e2312d9690c9ced3c86b0ce937819e957775 Parents: 2b671e7 Author: Yuanjian Li Authored: Wed Nov 14 13:05:18 2018 -0800 Committer: Sean Owen Committed: Wed Nov 14 13:05:18 2018 -0800 -- .../spark/unsafe/UnsafeAlignedOffset.java | 4 +++ .../org/apache/spark/memory/MemoryConsumer.java | 2 ++ .../apache/spark/memory/TaskMemoryManager.java | 4 +++ .../unsafe/sort/UnsafeInMemorySorter.java | 2 ++ .../spark/util/random/RandomSampler.scala | 2 +- .../scala/org/apache/spark/FailureSuite.scala | 2 ++ .../apache/spark/executor/ExecutorSuite.scala | 2 ++ .../spark/scheduler/TaskResultGetterSuite.scala | 2 ++ .../spark/storage/BlockManagerSuite.scala | 2 +- dev/checkstyle.xml | 13 +--- .../spark/streaming/kafka010/KafkaUtils.scala | 2 +- .../org/apache/spark/ml/linalg/Vectors.scala| 2 +- .../spark/ml/classification/NaiveBayes.scala| 8 ++--- .../org/apache/spark/ml/param/params.scala | 4 +-- .../spark/ml/tuning/ValidatorParams.scala | 4 +-- .../spark/mllib/classification/NaiveBayes.scala | 2 +- .../org/apache/spark/mllib/linalg/Vectors.scala | 2 +- .../org/apache/spark/ml/PredictorSuite.scala| 6 ++-- .../ml/classification/ClassifierSuite.scala | 11 --- .../ml/classification/NaiveBayesSuite.scala | 4 +-- .../ml/classification/OneVsRestSuite.scala | 16 +- .../spark/ml/feature/VectorIndexerSuite.scala | 4 ++- .../spark/ml/tree/impl/RandomForestSuite.scala | 6 ++-- .../apache/spark/ml/tree/impl/TreeTests.scala | 6 ++-- .../spark/ml/tuning/CrossValidatorSuite.scala | 32 ++-- .../ml/tuning/TrainValidationSplitSuite.scala | 12 .../ml/tuning/ValidatorParamsSuiteHelpers.scala | 3 +- .../mllib/classification/NaiveBayesSuite.scala | 2 +- .../spark/mllib/clustering/KMeansSuite.scala| 2 +- .../spark/mllib/tree/DecisionTreeSuite.scala| 15 - scalastyle-config.xml | 11 +++ .../aggregate/TungstenAggregationIterator.scala | 2 ++ .../spark/sql/FileBasedDataSourceSuite.scala| 9 +++--- .../spark/sql/execution/PlannerSuite.scala | 2 +- .../datasources/FileSourceStrategySuite.scala | 2 +- .../vectorized/ColumnarBatchSuite.scala | 2 +- .../apache/spark/streaming/util/StateMap.scala | 2 +- .../spark/streaming/InputStreamsSuite.scala | 5 +-- .../apache/spark/streaming/StateMapSuite.scala | 2 +- 39 files changed, 128 insertions(+), 87 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/2977e231/common/unsafe/src/main/java/org/apache/spark/unsafe/UnsafeAlignedOffset.java -- diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/UnsafeAlignedOffset.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/UnsafeAlignedOffset.java index be62e40..546e878 100644 --- 
a/common/unsafe/src/main/java/org/apache/spark/unsafe/UnsafeAlignedOffset.java +++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/UnsafeAlignedOffset.java @@ -39,7 +39,9 @@ public class UnsafeAlignedOffset { case 8: return (int)Platform.getLong(object, offset); default: +// checkstyle.off: RegexpSinglelineJava throw new AssertionError("Illegal UAO_SIZE"); +// checkstyle.on: RegexpSinglelineJava } } @@ -52,7 +54,9 @@ public class UnsafeAlignedOffset { Platform.putLong(object, offset, value); break; default: +// checkstyle.off: RegexpSinglelineJava throw new AssertionError("Illegal UAO_SIZE"); +// checkstyle.on: RegexpSinglelineJava } } } http://git-wip-us.apache.org/repos/asf/spark/blob/2977e231/core/src/main/java/org/apache/spark/memory/MemoryConsumer.java -- diff --git
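To make the intent of the new rules concrete, here is an illustrative Scala sketch of the pattern they ban and the local suppression the diff above relies on; the suppression tag name is an assumption for illustration, not copied from the actual scalastyle-config.xml or dev/checkstyle.xml entries:

```scala
// Illustrative only: the "throwerror" tag below is a stand-in, not necessarily the real rule id.
object ErrorLintExample {
  // Banned style in application code: throwing java.lang.Error subclasses.
  def uaoSizeBad(size: Int): Int = size match {
    case 4 | 8 => size
    case _ => throw new AssertionError("Illegal UAO_SIZE") // the lint rule would flag this
  }

  // Preferred style: throw an exception type instead.
  def uaoSizeGood(size: Int): Int = size match {
    case 4 | 8 => size
    case _ => throw new IllegalArgumentException("Illegal UAO_SIZE")
  }

  // Where an Error genuinely is intended (as in the low-level code in this diff),
  // the rule is switched off locally around the statement:
  // scalastyle:off throwerror
  def trulyFatal(): Nothing = throw new AssertionError("unreachable")
  // scalastyle:on throwerror
}
```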
spark git commit: [SPARK-25778] WriteAheadLogBackedBlockRDD in YARN Cluster Mode Fails …
Repository: spark Updated Branches: refs/heads/branch-2.4 ca426bfa5 -> ba638a783 [SPARK-25778] WriteAheadLogBackedBlockRDD in YARN Cluster Mode Fails … …due lack of access to tmpDir from $PWD to HDFS WriteAheadLogBackedBlockRDD usage of java.io.tmpdir will fail if $PWD resolves to a folder in HDFS and the Spark YARN Cluster job does not have the correct access to this folder in regards to the dummy folder. So this patch provides an option to set spark.streaming.receiver.blockStore.tmpdir to override java.io.tmpdir which sets $PWD from YARN Cluster mode. ## What changes were proposed in this pull request? This change provides an option to override the java.io.tmpdir option so that when $PWD is resolved in YARN Cluster mode Spark does not attempt to use this folder and instead use the folder provided with the following option: spark.streaming.receiver.blockStore.tmpdir ## How was this patch tested? Patch was manually tested on a Spark Streaming Job with Write Ahead logs in Cluster mode. Closes #22867 from gss2002/SPARK-25778. Authored-by: gss2002 Signed-off-by: Marcelo Vanzin (cherry picked from commit 2b671e729250b980aa9e4ea2d483f44fa0e129cb) Signed-off-by: Marcelo Vanzin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ba638a78 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ba638a78 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ba638a78 Branch: refs/heads/branch-2.4 Commit: ba638a783442f6a5b7b8e0a363edfb398eb2b6c7 Parents: ca426bf Author: gss2002 Authored: Wed Nov 14 13:02:13 2018 -0800 Committer: Marcelo Vanzin Committed: Wed Nov 14 13:02:24 2018 -0800 -- .../apache/spark/streaming/rdd/WriteAheadLogBackedBlockRDD.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/ba638a78/streaming/src/main/scala/org/apache/spark/streaming/rdd/WriteAheadLogBackedBlockRDD.scala -- diff --git a/streaming/src/main/scala/org/apache/spark/streaming/rdd/WriteAheadLogBackedBlockRDD.scala b/streaming/src/main/scala/org/apache/spark/streaming/rdd/WriteAheadLogBackedBlockRDD.scala index 844760a..f677c49 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/rdd/WriteAheadLogBackedBlockRDD.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/rdd/WriteAheadLogBackedBlockRDD.scala @@ -136,7 +136,7 @@ class WriteAheadLogBackedBlockRDD[T: ClassTag]( // this dummy directory should not already exist otherwise the WAL will try to recover // past events from the directory and throw errors. val nonExistentDirectory = new File( - System.getProperty("java.io.tmpdir"), UUID.randomUUID().toString).getAbsolutePath + System.getProperty("java.io.tmpdir"), UUID.randomUUID().toString).toURI.toString writeAheadLog = WriteAheadLogUtils.createLogForReceiver( SparkEnv.get.conf, nonExistentDirectory, hadoopConf) dataRead = writeAheadLog.read(partition.walRecordHandle) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-25778] WriteAheadLogBackedBlockRDD in YARN Cluster Mode Fails …
Repository: spark Updated Branches: refs/heads/master 722369ee5 -> 2b671e729 [SPARK-25778] WriteAheadLogBackedBlockRDD in YARN Cluster Mode Fails … …due lack of access to tmpDir from $PWD to HDFS WriteAheadLogBackedBlockRDD usage of java.io.tmpdir will fail if $PWD resolves to a folder in HDFS and the Spark YARN Cluster job does not have the correct access to this folder in regards to the dummy folder. So this patch provides an option to set spark.streaming.receiver.blockStore.tmpdir to override java.io.tmpdir which sets $PWD from YARN Cluster mode. ## What changes were proposed in this pull request? This change provides an option to override the java.io.tmpdir option so that when $PWD is resolved in YARN Cluster mode Spark does not attempt to use this folder and instead use the folder provided with the following option: spark.streaming.receiver.blockStore.tmpdir ## How was this patch tested? Patch was manually tested on a Spark Streaming Job with Write Ahead logs in Cluster mode. Closes #22867 from gss2002/SPARK-25778. Authored-by: gss2002 Signed-off-by: Marcelo Vanzin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2b671e72 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2b671e72 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2b671e72 Branch: refs/heads/master Commit: 2b671e729250b980aa9e4ea2d483f44fa0e129cb Parents: 722369e Author: gss2002 Authored: Wed Nov 14 13:02:13 2018 -0800 Committer: Marcelo Vanzin Committed: Wed Nov 14 13:02:13 2018 -0800 -- .../apache/spark/streaming/rdd/WriteAheadLogBackedBlockRDD.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/2b671e72/streaming/src/main/scala/org/apache/spark/streaming/rdd/WriteAheadLogBackedBlockRDD.scala -- diff --git a/streaming/src/main/scala/org/apache/spark/streaming/rdd/WriteAheadLogBackedBlockRDD.scala b/streaming/src/main/scala/org/apache/spark/streaming/rdd/WriteAheadLogBackedBlockRDD.scala index 844760a..f677c49 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/rdd/WriteAheadLogBackedBlockRDD.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/rdd/WriteAheadLogBackedBlockRDD.scala @@ -136,7 +136,7 @@ class WriteAheadLogBackedBlockRDD[T: ClassTag]( // this dummy directory should not already exist otherwise the WAL will try to recover // past events from the directory and throw errors. val nonExistentDirectory = new File( - System.getProperty("java.io.tmpdir"), UUID.randomUUID().toString).getAbsolutePath + System.getProperty("java.io.tmpdir"), UUID.randomUUID().toString).toURI.toString writeAheadLog = WriteAheadLogUtils.createLogForReceiver( SparkEnv.get.conf, nonExistentDirectory, hadoopConf) dataRead = writeAheadLog.read(partition.walRecordHandle) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
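As a rough, self-contained sketch of why the one-line change matters (my reading of the commit message, not code from the patch): the WAL receives the dummy directory as a string, and an unqualified local path can end up being resolved against the default filesystem, HDFS in a YARN cluster deployment, whereas a file: URI always stays local.

```scala
import java.io.File
import java.util.UUID

object WalTmpDirSketch {
  def main(args: Array[String]): Unit = {
    val dir = new File(System.getProperty("java.io.tmpdir"), UUID.randomUUID().toString)

    // Old form: a bare local path with no scheme. Handed to Hadoop as a Path string,
    // it can be resolved against fs.defaultFS, i.e. against HDFS in YARN cluster mode.
    val asPlainPath = dir.getAbsolutePath   // e.g. /tmp/<uuid>

    // New form: an explicit file: URI, so the dummy directory is always interpreted
    // as a local-filesystem location regardless of the default filesystem.
    val asFileUri = dir.toURI.toString      // e.g. file:/tmp/<uuid>

    println(s"plain path: $asPlainPath")
    println(s"file URI:   $asFileUri")
  }
}
```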
spark git commit: [SPARK-24421][BUILD][CORE] Accessing sun.misc.Cleaner in JDK11
Repository: spark Updated Branches: refs/heads/master 922dfe486 -> 722369ee5 [SPARK-24421][BUILD][CORE] Accessing sun.misc.Cleaner in JDK11 …. Other related changes to get JDK 11 working, to test ## What changes were proposed in this pull request? - Access `sun.misc.Cleaner` (Java 8) and `jdk.internal.ref.Cleaner` (JDK 9+) by reflection (note: the latter only works if illegal reflective access is allowed) - Access `sun.misc.Unsafe.invokeCleaner` in Java 9+ instead of `sun.misc.Cleaner` (Java 8) In order to test anything on JDK 11, I also fixed a few small things, which I include here: - Fix minor JDK 11 compile issues - Update scala plugin, Jetty for JDK 11, to facilitate tests too This doesn't mean JDK 11 tests all pass now, but lots do. Note also that the JDK 9+ solution for the Cleaner has a big caveat. ## How was this patch tested? Existing tests. Manually tested JDK 11 build and tests, and tests covering this change appear to pass. All Java 8 tests should still pass, but this change alone does not achieve full JDK 11 compatibility. Closes #22993 from srowen/SPARK-24421. Authored-by: Sean Owen Signed-off-by: Sean Owen Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/722369ee Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/722369ee Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/722369ee Branch: refs/heads/master Commit: 722369ee5532c94eb7dd0cbda3b7365fc2f52026 Parents: 922dfe4 Author: Sean Owen Authored: Wed Nov 14 12:52:54 2018 -0800 Committer: Sean Owen Committed: Wed Nov 14 12:52:54 2018 -0800 -- .../java/org/apache/spark/unsafe/Platform.java | 74 +--- .../org/apache/spark/storage/StorageUtils.scala | 36 -- .../org/apache/spark/util/UtilsSuite.scala | 7 +- dev/deps/spark-deps-hadoop-2.7 | 2 +- dev/deps/spark-deps-hadoop-3.1 | 6 +- .../org/apache/spark/examples/LogQuery.scala| 4 +- .../apache/spark/ml/linalg/MatricesSuite.scala | 4 +- .../spark/mllib/linalg/MatricesSuite.scala | 4 +- pom.xml | 22 -- .../cli/thrift/ThriftHttpCLIService.java| 3 +- 10 files changed, 123 insertions(+), 39 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/722369ee/common/unsafe/src/main/java/org/apache/spark/unsafe/Platform.java -- diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/Platform.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/Platform.java index aca6fca..076b693 100644 --- a/common/unsafe/src/main/java/org/apache/spark/unsafe/Platform.java +++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/Platform.java @@ -19,10 +19,10 @@ package org.apache.spark.unsafe; import java.lang.reflect.Constructor; import java.lang.reflect.Field; +import java.lang.reflect.InvocationTargetException; import java.lang.reflect.Method; import java.nio.ByteBuffer; -import sun.misc.Cleaner; import sun.misc.Unsafe; public final class Platform { @@ -67,6 +67,60 @@ public final class Platform { unaligned = _unaligned; } + // Access fields and constructors once and store them, for performance: + + private static final Constructor DBB_CONSTRUCTOR; + private static final Field DBB_CLEANER_FIELD; + static { +try { + Class cls = Class.forName("java.nio.DirectByteBuffer"); + Constructor constructor = cls.getDeclaredConstructor(Long.TYPE, Integer.TYPE); + constructor.setAccessible(true); + Field cleanerField = cls.getDeclaredField("cleaner"); + cleanerField.setAccessible(true); + DBB_CONSTRUCTOR = constructor; + DBB_CLEANER_FIELD = cleanerField; +} catch (ClassNotFoundException | 
NoSuchMethodException | NoSuchFieldException e) { + throw new IllegalStateException(e); +} + } + + private static final Method CLEANER_CREATE_METHOD; + static { +// The implementation of Cleaner changed from JDK 8 to 9 +// Split java.version on non-digit chars: +int majorVersion = Integer.parseInt(System.getProperty("java.version").split("\\D+")[0]); +String cleanerClassName; +if (majorVersion < 9) { + cleanerClassName = "sun.misc.Cleaner"; +} else { + cleanerClassName = "jdk.internal.ref.Cleaner"; +} +try { + Class cleanerClass = Class.forName(cleanerClassName); + Method createMethod = cleanerClass.getMethod("create", Object.class, Runnable.class); + // Accessing jdk.internal.ref.Cleaner should actually fail by default in JDK 9+, + // unfortunately, unless the user has allowed access with something like + // --add-opens java.base/java.lang=ALL-UNNAMED If not, we can't really use the Cleaner + // hack below. It
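A condensed sketch of the version-switching idea the commit describes, rewritten in Scala for illustration; it mirrors the reflective lookup shown in the diff but simplifies error handling, so treat it as an outline rather than the actual Platform code:

```scala
object CleanerLookupSketch {
  // Split java.version on non-digit characters, as the patch does ("1.8.0_191" -> 1, "11.0.1" -> 11).
  private val majorVersion: Int =
    System.getProperty("java.version").split("\\D+")(0).toInt

  // The Cleaner implementation moved between JDK 8 and JDK 9+.
  private val cleanerClassName: String =
    if (majorVersion < 9) "sun.misc.Cleaner" else "jdk.internal.ref.Cleaner"

  // On JDK 9+ this only really works if illegal reflective access is allowed
  // (e.g. --add-opens java.base/java.lang=ALL-UNNAMED); otherwise fall back to None,
  // which is the caveat the commit message calls out.
  val createMethod: Option[java.lang.reflect.Method] =
    try {
      val cleanerClass = Class.forName(cleanerClassName)
      Some(cleanerClass.getMethod("create", classOf[Object], classOf[Runnable]))
    } catch {
      case _: ClassNotFoundException | _: NoSuchMethodException | _: SecurityException => None
    }

  def main(args: Array[String]): Unit =
    println(s"JDK $majorVersion -> $cleanerClassName, create method found: ${createMethod.isDefined}")
}
```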
svn commit: r30897 - in /dev/spark/3.0.0-SNAPSHOT-2018_11_14_12_12-922dfe4-docs: ./ _site/ _site/api/ _site/api/R/ _site/api/java/ _site/api/java/lib/ _site/api/java/org/ _site/api/java/org/apache/ _s
Author: pwendell Date: Wed Nov 14 20:27:01 2018 New Revision: 30897 Log: Apache Spark 3.0.0-SNAPSHOT-2018_11_14_12_12-922dfe4 docs [This commit notification would consist of 1471 parts, which exceeds the limit of 50 ones, so it was shortened to the summary.] - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-25965][SQL][TEST] Add avro read benchmark
Repository: spark Updated Branches: refs/heads/master 4035c98a0 -> 922dfe486 [SPARK-25965][SQL][TEST] Add avro read benchmark Add read benchmark for Avro, which is missing for a period. The benchmark is similar to `DataSourceReadBenchmark` and `OrcReadBenchmark` Manually run benchmark Closes #22966 from gengliangwang/avroReadBenchmark. Lead-authored-by: Gengliang Wang Co-authored-by: Gengliang Wang Co-authored-by: Dongjoon Hyun Signed-off-by: Dongjoon Hyun Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/922dfe48 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/922dfe48 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/922dfe48 Branch: refs/heads/master Commit: 922dfe4865216987f9a92892b89dc3eaa9610b9b Parents: 4035c98 Author: Gengliang Wang Authored: Wed Nov 14 11:25:11 2018 -0800 Committer: Dongjoon Hyun Committed: Wed Nov 14 11:26:26 2018 -0800 -- .../benchmarks/AvroReadBenchmark-results.txt| 122 +++ .../execution/benchmark/AvroReadBenchmark.scala | 216 +++ 2 files changed, 338 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/922dfe48/external/avro/benchmarks/AvroReadBenchmark-results.txt -- diff --git a/external/avro/benchmarks/AvroReadBenchmark-results.txt b/external/avro/benchmarks/AvroReadBenchmark-results.txt new file mode 100644 index 000..7900fea --- /dev/null +++ b/external/avro/benchmarks/AvroReadBenchmark-results.txt @@ -0,0 +1,122 @@ + +SQL Single Numeric Column Scan + + +OpenJDK 64-Bit Server VM 1.8.0_191-b12 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +SQL Single TINYINT Column Scan: Best/Avg Time(ms)Rate(M/s) Per Row(ns) Relative + +Sum 2774 / 2815 5.7 176.4 1.0X + +OpenJDK 64-Bit Server VM 1.8.0_191-b12 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +SQL Single SMALLINT Column Scan: Best/Avg Time(ms)Rate(M/s) Per Row(ns) Relative + +Sum 2761 / 2777 5.7 175.5 1.0X + +OpenJDK 64-Bit Server VM 1.8.0_191-b12 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +SQL Single INT Column Scan: Best/Avg Time(ms)Rate(M/s) Per Row(ns) Relative + +Sum 2783 / 2870 5.7 176.9 1.0X + +OpenJDK 64-Bit Server VM 1.8.0_191-b12 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +SQL Single BIGINT Column Scan: Best/Avg Time(ms)Rate(M/s) Per Row(ns) Relative + +Sum 3256 / 3266 4.8 207.0 1.0X + +OpenJDK 64-Bit Server VM 1.8.0_191-b12 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +SQL Single FLOAT Column Scan:Best/Avg Time(ms)Rate(M/s) Per Row(ns) Relative + +Sum 2841 / 2867 5.5 180.6 1.0X + +OpenJDK 64-Bit Server VM 1.8.0_191-b12 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +SQL Single DOUBLE Column Scan: Best/Avg Time(ms)Rate(M/s) Per Row(ns) Relative + +Sum 2981 / 2996 5.3 189.5 1.0X + + + +Int and String Scan + + +OpenJDK 64-Bit Server VM 1.8.0_191-b12 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Int and String Scan: Best/Avg Time(ms)Rate(M/s) Per Row(ns) Relative + +Sum of
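For readers who want a rough feel for what the benchmark measures without running the harness, a minimal sketch follows; the path and row count are made up, and it assumes the spark-avro module is on the classpath (for example via --packages):

```scala
import org.apache.spark.sql.SparkSession

object AvroScanSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("avro-scan-sketch").master("local[*]").getOrCreate()

    val path = "/tmp/avro-scan-sketch"   // made-up location
    spark.range(15L * 1000 * 1000).toDF("id")
      .write.mode("overwrite").format("avro").save(path)

    // Time a single-column aggregate scan, the same shape as the "SQL Single BIGINT Column Scan" case.
    val start = System.nanoTime()
    spark.read.format("avro").load(path).selectExpr("sum(id)").collect()
    val elapsedMs = (System.nanoTime() - start) / 1000000
    println(s"sum over one BIGINT column: $elapsedMs ms")

    spark.stop()
  }
}
```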
spark git commit: [SPARK-26042][SS][TESTS] Fix a potential hang in KafkaContinuousSourceTopicDeletionSuite
Repository: spark Updated Branches: refs/heads/master 5f11e8c4c -> 4035c98a0 [SPARK-26042][SS][TESTS] Fix a potential hang in KafkaContinuousSourceTopicDeletionSuite ## What changes were proposed in this pull request? As initializing lazy vals shares the same lock, a thread is trying to initialize `executedPlan` when `isRDD` is running, this thread will hang forever. This PR just materializes `executedPlan` so that accessing it when `toRdd` is running doesn't need to wait for a lock ## How was this patch tested? Jenkins Closes #23023 from zsxwing/SPARK-26042. Authored-by: Shixiong Zhu Signed-off-by: Shixiong Zhu Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4035c98a Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4035c98a Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4035c98a Branch: refs/heads/master Commit: 4035c98a0c03cf61d1fb9a9916df513ab1081a9b Parents: 5f11e8c Author: Shixiong Zhu Authored: Wed Nov 14 10:19:20 2018 -0800 Committer: Shixiong Zhu Committed: Wed Nov 14 10:19:20 2018 -0800 -- .../execution/streaming/continuous/ContinuousExecution.scala | 7 ++- 1 file changed, 6 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/4035c98a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala index f009c52..4a7df73 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala @@ -262,7 +262,12 @@ class ContinuousExecution( reportTimeTaken("runContinuous") { SQLExecution.withNewExecutionId( - sparkSessionForQuery, lastExecution)(lastExecution.toRdd) + sparkSessionForQuery, lastExecution) { + // Materialize `executedPlan` so that accessing it when `toRdd` is running doesn't need to + // wait for a lock + lastExecution.executedPlan + lastExecution.toRdd +} } } catch { case t: Throwable if StreamExecution.isInterruptionException(t, sparkSession.sparkContext) && - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
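A small self-contained illustration of the underlying issue (plain Scala, not Spark code): lazy vals of one instance are initialized under a shared monitor, so a thread that only needs a cheap lazy val can end up waiting behind a thread initializing an expensive one; materializing `executedPlan` up front is the workaround the patch applies.

```scala
class QueryExecutionLike {
  lazy val executedPlan: String = { Thread.sleep(5000); "plan" }  // expensive to initialize
  lazy val toRdd: String = "rdd"                                  // cheap, but shares the init lock
}

object LazyValLockDemo extends App {
  val qe = new QueryExecutionLike

  // The workaround applied in ContinuousExecution: touch executedPlan first, so that
  // later accesses from other threads never wait on its initializer.
  qe.executedPlan

  val reader = new Thread(() => println(qe.toRdd))  // no longer blocks behind executedPlan
  reader.start()
  reader.join()
}
```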
spark git commit: [SPARK-26042][SS][TESTS] Fix a potential hang in KafkaContinuousSourceTopicDeletionSuite
Repository: spark Updated Branches: refs/heads/branch-2.4 e2e1f0ad8 -> ca426bfa5 [SPARK-26042][SS][TESTS] Fix a potential hang in KafkaContinuousSourceTopicDeletionSuite ## What changes were proposed in this pull request? As initializing lazy vals shares the same lock, a thread is trying to initialize `executedPlan` when `isRDD` is running, this thread will hang forever. This PR just materializes `executedPlan` so that accessing it when `toRdd` is running doesn't need to wait for a lock ## How was this patch tested? Jenkins Closes #23023 from zsxwing/SPARK-26042. Authored-by: Shixiong Zhu Signed-off-by: Shixiong Zhu (cherry picked from commit 4035c98a0c03cf61d1fb9a9916df513ab1081a9b) Signed-off-by: Shixiong Zhu Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ca426bfa Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ca426bfa Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ca426bfa Branch: refs/heads/branch-2.4 Commit: ca426bfa56045f01de0ea14480a375753073e025 Parents: e2e1f0a Author: Shixiong Zhu Authored: Wed Nov 14 10:19:20 2018 -0800 Committer: Shixiong Zhu Committed: Wed Nov 14 10:19:37 2018 -0800 -- .../execution/streaming/continuous/ContinuousExecution.scala | 7 ++- 1 file changed, 6 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/ca426bfa/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala index f104422..2e24fa6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala @@ -259,7 +259,12 @@ class ContinuousExecution( reportTimeTaken("runContinuous") { SQLExecution.withNewExecutionId( - sparkSessionForQuery, lastExecution)(lastExecution.toRdd) + sparkSessionForQuery, lastExecution) { + // Materialize `executedPlan` so that accessing it when `toRdd` is running doesn't need to + // wait for a lock + lastExecution.executedPlan + lastExecution.toRdd +} } } catch { case t: Throwable - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-25118][CORE] Persist Driver Logs in Client mode to Hdfs
Repository: spark Updated Branches: refs/heads/master e503065fd -> 5f11e8c4c [SPARK-25118][CORE] Persist Driver Logs in Client mode to Hdfs Currently, we do not have a mechanism to collect driver logs if a user chooses to run their application in client mode. This is a big issue as admin teams need to create their own mechanisms to capture driver logs. This commit adds a logger which, if enabled, adds a local log appender to the root logger and asynchronously syncs it an application specific log file on hdfs (Spark Driver Log Dir). Additionally, this collects spark-shell driver logs at INFO level by default. The change is that instead of setting root logger level to WARN, we will set the consoleAppender threshold to WARN, in case of spark-shell. This ensures that only WARN logs are printed on CONSOLE but other log appenders still capture INFO (or the default log level logs). 1. Verified that logs are written to local and remote dir 2. Added a unit test case 3. Verified this for spark-shell, client mode and pyspark. 4. Verified in both non-kerberos and kerberos environment 5. Verified with following unexpected termination conditions: Ctrl + C, Driver OOM, Large Log Files 6. Ran an application in spark-shell and ensured that driver logs were captured at INFO level 7. Started the application at WARN level, programmatically changed the level to INFO and ensured that logs on console were printed at INFO level Closes #22504 from ankuriitg/ankurgupta/SPARK-25118. Authored-by: ankurgupta Signed-off-by: Marcelo Vanzin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5f11e8c4 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5f11e8c4 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5f11e8c4 Branch: refs/heads/master Commit: 5f11e8c4cb9a5db037ac239b8fcc97f3a746e772 Parents: e503065 Author: ankurgupta Authored: Wed Nov 14 08:23:24 2018 -0800 Committer: Marcelo Vanzin Committed: Wed Nov 14 08:23:34 2018 -0800 -- .../scala/org/apache/spark/SparkContext.scala | 8 + .../deploy/history/FsHistoryProvider.scala | 88 +++- .../org/apache/spark/internal/Logging.scala | 29 ++- .../apache/spark/internal/config/History.scala | 17 ++ .../apache/spark/internal/config/package.scala | 13 ++ .../spark/scheduler/EventLoggingListener.scala | 8 +- .../scala/org/apache/spark/util/Utils.scala | 15 +- .../spark/util/logging/DriverLogger.scala | 205 +++ .../deploy/history/FsHistoryProviderSuite.scala | 70 ++- .../spark/util/logging/DriverLoggerSuite.scala | 79 +++ docs/configuration.md | 33 +++ docs/monitoring.md | 22 ++ docs/security.md| 11 + 13 files changed, 576 insertions(+), 22 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/5f11e8c4/core/src/main/scala/org/apache/spark/SparkContext.scala -- diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index b3c9c03..cb91717 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -58,6 +58,7 @@ import org.apache.spark.storage._ import org.apache.spark.storage.BlockManagerMessages.TriggerThreadDump import org.apache.spark.ui.{ConsoleProgressBar, SparkUI} import org.apache.spark.util._ +import org.apache.spark.util.logging.DriverLogger /** * Main entry point for Spark functionality. 
A SparkContext represents the connection to a Spark @@ -205,6 +206,7 @@ class SparkContext(config: SparkConf) extends Logging { private var _applicationId: String = _ private var _applicationAttemptId: Option[String] = None private var _eventLogger: Option[EventLoggingListener] = None + private var _driverLogger: Option[DriverLogger] = None private var _executorAllocationManager: Option[ExecutorAllocationManager] = None private var _cleaner: Option[ContextCleaner] = None private var _listenerBusStarted: Boolean = false @@ -371,6 +373,8 @@ class SparkContext(config: SparkConf) extends Logging { throw new SparkException("An application name must be set in your configuration") } +_driverLogger = DriverLogger(_conf) + // log out spark.app.name in the Spark driver logs logInfo(s"Submitted application: $appName") @@ -1867,6 +1871,9 @@ class SparkContext(config: SparkConf) extends Logging { postApplicationEnd() } Utils.tryLogNonFatalError { + _driverLogger.foreach(_.stop()) +} +Utils.tryLogNonFatalError { _ui.foreach(_.stop()) } if (env != null) { @@ -2351,6 +2358,7 @@ class
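A hedged configuration sketch for a client-mode application; the key names are my reconstruction of the entries this commit adds to docs/configuration.md, so treat them as assumptions and confirm against the shipped documentation before relying on them:

```scala
import org.apache.spark.SparkConf

object DriverLogConfSketch {
  // Assumed key names; verify against docs/configuration.md in your build.
  val conf: SparkConf = new SparkConf()
    .setAppName("client-mode-app")
    .set("spark.driver.log.persistToDfs.enabled", "true")              // turn the feature on
    .set("spark.driver.log.dfsDir", "hdfs:///user/spark/driverLogs")   // where synced driver logs land
}
```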
svn commit: r30892 - in /dev/spark/3.0.0-SNAPSHOT-2018_11_14_07_44-e503065-docs: ./ _site/ _site/api/ _site/api/R/ _site/api/java/ _site/api/java/lib/ _site/api/java/org/ _site/api/java/org/apache/ _s
Author: pwendell Date: Wed Nov 14 15:58:48 2018 New Revision: 30892 Log: Apache Spark 3.0.0-SNAPSHOT-2018_11_14_07_44-e503065 docs [This commit notification would consist of 1471 parts, which exceeds the limit of 50 ones, so it was shortened to the summary.] - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-25868][MLLIB] One part of Spark MLlib Kmean Logic Performance problem
Repository: spark Updated Branches: refs/heads/master a7a331df6 -> e503065fd [SPARK-25868][MLLIB] One part of Spark MLlib Kmean Logic Performance problem ## What changes were proposed in this pull request? Fix fastSquaredDistance to calculate dense-dense situation calculation performance problem and meanwhile enhance the calculation accuracy. ## How was this patch tested? From different point to test after add this patch, the dense-dense calculation situation performance is enhanced and will do influence other calculation situation like (sparse-sparse, sparse-dense) **For calculation logic test** There is my test for sparse-sparse, dense-dense, sparse-dense case There is test result: First we need define some branch path logic for sparse-sparse and sparse-dense case if meet precisionBound1, we define it as LOGIC1 if not meet precisionBound1, and not meet precisionBound2, we define it as LOGIC2 if not meet precisionBound1, but meet precisionBound2, we define it as LOGIC3 (There is a trick, you can manually change the precision value to meet above situation) sparse-sparse case time cost situation (milliseconds) LOGIC1 Before add patch: 7786, 7970, 8086 After add patch: 7729, 7653, 7903 LOGIC2 Before add patch: 8412, 9029, 8606 After add patch: 8603, 8724, 9024 LOGIC3 Before add patch: 19365, 19146, 19351 After add patch: 18917, 19007, 19074 sparse-dense case time cost situation (milliseconds) LOGIC1 Before add patch: 4195, 4014, 4409 After add patch: 4081, 3971, 4151 LOGIC2 Before add patch: 4968, 5579, 5080 After add patch: 4980, 5472, 5148 LOGIC3 Before add patch: 11848, 12077, 12168 After add patch: 11718, 11874, 11743 And for dense-dense case like we already discussed in comment, only use sqdist to calculate distance dense-dense case time cost situation (milliseconds) Before add patch: 7340, 7816, 7672 After add patch: 5752, 5800, 5753 **For real world data test** There is my test data situation I use the data http://archive.ics.uci.edu/ml/datasets/Condition+monitoring+of+hydraulic+systems extract file (PS1, PS2, PS3, PS4, PS5, PS6) to form the test data total instances are 13230 the attributes for line are 6000 Result for sparse-sparse situation time cost (milliseconds) Before Enhance: 7670, 7704, 7652 After Enhance: 7634, 7729, 7645 Closes #22893 from KyleLi1985/updatekmeanpatch. 
Authored-by: 李亮 Signed-off-by: Sean Owen Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e503065f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e503065f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e503065f Branch: refs/heads/master Commit: e503065fd8855cd8810ca541db6c56d6fa0a5ff6 Parents: a7a331d Author: 李亮 Authored: Wed Nov 14 07:24:13 2018 -0800 Committer: Sean Owen Committed: Wed Nov 14 07:24:13 2018 -0800 -- .../org/apache/spark/mllib/util/MLUtils.scala | 30 +++- python/pyspark/ml/clustering.py | 6 ++-- python/pyspark/mllib/clustering.py | 2 +- 3 files changed, 20 insertions(+), 18 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/e503065f/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala index 14af8b5..6d15a6b 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala @@ -506,8 +506,6 @@ object MLUtils extends Logging { val n = v1.size require(v2.size == n) require(norm1 >= 0.0 && norm2 >= 0.0) -val sumSquaredNorm = norm1 * norm1 + norm2 * norm2 -val normDiff = norm1 - norm2 var sqDist = 0.0 /* * The relative error is @@ -521,19 +519,23 @@ object MLUtils extends Logging { * The bound doesn't need the inner product, so we can use it as a sufficient condition to * check quickly whether the inner product approach is accurate. */ -val precisionBound1 = 2.0 * EPSILON * sumSquaredNorm / (normDiff * normDiff + EPSILON) -if (precisionBound1 < precision) { - sqDist = sumSquaredNorm - 2.0 * dot(v1, v2) -} else if (v1.isInstanceOf[SparseVector] || v2.isInstanceOf[SparseVector]) { - val dotValue = dot(v1, v2) - sqDist = math.max(sumSquaredNorm - 2.0 * dotValue, 0.0) - val precisionBound2 = EPSILON * (sumSquaredNorm + 2.0 * math.abs(dotValue)) / -(sqDist + EPSILON) - if (precisionBound2 > precision) { -sqDist = Vectors.sqdist(v1, v2) - } -} else { +if (v1.isInstanceOf[DenseVector] && v2.isInstanceOf[DenseVector]) { sqDist = Vectors.sqdist(v1, v2) +} else
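To summarize the control flow this patch changes, here is an illustrative restatement in plain Scala (not the MLlib source): dense-dense pairs now go straight to the exact squared distance, while the precision-bound shortcut is kept for cases involving a sparse vector. The helper names, the epsilon value, and the collapsed LOGIC2/LOGIC3 fallback are simplifications.

```scala
object FastSquaredDistanceSketch {
  private val EPSILON = 2.220446049250313e-16   // double-precision machine epsilon

  def sqdist(a: Array[Double], b: Array[Double]): Double =
    a.zip(b).map { case (x, y) => (x - y) * (x - y) }.sum

  def dot(a: Array[Double], b: Array[Double]): Double =
    a.zip(b).map { case (x, y) => x * y }.sum

  def fastSquaredDistance(v1: Array[Double], norm1: Double, v1Sparse: Boolean,
                          v2: Array[Double], norm2: Double, v2Sparse: Boolean,
                          precision: Double = 1e-6): Double = {
    if (!v1Sparse && !v2Sparse) {
      // New dense-dense fast path: exact sqdist, no precision-bound bookkeeping.
      sqdist(v1, v2)
    } else {
      val sumSquaredNorm = norm1 * norm1 + norm2 * norm2
      val normDiff = norm1 - norm2
      val bound1 = 2.0 * EPSILON * sumSquaredNorm / (normDiff * normDiff + EPSILON)
      if (bound1 < precision) sumSquaredNorm - 2.0 * dot(v1, v2)   // LOGIC1: inner-product shortcut
      else sqdist(v1, v2)                                          // LOGIC2/3 fall back to exact sqdist here
    }
  }
}
```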