spark git commit: [SPARK-22313][PYTHON] Mark/print deprecation warnings as DeprecationWarning for deprecated APIs
Repository: spark
Updated Branches:
  refs/heads/master 884d4f95f -> d9798c834


[SPARK-22313][PYTHON] Mark/print deprecation warnings as DeprecationWarning for deprecated APIs

## What changes were proposed in this pull request?

This PR proposes to mark the existing warnings as `DeprecationWarning` and print out warnings for deprecated functions. This could actually be useful for Spark app developers. I use (an old) PyCharm, and this IDE can detect this specific `DeprecationWarning` in some cases:

**Before**

https://user-images.githubusercontent.com/6477701/31762664-df68d9f8-b4f6-11e7-8773-f0468f70a2cc.png

**After**

https://user-images.githubusercontent.com/6477701/31762662-de4d6868-b4f6-11e7-98dc-3c8446a0c28a.png

For console usage, `DeprecationWarning` is usually disabled (see https://docs.python.org/2/library/warnings.html#warning-categories and https://docs.python.org/3/library/warnings.html#warning-categories):

```
>>> import warnings
>>> filter(lambda f: f[2] == DeprecationWarning, warnings.filters)
[('ignore', <_sre.SRE_Pattern object at 0x10ba58c00>, <type 'exceptions.DeprecationWarning'>, <_sre.SRE_Pattern object at 0x10bb04138>, 0), ('ignore', None, <type 'exceptions.DeprecationWarning'>, None, 0)]
```

so it won't actually mess up the terminal much unless this is intended. If it is intentionally enabled, it shows up as below:

```
>>> import warnings
>>> warnings.simplefilter('always', DeprecationWarning)
>>>
>>> from pyspark.sql import functions
>>> functions.approxCountDistinct("a")
.../spark/python/pyspark/sql/functions.py:232: DeprecationWarning: Deprecated in 2.1, use approx_count_distinct instead.
  "Deprecated in 2.1, use approx_count_distinct instead.", DeprecationWarning)
...
```

These instances were found by:

```
cd python/pyspark
grep -r "Deprecated" .
grep -r "deprecated" .
grep -r "deprecate" .
```

## How was this patch tested?

Manually tested.

Author: hyukjinkwon

Closes #19535 from HyukjinKwon/deprecated-warning.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d9798c83
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d9798c83
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d9798c83

Branch: refs/heads/master
Commit: d9798c834f3fed060cfd18a8d38c398cb2efcc82
Parents: 884d4f9
Author: hyukjinkwon
Authored: Tue Oct 24 12:44:47 2017 +0900
Committer: hyukjinkwon
Committed: Tue Oct 24 12:44:47 2017 +0900

----------------------------------------------------------------------
 python/pyspark/ml/util.py              |  8 +++-
 python/pyspark/mllib/classification.py |  2 +-
 python/pyspark/mllib/evaluation.py     |  6 +--
 python/pyspark/mllib/regression.py     |  8 ++--
 python/pyspark/sql/dataframe.py        |  3 ++
 python/pyspark/sql/functions.py        | 18
 python/pyspark/streaming/flume.py      | 14 +-
 python/pyspark/streaming/kafka.py      | 72 +
 8 files changed, 110 insertions(+), 21 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/d9798c83/python/pyspark/ml/util.py
----------------------------------------------------------------------
diff --git a/python/pyspark/ml/util.py b/python/pyspark/ml/util.py
index 6777291..c3c47bd 100644
--- a/python/pyspark/ml/util.py
+++ b/python/pyspark/ml/util.py
@@ -175,7 +175,9 @@ class JavaMLWriter(MLWriter):
 
         .. note:: Deprecated in 2.1 and will be removed in 3.0, use session instead.
         """
-        warnings.warn("Deprecated in 2.1 and will be removed in 3.0, use session instead.")
+        warnings.warn(
+            "Deprecated in 2.1 and will be removed in 3.0, use session instead.",
+            DeprecationWarning)
         self._jwrite.context(sqlContext._ssql_ctx)
         return self
 
@@ -256,7 +258,9 @@ class JavaMLReader(MLReader):
 
         .. note:: Deprecated in 2.1 and will be removed in 3.0, use session instead.
         """
-        warnings.warn("Deprecated in 2.1 and will be removed in 3.0, use session instead.")
+        warnings.warn(
+            "Deprecated in 2.1 and will be removed in 3.0, use session instead.",
+            DeprecationWarning)
         self._jread.context(sqlContext._ssql_ctx)
         return self
 

http://git-wip-us.apache.org/repos/asf/spark/blob/d9798c83/python/pyspark/mllib/classification.py
----------------------------------------------------------------------
diff --git a/python/pyspark/mllib/classification.py b/python/pyspark/mllib/classification.py
index e04eeb2..cce703d 100644
--- a/python/pyspark/mllib/classification.py
+++ b/python/pyspark/mllib/classification.py
@@ -311,7 +311,7 @@ class LogisticRegressionWithSGD(object):
         """
         warnings.warn(
             "Deprecated in 2.0.0. Use ml.classification.LogisticRegression or "
-            "LogisticRegressionWithLBFGS.")
+
spark git commit: [SPARK-21912][SQL][FOLLOW-UP] ORC/Parquet table should not create invalid column names
Repository: spark
Updated Branches:
  refs/heads/master f6290aea2 -> 884d4f95f


[SPARK-21912][SQL][FOLLOW-UP] ORC/Parquet table should not create invalid column names

## What changes were proposed in this pull request?

During [SPARK-21912](https://issues.apache.org/jira/browse/SPARK-21912), we skipped testing 'ADD COLUMNS' on ORC tables due to an ORC limitation. Since [SPARK-21929](https://issues.apache.org/jira/browse/SPARK-21929) is resolved now, we can test both `ORC` and `PARQUET` completely.

## How was this patch tested?

Pass the updated test case.

Author: Dongjoon Hyun

Closes #19562 from dongjoon-hyun/SPARK-21912-2.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/884d4f95
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/884d4f95
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/884d4f95

Branch: refs/heads/master
Commit: 884d4f95f7ebfaa9d8c57cf770d10a2c6ab82d62
Parents: f6290ae
Author: Dongjoon Hyun
Authored: Mon Oct 23 17:21:49 2017 -0700
Committer: gatorsmile
Committed: Mon Oct 23 17:21:49 2017 -0700

----------------------------------------------------------------------
 .../apache/spark/sql/hive/execution/SQLQuerySuite.scala | 11 ---
 1 file changed, 4 insertions(+), 7 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/884d4f95/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
----------------------------------------------------------------------
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
index 1cf1c5c..39e918c 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
@@ -2031,8 +2031,8 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton {
 
   test("SPARK-21912 ORC/Parquet table should not create invalid column names") {
     Seq(" ", ",", ";", "{", "}", "(", ")", "\n", "\t", "=").foreach { name =>
-      withTable("t21912") {
-        Seq("ORC", "PARQUET").foreach { source =>
+      Seq("ORC", "PARQUET").foreach { source =>
+        withTable("t21912") {
           val m = intercept[AnalysisException] {
             sql(s"CREATE TABLE t21912(`col$name` INT) USING $source")
           }.getMessage
@@ -2049,15 +2049,12 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton {
           }.getMessage
           assert(m3.contains(s"contains invalid character(s)"))
         }
-      }
-      // TODO: After SPARK-21929, we need to check ORC, too.
-      Seq("PARQUET").foreach { source =>
         sql(s"CREATE TABLE t21912(`col` INT) USING $source")
-        val m = intercept[AnalysisException] {
+        val m4 = intercept[AnalysisException] {
           sql(s"ALTER TABLE t21912 ADD COLUMNS(`col$name` INT)")
         }.getMessage
-        assert(m.contains(s"contains invalid character(s)"))
+        assert(m4.contains(s"contains invalid character(s)"))
       }
     }
   }
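As a rough illustration of the behaviour the updated test locks down, here is a hedged PySpark sketch. The session setup, table names, and offending column names are illustrative (the real suite runs against a Hive-enabled test session); only the `AnalysisException` type and the CREATE/ALTER statements mirror the test above.

```python
from pyspark.sql import SparkSession
from pyspark.sql.utils import AnalysisException

# Assumes a Hive-enabled session, analogous to the suite's TestHiveSingleton.
spark = SparkSession.builder.enableHiveSupport().getOrCreate()

# Creating a table with a column name containing an invalid character should fail
# for both file formats now that ORC column-name validation works.
for source in ["ORC", "PARQUET"]:
    try:
        spark.sql("CREATE TABLE demo_t21912(`col,bad` INT) USING %s" % source)
    except AnalysisException as e:
        print(source, "rejected the column name:", str(e))

# ADD COLUMNS is validated the same way.
spark.sql("CREATE TABLE demo_ok(`col` INT) USING PARQUET")
try:
    spark.sql("ALTER TABLE demo_ok ADD COLUMNS(`col bad` INT)")
except AnalysisException as e:
    print("ADD COLUMNS rejected the column name:", str(e))
```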
spark git commit: [SPARK-22285][SQL] Change implementation of ApproxCountDistinctForIntervals to TypedImperativeAggregate
Repository: spark
Updated Branches:
  refs/heads/master 5a5b6b785 -> f6290aea2


[SPARK-22285][SQL] Change implementation of ApproxCountDistinctForIntervals to TypedImperativeAggregate

## What changes were proposed in this pull request?

The current implementation of `ApproxCountDistinctForIntervals` is an `ImperativeAggregate`. The number of `aggBufferAttributes` is the total number of words in the hllppHelper array. Each hllppHelper has 52 words under the default relativeSD. Since this aggregate function is used in equi-height histogram generation, and the number of buckets in the histogram is usually hundreds, the number of `aggBufferAttributes` can easily reach tens of thousands or even more. This leads to a huge method in codegen and causes an error:

```
org.codehaus.janino.JaninoRuntimeException: Code of method "apply(Lorg/apache/spark/sql/catalyst/InternalRow;)Lorg/apache/spark/sql/catalyst/expressions/UnsafeRow;" of class "org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection" grows beyond 64 KB.
```

Besides, huge generated methods also result in a performance regression. In this PR, we change its implementation to `TypedImperativeAggregate`. After the fix, `ApproxCountDistinctForIntervals` can deal with thousands of endpoints without throwing a codegen error, and performance improves from `20 sec` to `2 sec` in a test case with 500 endpoints.

## How was this patch tested?

Tested by an added test case and existing tests.

Author: Zhenhua Wang

Closes #19506 from wzhfy/change_forIntervals_typedAgg.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f6290aea
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f6290aea
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f6290aea

Branch: refs/heads/master
Commit: f6290aea24efeb238db88bdaef4e24d50740ca4c
Parents: 5a5b6b7
Author: Zhenhua Wang
Authored: Mon Oct 23 23:02:36 2017 +0100
Committer: Wenchen Fan
Committed: Mon Oct 23 23:02:36 2017 +0100

----------------------------------------------------------------------
 .../ApproxCountDistinctForIntervals.scala       | 97 +++-
 .../ApproxCountDistinctForIntervalsSuite.scala  | 34 +++
 ...roxCountDistinctForIntervalsQuerySuite.scala | 61
 3 files changed, 130 insertions(+), 62 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/f6290aea/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproxCountDistinctForIntervals.scala
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproxCountDistinctForIntervals.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproxCountDistinctForIntervals.scala
index 096d1b3..d4421ca 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproxCountDistinctForIntervals.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproxCountDistinctForIntervals.scala
@@ -22,9 +22,10 @@ import java.util
 
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
 import org.apache.spark.sql.catalyst.analysis.TypeCheckResult.{TypeCheckFailure, TypeCheckSuccess}
-import org.apache.spark.sql.catalyst.expressions.{AttributeReference, ExpectsInputTypes, Expression}
+import org.apache.spark.sql.catalyst.expressions.{ExpectsInputTypes, Expression, GenericInternalRow}
 import org.apache.spark.sql.catalyst.util.{ArrayData, GenericArrayData, HyperLogLogPlusPlusHelper}
 import org.apache.spark.sql.types._
+import org.apache.spark.unsafe.Platform
 
 /**
  * This function counts the approximate number of distinct values (ndv) in
@@ -46,16 +47,7 @@ case class ApproxCountDistinctForIntervals(
     relativeSD: Double = 0.05,
     mutableAggBufferOffset: Int = 0,
     inputAggBufferOffset: Int = 0)
-  extends ImperativeAggregate with ExpectsInputTypes {
-
-  def this(child: Expression, endpointsExpression: Expression) = {
-    this(
-      child = child,
-      endpointsExpression = endpointsExpression,
-      relativeSD = 0.05,
-      mutableAggBufferOffset = 0,
-      inputAggBufferOffset = 0)
-  }
+  extends TypedImperativeAggregate[Array[Long]] with ExpectsInputTypes {
 
   def this(child: Expression, endpointsExpression: Expression, relativeSD: Expression) = {
     this(
@@ -114,29 +106,11 @@ case class ApproxCountDistinctForIntervals(
   private lazy val totalNumWords = numWordsPerHllpp * hllppArray.length
 
   /** Allocate enough words to store all registers. */
-  override lazy val aggBufferAttributes: Seq[AttributeReference] = {
-    Seq.tabulate(totalNumWords) { i =>
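A back-of-the-envelope sketch of why the row-based buffer explodes, using only the figures quoted in the description (52 words per HLL++ helper at the default relativeSD, roughly one helper per interval between consecutive endpoints); the exact constants in the real code may differ.

```python
# Rough sizing sketch under the assumptions stated above.
WORDS_PER_HELPER = 52  # quoted figure for the default relativeSD (0.05)


def agg_buffer_words(num_endpoints):
    # n endpoints define roughly n - 1 consecutive intervals, one HLL++ helper each.
    num_intervals = max(num_endpoints - 1, 0)
    return num_intervals * WORDS_PER_HELPER


for endpoints in (100, 500, 1000):
    print(endpoints, "endpoints ->", agg_buffer_words(endpoints), "buffer words")
# 500 endpoints -> ~26,000 aggBufferAttributes, which is what pushed the generated
# projection method past janino's 64 KB limit before the TypedImperativeAggregate change.
```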
spark git commit: [SPARK-22303][SQL] Handle Oracle specific jdbc types in OracleDialect
Repository: spark
Updated Branches:
  refs/heads/master 57accf6e3 -> 5a5b6b785


[SPARK-22303][SQL] Handle Oracle specific jdbc types in OracleDialect

TIMESTAMP (-101), BINARY_DOUBLE (101) and BINARY_FLOAT (100) are handled in OracleDialect.

## What changes were proposed in this pull request?

When an Oracle table contains columns whose type is BINARY_FLOAT or BINARY_DOUBLE, Spark SQL fails to load the table with a SQLException:

```
java.sql.SQLException: Unsupported type 101
 at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$.org$apache$spark$sql$execution$datasources$jdbc$JdbcUtils$$getCatalystType(JdbcUtils.scala:235)
 at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$$anonfun$8.apply(JdbcUtils.scala:292)
 at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$$anonfun$8.apply(JdbcUtils.scala:292)
 at scala.Option.getOrElse(Option.scala:121)
 at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$.getSchema(JdbcUtils.scala:291)
 at org.apache.spark.sql.execution.datasources.jdbc.JDBCRDD$.resolveTable(JDBCRDD.scala:64)
 at org.apache.spark.sql.execution.datasources.jdbc.JDBCRelation.<init>(JDBCRelation.scala:113)
 at org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider.createRelation(JdbcRelationProvider.scala:47)
 at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:306)
 at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:178)
 at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:146)
```

## How was this patch tested?

I updated a UT which covers type conversion tests for types (-101, 100, 101); on top of that, I tested this change against an actual table with those columns and it was able to read and write to the table.

Author: Kohki Nishio

Closes #19548 from taroplus/oracle_sql_types_101.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5a5b6b78
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5a5b6b78
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5a5b6b78

Branch: refs/heads/master
Commit: 5a5b6b78517b526771ee5b579d56aa1daa4b3ef1
Parents: 57accf6
Author: Kohki Nishio
Authored: Mon Oct 23 09:55:46 2017 -0700
Committer: gatorsmile
Committed: Mon Oct 23 09:55:46 2017 -0700

----------------------------------------------------------------------
 .../spark/sql/jdbc/OracleIntegrationSuite.scala | 43 ---
 .../execution/datasources/jdbc/JdbcUtils.scala  |  1 -
 .../apache/spark/sql/jdbc/OracleDialect.scala   | 44 +++-
 .../org/apache/spark/sql/jdbc/JDBCSuite.scala   |  6 +++
 4 files changed, 68 insertions(+), 26 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/5a5b6b78/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/OracleIntegrationSuite.scala
----------------------------------------------------------------------
diff --git a/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/OracleIntegrationSuite.scala b/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/OracleIntegrationSuite.scala
index 7680ae3..9034318 100644
--- a/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/OracleIntegrationSuite.scala
+++ b/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/OracleIntegrationSuite.scala
@@ -21,7 +21,7 @@ import java.sql.{Connection, Date, Timestamp}
 import java.util.Properties
 import java.math.BigDecimal
 
-import org.apache.spark.sql.{DataFrame, Row}
+import org.apache.spark.sql.{DataFrame, Row, SaveMode}
 import org.apache.spark.sql.execution.{WholeStageCodegenExec, RowDataSourceScanExec}
 import org.apache.spark.sql.test.SharedSQLContext
 import org.apache.spark.sql.types._
@@ -52,7 +52,7 @@ class OracleIntegrationSuite extends DockerJDBCIntegrationSuite with SharedSQLCo
   import testImplicits._
 
   override val db = new DatabaseOnDocker {
-    override val imageName = "wnameless/oracle-xe-11g:14.04.4"
+    override val imageName = "wnameless/oracle-xe-11g:16.04"
     override val env = Map(
       "ORACLE_ROOT_PASSWORD" -> "oracle"
     )
@@ -104,15 +104,18 @@ class OracleIntegrationSuite extends DockerJDBCIntegrationSuite with SharedSQLCo
       """.stripMargin.replaceAll("\n", " "))
 
-    conn.prepareStatement("CREATE TABLE numerics (b DECIMAL(1), f DECIMAL(3, 2), i DECIMAL(10))").executeUpdate();
+    conn.prepareStatement("CREATE TABLE numerics (b DECIMAL(1), f DECIMAL(3, 2), i DECIMAL(10))").executeUpdate()
     conn.prepareStatement(
-      "INSERT INTO numerics VALUES (4, 1.23, 99)").executeUpdate();
-    conn.commit();
+      "INSERT INTO numerics VALUES (4, 1.23, 99)").executeUpdate()
+    conn.commit()
+
+    conn.prepareStatement("CREATE TABLE oracle_types (d
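For reference, a hedged PySpark sketch of the kind of read that previously surfaced `Unsupported type 100/101`. The JDBC URL, credentials, and driver setup are placeholders (the `oracle_types` table name is borrowed from the test above), and the exact Catalyst types the dialect maps these columns to are not spelled out in this excerpt.

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Placeholder connection details; the table is assumed to contain BINARY_FLOAT /
# BINARY_DOUBLE columns, which previously made schema resolution throw SQLException.
df = (spark.read
      .format("jdbc")
      .option("url", "jdbc:oracle:thin:@//dbhost:1521/XE")
      .option("dbtable", "oracle_types")
      .option("user", "system")
      .option("password", "oracle")
      .load())

# With the dialect change these columns resolve to Spark numeric/timestamp types
# instead of failing during resolveTable.
df.printSchema()
```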
spark git commit: [SPARK-22319][CORE][BACKPORT-2.2] call loginUserFromKeytab before accessing hdfs
Repository: spark
Updated Branches:
  refs/heads/branch-2.2 f8c83fdc5 -> bf8163f5b


[SPARK-22319][CORE][BACKPORT-2.2] call loginUserFromKeytab before accessing hdfs

In SparkSubmit, call loginUserFromKeytab before attempting to make RPC calls to the NameNode.

Same as https://github.com/apache/spark/pull/19540, but for branch-2.2.

Manually tested for master as described in https://github.com/apache/spark/pull/19540.

Author: Steven Rand

Closes #19554 from sjrand/SPARK-22319-branch-2.2.

Change-Id: Ic550a818fd6a3f38b356ac48029942d463738458


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/bf8163f5
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/bf8163f5
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/bf8163f5

Branch: refs/heads/branch-2.2
Commit: bf8163f5be55a94e02849ccbaf755702a2c6c68f
Parents: f8c83fd
Author: Steven Rand
Authored: Mon Oct 23 14:26:03 2017 +0800
Committer: jerryshao
Committed: Mon Oct 23 14:26:03 2017 +0800

----------------------------------------------------------------------
 .../org/apache/spark/deploy/SparkSubmit.scala | 38 ++--
 1 file changed, 19 insertions(+), 19 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/bf8163f5/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
----------------------------------------------------------------------
diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
index 86d578e..4f2f2c1 100644
--- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
@@ -316,6 +316,25 @@ object SparkSubmit extends CommandLineUtils {
       RPackageUtils.checkAndBuildRPackage(args.jars, printStream, args.verbose)
     }
 
+    // assure a keytab is available from any place in a JVM
+    if (clusterManager == YARN || clusterManager == LOCAL) {
+      if (args.principal != null) {
+        require(args.keytab != null, "Keytab must be specified when principal is specified")
+        if (!new File(args.keytab).exists()) {
+          throw new SparkException(s"Keytab file: ${args.keytab} does not exist")
+        } else {
+          // Add keytab and principal configurations in sysProps to make them available
+          // for later use; e.g. in spark sql, the isolated class loader used to talk
+          // to HiveMetastore will use these settings. They will be set as Java system
+          // properties and then loaded by SparkConf
+          sysProps.put("spark.yarn.keytab", args.keytab)
+          sysProps.put("spark.yarn.principal", args.principal)
+
+          UserGroupInformation.loginUserFromKeytab(args.principal, args.keytab)
+        }
+      }
+    }
+
     // In client mode, download remote files.
     var localPrimaryResource: String = null
     var localJars: String = null
@@ -582,25 +601,6 @@ object SparkSubmit extends CommandLineUtils {
       }
     }
 
-    // assure a keytab is available from any place in a JVM
-    if (clusterManager == YARN || clusterManager == LOCAL) {
-      if (args.principal != null) {
-        require(args.keytab != null, "Keytab must be specified when principal is specified")
-        if (!new File(args.keytab).exists()) {
-          throw new SparkException(s"Keytab file: ${args.keytab} does not exist")
-        } else {
-          // Add keytab and principal configurations in sysProps to make them available
-          // for later use; e.g. in spark sql, the isolated class loader used to talk
-          // to HiveMetastore will use these settings. They will be set as Java system
-          // properties and then loaded by SparkConf
-          sysProps.put("spark.yarn.keytab", args.keytab)
-          sysProps.put("spark.yarn.principal", args.principal)
-
-          UserGroupInformation.loginUserFromKeytab(args.principal, args.keytab)
-        }
-      }
-    }
-
     // In yarn-cluster mode, use yarn.Client as a wrapper around the user class
     if (isYarnCluster) {
       childMainClass = "org.apache.spark.deploy.yarn.Client"
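To make the configuration path concrete: the principal and keytab are normally supplied at submit time (e.g. `spark-submit --principal user@REALM --keytab /path/to/user.keytab ...`), and after this change SparkSubmit logs in via `loginUserFromKeytab` before any NameNode RPC and records the values under the conf keys shown in the diff. A small, hedged PySpark sketch that simply reads those keys back; the key names come from the diff, everything else is illustrative.

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
conf = spark.sparkContext.getConf()

# Illustrative check only: in a kerberized YARN deployment submitted with
# --principal/--keytab, these keys should be populated by SparkSubmit.
print("spark.yarn.keytab    =", conf.get("spark.yarn.keytab", "<not set>"))
print("spark.yarn.principal =", conf.get("spark.yarn.principal", "<not set>"))
```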