Repository: spark
Updated Branches:
  refs/heads/master 91c239768 -> b2074b664
http://git-wip-us.apache.org/repos/asf/spark/blob/b2074b66/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala
----------------------------------------------------------------------
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala
index f8204e1..9b7afd4 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala
@@ -45,7 +45,6 @@ import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec
 import org.apache.spark.sql.catalyst.expressions.Expression
 import org.apache.spark.sql.catalyst.parser.{CatalystSqlParser, ParseException}
 import org.apache.spark.sql.execution.QueryExecutionException
-import org.apache.spark.sql.execution.command.DDLUtils
 import org.apache.spark.sql.types.{StructField, StructType}
 import org.apache.spark.util.{CircularBuffer, Utils}
 
@@ -392,20 +391,7 @@ private[hive] class HiveClientImpl(
         createTime = h.getTTable.getCreateTime.toLong * 1000,
         lastAccessTime = h.getLastAccessTime.toLong * 1000,
         storage = CatalogStorageFormat(
-          locationUri = shim.getDataLocation(h).filterNot { _ =>
-            // SPARK-15269: Persisted data source tables always store the location URI as a SerDe
-            // property named "path" instead of standard Hive `dataLocation`, because Hive only
-            // allows directory paths as location URIs while Spark SQL data source tables also
-            // allows file paths. So the standard Hive `dataLocation` is meaningless for Spark SQL
-            // data source tables.
-            DDLUtils.isDatasourceTable(properties) &&
-              h.getTableType == HiveTableType.EXTERNAL_TABLE &&
-              // Spark SQL may also save external data source in Hive compatible format when
-              // possible, so that these tables can be directly accessed by Hive. For these tables,
-              // `dataLocation` is still necessary. Here we also check for input format class
-              // because only these Hive compatible tables set this field.
-              h.getInputFormatClass == null
-          },
+          locationUri = shim.getDataLocation(h),
           inputFormat = Option(h.getInputFormatClass).map(_.getName),
           outputFormat = Option(h.getOutputFormatClass).map(_.getName),
           serde = Option(h.getSerializationLib),
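The filter removed above existed because persisted data source tables record their location in a "path" entry of the storage (SerDe) properties rather than in Hive's standard dataLocation (SPARK-15269). With getDataLocation now passed through unchanged, a reader that still needs the effective location of such a table can fall back on that property. A minimal sketch, assuming locationUri is an Option[String] on this branch; the helper name is illustrative and not part of the patch:

    import org.apache.spark.sql.catalyst.catalog.CatalogTable

    // Prefer the data source "path" property, fall back to the raw Hive location.
    def effectiveLocation(table: CatalogTable): Option[String] =
      table.storage.properties.get("path").orElse(table.storage.locationUri)
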
http://git-wip-us.apache.org/repos/asf/spark/blob/b2074b66/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala
----------------------------------------------------------------------
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala
index c74d948..286197b 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala
@@ -34,7 +34,6 @@ import org.apache.hadoop.mapreduce.lib.input.{FileInputFormat, FileSplit}
 import org.apache.spark.sql.{Row, SparkSession}
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions._
-import org.apache.spark.sql.execution.command.CreateDataSourceTableUtils
 import org.apache.spark.sql.execution.datasources._
 import org.apache.spark.sql.hive.{HiveInspectors, HiveShim}
 import org.apache.spark.sql.sources.{Filter, _}
@@ -222,7 +221,7 @@ private[orc] class OrcOutputWriter(
   private lazy val recordWriter: RecordWriter[NullWritable, Writable] = {
     recordWriterInstantiated = true
 
-    val uniqueWriteJobId = conf.get(CreateDataSourceTableUtils.DATASOURCE_WRITEJOBUUID)
+    val uniqueWriteJobId = conf.get(WriterContainer.DATASOURCE_WRITEJOBUUID)
     val taskAttemptId = context.getTaskAttemptID
     val partition = taskAttemptId.getTaskID.getId
     val bucketString = bucketId.map(BucketingUtils.bucketIdToString).getOrElse("")
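DATASOURCE_WRITEJOBUUID has simply moved from CreateDataSourceTableUtils to WriterContainer; it is still a per-job UUID that the write path places in the Hadoop configuration so that each output writer can build file names that do not collide across concurrent or retried write jobs appending to the same directory. A rough sketch of how the values read above are typically combined; the exact file-name pattern is an assumption, not the ORC writer's code:

    // Hypothetical naming helper combining task id, job UUID and bucket suffix.
    def outputFileName(jobUuid: String, taskId: Int, bucketString: String, extension: String): String =
      f"part-r-$taskId%05d-$jobUuid$bucketString$extension"

    // e.g. outputFileName(uuid, 3, "", ".orc") yields "part-r-00003-<uuid>.orc"
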
http://git-wip-us.apache.org/repos/asf/spark/blob/b2074b66/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala
----------------------------------------------------------------------
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala
index 3892fe8..571ba49 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala
@@ -26,9 +26,9 @@ import org.apache.hadoop.fs.Path
 import org.apache.spark.sql._
 import org.apache.spark.sql.catalyst.TableIdentifier
 import org.apache.spark.sql.catalyst.catalog.{CatalogStorageFormat, CatalogTable, CatalogTableType}
-import org.apache.spark.sql.catalyst.parser.CatalystSqlParser
-import org.apache.spark.sql.execution.command.CreateDataSourceTableUtils._
 import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRelation}
+import org.apache.spark.sql.hive.HiveExternalCatalog._
+import org.apache.spark.sql.hive.client.HiveClient
 import org.apache.spark.sql.hive.test.TestHiveSingleton
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.test.SQLTestUtils
@@ -49,6 +49,10 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with TestHiv
     jsonFilePath = Utils.getSparkClassLoader.getResource("sample.json").getFile
   }
 
+  // To test `HiveExternalCatalog`, we need to read the raw table metadata (schema, partition
+  // columns and bucket specification are still in table properties) from hive client.
+  private def hiveClient: HiveClient = sharedState.asInstanceOf[HiveSharedState].metadataHive
+
   test("persistent JSON table") {
     withTable("jsonTable") {
       sql(
@@ -697,18 +701,18 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with TestHiv
     withTable("wide_schema") {
       withTempDir { tempDir =>
         // We will need 80 splits for this schema if the threshold is 4000.
-        val schema = StructType((1 to 5000).map(i => StructField(s"c_$i", StringType, true)))
-
-        // Manually create a metastore data source table.
-        createDataSourceTable(
-          sparkSession = spark,
-          tableIdent = TableIdentifier("wide_schema"),
+        val schema = StructType((1 to 5000).map(i => StructField(s"c_$i", StringType)))
+
+        val tableDesc = CatalogTable(
+          identifier = TableIdentifier("wide_schema"),
+          tableType = CatalogTableType.EXTERNAL,
+          storage = CatalogStorageFormat.empty.copy(
+            properties = Map("path" -> tempDir.getCanonicalPath)
+          ),
           schema = schema,
-          partitionColumns = Array.empty[String],
-          bucketSpec = None,
-          provider = "json",
-          options = Map("path" -> tempDir.getCanonicalPath),
-          isExternal = false)
+          provider = Some("json")
+        )
+        spark.sessionState.catalog.createTable(tableDesc, ignoreIfExists = false)
 
         sessionState.refreshTable("wide_schema")
@@ -741,7 +745,7 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with TestHiv
             DATASOURCE_SCHEMA -> schema.json,
             "EXTERNAL" -> "FALSE"))
 
-      sharedState.externalCatalog.createTable(hiveTable, ignoreIfExists = false)
+      hiveClient.createTable(hiveTable, ignoreIfExists = false)
 
       sessionState.refreshTable(tableName)
       val actualSchema = table(tableName).schema
@@ -759,7 +763,7 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with TestHiv
     withTable(tableName) {
       df.write.format("parquet").partitionBy("d", "b").saveAsTable(tableName)
       sessionState.refreshTable(tableName)
-      val metastoreTable = sharedState.externalCatalog.getTable("default", tableName)
+      val metastoreTable = hiveClient.getTable("default", tableName)
       val expectedPartitionColumns = StructType(df.schema("d") :: df.schema("b") :: Nil)
 
       val numPartCols = metastoreTable.properties(DATASOURCE_SCHEMA_NUMPARTCOLS).toInt
@@ -794,7 +798,7 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with TestHiv
         .sortBy("c")
         .saveAsTable(tableName)
       sessionState.refreshTable(tableName)
-      val metastoreTable = sharedState.externalCatalog.getTable("default", tableName)
+      val metastoreTable = hiveClient.getTable("default", tableName)
       val expectedBucketByColumns = StructType(df.schema("d") :: df.schema("b") :: Nil)
       val expectedSortByColumns = StructType(df.schema("c") :: Nil)
@@ -985,35 +989,37 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with TestHiv
     withTempDir { tempPath =>
       val schema = StructType((1 to 5).map(i => StructField(s"c_$i", StringType)))
 
-      createDataSourceTable(
-        sparkSession = spark,
-        tableIdent = TableIdentifier("not_skip_hive_metadata"),
+      val tableDesc1 = CatalogTable(
+        identifier = TableIdentifier("not_skip_hive_metadata"),
+        tableType = CatalogTableType.EXTERNAL,
+        storage = CatalogStorageFormat.empty.copy(
+          properties = Map("path" -> tempPath.getCanonicalPath, "skipHiveMetadata" -> "false")
+        ),
         schema = schema,
-        partitionColumns = Array.empty[String],
-        bucketSpec = None,
-        provider = "parquet",
-        options = Map("path" -> tempPath.getCanonicalPath, "skipHiveMetadata" -> "false"),
-        isExternal = false)
+        provider = Some("parquet")
+      )
+      spark.sessionState.catalog.createTable(tableDesc1, ignoreIfExists = false)
 
       // As a proxy for verifying that the table was stored in Hive compatible format,
       // we verify that each column of the table is of native type StringType.
-      assert(sharedState.externalCatalog.getTable("default", "not_skip_hive_metadata").schema
+      assert(hiveClient.getTable("default", "not_skip_hive_metadata").schema
         .forall(_.dataType == StringType))
 
-      createDataSourceTable(
-        sparkSession = spark,
-        tableIdent = TableIdentifier("skip_hive_metadata"),
+      val tableDesc2 = CatalogTable(
+        identifier = TableIdentifier("skip_hive_metadata", Some("default")),
+        tableType = CatalogTableType.EXTERNAL,
+        storage = CatalogStorageFormat.empty.copy(
+          properties = Map("path" -> tempPath.getCanonicalPath, "skipHiveMetadata" -> "true")
+        ),
         schema = schema,
-        partitionColumns = Array.empty[String],
-        bucketSpec = None,
-        provider = "parquet",
-        options = Map("path" -> tempPath.getCanonicalPath, "skipHiveMetadata" -> "true"),
-        isExternal = false)
+        provider = Some("parquet")
+      )
+      spark.sessionState.catalog.createTable(tableDesc2, ignoreIfExists = false)
 
       // As a proxy for verifying that the table was stored in SparkSQL format,
       // we verify that the table has a column type as array of StringType.
-      assert(sharedState.externalCatalog.getTable("default", "skip_hive_metadata")
-        .schema.forall(_.dataType == ArrayType(StringType)))
+      assert(hiveClient.getTable("default", "skip_hive_metadata").schema
+        .forall(_.dataType == ArrayType(StringType)))
     }
   }
@@ -1030,7 +1036,7 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with TestHiv
         """.stripMargin
       )
 
-      val metastoreTable = sharedState.externalCatalog.getTable("default", "t")
+      val metastoreTable = hiveClient.getTable("default", "t")
       assert(metastoreTable.properties(DATASOURCE_SCHEMA_NUMPARTCOLS).toInt === 1)
       assert(!metastoreTable.properties.contains(DATASOURCE_SCHEMA_NUMBUCKETS))
       assert(!metastoreTable.properties.contains(DATASOURCE_SCHEMA_NUMBUCKETCOLS))
@@ -1054,7 +1060,7 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with TestHiv
         """.stripMargin
       )
 
-      val metastoreTable = sharedState.externalCatalog.getTable("default", "t")
+      val metastoreTable = hiveClient.getTable("default", "t")
       assert(!metastoreTable.properties.contains(DATASOURCE_SCHEMA_NUMPARTCOLS))
       assert(metastoreTable.properties(DATASOURCE_SCHEMA_NUMBUCKETS).toInt === 2)
       assert(metastoreTable.properties(DATASOURCE_SCHEMA_NUMBUCKETCOLS).toInt === 1)
@@ -1076,7 +1082,7 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with TestHiv
         """.stripMargin
       )
 
-      val metastoreTable = sharedState.externalCatalog.getTable("default", "t")
+      val metastoreTable = hiveClient.getTable("default", "t")
       assert(!metastoreTable.properties.contains(DATASOURCE_SCHEMA_NUMPARTCOLS))
       assert(metastoreTable.properties(DATASOURCE_SCHEMA_NUMBUCKETS).toInt === 2)
      assert(metastoreTable.properties(DATASOURCE_SCHEMA_NUMBUCKETCOLS).toInt === 1)
@@ -1101,7 +1107,7 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with TestHiv
         """.stripMargin
      )
 
-      val metastoreTable = sharedState.externalCatalog.getTable("default", "t")
+      val metastoreTable = hiveClient.getTable("default", "t")
       assert(metastoreTable.properties(DATASOURCE_SCHEMA_NUMPARTCOLS).toInt === 1)
       assert(metastoreTable.properties(DATASOURCE_SCHEMA_NUMBUCKETS).toInt === 2)
       assert(metastoreTable.properties(DATASOURCE_SCHEMA_NUMBUCKETCOLS).toInt === 1)
@@ -1168,7 +1174,7 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with TestHiv
       )
       sql("insert into t values (2, 3, 4)")
       checkAnswer(table("t"), Seq(Row(1, 2, 3), Row(2, 3, 4)))
-      val catalogTable = sharedState.externalCatalog.getTable("default", "t")
+      val catalogTable = hiveClient.getTable("default", "t")
       // there should not be a lowercase key 'path' now
       assert(catalogTable.storage.properties.get("path").isEmpty)
       assert(catalogTable.storage.properties.get("PATH").isDefined)
@@ -1188,4 +1194,28 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with TestHiv
       }
     }
   }
+
+  test("read table with corrupted schema") {
+    try {
+      val schema = StructType(StructField("int", IntegerType, true) :: Nil)
+      val hiveTable = CatalogTable(
+        identifier = TableIdentifier("t", Some("default")),
+        tableType = CatalogTableType.MANAGED,
+        schema = new StructType,
+        storage = CatalogStorageFormat.empty,
+        properties = Map(
+          DATASOURCE_PROVIDER -> "json",
+          // no DATASOURCE_SCHEMA_NUMPARTS
+          DATASOURCE_SCHEMA_PART_PREFIX + 0 -> schema.json))
+
+      hiveClient.createTable(hiveTable, ignoreIfExists = false)
+
+      val e = intercept[AnalysisException] {
+        sharedState.externalCatalog.getTable("default", "t")
+      }.getMessage
+      assert(e.contains(s"Could not read schema from the hive metastore because it is corrupted"))
+    } finally {
+      hiveClient.dropTable("default", "t", ignoreIfNotExists = true, purge = true)
+    }
+  }
 }
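The new "read table with corrupted schema" test relies on the way a data source table's schema is persisted in the metastore: the schema JSON is split across numbered part properties, and a separate property records how many parts exist. If the part count is missing while parts are present, the metadata cannot be reassembled and reading the table fails with the error asserted above. A simplified sketch of that reassembly, using the property-key constants the suite imports; the helper itself is illustrative, not the HiveExternalCatalog implementation:

    import org.apache.spark.sql.AnalysisException
    import org.apache.spark.sql.hive.HiveExternalCatalog.{DATASOURCE_SCHEMA_NUMPARTS, DATASOURCE_SCHEMA_PART_PREFIX}
    import org.apache.spark.sql.types.{DataType, StructType}

    // Rebuild the schema JSON from numbered table properties, or fail if the
    // part count is absent (the "corrupted" case exercised by the test above).
    def readSchemaFromTableProperties(props: Map[String, String]): StructType = {
      val numParts = props.getOrElse(DATASOURCE_SCHEMA_NUMPARTS,
        throw new AnalysisException(
          "Could not read schema from the hive metastore because it is corrupted."))
      val json = (0 until numParts.toInt).map(i => props(DATASOURCE_SCHEMA_PART_PREFIX + i)).mkString
      DataType.fromJson(json).asInstanceOf[StructType]
    }
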
http://git-wip-us.apache.org/repos/asf/spark/blob/b2074b66/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveCommandSuite.scala
----------------------------------------------------------------------
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveCommandSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveCommandSuite.scala
index 5d51019..76aa84b 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveCommandSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveCommandSuite.scala
@@ -18,21 +18,32 @@ package org.apache.spark.sql.hive.execution
 import org.apache.spark.sql.{AnalysisException, QueryTest, Row, SaveMode}
+import org.apache.spark.sql.catalyst.TableIdentifier
 import org.apache.spark.sql.catalyst.analysis.NoSuchTableException
-import org.apache.spark.sql.execution.command.CreateDataSourceTableUtils._
+import org.apache.spark.sql.catalyst.catalog.{CatalogStorageFormat, CatalogTable, CatalogTableType}
 import org.apache.spark.sql.hive.test.TestHiveSingleton
 import org.apache.spark.sql.test.SQLTestUtils
+import org.apache.spark.sql.types.StructType
 
 class HiveCommandSuite extends QueryTest with SQLTestUtils with TestHiveSingleton {
   import testImplicits._
 
   protected override def beforeAll(): Unit = {
     super.beforeAll()
-    sql(
-      """
-        |CREATE TABLE parquet_tab1 (c1 INT, c2 STRING)
-        |USING org.apache.spark.sql.parquet.DefaultSource
-      """.stripMargin)
+
+    // Use catalog to create table instead of SQL string here, because we don't support specifying
+    // table properties for data source table with SQL API now.
+    hiveContext.sessionState.catalog.createTable(
+      CatalogTable(
+        identifier = TableIdentifier("parquet_tab1"),
+        tableType = CatalogTableType.MANAGED,
+        storage = CatalogStorageFormat.empty,
+        schema = new StructType().add("c1", "int").add("c2", "string"),
+        provider = Some("parquet"),
+        properties = Map("my_key1" -> "v1")
+      ),
+      ignoreIfExists = false
+    )
 
     sql(
       """
@@ -101,23 +112,14 @@ class HiveCommandSuite extends QueryTest with SQLTestUtils with TestHiveSingleto
 
   test("show tblproperties of data source tables - basic") {
     checkAnswer(
-      sql("SHOW TBLPROPERTIES parquet_tab1").filter(s"key = '$DATASOURCE_PROVIDER'"),
-      Row(DATASOURCE_PROVIDER, "org.apache.spark.sql.parquet.DefaultSource") :: Nil
+      sql("SHOW TBLPROPERTIES parquet_tab1").filter(s"key = 'my_key1'"),
+      Row("my_key1", "v1") :: Nil
     )
     checkAnswer(
-      sql(s"SHOW TBLPROPERTIES parquet_tab1($DATASOURCE_PROVIDER)"),
-      Row("org.apache.spark.sql.parquet.DefaultSource") :: Nil
+      sql(s"SHOW TBLPROPERTIES parquet_tab1('my_key1')"),
+      Row("v1") :: Nil
     )
-
-    checkAnswer(
-      sql("SHOW TBLPROPERTIES parquet_tab1").filter(s"key = '$DATASOURCE_SCHEMA_NUMPARTS'"),
-      Row(DATASOURCE_SCHEMA_NUMPARTS, "1") :: Nil
-    )
-
-    checkAnswer(
-      sql(s"SHOW TBLPROPERTIES parquet_tab1('$DATASOURCE_SCHEMA_NUMPARTS')"),
-      Row("1"))
   }
 
   test("show tblproperties for datasource table - errors") {

http://git-wip-us.apache.org/repos/asf/spark/blob/b2074b66/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala
----------------------------------------------------------------------
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala
index 970b688..f00a99b 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala
@@ -692,4 +692,27 @@ class HiveDDLSuite
       ))
     }
   }
+
+  test("datasource table property keys are not allowed") {
+    import org.apache.spark.sql.hive.HiveExternalCatalog.DATASOURCE_PREFIX
+
+    withTable("tbl") {
+      sql("CREATE TABLE tbl(a INT) STORED AS parquet")
+
+      val e = intercept[AnalysisException] {
+        sql(s"ALTER TABLE tbl SET TBLPROPERTIES ('${DATASOURCE_PREFIX}foo' = 'loser')")
+      }
+      assert(e.getMessage.contains(DATASOURCE_PREFIX + "foo"))
+
+      val e2 = intercept[AnalysisException] {
+        sql(s"ALTER TABLE tbl UNSET TBLPROPERTIES ('${DATASOURCE_PREFIX}foo')")
+      }
+      assert(e2.getMessage.contains(DATASOURCE_PREFIX + "foo"))
+
+      val e3 = intercept[AnalysisException] {
+        sql(s"CREATE TABLE tbl TBLPROPERTIES ('${DATASOURCE_PREFIX}foo'='anything')")
+      }
+      assert(e3.getMessage.contains(DATASOURCE_PREFIX + "foo"))
+    }
+  }
 }
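The new HiveDDLSuite test pins down the user-facing side of this change: table property keys under the reserved DATASOURCE_PREFIX may no longer be set or unset through DDL, since Spark itself owns them as internal metadata. A hypothetical sketch of the kind of guard the test exercises; this is not the actual command implementation, and the error text below is illustrative:

    import org.apache.spark.sql.AnalysisException
    import org.apache.spark.sql.hive.HiveExternalCatalog.DATASOURCE_PREFIX

    // Reject any user-supplied table property that uses the reserved prefix.
    def checkUserTableProperties(props: Map[String, String]): Unit =
      props.keys.find(_.startsWith(DATASOURCE_PREFIX)).foreach { key =>
        throw new AnalysisException(s"Cannot set or change the reserved table property: $key")
      }
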
http://git-wip-us.apache.org/repos/asf/spark/blob/b2074b66/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
----------------------------------------------------------------------
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
index e6fe47a..4ca882f 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
@@ -29,7 +29,6 @@ import org.apache.spark.sql.catalyst.TableIdentifier
 import org.apache.spark.sql.catalyst.analysis.{EliminateSubqueryAliases, FunctionRegistry}
 import org.apache.spark.sql.catalyst.catalog.CatalogTableType
 import org.apache.spark.sql.catalyst.parser.ParseException
-import org.apache.spark.sql.execution.command.CreateDataSourceTableUtils
 import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRelation}
 import org.apache.spark.sql.functions._
 import org.apache.spark.sql.hive.{HiveUtils, MetastoreRelation}
@@ -436,8 +435,7 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton {
             assert(r.options("path") === location)
           case None => // OK.
         }
-        assert(
-          catalogTable.properties(CreateDataSourceTableUtils.DATASOURCE_PROVIDER) === format)
+        assert(catalogTable.provider.get === format)
 
       case r: MetastoreRelation =>
         if (isDataSourceParquet) {

http://git-wip-us.apache.org/repos/asf/spark/blob/b2074b66/sql/hive/src/test/scala/org/apache/spark/sql/sources/SimpleTextRelation.scala
----------------------------------------------------------------------
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/sources/SimpleTextRelation.scala b/sql/hive/src/test/scala/org/apache/spark/sql/sources/SimpleTextRelation.scala
index 67a58a3..906de6b 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/sources/SimpleTextRelation.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/sources/SimpleTextRelation.scala
@@ -29,7 +29,6 @@ import org.apache.spark.sql.{sources, Row, SparkSession}
 import org.apache.spark.sql.catalyst.{expressions, InternalRow}
 import org.apache.spark.sql.catalyst.expressions.{Cast, Expression, GenericInternalRow, InterpretedPredicate, InterpretedProjection, JoinedRow, Literal}
 import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection
-import org.apache.spark.sql.execution.command.CreateDataSourceTableUtils._
 import org.apache.spark.sql.execution.datasources._
 import org.apache.spark.sql.types.{DataType, StructType}
 import org.apache.spark.util.SerializableConfiguration
@@ -145,7 +144,7 @@ class AppendingTextOutputFormat(outputFile: Path) extends TextOutputFormat[NullW
   override def getDefaultWorkFile(context: TaskAttemptContext, extension: String): Path = {
     val configuration = context.getConfiguration
-    val uniqueWriteJobId = configuration.get(DATASOURCE_WRITEJOBUUID)
+    val uniqueWriteJobId = configuration.get(WriterContainer.DATASOURCE_WRITEJOBUUID)
     val taskAttemptId = context.getTaskAttemptID
     val split = taskAttemptId.getTaskID.getId
     val name = FileOutputFormat.getOutputName(context)

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org