Github user cloud-fan commented on a diff in the pull request: https://github.com/apache/spark/pull/14155#discussion_r75267415 --- Diff: sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala --- @@ -144,16 +163,172 @@ private[spark] class HiveExternalCatalog(client: HiveClient, hadoopConf: Configu assert(tableDefinition.identifier.database.isDefined) val db = tableDefinition.identifier.database.get requireDbExists(db) + verifyTableProperties(tableDefinition) + // We can't create index table currently. + assert(tableDefinition.tableType != INDEX) + // All tables except view must have a provider. + assert(tableDefinition.tableType == VIEW || tableDefinition.provider.isDefined) + + // For view or Hive serde tables, they are guaranteed to be Hive compatible and we save them + // to Hive metastore directly. Otherwise, we need to put table metadata to table properties to + // work around some hive metastore problems, e.g. not case-preserving, bad decimal type support. + if (tableDefinition.provider == Some("hive") || tableDefinition.tableType == VIEW) { + client.createTable(tableDefinition, ignoreIfExists) + } else { + // Before saving data source table metadata into Hive metastore, we should: + // 1. Put table schema, partition column names and bucket specification in table properties. + // 2. Check if this table is hive compatible + // 2.1 If it's not hive compatible, set schema, partition columns and bucket spec to empty + // and save table metadata to Hive. + // 2.1 If it's hive compatible, set serde information in table metadata and try to save + // it to Hive. If it fails, treat it as not hive compatible and go back to 2.1 + + val tableProperties = tableMetadataToProperties(tableDefinition) + + // converts the table metadata to Spark SQL specific format, i.e. set schema, partition column + // names and bucket specification to empty. 
+ def newSparkSQLSpecificMetastoreTable(): CatalogTable = { + tableDefinition.copy( + schema = new StructType, + partitionColumnNames = Nil, + bucketSpec = None, + properties = tableDefinition.properties ++ tableProperties) + } + + // converts the table metadata to Hive compatible format, i.e. set the serde information. + def newHiveCompatibleMetastoreTable(serde: HiveSerDe, path: String): CatalogTable = { + tableDefinition.copy( + storage = tableDefinition.storage.copy( + locationUri = Some(new Path(path).toUri.toString), + inputFormat = serde.inputFormat, + outputFormat = serde.outputFormat, + serde = serde.serde + ), + properties = tableDefinition.properties ++ tableProperties) + } + + val qualifiedTableName = tableDefinition.identifier.quotedString + val maybeSerde = HiveSerDe.sourceToSerDe(tableDefinition.provider.get) + val maybePath = new CaseInsensitiveMap(tableDefinition.storage.properties).get("path") + val skipHiveMetadata = tableDefinition.storage.properties + .getOrElse("skipHiveMetadata", "false").toBoolean + + val (hiveCompatibleTable, logMessage) = (maybeSerde, maybePath) match { + case _ if skipHiveMetadata => + val message = + s"Persisting data source table $qualifiedTableName into Hive metastore in" + + "Spark SQL specific format, which is NOT compatible with Hive." + (None, message) + + // our bucketing is un-compatible with hive(different hash function) + case _ if tableDefinition.bucketSpec.nonEmpty => + val message = + s"Persisting bucketed data source table $qualifiedTableName into " + + "Hive metastore in Spark SQL specific format, which is NOT compatible with Hive. " + (None, message) + + case (Some(serde), Some(path)) => + val message = + s"Persisting data source table $qualifiedTableName with a single input path " + + s"into Hive metastore in Hive compatible format." 
+ (Some(newHiveCompatibleMetastoreTable(serde, path)), message) + + case (Some(_), None) => --- End diff -- This branch is not reachable: it would mean the relation is file-based but the path is not set, which should never happen. It was copied from the previous code, so we can leave it in for safety and clean it up after we consolidate path/locationUri for data source tables and Hive serde tables.
--- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org