Github user gatorsmile commented on a diff in the pull request:

    https://github.com/apache/spark/pull/14155#discussion_r75565740

    --- Diff: sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala ---
    @@ -144,16 +164,162 @@ private[spark] class HiveExternalCatalog(client: HiveClient, hadoopConf: Configu
         assert(tableDefinition.identifier.database.isDefined)
         val db = tableDefinition.identifier.database.get
         requireDbExists(db)
    +    verifyTableProperties(tableDefinition)
    +
    +    // Before saving data source table metadata into the Hive metastore, we should:
    +    // 1. Put the table schema, partition column names and bucket specification in table
    +    //    properties.
    +    // 2. Check whether this table is Hive compatible.
    +    //   2.1 If it's not Hive compatible, set schema, partition columns and bucket spec to
    +    //       empty and save the table metadata to Hive.
    +    //   2.2 If it's Hive compatible, set the serde information in the table metadata and try
    +    //       to save it to Hive. If that fails, treat it as not Hive compatible and go back
    +    //       to 2.1.
    +    if (DDLUtils.isDatasourceTable(tableDefinition)) {
    +      // A data source table always has a provider; this is guaranteed by
    +      // `DDLUtils.isDatasourceTable`.
    +      val provider = tableDefinition.provider.get
    +      val partitionColumns = tableDefinition.partitionColumnNames
    +      val bucketSpec = tableDefinition.bucketSpec
    +
    +      val tableProperties = new scala.collection.mutable.HashMap[String, String]
    +      tableProperties.put(DATASOURCE_PROVIDER, provider)
    +
    +      // The serialized JSON schema string may be too long to be stored in a single metastore
    +      // table property. In this case, we split the JSON string and store each part as a
    +      // separate table property.
    +      // TODO: the threshold should be set by `spark.sql.sources.schemaStringLengthThreshold`,
    +      // but the current SQLConf is session isolated, which is not applicable to the external
    +      // catalog. We should re-enable this conf instead of hard-coding the value here, once we
    +      // have a global SQLConf.
    +      val threshold = 4000
    +      val schemaJsonString = tableDefinition.schema.json
    +      // Split the JSON string.
    +      val parts = schemaJsonString.grouped(threshold).toSeq
    +      tableProperties.put(DATASOURCE_SCHEMA_NUMPARTS, parts.size.toString)
    +      parts.zipWithIndex.foreach { case (part, index) =>
    +        tableProperties.put(s"$DATASOURCE_SCHEMA_PART_PREFIX$index", part)
    +      }
    +
    +      if (partitionColumns.nonEmpty) {
    +        tableProperties.put(DATASOURCE_SCHEMA_NUMPARTCOLS, partitionColumns.length.toString)
    +        partitionColumns.zipWithIndex.foreach { case (partCol, index) =>
    +          tableProperties.put(s"$DATASOURCE_SCHEMA_PARTCOL_PREFIX$index", partCol)
    +        }
    +      }
    +
    +      if (bucketSpec.isDefined) {
    +        val BucketSpec(numBuckets, bucketColumnNames, sortColumnNames) = bucketSpec.get
    +
    +        tableProperties.put(DATASOURCE_SCHEMA_NUMBUCKETS, numBuckets.toString)
    +        tableProperties.put(DATASOURCE_SCHEMA_NUMBUCKETCOLS, bucketColumnNames.length.toString)
    +        bucketColumnNames.zipWithIndex.foreach { case (bucketCol, index) =>
    +          tableProperties.put(s"$DATASOURCE_SCHEMA_BUCKETCOL_PREFIX$index", bucketCol)
    +        }
    +
    +        if (sortColumnNames.nonEmpty) {
    +          tableProperties.put(DATASOURCE_SCHEMA_NUMSORTCOLS, sortColumnNames.length.toString)
    +          sortColumnNames.zipWithIndex.foreach { case (sortCol, index) =>
    +            tableProperties.put(s"$DATASOURCE_SCHEMA_SORTCOL_PREFIX$index", sortCol)
    +          }
    +        }
    +      }
    +
    +      // Converts the table metadata to Spark SQL specific format, i.e. sets schema, partition
    +      // column names and bucket specification to empty.
    +      def newSparkSQLSpecificMetastoreTable(): CatalogTable = {
    +        tableDefinition.copy(
    +          schema = new StructType,
    +          partitionColumnNames = Nil,
    +          bucketSpec = None,
    +          properties = tableDefinition.properties ++ tableProperties)
    +      }
    +
    +      // Converts the table metadata to Hive compatible format, i.e. sets the serde information.
    +      def newHiveCompatibleMetastoreTable(serde: HiveSerDe, path: String): CatalogTable = {
    +        tableDefinition.copy(
    +          storage = tableDefinition.storage.copy(
    +            locationUri = Some(new Path(path).toUri.toString),
    +            inputFormat = serde.inputFormat,
    +            outputFormat = serde.outputFormat,
    +            serde = serde.serde
    +          ),
    +          properties = tableDefinition.properties ++ tableProperties)
    +      }
    +
    +      val qualifiedTableName = tableDefinition.identifier.quotedString
    +      val maybeSerde = HiveSerDe.sourceToSerDe(tableDefinition.provider.get)
    +      val maybePath = new CaseInsensitiveMap(tableDefinition.storage.properties).get("path")
    +      val skipHiveMetadata = tableDefinition.storage.properties
    +        .getOrElse("skipHiveMetadata", "false").toBoolean
    +
    +      val (hiveCompatibleTable, logMessage) = (maybeSerde, maybePath) match {
    +        case _ if skipHiveMetadata =>
    +          val message =
    +            s"Persisting data source table $qualifiedTableName into Hive metastore in " +
    +              "Spark SQL specific format, which is NOT compatible with Hive."
    +          (None, message)
    +
    +        // Our bucketing is not compatible with Hive's (different hash function).
    +        case _ if tableDefinition.bucketSpec.nonEmpty =>
    +          val message =
    +            s"Persisting bucketed data source table $qualifiedTableName into " +
    +              "Hive metastore in Spark SQL specific format, which is NOT compatible with Hive."
    +          (None, message)
    +
    +        case (Some(serde), Some(path)) =>
    +          val message =
    +            s"Persisting data source table $qualifiedTableName with a single input path " +
    --- End diff --

    It sounds like it is impossible to provide multiple input paths in the write path of data source tables. Maybe we can update the message here?
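    For context, since the diff is cut off before the remaining cases of the match, here is a condensed, self-contained toy of the compatibility decision as I read it, under the assumption that only a known serde plus a single input path yields a Hive-compatible layout. `sourceToSerDe` below is a stand-in for `HiveSerDe.sourceToSerDe`, not the real mapping:

        // Toy version of the Hive-compatibility decision; `sourceToSerDe` is a
        // stand-in for HiveSerDe.sourceToSerDe and covers only two providers.
        def sourceToSerDe(provider: String): Option[String] =
          provider.toLowerCase match {
            case "parquet" => Some("parquet serde")
            case "orc"     => Some("orc serde")
            case _         => None
          }

        def hiveCompatible(
            provider: String,
            path: Option[String],
            bucketed: Boolean,
            skipHiveMetadata: Boolean): Boolean =
          (sourceToSerDe(provider), path) match {
            case _ if skipHiveMetadata => false  // user explicitly opted out
            case _ if bucketed         => false  // Spark's bucketing hash differs from Hive's
            case (Some(_), Some(_))    => true   // known serde and a single path: try Hive format
            case _                     => false  // unknown serde, or no path to point Hive at
          }

        assert( hiveCompatible("parquet", Some("/tmp/t"), bucketed = false, skipHiveMetadata = false))
        assert(!hiveCompatible("json",    Some("/tmp/t"), bucketed = false, skipHiveMetadata = false))
        assert(!hiveCompatible("parquet", None,           bucketed = false, skipHiveMetadata = false))
        assert(!hiveCompatible("parquet", Some("/tmp/t"), bucketed = true,  skipHiveMetadata = false))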
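    Separately, the schema-serialization step earlier in the diff can be illustrated with a minimal, runnable sketch of the split/reassemble round trip. The property keys here are hypothetical stand-ins for the `DATASOURCE_SCHEMA_*` constants:

        import scala.collection.mutable

        val threshold = 4000  // hard-coded, as in the patch above

        // Split a (possibly very long) schema JSON string across numbered properties.
        def putSchemaJson(json: String, props: mutable.Map[String, String]): Unit = {
          val parts = json.grouped(threshold).toSeq
          props.put("schema.numParts", parts.size.toString)
          parts.zipWithIndex.foreach { case (part, index) =>
            props.put(s"schema.part.$index", part)
          }
        }

        // Reassemble the parts in index order; a missing part means corrupted metadata.
        def getSchemaJson(props: collection.Map[String, String]): String = {
          val numParts = props("schema.numParts").toInt
          (0 until numParts).map(i => props(s"schema.part.$i")).mkString
        }

        val props = mutable.HashMap.empty[String, String]
        val json = "x" * 10000  // stands in for tableDefinition.schema.json
        putSchemaJson(json, props)
        assert(getSchemaJson(props) == json)  // round trip: 3 parts of <= 4000 chars each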