Github user gatorsmile commented on a diff in the pull request:

    https://github.com/apache/spark/pull/14155#discussion_r75565740

    --- Diff: sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala ---
    @@ -144,16 +164,162 @@ private[spark] class HiveExternalCatalog(client: HiveClient, hadoopConf: Configu
         assert(tableDefinition.identifier.database.isDefined)
         val db = tableDefinition.identifier.database.get
         requireDbExists(db)
    +    verifyTableProperties(tableDefinition)
    +
    +    // Before saving data source table metadata into the Hive metastore, we should:
    +    // 1. Put the table schema, partition column names and bucket specification in table
    +    //    properties.
    +    // 2. Check whether this table is Hive compatible.
    +    //   2.1 If it's not Hive compatible, set schema, partition columns and bucket spec to
    +    //       empty and save the table metadata to Hive.
    +    //   2.2 If it's Hive compatible, set the serde information in the table metadata and try
    +    //       to save it to Hive. If that fails, treat it as not Hive compatible and go back
    +    //       to 2.1.
    +    if (DDLUtils.isDatasourceTable(tableDefinition)) {
    +      // A data source table always has a provider; this is guaranteed by
    +      // `DDLUtils.isDatasourceTable`.
    +      val provider = tableDefinition.provider.get
    +      val partitionColumns = tableDefinition.partitionColumnNames
    +      val bucketSpec = tableDefinition.bucketSpec
    +
    +      val tableProperties = new scala.collection.mutable.HashMap[String, String]
    +      tableProperties.put(DATASOURCE_PROVIDER, provider)
    +
    +      // The serialized JSON schema string may be too long to be stored in a single metastore
    +      // table property. In this case, we split the JSON string and store each part as a
    +      // separate table property.
    +      // TODO: the threshold should be set by `spark.sql.sources.schemaStringLengthThreshold`,
    +      // but the current SQLConf is session isolated, which is not applicable to the external
    +      // catalog. We should re-enable this conf instead of hard-coding the value here, once we
    +      // have a global SQLConf.
    +      val threshold = 4000
    +      val schemaJsonString = tableDefinition.schema.json
    +      // Split the JSON string.
    +      val parts = schemaJsonString.grouped(threshold).toSeq
    +      tableProperties.put(DATASOURCE_SCHEMA_NUMPARTS, parts.size.toString)
    +      parts.zipWithIndex.foreach { case (part, index) =>
    +        tableProperties.put(s"$DATASOURCE_SCHEMA_PART_PREFIX$index", part)
    +      }
    +
    +      if (partitionColumns.nonEmpty) {
    +        tableProperties.put(DATASOURCE_SCHEMA_NUMPARTCOLS, partitionColumns.length.toString)
    +        partitionColumns.zipWithIndex.foreach { case (partCol, index) =>
    +          tableProperties.put(s"$DATASOURCE_SCHEMA_PARTCOL_PREFIX$index", partCol)
    +        }
    +      }
    +
    +      if (bucketSpec.isDefined) {
    +        val BucketSpec(numBuckets, bucketColumnNames, sortColumnNames) = bucketSpec.get
    +
    +        tableProperties.put(DATASOURCE_SCHEMA_NUMBUCKETS, numBuckets.toString)
    +        tableProperties.put(DATASOURCE_SCHEMA_NUMBUCKETCOLS, bucketColumnNames.length.toString)
    +        bucketColumnNames.zipWithIndex.foreach { case (bucketCol, index) =>
    +          tableProperties.put(s"$DATASOURCE_SCHEMA_BUCKETCOL_PREFIX$index", bucketCol)
    +        }
    +
    +        if (sortColumnNames.nonEmpty) {
    +          tableProperties.put(DATASOURCE_SCHEMA_NUMSORTCOLS, sortColumnNames.length.toString)
    +          sortColumnNames.zipWithIndex.foreach { case (sortCol, index) =>
    +            tableProperties.put(s"$DATASOURCE_SCHEMA_SORTCOL_PREFIX$index", sortCol)
    +          }
    +        }
    +      }
    +
    +      // Converts the table metadata to Spark SQL specific format, i.e. sets schema, partition
    +      // column names and bucket specification to empty.
    +      def newSparkSQLSpecificMetastoreTable(): CatalogTable = {
    +        tableDefinition.copy(
    +          schema = new StructType,
    +          partitionColumnNames = Nil,
    +          bucketSpec = None,
    +          properties = tableDefinition.properties ++ tableProperties)
    +      }
    +
    +      // Converts the table metadata to Hive compatible format, i.e. sets the serde information.
    +      def newHiveCompatibleMetastoreTable(serde: HiveSerDe, path: String): CatalogTable = {
    +        tableDefinition.copy(
    +          storage = tableDefinition.storage.copy(
    +            locationUri = Some(new Path(path).toUri.toString),
    +            inputFormat = serde.inputFormat,
    +            outputFormat = serde.outputFormat,
    +            serde = serde.serde
    +          ),
    +          properties = tableDefinition.properties ++ tableProperties)
    +      }
    +
    +      val qualifiedTableName = tableDefinition.identifier.quotedString
    +      val maybeSerde = HiveSerDe.sourceToSerDe(tableDefinition.provider.get)
    +      val maybePath = new CaseInsensitiveMap(tableDefinition.storage.properties).get("path")
    +      val skipHiveMetadata = tableDefinition.storage.properties
    +        .getOrElse("skipHiveMetadata", "false").toBoolean
    +
    +      val (hiveCompatibleTable, logMessage) = (maybeSerde, maybePath) match {
    +        case _ if skipHiveMetadata =>
    +          val message =
    +            s"Persisting data source table $qualifiedTableName into Hive metastore in " +
    +              "Spark SQL specific format, which is NOT compatible with Hive."
    +          (None, message)
    +
    +        // Our bucketing is not compatible with Hive's (different hash function).
    +        case _ if tableDefinition.bucketSpec.nonEmpty =>
    +          val message =
    +            s"Persisting bucketed data source table $qualifiedTableName into " +
    +              "Hive metastore in Spark SQL specific format, which is NOT compatible with Hive."
    +          (None, message)
    +
    +        case (Some(serde), Some(path)) =>
    +          val message =
    +            s"Persisting data source table $qualifiedTableName with a single input path " +
    --- End diff --

    It sounds like it is impossible to provide multiple input paths in the write path of data source tables. Maybe we can update the message here?
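    For context, since the diff is cut off before the remaining cases of the match, here is a condensed, self-contained toy of the compatibility decision as I read it, under the assumption that only a known serde plus a single input path yields a Hive-compatible layout. `sourceToSerDe` below is a stand-in for `HiveSerDe.sourceToSerDe`, not the real mapping:

        // Toy version of the Hive-compatibility decision; `sourceToSerDe` is a
        // stand-in for HiveSerDe.sourceToSerDe and covers only two providers.
        def sourceToSerDe(provider: String): Option[String] =
          provider.toLowerCase match {
            case "parquet" => Some("parquet serde")
            case "orc"     => Some("orc serde")
            case _         => None
          }

        def hiveCompatible(
            provider: String,
            path: Option[String],
            bucketed: Boolean,
            skipHiveMetadata: Boolean): Boolean =
          (sourceToSerDe(provider), path) match {
            case _ if skipHiveMetadata => false  // user explicitly opted out
            case _ if bucketed         => false  // Spark's bucketing hash differs from Hive's
            case (Some(_), Some(_))    => true   // known serde and a single path: try Hive format
            case _                     => false  // unknown serde, or no path to point Hive at
          }

        assert( hiveCompatible("parquet", Some("/tmp/t"), bucketed = false, skipHiveMetadata = false))
        assert(!hiveCompatible("json",    Some("/tmp/t"), bucketed = false, skipHiveMetadata = false))
        assert(!hiveCompatible("parquet", None,           bucketed = false, skipHiveMetadata = false))
        assert(!hiveCompatible("parquet", Some("/tmp/t"), bucketed = true,  skipHiveMetadata = false))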
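    Separately, the schema-serialization step earlier in the diff can be illustrated with a minimal, runnable sketch of the split/reassemble round trip. The property keys here are hypothetical stand-ins for the `DATASOURCE_SCHEMA_*` constants:

        import scala.collection.mutable

        val threshold = 4000  // hard-coded, as in the patch above

        // Split a (possibly very long) schema JSON string across numbered properties.
        def putSchemaJson(json: String, props: mutable.Map[String, String]): Unit = {
          val parts = json.grouped(threshold).toSeq
          props.put("schema.numParts", parts.size.toString)
          parts.zipWithIndex.foreach { case (part, index) =>
            props.put(s"schema.part.$index", part)
          }
        }

        // Reassemble the parts in index order; a missing part means corrupted metadata.
        def getSchemaJson(props: collection.Map[String, String]): String = {
          val numParts = props("schema.numParts").toInt
          (0 until numParts).map(i => props(s"schema.part.$i")).mkString
        }

        val props = mutable.HashMap.empty[String, String]
        val json = "x" * 10000  // stands in for tableDefinition.schema.json
        putSchemaJson(json, props)
        assert(getSchemaJson(props) == json)  // round trip: 3 parts of <= 4000 chars each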