Github user davies commented on a diff in the pull request: https://github.com/apache/spark/pull/14500#discussion_r74132132 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala --- @@ -425,6 +430,111 @@ case class AlterTableDropPartitionCommand( } +/** + * Recover Partitions in ALTER TABLE: recover all the partition in the directory of a table and + * update the catalog. + * + * The syntax of this command is: + * {{{ + * ALTER TABLE table RECOVER PARTITIONS; + * MSCK REPAIR TABLE table; + * }}} + */ +case class AlterTableRecoverPartitionsCommand( + tableName: TableIdentifier, + cmd: String = "ALTER TABLE RECOVER PARTITIONS") extends RunnableCommand { + override def run(spark: SparkSession): Seq[Row] = { + val catalog = spark.sessionState.catalog + if (!catalog.tableExists(tableName)) { + throw new AnalysisException(s"Table $tableName in $cmd does not exist.") + } + val table = catalog.getTableMetadata(tableName) + if (catalog.isTemporaryTable(tableName)) { + throw new AnalysisException( + s"Operation not allowed: $cmd on temporary tables: $tableName") + } + if (DDLUtils.isDatasourceTable(table)) { + throw new AnalysisException( + s"Operation not allowed: $cmd on datasource tables: $tableName") + } + if (table.tableType != CatalogTableType.EXTERNAL) { + throw new AnalysisException( + s"Operation not allowed: $cmd only works on external tables: $tableName") + } + if (!DDLUtils.isTablePartitioned(table)) { + throw new AnalysisException( + s"Operation not allowed: $cmd only works on partitioned tables: $tableName") + } + if (table.storage.locationUri.isEmpty) { + throw new AnalysisException( + s"Operation not allowed: $cmd only works on table with location provided: $tableName") + } + + val root = new Path(table.storage.locationUri.get) + val fs = root.getFileSystem(spark.sparkContext.hadoopConfiguration) + // Dummy jobconf to get to the pathFilter defined in configuration + // It's very expensive to create a JobConf(ClassUtil.findContainingJar() is slow) + val jobConf = new JobConf(spark.sparkContext.hadoopConfiguration, this.getClass) + val pathFilter = FileInputFormat.getInputPathFilter(jobConf) + val partitionSpecsAndLocs = scanPartitions( + spark, fs, pathFilter, root, Map(), table.partitionColumnNames.map(_.toLowerCase)) + val parts = partitionSpecsAndLocs.map { case (spec, location) => + // inherit table storage format (possibly except for location) + CatalogTablePartition(spec, table.storage.copy(locationUri = Some(location.toUri.toString))) + } + spark.sessionState.catalog.createPartitions(tableName, + parts.toArray[CatalogTablePartition], ignoreIfExists = true) --- End diff -- Good question, see the implementation in HiveShim: ``` // Follows exactly the same logic of DDLTask.createPartitions in Hive 0.12 override def createPartitions( hive: Hive, database: String, tableName: String, parts: Seq[CatalogTablePartition], ignoreIfExists: Boolean): Unit = { val table = hive.getTable(database, tableName) parts.foreach { s => val location = s.storage.locationUri.map(new Path(table.getPath, _)).orNull val spec = s.spec.asJava if (hive.getPartition(table, spec, false) != null && ignoreIfExists) { // Ignore this partition since it already exists and ignoreIfExists == true } else { if (location == null && table.isView()) { throw new HiveException("LOCATION clause illegal for view partition"); } createPartitionMethod.invoke( hive, table, spec, location, null, // partParams null, // inputFormat null, // outputFormat -1: JInteger, // numBuckets null, // cols null, // serializationLib null, // serdeParams null, // bucketCols null) // sortCols } } } ``` All these partitions will be insert into Hive in sequential way, so group them as batches will not help here.
--- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org