[ https://issues.apache.org/jira/browse/SPARK-46617?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
ASF GitHub Bot updated SPARK-46617: ----------------------------------- Labels: pull-request-available (was: ) > Create-table-if-not-exists overwrites tables > -------------------------------------------- > > Key: SPARK-46617 > URL: https://issues.apache.org/jira/browse/SPARK-46617 > Project: Spark > Issue Type: Bug > Components: SQL > Affects Versions: 3.5.0 > Reporter: Adrian Stratulat > Priority: Minor > Labels: pull-request-available > > h2. Description: > *Encountered behavior:* Using "CREATE TABLE IF NOT EXISTS" in a spark > session, on a table that already exists, while Hive is not enabled, results > in the table being {*}silently overwritten{*}. > *Expected behavior:* "CREATE TABLE IF NOT EXISTS" on a table that already > exists results in the statement being skipped. See test-matrix below. > h2. Test matrix & test scenarios: > {noformat} > +----------------------------+-------------------------------------+ > | sql statement | Behavior when overwriting a table | > | +---------------+---------------------+ > | | hive-enabled | hive-disabled | > +----------------------------+---------------+---------------------+ > | create-table | exception (1) | exception (2) | > | create-table-if-not-exists | skip (3) | OVERWRITE *BUG* (4) | > +----------------------------+---------------+---------------------+ > {noformat} > h3. 
(1) Create Table - Hive > > {code:java} > // works_as_expected_hive.sc > import $ivy.`org.apache.spark::spark-core:3.5.0` > import $ivy.`org.apache.spark::spark-sql:3.5.0` > import $ivy.`org.apache.spark::spark-hive:3.5.0` > import org.apache.spark.sql.SparkSession > @main > def main() : Unit = { > val spark = SparkSession > .builder() > .appName("Spark SQL basic example") > .master("local[*]") > .enableHiveSupport() > .getOrCreate() > spark.sql(""" > CREATE TABLE hello > USING csv > OPTIONS (header=true) > LOCATION 'store/' > AS (select 1 as col) > """) > } {code} > > {noformat} > # Clean up the warehouse > > rm -rf spark-warehouse metastore_db > # Run the CTAS & check the result. Works ok. > > amm works_as_expected_hive.sc > > ls spark-warehouse/store/part* > spark-warehouse/store/part-00000-78bfa8ca-b36c-45d7-b475-eae8a4dbda06-c000.csv > > cat spark-warehouse/store/part* > col > 1 > # Run the CTAS again > # Exception: Table cannot be overwritten > > amm works_as_expected_hive.sc > org.apache.spark.sql.catalyst.analysis.TableAlreadyExistsException: > [TABLE_OR_VIEW_ALREADY_EXISTS] Cannot create table or view > `spark_catalog`.`default`.`hello` because it already exists. > Choose a different name, drop or replace the existing object, or add the IF > NOT EXISTS clause to tolerate pre-existing objects. > > org.apache.spark.sql.errors.QueryCompilationErrors$.tableAlreadyExistsError(QueryCompilationErrors.scala:2514) > > org.apache.spark.sql.execution.command.CreateDataSourceTableAsSelectCommand.run(createDataSourceTables.scala:162) > > org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:75) > > org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:73) > # Check that the file was left in place > > ls spark-warehouse/store/part* > spark-warehouse/store/part-00000-78bfa8ca-b36c-45d7-b475-eae8a4dbda06-c000.csv > {noformat} > > > h2. 
(2) Create Table - No Hive > {code:java} > // works_as_expected_nohive.sc > import $ivy.`org.apache.spark::spark-core:3.5.0` > import $ivy.`org.apache.spark::spark-sql:3.5.0` > import org.apache.spark.sql.SparkSession > @main > def main() : Unit = { > val spark = SparkSession > .builder() > .appName("Spark SQL basic example") > .master("local[*]") > .getOrCreate() > spark.sql(""" > CREATE TABLE hello > USING csv > OPTIONS (header=true) > LOCATION 'store/' > AS (select 1 as col) > """) > } {code} > > {noformat} > # Clean up the warehouse > > rm -rf spark-warehouse metastore_db > # Run the CTAS & check the result. Works ok. > > amm works_as_expected_nohive.sc > > ls spark-warehouse/store/part* > spark-warehouse/store/part-00000-08802249-e654-4f22-8b07-eb453d39b482-c000.csv > > cat spark-warehouse/store/part* > col > 1 > # Run the CTAS again > # Exception: Table cannot be overwritten > > amm works_as_expected_nohive.sc > org.apache.spark.sql.AnalysisException: CREATE-TABLE-AS-SELECT cannot create > table with location to a non-empty directory > file:/home/adrian/spark-warehouse/store. To allow overwriting the existing > non-empty directory, set 'spark.sql.legacy.allowNonEmptyLocationInCTAS' to > true. 
> > org.apache.spark.sql.errors.QueryCompilationErrors$.createTableAsSelectWithNonEmptyDirectoryError(QueryCompilationErrors.scala:2522) > > org.apache.spark.sql.execution.command.DataWritingCommand$.assertEmptyRootPath(DataWritingCommand.scala:119) > > org.apache.spark.sql.execution.command.CreateDataSourceTableAsSelectCommand.$anonfun$run$6(createDataSourceTables.scala:173) > > org.apache.spark.sql.execution.command.CreateDataSourceTableAsSelectCommand.$anonfun$run$6$adapted(createDataSourceTables.scala:172) > # Check that the file was left in place > > ls spark-warehouse/store/part* > spark-warehouse/store/part-00000-08802249-e654-4f22-8b07-eb453d39b482-c000.csv > {noformat} > > {noformat} > == Parsed Logical Plan == > 'CreateTableAsSelect unresolvedtablespec(Some(csv), > optionlist((header,true)), Some(store/), None, None, false), false, false > :- 'UnresolvedIdentifier [hello], false > +- Project [2 AS col#0] > +- OneRowRelation > == Analyzed Logical Plan == > CreateDataSourceTableAsSelectCommand `spark_catalog`.`default`.`hello`, > ErrorIfExists, [col] > +- Project [2 AS col#0] > +- OneRowRelation > == Optimized Logical Plan == > CreateDataSourceTableAsSelectCommand `spark_catalog`.`default`.`hello`, > ErrorIfExists, [col] > +- Project [2 AS col#0] > +- OneRowRelation > == Physical Plan == > Execute CreateDataSourceTableAsSelectCommand > +- CreateDataSourceTableAsSelectCommand `spark_catalog`.`default`.`hello`, > ErrorIfExists, [col] > +- Project [2 AS col#0] > +- OneRowRelation > {noformat} > > > > h3. 
(3) Create Table IF NOT EXISTS - Hive > {code:java} > // works_as_expected_if_not_exists_hive.sc > import $ivy.`org.apache.spark::spark-core:3.5.0` > import $ivy.`org.apache.spark::spark-sql:3.5.0` > import $ivy.`org.apache.spark::spark-hive:3.5.0` > import org.apache.spark.sql.SparkSession > @main > def main() : Unit = { > val spark = SparkSession > .builder() > .appName("Spark SQL basic example") > .master("local[*]") > .enableHiveSupport() > .getOrCreate() > spark.sql(""" > CREATE TABLE IF NOT EXISTS hello > USING csv > OPTIONS (header=true) > LOCATION 'store/' > AS (select 1 as col) > """) > }{code} > > {noformat} > # Clean up the warehouse > > rm -rf spark-warehouse metastore_db > # Run the CTAS & check the result. Works ok. > > amm works_as_expected_if_not_exists_hive.sc > > ls spark-warehouse/store/part* > spark-warehouse/store/part-00000-e010f9d7-0e6f-4bd8-adc5-e74578d6bdb4-c000.csv > > cat spark-warehouse/store/part* > col > 1 > # Run the CTAS again > # No error, table is left in place > > amm works_as_expected_if_not_exists_hive.sc > # Check that the file was left in place > > ls spark-warehouse/store/part* > spark-warehouse/store/part-00000-e010f9d7-0e6f-4bd8-adc5-e74578d6bdb4-c000.csv > {noformat} > > > h3. 
(4) Create Table IF NOT EXISTS - No Hive > > {code:java} > //bug1.sc > import $ivy.`org.apache.spark::spark-core:3.5.0` > import $ivy.`org.apache.spark::spark-sql:3.5.0` > import org.apache.spark.sql.SparkSession > @main > def main() : Unit = { > val spark = SparkSession > .builder() > .appName("Spark SQL basic example") > .master("local[*]") > .getOrCreate() > spark.sql(""" > CREATE TABLE IF NOT EXISTS hello > USING csv > OPTIONS (header=true) > LOCATION 'store/' > AS (select 1 as col) > """) > } > {code} > {code:java} > //bug2.sc > import $ivy.`org.apache.spark::spark-core:3.5.0` > import $ivy.`org.apache.spark::spark-sql:3.5.0` > import org.apache.spark.sql.SparkSession > @main > def main() : Unit = { > val spark = SparkSession > .builder() > .appName("Spark SQL basic example") > .master("local[*]") > .getOrCreate() > spark.sql(""" > CREATE TABLE IF NOT EXISTS hello > USING csv > OPTIONS (header=true) > LOCATION 'store/' > AS (select 2 as col) -- PAYLOAD BECOMES 2 > """) > } {code} > > {noformat} > # Clean up the warehouse > > rm -rf spark-warehouse metastore_db > # Run the CTAS & check the result. > > amm bug1.sc > > ls spark-warehouse/store/part* > spark-warehouse/store/part-00000-3f81c2ea-bfa1-428d-9816-506520be028b-c000.csv > > cat spark-warehouse/store/part* > col > 1 > # Run the second CTAS. 
> # BUG - TABLE GETS SILENTLY OVERWRITTEN > > amm bug2.sc > > ls spark-warehouse/store/part* > spark-warehouse/store/part-00000-1d1d66e9-ce80-475d-aa49-c73f629219d9-c000.csv > > cat spark-warehouse/store/part* > col > 2 > {noformat} > > > {noformat} > |== Parsed Logical Plan == > 'CreateTableAsSelect unresolvedtablespec(Some(csv), > optionlist((header,true)), Some(store/), None, None, false), true, false > :- 'UnresolvedIdentifier [hello], false > +- Project [2 AS col#0] > +- OneRowRelation > == Analyzed Logical Plan == > CreateDataSourceTableAsSelectCommand `spark_catalog`.`default`.`hello`, > Ignore, [col] > +- Project [2 AS col#0] > +- OneRowRelation > == Optimized Logical Plan == > CreateDataSourceTableAsSelectCommand `spark_catalog`.`default`.`hello`, > Ignore, [col] > +- Project [2 AS col#0] > +- OneRowRelation > == Physical Plan == > Execute CreateDataSourceTableAsSelectCommand > +- CreateDataSourceTableAsSelectCommand `spark_catalog`.`default`.`hello`, > Ignore, [col] > +- Project [2 AS col#0] > +- OneRowRelation > {noformat} > -- This message was sent by Atlassian Jira (v8.20.10#820010) --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org