chenjunjiedada commented on a change in pull request #1497:
URL: https://github.com/apache/iceberg/pull/1497#discussion_r496455338



##########
File path: 
spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReaderDeletes.java
##########
@@ -20,285 +20,111 @@
 package org.apache.iceberg.spark.source;
 
 import java.io.IOException;
-import java.util.List;
-import java.util.Set;
+import org.apache.hadoop.hive.conf.HiveConf;
 import org.apache.iceberg.BaseTable;
-import org.apache.iceberg.DataFile;
-import org.apache.iceberg.DeleteFile;
-import org.apache.iceberg.Files;
+import org.apache.iceberg.PartitionSpec;
 import org.apache.iceberg.Schema;
 import org.apache.iceberg.Table;
 import org.apache.iceberg.TableMetadata;
 import org.apache.iceberg.TableOperations;
-import org.apache.iceberg.TestHelpers.Row;
+import org.apache.iceberg.catalog.Namespace;
 import org.apache.iceberg.catalog.TableIdentifier;
-import org.apache.iceberg.data.FileHelpers;
-import org.apache.iceberg.data.GenericRecord;
-import org.apache.iceberg.data.Record;
-import org.apache.iceberg.relocated.com.google.common.collect.Lists;
-import org.apache.iceberg.relocated.com.google.common.collect.Sets;
+import org.apache.iceberg.data.DeletesReadTest;
+import org.apache.iceberg.exceptions.AlreadyExistsException;
+import org.apache.iceberg.hive.HiveCatalog;
+import org.apache.iceberg.hive.TestHiveMetastore;
 import org.apache.iceberg.spark.SparkStructLike;
-import org.apache.iceberg.spark.SparkTestBase;
 import org.apache.iceberg.types.Types;
-import org.apache.iceberg.util.ArrayUtil;
-import org.apache.iceberg.util.Pair;
 import org.apache.iceberg.util.StructLikeSet;
-import org.apache.iceberg.util.StructProjection;
 import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.SparkSession;
+import org.apache.spark.sql.internal.SQLConf;
 import org.junit.After;
-import org.junit.Assert;
+import org.junit.AfterClass;
 import org.junit.Before;
-import org.junit.Rule;
-import org.junit.Test;
-import org.junit.rules.TemporaryFolder;
+import org.junit.BeforeClass;
 
-public abstract class TestSparkReaderDeletes extends SparkTestBase {
-  private static final Schema SCHEMA = new Schema(
-      Types.NestedField.required(1, "id", Types.IntegerType.get()),
-      Types.NestedField.required(2, "data", Types.StringType.get()));
-  private Table table = null;
-  private List<Record> records = null;
-  private DataFile dataFile = null;
+import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.METASTOREURIS;
 
-  @Rule
-  public TemporaryFolder temp = new TemporaryFolder();
+public abstract class TestSparkReaderDeletes extends DeletesReadTest {
 
-  @Before
-  public void createTable() throws IOException {
-    this.table = catalog.createTable(TableIdentifier.of("default", "table"), 
SCHEMA);
-    TableOperations ops = ((BaseTable) table).operations();
-    TableMetadata meta = ops.current();
-    ops.commit(meta, meta.upgradeToFormatVersion(2));
-
-    this.records = Lists.newArrayList();
+  private static TestHiveMetastore metastore = null;
+  protected static SparkSession spark = null;
+  protected static HiveCatalog catalog = null;
 
-    // records all use IDs that are in bucket id_bucket=0
-    GenericRecord record = GenericRecord.create(table.schema());
-    records.add(record.copy("id", 29, "data", "a"));
-    records.add(record.copy("id", 43, "data", "b"));
-    records.add(record.copy("id", 61, "data", "c"));
-    records.add(record.copy("id", 89, "data", "d"));
-    records.add(record.copy("id", 100, "data", "e"));
-    records.add(record.copy("id", 121, "data", "f"));
-    records.add(record.copy("id", 122, "data", "g"));
+  @BeforeClass
+  public static void startMetastoreAndSpark() {
+    metastore = new TestHiveMetastore();
+    metastore.start();
+    HiveConf hiveConf = metastore.hiveConf();
 
-    this.dataFile = FileHelpers.writeDataFile(table, 
Files.localOutput(temp.newFile()), Row.of(0), records);
+    spark = SparkSession.builder()
+        .master("local[2]")
+        .config(SQLConf.PARTITION_OVERWRITE_MODE().key(), "dynamic")
+        .config("spark.hadoop." + METASTOREURIS.varname, 
hiveConf.get(METASTOREURIS.varname))
+        .enableHiveSupport()
+        .getOrCreate();
 
-    table.newAppend()
-        .appendFile(dataFile)
-        .commit();
-  }
+    catalog = new HiveCatalog(spark.sessionState().newHadoopConf());
 
-  @After
-  public void dropTable() {
-    catalog.dropTable(TableIdentifier.of("default", "table"));
+    try {
+      catalog.createNamespace(Namespace.of("default"));
+    } catch (AlreadyExistsException ignored) {
+      // the default namespace already exists. ignore the create error
+    }
   }
 
-  @Test
-  public void testEqualityDeletes() throws IOException {
-    Schema deleteRowSchema = table.schema().select("data");
-    Record dataDelete = GenericRecord.create(deleteRowSchema);
-    List<Record> dataDeletes = Lists.newArrayList(
-        dataDelete.copy("data", "a"), // id = 29
-        dataDelete.copy("data", "d"), // id = 89
-        dataDelete.copy("data", "g") // id = 122
-    );
-
-    DeleteFile eqDeletes = FileHelpers.writeDeleteFile(
-        table, Files.localOutput(temp.newFile()), Row.of(0), dataDeletes, 
deleteRowSchema);
-
-    table.newRowDelta()
-        .addDeletes(eqDeletes)
-        .commit();
-
-    StructLikeSet expected = rowSetWithoutIds(29, 89, 122);
-    StructLikeSet actual = rowSet(table);
-
-    Assert.assertEquals("Table should contain expected rows", expected, 
actual);
+  @AfterClass
+  public static void stopMetastoreAndSpark() {
+    catalog.close();
+    catalog = null;
+    metastore.stop();
+    metastore = null;
+    spark.stop();
+    spark = null;
   }
 
-  @Test
-  public void testEqualityDeletesWithRequiredEqColumn() throws IOException {
-    Schema deleteRowSchema = table.schema().select("data");
-    Record dataDelete = GenericRecord.create(deleteRowSchema);
-    List<Record> dataDeletes = Lists.newArrayList(
-        dataDelete.copy("data", "a"), // id = 29
-        dataDelete.copy("data", "d"), // id = 89
-        dataDelete.copy("data", "g") // id = 122
-    );
-
-    DeleteFile eqDeletes = FileHelpers.writeDeleteFile(
-        table, Files.localOutput(temp.newFile()), Row.of(0), dataDeletes, 
deleteRowSchema);
-
-    table.newRowDelta()
-        .addDeletes(eqDeletes)
-        .commit();
-
-    StructLikeSet expected = selectColumns(rowSetWithoutIds(29, 89, 122), 
"id");
-    StructLikeSet actual = rowSet(table, "id"); // data is added by the reader 
to apply the eq deletes
-
-    Assert.assertEquals("Table should contain expected rows", expected, 
actual);
-  }
-
-  @Test
-  public void testPositionDeletes() throws IOException {
-    List<Pair<CharSequence, Long>> deletes = Lists.newArrayList(
-        Pair.of(dataFile.path(), 0L), // id = 29
-        Pair.of(dataFile.path(), 3L), // id = 89
-        Pair.of(dataFile.path(), 6L) // id = 122
-    );
-
-    DeleteFile posDeletes = FileHelpers.writeDeleteFile(
-        table, Files.localOutput(temp.newFile()), Row.of(0), deletes);
-
-    table.newRowDelta()
-        .addDeletes(posDeletes)
-        .commit();
-
-    StructLikeSet expected = rowSetWithoutIds(29, 89, 122);
-    StructLikeSet actual = rowSet(table);
-
-    Assert.assertEquals("Table should contain expected rows", expected, 
actual);
-  }
-
-  @Test
-  public void testMixedPositionAndEqualityDeletes() throws IOException {
-    Schema dataSchema = table.schema().select("data");
-    Record dataDelete = GenericRecord.create(dataSchema);
-    List<Record> dataDeletes = Lists.newArrayList(
-        dataDelete.copy("data", "a"), // id = 29
-        dataDelete.copy("data", "d"), // id = 89
-        dataDelete.copy("data", "g") // id = 122
-    );
-
-    DeleteFile eqDeletes = FileHelpers.writeDeleteFile(
-        table, Files.localOutput(temp.newFile()), Row.of(0), dataDeletes, 
dataSchema);
-
-    List<Pair<CharSequence, Long>> deletes = Lists.newArrayList(
-        Pair.of(dataFile.path(), 3L), // id = 89
-        Pair.of(dataFile.path(), 5L) // id = 121
-    );
+  @Before
+  public void prepareData() throws IOException {
+    this.table = createTable("table", SCHEMA, SPEC);

Review comment:
       Makes sense — updated.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]



---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to