JingsongLi commented on code in PR #484:
URL: https://github.com/apache/flink-table-store/pull/484#discussion_r1071702317


##########
flink-table-store-core/src/main/java/org/apache/flink/table/store/CoreOptions.java:
##########
@@ -401,6 +401,12 @@ public class CoreOptions implements Serializable {
                     .withDescription(
                             "Whether to create underlying storage when reading 
and writing the table.");
 
+    public static final ConfigOption<Long> FILES_SNAPSHOT_ID =
+            ConfigOptions.key("files.snapshot-id")

Review Comment:
   Can we just reuse `scan.snapshot-id`?
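
   A minimal sketch of what that reuse could look like, assuming `CoreOptions` already defines `SCAN_SNAPSHOT_ID` for `scan.snapshot-id` (the `scanSnapshotId()` accessor below is hypothetical):

   ```java
   // Sketch only: drop files.snapshot-id and resolve the snapshot from the
   // existing scan option instead. scanSnapshotId() is a hypothetical
   // accessor reading CoreOptions.SCAN_SNAPSHOT_ID ("scan.snapshot-id").
   @Override
   public TableScan newScan() {
       return new FilesScan(storeTable, options.scanSnapshotId().orElse(null));
   }
   ```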



##########
flink-table-store-core/src/main/java/org/apache/flink/table/store/table/system/FilesTable.java:
##########
@@ -0,0 +1,296 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.flink.table.store.table.system;
+
+import org.apache.flink.core.fs.Path;
+import org.apache.flink.table.store.CoreOptions;
+import org.apache.flink.table.store.data.BinaryString;
+import org.apache.flink.table.store.data.GenericRow;
+import org.apache.flink.table.store.data.InternalRow;
+import org.apache.flink.table.store.file.casting.CastExecutor;
+import org.apache.flink.table.store.file.io.DataFileMeta;
+import org.apache.flink.table.store.file.io.DataFilePathFactory;
+import org.apache.flink.table.store.file.predicate.Predicate;
+import org.apache.flink.table.store.file.schema.SchemaEvolutionUtil;
+import org.apache.flink.table.store.file.schema.SchemaManager;
+import org.apache.flink.table.store.file.schema.TableSchema;
+import org.apache.flink.table.store.file.stats.BinaryTableStats;
+import org.apache.flink.table.store.file.stats.FieldStatsArraySerializer;
+import org.apache.flink.table.store.file.utils.IteratorRecordReader;
+import org.apache.flink.table.store.file.utils.RecordReader;
+import org.apache.flink.table.store.file.utils.SerializationUtils;
+import org.apache.flink.table.store.format.FieldStats;
+import org.apache.flink.table.store.table.FileStoreTable;
+import org.apache.flink.table.store.table.Table;
+import org.apache.flink.table.store.table.source.DataTableScan;
+import org.apache.flink.table.store.table.source.Split;
+import org.apache.flink.table.store.table.source.TableRead;
+import org.apache.flink.table.store.table.source.TableScan;
+import org.apache.flink.table.store.types.BigIntType;
+import org.apache.flink.table.store.types.DataField;
+import org.apache.flink.table.store.types.IntType;
+import org.apache.flink.table.store.types.RowType;
+import org.apache.flink.table.store.utils.ProjectedRow;
+
+import org.apache.flink.shaded.guava30.com.google.common.collect.Iterators;
+
+import javax.annotation.Nullable;
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.Objects;
+import java.util.TreeMap;
+
+import static org.apache.flink.table.store.file.catalog.Catalog.SYSTEM_TABLE_SPLITTER;
+import static org.apache.flink.util.Preconditions.checkArgument;
+
+/** A {@link Table} for showing files of a snapshot in a specific table. */
+public class FilesTable implements Table {
+
+    private static final long serialVersionUID = 1L;
+
+    public static final String FILES = "files";
+
+    public static final RowType TABLE_TYPE =
+            new RowType(
+                    Arrays.asList(
+                            new DataField(0, "snapshot_id", new BigIntType(true)),

Review Comment:
   add partition and bucket?
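
   For illustration, the two extra columns could be prepended to `TABLE_TYPE` roughly as below (a sketch only; the string rendering of the partition and the renumbering of later fields are assumptions):

   ```java
   // Hypothetical sketch: expose partition and bucket ahead of the file columns.
   new DataField(0, "snapshot_id", new BigIntType(true)),
   new DataField(1, "partition", SerializationUtils.newStringType(true)),
   new DataField(2, "bucket", new IntType(false)),
   new DataField(3, "file_path", SerializationUtils.newStringType(false)),
   // ... remaining fields renumbered from 4 upward
   ```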



##########
flink-table-store-core/src/main/java/org/apache/flink/table/store/table/system/FilesTable.java:
##########
@@ -0,0 +1,296 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.flink.table.store.table.system;
+
+import org.apache.flink.core.fs.Path;
+import org.apache.flink.table.store.CoreOptions;
+import org.apache.flink.table.store.data.BinaryString;
+import org.apache.flink.table.store.data.GenericRow;
+import org.apache.flink.table.store.data.InternalRow;
+import org.apache.flink.table.store.file.casting.CastExecutor;
+import org.apache.flink.table.store.file.io.DataFileMeta;
+import org.apache.flink.table.store.file.io.DataFilePathFactory;
+import org.apache.flink.table.store.file.predicate.Predicate;
+import org.apache.flink.table.store.file.schema.SchemaEvolutionUtil;
+import org.apache.flink.table.store.file.schema.SchemaManager;
+import org.apache.flink.table.store.file.schema.TableSchema;
+import org.apache.flink.table.store.file.stats.BinaryTableStats;
+import org.apache.flink.table.store.file.stats.FieldStatsArraySerializer;
+import org.apache.flink.table.store.file.utils.IteratorRecordReader;
+import org.apache.flink.table.store.file.utils.RecordReader;
+import org.apache.flink.table.store.file.utils.SerializationUtils;
+import org.apache.flink.table.store.format.FieldStats;
+import org.apache.flink.table.store.table.FileStoreTable;
+import org.apache.flink.table.store.table.Table;
+import org.apache.flink.table.store.table.source.DataTableScan;
+import org.apache.flink.table.store.table.source.Split;
+import org.apache.flink.table.store.table.source.TableRead;
+import org.apache.flink.table.store.table.source.TableScan;
+import org.apache.flink.table.store.types.BigIntType;
+import org.apache.flink.table.store.types.DataField;
+import org.apache.flink.table.store.types.IntType;
+import org.apache.flink.table.store.types.RowType;
+import org.apache.flink.table.store.utils.ProjectedRow;
+
+import org.apache.flink.shaded.guava30.com.google.common.collect.Iterators;
+
+import javax.annotation.Nullable;
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.Objects;
+import java.util.TreeMap;
+
+import static org.apache.flink.table.store.file.catalog.Catalog.SYSTEM_TABLE_SPLITTER;
+import static org.apache.flink.util.Preconditions.checkArgument;
+
+/** A {@link Table} for showing files of a snapshot in a specific table. */
+public class FilesTable implements Table {
+
+    private static final long serialVersionUID = 1L;
+
+    public static final String FILES = "files";
+
+    public static final RowType TABLE_TYPE =
+            new RowType(
+                    Arrays.asList(
+                            new DataField(0, "snapshot_id", new BigIntType(true)),
+                            new DataField(1, "file_path", SerializationUtils.newStringType(false)),
+                            new DataField(
+                                    2, "file_format", SerializationUtils.newStringType(false)),
+                            new DataField(3, "schema_id", new BigIntType(false)),
+                            new DataField(4, "level", new IntType(false)),
+                            new DataField(5, "record_count", new BigIntType(false)),
+                            new DataField(6, "file_size_in_bytes", new BigIntType(false)),
+                            new DataField(
+                                    7,
+                                    "null_value_counts",
+                                    SerializationUtils.newStringType(false)),
+                            new DataField(
+                                    8,
+                                    "lower_value_bounds",
+                                    SerializationUtils.newStringType(false)),
+                            new DataField(
+                                    9,
+                                    "upper_value_bounds",

Review Comment:
   maybe rename this to something like `min max stats`
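
   A sketch of the suggested rename, keeping the same string-typed payload (the exact column names are hypothetical):

   ```java
   // Hypothetical rename: make clear these are per-field min/max statistics.
   new DataField(8, "min_value_stats", SerializationUtils.newStringType(false)),
   new DataField(9, "max_value_stats", SerializationUtils.newStringType(false))
   ```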



##########
flink-table-store-core/src/main/java/org/apache/flink/table/store/table/system/FilesTable.java:
##########
@@ -0,0 +1,296 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.flink.table.store.table.system;
+
+import org.apache.flink.core.fs.Path;
+import org.apache.flink.table.store.CoreOptions;
+import org.apache.flink.table.store.data.BinaryString;
+import org.apache.flink.table.store.data.GenericRow;
+import org.apache.flink.table.store.data.InternalRow;
+import org.apache.flink.table.store.file.casting.CastExecutor;
+import org.apache.flink.table.store.file.io.DataFileMeta;
+import org.apache.flink.table.store.file.io.DataFilePathFactory;
+import org.apache.flink.table.store.file.predicate.Predicate;
+import org.apache.flink.table.store.file.schema.SchemaEvolutionUtil;
+import org.apache.flink.table.store.file.schema.SchemaManager;
+import org.apache.flink.table.store.file.schema.TableSchema;
+import org.apache.flink.table.store.file.stats.BinaryTableStats;
+import org.apache.flink.table.store.file.stats.FieldStatsArraySerializer;
+import org.apache.flink.table.store.file.utils.IteratorRecordReader;
+import org.apache.flink.table.store.file.utils.RecordReader;
+import org.apache.flink.table.store.file.utils.SerializationUtils;
+import org.apache.flink.table.store.format.FieldStats;
+import org.apache.flink.table.store.table.FileStoreTable;
+import org.apache.flink.table.store.table.Table;
+import org.apache.flink.table.store.table.source.DataTableScan;
+import org.apache.flink.table.store.table.source.Split;
+import org.apache.flink.table.store.table.source.TableRead;
+import org.apache.flink.table.store.table.source.TableScan;
+import org.apache.flink.table.store.types.BigIntType;
+import org.apache.flink.table.store.types.DataField;
+import org.apache.flink.table.store.types.IntType;
+import org.apache.flink.table.store.types.RowType;
+import org.apache.flink.table.store.utils.ProjectedRow;
+
+import org.apache.flink.shaded.guava30.com.google.common.collect.Iterators;
+
+import javax.annotation.Nullable;
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.Objects;
+import java.util.TreeMap;
+
+import static org.apache.flink.table.store.file.catalog.Catalog.SYSTEM_TABLE_SPLITTER;
+import static org.apache.flink.util.Preconditions.checkArgument;
+
+/** A {@link Table} for showing files of a snapshot in a specific table. */
+public class FilesTable implements Table {
+
+    private static final long serialVersionUID = 1L;
+
+    public static final String FILES = "files";
+
+    public static final RowType TABLE_TYPE =
+            new RowType(
+                    Arrays.asList(
+                            new DataField(0, "snapshot_id", new BigIntType(true)),
+                            new DataField(1, "file_path", SerializationUtils.newStringType(false)),
+                            new DataField(
+                                    2, "file_format", SerializationUtils.newStringType(false)),
+                            new DataField(3, "schema_id", new BigIntType(false)),
+                            new DataField(4, "level", new IntType(false)),
+                            new DataField(5, "record_count", new BigIntType(false)),
+                            new DataField(6, "file_size_in_bytes", new BigIntType(false)),
+                            new DataField(
+                                    7,
+                                    "null_value_counts",
+                                    SerializationUtils.newStringType(false)),
+                            new DataField(
+                                    8,
+                                    "lower_value_bounds",

Review Comment:
   maybe maxKey and minKey are more important than bounds.
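
   If the key range is surfaced, the row construction could draw on the file metadata roughly as below (a sketch assuming `DataFileMeta` exposes `minKey()`/`maxKey()`; the string rendering is purely illustrative):

   ```java
   // Sketch: expose the file's key range alongside or instead of value bounds.
   BinaryString minKey = BinaryString.fromString(dataFileMeta.minKey().toString());
   BinaryString maxKey = BinaryString.fromString(dataFileMeta.maxKey().toString());
   ```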



##########
flink-table-store-core/src/main/java/org/apache/flink/table/store/table/system/FilesTable.java:
##########
@@ -0,0 +1,296 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.flink.table.store.table.system;
+
+import org.apache.flink.core.fs.Path;
+import org.apache.flink.table.store.CoreOptions;
+import org.apache.flink.table.store.data.BinaryString;
+import org.apache.flink.table.store.data.GenericRow;
+import org.apache.flink.table.store.data.InternalRow;
+import org.apache.flink.table.store.file.casting.CastExecutor;
+import org.apache.flink.table.store.file.io.DataFileMeta;
+import org.apache.flink.table.store.file.io.DataFilePathFactory;
+import org.apache.flink.table.store.file.predicate.Predicate;
+import org.apache.flink.table.store.file.schema.SchemaEvolutionUtil;
+import org.apache.flink.table.store.file.schema.SchemaManager;
+import org.apache.flink.table.store.file.schema.TableSchema;
+import org.apache.flink.table.store.file.stats.BinaryTableStats;
+import org.apache.flink.table.store.file.stats.FieldStatsArraySerializer;
+import org.apache.flink.table.store.file.utils.IteratorRecordReader;
+import org.apache.flink.table.store.file.utils.RecordReader;
+import org.apache.flink.table.store.file.utils.SerializationUtils;
+import org.apache.flink.table.store.format.FieldStats;
+import org.apache.flink.table.store.table.FileStoreTable;
+import org.apache.flink.table.store.table.Table;
+import org.apache.flink.table.store.table.source.DataTableScan;
+import org.apache.flink.table.store.table.source.Split;
+import org.apache.flink.table.store.table.source.TableRead;
+import org.apache.flink.table.store.table.source.TableScan;
+import org.apache.flink.table.store.types.BigIntType;
+import org.apache.flink.table.store.types.DataField;
+import org.apache.flink.table.store.types.IntType;
+import org.apache.flink.table.store.types.RowType;
+import org.apache.flink.table.store.utils.ProjectedRow;
+
+import org.apache.flink.shaded.guava30.com.google.common.collect.Iterators;
+
+import javax.annotation.Nullable;
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.Objects;
+import java.util.TreeMap;
+
+import static org.apache.flink.table.store.file.catalog.Catalog.SYSTEM_TABLE_SPLITTER;
+import static org.apache.flink.util.Preconditions.checkArgument;
+
+/** A {@link Table} for showing files of a snapshot in a specific table. */
+public class FilesTable implements Table {
+
+    private static final long serialVersionUID = 1L;
+
+    public static final String FILES = "files";
+
+    public static final RowType TABLE_TYPE =
+            new RowType(
+                    Arrays.asList(
+                            new DataField(0, "snapshot_id", new BigIntType(true)),
+                            new DataField(1, "file_path", SerializationUtils.newStringType(false)),
+                            new DataField(
+                                    2, "file_format", SerializationUtils.newStringType(false)),
+                            new DataField(3, "schema_id", new BigIntType(false)),
+                            new DataField(4, "level", new IntType(false)),
+                            new DataField(5, "record_count", new BigIntType(false)),
+                            new DataField(6, "file_size_in_bytes", new BigIntType(false)),
+                            new DataField(
+                                    7,
+                                    "null_value_counts",
+                                    SerializationUtils.newStringType(false)),
+                            new DataField(
+                                    8,
+                                    "lower_value_bounds",
+                                    SerializationUtils.newStringType(false)),
+                            new DataField(
+                                    9,
+                                    "upper_value_bounds",
+                                    SerializationUtils.newStringType(false))));
+
+    private final FileStoreTable storeTable;
+    private final CoreOptions options;
+
+    public FilesTable(FileStoreTable storeTable, CoreOptions options) {
+        this.storeTable = storeTable;
+        this.options = options;
+    }
+
+    @Override
+    public String name() {
+        return storeTable.name() + SYSTEM_TABLE_SPLITTER + FILES;
+    }
+
+    @Override
+    public RowType rowType() {
+        return TABLE_TYPE;
+    }
+
+    @Override
+    public Path location() {
+        return storeTable.location();
+    }
+
+    @Override
+    public TableScan newScan() {
+        return new FilesScan(storeTable, options.filesSnapshotId().orElse(null));
+    }
+
+    @Override
+    public TableRead newRead() {
+        return new FilesRead(new SchemaManager(storeTable.location()));
+    }
+
+    @Override
+    public Table copy(Map<String, String> dynamicOptions) {
+        return new FilesTable(storeTable, new CoreOptions(dynamicOptions));
+    }
+
+    private static class FilesScan implements TableScan {
+        private final FileStoreTable storeTable;
+
+        @Nullable private final Long snapshotId;
+
+        private FilesScan(FileStoreTable storeTable, Long snapshotId) {
+            this.storeTable = storeTable;
+            this.snapshotId = snapshotId;
+        }
+
+        @Override
+        public TableScan withFilter(Predicate predicate) {
+            // TODO
+            return this;
+        }
+
+        @Override
+        public Plan plan() {
+            return () -> Collections.singletonList(new FilesSplit(snapshotId, storeTable));
+        }
+    }
+
+    private static class FilesSplit implements Split {
+
+        private static final long serialVersionUID = 1L;
+
+        @Nullable private final Long snapshotId;
+
+        private final FileStoreTable storeTable;
+
+        private FilesSplit(@Nullable Long snapshotId, FileStoreTable storeTable) {
+            this.snapshotId = snapshotId;
+            this.storeTable = storeTable;
+        }
+
+        @Override
+        public long rowCount() {
+            return dataFilePlan().splits.stream().mapToLong(s -> s.files().size()).sum();
+        }
+
+        private DataTableScan.DataFilePlan dataFilePlan() {
+            DataTableScan scan = storeTable.newScan();
+            if (snapshotId != null) {
+                scan.withSnapshot(snapshotId);
+            }
+            return scan.plan();
+        }
+
+        @Override
+        public boolean equals(Object o) {
+            if (this == o) {
+                return true;
+            }
+            if (o == null || getClass() != o.getClass()) {
+                return false;
+            }
+            FilesSplit that = (FilesSplit) o;
+            return Objects.equals(storeTable, that.storeTable)
+                    && Objects.equals(snapshotId, that.snapshotId);
+        }
+
+        @Override
+        public int hashCode() {
+            return Objects.hash(snapshotId, storeTable);
+        }
+    }
+
+    private static class FilesRead implements TableRead {
+        private final SchemaManager schemaManager;
+
+        private int[][] projection;
+
+        private FilesRead(SchemaManager schemaManager) {
+            this.schemaManager = schemaManager;
+        }
+
+        @Override
+        public TableRead withFilter(Predicate predicate) {
+            // TODO
+            return this;
+        }
+
+        @Override
+        public TableRead withProjection(int[][] projection) {
+            this.projection = projection;
+            return this;
+        }
+
+        @Override
+        public RecordReader<InternalRow> createReader(Split split) throws IOException {
+            if (!(split instanceof FilesSplit)) {
+                throw new IllegalArgumentException("Unsupported split: " + split.getClass());
+            }
+            FilesSplit filesSplit = (FilesSplit) split;
+            DataTableScan.DataFilePlan dataFilePlan = filesSplit.dataFilePlan();
+            Iterator<InternalRow> rows =
+                    Iterators.transform(
+                            dataFilePlan.splits.stream()
+                                    .flatMap(s -> s.files().stream())
+                                    .iterator(),
+                            v -> toRow(dataFilePlan.snapshotId, v, filesSplit.storeTable));
+            if (projection != null) {
+                rows =
+                        Iterators.transform(
+                                rows, row -> ProjectedRow.from(projection).replaceRow(row));
+            }
+            return new IteratorRecordReader<>(rows);
+        }
+
+        private InternalRow toRow(
+                Long snapshotId, DataFileMeta dataFileMeta, FileStoreTable storeTable) {
+            TableSchema tableSchema =
+                    schemaManager.schema(
+                            storeTable.snapshotManager().snapshot(snapshotId).schemaId());
+            TableSchema dataSchema = schemaManager.schema(dataFileMeta.schemaId());
+            int[] indexMapping =
+                    SchemaEvolutionUtil.createIndexMapping(
+                            tableSchema.fields(), dataSchema.fields());
+            CastExecutor<Object, Object>[] castExecutors =
+                    (CastExecutor<Object, Object>[])
+                            SchemaEvolutionUtil.createConvertMapping(
+                                    tableSchema.fields(), dataSchema.fields(), indexMapping);
+            RowType rowType = dataSchema.logicalRowType();
+            // Create field stats array serializer with schema evolution
+            FieldStatsArraySerializer fieldStatsArraySerializer =
+                    new FieldStatsArraySerializer(rowType, indexMapping, castExecutors);
+
+            // Get schema field stats for different table
+            BinaryTableStats schemaFieldStats = storeTable.getSchemaFieldStats(dataFileMeta);
+
+            // Create value stats
+            List<String> fieldNames = tableSchema.fieldNames();
+            FieldStats[] fieldStatsArray =
+                    schemaFieldStats.fields(fieldStatsArraySerializer, dataFileMeta.rowCount());
+            checkArgument(fieldNames.size() == fieldStatsArray.length);
+            Map<String, Long> nullValueCounts = new TreeMap<>();
+            Map<String, Object> lowerValueBounds = new TreeMap<>();
+            Map<String, Object> upperValueBounds = new TreeMap<>();
+            for (int i = 0; i < fieldStatsArray.length; i++) {
+                String fieldName = fieldNames.get(i);
+                FieldStats fieldStats = fieldStatsArray[i];
+                nullValueCounts.put(fieldName, fieldStats.nullCount());
+                lowerValueBounds.put(fieldName, fieldStats.minValue());
+                upperValueBounds.put(fieldName, fieldStats.maxValue());
+            }
+
+            return GenericRow.of(
+                    snapshotId,
+                    BinaryString.fromString(dataFileMeta.fileName()),

Review Comment:
   schema and data are not consistent.
   You should add tests for this.
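
   A sketch of the kind of consistency test this comment asks for (JUnit 4; `filesTable` and `split` are assumed test fixtures built the same way the production code above builds them):

   ```java
   import org.apache.flink.table.store.data.InternalRow;
   import org.apache.flink.table.store.file.utils.RecordReader;
   import org.apache.flink.table.store.table.source.Split;

   import org.junit.Test;

   import static org.junit.Assert.assertEquals;

   public class FilesTableArityTest {

       // Assumed fixtures: a FilesTable over a test table plus one of its splits.
       private FilesTable filesTable;
       private Split split;

       @Test
       public void testRowArityMatchesTableType() throws Exception {
           RecordReader<InternalRow> reader = filesTable.newRead().createReader(split);
           RecordReader.RecordIterator<InternalRow> batch;
           while ((batch = reader.readBatch()) != null) {
               InternalRow row;
               while ((row = batch.next()) != null) {
                   // Every produced row must match the declared TABLE_TYPE arity,
                   // otherwise projections read the wrong columns.
                   assertEquals(FilesTable.TABLE_TYPE.getFieldCount(), row.getFieldCount());
               }
               batch.releaseBatch();
           }
           reader.close();
       }
   }
   ```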



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@flink.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org
