This is an automated email from the ASF dual-hosted git repository.
junhao pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/paimon.git
The following commit(s) were added to refs/heads/master by this push:
new f2cae5d5c3 [core] Reduce useless getFileStatus for Parquet Reader (#5217)
f2cae5d5c3 is described below
commit f2cae5d5c312f78f73029dd7359b05a6324de48e
Author: Jingsong Lee <[email protected]>
AuthorDate: Wed Mar 5 18:16:40 2025 +0800
[core] Reduce useless getFileStatus for Parquet Reader (#5217)
---
.../apache/paimon/format/SimpleStatsExtractor.java | 4 ++--
.../org/apache/paimon/fs/local/LocalFileIO.java | 14 ++++++++++++
.../paimon/format/SimpleColStatsExtractorTest.java | 2 +-
.../apache/paimon/io/KeyValueDataFileWriter.java | 6 ++++--
.../org/apache/paimon/io/RowDataFileWriter.java | 6 ++++--
.../paimon/io/StatsCollectingSingleFileWriter.java | 4 ++--
.../org/apache/paimon/migrate/FileMetaUtils.java | 2 +-
.../test/java/org/apache/paimon/SnapshotTest.java | 25 ++++++++++++++++++++++
.../paimon/stats/TestSimpleStatsExtractor.java | 8 +++----
.../format/avro/AvroSimpleStatsExtractor.java | 10 ++++-----
.../format/orc/filter/OrcSimpleStatsExtractor.java | 8 +++----
.../paimon/format/parquet/ParquetInputFile.java | 22 ++++++++++---------
.../format/parquet/ParquetReaderFactory.java | 6 ++++--
.../parquet/ParquetSimpleStatsExtractor.java | 10 ++++-----
.../apache/paimon/format/parquet/ParquetUtil.java | 19 +++++++++-------
.../format/parquet/ParquetFormatReadWriteTest.java | 5 +++--
16 files changed, 100 insertions(+), 51 deletions(-)
diff --git
a/paimon-common/src/main/java/org/apache/paimon/format/SimpleStatsExtractor.java
b/paimon-common/src/main/java/org/apache/paimon/format/SimpleStatsExtractor.java
index a2d599aaab..9ffbf495e5 100644
---
a/paimon-common/src/main/java/org/apache/paimon/format/SimpleStatsExtractor.java
+++
b/paimon-common/src/main/java/org/apache/paimon/format/SimpleStatsExtractor.java
@@ -27,9 +27,9 @@ import java.io.IOException;
/** Extracts statistics directly from file. */
public interface SimpleStatsExtractor {
- SimpleColStats[] extract(FileIO fileIO, Path path) throws IOException;
+ SimpleColStats[] extract(FileIO fileIO, Path path, long length) throws
IOException;
- Pair<SimpleColStats[], FileInfo> extractWithFileInfo(FileIO fileIO, Path
path)
+ Pair<SimpleColStats[], FileInfo> extractWithFileInfo(FileIO fileIO, Path
path, long length)
throws IOException;
/** File info fetched from physical file. */
diff --git
a/paimon-common/src/main/java/org/apache/paimon/fs/local/LocalFileIO.java
b/paimon-common/src/main/java/org/apache/paimon/fs/local/LocalFileIO.java
index ac84c242c5..0b1c4ad260 100644
--- a/paimon-common/src/main/java/org/apache/paimon/fs/local/LocalFileIO.java
+++ b/paimon-common/src/main/java/org/apache/paimon/fs/local/LocalFileIO.java
@@ -26,6 +26,9 @@ import org.apache.paimon.fs.PositionOutputStream;
import org.apache.paimon.fs.SeekableInputStream;
import org.apache.paimon.fs.VectoredReadable;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
@@ -49,6 +52,8 @@ import static
org.apache.paimon.utils.Preconditions.checkState;
/** {@link FileIO} for local file. */
public class LocalFileIO implements FileIO {
+ private static final Logger LOG =
LoggerFactory.getLogger(LocalFileIO.class);
+
private static final long serialVersionUID = 1L;
// the lock to ensure atomic renaming
@@ -68,11 +73,13 @@ public class LocalFileIO implements FileIO {
@Override
public SeekableInputStream newInputStream(Path path) throws IOException {
+ LOG.debug("Invoking newInputStream for {}", path);
return new LocalSeekableInputStream(toFile(path));
}
@Override
public PositionOutputStream newOutputStream(Path path, boolean overwrite)
throws IOException {
+ LOG.debug("Invoking newOutputStream for {}", path);
if (exists(path) && !overwrite) {
throw new FileAlreadyExistsException("File already exists: " +
path);
}
@@ -87,6 +94,7 @@ public class LocalFileIO implements FileIO {
@Override
public FileStatus getFileStatus(Path path) throws IOException {
+ LOG.debug("Invoking getFileStatus for {}", path);
final File file = toFile(path);
if (file.exists()) {
return new LocalFileStatus(file, SCHEME);
@@ -103,6 +111,7 @@ public class LocalFileIO implements FileIO {
@Override
public FileStatus[] listStatus(Path path) throws IOException {
+ LOG.debug("Invoking listStatus for {}", path);
final File file = toFile(path);
FileStatus[] results = new FileStatus[0];
@@ -133,11 +142,13 @@ public class LocalFileIO implements FileIO {
@Override
public boolean exists(Path path) throws IOException {
+ LOG.debug("Invoking exists for {}", path);
return toFile(path).exists();
}
@Override
public boolean delete(Path path, boolean recursive) throws IOException {
+ LOG.debug("Invoking delete for {}", path);
File file = toFile(path);
if (file.isFile()) {
return file.delete();
@@ -175,6 +186,7 @@ public class LocalFileIO implements FileIO {
@Override
public boolean mkdirs(Path path) throws IOException {
+ LOG.debug("Invoking mkdirs for {}", path);
return mkdirsInternal(toFile(path));
}
@@ -196,6 +208,7 @@ public class LocalFileIO implements FileIO {
@Override
public boolean rename(Path src, Path dst) throws IOException {
+ LOG.debug("Invoking rename for {} to {}", src, dst);
File srcFile = toFile(src);
File dstFile = toFile(dst);
File dstParent = dstFile.getParentFile();
@@ -219,6 +232,7 @@ public class LocalFileIO implements FileIO {
@Override
public void copyFile(Path sourcePath, Path targetPath, boolean overwrite)
throws IOException {
+ LOG.debug("Invoking copyFile for {} to {}", sourcePath, targetPath);
if (!overwrite && exists(targetPath)) {
return;
}
diff --git
a/paimon-common/src/test/java/org/apache/paimon/format/SimpleColStatsExtractorTest.java
b/paimon-common/src/test/java/org/apache/paimon/format/SimpleColStatsExtractorTest.java
index ddb3da7a63..c12bc3215d 100644
---
a/paimon-common/src/test/java/org/apache/paimon/format/SimpleColStatsExtractorTest.java
+++
b/paimon-common/src/test/java/org/apache/paimon/format/SimpleColStatsExtractorTest.java
@@ -99,7 +99,7 @@ public abstract class SimpleColStatsExtractorTest {
SimpleStatsExtractor extractor = format.createStatsExtractor(rowType,
stats).get();
assertThat(extractor).isNotNull();
- SimpleColStats[] actual = extractor.extract(fileIO, path);
+ SimpleColStats[] actual = extractor.extract(fileIO, path,
fileIO.getFileSize(path));
for (int i = 0; i < expected.length; i++) {
expected[i] = regenerate(expected[i], rowType.getTypeAt(i));
}
diff --git
a/paimon-core/src/main/java/org/apache/paimon/io/KeyValueDataFileWriter.java
b/paimon-core/src/main/java/org/apache/paimon/io/KeyValueDataFileWriter.java
index 3c7f6b45bb..e655f0ed3a 100644
--- a/paimon-core/src/main/java/org/apache/paimon/io/KeyValueDataFileWriter.java
+++ b/paimon-core/src/main/java/org/apache/paimon/io/KeyValueDataFileWriter.java
@@ -169,7 +169,9 @@ public abstract class KeyValueDataFileWriter
return null;
}
- Pair<SimpleColStats[], SimpleColStats[]> keyValueStats =
fetchKeyValueStats(fieldStats());
+ long fileSize = fileIO.getFileSize(path);
+ Pair<SimpleColStats[], SimpleColStats[]> keyValueStats =
+ fetchKeyValueStats(fieldStats(fileSize));
SimpleStats keyStats =
keyStatsConverter.toBinaryAllMode(keyValueStats.getKey());
Pair<List<String>, SimpleStats> valueStatsPair =
@@ -183,7 +185,7 @@ public abstract class KeyValueDataFileWriter
String externalPath = isExternalPath ? path.toString() : null;
return new DataFileMeta(
path.getName(),
- fileIO.getFileSize(path),
+ fileSize,
recordCount(),
minKey,
keySerializer.toBinaryRow(maxKey).copy(),
diff --git
a/paimon-core/src/main/java/org/apache/paimon/io/RowDataFileWriter.java
b/paimon-core/src/main/java/org/apache/paimon/io/RowDataFileWriter.java
index 25906e2dfa..a21041a3ab 100644
--- a/paimon-core/src/main/java/org/apache/paimon/io/RowDataFileWriter.java
+++ b/paimon-core/src/main/java/org/apache/paimon/io/RowDataFileWriter.java
@@ -109,7 +109,9 @@ public class RowDataFileWriter extends
StatsCollectingSingleFileWriter<InternalR
@Override
public DataFileMeta result() throws IOException {
- Pair<List<String>, SimpleStats> statsPair =
statsArraySerializer.toBinary(fieldStats());
+ long fileSize = fileIO.getFileSize(path);
+ Pair<List<String>, SimpleStats> statsPair =
+ statsArraySerializer.toBinary(fieldStats(fileSize));
DataFileIndexWriter.FileIndexResult indexResult =
dataFileIndexWriter == null
? DataFileIndexWriter.EMPTY_RESULT
@@ -117,7 +119,7 @@ public class RowDataFileWriter extends
StatsCollectingSingleFileWriter<InternalR
String externalPath = isExternalPath ? path.toString() : null;
return DataFileMeta.forAppend(
path.getName(),
- fileIO.getFileSize(path),
+ fileSize,
recordCount(),
statsPair.getRight(),
seqNumCounter.getValue() - super.recordCount(),
diff --git
a/paimon-core/src/main/java/org/apache/paimon/io/StatsCollectingSingleFileWriter.java
b/paimon-core/src/main/java/org/apache/paimon/io/StatsCollectingSingleFileWriter.java
index 67a3fa6d1a..07fc26d26f 100644
---
a/paimon-core/src/main/java/org/apache/paimon/io/StatsCollectingSingleFileWriter.java
+++
b/paimon-core/src/main/java/org/apache/paimon/io/StatsCollectingSingleFileWriter.java
@@ -96,13 +96,13 @@ public abstract class StatsCollectingSingleFileWriter<T, R>
extends SingleFileWr
super.writeBundle(bundle);
}
- public SimpleColStats[] fieldStats() throws IOException {
+ public SimpleColStats[] fieldStats(long fileSize) throws IOException {
Preconditions.checkState(closed, "Cannot access metric unless the
writer is closed.");
if (simpleStatsExtractor != null) {
if (isStatsDisabled) {
return noneStats;
} else {
- return simpleStatsExtractor.extract(fileIO, path);
+ return simpleStatsExtractor.extract(fileIO, path, fileSize);
}
} else {
return simpleStatsCollector.extract();
diff --git
a/paimon-core/src/main/java/org/apache/paimon/migrate/FileMetaUtils.java
b/paimon-core/src/main/java/org/apache/paimon/migrate/FileMetaUtils.java
index 405870d5fa..a960ce7575 100644
--- a/paimon-core/src/main/java/org/apache/paimon/migrate/FileMetaUtils.java
+++ b/paimon-core/src/main/java/org/apache/paimon/migrate/FileMetaUtils.java
@@ -218,7 +218,7 @@ public class FileMetaUtils {
SimpleStatsConverter statsArraySerializer = new
SimpleStatsConverter(rowTypeWithSchemaId);
Pair<SimpleColStats[], SimpleStatsExtractor.FileInfo> fileInfo =
- simpleStatsExtractor.extractWithFileInfo(fileIO, path);
+ simpleStatsExtractor.extractWithFileInfo(fileIO, path,
fileSize);
SimpleStats stats =
statsArraySerializer.toBinaryAllMode(fileInfo.getLeft());
return DataFileMeta.forAppend(
diff --git a/paimon-core/src/test/java/org/apache/paimon/SnapshotTest.java
b/paimon-core/src/test/java/org/apache/paimon/SnapshotTest.java
index 41f5e2dd95..1cceeffbfa 100644
--- a/paimon-core/src/test/java/org/apache/paimon/SnapshotTest.java
+++ b/paimon-core/src/test/java/org/apache/paimon/SnapshotTest.java
@@ -26,6 +26,7 @@ import org.apache.paimon.utils.SnapshotManager;
import org.junit.jupiter.api.Test;
import static
org.apache.paimon.utils.FileSystemBranchManager.DEFAULT_MAIN_BRANCH;
+import static org.assertj.core.api.Assertions.assertThat;
/** Test for snapshots. */
public class SnapshotTest {
@@ -50,6 +51,30 @@ public class SnapshotTest {
+ "}");
}
+ @Test
+ public void testSnapshotWithSizes() {
+ String json =
+ "{\n"
+ + " \"version\" : 3,\n"
+ + " \"id\" : 5,\n"
+ + " \"schemaId\" : 0,\n"
+ + " \"baseManifestList\" : null,\n"
+ + " \"baseManifestListSize\" : 6,\n"
+ + " \"deltaManifestList\" : null,\n"
+ + " \"deltaManifestListSize\" : 8,\n"
+ + " \"changelogManifestListSize\" : 10,\n"
+ + " \"commitUser\" : null,\n"
+ + " \"commitIdentifier\" : 0,\n"
+ + " \"commitKind\" : \"APPEND\",\n"
+ + " \"timeMillis\" : 1234,\n"
+ + " \"totalRecordCount\" : null,\n"
+ + " \"deltaRecordCount\" : null,\n"
+ + " \"unknownKey\" : 22222\n"
+ + "}";
+ Snapshot snapshot = Snapshot.fromJson(json);
+ assertThat(Snapshot.fromJson(snapshot.toJson())).isEqualTo(snapshot);
+ }
+
public static SnapshotManager newSnapshotManager(FileIO fileIO, Path
tablePath) {
return newSnapshotManager(fileIO, tablePath, DEFAULT_MAIN_BRANCH);
}
diff --git
a/paimon-core/src/test/java/org/apache/paimon/stats/TestSimpleStatsExtractor.java
b/paimon-core/src/test/java/org/apache/paimon/stats/TestSimpleStatsExtractor.java
index a410fd6e47..96efc4d11a 100644
---
a/paimon-core/src/test/java/org/apache/paimon/stats/TestSimpleStatsExtractor.java
+++
b/paimon-core/src/test/java/org/apache/paimon/stats/TestSimpleStatsExtractor.java
@@ -59,13 +59,13 @@ public class TestSimpleStatsExtractor implements
SimpleStatsExtractor {
}
@Override
- public SimpleColStats[] extract(FileIO fileIO, Path path) throws
IOException {
- return extractWithFileInfo(fileIO, path).getLeft();
+ public SimpleColStats[] extract(FileIO fileIO, Path path, long length)
throws IOException {
+ return extractWithFileInfo(fileIO, path, length).getLeft();
}
@Override
- public Pair<SimpleColStats[], FileInfo> extractWithFileInfo(FileIO fileIO,
Path path)
- throws IOException {
+ public Pair<SimpleColStats[], FileInfo> extractWithFileInfo(
+ FileIO fileIO, Path path, long length) throws IOException {
IdentityObjectSerializer serializer = new
IdentityObjectSerializer(rowType);
FormatReaderFactory readerFactory =
format.createReaderFactory(rowType);
List<InternalRow> records = readListFromFile(fileIO, path, serializer,
readerFactory);
diff --git
a/paimon-format/src/main/java/org/apache/paimon/format/avro/AvroSimpleStatsExtractor.java
b/paimon-format/src/main/java/org/apache/paimon/format/avro/AvroSimpleStatsExtractor.java
index 6539de25c3..16e014dd5d 100644
---
a/paimon-format/src/main/java/org/apache/paimon/format/avro/AvroSimpleStatsExtractor.java
+++
b/paimon-format/src/main/java/org/apache/paimon/format/avro/AvroSimpleStatsExtractor.java
@@ -39,25 +39,23 @@ import java.util.stream.IntStream;
public class AvroSimpleStatsExtractor implements SimpleStatsExtractor {
private final RowType rowType;
- private final SimpleColStatsCollector.Factory[] statsCollectors;
public AvroSimpleStatsExtractor(
RowType rowType, SimpleColStatsCollector.Factory[]
statsCollectors) {
this.rowType = rowType;
- this.statsCollectors = statsCollectors;
Preconditions.checkArgument(
rowType.getFieldCount() == statsCollectors.length,
"The stats collector is not aligned to write schema.");
}
@Override
- public SimpleColStats[] extract(FileIO fileIO, Path path) throws
IOException {
- return extractWithFileInfo(fileIO, path).getLeft();
+ public SimpleColStats[] extract(FileIO fileIO, Path path, long length)
throws IOException {
+ return extractWithFileInfo(fileIO, path, length).getLeft();
}
@Override
- public Pair<SimpleColStats[], FileInfo> extractWithFileInfo(FileIO fileIO,
Path path)
- throws IOException {
+ public Pair<SimpleColStats[], FileInfo> extractWithFileInfo(
+ FileIO fileIO, Path path, long length) throws IOException {
SeekableInputStream fileInputStream = fileIO.newInputStream(path);
long rowCount = getRowCount(fileInputStream);
diff --git
a/paimon-format/src/main/java/org/apache/paimon/format/orc/filter/OrcSimpleStatsExtractor.java
b/paimon-format/src/main/java/org/apache/paimon/format/orc/filter/OrcSimpleStatsExtractor.java
index c0b9b6f59b..88fe069d46 100644
---
a/paimon-format/src/main/java/org/apache/paimon/format/orc/filter/OrcSimpleStatsExtractor.java
+++
b/paimon-format/src/main/java/org/apache/paimon/format/orc/filter/OrcSimpleStatsExtractor.java
@@ -71,13 +71,13 @@ public class OrcSimpleStatsExtractor implements
SimpleStatsExtractor {
}
@Override
- public SimpleColStats[] extract(FileIO fileIO, Path path) throws
IOException {
- return extractWithFileInfo(fileIO, path).getLeft();
+ public SimpleColStats[] extract(FileIO fileIO, Path path, long length)
throws IOException {
+ return extractWithFileInfo(fileIO, path, length).getLeft();
}
@Override
- public Pair<SimpleColStats[], FileInfo> extractWithFileInfo(FileIO fileIO,
Path path)
- throws IOException {
+ public Pair<SimpleColStats[], FileInfo> extractWithFileInfo(
+ FileIO fileIO, Path path, long length) throws IOException {
try (Reader reader =
OrcReaderFactory.createReader(new Configuration(false),
fileIO, path, null)) {
long rowCount = reader.getNumberOfRows();
diff --git
a/paimon-format/src/main/java/org/apache/paimon/format/parquet/ParquetInputFile.java
b/paimon-format/src/main/java/org/apache/paimon/format/parquet/ParquetInputFile.java
index 7c52c24971..0e68416ba2 100644
---
a/paimon-format/src/main/java/org/apache/paimon/format/parquet/ParquetInputFile.java
+++
b/paimon-format/src/main/java/org/apache/paimon/format/parquet/ParquetInputFile.java
@@ -19,7 +19,6 @@
package org.apache.paimon.format.parquet;
import org.apache.paimon.fs.FileIO;
-import org.apache.paimon.fs.FileStatus;
import org.apache.paimon.fs.Path;
import org.apache.parquet.io.InputFile;
@@ -30,33 +29,36 @@ import java.io.IOException;
public class ParquetInputFile implements InputFile {
private final FileIO fileIO;
- private final FileStatus stat;
+ private final Path path;
+ private final long length;
- public static ParquetInputFile fromPath(FileIO fileIO, Path path) throws
IOException {
- return new ParquetInputFile(fileIO, fileIO.getFileStatus(path));
+ public static ParquetInputFile fromPath(FileIO fileIO, Path path, long
length)
+ throws IOException {
+ return new ParquetInputFile(fileIO, path, length);
}
- private ParquetInputFile(FileIO fileIO, FileStatus stat) {
+ private ParquetInputFile(FileIO fileIO, Path path, long length) {
this.fileIO = fileIO;
- this.stat = stat;
+ this.path = path;
+ this.length = length;
}
public Path getPath() {
- return stat.getPath();
+ return path;
}
@Override
public long getLength() {
- return stat.getLen();
+ return length;
}
@Override
public ParquetInputStream newStream() throws IOException {
- return new ParquetInputStream(fileIO.newInputStream(stat.getPath()));
+ return new ParquetInputStream(fileIO.newInputStream(path));
}
@Override
public String toString() {
- return stat.getPath().toString();
+ return path.toString();
}
}
diff --git
a/paimon-format/src/main/java/org/apache/paimon/format/parquet/ParquetReaderFactory.java
b/paimon-format/src/main/java/org/apache/paimon/format/parquet/ParquetReaderFactory.java
index 82ffa79336..15db2d113a 100644
---
a/paimon-format/src/main/java/org/apache/paimon/format/parquet/ParquetReaderFactory.java
+++
b/paimon-format/src/main/java/org/apache/paimon/format/parquet/ParquetReaderFactory.java
@@ -113,7 +113,8 @@ public class ParquetReaderFactory implements
FormatReaderFactory {
ParquetFileReader reader =
new ParquetFileReader(
- ParquetInputFile.fromPath(context.fileIO(),
context.filePath()),
+ ParquetInputFile.fromPath(
+ context.fileIO(), context.filePath(),
context.fileSize()),
builder.build(),
context.selection());
MessageType fileSchema = reader.getFileMetaData().getSchema();
@@ -145,7 +146,8 @@ public class ParquetReaderFactory implements
FormatReaderFactory {
ParquetFileReader reader =
new ParquetFileReader(
- ParquetInputFile.fromPath(context.fileIO(),
context.filePath()),
+ ParquetInputFile.fromPath(
+ context.fileIO(), context.filePath(),
context.fileSize()),
builder.build(),
context.selection());
MessageType fileSchema = reader.getFileMetaData().getSchema();
diff --git
a/paimon-format/src/main/java/org/apache/paimon/format/parquet/ParquetSimpleStatsExtractor.java
b/paimon-format/src/main/java/org/apache/paimon/format/parquet/ParquetSimpleStatsExtractor.java
index c0d6cef1b8..4dbd232572 100644
---
a/paimon-format/src/main/java/org/apache/paimon/format/parquet/ParquetSimpleStatsExtractor.java
+++
b/paimon-format/src/main/java/org/apache/paimon/format/parquet/ParquetSimpleStatsExtractor.java
@@ -67,15 +67,15 @@ public class ParquetSimpleStatsExtractor implements
SimpleStatsExtractor {
}
@Override
- public SimpleColStats[] extract(FileIO fileIO, Path path) throws
IOException {
- return extractWithFileInfo(fileIO, path).getLeft();
+ public SimpleColStats[] extract(FileIO fileIO, Path path, long length)
throws IOException {
+ return extractWithFileInfo(fileIO, path, length).getLeft();
}
@Override
- public Pair<SimpleColStats[], FileInfo> extractWithFileInfo(FileIO fileIO,
Path path)
- throws IOException {
+ public Pair<SimpleColStats[], FileInfo> extractWithFileInfo(
+ FileIO fileIO, Path path, long length) throws IOException {
Pair<Map<String, Statistics<?>>, FileInfo> statsPair =
- ParquetUtil.extractColumnStats(fileIO, path);
+ ParquetUtil.extractColumnStats(fileIO, path, length);
SimpleColStatsCollector[] collectors =
SimpleColStatsCollector.create(statsCollectors);
return Pair.of(
IntStream.range(0, rowType.getFieldCount())
diff --git
a/paimon-format/src/main/java/org/apache/paimon/format/parquet/ParquetUtil.java
b/paimon-format/src/main/java/org/apache/paimon/format/parquet/ParquetUtil.java
index 038c91445b..0ec4fa162a 100644
---
a/paimon-format/src/main/java/org/apache/paimon/format/parquet/ParquetUtil.java
+++
b/paimon-format/src/main/java/org/apache/paimon/format/parquet/ParquetUtil.java
@@ -40,15 +40,16 @@ import java.util.Map;
public class ParquetUtil {
/**
- * Extract stats from specified Parquet files path.
+ * Extract stats from specified Parquet file path.
*
- * @param path the path of parquet files to be read
+ * @param path the path of parquet file to be read
+ * @param length the length of parquet file to be read
* @return result sets as map, key is column name, value is statistics
(for example, null count,
* minimum value, maximum value)
*/
public static Pair<Map<String, Statistics<?>>,
SimpleStatsExtractor.FileInfo>
- extractColumnStats(FileIO fileIO, Path path) throws IOException {
- try (ParquetFileReader reader = getParquetReader(fileIO, path)) {
+ extractColumnStats(FileIO fileIO, Path path, long length) throws
IOException {
+ try (ParquetFileReader reader = getParquetReader(fileIO, path,
length)) {
ParquetMetadata parquetMetadata = reader.getFooter();
List<BlockMetaData> blockMetaDataList =
parquetMetadata.getBlocks();
Map<String, Statistics<?>> resultStats = new HashMap<>();
@@ -72,14 +73,16 @@ public class ParquetUtil {
}
/**
- * Generate {@link ParquetFileReader} instance to read the Parquet files
at the given path.
+ * Generate {@link ParquetFileReader} instance to read the Parquet file at
the given path.
*
- * @param path the path of parquet files to be read
+ * @param path the path of parquet file to be read
+ * @param length the length of parquet file to be read
* @return parquet reader, used for reading footer, status, etc.
*/
- public static ParquetFileReader getParquetReader(FileIO fileIO, Path path)
throws IOException {
+ public static ParquetFileReader getParquetReader(FileIO fileIO, Path path,
long length)
+ throws IOException {
return new ParquetFileReader(
- ParquetInputFile.fromPath(fileIO, path),
+ ParquetInputFile.fromPath(fileIO, path, length),
ParquetReadOptions.builder().build(),
null);
}
diff --git
a/paimon-format/src/test/java/org/apache/paimon/format/parquet/ParquetFormatReadWriteTest.java
b/paimon-format/src/test/java/org/apache/paimon/format/parquet/ParquetFormatReadWriteTest.java
index 221d524fff..024ea93b6e 100644
---
a/paimon-format/src/test/java/org/apache/paimon/format/parquet/ParquetFormatReadWriteTest.java
+++
b/paimon-format/src/test/java/org/apache/paimon/format/parquet/ParquetFormatReadWriteTest.java
@@ -64,7 +64,7 @@ public class ParquetFormatReadWriteTest extends
FormatReadWriteTest {
RowType rowType = DataTypes.ROW(DataTypes.INT().notNull(),
DataTypes.BIGINT());
if (ThreadLocalRandom.current().nextBoolean()) {
- rowType = (RowType) rowType.notNull();
+ rowType = rowType.notNull();
}
PositionOutputStream out = fileIO.newOutputStream(file, false);
@@ -75,7 +75,8 @@ public class ParquetFormatReadWriteTest extends
FormatReadWriteTest {
writer.close();
out.close();
- try (ParquetFileReader reader = ParquetUtil.getParquetReader(fileIO,
file)) {
+ try (ParquetFileReader reader =
+ ParquetUtil.getParquetReader(fileIO, file,
fileIO.getFileSize(file))) {
ParquetMetadata parquetMetadata = reader.getFooter();
List<BlockMetaData> blockMetaDataList =
parquetMetadata.getBlocks();
for (BlockMetaData blockMetaData : blockMetaDataList) {