[
https://issues.apache.org/jira/browse/DRILL-5660?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16104821#comment-16104821
]
ASF GitHub Bot commented on DRILL-5660:
---------------------------------------
Github user arina-ielchiieva commented on a diff in the pull request:
https://github.com/apache/drill/pull/877#discussion_r130068040
--- Diff:
exec/java-exec/src/test/java/org/apache/drill/exec/store/parquet/TestParquetMetadataCache.java
---
@@ -424,32 +432,121 @@ public void testMoveCache() throws Exception {
@Test
public void testMetadataCacheAbsolutePaths() throws Exception {
+ final String absolutePathsMetadata = "absolute_paths_metadata";
try {
test("use dfs_test.tmp");
- final String relative_path_metadata_t1 = RELATIVE_PATHS_METADATA +
"/t1";
- final String relative_path_metadata_t2 = RELATIVE_PATHS_METADATA +
"/t2";
- test("create table `%s` as select * from cp.`tpch/nation.parquet`",
relative_path_metadata_t1);
- test("create table `%s` as select * from cp.`tpch/nation.parquet`",
relative_path_metadata_t2);
+ // creating two inner directories to leverage
METADATA_DIRECTORIES_FILENAME metadata file as well
+ final String absolutePathsMetadataT1 = absolutePathsMetadata + "/t1";
+ final String absolutePathsMetadataT2 = absolutePathsMetadata + "/t2";
+ test("create table `%s` as select * from cp.`tpch/nation.parquet`",
absolutePathsMetadataT1);
+ test("create table `%s` as select * from cp.`tpch/nation.parquet`",
absolutePathsMetadataT2);
copyMetaDataCacheToTempReplacingInternalPaths("parquet/metadata_with_absolute_path/"
+
- "metadata_directories_with_absolute_paths.requires_replace.txt",
RELATIVE_PATHS_METADATA, Metadata.METADATA_DIRECTORIES_FILENAME);
+ "metadata_directories_with_absolute_paths.requires_replace.txt",
absolutePathsMetadata, Metadata.METADATA_DIRECTORIES_FILENAME);
copyMetaDataCacheToTempReplacingInternalPaths("parquet/metadata_with_absolute_path/"
+
- "metadata_table_with_absolute_paths.requires_replace.txt",
RELATIVE_PATHS_METADATA, Metadata.METADATA_FILENAME);
+ "metadata_table_with_absolute_paths.requires_replace.txt",
absolutePathsMetadata, Metadata.METADATA_FILENAME);
copyMetaDataCacheToTempReplacingInternalPaths("parquet/metadata_with_absolute_path/"
+
- "metadata_table_with_absolute_paths_t1.requires_replace.txt",
relative_path_metadata_t1, Metadata.METADATA_FILENAME);
+ "metadata_table_with_absolute_paths_t1.requires_replace.txt",
absolutePathsMetadataT1, Metadata.METADATA_FILENAME);
copyMetaDataCacheToTempReplacingInternalPaths("parquet/metadata_with_absolute_path/"
+
- "metadata_table_with_absolute_paths_t2.requires_replace.txt",
relative_path_metadata_t2, Metadata.METADATA_FILENAME);
+ "metadata_table_with_absolute_paths_t2.requires_replace.txt",
absolutePathsMetadataT2, Metadata.METADATA_FILENAME);
+ String query = String.format("select * from %s",
absolutePathsMetadata);
+ int expectedRowCount = 50;
+ int expectedNumFiles = 1; // point to selectionRoot since no pruning
is done in this query
+ int actualRowCount = testSql(query);
+ assertEquals("An incorrect result was obtained while querying a
table with metadata cache files",
+ expectedRowCount, actualRowCount);
+ String numFilesPattern = "numFiles=" + expectedNumFiles;
+ String usedMetaPattern = "usedMetadataFile=true";
+ String cacheFileRootPattern = String.format("cacheFileRoot=%s/%s",
getDfsTestTmpSchemaLocation(), absolutePathsMetadata);
+ PlanTestBase.testPlanMatchingPatterns(query, new
String[]{numFilesPattern, usedMetaPattern, cacheFileRootPattern},
+ new String[] {"Filter"});
+ } finally {
+ test("drop table if exists %s", absolutePathsMetadata);
+ }
+ }
- int rowCount = testSql(String.format("select * from %s",
RELATIVE_PATHS_METADATA));
- assertEquals("An incorrect result was obtained while querying a
table with metadata cache files", 50, rowCount);
+ @Test
+ public void testSpacesInMetadataCachePath() throws Exception {
+ final String pathWithSpaces = "path with spaces";
+ try {
+ // creating multilevel table to store path with spaces in both
metadata files (METADATA and METADATA_DIRECTORIES)
+ test("create table dfs_test.tmp.`%s` as select * from
cp.`tpch/nation.parquet`", pathWithSpaces);
+ test("create table dfs_test.tmp.`%1$s/%1$s` as select * from
cp.`tpch/nation.parquet`", pathWithSpaces);
+ test("refresh table metadata dfs_test.tmp.`%s`", pathWithSpaces);
+ checkForMetadataFile(pathWithSpaces);
+ String query = String.format("select * from dfs_test.tmp.`%s`",
pathWithSpaces);
+ int expectedRowCount = 50;
+ int expectedNumFiles = 1; // point to selectionRoot since no pruning
is done in this query
+ int actualRowCount = testSql(query);
+ assertEquals("An incorrect result was obtained while querying a
table with metadata cache files",
+ expectedRowCount, actualRowCount);
+ String numFilesPattern = "numFiles=" + expectedNumFiles;
+ String usedMetaPattern = "usedMetadataFile=true";
+ String cacheFileRootPattern = String.format("cacheFileRoot=%s/%s",
getDfsTestTmpSchemaLocation(), pathWithSpaces);
+ PlanTestBase.testPlanMatchingPatterns(query, new
String[]{numFilesPattern, usedMetaPattern, cacheFileRootPattern},
+ new String[] {"Filter"});
+ } finally {
+ test("drop table if exists dfs_test.tmp.`%s`", pathWithSpaces);
+ }
+ }
+
+ @Test
+ public void testFutureUnsupportedMetadataVersion() throws Exception {
+ final String unsupportedMetadataVersion =
"unsupported_metadata_version";
+ try {
+ test("use dfs_test.tmp");
+ test("create table `%s` as select * from cp.`tpch/nation.parquet`",
unsupportedMetadataVersion);
+ MetadataVersion lastVersion =
Iterables.getLast(MetadataVersion.Constants.SUPPORTED_VERSIONS);
+ for (MetadataVersion supportedVersion :
MetadataVersion.Constants.SUPPORTED_VERSIONS) {
+ if (lastVersion.compareTo(supportedVersion) < 0) {
+ lastVersion = supportedVersion;
+ }
+ }
+ // Get the future version, which is absent in
MetadataVersion.SUPPORTED_VERSIONS list
+ String futureVersion = "v" +
(Integer.parseInt(String.valueOf(lastVersion.toString().charAt(1))) + 1);
+
copyMetaDataCacheToTempWithReplacements("parquet/unsupported_metadata/unsupported_metadata_version.requires_replace.txt",
+ unsupportedMetadataVersion, Metadata.METADATA_FILENAME,
futureVersion);
+ String query = String.format("select * from %s",
unsupportedMetadataVersion);
+ int expectedRowCount = 25;
+ int expectedNumFiles = 1;
+ int actualRowCount = testSql(query);
+ assertEquals("An incorrect result was obtained while querying a
table with metadata cache files",
+ expectedRowCount, actualRowCount);
+ String numFilesPattern = "numFiles=" + expectedNumFiles;
+ String usedMetaPattern = "usedMetadataFile=false"; // ignoring
metadata cache file
+ PlanTestBase.testPlanMatchingPatterns(query, new
String[]{numFilesPattern, usedMetaPattern},
+ new String[] {"Filter"});
+ } finally {
+ test("drop table if exists %s", unsupportedMetadataVersion);
+ }
+ }
+
+ @Test
+ public void testCorruptedMetadataFile() throws Exception {
--- End diff --
Please add tests for the empty metadata file and when at lest one metadata
file is missing.
> Drill 1.10 queries fail due to Parquet Metadata "corruption" from DRILL-3867
> ----------------------------------------------------------------------------
>
> Key: DRILL-5660
> URL: https://issues.apache.org/jira/browse/DRILL-5660
> Project: Apache Drill
> Issue Type: Bug
> Affects Versions: 1.11.0
> Reporter: Paul Rogers
> Assignee: Vitalii Diravka
> Labels: doc-impacting
> Fix For: 1.12.0
>
>
> Drill recently accepted a PR for the following bug:
> DRILL-3867: Store relative paths in metadata file
> This PR turned out to have a flaw which affects version compatibility.
> The DRILL-3867 PR changed the format of Parquet metadata files to store
> relative paths. All Drill servers after that PR create files with relative
> paths. But, the version number of the file is unchanged, so that older
> Drillbits don't know that they can't understand the file.
> Instead, if an older Drill tries to read the file, queries fail left and
> right. Drill will resolve the paths, but does so relative to the user's HDFS
> home directory, which is wrong.
> What should have happened is that we should have bumped the parquet metadata
> file version number so that older Drillbits can’t read the file. This ticket
> requests that we do that.
> Now, one could argue that the lack of version number change is fine. Once a
> user upgrades Drill, they won't use an old Drillbit. But, things are not that
> simple:
> * A developer tests a branch based on a pre-DRILL-3867 build on a cluster in
> which metadata files have been created by a post-DRILL-3867 build. (This has
> already occurred multiple times in our shop.)
> * A user tries to upgrade to Drill 1.11, finds an issue, and needs to roll
> back to Drill 1.10. Doing so will cause queries to fail due to
> seemingly-corrupt metadata files.
> * A user tries to do a rolling upgrade: running 1.11 on some servers, 1.10 on
> others. Once a 1.11 server is installed, the metadata is updated ("corrupted"
> from the perspective of 1.10) and queries fail.
> Standard practice in this scenario is to:
> * Bump the file version number when the file format changes, and
> * Software refuses to read files with a version newer than the software was
> designed for.
> Of course, it is highly desirable that newer servers read old files, but that
> is not the issue here.
> *Main technical points of working of parquet metadata caching for now.*
> Only process of reading the parquet metadata is changed (the process of
> writing isn't changed):
> +1. Metadata files are valid:+
> Metadata objects are created by deserialization of parquet metadata files in
> the process of creating ParquetGroupScan physical operator.
> All supported versions are stored in the "MetadataVersion.Constants" class
> and in the Jackson annotations for Metadata.ParquetTableMetadataBase class.
> +2. Metadata files version isn't supported (created by newer Drill version).
> Drill table has at least one metadata file of unsupported version:+
> JsonMappingException is obtained and swallowed without creating metadata
> object. Error message is logged. The state is stored in MetadataContext,
> therefore further there will be no attempt to deserialize metadata file again
> in context of performing current query. The physical plan will be created
> without using parquet metadata caching. Warning message is logged for every
> further check "is metadata corrupted".
> +3. Drill table has at least one corrupted metadata file, which can't be
> deserialized:+
> JsonParseException is obtained. Then the same behaviour as for the
> unsupported version files.
> +4. The metadata file was removed by other process:+
> FileNotFound is obtained. Then the same behaviour as for the unsupported
> version files.
> The new versions of metadata should be added in such manner:
> 1. Increasing of the metadata major version if metadata structure is changed.
> 2. Increasing of the metadata minor version if only metadata content is
> changed, but metadata structure is the same.
> For the first case a new metadata structure (class) should be created
> (possible an improvement of deserializing metadata files of any version into
> one strucure by using special converting)
> For the second case only annotation for the last metadata structure can be
> updated.
> *Summary*
> 1. Drill will read and use metadata files if files are valid, all present and
> supported. Under supported we mean that files were created before and under
> current Drill version.
> 2. Drill will ignore reading metadata files if at least one file is missing,
> empty, corrupted or unsupported. Under unsupported we mean files created
> after current Drill version. When metadata files are not used, warning
> message will be written to the logs.
--
This message was sent by Atlassian JIRA
(v6.4.14#64029)