[ https://issues.apache.org/jira/browse/HBASE-28461?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Dieter De Paepe updated HBASE-28461: ------------------------------------ Description: While working on tests for HBASE-28412, I noticed that `TestIncrementalBackup` always fails on my computer and the computer of a colleague. However, I manage to get this test working 100% when setting breakpoints on following locations (accidentally discovered while trying to debug this issue): * `IncrementalTableBackupClient` L269 (`newTimestamps = ((IncrementalBackupManager) backupManager).getIncrBackupLogFileMap();`) * `IncrementalBackupManager` L96 (`logList = getLogFilesForNewBackup(previousTimestampMins, newTimestamps, conf, savedStartCode);`) * `WALInputFormat` L356 (`RemoteIterator<LocatedFileStatus> iter = fs.listLocatedStatus(dir);`) This test fails with the following error: {code:java} java.io.IOException: java.io.FileNotFoundException: File hdfs://localhost:46787/user/dieter/test-data/b615664f-1cde-fc27-1752-33ab359a931c/WALs/localhost,38309,1711553427191/localhost%2C38309%2C1711553427191.localhost%2C38309%2C1711553427191.regiongroup-0.1711553471361 does not exist. at org.apache.hadoop.hbase.backup.impl.IncrementalTableBackupClient.execute(IncrementalTableBackupClient.java:289) at org.apache.hadoop.hbase.backup.impl.BackupAdminImpl.backupTables(BackupAdminImpl.java:603) at org.apache.hadoop.hbase.backup.TestIncrementalBackup.TestIncBackupRestore(TestIncrementalBackup.java:169) at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method) at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) at java.base/java.lang.reflect.Method.invoke(Method.java:566) at org.junit.runners.model.FrameworkMethod$1.runReflectiveCall(FrameworkMethod.java:59) at org.junit.internal.runners.model.ReflectiveCallable.run(ReflectiveCallable.java:12) at org.junit.runners.model.FrameworkMethod.invokeExplosively(FrameworkMethod.java:56) at org.junit.internal.runners.statements.InvokeMethod.evaluate(InvokeMethod.java:17) at org.junit.runners.ParentRunner$3.evaluate(ParentRunner.java:306) at org.junit.runners.BlockJUnit4ClassRunner$1.evaluate(BlockJUnit4ClassRunner.java:100) at org.junit.runners.ParentRunner.runLeaf(ParentRunner.java:366) at org.junit.runners.BlockJUnit4ClassRunner.runChild(BlockJUnit4ClassRunner.java:103) at org.junit.runners.BlockJUnit4ClassRunner.runChild(BlockJUnit4ClassRunner.java:63) at org.junit.runners.ParentRunner$4.run(ParentRunner.java:331) at org.junit.runners.ParentRunner$1.schedule(ParentRunner.java:79) at org.junit.runners.ParentRunner.runChildren(ParentRunner.java:329) at org.junit.runners.ParentRunner.access$100(ParentRunner.java:66) at org.junit.runners.ParentRunner$2.evaluate(ParentRunner.java:293) at org.junit.runners.ParentRunner.run(ParentRunner.java:413) at org.junit.runners.Suite.runChild(Suite.java:128) at org.junit.runners.Suite.runChild(Suite.java:27) at org.junit.runners.ParentRunner$4.run(ParentRunner.java:331) at org.junit.runners.ParentRunner$1.schedule(ParentRunner.java:79) at org.junit.runners.ParentRunner.runChildren(ParentRunner.java:329) at org.junit.runners.ParentRunner.access$100(ParentRunner.java:66) at org.junit.runners.ParentRunner$2.evaluate(ParentRunner.java:293) at org.junit.internal.runners.statements.RunBefores.evaluate(RunBefores.java:26) at org.junit.internal.runners.statements.RunAfters.evaluate(RunAfters.java:27) at org.junit.internal.runners.statements.FailOnTimeout$CallableStatement.call(FailOnTimeout.java:299) at org.junit.internal.runners.statements.FailOnTimeout$CallableStatement.call(FailOnTimeout.java:293) at java.base/java.util.concurrent.FutureTask.run(FutureTask.java:264) at java.base/java.lang.Thread.run(Thread.java:829) Caused by: java.io.FileNotFoundException: File hdfs://localhost:46787/user/dieter/test-data/b615664f-1cde-fc27-1752-33ab359a931c/WALs/localhost,38309,1711553427191/localhost%2C38309%2C1711553427191.localhost%2C38309%2C1711553427191.regiongroup-0.1711553471361 does not exist. at org.apache.hadoop.hdfs.DistributedFileSystem$DirListingIterator.<init>(DistributedFileSystem.java:1282) at org.apache.hadoop.hdfs.DistributedFileSystem$DirListingIterator.<init>(DistributedFileSystem.java:1256) at org.apache.hadoop.hdfs.DistributedFileSystem$25.doCall(DistributedFileSystem.java:1201) at org.apache.hadoop.hdfs.DistributedFileSystem$25.doCall(DistributedFileSystem.java:1197) at org.apache.hadoop.fs.FileSystemLinkResolver.resolve(FileSystemLinkResolver.java:81) at org.apache.hadoop.hdfs.DistributedFileSystem.listLocatedStatus(DistributedFileSystem.java:1215) at org.apache.hadoop.fs.FileSystem.listLocatedStatus(FileSystem.java:2230) at org.apache.hadoop.hbase.mapreduce.WALInputFormat.getFiles(WALInputFormat.java:356) at org.apache.hadoop.hbase.mapreduce.WALInputFormat.getSplits(WALInputFormat.java:321) at org.apache.hadoop.hbase.mapreduce.WALInputFormat.getSplits(WALInputFormat.java:301) at org.apache.hadoop.mapreduce.JobSubmitter.writeNewSplits(JobSubmitter.java:310) at org.apache.hadoop.mapreduce.JobSubmitter.writeSplits(JobSubmitter.java:327) at org.apache.hadoop.mapreduce.JobSubmitter.submitJobInternal(JobSubmitter.java:200) at org.apache.hadoop.mapreduce.Job$11.run(Job.java:1678) at org.apache.hadoop.mapreduce.Job$11.run(Job.java:1675) at java.base/java.security.AccessController.doPrivileged(Native Method) at java.base/javax.security.auth.Subject.doAs(Subject.java:423) at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1899) at org.apache.hadoop.mapreduce.Job.submit(Job.java:1675) at org.apache.hadoop.mapreduce.Job.waitForCompletion(Job.java:1696) at org.apache.hadoop.hbase.mapreduce.WALPlayer.run(WALPlayer.java:423) at org.apache.hadoop.hbase.backup.impl.IncrementalTableBackupClient.walToHFiles(IncrementalTableBackupClient.java:406) at org.apache.hadoop.hbase.backup.impl.IncrementalTableBackupClient.convertWALsToHFiles(IncrementalTableBackupClient.java:378) at org.apache.hadoop.hbase.backup.impl.IncrementalTableBackupClient.execute(IncrementalTableBackupClient.java:282) ... 34 more {code} I suspect the issue is that the WAL files are not yet fully written to HDFS when the WALPlayer tries to convert them to HFiles. Since I'm not sure about the exact cause yet, I'm also not sure if this is a problem with the test (which uses a dedicated `IncrementalTableBackupClientForTest`) or the actual backup code. I attached 2 logfiles from tests (one where it runs, and one where it fails). was: While working on tests for HBASE-28412, I noticed that `TestIncrementalBackup` always fails on my computer and the computer of a colleague. However, I manage to get this test working 100% when setting breakpoints on following locations (accidentally discovered while trying to debug this issue): * `IncrementalTableBackupClient` L269 (`newTimestamps = ((IncrementalBackupManager) backupManager).getIncrBackupLogFileMap();`) * `IncrementalBackupManager` L96 (`logList = getLogFilesForNewBackup(previousTimestampMins, newTimestamps, conf, savedStartCode);`) * `WALInputFormat` L356 (`logList = getLogFilesForNewBackup(previousTimestampMins, newTimestamps, conf, savedStartCode);`) This test fails with the following error: {code:java} java.io.IOException: java.io.FileNotFoundException: File hdfs://localhost:46787/user/dieter/test-data/b615664f-1cde-fc27-1752-33ab359a931c/WALs/localhost,38309,1711553427191/localhost%2C38309%2C1711553427191.localhost%2C38309%2C1711553427191.regiongroup-0.1711553471361 does not exist. at org.apache.hadoop.hbase.backup.impl.IncrementalTableBackupClient.execute(IncrementalTableBackupClient.java:289) at org.apache.hadoop.hbase.backup.impl.BackupAdminImpl.backupTables(BackupAdminImpl.java:603) at org.apache.hadoop.hbase.backup.TestIncrementalBackup.TestIncBackupRestore(TestIncrementalBackup.java:169) at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method) at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) at java.base/java.lang.reflect.Method.invoke(Method.java:566) at org.junit.runners.model.FrameworkMethod$1.runReflectiveCall(FrameworkMethod.java:59) at org.junit.internal.runners.model.ReflectiveCallable.run(ReflectiveCallable.java:12) at org.junit.runners.model.FrameworkMethod.invokeExplosively(FrameworkMethod.java:56) at org.junit.internal.runners.statements.InvokeMethod.evaluate(InvokeMethod.java:17) at org.junit.runners.ParentRunner$3.evaluate(ParentRunner.java:306) at org.junit.runners.BlockJUnit4ClassRunner$1.evaluate(BlockJUnit4ClassRunner.java:100) at org.junit.runners.ParentRunner.runLeaf(ParentRunner.java:366) at org.junit.runners.BlockJUnit4ClassRunner.runChild(BlockJUnit4ClassRunner.java:103) at org.junit.runners.BlockJUnit4ClassRunner.runChild(BlockJUnit4ClassRunner.java:63) at org.junit.runners.ParentRunner$4.run(ParentRunner.java:331) at org.junit.runners.ParentRunner$1.schedule(ParentRunner.java:79) at org.junit.runners.ParentRunner.runChildren(ParentRunner.java:329) at org.junit.runners.ParentRunner.access$100(ParentRunner.java:66) at org.junit.runners.ParentRunner$2.evaluate(ParentRunner.java:293) at org.junit.runners.ParentRunner.run(ParentRunner.java:413) at org.junit.runners.Suite.runChild(Suite.java:128) at org.junit.runners.Suite.runChild(Suite.java:27) at org.junit.runners.ParentRunner$4.run(ParentRunner.java:331) at org.junit.runners.ParentRunner$1.schedule(ParentRunner.java:79) at org.junit.runners.ParentRunner.runChildren(ParentRunner.java:329) at org.junit.runners.ParentRunner.access$100(ParentRunner.java:66) at org.junit.runners.ParentRunner$2.evaluate(ParentRunner.java:293) at org.junit.internal.runners.statements.RunBefores.evaluate(RunBefores.java:26) at org.junit.internal.runners.statements.RunAfters.evaluate(RunAfters.java:27) at org.junit.internal.runners.statements.FailOnTimeout$CallableStatement.call(FailOnTimeout.java:299) at org.junit.internal.runners.statements.FailOnTimeout$CallableStatement.call(FailOnTimeout.java:293) at java.base/java.util.concurrent.FutureTask.run(FutureTask.java:264) at java.base/java.lang.Thread.run(Thread.java:829) Caused by: java.io.FileNotFoundException: File hdfs://localhost:46787/user/dieter/test-data/b615664f-1cde-fc27-1752-33ab359a931c/WALs/localhost,38309,1711553427191/localhost%2C38309%2C1711553427191.localhost%2C38309%2C1711553427191.regiongroup-0.1711553471361 does not exist. at org.apache.hadoop.hdfs.DistributedFileSystem$DirListingIterator.<init>(DistributedFileSystem.java:1282) at org.apache.hadoop.hdfs.DistributedFileSystem$DirListingIterator.<init>(DistributedFileSystem.java:1256) at org.apache.hadoop.hdfs.DistributedFileSystem$25.doCall(DistributedFileSystem.java:1201) at org.apache.hadoop.hdfs.DistributedFileSystem$25.doCall(DistributedFileSystem.java:1197) at org.apache.hadoop.fs.FileSystemLinkResolver.resolve(FileSystemLinkResolver.java:81) at org.apache.hadoop.hdfs.DistributedFileSystem.listLocatedStatus(DistributedFileSystem.java:1215) at org.apache.hadoop.fs.FileSystem.listLocatedStatus(FileSystem.java:2230) at org.apache.hadoop.hbase.mapreduce.WALInputFormat.getFiles(WALInputFormat.java:356) at org.apache.hadoop.hbase.mapreduce.WALInputFormat.getSplits(WALInputFormat.java:321) at org.apache.hadoop.hbase.mapreduce.WALInputFormat.getSplits(WALInputFormat.java:301) at org.apache.hadoop.mapreduce.JobSubmitter.writeNewSplits(JobSubmitter.java:310) at org.apache.hadoop.mapreduce.JobSubmitter.writeSplits(JobSubmitter.java:327) at org.apache.hadoop.mapreduce.JobSubmitter.submitJobInternal(JobSubmitter.java:200) at org.apache.hadoop.mapreduce.Job$11.run(Job.java:1678) at org.apache.hadoop.mapreduce.Job$11.run(Job.java:1675) at java.base/java.security.AccessController.doPrivileged(Native Method) at java.base/javax.security.auth.Subject.doAs(Subject.java:423) at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1899) at org.apache.hadoop.mapreduce.Job.submit(Job.java:1675) at org.apache.hadoop.mapreduce.Job.waitForCompletion(Job.java:1696) at org.apache.hadoop.hbase.mapreduce.WALPlayer.run(WALPlayer.java:423) at org.apache.hadoop.hbase.backup.impl.IncrementalTableBackupClient.walToHFiles(IncrementalTableBackupClient.java:406) at org.apache.hadoop.hbase.backup.impl.IncrementalTableBackupClient.convertWALsToHFiles(IncrementalTableBackupClient.java:378) at org.apache.hadoop.hbase.backup.impl.IncrementalTableBackupClient.execute(IncrementalTableBackupClient.java:282) ... 34 more {code} I suspect the issue is that the WAL files are not yet fully written to HDFS when the WALPlayer tries to convert them to HFiles. Since I'm not sure about the exact cause yet, I'm also not sure if this is a problem with the test (which uses a dedicated `IncrementalTableBackupClientForTest`) or the actual backup code. I attached 2 logfiles from tests (one where it runs, and one where it fails). > Timing issue in Incremental Backup or TestIncrementalBackup > ----------------------------------------------------------- > > Key: HBASE-28461 > URL: https://issues.apache.org/jira/browse/HBASE-28461 > Project: HBase > Issue Type: Bug > Components: backup&restore > Affects Versions: 2.6.0, 4.0.0-alpha-1 > Environment: HBase master - commit > 298c550c804305f2c57029a563039eefcbb4af40 > 20.04.1-Ubuntu > Reporter: Dieter De Paepe > Priority: Minor > Attachments: ide_output_fail_part2.txt, ide_output_success_part2.txt > > > While working on tests for HBASE-28412, I noticed that > `TestIncrementalBackup` always fails on my computer and the computer of a > colleague. However, I manage to get this test working 100% when setting > breakpoints on following locations (accidentally discovered while trying to > debug this issue): > * `IncrementalTableBackupClient` L269 (`newTimestamps = > ((IncrementalBackupManager) backupManager).getIncrBackupLogFileMap();`) > * `IncrementalBackupManager` L96 (`logList = > getLogFilesForNewBackup(previousTimestampMins, newTimestamps, conf, > savedStartCode);`) > * `WALInputFormat` L356 (`RemoteIterator<LocatedFileStatus> iter = > fs.listLocatedStatus(dir);`) > This test fails with the following error: > > {code:java} > java.io.IOException: java.io.FileNotFoundException: File > hdfs://localhost:46787/user/dieter/test-data/b615664f-1cde-fc27-1752-33ab359a931c/WALs/localhost,38309,1711553427191/localhost%2C38309%2C1711553427191.localhost%2C38309%2C1711553427191.regiongroup-0.1711553471361 > does not exist. at > org.apache.hadoop.hbase.backup.impl.IncrementalTableBackupClient.execute(IncrementalTableBackupClient.java:289) > at > org.apache.hadoop.hbase.backup.impl.BackupAdminImpl.backupTables(BackupAdminImpl.java:603) > at > org.apache.hadoop.hbase.backup.TestIncrementalBackup.TestIncBackupRestore(TestIncrementalBackup.java:169) > at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native > Method) > at > java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) > at > java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) > at java.base/java.lang.reflect.Method.invoke(Method.java:566) > at > org.junit.runners.model.FrameworkMethod$1.runReflectiveCall(FrameworkMethod.java:59) > at > org.junit.internal.runners.model.ReflectiveCallable.run(ReflectiveCallable.java:12) > at > org.junit.runners.model.FrameworkMethod.invokeExplosively(FrameworkMethod.java:56) > at > org.junit.internal.runners.statements.InvokeMethod.evaluate(InvokeMethod.java:17) > at org.junit.runners.ParentRunner$3.evaluate(ParentRunner.java:306) > at > org.junit.runners.BlockJUnit4ClassRunner$1.evaluate(BlockJUnit4ClassRunner.java:100) > at org.junit.runners.ParentRunner.runLeaf(ParentRunner.java:366) > at > org.junit.runners.BlockJUnit4ClassRunner.runChild(BlockJUnit4ClassRunner.java:103) > at > org.junit.runners.BlockJUnit4ClassRunner.runChild(BlockJUnit4ClassRunner.java:63) > at org.junit.runners.ParentRunner$4.run(ParentRunner.java:331) > at org.junit.runners.ParentRunner$1.schedule(ParentRunner.java:79) > at org.junit.runners.ParentRunner.runChildren(ParentRunner.java:329) > at org.junit.runners.ParentRunner.access$100(ParentRunner.java:66) > at org.junit.runners.ParentRunner$2.evaluate(ParentRunner.java:293) > at org.junit.runners.ParentRunner.run(ParentRunner.java:413) > at org.junit.runners.Suite.runChild(Suite.java:128) > at org.junit.runners.Suite.runChild(Suite.java:27) > at org.junit.runners.ParentRunner$4.run(ParentRunner.java:331) > at org.junit.runners.ParentRunner$1.schedule(ParentRunner.java:79) > at org.junit.runners.ParentRunner.runChildren(ParentRunner.java:329) > at org.junit.runners.ParentRunner.access$100(ParentRunner.java:66) > at org.junit.runners.ParentRunner$2.evaluate(ParentRunner.java:293) > at > org.junit.internal.runners.statements.RunBefores.evaluate(RunBefores.java:26) > at > org.junit.internal.runners.statements.RunAfters.evaluate(RunAfters.java:27) > at > org.junit.internal.runners.statements.FailOnTimeout$CallableStatement.call(FailOnTimeout.java:299) > at > org.junit.internal.runners.statements.FailOnTimeout$CallableStatement.call(FailOnTimeout.java:293) > at java.base/java.util.concurrent.FutureTask.run(FutureTask.java:264) > at java.base/java.lang.Thread.run(Thread.java:829) > Caused by: java.io.FileNotFoundException: File > hdfs://localhost:46787/user/dieter/test-data/b615664f-1cde-fc27-1752-33ab359a931c/WALs/localhost,38309,1711553427191/localhost%2C38309%2C1711553427191.localhost%2C38309%2C1711553427191.regiongroup-0.1711553471361 > does not exist. > at > org.apache.hadoop.hdfs.DistributedFileSystem$DirListingIterator.<init>(DistributedFileSystem.java:1282) > at > org.apache.hadoop.hdfs.DistributedFileSystem$DirListingIterator.<init>(DistributedFileSystem.java:1256) > at > org.apache.hadoop.hdfs.DistributedFileSystem$25.doCall(DistributedFileSystem.java:1201) > at > org.apache.hadoop.hdfs.DistributedFileSystem$25.doCall(DistributedFileSystem.java:1197) > at > org.apache.hadoop.fs.FileSystemLinkResolver.resolve(FileSystemLinkResolver.java:81) > at > org.apache.hadoop.hdfs.DistributedFileSystem.listLocatedStatus(DistributedFileSystem.java:1215) > at org.apache.hadoop.fs.FileSystem.listLocatedStatus(FileSystem.java:2230) > at > org.apache.hadoop.hbase.mapreduce.WALInputFormat.getFiles(WALInputFormat.java:356) > at > org.apache.hadoop.hbase.mapreduce.WALInputFormat.getSplits(WALInputFormat.java:321) > at > org.apache.hadoop.hbase.mapreduce.WALInputFormat.getSplits(WALInputFormat.java:301) > at > org.apache.hadoop.mapreduce.JobSubmitter.writeNewSplits(JobSubmitter.java:310) > at > org.apache.hadoop.mapreduce.JobSubmitter.writeSplits(JobSubmitter.java:327) > at > org.apache.hadoop.mapreduce.JobSubmitter.submitJobInternal(JobSubmitter.java:200) > at org.apache.hadoop.mapreduce.Job$11.run(Job.java:1678) > at org.apache.hadoop.mapreduce.Job$11.run(Job.java:1675) > at java.base/java.security.AccessController.doPrivileged(Native Method) > at java.base/javax.security.auth.Subject.doAs(Subject.java:423) > at > org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1899) > at org.apache.hadoop.mapreduce.Job.submit(Job.java:1675) > at org.apache.hadoop.mapreduce.Job.waitForCompletion(Job.java:1696) > at org.apache.hadoop.hbase.mapreduce.WALPlayer.run(WALPlayer.java:423) > at > org.apache.hadoop.hbase.backup.impl.IncrementalTableBackupClient.walToHFiles(IncrementalTableBackupClient.java:406) > at > org.apache.hadoop.hbase.backup.impl.IncrementalTableBackupClient.convertWALsToHFiles(IncrementalTableBackupClient.java:378) > at > org.apache.hadoop.hbase.backup.impl.IncrementalTableBackupClient.execute(IncrementalTableBackupClient.java:282) > ... 34 more {code} > > > I suspect the issue is that the WAL files are not yet fully written to HDFS > when the WALPlayer tries to convert them to HFiles. > Since I'm not sure about the exact cause yet, I'm also not sure if this is a > problem with the test (which uses a dedicated > `IncrementalTableBackupClientForTest`) or the actual backup code. > I attached 2 logfiles from tests (one where it runs, and one where it fails). -- This message was sent by Atlassian Jira (v8.20.10#820010)