[ https://issues.apache.org/jira/browse/HDDS-11068?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Attila Doroszlai resolved HDDS-11068. ------------------------------------- Fix Version/s: 1.5.0 Resolution: Fixed > OM down to Snapshot Chain Corruption > ------------------------------------ > > Key: HDDS-11068 > URL: https://issues.apache.org/jira/browse/HDDS-11068 > Project: Apache Ozone > Issue Type: Bug > Components: Ozone Manager, Snapshot > Reporter: Jyotirmoy Sinha > Assignee: Swaminathan Balachandran > Priority: Critical > Labels: ozone-snapshot, pull-request-available > Fix For: 1.5.0 > > > OM down to Snapshot Chain Corruption > OM Error stacktrace - > {code:java} > 2024-06-25 14:51:14,293 ERROR > [main]-org.apache.hadoop.ozone.om.SnapshotChainManager: Failure while loading > snapshot chain. > java.io.IOException: Snapshot chain corruption. All snapshots have not been > added to the snapshot chain. Last snapshot added to chain : > 750ae2ca-3f8d-4a1f-8655-9d15bd5ed84b > at > org.apache.hadoop.ozone.om.SnapshotChainManager.loadFromSnapshotInfoTable(SnapshotChainManager.java:324) > at > org.apache.hadoop.ozone.om.SnapshotChainManager.<init>(SnapshotChainManager.java:66) > at > org.apache.hadoop.ozone.om.OmMetadataManagerImpl.start(OmMetadataManagerImpl.java:565) > at > org.apache.hadoop.ozone.om.OmMetadataManagerImpl.<init>(OmMetadataManagerImpl.java:342) > at > org.apache.hadoop.ozone.om.OzoneManager.instantiateServices(OzoneManager.java:809) > at > org.apache.hadoop.ozone.om.OzoneManager.<init>(OzoneManager.java:687) > at > org.apache.hadoop.ozone.om.OzoneManager.createOm(OzoneManager.java:774) > at > org.apache.hadoop.ozone.om.OzoneManagerStarter$OMStarterHelper.start(OzoneManagerStarter.java:189) > at > org.apache.hadoop.ozone.om.OzoneManagerStarter.startOm(OzoneManagerStarter.java:86) > at > org.apache.hadoop.ozone.om.OzoneManagerStarter.call(OzoneManagerStarter.java:74) > at org.apache.hadoop.hdds.cli.GenericCli.call(GenericCli.java:38) > at picocli.CommandLine.executeUserObject(CommandLine.java:2041) > at 
picocli.CommandLine.access$1500(CommandLine.java:148) > at > picocli.CommandLine$RunLast.executeUserObjectOfLastSubcommandWithSameParent(CommandLine.java:2461) > at picocli.CommandLine$RunLast.handle(CommandLine.java:2453) > at picocli.CommandLine$RunLast.handle(CommandLine.java:2415) > at > picocli.CommandLine$AbstractParseResultHandler.execute(CommandLine.java:2273) > at picocli.CommandLine$RunLast.execute(CommandLine.java:2417) > at picocli.CommandLine.execute(CommandLine.java:2170) > at org.apache.hadoop.hdds.cli.GenericCli.execute(GenericCli.java:100) > at org.apache.hadoop.hdds.cli.GenericCli.run(GenericCli.java:91) > at > org.apache.hadoop.ozone.om.OzoneManagerStarter.main(OzoneManagerStarter.java:58) > {code} > {code:java} > 2024-06-25 14:51:16,854 INFO > [main]-org.apache.hadoop.hdds.utils.NativeLibraryLoader: Loading Library: > ozone_rocksdb_tools > 2024-06-25 14:51:16,857 ERROR > [main]-org.apache.hadoop.ozone.om.snapshot.SnapshotDiffManager: Native > Library for raw sst file reading loading failed. > org.apache.hadoop.hdds.utils.NativeLibraryNotLoadedException: Unable to load > library ozone_rocksdb_tools from both java.library.path & resource file > libozone_rocksdb_tools.so from jar. 
> at > org.apache.hadoop.hdds.utils.db.managed.ManagedRawSSTFileReader.loadLibrary(ManagedRawSSTFileReader.java:40) > at > org.apache.hadoop.ozone.om.snapshot.SnapshotDiffManager.initNativeLibraryForEfficientDiff(SnapshotDiffManager.java:285) > at > org.apache.hadoop.ozone.om.snapshot.SnapshotDiffManager.<init>(SnapshotDiffManager.java:259) > at > org.apache.hadoop.ozone.om.OmSnapshotManager.<init>(OmSnapshotManager.java:286) > at > org.apache.hadoop.ozone.om.OzoneManager.instantiateServices(OzoneManager.java:863) > at > org.apache.hadoop.ozone.om.OzoneManager.<init>(OzoneManager.java:687) > at > org.apache.hadoop.ozone.om.OzoneManager.createOm(OzoneManager.java:774) > at > org.apache.hadoop.ozone.om.OzoneManagerStarter$OMStarterHelper.start(OzoneManagerStarter.java:189) > at > org.apache.hadoop.ozone.om.OzoneManagerStarter.startOm(OzoneManagerStarter.java:86) > at > org.apache.hadoop.ozone.om.OzoneManagerStarter.call(OzoneManagerStarter.java:74) > at org.apache.hadoop.hdds.cli.GenericCli.call(GenericCli.java:38) > at picocli.CommandLine.executeUserObject(CommandLine.java:2041) > at picocli.CommandLine.access$1500(CommandLine.java:148) > at > picocli.CommandLine$RunLast.executeUserObjectOfLastSubcommandWithSameParent(CommandLine.java:2461) > at picocli.CommandLine$RunLast.handle(CommandLine.java:2453) > at picocli.CommandLine$RunLast.handle(CommandLine.java:2415) > at > picocli.CommandLine$AbstractParseResultHandler.execute(CommandLine.java:2273) > at picocli.CommandLine$RunLast.execute(CommandLine.java:2417) > at picocli.CommandLine.execute(CommandLine.java:2170) > at org.apache.hadoop.hdds.cli.GenericCli.execute(GenericCli.java:100) > at org.apache.hadoop.hdds.cli.GenericCli.run(GenericCli.java:91) > at > org.apache.hadoop.ozone.om.OzoneManagerStarter.main(OzoneManagerStarter.java:58) > {code} > {code:java} > 2024-06-25 14:51:19,572 WARN [om229-OMStateMachineApplyTransactionThread - > 0]-org.apache.hadoop.metrics2.util.MBeans: Error creating MBean object name: > 
Hadoop:service=LayoutVersionManager,name=OMLayoutVersionManager > org.apache.hadoop.metrics2.MetricsException: > org.apache.hadoop.metrics2.MetricsException: > Hadoop:service=LayoutVersionManager,name=OMLayoutVersionManager already > exists! > at > org.apache.hadoop.metrics2.lib.DefaultMetricsSystem.newObjectName(DefaultMetricsSystem.java:135) > at > org.apache.hadoop.metrics2.lib.DefaultMetricsSystem.newMBeanName(DefaultMetricsSystem.java:110) > at > org.apache.hadoop.metrics2.util.MBeans.getMBeanName(MBeans.java:163) > at org.apache.hadoop.metrics2.util.MBeans.register(MBeans.java:95) > at org.apache.hadoop.metrics2.util.MBeans.register(MBeans.java:72) > at > org.apache.hadoop.ozone.upgrade.AbstractLayoutVersionManager.init(AbstractLayoutVersionManager.java:88) > at > org.apache.hadoop.ozone.om.upgrade.OMLayoutVersionManager.<init>(OMLayoutVersionManager.java:69) > at > org.apache.hadoop.ozone.om.upgrade.OMLayoutFeatureAspect.checkLayoutFeature(OMLayoutFeatureAspect.java:75) > at > org.apache.hadoop.ozone.om.request.snapshot.OMSnapshotMoveDeletedKeysRequest.validateAndUpdateCache(OMSnapshotMoveDeletedKeysRequest.java:63) > at > org.apache.hadoop.ozone.protocolPB.OzoneManagerRequestHandler.lambda$0(OzoneManagerRequestHandler.java:397) > at > org.apache.hadoop.util.MetricUtil.captureLatencyNs(MetricUtil.java:45) > at > org.apache.hadoop.ozone.protocolPB.OzoneManagerRequestHandler.handleWriteRequestImpl(OzoneManagerRequestHandler.java:395) > at > org.apache.hadoop.ozone.protocolPB.RequestHandler.handleWriteRequest(RequestHandler.java:63) > at > org.apache.hadoop.ozone.om.ratis.OzoneManagerStateMachine.runCommand(OzoneManagerStateMachine.java:539) > at > org.apache.hadoop.ozone.om.ratis.OzoneManagerStateMachine.lambda$1(OzoneManagerStateMachine.java:357) > at > java.util.concurrent.CompletableFuture$AsyncSupply.run(CompletableFuture.java:1604) > at > java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) > at > 
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) > at java.lang.Thread.run(Thread.java:748) > Caused by: org.apache.hadoop.metrics2.MetricsException: > Hadoop:service=LayoutVersionManager,name=OMLayoutVersionManager already > exists! > at > org.apache.hadoop.metrics2.lib.DefaultMetricsSystem.newObjectName(DefaultMetricsSystem.java:131) > ... 18 more > 2024-06-25 14:51:19,579 INFO [main]-org.apache.ratis.grpc.server.GrpcService: > om229: GrpcService started, listening on 9872 > 2024-06-25 14:51:19,582 INFO > [JvmPauseMonitor0]-org.apache.ratis.util.JvmPauseMonitor: > JvmPauseMonitor-om229: Started > 2024-06-25 14:51:19,594 INFO [main]-org.apache.hadoop.ozone.om.OzoneManager: > Starting secret key client. > 2024-06-25 14:51:19,678 ERROR > [om229-OMDoubleBufferFlushThread]-org.apache.hadoop.ozone.om.ratis.OzoneManagerDoubleBuffer: > Terminating with exit status 2: During flush to DB encountered error in > OMDoubleBuffer flush thread om229-OMDoubleBufferFlushThread when handling > OMRequest: cmdType: PurgeKeys > traceID: "" > success: true > status: OKjava.lang.IllegalStateException: java.io.IOException: No snapshot > exist with snapshotId: a7efa54d-8beb-4fd0-808c-400e173ca6e9 > at > org.apache.hadoop.ozone.om.snapshot.SnapshotCache.lambda$2(SnapshotCache.java:166) > at > java.util.concurrent.ConcurrentHashMap.compute(ConcurrentHashMap.java:1853) > at > org.apache.hadoop.ozone.om.snapshot.SnapshotCache.get(SnapshotCache.java:154) > at > org.apache.hadoop.ozone.om.OmSnapshotManager.getSnapshot(OmSnapshotManager.java:690) > at > org.apache.hadoop.ozone.om.OmSnapshotManager.getSnapshot(OmSnapshotManager.java:678) > at > org.apache.hadoop.ozone.om.OmSnapshotManager.getSnapshot(OmSnapshotManager.java:661) > at > org.apache.hadoop.ozone.om.response.key.OMKeyPurgeResponse.addToDBBatch(OMKeyPurgeResponse.java:82) > at > org.apache.hadoop.ozone.om.response.OMClientResponse.checkAndUpdateDB(OMClientResponse.java:66) > at > 
org.apache.hadoop.ozone.om.ratis.OzoneManagerDoubleBuffer.lambda$8(OzoneManagerDoubleBuffer.java:408) > at > org.apache.hadoop.ozone.om.ratis.OzoneManagerDoubleBuffer.addToBatchWithTrace(OzoneManagerDoubleBuffer.java:253) > at > org.apache.hadoop.ozone.om.ratis.OzoneManagerDoubleBuffer.addToBatch(OzoneManagerDoubleBuffer.java:407) > at > org.apache.hadoop.ozone.om.ratis.OzoneManagerDoubleBuffer.flushBatch(OzoneManagerDoubleBuffer.java:353) > at > org.apache.hadoop.ozone.om.ratis.OzoneManagerDoubleBuffer.flushCurrentBuffer(OzoneManagerDoubleBuffer.java:328) > at > org.apache.hadoop.ozone.om.ratis.OzoneManagerDoubleBuffer.flushTransactions(OzoneManagerDoubleBuffer.java:295) > at java.lang.Thread.run(Thread.java:748) > Caused by: java.io.IOException: No snapshot exist with snapshotId: > a7efa54d-8beb-4fd0-808c-400e173ca6e9 > at > org.apache.hadoop.ozone.om.OmSnapshotManager$1.load(OmSnapshotManager.java:351) > at > org.apache.hadoop.ozone.om.OmSnapshotManager$1.load(OmSnapshotManager.java:1) > at > org.apache.hadoop.ozone.om.snapshot.SnapshotCache.lambda$2(SnapshotCache.java:158) > ... 14 more > 2024-06-25 14:51:19,678 ERROR [om229-OMStateMachineApplyTransactionThread - > 0]-org.apache.hadoop.ozone.om.ratis.OzoneManagerStateMachine: Terminating > with exit status 1: OM Ratis Server has received unrecoverable error, to > avoid further DB corruption, terminating OM. 
Error Response received > is:cmdType: SnapshotMoveDeletedKeys > traceID: "" > success: false > message: "java.io.IOException: Snapshot chain is corrupted.\n\tat > org.apache.hadoop.ozone.om.SnapshotChainManager.validateSnapshotChain(SnapshotChainManager.java:558)\n\tat > > org.apache.hadoop.ozone.om.SnapshotChainManager.hasNextPathSnapshot(SnapshotChainManager.java:453)\n\tat > > org.apache.hadoop.ozone.om.snapshot.SnapshotUtils.getNextActiveSnapshot(SnapshotUtils.java:157)\n\tat > > org.apache.hadoop.ozone.om.request.snapshot.OMSnapshotMoveDeletedKeysRequest.validateAndUpdateCache(OMSnapshotMoveDeletedKeysRequest.java:81)\n\tat > > org.apache.hadoop.ozone.protocolPB.OzoneManagerRequestHandler.lambda$0(OzoneManagerRequestHandler.java:397)\n\tat > org.apache.hadoop.util.MetricUtil.captureLatencyNs(MetricUtil.java:45)\n\tat > org.apache.hadoop.ozone.protocolPB.OzoneManagerRequestHandler.handleWriteRequestImpl(OzoneManagerRequestHandler.java:395)\n\tat > > org.apache.hadoop.ozone.protocolPB.RequestHandler.handleWriteRequest(RequestHandler.java:63)\n\tat > > org.apache.hadoop.ozone.om.ratis.OzoneManagerStateMachine.runCommand(OzoneManagerStateMachine.java:539)\n\tat > > org.apache.hadoop.ozone.om.ratis.OzoneManagerStateMachine.lambda$1(OzoneManagerStateMachine.java:357)\n\tat > > java.util.concurrent.CompletableFuture$AsyncSupply.run(CompletableFuture.java:1604)\n\tat > > java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)\n\tat > > java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)\n\tat > java.lang.Thread.run(Thread.java:748)\n" > status: INTERNAL_ERRORINTERNAL_ERROR > org.apache.hadoop.ozone.om.exceptions.OMException: java.io.IOException: > Snapshot chain is corrupted. 
> at > org.apache.hadoop.ozone.om.SnapshotChainManager.validateSnapshotChain(SnapshotChainManager.java:558) > at > org.apache.hadoop.ozone.om.SnapshotChainManager.hasNextPathSnapshot(SnapshotChainManager.java:453) > at > org.apache.hadoop.ozone.om.snapshot.SnapshotUtils.getNextActiveSnapshot(SnapshotUtils.java:157) > at > org.apache.hadoop.ozone.om.request.snapshot.OMSnapshotMoveDeletedKeysRequest.validateAndUpdateCache(OMSnapshotMoveDeletedKeysRequest.java:81) > at > org.apache.hadoop.ozone.protocolPB.OzoneManagerRequestHandler.lambda$0(OzoneManagerRequestHandler.java:397) > at > org.apache.hadoop.util.MetricUtil.captureLatencyNs(MetricUtil.java:45) > at > org.apache.hadoop.ozone.protocolPB.OzoneManagerRequestHandler.handleWriteRequestImpl(OzoneManagerRequestHandler.java:395) > at > org.apache.hadoop.ozone.protocolPB.RequestHandler.handleWriteRequest(RequestHandler.java:63) > at > org.apache.hadoop.ozone.om.ratis.OzoneManagerStateMachine.runCommand(OzoneManagerStateMachine.java:539) > at > org.apache.hadoop.ozone.om.ratis.OzoneManagerStateMachine.lambda$1(OzoneManagerStateMachine.java:357) > at > java.util.concurrent.CompletableFuture$AsyncSupply.run(CompletableFuture.java:1604) > at > java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) > at > java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) > at java.lang.Thread.run(Thread.java:748) at > org.apache.hadoop.ozone.om.ratis.OzoneManagerStateMachine.terminate(OzoneManagerStateMachine.java:381) > at > org.apache.hadoop.ozone.om.ratis.OzoneManagerStateMachine.processResponse(OzoneManagerStateMachine.java:370) > at > java.util.concurrent.CompletableFuture.uniApply(CompletableFuture.java:616) > at > java.util.concurrent.CompletableFuture$UniApply.tryFire(CompletableFuture.java:591) > at > java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:488) > at > java.util.concurrent.CompletableFuture$AsyncSupply.run(CompletableFuture.java:1609) > at > 
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) > at > java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) > at java.lang.Thread.run(Thread.java:748) {code} -- This message was sent by Atlassian Jira (v8.20.10#820010) --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscribe@ozone.apache.org For additional commands, e-mail: issues-help@ozone.apache.org