[ https://issues.apache.org/jira/browse/CASSANDRA-11209?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15159096#comment-15159096 ]
Jose Fernandez commented on CASSANDRA-11209: -------------------------------------------- Actually, I just spotted an error during repair: ERROR 22:08:05 [repair #a85c9760-d9b0-11e5-9b9c-c12de94ec9ee] session completed with the following error org.apache.cassandra.exceptions.RepairException: [repair #a85c9760-d9b0-11e5-9b9c-c12de94ec9ee on timeslice_store/minute_timeslice_blobs, (7686143364045646505,-6148914691236517207]] Validation failed in /10.1.29.31 at org.apache.cassandra.repair.RepairSession.validationComplete(RepairSession.java:166) ~[apache-cassandra-2.1.13.jar:2.1.13] at org.apache.cassandra.service.ActiveRepairService.handleMessage(ActiveRepairService.java:415) ~[apache-cassandra-2.1.13.jar:2.1.13] at org.apache.cassandra.repair.RepairMessageVerbHandler.doVerb(RepairMessageVerbHandler.java:134) ~[apache-cassandra-2.1.13.jar:2.1.13] at org.apache.cassandra.net.MessageDeliveryTask.run(MessageDeliveryTask.java:64) ~[apache-cassandra-2.1.13.jar:2.1.13] at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) [na:1.8.0_66] at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) [na:1.8.0_66] at java.lang.Thread.run(Thread.java:745) [na:1.8.0_66] ERROR 22:08:05 Repair session a85c9760-d9b0-11e5-9b9c-c12de94ec9ee for range (7686143364045646505,-6148914691236517207] failed with error org.apache.cassandra.exceptions.RepairException: [repair #a85c9760-d9b0-11e5-9b9c-c12de94ec9ee on timeslice_store/minute_timeslice_blobs, (7686143364045646505,-6148914691236517207]] Validation failed in /10.1.29.31 java.util.concurrent.ExecutionException: java.lang.RuntimeException: org.apache.cassandra.exceptions.RepairException: [repair #a85c9760-d9b0-11e5-9b9c-c12de94ec9ee on timeslice_store/minute_timeslice_blobs, (7686143364045646505,-6148914691236517207]] Validation failed in /10.1.29.31 at java.util.concurrent.FutureTask.report(FutureTask.java:122) [na:1.8.0_66] at java.util.concurrent.FutureTask.get(FutureTask.java:192) 
[na:1.8.0_66] at org.apache.cassandra.service.StorageService$4.runMayThrow(StorageService.java:3048) ~[apache-cassandra-2.1.13.jar:2.1.13] at org.apache.cassandra.utils.WrappedRunnable.run(WrappedRunnable.java:28) [apache-cassandra-2.1.13.jar:2.1.13] at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511) [na:1.8.0_66] at java.util.concurrent.FutureTask.run(FutureTask.java:266) [na:1.8.0_66] at java.lang.Thread.run(Thread.java:745) [na:1.8.0_66] Caused by: java.lang.RuntimeException: org.apache.cassandra.exceptions.RepairException: [repair #a85c9760-d9b0-11e5-9b9c-c12de94ec9ee on timeslice_store/minute_timeslice_blobs, (7686143364045646505,-6148914691236517207]] Validation failed in /10.1.29.31 at com.google.common.base.Throwables.propagate(Throwables.java:160) ~[guava-16.0.jar:na] at org.apache.cassandra.utils.WrappedRunnable.run(WrappedRunnable.java:32) [apache-cassandra-2.1.13.jar:2.1.13] at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511) [na:1.8.0_66] at java.util.concurrent.FutureTask.run(FutureTask.java:266) [na:1.8.0_66] at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) ~[na:1.8.0_66] at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) ~[na:1.8.0_66] ... 
1 common frames omitted Caused by: org.apache.cassandra.exceptions.RepairException: [repair #a85c9760-d9b0-11e5-9b9c-c12de94ec9ee on timeslice_store/minute_timeslice_blobs, (7686143364045646505,-6148914691236517207]] Validation failed in /10.1.29.31 at org.apache.cassandra.repair.RepairSession.validationComplete(RepairSession.java:166) ~[apache-cassandra-2.1.13.jar:2.1.13] at org.apache.cassandra.service.ActiveRepairService.handleMessage(ActiveRepairService.java:415) ~[apache-cassandra-2.1.13.jar:2.1.13] at org.apache.cassandra.repair.RepairMessageVerbHandler.doVerb(RepairMessageVerbHandler.java:134) ~[apache-cassandra-2.1.13.jar:2.1.13] at org.apache.cassandra.net.MessageDeliveryTask.run(MessageDeliveryTask.java:64) ~[apache-cassandra-2.1.13.jar:2.1.13] ... 3 common frames omitted ERROR 22:08:05 Exception in thread Thread[AntiEntropySessions:1,5,jolokia] java.lang.RuntimeException: org.apache.cassandra.exceptions.RepairException: [repair #a85c9760-d9b0-11e5-9b9c-c12de94ec9ee on timeslice_store/minute_timeslice_blobs, (7686143364045646505,-6148914691236517207]] Validation failed in /10.1.29.31 at com.google.common.base.Throwables.propagate(Throwables.java:160) ~[guava-16.0.jar:na] at org.apache.cassandra.utils.WrappedRunnable.run(WrappedRunnable.java:32) ~[apache-cassandra-2.1.13.jar:2.1.13] at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511) ~[na:1.8.0_66] at java.util.concurrent.FutureTask.run(FutureTask.java:266) ~[na:1.8.0_66] at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) ~[na:1.8.0_66] at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) [na:1.8.0_66] at java.lang.Thread.run(Thread.java:745) [na:1.8.0_66] Caused by: org.apache.cassandra.exceptions.RepairException: [repair #a85c9760-d9b0-11e5-9b9c-c12de94ec9ee on timeslice_store/minute_timeslice_blobs, (7686143364045646505,-6148914691236517207]] Validation failed in /10.1.29.31 at 
org.apache.cassandra.repair.RepairSession.validationComplete(RepairSession.java:166) ~[apache-cassandra-2.1.13.jar:2.1.13] at org.apache.cassandra.service.ActiveRepairService.handleMessage(ActiveRepairService.java:415) ~[apache-cassandra-2.1.13.jar:2.1.13] at org.apache.cassandra.repair.RepairMessageVerbHandler.doVerb(RepairMessageVerbHandler.java:134) ~[apache-cassandra-2.1.13.jar:2.1.13] at org.apache.cassandra.net.MessageDeliveryTask.run(MessageDeliveryTask.java:64) ~[apache-cassandra-2.1.13.jar:2.1.13] ... 3 common frames omitted > SSTable ancestor leaked reference > --------------------------------- > > Key: CASSANDRA-11209 > URL: https://issues.apache.org/jira/browse/CASSANDRA-11209 > Project: Cassandra > Issue Type: Bug > Components: Compaction > Reporter: Jose Fernandez > Attachments: screenshot-1.png, screenshot-2.png > > > We're running a fork of 2.1.13 that adds the TimeWindowCompactionStrategy > from [~jjirsa]. We've been running 4 clusters without any issues for many > months until a few weeks ago we started scheduling incremental repairs every > 24 hours (previously we didn't run any repairs at all). > Since then we have been noticing big discrepancies in the LiveDiskSpaceUsed, > TotalDiskSpaceUsed, and the actual size of files on disk. The numbers are brought > back in sync by restarting the node. We also noticed that when this bug > happens there are several ancestors that don't get cleaned up. A restart will > queue up a lot of compactions that slowly eat away the ancestors. > I looked at the code and noticed that we only decrease the LiveTotalDiskUsed > metric in the SSTableDeletingTask. Since we have no errors being logged, I'm > assuming that for some reason this task is not getting queued up. If I > understand correctly this only happens when the reference count for the > SSTable reaches 0. So this is leading us to believe that something during > repairs and/or compactions is causing a reference leak to the ancestor table. 
-- This message was sent by Atlassian JIRA (v6.3.4#6332)