[ https://issues.apache.org/jira/browse/CASSANDRA-7567?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=14072373#comment-14072373 ]
Ryan McGuire commented on CASSANDRA-7567: ----------------------------------------- I don't know what it is about 12s specifically, but I'm seeing that same intermittent 12s latency spike on a different cluster, this time 3 c3.8xlarge. How long did you run the test? It definitely gets progressively worse the longer it runs. This kind of activity loops: {code} 20352718 , 25, 25, 25, 4029.3, 1.4, 12089.2, 12091.3, 12092.0, 12092.0, 333.0, 0.01946 20639361 , 24827, 24827, 24827, 4.0, 0.9, 2.3, 6.7, 143.2, 10555.2, 344.5, 0.01964 20656481 , 17580, 17580, 17580, 4.8, 1.4, 14.4, 56.1, 198.3, 206.8, 345.5, 0.01978 20672190 , 14344, 14344, 14344, 6.6, 0.9, 15.6, 42.0, 320.1, 328.2, 346.6, 0.01997 20682762 , 16666, 16666, 16666, 7.8, 0.8, 16.0, 97.3, 444.3, 447.5, 347.3, 0.02018 20683062 , 25, 25, 25, 4017.3, 1.0, 12054.9, 12056.2, 12057.6, 12057.6, 359.3, 0.02037 21256796 , 43999, 43999, 43999, 2.3, 0.9, 1.4, 3.5, 80.8, 12100.9, 372.3, 0.02049 21356015 , 90655, 90655, 90655, 1.1, 0.9, 1.7, 3.2, 80.1, 86.7, 373.4, 0.02051 21426407 , 69401, 69401, 69401, 1.4, 0.9, 2.0, 2.7, 111.2, 114.4, 374.4, 0.02044 21500494 , 73883, 73883, 73883, 1.3, 1.0, 2.1, 9.6, 21.0, 23.1, 375.4, 0.02036 21517270 , 16108, 16108, 16108, 6.1, 0.9, 16.4, 79.7, 135.1, 136.2, 376.5, 0.02029 21523818 , 4002, 4002, 4002, 22.0, 1.4, 84.4, 493.9, 612.6, 672.2, 378.1, 0.02046 21524118 , 25, 25, 25, 4024.3, 0.8, 12081.4, 12086.2, 12097.4, 12097.4, 390.2, 0.02074 21975735 , 35025, 35025, 35025, 2.9, 0.8, 1.1, 1.7, 90.4, 12102.2, 403.1, 0.02104 22064231 , 87534, 87534, 87534, 1.1, 0.9, 1.3, 2.3, 100.4, 102.8, 404.1, 0.02109 22166146 , 105540, 105540, 105540, 0.9, 0.9, 1.2, 2.1, 5.1, 15.3, 405.1, 0.02102 22244592 , 72442, 72442, 72442, 1.4, 0.9, 1.9, 3.6, 98.0, 100.8, 406.1, 0.02096 22313323 , 67902, 67902, 67902, 1.5, 0.9, 2.0, 3.3, 213.0, 223.1, 407.2, 0.02088 22332959 , 19299, 19299, 19299, 5.1, 0.9, 15.5, 21.5, 119.6, 122.1, 408.2, 0.02081 22342749 , 9854, 9854, 9854, 6.3, 0.9, 14.9, 23.6, 547.6, 566.8, 
409.2, 0.02094 22347270 , 5399, 5399, 5399, 22.4, 1.6, 22.5, 759.5, 816.5, 830.5, 410.0, 0.02115 22347570 , 25, 25, 25, 4020.3, 0.8, 12083.5, 12089.3, 12099.5, 12099.5, 422.1, 0.02139 22697985 , 26963, 26963, 26963, 3.8, 0.8, 1.1, 2.1, 65.5, 12100.1, 435.1, 0.02166 22788909 , 90108, 90108, 90108, 1.1, 0.9, 1.3, 3.2, 96.2, 99.2, 436.1, 0.02174 22885016 , 95809, 95809, 95809, 1.0, 0.9, 1.3, 2.2, 78.5, 82.4, 437.1, 0.02167 22975898 , 81816, 81816, 81816, 1.2, 0.9, 1.6, 3.5, 100.1, 102.3, 438.2, 0.02161 23049501 , 72615, 72615, 72615, 1.4, 1.0, 1.9, 2.5, 147.2, 151.1, 439.2, 0.02153 23074719 , 24475, 24475, 24475, 4.0, 0.8, 14.4, 19.6, 34.9, 46.3, 440.3, 0.02145 23094356 , 18263, 18263, 18263, 5.5, 1.6, 15.0, 54.3, 99.6, 106.9, 441.3, 0.02155 23103664 , 9878, 9878, 9878, 6.5, 1.4, 16.1, 122.5, 142.2, 349.2, 442.3, 0.02168 23104059 , 33, 33, 33, 3083.8, 0.9, 12068.0, 12071.5, 12071.6, 12071.6, 454.4, 0.02187 23378761 , 21693, 21693, 21693, 4.7, 0.8, 1.5, 3.2, 69.5, 12101.6, 467.1, 0.02212 23473613 , 88575, 88575, 88575, 1.1, 0.9, 1.2, 3.0, 95.9, 99.0, 468.1, 0.02222 23552129 , 77476, 77476, 77476, 1.3, 0.9, 1.4, 3.3, 159.1, 168.8, 469.1, 0.02215 23639310 , 86154, 86154, 86154, 1.1, 0.9, 1.3, 3.1, 95.5, 98.4, 470.1, 0.02207 23731401 , 92987, 92987, 92987, 1.1, 0.9, 1.3, 2.9, 59.4, 62.7, 471.1, 0.02200 23804418 , 63119, 63119, 63119, 1.6, 1.0, 1.9, 11.9, 147.2, 149.7, 472.3, 0.02193 23825655 , 20577, 20577, 20577, 4.9, 0.8, 16.1, 22.2, 37.6, 44.7, 473.3, 0.02186 23845878 , 19932, 19932, 19932, 5.0, 1.2, 14.8, 23.4, 79.5, 87.0, 474.3, 0.02197 23854629 , 16059, 16059, 16059, 5.7, 2.0, 13.0, 98.1, 114.8, 115.3, 474.9, 0.02207 23854929 , 25, 25, 25, 4020.7, 0.8, 12089.1, 12101.0, 12101.4, 12101.4, 487.0, 0.02219 {code} > when the commit_log disk for a single node is overwhelmed the entire cluster > slows down > --------------------------------------------------------------------------------------- > > Key: CASSANDRA-7567 > URL: 
https://issues.apache.org/jira/browse/CASSANDRA-7567 > Project: Cassandra > Issue Type: Bug > Components: Core > Environment: debian 7.5, bare metal, 14 nodes, 64CPUs, 64GB RAM, > commit_log disk sata, data disk SSD, vnodes, leveled compaction strategy > Reporter: David O'Dell > Assignee: Brandon Williams > Attachments: 7567.logs.bz2, write_request_latency.png > > > We've run into a situation where a single node out of 14 is experiencing high > disk io. This can happen when a node is being decommissioned or after it > joins the ring and runs into the bug CASSANDRA-6621. > When this occurs the write latency for the entire cluster spikes. > From 0.3ms to 170ms. > To simulate this simply run dd on the commit_log disk (dd if=/dev/zero > of=/tmp/foo bs=1024) and you will see that instantly all nodes in the cluster > have slowed down. > BTW overwhelming the data disk does not have this same effect. > Also I've tried this where the overwhelmed node isn't connected to > directly by the client and it still has the same effect. -- This message was sent by Atlassian JIRA (v6.2#6252)