[ 
https://issues.apache.org/jira/browse/CASSANDRA-8552?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=14263428#comment-14263428
 ] 

Brent Haines edited comment on CASSANDRA-8552 at 1/3/15 5:45 AM:
-----------------------------------------------------------------

Here is the syslog captured after upgrading the kernel to fix the bad PTE bug 
(and after the OOM killer killed Cassandra): 
{code}
Jan  3 05:15:01 ip-10-0-2-226 CRON[20245]: (ubuntu) CMD 
(/home/ubuntu/checkcassandra.sh)
Jan  3 05:15:01 ip-10-0-2-226 CRON[20246]: (root) CMD (if [ -x 
/etc/munin/plugins/apt_all ]; then munin-run apt_all update 7200 12 >/dev/null; 
elif [ -x /etc/munin/plugins/apt ]; then munin-run apt update 7200 12 
>/dev/null; fi)
Jan  3 05:15:01 ip-10-0-2-226 CRON[20247]: (root) CMD (command -v debian-sa1 > 
/dev/null && debian-sa1 1 1)
Jan  3 05:15:02 ip-10-0-2-226 postfix/pickup[1360]: 4FC6E805D4: uid=1000 
from=<ubuntu>
Jan  3 05:15:02 ip-10-0-2-226 postfix/cleanup[20292]: 4FC6E805D4: 
message-id=<20150103051502.4FC6E805D4@ip-10-0-2-226.ec2.internal>
Jan  3 05:15:02 ip-10-0-2-226 postfix/qmgr[1362]: 4FC6E805D4: 
from=<ubuntu@ip-10-0-2-226.ec2.internal>, size=621, nrcpt=1 (queue active)
Jan  3 05:15:02 ip-10-0-2-226 postfix/local[20294]: 4FC6E805D4: 
to=<ubuntu@ip-10-0-2-226.ec2.internal>, orig_to=<ubuntu>, relay=local, 
delay=0.05, delays=0.03/0.01/0/0.01, dsn=2.0.0, status=sent (delivered to 
mailbox)
Jan  3 05:15:02 ip-10-0-2-226 postfix/qmgr[1362]: 4FC6E805D4: removed
Jan  3 05:17:01 ip-10-0-2-226 CRON[21023]: (root) CMD (   cd / && run-parts 
--report /etc/cron.hourly)
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906482] java invoked 
oom-killer: gfp_mask=0x201da, order=0, oom_score_adj=0
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906490] java cpuset=/ 
mems_allowed=0
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906495] CPU: 0 PID: 21373 Comm: 
java Not tainted 3.13.0-43-generic #72-Ubuntu
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906497]  0000000000000000 
ffff8800053cd980 ffffffff81720bf6 ffff8802bbdf4800
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906503]  ffff8800053cda08 
ffffffff8171b4b1 0000000000000000 00000000003ac2e4
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906506]  ffffffff8173310e 
ffff8803a5720000 0000000000000000 00000000003ac2e4
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906510] Call Trace:
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906521]  [<ffffffff81720bf6>] 
dump_stack+0x45/0x56
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906527]  [<ffffffff8171b4b1>] 
dump_header+0x7f/0x1f1
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906533]  [<ffffffff8173310e>] ? 
xen_hypervisor_callback+0x1e/0x30
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906539]  [<ffffffff811526de>] 
oom_kill_process+0x1ce/0x330
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906545]  [<ffffffff812d6ce5>] ? 
security_capable_noaudit+0x15/0x20
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906548]  [<ffffffff81152e14>] 
out_of_memory+0x414/0x450
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906552]  [<ffffffff81159180>] 
__alloc_pages_nodemask+0xa60/0xb80
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906558]  [<ffffffff811977a3>] 
alloc_pages_current+0xa3/0x160
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906563]  [<ffffffff8114f297>] 
__page_cache_alloc+0x97/0xc0
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906567]  [<ffffffff81150ca5>] 
filemap_fault+0x185/0x410
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906572]  [<ffffffff81175b4f>] 
__do_fault+0x6f/0x530
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906577]  [<ffffffff81005f0d>] ? 
pte_mfn_to_pfn.part.13+0x7d/0x100
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906581]  [<ffffffff81179d12>] 
handle_mm_fault+0x482/0xf00
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906585]  [<ffffffff81151778>] ? 
generic_file_aio_read+0x598/0x700
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906590]  [<ffffffff8172cc14>] 
__do_page_fault+0x184/0x560
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906594]  [<ffffffff81004e32>] ? 
xen_mc_flush+0x182/0x1b0
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906598]  [<ffffffff81004e32>] ? 
xen_mc_flush+0x182/0x1b0
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906601]  [<ffffffff8172d00a>] 
do_page_fault+0x1a/0x70
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906605]  [<ffffffff81729fc5>] ? 
do_device_not_available+0x35/0x50
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906608]  [<ffffffff81729468>] 
page_fault+0x28/0x30
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906611] Mem-Info:
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906613] Node 0 DMA per-cpu:
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906616] CPU    0: hi:    0, 
btch:   1 usd:   0
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906618] CPU    1: hi:    0, 
btch:   1 usd:   0
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906620] CPU    2: hi:    0, 
btch:   1 usd:   0
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906622] CPU    3: hi:    0, 
btch:   1 usd:   0
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906623] Node 0 DMA32 per-cpu:
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906626] CPU    0: hi:  186, 
btch:  31 usd:  25
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906628] CPU    1: hi:  186, 
btch:  31 usd:  83
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906630] CPU    2: hi:  186, 
btch:  31 usd:  42
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906632] CPU    3: hi:  186, 
btch:  31 usd: 104
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906633] Node 0 Normal per-cpu:
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906636] CPU    0: hi:  186, 
btch:  31 usd: 151
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906637] CPU    1: hi:  186, 
btch:  31 usd: 108
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906639] CPU    2: hi:  186, 
btch:  31 usd:  75
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906641] CPU    3: hi:  186, 
btch:  31 usd: 165
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906646] active_anon:3121793 
inactive_anon:57 isolated_anon:0
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906646]  active_file:9 
inactive_file:617 isolated_file:0
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906646]  unevictable:596831 
dirty:4 writeback:0 unstable:0
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906646]  free:18781 
slab_reclaimable:45515 slab_unreclaimable:8928
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906646]  mapped:5174 shmem:71 
pagetables:31232 bounce:0
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906646]  free_cma:0
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906651] Node 0 DMA free:15912kB 
min:16kB low:20kB high:24kB active_anon:0kB inactive_anon:0kB active_file:0kB 
inactive_file:0kB unevictable:0kB isolated(anon):0kB isolated(file):0kB 
present:15996kB managed:15912kB mlocked:0kB dirty:0kB writeback:0kB mapped:0kB 
shmem:0kB slab_reclaimable:0kB slab_unreclaimable:0kB kernel_stack:0kB 
pagetables:0kB unstable:0kB bounce:0kB free_cma:0kB writeback_tmp:0kB 
pages_scanned:0 all_unreclaimable? yes
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906658] lowmem_reserve[]: 0 
4063 15024 15024
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906662] Node 0 DMA32 
free:47852kB min:4240kB low:5300kB high:6360kB active_anon:3336024kB 
inactive_anon:88kB active_file:20kB inactive_file:848kB unevictable:684628kB 
isolated(anon):0kB isolated(file):0kB present:4177920kB managed:4164100kB 
mlocked:684628kB dirty:8kB writeback:0kB mapped:7044kB shmem:108kB 
slab_reclaimable:48484kB slab_unreclaimable:8276kB kernel_stack:1072kB 
pagetables:33408kB unstable:0kB bounce:0kB free_cma:0kB writeback_tmp:0kB 
pages_scanned:2871 all_unreclaimable? yes
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906668] lowmem_reserve[]: 0 0 
10960 10960
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906672] Node 0 Normal 
free:11360kB min:11436kB low:14292kB high:17152kB active_anon:9151148kB 
inactive_anon:140kB active_file:16kB inactive_file:1620kB unevictable:1702696kB 
isolated(anon):0kB isolated(file):0kB present:11542528kB managed:11223908kB 
mlocked:1702696kB dirty:8kB writeback:0kB mapped:13652kB shmem:176kB 
slab_reclaimable:133576kB slab_unreclaimable:27436kB kernel_stack:2352kB 
pagetables:91520kB unstable:0kB bounce:0kB free_cma:0kB writeback_tmp:0kB 
pages_scanned:5660 all_unreclaimable? yes
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906678] lowmem_reserve[]: 0 0 0 0
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906681] Node 0 DMA: 0*4kB 1*8kB 
(U) 0*16kB 1*32kB (U) 2*64kB (U) 1*128kB (U) 1*256kB (U) 0*512kB 1*1024kB (U) 
1*2048kB (R) 3*4096kB (M) = 15912kB
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906696] Node 0 DMA32: 267*4kB 
(UE) 109*8kB (UEM) 328*16kB (UEM) 225*32kB (UE) 59*64kB (UE) 14*128kB (E) 
12*256kB (EM) 8*512kB (E) 6*1024kB (EM) 3*2048kB (ER) 2*4096kB (ER) = 47604kB
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906711] Node 0 Normal: 1880*4kB 
(E) 0*8kB 0*16kB 0*32kB 0*64kB 0*128kB 1*256kB (R) 1*512kB (R) 1*1024kB (R) 
1*2048kB (R) 0*4096kB = 11360kB
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906724] Node 0 
hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=2048kB
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906726] 5932 total pagecache 
pages
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906728] 0 pages in swap cache
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906730] Swap cache stats: add 
0, delete 0, find 0/0
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906732] Free swap  = 0kB
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906733] Total swap = 0kB
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906735] 3934111 pages RAM
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906736] 0 pages 
HighMem/MovableOnly
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906737] 79655 pages reserved
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906738] [ pid ]   uid  tgid 
total_vm      rss nr_ptes swapents oom_score_adj name
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906745] [  513]     0   513     
4869       85      13        0             0 upstart-udev-br
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906748] [  518]     0   518    
12395      313      28        0         -1000 systemd-udevd
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906751] [  638]     0   638     
3815       93      12        0             0 upstart-socket-
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906754] [  727]     0   727     
2556      649       8        0             0 dhclient
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906757] [  827]     0   827    
15341      399      34        0         -1000 sshd
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906759] [  935]   102   935     
9804      214      23        0             0 dbus-daemon
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906762] [  958]     0   958    
10863      286      27        0             0 systemd-logind
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906764] [  971]     0   971     
3819       92      12        0             0 upstart-file-br
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906767] [  978]   101   978    
65019      212      30        0             0 rsyslogd
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906770] [ 1062]     0  1062    
13919     2442      32        0             0 munin-node
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906772] [ 1064]     0  1064     
3635      224      12        0             0 getty
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906775] [ 1067]     0  1067     
3635      224      12        0             0 getty
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906778] [ 1071]     0  1071     
3635      225      13        0             0 getty
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906780] [ 1072]     0  1072     
3635      225      12        0             0 getty
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906782] [ 1075]     0  1075     
3635      224      12        0             0 getty
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906785] [ 1092]     0  1092     
5914      238      18        0             0 cron
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906787] [ 1093]     0  1093     
4785       40      13        0             0 atd
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906790] [ 1100]     0  1100     
1092      154       8        0             0 acpid
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906793] [ 1160]     0  1160     
4570      318      14        0             0 datastax_agent_
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906795] [ 1258]   110  1258   
888265    48767     201        0             0 java
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906798] [ 1354]     0  1354     
6336      286      17        0             0 master
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906800] [ 1360]   109  1360     
6852      271      17        0             0 pickup
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906802] [ 1362]   109  1362     
6893      310      17        0             0 qmgr
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906805] [ 1418]     0  1418     
1211      129       8        0             0 mdadm
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906807] [ 1542]     0  1542     
3635      223      12        0             0 getty
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906810] [ 1609]   107  1609 
21784249  3650320   29612        0             0 java
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906813] [ 1685]     0  1685    
26408      535      56        0             0 sshd
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906815] [ 1745]  1000  1745    
26408      329      53        0             0 sshd
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906818] [ 1746]  1000  1746     
5316      686      15        0             0 bash
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906821] [ 2127]   106  2127     
7861      392      19        0             0 ntpd
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906823] [ 2223]     0  2223    
16975      380      37        0             0 sudo
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906826] [ 2224]     0  2224    
15813      299      35        0             0 su
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906828] [ 2225]     0  2225     
5316      702      15        0             0 bash
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906831] [ 2366]     0  2366    
26408      533      57        0             0 sshd
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906833] [ 2419]  1000  2419    
26408      324      54        0             0 sshd
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906835] [ 2420]  1000  2420     
5316      703      15        0             0 bash
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906838] [ 2498]     0  2498    
16975      379      36        0             0 sudo
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906840] [ 2499]     0  2499    
15813      298      36        0             0 su
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906843] [ 2500]     0  2500     
5340      756      14        0             0 bash
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906845] [ 2585]     0  2585     
1489      118       7        0             0 tail
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906848] [ 2668]     0  2668    
26408      534      54        0             0 sshd
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906850] [ 2780]  1000  2780    
26408      330      52        0             0 sshd
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906852] [ 2781]  1000  2781     
5316      687      15        0             0 bash
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906855] [ 2898]     0  2898    
16975      381      37        0             0 sudo
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906857] [ 2899]     0  2899    
15813      299      36        0             0 su
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906860] [ 2900]     0  2900     
5318      702      14        0             0 bash
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906862] [ 2933]     0  2933     
3152      254      11        0             0 watch
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906865] [ 3272]     0  3272    
26408      534      54        0             0 sshd
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906867] [ 3325]  1000  3325    
26408      332      53        0             0 sshd
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906870] [ 3326]  1000  3326     
5316      684      15        0             0 bash
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906872] [ 3402]     0  3402    
16975      380      37        0             0 sudo
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906875] [ 3403]     0  3403    
15813      299      35        0             0 su
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906877] [ 3404]     0  3404     
5337      753      14        0             0 bash
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906880] [ 3498]     0  3498     
1489      132       8        0             0 tail
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906884] [21174]   110 21174     
1485      189       8        0             0 iostat
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906886] [21179]   110 21179     
1485      189       8        0             0 iostat
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906889] [21333]     0 21333     
3151       77       9        0             0 watch
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906891] [21334]     0 21334     
1111      140       7        0             0 sh
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906894] [21335]     0 21335     
1111      162       8        0             0 nodetool
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906897] [21372]     0 21372   
138267     7897      63        0             0 java
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906899] [21403]     0 21403     
1086       86       8        0             0 sleep
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906902] Out of memory: Kill 
process 1609 (java) score 955 or sacrifice child
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906924] Killed process 1609 
(java) total-vm:87136996kB, anon-rss:14584808kB, file-rss:16472kB
{code}


was (Author: thebrenthaines):
Here is the syslog after upgrading the kernel to fix the bad pte bug : 
{code}
Jan  3 05:15:01 ip-10-0-2-226 CRON[20245]: (ubuntu) CMD 
(/home/ubuntu/checkcassandra.sh)
Jan  3 05:15:01 ip-10-0-2-226 CRON[20246]: (root) CMD (if [ -x 
/etc/munin/plugins/apt_all ]; then munin-run apt_all update 7200 12 >/dev/null; 
elif [ -x /etc/munin/plugins/apt ]; then munin-run apt update 7200 12 
>/dev/null; fi)
Jan  3 05:15:01 ip-10-0-2-226 CRON[20247]: (root) CMD (command -v debian-sa1 > 
/dev/null && debian-sa1 1 1)
Jan  3 05:15:02 ip-10-0-2-226 postfix/pickup[1360]: 4FC6E805D4: uid=1000 
from=<ubuntu>
Jan  3 05:15:02 ip-10-0-2-226 postfix/cleanup[20292]: 4FC6E805D4: 
message-id=<20150103051502.4FC6E805D4@ip-10-0-2-226.ec2.internal>
Jan  3 05:15:02 ip-10-0-2-226 postfix/qmgr[1362]: 4FC6E805D4: 
from=<ubuntu@ip-10-0-2-226.ec2.internal>, size=621, nrcpt=1 (queue active)
Jan  3 05:15:02 ip-10-0-2-226 postfix/local[20294]: 4FC6E805D4: 
to=<ubuntu@ip-10-0-2-226.ec2.internal>, orig_to=<ubuntu>, relay=local, 
delay=0.05, delays=0.03/0.01/0/0.01, dsn=2.0.0, status=sent (delivered to 
mailbox)
Jan  3 05:15:02 ip-10-0-2-226 postfix/qmgr[1362]: 4FC6E805D4: removed
Jan  3 05:17:01 ip-10-0-2-226 CRON[21023]: (root) CMD (   cd / && run-parts 
--report /etc/cron.hourly)
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906482] java invoked 
oom-killer: gfp_mask=0x201da, order=0, oom_score_adj=0
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906490] java cpuset=/ 
mems_allowed=0
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906495] CPU: 0 PID: 21373 Comm: 
java Not tainted 3.13.0-43-generic #72-Ubuntu
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906497]  0000000000000000 
ffff8800053cd980 ffffffff81720bf6 ffff8802bbdf4800
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906503]  ffff8800053cda08 
ffffffff8171b4b1 0000000000000000 00000000003ac2e4
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906506]  ffffffff8173310e 
ffff8803a5720000 0000000000000000 00000000003ac2e4
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906510] Call Trace:
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906521]  [<ffffffff81720bf6>] 
dump_stack+0x45/0x56
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906527]  [<ffffffff8171b4b1>] 
dump_header+0x7f/0x1f1
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906533]  [<ffffffff8173310e>] ? 
xen_hypervisor_callback+0x1e/0x30
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906539]  [<ffffffff811526de>] 
oom_kill_process+0x1ce/0x330
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906545]  [<ffffffff812d6ce5>] ? 
security_capable_noaudit+0x15/0x20
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906548]  [<ffffffff81152e14>] 
out_of_memory+0x414/0x450
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906552]  [<ffffffff81159180>] 
__alloc_pages_nodemask+0xa60/0xb80
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906558]  [<ffffffff811977a3>] 
alloc_pages_current+0xa3/0x160
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906563]  [<ffffffff8114f297>] 
__page_cache_alloc+0x97/0xc0
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906567]  [<ffffffff81150ca5>] 
filemap_fault+0x185/0x410
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906572]  [<ffffffff81175b4f>] 
__do_fault+0x6f/0x530
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906577]  [<ffffffff81005f0d>] ? 
pte_mfn_to_pfn.part.13+0x7d/0x100
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906581]  [<ffffffff81179d12>] 
handle_mm_fault+0x482/0xf00
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906585]  [<ffffffff81151778>] ? 
generic_file_aio_read+0x598/0x700
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906590]  [<ffffffff8172cc14>] 
__do_page_fault+0x184/0x560
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906594]  [<ffffffff81004e32>] ? 
xen_mc_flush+0x182/0x1b0
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906598]  [<ffffffff81004e32>] ? 
xen_mc_flush+0x182/0x1b0
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906601]  [<ffffffff8172d00a>] 
do_page_fault+0x1a/0x70
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906605]  [<ffffffff81729fc5>] ? 
do_device_not_available+0x35/0x50
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906608]  [<ffffffff81729468>] 
page_fault+0x28/0x30
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906611] Mem-Info:
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906613] Node 0 DMA per-cpu:
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906616] CPU    0: hi:    0, 
btch:   1 usd:   0
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906618] CPU    1: hi:    0, 
btch:   1 usd:   0
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906620] CPU    2: hi:    0, 
btch:   1 usd:   0
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906622] CPU    3: hi:    0, 
btch:   1 usd:   0
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906623] Node 0 DMA32 per-cpu:
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906626] CPU    0: hi:  186, 
btch:  31 usd:  25
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906628] CPU    1: hi:  186, 
btch:  31 usd:  83
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906630] CPU    2: hi:  186, 
btch:  31 usd:  42
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906632] CPU    3: hi:  186, 
btch:  31 usd: 104
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906633] Node 0 Normal per-cpu:
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906636] CPU    0: hi:  186, 
btch:  31 usd: 151
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906637] CPU    1: hi:  186, 
btch:  31 usd: 108
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906639] CPU    2: hi:  186, 
btch:  31 usd:  75
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906641] CPU    3: hi:  186, 
btch:  31 usd: 165
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906646] active_anon:3121793 
inactive_anon:57 isolated_anon:0
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906646]  active_file:9 
inactive_file:617 isolated_file:0
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906646]  unevictable:596831 
dirty:4 writeback:0 unstable:0
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906646]  free:18781 
slab_reclaimable:45515 slab_unreclaimable:8928
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906646]  mapped:5174 shmem:71 
pagetables:31232 bounce:0
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906646]  free_cma:0
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906651] Node 0 DMA free:15912kB 
min:16kB low:20kB high:24kB active_anon:0kB inactive_anon:0kB active_file:0kB 
inactive_file:0kB unevictable:0kB isolated(anon):0kB isolated(file):0kB 
present:15996kB managed:15912kB mlocked:0kB dirty:0kB writeback:0kB mapped:0kB 
shmem:0kB slab_reclaimable:0kB slab_unreclaimable:0kB kernel_stack:0kB 
pagetables:0kB unstable:0kB bounce:0kB free_cma:0kB writeback_tmp:0kB 
pages_scanned:0 all_unreclaimable? yes
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906658] lowmem_reserve[]: 0 
4063 15024 15024
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906662] Node 0 DMA32 
free:47852kB min:4240kB low:5300kB high:6360kB active_anon:3336024kB 
inactive_anon:88kB active_file:20kB inactive_file:848kB unevictable:684628kB 
isolated(anon):0kB isolated(file):0kB present:4177920kB managed:4164100kB 
mlocked:684628kB dirty:8kB writeback:0kB mapped:7044kB shmem:108kB 
slab_reclaimable:48484kB slab_unreclaimable:8276kB kernel_stack:1072kB 
pagetables:33408kB unstable:0kB bounce:0kB free_cma:0kB writeback_tmp:0kB 
pages_scanned:2871 all_unreclaimable? yes
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906668] lowmem_reserve[]: 0 0 
10960 10960
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906672] Node 0 Normal 
free:11360kB min:11436kB low:14292kB high:17152kB active_anon:9151148kB 
inactive_anon:140kB active_file:16kB inactive_file:1620kB unevictable:1702696kB 
isolated(anon):0kB isolated(file):0kB present:11542528kB managed:11223908kB 
mlocked:1702696kB dirty:8kB writeback:0kB mapped:13652kB shmem:176kB 
slab_reclaimable:133576kB slab_unreclaimable:27436kB kernel_stack:2352kB 
pagetables:91520kB unstable:0kB bounce:0kB free_cma:0kB writeback_tmp:0kB 
pages_scanned:5660 all_unreclaimable? yes
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906678] lowmem_reserve[]: 0 0 0 0
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906681] Node 0 DMA: 0*4kB 1*8kB 
(U) 0*16kB 1*32kB (U) 2*64kB (U) 1*128kB (U) 1*256kB (U) 0*512kB 1*1024kB (U) 
1*2048kB (R) 3*4096kB (M) = 15912kB
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906696] Node 0 DMA32: 267*4kB 
(UE) 109*8kB (UEM) 328*16kB (UEM) 225*32kB (UE) 59*64kB (UE) 14*128kB (E) 
12*256kB (EM) 8*512kB (E) 6*1024kB (EM) 3*2048kB (ER) 2*4096kB (ER) = 47604kB
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906711] Node 0 Normal: 1880*4kB 
(E) 0*8kB 0*16kB 0*32kB 0*64kB 0*128kB 1*256kB (R) 1*512kB (R) 1*1024kB (R) 
1*2048kB (R) 0*4096kB = 11360kB
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906724] Node 0 
hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=2048kB
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906726] 5932 total pagecache 
pages
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906728] 0 pages in swap cache
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906730] Swap cache stats: add 
0, delete 0, find 0/0
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906732] Free swap  = 0kB
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906733] Total swap = 0kB
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906735] 3934111 pages RAM
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906736] 0 pages 
HighMem/MovableOnly
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906737] 79655 pages reserved
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906738] [ pid ]   uid  tgid 
total_vm      rss nr_ptes swapents oom_score_adj name
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906745] [  513]     0   513     
4869       85      13        0             0 upstart-udev-br
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906748] [  518]     0   518    
12395      313      28        0         -1000 systemd-udevd
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906751] [  638]     0   638     
3815       93      12        0             0 upstart-socket-
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906754] [  727]     0   727     
2556      649       8        0             0 dhclient
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906757] [  827]     0   827    
15341      399      34        0         -1000 sshd
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906759] [  935]   102   935     
9804      214      23        0             0 dbus-daemon
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906762] [  958]     0   958    
10863      286      27        0             0 systemd-logind
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906764] [  971]     0   971     
3819       92      12        0             0 upstart-file-br
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906767] [  978]   101   978    
65019      212      30        0             0 rsyslogd
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906770] [ 1062]     0  1062    
13919     2442      32        0             0 munin-node
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906772] [ 1064]     0  1064     
3635      224      12        0             0 getty
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906775] [ 1067]     0  1067     
3635      224      12        0             0 getty
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906778] [ 1071]     0  1071     
3635      225      13        0             0 getty
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906780] [ 1072]     0  1072     
3635      225      12        0             0 getty
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906782] [ 1075]     0  1075     
3635      224      12        0             0 getty
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906785] [ 1092]     0  1092     
5914      238      18        0             0 cron
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906787] [ 1093]     0  1093     
4785       40      13        0             0 atd
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906790] [ 1100]     0  1100     
1092      154       8        0             0 acpid
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906793] [ 1160]     0  1160     
4570      318      14        0             0 datastax_agent_
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906795] [ 1258]   110  1258   
888265    48767     201        0             0 java
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906798] [ 1354]     0  1354     
6336      286      17        0             0 master
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906800] [ 1360]   109  1360     
6852      271      17        0             0 pickup
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906802] [ 1362]   109  1362     
6893      310      17        0             0 qmgr
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906805] [ 1418]     0  1418     
1211      129       8        0             0 mdadm
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906807] [ 1542]     0  1542     
3635      223      12        0             0 getty
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906810] [ 1609]   107  1609 
21784249  3650320   29612        0             0 java
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906813] [ 1685]     0  1685    
26408      535      56        0             0 sshd
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906815] [ 1745]  1000  1745    
26408      329      53        0             0 sshd
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906818] [ 1746]  1000  1746     
5316      686      15        0             0 bash
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906821] [ 2127]   106  2127     
7861      392      19        0             0 ntpd
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906823] [ 2223]     0  2223    
16975      380      37        0             0 sudo
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906826] [ 2224]     0  2224    
15813      299      35        0             0 su
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906828] [ 2225]     0  2225     
5316      702      15        0             0 bash
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906831] [ 2366]     0  2366    
26408      533      57        0             0 sshd
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906833] [ 2419]  1000  2419    
26408      324      54        0             0 sshd
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906835] [ 2420]  1000  2420     
5316      703      15        0             0 bash
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906838] [ 2498]     0  2498    
16975      379      36        0             0 sudo
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906840] [ 2499]     0  2499    
15813      298      36        0             0 su
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906843] [ 2500]     0  2500     
5340      756      14        0             0 bash
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906845] [ 2585]     0  2585     
1489      118       7        0             0 tail
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906848] [ 2668]     0  2668    
26408      534      54        0             0 sshd
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906850] [ 2780]  1000  2780    
26408      330      52        0             0 sshd
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906852] [ 2781]  1000  2781     
5316      687      15        0             0 bash
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906855] [ 2898]     0  2898    
16975      381      37        0             0 sudo
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906857] [ 2899]     0  2899    
15813      299      36        0             0 su
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906860] [ 2900]     0  2900     
5318      702      14        0             0 bash
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906862] [ 2933]     0  2933     
3152      254      11        0             0 watch
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906865] [ 3272]     0  3272    
26408      534      54        0             0 sshd
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906867] [ 3325]  1000  3325    
26408      332      53        0             0 sshd
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906870] [ 3326]  1000  3326     
5316      684      15        0             0 bash
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906872] [ 3402]     0  3402    
16975      380      37        0             0 sudo
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906875] [ 3403]     0  3403    
15813      299      35        0             0 su
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906877] [ 3404]     0  3404     
5337      753      14        0             0 bash
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906880] [ 3498]     0  3498     
1489      132       8        0             0 tail
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906884] [21174]   110 21174     
1485      189       8        0             0 iostat
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906886] [21179]   110 21179     
1485      189       8        0             0 iostat
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906889] [21333]     0 21333     
3151       77       9        0             0 watch
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906891] [21334]     0 21334     
1111      140       7        0             0 sh
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906894] [21335]     0 21335     
1111      162       8        0             0 nodetool
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906897] [21372]     0 21372   
138267     7897      63        0             0 java
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906899] [21403]     0 21403     
1086       86       8        0             0 sleep
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906902] Out of memory: Kill 
process 1609 (java) score 955 or sacrifice child
Jan  3 05:18:22 ip-10-0-2-226 kernel: [49881091.906924] Killed process 1609 
(java) total-vm:87136996kB, anon-rss:14584808kB, file-rss:16472kB
{code}

> Large compactions run out of off-heap RAM
> -----------------------------------------
>
>                 Key: CASSANDRA-8552
>                 URL: https://issues.apache.org/jira/browse/CASSANDRA-8552
>             Project: Cassandra
>          Issue Type: Bug
>          Components: Core
>         Environment: Ubuntu 14.04 
> AWS EC2
> 12 m1.xlarge nodes [4 cores, 16GB RAM, 1TB storage (251GB Used)]
> Java build 1.7.0_55-b13 and build 1.8.0_25-b17
>            Reporter: Brent Haines
>            Assignee: Marcus Eriksson
>            Priority: Blocker
>             Fix For: 2.1.3
>
>         Attachments: Screen Shot 2015-01-02 at 9.36.11 PM.png, system.log
>
>
> We have a large table storing what are effectively event logs, and a pair of 
> denormalized tables for indexing.
> When updating from 2.0 to 2.1 we saw performance improvements, but some 
> random and silent crashes during nightly repairs. We lost a node (totally 
> corrupted) and replaced it. That node has never stabilized -- it simply can't 
> finish the compactions. 
> Smaller compactions finish. Larger compactions, like these two, never finish: 
> {code}
> pending tasks: 48
>    compaction type   keyspace             table     completed         total   
>  unit   progress
>         Compaction       data           stories   16532973358   75977993784   
> bytes     21.76%
>         Compaction       data   stories_by_text   10593780658   38555048812   
> bytes     27.48%
> Active compaction remaining time :   0h10m51s
> {code}
> We are not getting exceptions and are not running out of heap space. The 
> Ubuntu OOM killer is reaping the process after all of the memory is consumed. 
> We watch memory in the opscenter console and it will grow. If we turn off the 
> OOM killer for the process, it will run until everything else is killed 
> instead and then the kernel panics.
> We have the following settings configured: 
> 2G Heap
> 512M New
> {code}
> memtable_heap_space_in_mb: 1024
> memtable_offheap_space_in_mb: 1024
> memtable_allocation_type: heap_buffers
> commitlog_total_space_in_mb: 2048
> concurrent_compactors: 1
> compaction_throughput_mb_per_sec: 128
> {code}
> The compaction strategy is leveled (these are read-intensive tables that are 
> rarely updated)
> I have tried every setting, every option and I have the system where the MTBF 
> is about an hour now, but we never finish compacting because there are some 
> large compactions pending. None of the GC tools or settings help because it 
> is not a GC problem. It is an off-heap memory problem.
> We are getting these messages in our syslog 
> {code}
> Jan  2 07:06:00 ip-10-0-2-226 kernel: [49801151.219527] BUG: Bad page map in 
> process java  pte:00000320 pmd:2d6fa5067
> Jan  2 07:06:00 ip-10-0-2-226 kernel: [49801151.219545] addr:00007fb820be3000 
> vm_flags:08000070 anon_vma:          (null) mapping:          (null) 
> index:7fb820be3
> Jan  2 07:06:00 ip-10-0-2-226 kernel: [49801151.219556] CPU: 3 PID: 27344 
> Comm: java Tainted: G    B        3.13.0-24-generic #47-Ubuntu
> Jan  2 07:06:00 ip-10-0-2-226 kernel: [49801151.219559]  ffff880028510e40 
> ffff88020d43da98 ffffffff81715ac4 00007fb820be3000
> Jan  2 07:06:00 ip-10-0-2-226 kernel: [49801151.219565]  ffff88020d43dae0 
> ffffffff81174183 0000000000000320 00000007fb820be3
> Jan  2 07:06:00 ip-10-0-2-226 kernel: [49801151.219568]  ffff8802d6fa5f18 
> 0000000000000320 00007fb820be3000 00007fb820be4000
> Jan  2 07:06:00 ip-10-0-2-226 kernel: [49801151.219572] Call Trace:
> Jan  2 07:06:00 ip-10-0-2-226 kernel: [49801151.219584]  [<ffffffff81715ac4>] 
> dump_stack+0x45/0x56
> Jan  2 07:06:00 ip-10-0-2-226 kernel: [49801151.219591]  [<ffffffff81174183>] 
> print_bad_pte+0x1a3/0x250
> Jan  2 07:06:00 ip-10-0-2-226 kernel: [49801151.219594]  [<ffffffff81175439>] 
> vm_normal_page+0x69/0x80
> Jan  2 07:06:00 ip-10-0-2-226 kernel: [49801151.219598]  [<ffffffff8117580b>] 
> unmap_page_range+0x3bb/0x7f0
> Jan  2 07:06:00 ip-10-0-2-226 kernel: [49801151.219602]  [<ffffffff81175cc1>] 
> unmap_single_vma+0x81/0xf0
> Jan  2 07:06:00 ip-10-0-2-226 kernel: [49801151.219605]  [<ffffffff81176d39>] 
> unmap_vmas+0x49/0x90
> Jan  2 07:06:00 ip-10-0-2-226 kernel: [49801151.219610]  [<ffffffff8117feec>] 
> exit_mmap+0x9c/0x170
> Jan  2 07:06:00 ip-10-0-2-226 kernel: [49801151.219617]  [<ffffffff8110fcf3>] 
> ? __delayacct_add_tsk+0x153/0x170
> Jan  2 07:06:00 ip-10-0-2-226 kernel: [49801151.219621]  [<ffffffff8106482c>] 
> mmput+0x5c/0x120
> Jan  2 07:06:00 ip-10-0-2-226 kernel: [49801151.219625]  [<ffffffff81069bbc>] 
> do_exit+0x26c/0xa50
> Jan  2 07:06:00 ip-10-0-2-226 kernel: [49801151.219631]  [<ffffffff810d7591>] 
> ? __unqueue_futex+0x31/0x60
> Jan  2 07:06:00 ip-10-0-2-226 kernel: [49801151.219634]  [<ffffffff810d83b6>] 
> ? futex_wait+0x126/0x290
> Jan  2 07:06:00 ip-10-0-2-226 kernel: [49801151.219640]  [<ffffffff8171d8e0>] 
> ? _raw_spin_unlock_irqrestore+0x20/0x40
> Jan  2 07:06:00 ip-10-0-2-226 kernel: [49801151.219643]  [<ffffffff8106a41f>] 
> do_group_exit+0x3f/0xa0
> Jan  2 07:06:00 ip-10-0-2-226 kernel: [49801151.219649]  [<ffffffff8107a050>] 
> get_signal_to_deliver+0x1d0/0x6f0
> Jan  2 07:06:00 ip-10-0-2-226 kernel: [49801151.219655]  [<ffffffff81013448>] 
> do_signal+0x48/0x960
> Jan  2 07:06:00 ip-10-0-2-226 kernel: [49801151.219660]  [<ffffffff811112fc>] 
> ? acct_account_cputime+0x1c/0x20
> Jan  2 07:06:00 ip-10-0-2-226 kernel: [49801151.219664]  [<ffffffff8109d76b>] 
> ? account_user_time+0x8b/0xa0
> Jan  2 07:06:00 ip-10-0-2-226 kernel: [49801151.219667]  [<ffffffff8109dd84>] 
> ? vtime_account_user+0x54/0x60
> Jan  2 07:06:00 ip-10-0-2-226 kernel: [49801151.219671]  [<ffffffff81013dc9>] 
> do_notify_resume+0x69/0xb0
> Jan  2 07:06:00 ip-10-0-2-226 kernel: [49801151.219676]  [<ffffffff8172676a>] 
> int_signal+0x12/0x17
> {code}
> This seems like unmap is failing, but I am uncertain about how to fix it or 
> work around it.
> For completeness' sake, let me point this out too: The system.log will show 
> whatever was happening when the system stops and the service is 
> restarted. There is no stack trace. Here is an example: 
> {code}
> INFO  [main] 2015-01-02 06:38:38,813 ColumnFamilyStore.java:840 - Enqueuing 
> flush of local: 1552 (0%) on-heap, 0 (0%) off-heap
> INFO  [MemtableFlushWriter:1] 2015-01-02 06:38:38,813 Memtable.java:325 - 
> Writing Memtable-local@172795560(281 serialized bytes, 10 ops, 0%/0% of 
> on/off-heap limit)
> INFO  [MemtableFlushWriter:1] 2015-01-02 06:38:38,824 Memtable.java:364 - 
> Completed flushing 
> /data/cassandra/data/system/local-7ad54392bcdd35a684174e047860b377/system-local-ka-778-Data.db
>  (262 bytes) for commitlog position ReplayPosition(segmentId=1420180671225,
>  position=87520)
> INFO  [main] 2015-01-02 06:38:38,825 YamlConfigurationLoader.java:92 - 
> Loading settings from file:/etc/cassandra/cassandra.yaml
> INFO  [main] 2015-01-02 06:38:38,837 YamlConfigurationLoader.java:135 - Node 
> configuration:[authenticator=AllowAllAuthenticator; 
> authorizer=AllowAllAuthorizer; auto_snapshot=true; 
> batch_size_warn_threshold_in_kb=5; batchlog_replay_throttle_in_kb=1024; 
> cas_conten
> tion_timeout_in_ms=1000; client_encryption_options=<REDACTED>; 
> cluster_name=booshaka-batch; column_index_size_in_kb=64; 
> commit_failure_policy=stop; 
> commitlog_directory=/commitlog/cassandra/commitlog; 
> commitlog_segment_size_in_mb=32; commitlog_sync=periodic; comm
> itlog_sync_period_in_ms=10000; commitlog_total_space_in_mb=2048; 
> compaction_throughput_mb_per_sec=128; concurrent_compactors=1; 
> concurrent_counter_writes=32; concurrent_reads=48; concurrent_writes=48; 
> counter_cache_save_period=7200; counter_cache_size_in_mb=null
> ; counter_write_request_timeout_in_ms=5000; cross_node_timeout=false; 
> data_file_directories=[/data/cassandra/data]; disk_failure_policy=stop; 
> dynamic_snitch_badness_threshold=0.1; 
> dynamic_snitch_reset_interval_in_ms=600000; 
> dynamic_snitch_update_interval_in_ms=1
> 00; endpoint_snitch=Ec2Snitch; hinted_handoff_enabled=true; 
> hinted_handoff_throttle_in_kb=1024; incremental_backups=false; 
> index_summary_capacity_in_mb=null; 
> index_summary_resize_interval_in_minutes=60; inter_dc_tcp_nodelay=false; 
> internode_compression=all; key_
> cache_save_period=14400; key_cache_size_in_mb=null; 
> listen_address=10.0.2.226; max_hint_window_in_ms=10800000; 
> max_hints_delivery_threads=2; memtable_allocation_type=heap_buffers; 
> memtable_cleanup_threshold=0.33; memtable_heap_space_in_mb=1024; 
> memtable_offheap_
> space_in_mb=1024; native_transport_port=9042; num_tokens=256; 
> partitioner=org.apache.cassandra.dht.Murmur3Partitioner; 
> permissions_validity_in_ms=2000; phi_convict_threshold=12; 
> range_request_timeout_in_ms=10000; read_request_timeout_in_ms=5000; 
> request_schedule
> r=org.apache.cassandra.scheduler.NoScheduler; request_timeout_in_ms=10000; 
> row_cache_save_period=0; row_cache_size_in_mb=0; rpc_address=10.0.2.226; 
> rpc_keepalive=true; rpc_port=9160; rpc_server_type=sync; 
> saved_caches_directory=/data/cassandra/saved_caches; seed
> _provider=[{class_name=org.apache.cassandra.locator.SimpleSeedProvider, 
> parameters=[{seeds=10.0.2.8,10.0.2.144,10.0.2.145}]}]; 
> server_encryption_options=<REDACTED>; snapshot_before_compaction=false; 
> ssl_storage_port=7001; sstable_preemptive_open_interval_in_mb=5
> 0; start_native_transport=true; start_rpc=true; storage_port=7000; 
> thrift_framed_transport_size_in_mb=15; tombstone_failure_threshold=100000; 
> tombstone_warn_threshold=1000; trickle_fsync=false; 
> trickle_fsync_interval_in_kb=10240; truncate_request_timeout_in_ms=6
> 0000; write_request_timeout_in_ms=2000]
> INFO  [main] 2015-01-02 06:38:38,943 MessagingService.java:477 - Starting 
> Messaging Service on port 7000
> INFO  [main] 2015-01-02 06:38:38,981 YamlConfigurationLoader.java:92 - 
> Loading settings from file:/etc/cassandra/cassandra.yaml
> INFO  [main] 2015-01-02 06:38:38,987 YamlConfigurationLoader.java:135 - Node 
> configuration:[authenticator=AllowAllAuthenticator; 
> authorizer=AllowAllAuthorizer; auto_snapshot=true; 
> batch_size_warn_threshold_in_kb=5; batchlog_replay_throttle_in_kb=1024; 
> cas_conten
> tion_timeout_in_ms=1000; client_encryption_options=<REDACTED>; 
> cluster_name=booshaka-batch; column_index_size_in_kb=64; 
> commit_failure_policy=stop; 
> commitlog_directory=/commitlog/cassandra/commitlog; 
> commitlog_segment_size_in_mb=32; commitlog_sync=periodic; comm
> itlog_sync_period_in_ms=10000; commitlog_total_space_in_mb=2048; 
> compaction_throughput_mb_per_sec=128; concurrent_compactors=1; 
> concurrent_counter_writes=32; concurrent_reads=48; concurrent_writes=48; 
> counter_cache_save_period=7200; counter_cache_size_in_mb=null
> ; counter_write_request_timeout_in_ms=5000; cross_node_timeout=false; 
> data_file_directories=[/data/cassandra/data]; disk_failure_policy=stop; 
> dynamic_snitch_badness_threshold=0.1; 
> dynamic_snitch_reset_interval_in_ms=600000; 
> dynamic_snitch_update_interval_in_ms=1
> 00; endpoint_snitch=Ec2Snitch; hinted_handoff_enabled=true; 
> hinted_handoff_throttle_in_kb=1024; incremental_backups=false; 
> index_summary_capacity_in_mb=null; 
> index_summary_resize_interval_in_minutes=60; inter_dc_tcp_nodelay=false; 
> internode_compression=all; key_
> cache_save_period=14400; key_cache_size_in_mb=null; 
> listen_address=10.0.2.226; max_hint_window_in_ms=10800000; 
> max_hints_delivery_threads=2; memtable_allocation_type=heap_buffers; 
> memtable_cleanup_threshold=0.33; memtable_heap_space_in_mb=1024; 
> memtable_offheap_
> space_in_mb=1024; native_transport_port=9042; num_tokens=256; 
> partitioner=org.apache.cassandra.dht.Murmur3Partitioner; 
> permissions_validity_in_ms=2000; phi_convict_threshold=12; 
> range_request_timeout_in_ms=10000; read_request_timeout_in_ms=5000; 
> request_schedule
> r=org.apache.cassandra.scheduler.NoScheduler; request_timeout_in_ms=10000; 
> row_cache_save_period=0; row_cache_size_in_mb=0; rpc_address=10.0.2.226; 
> rpc_keepalive=true; rpc_port=9160; rpc_server_type=sync; 
> saved_caches_directory=/data/cassandra/saved_caches; seed
> _provider=[{class_name=org.apache.cassandra.locator.SimpleSeedProvider, 
> parameters=[{seeds=10.0.2.8,10.0.2.144,10.0.2.145}]}]; 
> server_encryption_options=<REDACTED>; snapshot_before_compaction=false; 
> ssl_storage_port=7001; sstable_preemptive_open_interval_in_mb=5
> 0; start_native_transport=true; start_rpc=true; storage_port=7000; 
> thrift_framed_transport_size_in_mb=15; tombstone_failure_threshold=100000; 
> tombstone_warn_threshold=1000; trickle_fsync=false; 
> trickle_fsync_interval_in_kb=10240; truncate_request_timeout_in_ms=6
> 0000; write_request_timeout_in_ms=2000]
> {code}



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)

Reply via email to