Hey guys, a little follow-up. It was very easy to reproduce this issue even with warn-level logging: 372076 messages stuck in the queue. With 50K DBs on 4 nodes and n=3, if you freshly restart the machines and then quickly kill two of them, the logs spin very fast at error level, at least 5 lines per DB shard, probably more, within seconds. Not nearly as bad as before, but it still makes the whole thing unresponsive for a long time. And yes, disk IO goes to an SSD. I think it would be good design to expire messages when the queue grows too long here, say beyond 10K.
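Something like the rough gen_server sketch below is what I have in mind. To be clear, this is not the real couch_log_server code; the module name, the write_entry/1 helper and the exact cutoff are made up just to illustrate the shedding idea:

-module(lossy_log_server).
-behaviour(gen_server).

-export([start_link/0, log/1]).
-export([init/1, handle_call/3, handle_cast/2, handle_info/2,
         terminate/2, code_change/3]).

%% Proposed cutoff: once the mailbox is longer than this, start dropping.
-define(MAX_QUEUE, 10000).

start_link() ->
    gen_server:start_link({local, ?MODULE}, ?MODULE, [], []).

log(Entry) ->
    gen_server:cast(?MODULE, {log, Entry}).

init([]) ->
    {ok, #{dropped => 0}}.

handle_cast({log, Entry}, #{dropped := Dropped} = State) ->
    %% Check our own backlog before doing the (slow) write.
    {message_queue_len, QLen} = process_info(self(), message_queue_len),
    case QLen > ?MAX_QUEUE of
        true ->
            %% Too far behind: drop this entry and just count it.
            {noreply, State#{dropped := Dropped + 1}};
        false ->
            ok = write_entry(Entry),
            {noreply, State}
    end;
handle_cast(_Msg, State) ->
    {noreply, State}.

handle_call(_Req, _From, State) ->
    {reply, ok, State}.

handle_info(_Info, State) ->
    {noreply, State}.

terminate(_Reason, _State) ->
    ok.

code_change(_OldVsn, State, _Extra) ->
    {ok, State}.

%% Stand-in for the real file/syslog writer.
write_entry(Entry) ->
    io:format("~p~n", [Entry]),
    ok.

On a live node you can watch the backlog with
process_info(whereis(couch_log_server), message_queue_len). For reference,
here is the process_info dump from this run: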
(couc...@couch01.int.test)3> process_info(BadProcess).
[{registered_name,couch_log_server},
 {current_function,{prim_file,drv_get_response,1}},
 {initial_call,{proc_lib,init_p,5}},
 {status,running},
 {message_queue_len,372076},
 {messages,[{'$gen_cast',{log,{log_entry,warning,<0.295.0>,
               ["mem3_sync",32,
                "shards/80000000-9fffffff/2019-db9984436215266912.1553419444",
                32,"couc...@couch03.int.test",32,
                "{{rexi_DOWN,{'couc...@couch03.int.test ',noconnect}},[{mem3_rpc,rexi_call,2,[{file,[115,114,99,47,109,101,109,51,95,114,112,99,46,101,114,108]},{line,349}]},{mem3_rep,calculate_start_seq,1,[{file,[115,114,99,47,109,101,109,51,95,114,101,112,46,101,114,108]},{line,349}]},{mem3_rep,repl,1,[{file,[115,114,99,47,109,101,109,51,95,114,101,112,46,101,114,108]},{line,218}]},{mem3_rep,go,1,[{file,[115,114,99,47,109,101,109,51,95,114,101,112,46,101,114,108]},{line,83}]},{mem3_sync,'-start_push_replication/1-fun-0-',2,[{file,[115,114,99,47,109,101,109,51,95,115,121,110,99,46,101,114,108]},{line,211}]}]}"],
               "--------",
               ["2019",45,["0",51],33,"27",44,"13",55,"55",58|...]}}},
            {'$gen_cast',{log,{log_entry,error,emulator,
               [11,23,11,111,114,32,66,110,32,112,114|...],
               "--------",
               ["2019",45,["0",51],45,"27",84,"13",58,[...]|...]}}},
            {'$gen_cast',{log,{log_entry,error,emulator,
               [69,11,11,111,21,32,105,110,32,112|...],
               "--------",
               ["2019",45,["0",51],45,"27",84,"13",58|...]}}},
            {'$gen_cast',{log,{log_entry,warning,<0.295.0>,
               ["mem3_sync",32,
                "shards/80000000-9fffffff/2019-db9984436215260065.1553417487",
                32,"couc...@couch03.int.test",32,
                "{{rexi_DOWN,{'couc...@couch03.int.test ',noconnect}},[{mem3_rpc,rexi_call,2,[{file,[115,114,99,47,109,101,109,51,95,114,112,99,46,101,114,108]},{line,349}]},{mem3_rep,calculate_start_seq,1,[{file,[115,114,99,47,109,101,109,51,95,114,101,112,46,101,114,108]},{line,349}]},{mem3_rep,repl,1,[{file,[115,114,99,47,109,101,109,51,95,114,101,112,46,101,114,108]},{line,218}]},{mem3_rep,go,1,[{file,[115,114,99,47,109,101,109,51,95,114,101,112,46,101,114,108]},{line,83}]},{mem3_sync,'-start_push_replication/1-fun-0-',2,[{file,[115,114,99,47,109,101,109,51,95,115,121,110,99,46,101,114,108]},{line,211}]}]}"],
               "--------",
               ["2019",45,["0",51],11,"27",84,[...]|...]}}},
            {'$gen_cast',{log,{log_entry,error,emulator,
               [11,11,21,21,114,32,105,110|...],
               "--------",
               ["2019",45,["0",51],11,"27",84|...]}}},
            {'$gen_cast',{log,{log_entry,error,emulator,
               [69,11,22,21,22,22,105|...],
               "--------",
               ["2019",45,["0",51],11,[...]|...]}}},
            {'$gen_cast',{log,{log_entry,warning,<0.295.0>,
               ["mem3_sync",32,
                "shards/60000000-7fffffff/2019-db9984436215262757.1553420117",
                32,"couc...@couch03.int.test ",32|...],
               "--------",
               ["2019",45,[[...]|...],45|...]}}},
            {'$gen_cast',{log,{log_entry,warning,<0.295.0>,
               ["mem3_sync",32,
                "shards/a0000000-bfffffff/2019-db9984436215262165.1553420117",
                32,[...]|...],
               "--------",
               ["2019",45,[...]|...]}}},
            {'$gen_cast',{log,{log_entry,error,<0.22440.52>,
               [69,11,11,111|...],
               "--------",
               ["2019",45|...]}}},
            {'$gen_cast',{log,{log_entry,error,<0.23276.52>,
               [69,11,11|...],
               "--------",
               [[...]|...]}}},
            {'$gen_cast',{log,{log_entry,error,<0.23176.52>,
               [69,11|...],
               "--------",
               [...]}}},
            {'$gen_cast',{log,{log_entry,error,<0.23301.52>,
               [69|...],
               [...],...}}},
            {'$gen_cast',{log,{log_entry,warning,<0.295.0>,[...],...}}},
            {'$gen_cast',{log,{log_entry,error,emulator,...}}},
            {'$gen_cast',{log,{log_entry,error,...}}},
            {'$gen_cast',{log,{log_entry,...}}},
            {'$gen_cast',{log,{...}}},
            {'$gen_cast',{log,...}},
            {'$gen_cast',{...}},
            {'$gen_cast',...},
            {...}|...]},
 {links,[<0.122.0>,#Port<0.2146>]},
 {dictionary,[{'$initial_call',{couch_log_server,init,1}},
              {'$ancestors',[couch_log_sup,<0.121.0>]}]},
 {trap_exit,true},
 {error_handler,error_handler},
 {priority,normal},
 {group_leader,<0.120.0>},
 {total_heap_size,8370},
 {heap_size,1598},
 {stack_size,29},
 {reductions,8572696344},
 {garbage_collection,[{max_heap_size,#{error_logger => true,kill => true,size => 0}},
                      {min_bin_vheap_size,46422},
                      {min_heap_size,233},
                      {fullsweep_after,65535},
                      {minor_gcs,3768}]},
 {suspending,[]}]

On Thu, Mar 21, 2019 at 9:36 PM Joan Touzet <woh...@apache.org> wrote:

> There was a proposal to do so, but there was push back from some of the
> lead committers, who felt that if you are smart enough to use these, you
> are also smart enough to build CouchDB from scratch and add them into
> your distribution via editing rebar.config.script directly and running
> `make release`. :)
>
> -Joan
>
> ----- Original Message -----
> > From: "Vladimir Ralev" <vladimir.ra...@gmail.com>
> > To: user@couchdb.apache.org, woh...@apache.org
> > Sent: Thursday, 21 March, 2019 3:30:21 PM
> > Subject: Re: Leaking memory in logger process couch 2.3.1
> >
> > Got it. Thanks. I guess I will suggest to please consider including some
> > selfhealing/diagnostic tools in the distro - etop, observer, recon.
> >
> > On Thu, Mar 21, 2019 at 9:14 PM Joan Touzet <woh...@apache.org> wrote:
> >
> > > Not to pile on, but one other point: if you're logging to a slow volume,
> > > especially if it's the same volume that your CouchDB data is on, the
> > > problem will be compounded.
> > >
> > > If you absolutely need debug-level logging to be on for a while, a better
> > > approach is to use the syslog-udp backend for the Couch logger, and have
> > > it send the data to a second machine running rsyslogd for collection.
> > >
> > > If you can't do that, at least get another drive/spindle/virtual volume
> > > with more bandwidth and log to that, local SSD/NVMe preferred.
> > >
> > > -Joan
> > >
> > > ----- Original Message -----
> > > > From: "Jan Lehnardt" <j...@apache.org>
> > > > To: user@couchdb.apache.org
> > > > Sent: Thursday, 21 March, 2019 2:43:23 PM
> > > > Subject: Re: Leaking memory in logger process couch 2.3.1
> > > >
> > > > In particular, if you have views, each design doc will cause a full
> > > > database scan to be dumped into the logs.
> > > >
> > > > Cheers
> > > > Jan
> > > > —
> > > >
> > > > > On 21. Mar 2019, at 19:40, Robert Newson <rnew...@apache.org> wrote:
> > > > >
> > > > > Hi,
> > > > >
> > > > > Eek. This queue should never get this big, it indicates that there
> > > > > is far too much logging traffic generated and your target (file or
> > > > > syslog server) can't take it. It looks like you have 'debug' level
> > > > > set which goes a long way to explaining it. I would return to the
> > > > > default level of 'notice' for a significant reduction in logging
> > > > > volume.
> > > > >
> > > > > --
> > > > > Robert Samuel Newson
> > > > > rnew...@apache.org
> > > > >
> > > > >> On Thu, 21 Mar 2019, at 18:34, Vladimir Ralev wrote:
> > > > >> Hello,
> > > > >>
> > > > >> I am testing couch 2.3.1 in various configurations and while loading
> > > > >> a high number of test DBs I notice a ton of memory being eaten at
> > > > >> some point and never recovered. More than 20 gigs and going into
> > > > >> swap, at which point I kill the machine.
> > > > >>
> > > > >> So I went into the remsh to see where the memory goes and it is the
> > > > >> logging process. Take a look at the message queue len 4671185:
> > > > >>
> > > > >> (couc...@couch01.int.test)65> MQSizes2 = lists:map(fun(A) -> {_,B} =
> > > > >> process_info(A,message_queue_len), {B,A} end, processes()).
> > > > >> (couc...@couch01.int.test)66> {_,BadProcess} =
> > > > >> hd(lists:reverse(lists:sort(MQSizes2))).
> > > > >> (couc...@couch01.int.test)67> process_info(BadProcess).
> > > > >> [{registered_name,couch_log_server},
> > > > >>  {current_function,{prim_file,drv_get_response,1}},
> > > > >>  {initial_call,{proc_lib,init_p,5}},
> > > > >>  {status,running},
> > > > >>  {message_queue_len,4671185},
> > > > >>  {messages,[{'$gen_cast',{log,{log_entry,debug,<0.8973.15>,
> > > > >>               [79,83,32,80,114,111,99,101,115,115,32,[...]|...],
> > > > >>               "--------",
> > > > >>               ["2019",45,["0",51],45,"21",84,["0",50],58,"40",58|...]}}},
> > > > >>             {'$gen_cast',{log,{log_entry,debug,<0.8973.15>,
> > > > >>               [79,83,32,80,114,111,99,101,115,115,32|...],
> > > > >>               "--------",
> > > > >>               ["2019",45,["0",51],45,"21",84,["0",50],58,[...]|...]}}},
> > > > >>             {'$gen_cast',{log,{log_entry,debug,<0.15949.9>,
> > > > >>               [79,83,32,80,114,111,99,101,115,115|...],
> > > > >>               "--------",
> > > > >>               ["2019",45,["0",51],45,"21",84,[[...]|...],58|...]}}},
> > > > >>             {'$gen_cast',{log,{log_entry,debug,<0.8971.15>,
> > > > >>               [79,83,32,80,114,111,99,101,115|...],
> > > > >>               "--------",
> > > > >>               ["2019",45,["0",51],45,"21",84,[...]|...]}}},
> > > > >>             {'$gen_cast',{log,{log_entry,debug,<0.9015.15>,
> > > > >>               [79,83,32,80,114,111,99,101|...],
> > > > >>               "--------",
> > > > >>               ["2019",45,["0",51],45,"21",84|...]}}},
> > > > >>             {'$gen_cast',{log,{log_entry,debug,<0.9015.15>,
> > > > >>               [79,83,32,80,114,111,99|...],
> > > > >>               "--------",
> > > > >>               ["2019",45,["0",51],45,[...]|...]}}},
> > > > >>             {'$gen_cast',{log,{log_entry,debug,<0.8973.15>,
> > > > >>               [79,83,32,80,114,111|...],
> > > > >>               "--------",
> > > > >>               ["2019",45,[[...]|...],45|...]}}},
> > > > >>             {'$gen_cast',{log,{log_entry,debug,<0.15949.9>,
> > > > >>               [79,83,32,80,114|...],
> > > > >>               "--------",
> > > > >>               ["2019",45,[...]|...]}}},
> > > > >>             {'$gen_cast',{log,{log_entry,debug,<0.8971.15>,
> > > > >>               [79,83,32,80|...],
> > > > >>               "--------",
> > > > >>               ["2019",45|...]}}},
> > > > >>             {'$gen_cast',{log,{log_entry,debug,<0.8973.15>,
> > > > >>               [79,83,32|...],
> > > > >>               "--------",
> > > > >>               [[...]|...]}}},
> > > > >>             {'$gen_cast',{log,{log_entry,debug,<0.15949.9>,
> > > > >>               [79,83|...],
> > > > >>               "--------",
> > > > >>               [...]}}},
> > > > >>             {'$gen_cast',{log,{log_entry,debug,<0.9015.15>,
> > > > >>               [79|...],
> > > > >>               [...],...}}},
> > > > >>             {'$gen_cast',{log,{log_entry,debug,<0.8971.15>,[...],...}}},
> > > > >>             {'$gen_cast',{log,{log_entry,debug,<0.8973.15>,...}}},
> > > > >>             {'$gen_cast',{log,{log_entry,debug,...}}},
> > > > >>             {'$gen_cast',{log,{log_entry,...}}},
> > > > >>             {'$gen_cast',{log,{...}}},
> > > > >>             {'$gen_cast',{log,...}},
> > > > >>             {'$gen_cast',{...}},
> > > > >>             {'$gen_cast',...},
> > > > >>             {...}|...]},
> > > > >>  {links,[<0.122.0>,#Port<0.2149>]},
> > > > >>  {dictionary,[{'$initial_call',{couch_log_server,init,1}},
> > > > >>               {'$ancestors',[couch_log_sup,<0.121.0>]}]},
> > > > >>  {trap_exit,true},
> > > > >>  {error_handler,error_handler},
> > > > >>  {priority,normal},
> > > > >>  {group_leader,<0.120.0>},
> > > > >>  {total_heap_size,10957},
> > > > >>  {heap_size,4185},
> > > > >>  {stack_size,29},
> > > > >>  {reductions,292947037857},
> > > > >>  {garbage_collection,[{max_heap_size,#{error_logger => true,kill => true,size => 0}},
> > > > >>                       {min_bin_vheap_size,46422},
> > > > >>                       {min_heap_size,233},
> > > > >>                       {fullsweep_after,65535},
> > > > >>                       {minor_gcs,591}]},
> > > > >>  {suspending,[]}]
> > > > >>
> > > > >> This last line took 1 hour to finish because it was dumping the whole
> > > > >> mailbox into swap once again.
> > > > >>
> > > > >> I can see I have debug logs enabled, which exaggerates the problem,
> > > > >> but I am assuming this can happen with any log level over time. Is
> > > > >> this known behaviour and do you have any suggestions?
> > > > >>
> > > > >
> > > >
> > >
> >
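P.S. for anyone else who lands on this thread: the changes Robert and Joan
suggest above (drop back to notice level and push logs over syslog UDP to a
separate box) would look roughly like this in local.ini. The host below is a
placeholder for whatever machine runs your rsyslogd collector:

[log]
level = notice
writer = syslog
; placeholder address of the rsyslogd collector
syslog_host = 10.0.0.99
syslog_port = 514

That also keeps log writes off the CouchDB data volume entirely.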