Hey guys, a little follow-up.

It was very easy to reproduce this issue even with warning-level
logging: 372076 messages stuck. With 50K DBs on 4 nodes and n=3, if you
freshly restart the machines and then quickly kill two of them, the
logs spin very fast at error level, at least 5 lines per DB shard
within seconds, probably more. Not nearly as bad as before, but it
still makes the whole thing unresponsive for a long time. And yes, disk
IO goes to an SSD. I think it would be good design to expire messages
here once the queue gets too long, say more than 10K.
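
Something like this is what I have in mind -- just a sketch, not the
actual couch_log_server code; bump_dropped/1 and do_write/2 are
stand-in helpers:

    -define(MAX_QUEUE, 10000).

    handle_cast({log, Entry}, State) ->
        %% Check our own mailbox length before doing any file IO.
        {message_queue_len, Len} =
            erlang:process_info(self(), message_queue_len),
        case Len > ?MAX_QUEUE of
            %% Writer can't keep up: drop the entry and count it,
            %% instead of letting the queue (and memory) grow forever.
            true  -> {noreply, bump_dropped(State)};
            false -> {noreply, do_write(Entry, State)}
        end.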

(couc...@couch01.int.test)3> process_info(BadProcess).
[{registered_name,couch_log_server},
 {current_function,{prim_file,drv_get_response,1}},
 {initial_call,{proc_lib,init_p,5}},
 {status,running},
 {message_queue_len,372076},
 {messages,[{'$gen_cast',{log,{log_entry,warning,<0.295.0>,
                                         ["mem3_sync",32,
                                          "shards/80000000-9fffffff/2019-db9984436215266912.1553419444",
                                          32,"couc...@couch03.int.test",32,
                                          "{{rexi_DOWN,{'couc...@couch03.int.test',noconnect}},[{mem3_rpc,rexi_call,2,[{file,[115,114,99,47,109,101,109,51,95,114,112,99,46,101,114,108]},{line,349}]},{mem3_rep,calculate_start_seq,1,[{file,[115,114,99,47,109,101,109,51,95,114,101,112,46,101,114,108]},{line,349}]},{mem3_rep,repl,1,[{file,[115,114,99,47,109,101,109,51,95,114,101,112,46,101,114,108]},{line,218}]},{mem3_rep,go,1,[{file,[115,114,99,47,109,101,109,51,95,114,101,112,46,101,114,108]},{line,83}]},{mem3_sync,'-start_push_replication/1-fun-0-',2,[{file,[115,114,99,47,109,101,109,51,95,115,121,110,99,46,101,114,108]},{line,211}]}]}"],
                                         "--------",
                                         ["2019",45,["0",51],45,"27",84,"13",58,"55",58|...]}}},
            {'$gen_cast',{log,{log_entry,error,emulator,
                                         [69,114,114,111,114,32,105,110,32,112,114|...],
                                         "--------",
                                         ["2019",45,["0",51],45,"27",84,"13",58,[...]|...]}}},
            {'$gen_cast',{log,{log_entry,error,emulator,
                                         [69,114,114,111,114,32,105,110,32,112|...],
                                         "--------",
                                         ["2019",45,["0",51],45,"27",84,"13",58|...]}}},
            {'$gen_cast',{log,{log_entry,warning,<0.295.0>,
                                         ["mem3_sync",32,
                                          "shards/80000000-9fffffff/2019-db9984436215260065.1553417487",
                                          32,"couc...@couch03.int.test",32,
                                          "{{rexi_DOWN,{'couc...@couch03.int.test',noconnect}},[{mem3_rpc,rexi_call,2,[{file,[115,114,99,47,109,101,109,51,95,114,112,99,46,101,114,108]},{line,349}]},{mem3_rep,calculate_start_seq,1,[{file,[115,114,99,47,109,101,109,51,95,114,101,112,46,101,114,108]},{line,349}]},{mem3_rep,repl,1,[{file,[115,114,99,47,109,101,109,51,95,114,101,112,46,101,114,108]},{line,218}]},{mem3_rep,go,1,[{file,[115,114,99,47,109,101,109,51,95,114,101,112,46,101,114,108]},{line,83}]},{mem3_sync,'-start_push_replication/1-fun-0-',2,[{file,[115,114,99,47,109,101,109,51,95,115,121,110,99,46,101,114,108]},{line,211}]}]}"],
                                         "--------",
                                         ["2019",45,["0",51],45,"27",84,[...]|...]}}},
            {'$gen_cast',{log,{log_entry,error,emulator,
                                         [69,114,114,111,114,32,105,110|...],
                                         "--------",
                                         ["2019",45,["0",51],45,"27",84|...]}}},
            {'$gen_cast',{log,{log_entry,error,emulator,
                                         [69,114,114,111,114,32,105|...],
                                         "--------",
                                         ["2019",45,["0",51],45,[...]|...]}}},
            {'$gen_cast',{log,{log_entry,warning,<0.295.0>,
                                         ["mem3_sync",32,
                                          "shards/60000000-7fffffff/2019-db9984436215262757.1553420117",
                                          32,"couc...@couch03.int.test",32|...],
                                         "--------",
                                         ["2019",45,[[...]|...],45|...]}}},
            {'$gen_cast',{log,{log_entry,warning,<0.295.0>,
                                         ["mem3_sync",32,
                                          "shards/a0000000-bfffffff/2019-db9984436215262165.1553420117",
                                          32,
                                          [...]|...],
                                         "--------",
                                         ["2019",45,[...]|...]}}},
            {'$gen_cast',{log,{log_entry,error,<0.22440.52>,
                                         [69,114,114,111|...],
                                         "--------",
                                         ["2019",45|...]}}},
            {'$gen_cast',{log,{log_entry,error,<0.23276.52>,
                                         [69,114,114|...],
                                         "--------",
                                         [[...]|...]}}},
            {'$gen_cast',{log,{log_entry,error,<0.23176.52>,
                                         [69,114|...],
                                         "--------",
                                         [...]}}},
            {'$gen_cast',{log,{log_entry,error,<0.23301.52>,
                                         [69|...],
                                         [...],...}}},
            {'$gen_cast',{log,{log_entry,warning,<0.295.0>,[...],...}}},
            {'$gen_cast',{log,{log_entry,error,emulator,...}}},
            {'$gen_cast',{log,{log_entry,error,...}}},
            {'$gen_cast',{log,{log_entry,...}}},
            {'$gen_cast',{log,{...}}},
            {'$gen_cast',{log,...}},
            {'$gen_cast',{...}},
            {'$gen_cast',...},
            {...}|...]},
 {links,[<0.122.0>,#Port<0.2146>]},
 {dictionary,[{'$initial_call',{couch_log_server,init,1}},
              {'$ancestors',[couch_log_sup,<0.121.0>]}]},
 {trap_exit,true},
 {error_handler,error_handler},
 {priority,normal},
 {group_leader,<0.120.0>},
 {total_heap_size,8370},
 {heap_size,1598},
 {stack_size,29},
 {reductions,8572696344},
 {garbage_collection,[{max_heap_size,#{error_logger => true,kill => true,size => 0}},
                      {min_bin_vheap_size,46422},
                      {min_heap_size,233},
                      {fullsweep_after,65535},
                      {minor_gcs,3768}]},
 {suspending,[]}]
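
For reference, just the queue length can be fetched cheaply with
process_info/2 -- unlike process_info/1 above, which copies the whole
mailbox out (that is what took an hour in my earlier mail):

(couc...@couch01.int.test)4> process_info(whereis(couch_log_server), message_queue_len).
{message_queue_len,372076}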

On Thu, Mar 21, 2019 at 9:36 PM Joan Touzet <woh...@apache.org> wrote:

> There was a proposal to do so, but there was pushback from some of the
> lead committers, who felt that if you are smart enough to use these, you
> are also smart enough to build CouchDB from scratch and add them into
> your distribution via editing rebar.config.script directly and running
> `make release`.  :)
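>
> (The change is roughly of this shape -- illustrative only; the exact
> entry format in rebar.config.script's dependency list and the tag
> below will vary by CouchDB version:
>
>     {recon, "recon", {tag, "2.4.0"}},
>
> added alongside the other deps, then `make release`.)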
>
> -Joan
>
> ----- Original Message -----
> > From: "Vladimir Ralev" <vladimir.ra...@gmail.com>
> > To: user@couchdb.apache.org, woh...@apache.org
> > Sent: Thursday, 21 March, 2019 3:30:21 PM
> > Subject: Re: Leaking memory in logger process couch 2.3.1
> >
> > Got it. Thanks. I guess I will suggest: please consider including
> > some self-healing/diagnostic tools in the distro - etop, observer,
> > recon.
> >
> > On Thu, Mar 21, 2019 at 9:14 PM Joan Touzet <woh...@apache.org>
> > wrote:
> >
> > > Not to pile on, but one other point: if you're logging to a slow
> > > volume, especially if it's the same volume that your CouchDB data
> > > is on, the problem will be compounded.
> > >
> > > If you absolutely need debug-level logging to be on for a while, a
> > > better approach is to use the syslog-udp backend for the Couch
> > > logger, and have it send the data to a second machine running
> > > rsyslogd for collection.
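> > >
> > > With the stock couch_log writers that is roughly the following in
> > > local.ini (the host and port below are placeholders for your
> > > collector box):
> > >
> > >     [log]
> > >     writer = syslog
> > >     syslog_host = rsyslog.example.int
> > >     syslog_port = 514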
> > >
> > > If you can't do that, at least get another drive/spindle/virtual
> > > volume with more bandwidth and log to that, local SSD/NVMe
> > > preferred.
> > >
> > > -Joan
> > >
> > > ----- Original Message -----
> > > > From: "Jan Lehnardt" <j...@apache.org>
> > > > To: user@couchdb.apache.org
> > > > Sent: Thursday, 21 March, 2019 2:43:23 PM
> > > > Subject: Re: Leaking memory in logger process couch 2.3.1
> > > >
> > > > In particular, if you have views, each design doc will cause a
> > > > full database scan to be dumped into the logs.
> > > >
> > > > Cheers
> > > > Jan
> > > > —
> > > >
> > > > > On 21. Mar 2019, at 19:40, Robert Newson <rnew...@apache.org>
> > > > > wrote:
> > > > >
> > > > > Hi,
> > > > >
> > > > > Eek. This queue should never get this big; it indicates that
> > > > > far too much logging traffic is being generated and your target
> > > > > (file or syslog server) can't take it. It looks like you have
> > > > > 'debug' level set, which goes a long way to explaining it. I
> > > > > would return to the default level of 'notice' for a significant
> > > > > reduction in logging volume.
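> > > > >
> > > > > That's a one-line change in local.ini:
> > > > >
> > > > >     [log]
> > > > >     level = notice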
> > > > >
> > > > > --
> > > > >  Robert Samuel Newson
> > > > >  rnew...@apache.org
> > > > >
> > > > >> On Thu, 21 Mar 2019, at 18:34, Vladimir Ralev wrote:
> > > > >> Hello,
> > > > >>
> > > > >> I am testing couch 2.3.1 in various configurations, and while
> > > > >> loading a high number of test DBs I notice a ton of memory
> > > > >> being eaten at some point and never recovered. More than 20
> > > > >> gigs, and going into swap, at which point I kill the machine.
> > > > >>
> > > > >> So I went into the remsh to see where the memory goes, and it
> > > > >> is the logging process. Take a look at the message_queue_len
> > > > >> of 4671185:
> > > > >>
> > > > >> (couc...@couch01.int.test)65> MQSizes2 = lists:map(fun(A) ->
> > > > >> {_,B}
> > > > >> =
> > > > >> process_info(A,message_queue_len), {B,A} end, processes()).
> > > > >> (couc...@couch01.int.test)66> {_,BadProcess} =
> > > > >> hd(lists:reverse(lists:sort(MQSizes2))).
> > > > >> (couc...@couch01.int.test)67> process_info(BadProcess).
> > > > >> [{registered_name,couch_log_server},
> > > > >> {current_function,{prim_file,drv_get_response,1}},
> > > > >> {initial_call,{proc_lib,init_p,5}},
> > > > >> {status,running},
> > > > >> {message_queue_len,4671185},
> > > > >> {messages,[{'$gen_cast',{log,{log_entry,debug,<0.8973.15>,
> > > > >>                                         [79,83,32,80,114,111,99,101,115,115,32,[...]|...],
> > > > >>                                         "--------",
> > > > >>                                         ["2019",45,["0",51],45,"21",84,["0",50],58,"40",58|...]}}},
> > > > >>            {'$gen_cast',{log,{log_entry,debug,<0.8973.15>,
> > > > >>                                         [79,83,32,80,114,111,99,101,115,115,32|...],
> > > > >>                                         "--------",
> > > > >>                                         ["2019",45,["0",51],45,"21",84,["0",50],58,[...]|...]}}},
> > > > >>            {'$gen_cast',{log,{log_entry,debug,<0.15949.9>,
> > > > >>                                         [79,83,32,80,114,111,99,101,115,115|...],
> > > > >>                                         "--------",
> > > > >>                                         ["2019",45,["0",51],45,"21",84,[[...]|...],58|...]}}},
> > > > >>            {'$gen_cast',{log,{log_entry,debug,<0.8971.15>,
> > > > >>                                         [79,83,32,80,114,111,99,101,115|...],
> > > > >>                                         "--------",
> > > > >>                                         ["2019",45,["0",51],45,"21",84,[...]|...]}}},
> > > > >>            {'$gen_cast',{log,{log_entry,debug,<0.9015.15>,
> > > > >>                                         [79,83,32,80,114,111,99,101|...],
> > > > >>                                         "--------",
> > > > >>                                         ["2019",45,["0",51],45,"21",84|...]}}},
> > > > >>            {'$gen_cast',{log,{log_entry,debug,<0.9015.15>,
> > > > >>                                         [79,83,32,80,114,111,99|...],
> > > > >>                                         "--------",
> > > > >>                                         ["2019",45,["0",51],45,[...]|...]}}},
> > > > >>            {'$gen_cast',{log,{log_entry,debug,<0.8973.15>,
> > > > >>                                         [79,83,32,80,114,111|...],
> > > > >>                                         "--------",
> > > > >>                                         ["2019",45,[[...]|...],45|...]}}},
> > > > >>            {'$gen_cast',{log,{log_entry,debug,<0.15949.9>,
> > > > >>                                         [79,83,32,80,114|...],
> > > > >>                                         "--------",
> > > > >>                                         ["2019",45,[...]|...]}}},
> > > > >>            {'$gen_cast',{log,{log_entry,debug,<0.8971.15>,
> > > > >>                                         [79,83,32,80|...],
> > > > >>                                         "--------",
> > > > >>                                         ["2019",45|...]}}},
> > > > >>            {'$gen_cast',{log,{log_entry,debug,<0.8973.15>,
> > > > >>                                         [79,83,32|...],
> > > > >>                                         "--------",
> > > > >>                                         [[...]|...]}}},
> > > > >>            {'$gen_cast',{log,{log_entry,debug,<0.15949.9>,
> > > > >>                                         [79,83|...],
> > > > >>                                         "--------",
> > > > >>                                         [...]}}},
> > > > >>            {'$gen_cast',{log,{log_entry,debug,<0.9015.15>,
> > > > >>                                         [79|...],
> > > > >>                                         [...],...}}},
> > > > >>            {'$gen_cast',{log,{log_entry,debug,<0.8971.15>,[...],...}}},
> > > > >>            {'$gen_cast',{log,{log_entry,debug,<0.8973.15>,...}}},
> > > > >>            {'$gen_cast',{log,{log_entry,debug,...}}},
> > > > >>            {'$gen_cast',{log,{log_entry,...}}},
> > > > >>            {'$gen_cast',{log,{...}}},
> > > > >>            {'$gen_cast',{log,...}},
> > > > >>            {'$gen_cast',{...}},
> > > > >>            {'$gen_cast',...},
> > > > >>            {...}|...]},
> > > > >> {links,[<0.122.0>,#Port<0.2149>]},
> > > > >> {dictionary,[{'$initial_call',{couch_log_server,init,1}},
> > > > >>              {'$ancestors',[couch_log_sup,<0.121.0>]}]},
> > > > >> {trap_exit,true},
> > > > >> {error_handler,error_handler},
> > > > >> {priority,normal},
> > > > >> {group_leader,<0.120.0>},
> > > > >> {total_heap_size,10957},
> > > > >> {heap_size,4185},
> > > > >> {stack_size,29},
> > > > >> {reductions,292947037857},
> > > > >> {garbage_collection,[{max_heap_size,#{error_logger => true,kill => true,size => 0}},
> > > > >>                      {min_bin_vheap_size,46422},
> > > > >>                      {min_heap_size,233},
> > > > >>                      {fullsweep_after,65535},
> > > > >>                      {minor_gcs,591}]},
> > > > >> {suspending,[]}]
> > > > >>
> > > > >> This last line took 1 hour to finish because it was dumping
> > > > >> the whole mailbox into swap once again.
> > > > >>
> > > > >> I can see I have debug logs enabled, which exaggerates the
> > > > >> problem, but I am assuming this can happen with any log level
> > > > >> over time. Is this known behaviour, and do you have any
> > > > >> suggestions?
> > > > >>
> > > >
> > > >
> > >
> >
>
