Oh god

root@bhs1-mail03-ds03:~# zgrep "lease" /var/log/ceph/*.gz
/var/log/ceph/ceph-mon.bhs1-mail03-ds03.log.2.gz:2017-08-24 06:39:22.384112 7f44c60f1700  1 mon.bhs1-mail03-ds03@2(peon).paxos(paxos updating c 8973251..8973960) lease_timeout -- calling new election
/var/log/ceph/ceph-mon.bhs1-mail03-ds03.log.2.gz:2017-08-24 06:39:37.985416 7f44c60f1700  1 mon.bhs1-mail03-ds03@2(peon).paxos(paxos updating c 8973251..8973960) lease_timeout -- calling new election
/var/log/ceph/ceph-mon.bhs1-mail03-ds03.log.3.gz:2017-08-22 06:43:46.394519 7f44c60f1700  1 mon.bhs1-mail03-ds03@2(peon).paxos(paxos updating c 8715223..8715882) lease_timeout -- calling new election
/var/log/ceph/ceph-mon.bhs1-mail03-ds03.log.3.gz:2017-08-22 06:44:00.518659 7f44c60f1700  1 mon.bhs1-mail03-ds03@2(peon).paxos(paxos updating c 8715223..8715882) lease_timeout -- calling new election
/var/log/ceph/ceph-mon.bhs1-mail03-ds03.log.3.gz:2017-08-22 06:44:16.553480 7f44c60f1700  1 mon.bhs1-mail03-ds03@2(peon).paxos(paxos updating c 8715223..8715883) lease_timeout -- calling new election
/var/log/ceph/ceph-mon.bhs1-mail03-ds03.log.3.gz:2017-08-22 06:44:42.974001 7f44c60f1700  1 mon.bhs1-mail03-ds03@2(peon).paxos(paxos recovering c 8715223..8715890) lease_timeout -- calling new election
/var/log/ceph/ceph-mon.bhs1-mail03-ds03.log.4.gz:2017-08-22 06:40:42.051277 7f44c60f1700  1 mon.bhs1-mail03-ds03@2(peon).paxos(paxos updating c 8715223..8715842) lease_timeout -- calling new election
/var/log/ceph/ceph-mon.bhs1-mail03-ds03.log.4.gz:2017-08-22 06:41:23.900840 7f44c60f1700  1 mon.bhs1-mail03-ds03@2(peon).paxos(paxos updating c 8715223..8715851) lease_timeout -- calling new election
/var/log/ceph/ceph-mon.bhs1-mail03-ds03.log.4.gz:2017-08-22 06:42:55.159764 7f44c60f1700  1 mon.bhs1-mail03-ds03@2(peon).paxos(paxos recovering c 8715223..8715872) lease_timeout -- calling new election
/var/log/ceph/ceph-mon.bhs1-mail03-ds03.log.7.gz:2017-08-19 06:35:54.072809 7f44c60f1700  1 mon.bhs1-mail03-ds03@2(peon).paxos(paxos updating c 8317388..8318114) lease_timeout -- calling new election
/var/log/ceph/ceph-mon.bhs1-mail03-ds03.log.7.gz:2017-08-19 06:36:37.616807 7f44c60f1700  1 mon.bhs1-mail03-ds03@2(peon).paxos(paxos recovering c 8317388..8318126) lease_timeout -- calling new election
/var/log/ceph/ceph-mon.bhs1-mail03-ds03.log.7.gz:2017-08-19 06:37:03.650974 7f44c60f1700  1 mon.bhs1-mail03-ds03@2(peon).paxos(paxos recovering c 8317388..8318132) lease_timeout -- calling new election
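
For context: lease_timeout on a peon means the lease handed out by the leader
(mon_lease, 5 seconds by default) expired without being renewed, so the peon
forces a new election. Next time it happens I want to check whether the leader
mon was stalled around that moment (busy disk, leveldb compaction, clock skew).
A rough sketch of what I plan to run -- option names can vary between releases,
so treat the exact flags and values as examples, not a recommendation:

  ceph daemon mon.bhs1-mail03-ds03 config show | grep -i lease   # what this mon runs with
  ceph health detail                                             # skew / quorum warnings
  ceph time-sync-status                                          # if this release has it
  ceph tell mon.* injectargs '--mon-lease 10'                    # temporary, example value only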



Regards,

Webert Lima
DevOps Engineer at MAV Tecnologia
*Belo Horizonte - Brasil*

On Mon, Aug 21, 2017 at 10:34 AM, Webert de Souza Lima <
webert.b...@gmail.com> wrote:

> I really need some help getting through this.
>
> This is happening very frequently and I can't seem to figure out why.
> My services rely on cephfs and when this happens, the mds suicides.
>
> It's always the same, see the last occurrence logs:
>
> host bhs1-mail03-ds03:
>
> 2017-08-19 06:35:54.072809 7f44c60f1700  1 mon.bhs1-mail03-ds03@2(peon).paxos(paxos updating c 8317388..8318114) lease_timeout -- calling new election
> 2017-08-19 06:35:54.073777 7f44c58f0700  0 log_channel(cluster) log [INF] : mon.bhs1-mail03-ds03 calling new monitor election
>
> host bhs1-mail03-ds02:
>
> 2017-08-19 06:35:54.163225 7fa963d5d700  0 log_channel(cluster) log [INF] : mon.bhs1-mail03-ds02 calling new monitor election
> 2017-08-19 06:35:54.163373 7fa963d5d700  1 mon.bhs1-mail03-ds02@1(electing).elector(59) init, last seen epoch 59
> 2017-08-19 06:35:56.066938 7fa96455e700  0 mon.bhs1-mail03-ds02@1(electing).data_health(58) update_stats avail 76% total 19555 MB, used 3553 MB, avail 14986 MB
> 2017-08-19 06:35:59.655130 7fa96455e700  0 log_channel(cluster) log [INF] : mon.bhs1-mail03-ds02@1 won leader election with quorum 1,2
> 2017-08-19 06:36:00.087081 7fa96455e700  0 log_channel(cluster) log [INF] : HEALTH_WARN; 1 mons down, quorum 1,2 bhs1-mail03-ds02,bhs1-mail03-ds03
> 2017-08-19 06:36:00.456332 7fa965f7f700  0 log_channel(cluster) log [INF] : monmap e2: 3 mons at {bhs1-mail03-ds01=10.0.0.101:6789/0,bhs1-mail03-ds02=10.0.0.102:6789/0,bhs1-mail03-ds03=10.0.0.103:6789/0}
> 2017-08-19 06:36:00.456511 7fa965f7f700  0 log_channel(cluster) log [INF] : pgmap v4088049: 1664 pgs: 1 active+clean+scrubbing+deep, 1663 active+clean; 6840 GB data, 20295 GB used, 62641 GB / 82937 GB avail; 1019 kB/s rd, 142 kB/s wr, 24 op/s
> 2017-08-19 06:36:00.456588 7fa965f7f700  0 log_channel(cluster) log [INF] : fsmap e1366: 1/1/1 up {0=bhs1-mail03-ds02=up:active}, 1 up:standby-replay
>
>
> host bhs1-mail03-ds01:
>
> 2017-08-19 06:36:09.995951 7f94e861c700  0 log_channel(cluster) log [INF] : mon.bhs1-mail03-ds01 calling new monitor election
> 2017-08-19 06:36:09.996005 7f94e861c700  1 mon.bhs1-mail03-ds01@0(electing).elector(59) init, last seen epoch 59
> 2017-08-19 06:36:17.653441 7f94e861c700  0 log_channel(cluster) log [INF] : mon.bhs1-mail03-ds01 calling new monitor election
> 2017-08-19 06:36:17.653505 7f94e861c700  1 mon.bhs1-mail03-ds01@0(electing).elector(61) init, last seen epoch 61
> 2017-08-19 06:36:27.603721 7f94e861c700  0 log_channel(cluster) log [INF] : mon.bhs1-mail03-ds01@0 won leader election with quorum 0,1,2
> 2017-08-19 06:36:28.579178 7f94e861c700  0 log_channel(cluster) log [INF] : HEALTH_OK
>
> the mds host (bhs1-mail03-ds02):
> 2017-08-19 06:36:06.858100 7f9295a76700  0 -- 10.0.0.102:6801/267422746 >> 10.0.0.103:0/3970353996 pipe(0x55b450c7c000 sd=19 :6801 s=2 pgs=5396 cs=1 l=0 c=0x55b4526de000).fault with nothing to send, going to standby
> 2017-08-19 06:36:06.936091 7f9295874700  0 -- 10.0.0.102:6801/267422746 >> 10.0.0.110:0/3302883508 pipe(0x55b450c7e800 sd=20 :6801 s=2 pgs=38167 cs=1 l=0 c=0x55b4526de300).fault with nothing to send, going to standby
> 2017-08-19 06:36:06.936122 7f9294e6a700  0 -- 10.0.0.102:6801/267422746 >> 10.0.0.101:0/2088724837 pipe(0x55b45cef0800 sd=22 :6801 s=2 pgs=360 cs=1 l=0 c=0x55b45daf3b00).fault with nothing to send, going to standby
> 2017-08-19 06:36:06.936169 7f9295672700  0 -- 10.0.0.102:6801/267422746 >> 10.0.0.111:0/2802182284 pipe(0x55b450c7d400 sd=21 :6801 s=2 pgs=30804 cs=1 l=0 c=0x55b4526de180).fault with nothing to send, going to standby
> 2017-08-19 06:36:11.412799 7f929c788700  1 mds.bhs1-mail03-ds02 handle_mds_map i (10.0.0.102:6801/267422746) dne in the mdsmap, respawning myself
> 2017-08-19 06:36:11.412808 7f929c788700  1 mds.bhs1-mail03-ds02 respawn
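>
> Since the mds seems to get dropped while the mons are busy re-electing, one
> stop-gap I'm considering is raising the beacon grace so a short election
> doesn't get the mds removed from the map. A rough sketch only (the default is
> 15s as far as I know, and I'm not sure this is a fix rather than a workaround):
>
>   ceph daemon mon.bhs1-mail03-ds02 config get mds_beacon_grace
>   ceph tell mon.* injectargs '--mds-beacon-grace 60'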
>
>
>
>
> Regards,
>
> Webert Lima
> DevOps Engineer at MAV Tecnologia
> *Belo Horizonte - Brasil*
>
> On Wed, Aug 9, 2017 at 10:53 AM, Webert de Souza Lima <
> webert.b...@gmail.com> wrote:
>
>> Hi David,
>>
>> thanks for your feedback.
>>
>> With that in mind: I did rm a 15TB RBD pool about an hour or so before
>> this happened.
>> I wouldn't have thought it was related, because there was nothing
>> different going on after I removed it, not even high system load.
>>
>> But considering what you said, I think it could have been due to OSD
>> operations related to that pool removal.
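>>
>> If it happens again right after a big delete, here is a rough sketch of what
>> I could capture to confirm that theory (generic commands; osd.0 is just an
>> example id):
>>
>>   ceph -s                                 # overall cluster state at the time
>>   ceph osd perf                           # per-osd commit/apply latency
>>   ceph daemon osd.0 dump_historic_ops     # slowest recent ops on one osd
>>   iostat -x 5                             # raw disk utilisation on the mon/osd hosts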
>>
>>
>>
>>
>>
>>
>> Regards,
>>
>> Webert Lima
>> DevOps Engineer at MAV Tecnologia
>> *Belo Horizonte - Brasil*
>>
>> On Wed, Aug 9, 2017 at 10:15 AM, David Turner <drakonst...@gmail.com>
>> wrote:
>>
>>> I just want to point out that there are many different types of network
>>> issues that don't involve entire networks: a bad nic, a bad/loose cable, a
>>> service on a server restarting or modifying the network stack, etc.
>>>
>>> That said, there are other things that can prevent an mds service, or any
>>> service, from responding to the mons and being wrongly marked down. It happens
>>> to osds enough that they even have the ability to note in their logs that
>>> they were wrongly marked down. That usually happens when the service is so
>>> busy with an operation that it can't get to the request from the mon fast
>>> enough and it gets marked down. This could also be environmental on the mds
>>> server: if something else on the host is using too many resources,
>>> preventing the mds service from having what it needs, this could easily
>>> happen.
>>>
>>> What level of granularity do you have in your monitoring to tell what
>>> your system state was when this happened? Is there a time of day it is more
>>> likely to happen (expect to find a Cron at that time)?
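>>>
>>> For example, a generic sketch of what to look at around the incident time
>>> (assumes Ubuntu-style syslog and sysstat collecting history; adjust to your
>>> environment):
>>>
>>>   grep CRON /var/log/syslog | grep '06:3'    # cron jobs firing around 06:3x
>>>   sar -q -s 06:30:00 -e 06:45:00             # load average history
>>>   sar -d -s 06:30:00 -e 06:45:00             # per-device disk activity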
>>>
>>> On Wed, Aug 9, 2017, 8:37 AM Webert de Souza Lima <webert.b...@gmail.com>
>>> wrote:
>>>
>>>> Hi,
>>>>
>>>> I recently had an mds outage because the mds suicided due to "dne in the
>>>> mds map".
>>>> I've asked about it here before, and I know it happens because the monitors
>>>> took this mds out of the mds map even though it was alive.
>>>>
>>>> Weird thing: there were no network-related issues happening at the time,
>>>> and if there had been, they would have impacted many other systems.
>>>>
>>>> I found this in the mon logs, and I'd like to understand it better:
>>>>  lease_timeout -- calling new election
>>>>
>>>> full logs:
>>>>
>>>> 2017-08-08 23:12:33.286908 7f2b8398d700  1 leveldb: Manual compaction at level-1 from 'pgmap_pg\x009.a' @ 1830392430 : 1 .. 'paxos\x0057687834' @ 0 : 0; will stop at (end)
>>>>
>>>> 2017-08-08 23:12:36.885087 7f2b86f9a700  0 mon.bhs1-mail02-ds03@2(peon).data_health(3524) update_stats avail 81% total 19555 MB, used 2632 MB, avail 15907 MB
>>>> 2017-08-08 23:13:29.357625 7f2b86f9a700  1 mon.bhs1-mail02-ds03@2(peon).paxos(paxos updating c 57687834..57688383) lease_timeout -- calling new election
>>>> 2017-08-08 23:13:29.358965 7f2b86799700  0 log_channel(cluster) log [INF] : mon.bhs1-mail02-ds03 calling new monitor election
>>>> 2017-08-08 23:13:29.359128 7f2b86799700  1 mon.bhs1-mail02-ds03@2(electing).elector(3524) init, last seen epoch 3524
>>>> 2017-08-08 23:13:35.383530 7f2b86799700  1 mon.bhs1-mail02-ds03@2(peon).osd e12617 e12617: 19 osds: 19 up, 19 in
>>>> 2017-08-08 23:13:35.605839 7f2b86799700  0 mon.bhs1-mail02-ds03@2(peon).mds e18460 print_map
>>>> e18460
>>>> enable_multiple, ever_enabled_multiple: 0,0
>>>> compat: compat={},rocompat={},incompat={1=base v0.20,2=client
>>>> writeable ranges,3=default file layouts on dirs,4=dir inode in separate
>>>> object,5=mds uses versioned encoding,6=dirfrag is stored in omap,8=file
>>>> layout v2}
>>>>
>>>> Filesystem 'cephfs' (2)
>>>> fs_name cephfs
>>>> epoch   18460
>>>> flags   0
>>>> created 2016-08-01 11:07:47.592124
>>>> modified        2017-07-03 10:32:44.426431
>>>> tableserver     0
>>>> root    0
>>>> session_timeout 60
>>>> session_autoclose       300
>>>> max_file_size   1099511627776
>>>> last_failure    0
>>>> last_failure_osd_epoch  12617
>>>> compat  compat={},rocompat={},incompat={1=base v0.20,2=client
>>>> writeable ranges,3=default file layouts on dirs,4=dir inode in separate
>>>> object,5=mds uses versioned encoding,6=dirfrag is stored in omap,8=file
>>>> layout v2}
>>>> max_mds 1
>>>> in      0
>>>> up      {0=1574278}
>>>> failed
>>>> damaged
>>>> stopped
>>>> data_pools      8,9
>>>> metadata_pool   7
>>>> inline_data     disabled
>>>> 1574278:        10.0.2.4:6800/2556733458 'd' mds.0.18460 up:replay seq
>>>> 1 laggy since 2017-08-08 23:13:35.174109 (standby for rank 0)
>>>>
>>>>
>>>>
>>>> 2017-08-08 23:13:35.606303 7f2b86799700  0 log_channel(cluster) log [INF] : mon.bhs1-mail02-ds03 calling new monitor election
>>>> 2017-08-08 23:13:35.606361 7f2b86799700  1 mon.bhs1-mail02-ds03@2(electing).elector(3526) init, last seen epoch 3526
>>>> 2017-08-08 23:13:36.885540 7f2b86f9a700  0 mon.bhs1-mail02-ds03@2(peon).data_health(3528) update_stats avail 81% total 19555 MB, used 2636 MB, avail 15903 MB
>>>> 2017-08-08 23:13:38.311777 7f2b86799700  0 mon.bhs1-mail02-ds03@2(peon).mds e18461 print_map
>>>>
>>>>
>>>> Regards,
>>>>
>>>> Webert Lima
>>>> DevOps Engineer at MAV Tecnologia
>>>> *Belo Horizonte - Brasil*
>>>>
>>>
>>
>
_______________________________________________
ceph-users mailing list
ceph-users@lists.ceph.com
http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com
