Hello, I've been running a 3-node Proxmox cluster with 4 Ceph OSDs in production for 3 years. As a test in preparation for moving the Ceph cluster network, I destroyed one of the 3 working monitors and tried to recreate it. Since destroying it, the new monitor refuses to join the cluster, even on the old network. I've already gone through all the steps in the "Troubleshooting monitors" section of the documentation.
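In case the exact steps matter, the destroy/recreate looked more or less like this (reconstructed from memory and from the docs; the monitor ID n3ceph, the temp paths and the pveceph wrapper are my best recollection, not an exact transcript):

  # remove the old monitor (Proxmox wrapper; plain 'ceph mon remove n3ceph' should be equivalent)
  pveceph mon destroy n3ceph

  # recreate attempt, following the manual procedure from the Ceph docs
  ceph mon getmap -o /tmp/monmap
  ceph auth get mon. -o /tmp/mon.keyring
  ceph-mon --mkfs -i n3ceph --monmap /tmp/monmap --keyring /tmp/mon.keyring
  chown -R ceph:ceph /var/lib/ceph/mon/ceph-n3ceph
  systemctl start ceph-mon@n3ceph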
The new monitor reports the following status, extracted via the admin socket (ceph --admin-daemon file.asok):

{ "name": "n3ceph", "rank": -1, "state": "probing", "election_epoch": 0, "quorum": [],
  "features": { "required_con": "2449958197560098820", "required_mon": [ "kraken", "luminous", "mimic", "osdmap-prune", "nautilus", "octopus", "pacific", "elector-pinging" ], "quorum_con": "0", "quorum_mon": [] },
  "outside_quorum": [], "extra_probe_peers": [], "sync_provider": [],
  "monmap": { "epoch": 6, "fsid": "5e60d0bb-33b4-42db-bbe7-7032c35ee605", "modified": "2023-03-31T11:54:44.616569Z", "created": "2019-12-02T13:50:38.097448Z", "min_mon_release": 16, "min_mon_release_name": "pacific", "election_strategy": 1, "disallowed_leaders: ": "", "stretch_mode": false, "tiebreaker_mon": "", "removed_ranks: ": "1",
    "features": { "persistent": [ "kraken", "luminous", "mimic", "osdmap-prune", "nautilus", "octopus", "pacific", "elector-pinging" ], "optional": [] },
    "mons": [
      { "rank": 0, "name": "node1", "public_addrs": { "addrvec": [ { "type": "v2", "addr": "10.100.100.1:3300", "nonce": 0 }, { "type": "v1", "addr": "10.100.100.1:6789", "nonce": 0 } ] }, "addr": "10.100.100.1:6789/0", "public_addr": "10.100.100.1:6789/0", "priority": 0, "weight": 0, "crush_location": "{}" },
      { "rank": 1, "name": "node2", "public_addrs": { "addrvec": [ { "type": "v2", "addr": "10.100.100.2:3300", "nonce": 0 }, { "type": "v1", "addr": "10.100.100.2:6789", "nonce": 0 } ] }, "addr": "10.100.100.2:6789/0", "public_addr": "10.100.100.2:6789/0", "priority": 0, "weight": 0, "crush_location": "{}" } ] },
  "feature_map": { "mon": [ { "features": "0x3f01cfbdfffdffff", "release": "luminous", "num": 1 } ] },
  "stretch_mode": false }

The mon_status of the quorum leader (node1) is as follows:

{ "name": "node1", "rank": 0, "state": "leader", "election_epoch": 340, "quorum": [ 0, 1 ], "quorum_age": 13090,
  "features": { "required_con": "2449958747317026820", "required_mon": [ "kraken", "luminous", "mimic", "osdmap-prune", "nautilus", "octopus", "pacific", "elector-pinging" ], "quorum_con": "4540138314316775423", "quorum_mon": [ "kraken", "luminous", "mimic", "osdmap-prune", "nautilus", "octopus", "pacific", "elector-pinging" ] },
  "outside_quorum": [], "extra_probe_peers": [], "sync_provider": [],
  "monmap": { "epoch": 6, "fsid": "5e60d0bb-33b4-42db-bbe7-7032c35ee605", "modified": "2023-03-31T11:54:44.616569Z", "created": "2019-12-02T13:50:38.097448Z", "min_mon_release": 16, "min_mon_release_name": "pacific", "election_strategy": 1, "disallowed_leaders: ": "", "stretch_mode": false, "tiebreaker_mon": "", "removed_ranks: ": "1",
    "features": { "persistent": [ "kraken", "luminous", "mimic", "osdmap-prune", "nautilus", "octopus", "pacific", "elector-pinging" ], "optional": [] },
    "mons": [
      { "rank": 0, "name": "node1", "public_addrs": { "addrvec": [ { "type": "v2", "addr": "10.100.100.1:3300", "nonce": 0 }, { "type": "v1", "addr": "10.100.100.1:6789", "nonce": 0 } ] }, "addr": "10.100.100.1:6789/0", "public_addr": "10.100.100.1:6789/0", "priority": 0, "weight": 0, "crush_location": "{}" },
      { "rank": 1, "name": "node2", "public_addrs": { "addrvec": [ { "type": "v2", "addr": "10.100.100.2:3300", "nonce": 0 }, { "type": "v1", "addr": "10.100.100.2:6789", "nonce": 0 } ] }, "addr": "10.100.100.2:6789/0", "public_addr": "10.100.100.2:6789/0", "priority": 0, "weight": 0, "crush_location": "{}" } ] },
  "feature_map": { "mon": [ { "features": "0x3f01cfbdfffdffff", "release": "luminous", "num": 1 } ], "osd": [ { "features": "0x3f01cfbdfffdffff", "release": "luminous", "num": 5 } ], "client": [ { "features": "0x2f018fb87aa4aafe", "release": "luminous", "num": 1 }, { "features": "0x3f01cfbdfffdffff", "release": "luminous", "num": 12 } ], "mgr": [ { "features": "0x3f01cfbdfffdffff", "release": "luminous", "num": 1 } ] },
  "stretch_mode": false }

I tried to get a debug log with 'ceph daemon mon.n3ceph config set debug_mon 10/10' and restarting the service, but the Ceph log file stopped working after I tried that setting. journalctl -u tells me:

mar 31 17:35:22 node3 ceph-mon[240916]: 2023-03-31T17:35:22.926+0200 7f49e0699700 -1 mon.n3ceph@-1(probing) e6 get_health_metrics reporting 4 slow ops, oldest is log(1 entries from seq 1 at 2023-03-31T17:30:19.347379+0200)
mar 31 17:35:27 node3 ceph-mon[240916]: 2023-03-31T17:35:27.926+0200 7f49e0699700 -1 mon.n3ceph@-1(probing) e6 get_health_metrics reporting 4 slow ops, oldest is log(1 entries from seq 1 at 2023-03-31T17:30:19.347379+0200)
mar 31 17:35:32 node3 ceph-mon[240916]: 2023-03-31T17:35:32.926+0200 7f49e0699700 -1 mon.n3ceph@-1(probing) e6 get_health_metrics reporting 4 slow ops, oldest is log(1 entries from seq 1 at 2023-03-31T17:30:19.347379+0200)

Any ideas? The cluster is running fine with two monitors, but a reboot of one of the nodes could be a big problem.

Kind regards and many thanks.