Hello Ceph experts,

In the last day or so, we had a few nodes randomly reboot, and now unfound 
objects are reported in Ceph health during cluster recovery.

It appears that the objects in question are hit set objects, which I now 
cannot mark lost because Ceph cannot probe the relevant OSDs; those OSDs keep 
crashing due to the missing hit set objects.
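
For reference, the procedure I would normally follow is just the standard one, 
sketched here against PG 36.321b as an example; as I understand it, the 
mark-lost step cannot complete because the OSDs that might hold the object 
crash before they can be probed:

  # list the unfound objects in the PG (this part works; output is in [2] below)
  ceph pg 36.321b list_unfound

  # then mark them lost, either reverting or deleting; this is the step
  # that is blocked while potential sources remain unprobed
  ceph pg 36.321b mark_unfound_lost revert
  ceph pg 36.321b mark_unfound_lost delete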

Pasted below are the crash message[1] for osd.299 and some of the unfound 
objects[2], which are a sample of the hit set objects that are lost.

I would greatly appreciate any insight you may have on how to move forward. As 
of right now this cluster is inoperable due to 3 down PGs.
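
I can share the full output of the usual status queries if that would help, 
e.g.:

  # overall view of the down PGs and unfound objects
  ceph health detail

  # peering/recovery state of one of the affected PGs
  ceph pg 36.321b query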

Thanks,
Lincoln Bryant


[1]
   -4> 2020-02-26 22:26:29.455 7ff52edaa700  0 0x559587fa91e0 36.321b 
unexpected need for 
36:d84c0000:.ceph-internal::hit_set_36.321b_archive_2020-02-24 
21%3a15%3a16.792846_2020-02-24 21%3a15%3a32.457855:head have 1352209'2834660 
flags = none tried to add 1352209'2834660 flags = none
    -3> 2020-02-26 22:26:29.455 7ff52edaa700  0 0x559587fa91e0 36.321b 
unexpected need for 
36:d84c0000:.ceph-internal::hit_set_36.321b_archive_2020-02-24 
21%3a15%3a16.792846_2020-02-24 21%3a15%3a32.457855:head have 1352209'2834660 
flags = none tried to add 1359781'2835659 flags = delete
    -2> 2020-02-26 22:26:29.456 7ff53adc2700  3 osd.299 1367392 handle_osd_map 
epochs [1367392,1367392], i have 1367392, src has [1349017,1367392]
    -1> 2020-02-26 22:26:29.460 7ff52edaa700 -1 
/home/jenkins-build/build/workspace/ceph-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos7/DIST/centos7/MACHINE_SIZE/huge/release/14.2.7/rpm/el7/BUILD/ceph-14.2.7/src/osd/PG.h:
 In function 'void PG::MissingLoc::add_active_missing(const pg_missing_t&)' 
thread 7ff52edaa700 time 2020-02-26 22:26:29.457170
/home/jenkins-build/build/workspace/ceph-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos7/DIST/centos7/MACHINE_SIZE/huge/release/14.2.7/rpm/el7/BUILD/ceph-14.2.7/src/osd/PG.h:
 838: FAILED ceph_assert(i->second.need == j->second.need)

 ceph version 14.2.7 (3d58626ebeec02d8385a4cefb92c6cbc3a45bfe8) nautilus 
(stable)
 1: (ceph::__ceph_assert_fail(char const*, char const*, int, char 
const*)+0x14a) [0x55955fdafc0f]
 2: (()+0x4dddd7) [0x55955fdafdd7]
 3: (PG::MissingLoc::add_active_missing(pg_missing_set<false> const&)+0x1e0) 
[0x55955ffa0cb0]
 4: (PG::activate(ObjectStore::Transaction&, unsigned int, std::map<int, 
std::map<spg_t, pg_query_t, std::less<spg_t>, std::allocator<std::pair<spg_t 
const, pg_query_t> > >, std::less<int>, std::allocator<std::pair<int const, 
std::map<spg_t, pg_query_t, std::less<spg_t>, std::allocator<std::pair<spg_t 
const, pg_query_t> > > > > >&, std::map<int, std::vector<std::pair<pg_notify_t, 
PastIntervals>, std::allocator<std::pair<pg_notify_t, PastIntervals> > >, 
std::less<int>, std::allocator<std::pair<int const, 
std::vector<std::pair<pg_notify_t, PastIntervals>, 
std::allocator<std::pair<pg_notify_t, PastIntervals> > > > > >*, 
PG::RecoveryCtx*)+0x1916) [0x55955ff3f1e6]
 5: 
(PG::RecoveryState::Active::Active(boost::statechart::state<PG::RecoveryState::Active,
 PG::RecoveryState::Primary, PG::RecoveryState::Activating, 
(boost::statechart::history_mode)0>::my_context)+0x370) [0x55955ff62d20]
 6: (boost::statechart::simple_state<PG::RecoveryState::Peering, 
PG::RecoveryState::Primary, PG::RecoveryState::GetInfo, 
(boost::statechart::history_mode)0>::react_impl(boost::statechart::event_base 
const&, void const*)+0xfb) [0x55955ffa8d5b]
 7: (boost::statechart::state_machine<PG::RecoveryState::RecoveryMachine, 
PG::RecoveryState::Initial, std::allocator<void>, 
boost::statechart::null_exception_translator>::process_queued_events()+0x97) 
[0x55955ff88507]
 8: (PG::handle_activate_map(PG::RecoveryCtx*)+0x1a8) [0x55955ff75848]
 9: (OSD::advance_pg(unsigned int, PG*, ThreadPool::TPHandle&, 
PG::RecoveryCtx*)+0x61d) [0x55955feb161d]
 10: (OSD::dequeue_peering_evt(OSDShard*, PG*, std::shared_ptr<PGPeeringEvent>, 
ThreadPool::TPHandle&)+0xa6) [0x55955feb2d16]
 11: (PGPeeringItem::run(OSD*, OSDShard*, boost::intrusive_ptr<PG>&, 
ThreadPool::TPHandle&)+0x51) [0x55956011a481]
 12: (OSD::ShardedOpWQ::_process(unsigned int, 
ceph::heartbeat_handle_d*)+0x90f) [0x55955fea7bbf]
 13: (ShardedThreadPool::shardedthreadpool_worker(unsigned int)+0x5b6) 
[0x559560448976]
 14: (ShardedThreadPool::WorkThreadSharded::entry()+0x10) [0x55956044b490]
 15: (()+0x7e25) [0x7ff5669bae25]
 16: (clone()+0x6d) [0x7ff565a9a34d]

     0> 2020-02-26 22:26:29.465 7ff52edaa700 -1 *** Caught signal (Aborted) **
 in thread 7ff52edaa700 thread_name:tp_osd_tp

 ceph version 14.2.7 (3d58626ebeec02d8385a4cefb92c6cbc3a45bfe8) nautilus 
(stable)
 1: (()+0xf5e0) [0x7ff5669c25e0]
 2: (gsignal()+0x37) [0x7ff5659d71f7]
 3: (abort()+0x148) [0x7ff5659d88e8]
 4: (ceph::__ceph_assert_fail(char const*, char const*, int, char 
const*)+0x199) [0x55955fdafc5e]
 5: (()+0x4dddd7) [0x55955fdafdd7]
 6: (PG::MissingLoc::add_active_missing(pg_missing_set<false> const&)+0x1e0) 
[0x55955ffa0cb0]
 7: (PG::activate(ObjectStore::Transaction&, unsigned int, std::map<int, 
std::map<spg_t, pg_query_t, std::less<spg_t>, std::allocator<std::pair<spg_t 
const, pg_query_t> > >, std::less<int>, std::allocator<std::pair<int const, 
std::map<spg_t, pg_query_t, std::less<spg_t>, std::allocator<std::pair<spg_t 
const, pg_query_t> > > > > >&, std::map<int, std::vector<std::pair<pg_notify_t, 
PastIntervals>, std::allocator<std::pair<pg_notify_t, PastIntervals> > >, 
std::less<int>, std::allocator<std::pair<int const, 
std::vector<std::pair<pg_notify_t, PastIntervals>, 
std::allocator<std::pair<pg_notify_t, PastIntervals> > > > > >*, 
PG::RecoveryCtx*)+0x1916) [0x55955ff3f1e6]
 8: 
(PG::RecoveryState::Active::Active(boost::statechart::state<PG::RecoveryState::Active,
 PG::RecoveryState::Primary, PG::RecoveryState::Activating, 
(boost::statechart::history_mode)0>::my_context)+0x370) [0x55955ff62d20]
 9: (boost::statechart::simple_state<PG::RecoveryState::Peering, 
PG::RecoveryState::Primary, PG::RecoveryState::GetInfo, 
(boost::statechart::history_mode)0>::react_impl(boost::statechart::event_base 
const&, void const*)+0xfb) [0x55955ffa8d5b]
 10: (boost::statechart::state_machine<PG::RecoveryState::RecoveryMachine, 
PG::RecoveryState::Initial, std::allocator<void>, 
boost::statechart::null_exception_translator>::process_queued_events()+0x97) 
[0x55955ff88507]
 11: (PG::handle_activate_map(PG::RecoveryCtx*)+0x1a8) [0x55955ff75848]
 12: (OSD::advance_pg(unsigned int, PG*, ThreadPool::TPHandle&, 
PG::RecoveryCtx*)+0x61d) [0x55955feb161d]
 13: (OSD::dequeue_peering_evt(OSDShard*, PG*, std::shared_ptr<PGPeeringEvent>, 
ThreadPool::TPHandle&)+0xa6) [0x55955feb2d16]
 14: (PGPeeringItem::run(OSD*, OSDShard*, boost::intrusive_ptr<PG>&, 
ThreadPool::TPHandle&)+0x51) [0x55956011a481]
 15: (OSD::ShardedOpWQ::_process(unsigned int, 
ceph::heartbeat_handle_d*)+0x90f) [0x55955fea7bbf]
 16: (ShardedThreadPool::shardedthreadpool_worker(unsigned int)+0x5b6) 
[0x559560448976]
 17: (ShardedThreadPool::WorkThreadSharded::entry()+0x10) [0x55956044b490]
 18: (()+0x7e25) [0x7ff5669bae25]
 19: (clone()+0x6d) [0x7ff565a9a34d]
 NOTE: a copy of the executable, or `objdump -rdS <executable>` is needed to 
interpret this.

--- logging levels ---
   0/ 5 none
   0/ 1 lockdep
   0/ 1 context
   1/ 1 crush
   1/ 5 mds
   1/ 5 mds_balancer
   1/ 5 mds_locker
   1/ 5 mds_log
   1/ 5 mds_log_expire
   1/ 5 mds_migrator
   0/ 1 buffer
   0/ 1 timer
   0/ 1 filer
   0/ 1 striper
   0/ 1 objecter
   0/ 5 rados
   0/ 5 rbd
   0/ 5 rbd_mirror
   0/ 5 rbd_replay
   0/ 5 journaler
   0/ 5 objectcacher
   0/ 5 client
   1/ 5 osd
   0/ 5 optracker
   0/ 5 objclass
   1/ 3 filestore
   1/ 3 journal
   0/ 0 ms
   1/ 5 mon
   0/10 monc
   1/ 5 paxos
   0/ 5 tp
   1/ 5 auth
   1/ 5 crypto
   1/ 1 finisher
   1/ 1 reserver
   1/ 5 heartbeatmap
   1/ 5 perfcounter
   1/ 5 rgw
   1/ 5 rgw_sync
   1/10 civetweb
   1/ 5 javaclient
   1/ 5 asok
   1/ 1 throttle
   0/ 0 refs
   1/ 5 xio
   1/ 5 compressor
   1/ 5 bluestore
   1/ 5 bluefs
   1/ 3 bdev
   1/ 5 kstore
   4/ 5 rocksdb
   4/ 5 leveldb
   4/ 5 memdb
   1/ 5 kinetic
   1/ 5 fuse
   1/ 5 mgr
   1/ 5 mgrc
   1/ 5 dpdk
   1/ 5 eventtrace
   1/ 5 prioritycache
  -2/-2 (syslog threshold)
  -1/-1 (stderr threshold)
  max_recent     10000
  max_new         1000
  log_file /var/log/ceph/ceph-osd.299.log
--- end dump of recent events ---


[2]
[root@ceph-mon01 ~]# ceph pg 36.321b list_unfound
{
    "num_missing": 1,
    "num_unfound": 1,
    "objects": [
        {
            "oid": {
                "oid": "hit_set_36.321b_archive_2020-02-24 
21:15:16.792846_2020-02-24 21:15:32.457855",
                "key": "",
                "snapid": -2,
                "hash": 12827,
                "max": 0,
                "pool": 36,
                "namespace": ".ceph-internal"
            },
            "need": "1352209'2834660",
            "have": "0'0",
            "flags": "none",
            "locations": []
        }
    ],
    "more": false
}
[root@ceph-mon01 ~]# ceph pg 36.324a list_unfound
{
    "num_missing": 1,
    "num_unfound": 1,
    "objects": [
        {
            "oid": {
                "oid": "hit_set_36.324a_archive_2020-02-25 
12:40:58.130723_2020-02-25 12:46:25.260587",
                "key": "",
                "snapid": -2,
                "hash": 12874,
                "max": 0,
                "pool": 36,
                "namespace": ".ceph-internal"
            },
            "need": "1361100'2822063",
            "have": "0'0",
            "flags": "none",
            "locations": []
        }
    ],
    "more": false
}
[root@ceph-mon01 ~]# ceph pg 36.10dc list_unfound
{
    "num_missing": 1,
    "num_unfound": 1,
    "objects": [
        {
            "oid": {
                "oid": "hit_set_36.10dc_archive_2020-02-25 
12:40:58.129048_2020-02-25 12:45:02.202268",
                "key": "",
                "snapid": -2,
                "hash": 4316,
                "max": 0,
                "pool": 36,
                "namespace": ".ceph-internal"
            },
            "need": "1361089'2838543",
            "have": "0'0",
            "flags": "none",
            "locations": []
        }
    ],
    "more": false
}


