For those who aren't on the bug tracker, this was brought up (and has follow-up) here: https://tracker.ceph.com/issues/49618
On Thu, Mar 4, 2021 at 9:55 PM Szabo, Istvan (Agoda) <istvan.sz...@agoda.com> wrote: > > Hi, > > I have a 3 DC multisite setup. > > The replication is directional, HKG->SGP->US, so the bucket is replicated > from HKG to SGP and the same bucket is replicated further from SGP to US. > > The HKG->SGP connection is pretty fast: 12.5 million objects (600GB) > transferred in 6.5 hours. Once the OSD crashed in SGP, it stopped the > whole replication chain and caused a PG_DAMAGED cluster error. > The PG can be repaired, but the sync never restarted; only bucket sync > disable/enable helped. > I also got an OSD crash in HKG but not in ASH. ASH has no errors; the replication > speed there is 2 million objects in 6.5 hours, which is about 90GB of data. > > This is the crash of the osd: > > { > "backtrace": [ > "(()+0x12b20) [0x7f597d3fbb20]", > "(gsignal()+0x10f) [0x7f597c0667ff]", > "(abort()+0x127) [0x7f597c050c35]", > "(()+0x9009b) [0x7f597ca1c09b]", > "(()+0x9653c) [0x7f597ca2253c]", > "(()+0x96597) [0x7f597ca22597]", > "(()+0x967f8) [0x7f597ca227f8]", > "(()+0x19d24) [0x7f597e168d24]", > "(tcmalloc::allocate_full_cpp_throw_oom(unsigned long)+0x146) > [0x7f597e18b0d6]", > "(rocksdb::Arena::AllocateNewBlock(unsigned long)+0x43) > [0x5632f08ccb93]", > "(rocksdb::Arena::AllocateFallback(unsigned long, bool)+0x4b) > [0x5632f08ccc3b]", > "(rocksdb::ConcurrentArena::AllocateAligned(unsigned long, unsigned > long, rocksdb::Logger*)+0xb4) [0x5632f07fae94]", > "(()+0x1103e7e) [0x5632f085ae7e]", > "(rocksdb::MemTable::Add(unsigned long, rocksdb::ValueType, > rocksdb::Slice const&, rocksdb::Slice const&, bool, > rocksdb::MemTablePostProcessInfo*)+0xcf) [0x5632f07f6f8f]", > "(rocksdb::MemTableInserter::PutCFImpl(unsigned int, rocksdb::Slice > const&, rocksdb::Slice const&, rocksdb::ValueType)+0x452) [0x5632f08520e2]", > "(rocksdb::MemTableInserter::PutCF(unsigned int, rocksdb::Slice > const&, rocksdb::Slice const&)+0x17) [0x5632f0852e97]", > 
"(rocksdb::WriteBatch::Iterate(rocksdb::WriteBatch::Handler*) > const+0x480) [0x5632f084ac20]", > > "(rocksdb::WriteBatchInternal::InsertInto(rocksdb::WriteThread::WriteGroup&, > unsigned long, rocksdb::ColumnFamilyMemTables*, rocksdb::FlushScheduler*, > bool, unsigned long, rocksdb::DB*, bool, bool, bool)+0x149) [0x5632f084ebe9]", > "(rocksdb::DBImpl::WriteImpl(rocksdb::WriteOptions const&, > rocksdb::WriteBatch*, rocksdb::WriteCallback*, unsigned long*, unsigned long, > bool, unsigned long*, unsigned long, rocksdb::PreReleaseCallback*)+0x1acd) > [0x5632f078a03d]", > "(rocksdb::DBImpl::Write(rocksdb::WriteOptions const&, > rocksdb::WriteBatch*)+0x21) [0x5632f078ac11]", > "(RocksDBStore::submit_common(rocksdb::WriteOptions&, > std::shared_ptr<KeyValueDB::TransactionImpl>)+0x8c) [0x5632f074180c]", > > "(RocksDBStore::submit_transaction(std::shared_ptr<KeyValueDB::TransactionImpl>)+0x87) > [0x5632f0742027]", > "(BlueStore::_txc_apply_kv(BlueStore::TransContext*, bool)+0x426) > [0x5632f0226376]", > "(BlueStore::_kv_sync_thread()+0x176f) [0x5632f024bc1f]", > "(BlueStore::KVSyncThread::entry()+0x11) [0x5632f0273791]", > "(()+0x814a) [0x7f597d3f114a]", > "(clone()+0x43) [0x7f597c12bf23]" > ], > "ceph_version": "15.2.9", > "crash_id": > "2021-03-04T14:55:45.094048Z_3d481fd3-7573-4cb7-9b22-20784b418e64", > "entity_name": "osd.5", > "os_id": "centos", > "os_name": "CentOS Linux", > "os_version": "8", > "os_version_id": "8", > "process_name": "ceph-osd", > "stack_sig": > "9643c370a20c0d34f5e8965ae4461e2a7cf709ab4183929239bc263d0e1eef94", > "timestamp": "2021-03-04T14:55:45.094048Z", > "utsname_hostname": "hostname", > "utsname_machine": "x86_64", > "utsname_release": "4.18.0-240.10.1.el8_3.x86_64", > "utsname_sysname": "Linux", > "utsname_version": "#1 SMP Mon Jan 18 17:05:51 UTC 2021" > } > > Any idea what I should tune? > > Thank you. > > ________________________________ > This message is confidential and is for the sole use of the intended > recipient(s). 
It may also be privileged or otherwise protected by copyright > or other legal rules. If you have received it by mistake please let us know > by reply email and delete it from your system. It is prohibited to copy this > message or disclose its content to anyone. Any confidentiality or privilege > is not waived or lost by any mistaken delivery or unauthorized disclosure of > the message. All messages sent to and from Agoda may be monitored to ensure > compliance with company policies, to protect the company's interests and to > remove potential malware. Electronic messages may be intercepted, amended, > lost or deleted, or contain viruses. > _______________________________________________ > ceph-users mailing list -- ceph-users@ceph.io > To unsubscribe send an email to ceph-users-le...@ceph.io _______________________________________________ ceph-users mailing list -- ceph-users@ceph.io To unsubscribe send an email to ceph-users-le...@ceph.io