Hello,

I recently upgraded a cluster to Mimic. After the upgrade I
started converting nodes to BlueStore one by one. While Ceph was
rebalancing I set the "nosnaptrim" flag on the cluster to save a bit of
I/O. After the rebalancing was done I unset the flag to re-enable snaptrim
and my OSDs started flapping like crazy. I immediately set "nosnaptrim"
on the cluster again and let the OSDs come back online.
After everything calmed down I'm left with 28/31196788 objects unfound
(0.000%) and I still can't re-enable snaptrim. Whenever I try, all the
OSDs start flapping with messages similar to this:
   -10> 2019-01-30 21:12:24.970 7f9c773f1700  5 write_log_and_missing
with: dirty_to: 0'0, dirty_from: 4294967295'18446744073709551615,
writeout_from: 395796'95371018, trimmed: , trimmed_dups: ,
clear_divergent_priors: 0
    -9> 2019-01-30 21:12:24.970 7f9c773f1700  5 write_log_and_missing
with: dirty_to: 0'0, dirty_from: 4294967295'18446744073709551615,
writeout_from: 395796'95371019, trimmed: , trimmed_dups: ,
clear_divergent_priors: 0
    -8> 2019-01-30 21:12:24.970 7f9c773f1700  5 write_log_and_missing
with: dirty_to: 0'0, dirty_from: 4294967295'18446744073709551615,
writeout_from: 395796'95371020, trimmed: , trimmed_dups: ,
clear_divergent_priors: 0
    -7> 2019-01-30 21:12:24.971 7f9c733e9700  5 write_log_and_missing
with: dirty_to: 0'0, dirty_from: 4294967295'18446744073709551615,
writeout_from: 395796'95371021, trimmed: , trimmed_dups: ,
clear_divergent_priors: 0
    -6> 2019-01-30 21:12:24.975 7f9c773f1700  5 write_log_and_missing
with: dirty_to: 0'0, dirty_from: 4294967295'18446744073709551615,
writeout_from: 395796'95371022, trimmed: , trimmed_dups: ,
clear_divergent_priors: 0
    -5> 2019-01-30 21:12:24.975 7f9c831f3700  3 osd.45 395796
handle_osd_map epochs [395796,395796], i have 395796, src has
[373186,395796]
    -4> 2019-01-30 21:12:24.977 7f9c723e7700  5 osd.45 pg_epoch:
395796 pg[11.2eb( v 395794'141488204
(393296'141485192,395794'141488204] local-lis/les=395790/395791 n=9355
ec=448/448 lis/c 395790/395709 les/c/f 395791/395733/0
395795/395795/395795) [98,45
] r=1 lpr=395795 pi=[395709,395795)/1 crt=395794'141488204 lcod
395782'141488203 unknown NOTIFY mbc={}
ps=[190e4~1,19131~1,19181~1,191f1~1,19248~1,1928f~1,192e0~1,19328~1,19370~1,193c0~1,19431~1]]
exit Started/Stray 0.979289 7 0.000253
    -3> 2019-01-30 21:12:24.977 7f9c723e7700  5 osd.45 pg_epoch:
395796 pg[11.2eb( v 395794'141488204
(393296'141485192,395794'141488204] local-lis/les=395790/395791 n=9355
ec=448/448 lis/c 395790/395709 les/c/f 395791/395733/0
395795/395795/395795) [98,45
] r=1 lpr=395795 pi=[395709,395795)/1 crt=395794'141488204 lcod
395782'141488203 unknown NOTIFY mbc={}
ps=[190e4~1,19131~1,19181~1,191f1~1,19248~1,1928f~1,192e0~1,19328~1,19370~1,193c0~1,19431~1]]
enter Started/ReplicaActive
    -2> 2019-01-30 21:12:24.977 7f9c723e7700  5 osd.45 pg_epoch:
395796 pg[11.2eb( v 395794'141488204
(393296'141485192,395794'141488204] local-lis/les=395790/395791 n=9355
ec=448/448 lis/c 395790/395709 les/c/f 395791/395733/0
395795/395795/395795) [98,45
] r=1 lpr=395795 pi=[395709,395795)/1 crt=395794'141488204 lcod
395782'141488203 unknown NOTIFY mbc={}
ps=[190e4~1,19131~1,19181~1,191f1~1,19248~1,1928f~1,192e0~1,19328~1,19370~1,193c0~1,19431~1]]
enter Started/ReplicaActive/RepNotRecovering
    -1> 2019-01-30 21:12:24.983 7f9c733e9700  5 write_log_and_missing
with: dirty_to: 0'0, dirty_from: 4294967295'18446744073709551615,
writeout_from: 395796'95371023, trimmed: , trimmed_dups: ,
clear_divergent_priors: 0
     0> 2019-01-30 21:12:24.990 7f9c7b3f9700 -1
/home/jenkins-build/build/workspace/ceph-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos7/DIST/centos7/MACHINE_SIZE/huge/release/13.2.4/rpm/el7/BUILD/ceph-13.2.4/src/osd/PrimaryLogPG.h:
In function
'PrimaryLogPG::Trimming::Trimming(boost::statechart::state<PrimaryLogPG::Trimming,
PrimaryLogPG::SnapTrimmer,
PrimaryLogPG::WaitReservation>::my_context)' thread 7f9c7b3f9700 time
2019-01-30 21:12:24.987263
/home/jenkins-build/build/workspace/ceph-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos7/DIST/centos7/MACHINE_SIZE/huge/release/13.2.4/rpm/el7/BUILD/ceph-13.2.4/src/osd/PrimaryLogPG.h:
1571: FAILED assert(context< SnapTrimmer >().can_trim())

 ceph version 13.2.4 (b10be4d44915a4d78a8e06aa31919e74927b142e) mimic (stable)
 1: (ceph::__ceph_assert_fail(char const*, char const*, int, char
const*)+0xff) [0x7f9ca0f5c16f]
 2: (()+0x25a337) [0x7f9ca0f5c337]
 3: (PrimaryLogPG::NotTrimming::react(PrimaryLogPG::KickTrim
const&)+0x783) [0x56351d32abc3]
 4: (boost::statechart::simple_state<PrimaryLogPG::NotTrimming,
PrimaryLogPG::SnapTrimmer, boost::mpl::list<mpl_::na, mpl_::na,
mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na,
mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na
, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na>,
(boost::statechart::history_mode)0>::react_impl(boost::statechart::event_base
const&, void const*)+0xa9) [0x56351d376629]
 5: (boost::statechart::state_machine<PrimaryLogPG::SnapTrimmer,
PrimaryLogPG::NotTrimming, std::allocator<void>,
boost::statechart::null_exception_translator>::process_queued_events()+0xb3)
[0x56351d350f23]
 6: (boost::statechart::state_machine<PrimaryLogPG::SnapTrimmer,
PrimaryLogPG::NotTrimming, std::allocator<void>,
boost::statechart::null_exception_translator>::process_event(boost::statechart::event_base
const&)+0x87) [0x56351d351187]
 7: (PrimaryLogPG::WaitReservation::ReservationCB::finish(int)+0xbb)
[0x56351d35149b]
 8: (Context::complete(int)+0x9) [0x56351d1bbe49]
 9: (Finisher::finisher_thread_entry()+0x12e) [0x7f9ca0f5a6ee]
 10: (()+0x7dd5) [0x7f9c9dadddd5]
 11: (clone()+0x6d) [0x7f9c9cbcdead]
 NOTE: a copy of the executable, or `objdump -rdS <executable>` is
needed to interpret this.


Any kind of input on how to solve the snaptrim problem would be appreciated.

Regards,
Darius
_______________________________________________
ceph-users mailing list
ceph-users@lists.ceph.com
http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com

Reply via email to