[PATCH] mds: sort dentries when committing dir fragment
From: Yan, Zheng zheng.z@intel.com Currently ceph mds uses tmap to store dir fragments. Dentry_key_t's string representation is used as key for the tmap. When writing or updating a tmap, the OSDs expect the keys to be provided in ascending order. Current code encodes dentries by the order of dentry_key_t(s) when committing dir fragment. The problem here is that we may get different results when comparing dentry_key_t(s) and their string representations. So the MDS may send data/commands sorted in the wrong order to the OSDs. It confuses the OSDs and causes corruption. Comparing dentry_key_t(s) and their string representations gives different results only when name string in one dentry_key_t is prefix of name string in another dentry_key_t. So the fix is checking the special case and re-sorting dentries that are in the wrong order. Signed-off-by: Yan, Zheng zheng.z@intel.com --- src/mds/CDir.cc| 154 ++--- src/mds/mdstypes.h | 16 +++--- 2 files changed, 130 insertions(+), 40 deletions(-) diff --git a/src/mds/CDir.cc b/src/mds/CDir.cc index 2b4f7c7..e724e61 100644 --- a/src/mds/CDir.cc +++ b/src/mds/CDir.cc @@ -1720,24 +1720,65 @@ CDir::map_t::iterator CDir::_commit_full(ObjectOperation m, const setsnapid_t ::encode(fnode, header); max_write_size -= header.length(); + /* + * We may get different results when comparing dentry_key_t(s) and their + * string representations. It happens only when name string in one dentry_key_t + * is prefix of name string in another dentry_key_t. Tmap uses dentry_key_t's + * string representation as key. When writing or updating a tmap, the osd + * expects the keys to be provided in ascending order. So we need re-sort + * the dentries here. 
+ */ + mapstring, CDentry* pending_items; + map_t::iterator p = items.begin(); - while (p != items.end() bl.length() max_write_size) { -CDentry *dn = p-second; -p++; - -if (dn-linkage.is_null()) - continue; // skip negative entries + while ((p != items.end() || pending_items.size()) bl.length() max_write_size) { +if (p != items.end()) { + CDentry *dn = p-second; + p++; -if (snaps dn-last != CEPH_NOSNAP - try_trim_snap_dentry(dn, *snaps)) - continue; - -n++; + if (dn-linkage.is_null()) + continue; // skip negative entries + + if (snaps dn-last != CEPH_NOSNAP + try_trim_snap_dentry(dn, *snaps)) + continue; + + n++; + + if (pending_items.empty()) { + int len = 0; + if (p != items.end()) + len = min(dn-name.length(), p-second-name.length()); + if (p == items.end() || dn-name.compare(0, len, p-second-name, 0, len) 0) { + _encode_dentry(dn, bl, snaps); + } else { + pending_items[dn-key().str()] = dn; + } + continue; + } + + pending_items[dn-key().str()] = dn; + if (p != items.end()) { + string last_pending = pending_items.rbegin()-second-name; + int len = min(last_pending.length(), p-second-name.length()); + if (last_pending.compare(0, len, p-second-name, 0, len) = 0) + continue; + } +} + +for (mapstring, CDentry*::iterator it = pending_items.begin(); +it != pending_items.end(); it++) { + CDentry *dn = it-second; + _encode_dentry(dn, bl, snaps); +} -_encode_dentry(dn, bl, snaps); +if (bl.length() max_write_size) + break; + +pending_items.clear(); } - if (p != items.end()) { + if (p != items.end() || pending_items.size()) { assert(bl.length() max_write_size); return _commit_partial(m, snaps, max_write_size); } @@ -1790,31 +1831,82 @@ CDir::map_t::iterator CDir::_commit_partial(ObjectOperation m, if(last_committed_dn != map_t::iterator()) p = last_committed_dn; - while (p != items.end() finalbl.length() max_write_size) { -CDentry *dn = p-second; -++p; - -if (snaps dn-last != CEPH_NOSNAP - try_trim_snap_dentry(dn, *snaps)) - continue; + // see comments in _commit_full() + 
map_t::iterator next_dn = p; + mapstring, CDentry* pending_items; -if (!dn-is_dirty()) - continue; // skip clean dentries + while ((p != items.end() || pending_items.size()) finalbl.length() max_write_size) { +if (p != items.end()) { + CDentry *dn = p-second; + ++p; -if (dn-get_linkage()-is_null()) { - dout(10) rm dn-name *dn dendl; - finalbl.append(CEPH_OSD_TMAP_RM); - dn-key().encode(finalbl); -} else { - dout(10) set dn-name *dn dendl; - finalbl.append(CEPH_OSD_TMAP_SET); - _encode_dentry(dn, finalbl, snaps); + if (snaps dn-last != CEPH_NOSNAP + try_trim_snap_dentry(dn, *snaps)) + continue; + + if (!dn-is_dirty()) + continue; // skip clean dentries + + if (pending_items.empty()) { + int len = 0; + if (p !=
Re: BUG: mount failed with IP and hostname mixed
Thanks! Fix is pushed to next branch. Trigger is when the hostname resolves to the same IP listed. sage On Mon, 26 Nov 2012, Drunkard Zhang wrote: It's ceph-0.51, easy to reproduce, just mix IP and hostname and put IP before hostnames when mount use ceph-fuse or mount.ceph, like: ceph-fuse -m 150.164.100.218,log21,log3 /mnt/bc mount -t ceph 150.164.100.218,log21,log3 /mnt/bc mount.ceph 150.164.100.218,log21,log3 /mnt/bc Here's failed message using ceph-fuse: log2 /tmp # ceph-fuse -m 150.164.100.218,log21,log3 /mnt/bc 2012-11-26 01:23:38.712704 7f4a4bfc6780 -1 did not load config file, using default settings. mon/MonMap.h: In function 'void MonMap::add(const std::string, const entity_addr_t)' thread 7f4a4bfc6780 time 2012-11-26 01:23:38.713625 mon/MonMap.h: 97: FAILED assert(addr_name.count(addr) == 0) ceph version 0.51 (commit:c03ca95d235c9a072dcd8a77ad5274a52e93ae30) 1: (MonMap::build_from_host_list(std::string, std::string)+0xb13) [0x55cf03] 2: (MonMap::build_initial(CephContext*, std::ostream)+0xe7) [0x55e827] 3: (MonClient::build_initial_monmap()+0x6c) [0x54debc] 4: (main()+0x1bd) [0x482cad] 5: (__libc_start_main()+0xfd) [0x7f4a49fcb4bd] 6: ceph-fuse() [0x4829b9] NOTE: a copy of the executable, or `objdump -rdS executable` is needed to interpret this. 2012-11-26 01:23:38.715137 7f4a4bfc6780 -1 mon/MonMap.h: In function 'void MonMap::add(const std::string, const entity_addr_t)' thread 7f4a4bfc6780 time 2012-11-26 01:23:38.713625 mon/MonMap.h: 97: FAILED assert(addr_name.count(addr) == 0) ceph version 0.51 (commit:c03ca95d235c9a072dcd8a77ad5274a52e93ae30) 1: (MonMap::build_from_host_list(std::string, std::string)+0xb13) [0x55cf03] 2: (MonMap::build_initial(CephContext*, std::ostream)+0xe7) [0x55e827] 3: (MonClient::build_initial_monmap()+0x6c) [0x54debc] 4: (main()+0x1bd) [0x482cad] 5: (__libc_start_main()+0xfd) [0x7f4a49fcb4bd] 6: ceph-fuse() [0x4829b9] NOTE: a copy of the executable, or `objdump -rdS executable` is needed to interpret this. 
-1 2012-11-26 01:23:38.712704 7f4a4bfc6780 -1 did not load config file, using default settings. 0 2012-11-26 01:23:38.715137 7f4a4bfc6780 -1 mon/MonMap.h: In function 'void MonMap::add(const std::string, const entity_addr_t)' thread 7f4a4bfc6780 time 2012-11-26 01:23:38.713625 mon/MonMap.h: 97: FAILED assert(addr_name.count(addr) == 0) ceph version 0.51 (commit:c03ca95d235c9a072dcd8a77ad5274a52e93ae30) 1: (MonMap::build_from_host_list(std::string, std::string)+0xb13) [0x55cf03] 2: (MonMap::build_initial(CephContext*, std::ostream)+0xe7) [0x55e827] 3: (MonClient::build_initial_monmap()+0x6c) [0x54debc] 4: (main()+0x1bd) [0x482cad] 5: (__libc_start_main()+0xfd) [0x7f4a49fcb4bd] 6: ceph-fuse() [0x4829b9] NOTE: a copy of the executable, or `objdump -rdS executable` is needed to interpret this. terminate called after throwing an instance of 'ceph::FailedAssertion' *** Caught signal (Aborted) ** in thread 7f4a4bfc6780 ceph version 0.51 (commit:c03ca95d235c9a072dcd8a77ad5274a52e93ae30) 1: ceph-fuse() [0x67a904] 2: (()+0x10410) [0x7f4a4b972410] 3: (gsignal()+0x35) [0x7f4a49fdea85] 4: (abort()+0x185) [0x7f4a49fdfd85] 5: (__gnu_cxx::__verbose_terminate_handler()+0x11d) [0x7f4a4a9197cd] 6: (()+0xb8976) [0x7f4a4a917976] 7: (()+0xb89a3) [0x7f4a4a9179a3] 8: (()+0xb8a9e) [0x7f4a4a917a9e] 9: (ceph::__ceph_assert_fail(char const*, char const*, int, char const*)+0x7a2) [0x59f1d2] 10: (MonMap::build_from_host_list(std::string, std::string)+0xb13) [0x55cf03] 11: (MonMap::build_initial(CephContext*, std::ostream)+0xe7) [0x55e827] 12: (MonClient::build_initial_monmap()+0x6c) [0x54debc] 13: (main()+0x1bd) [0x482cad] 14: (__libc_start_main()+0xfd) [0x7f4a49fcb4bd] 15: ceph-fuse() [0x4829b9] 2012-11-26 01:23:38.716314 7f4a4bfc6780 -1 *** Caught signal (Aborted) ** in thread 7f4a4bfc6780 ceph version 0.51 (commit:c03ca95d235c9a072dcd8a77ad5274a52e93ae30) 1: ceph-fuse() [0x67a904] 2: (()+0x10410) [0x7f4a4b972410] 3: (gsignal()+0x35) [0x7f4a49fdea85] 4: (abort()+0x185) [0x7f4a49fdfd85] 5: 
(__gnu_cxx::__verbose_terminate_handler()+0x11d) [0x7f4a4a9197cd] 6: (()+0xb8976) [0x7f4a4a917976] 7: (()+0xb89a3) [0x7f4a4a9179a3] 8: (()+0xb8a9e) [0x7f4a4a917a9e] 9: (ceph::__ceph_assert_fail(char const*, char const*, int, char const*)+0x7a2) [0x59f1d2] 10: (MonMap::build_from_host_list(std::string, std::string)+0xb13) [0x55cf03] 11: (MonMap::build_initial(CephContext*, std::ostream)+0xe7) [0x55e827] 12: (MonClient::build_initial_monmap()+0x6c) [0x54debc] 13: (main()+0x1bd) [0x482cad] 14: (__libc_start_main()+0xfd) [0x7f4a49fcb4bd] 15: ceph-fuse() [0x4829b9] NOTE: a copy of the executable, or `objdump -rdS executable` is needed to interpret this. 0 2012-11-26 01:23:38.716314 7f4a4bfc6780 -1 *** Caught signal (Aborted) ** in thread 7f4a4bfc6780 ceph version 0.51 (commit:c03ca95d235c9a072dcd8a77ad5274a52e93ae30) 1:
bobtail release candidates
Hi all, There are automatic builds of the prerelease bobtail code available under the 'next' branch. For debs, http://ceph.com/docs/master/install/debian/#add-development-testing-packages For example, for Ubuntu 12.04 precise, http://gitbuilder.ceph.com/ceph-deb-precise-x86_64-basic/ref/next/ And RPMs for el6, http://gitbuilder.ceph.com/ceph-rpm-centos6-x86_64-basic/ref/next/RPMS/x86_64/ Any testing and early feedback is greatly appreciated. Thanks! sage -- To unsubscribe from this list: send the line unsubscribe ceph-devel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Statistics / Nagios
Hello list, are there any useful statistics for ceph? ceph -s shows just the status but i would like to record something which gives me an idea about the ceph load or usage. Greets, Stefan -- To unsubscribe from this list: send the line unsubscribe ceph-devel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: syncfs via syscall does not work
Am 25.11.2012 00:20, schrieb Sage Weil: On Sat, 24 Nov 2012, Stefan Priebe wrote: Am 24.11.2012 18:16, schrieb Sage Weil: On Sat, 24 Nov 2012, Stefan Priebe - Profihost AG wrote: At the machine compiling? Yes! Hmm, I just tested on my wonky wheezy machine (glibc 2.13, 3.2) and system(SYS_syncfs, ..) works for me. But the #define is in libc6-dev: But that means your glibc supports it. Then we don't need the syscall at all. To me it's only present in the kernel headers (linux-libc-dev) like this: [pb64: ~]# grep -r syncfs /usr/include /usr/include/asm/unistd_64.h:#define __NR_syncfs 306 /usr/include/asm/unistd_64.h:__SYSCALL(__NR_syncfs, sys_syncfs) /usr/include/asm/unistd_32.h:#define __NR_syncfs 344 /usr/include/asm-generic/unistd.h:#define __NR_syncfs 267 /usr/include/asm-generic/unistd.h:__SYSCALL(__NR_syncfs, sys_syncfs) Can try wip-syncfs? It checks for either SYS_syncfs or __NR_syncfs. Since your latest patch it looks good to me: starting osd.11 at :/0 osd_data /ceph/osd.11/ /dev/sdb1 2012-11-25 19:37:15.686305 7f155e1db780 0 filestore(/ceph/osd.11/) mount FIEMAP ioctl is supported and appears to work 2012-11-25 19:37:15.686313 7f155e1db780 0 filestore(/ceph/osd.11/) mount FIEMAP ioctl is disabled via 'filestore fiemap' config option 2012-11-25 19:37:15.686500 7f155e1db780 0 filestore(/ceph/osd.11/) mount did NOT detect btrfs 2012-11-25 19:37:15.687850 7f155e1db780 0 filestore(/ceph/osd.11/) mount syscall(__NR_syncfs, fd) fully supported 2012-11-25 19:37:15.687899 7f155e1db780 0 filestore(/ceph/osd.11/) mount found snaps 2012-11-25 19:37:15.689973 7f155e1db780 0 filestore(/ceph/osd.11/) mount: enabling WRITEAHEAD journal mode: btrfs not detected 2012-11-25 19:37:15.700405 7f155e1db780 0 journal kernel version is 3.6.7 2012-11-25 19:37:15.710407 7f155e1db780 0 journal kernel version is 3.6.7 YES! No more need to build a custom patched glibc. 
Stefan -- To unsubscribe from this list: send the line unsubscribe ceph-devel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Redundancy when mount with multiple mon IP
On Sunday, November 25, 2012 at 9:30 AM, Drunkard Zhang wrote: 2012/11/26 Sage Weil s...@inktank.com (mailto:s...@inktank.com): On Sun, 25 Nov 2012, Drunkard Zhang wrote: I'm using ceph-0.51. I setup 3 monitors. then mount with 3 mon IP at another host with either kernel mode or fuse, neither give me redundancy. Here's commands I used at client (IP: 10.0.0.2) side; mount -t ceph log3,log21,squid86-log12:/ /mnt/bc ceph-fuse -m log3,log21,squid86-log12 /mnt/bc When mounted, read/write is OK, then add a iptables rule at log3 to REJECT/DROP packets from client 10.0.0.2, operation at client stuck due to IO problem. Related stuck processes on client look like this: log1 ~ # ps auwx |g bc root 1325 0.0 0.0 120100 3536 pts/0 Sl 23:10 0:00 ceph-fuse -m log3,log21,squid86-log12 /mnt/bc root 1404 0.0 0.0 16192 700 pts/0 S 23:10 0:00 ls --color=auto /mnt/bc Can you repeat the test with '--debug-monc 20 --debug-ms 1 --log-file /tmp/foo' on the command line and attach the resulting log? Thanks! sage Thanks for the hint, I think the problem found, I rejected the client at mon.log3 which also acts as mds, and it's up and active, another mds is up and standby. So the client cannot connect to mds server? Is it possible to get more than one mds up and active simultaneously, and let client know that? The active MDS servers partition the namespace between them — clients need to be able to communicate with them all. You can configure which MDS is active (at start-up) if you like; unfortunately the best reference is still in the wiki: http://ceph.com/deprecated/Standby-replay_modes -Greg -- To unsubscribe from this list: send the line unsubscribe ceph-devel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Statistics / Nagios
On Sunday, November 25, 2012 at 10:26 AM, Stefan Priebe wrote: Hello list, are there any useful statistics for ceph? ceph -s shows just the status but i would like to record something which gives me an idea about the ceph load or usage. Each daemon has an admin socket which you can use to extract perfcounters (performance counters). It's not well-documented at all right now, unfortunately. ceph --admin-daemon /path/to/admin/socket help ceph --admin-daemon /path/to/admin/socket perfcounters_dump ceph --admin-daemon /path/to/admin/socket perfcounters_schema should get you started. http://ceph.com/docs/master/radosgw/troubleshooting has some examples of troubleshooting the gateway with them. -Greg -- To unsubscribe from this list: send the line unsubscribe ceph-devel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Statistics / Nagios
Hi Greg, Am 25.11.2012 20:08, schrieb Gregory Farnum: On Sunday, November 25, 2012 at 10:26 AM, Stefan Priebe wrote: Hello list, are there any useful statistics for ceph? ceph -s shows just the status but i would like to record something which gives me an idea about the ceph load or usage. Each daemon has an admin socket which you can use to extract perfcounters (performance counters). It's not well-documented at all right now, unfortunately. ceph --admin-daemon /path/to/admin/socket help ceph --admin-daemon /path/to/admin/socket perfcounters_dump ceph --admin-daemon /path/to/admin/socket perfcounters_schema should get you started. http://ceph.com/docs/master/radosgw/troubleshooting has some examples of troubleshooting the gateway with them. thanks! I don't use radosgw. Just ceph-mon and ceph-osd with rbd block devices. So i have to query each OSD on their own? Right now i have [global] admin socket=/var/run/ceph.sock but that gives: # ceph --admin-daemon /var/run/ceph.sock perfcounters_schema {} Greets, Stefan -- To unsubscribe from this list: send the line unsubscribe ceph-devel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Hi + question
On 11/25/2012 07:02 PM, Roald van Loon wrote: Like I said I'm toying away with ceph since an hour or two, and one of the things I noticed was that setting up my test env and getting to know ceph wasn't all that easy to do (gentoo btw). Things like ceph -h not showing the setcrushmap option, the 'ceph osd crush set' only returning '(22) Invalid argument' etc etc... So, I decided to take a real good look at the sources - any dev's number one source for documentation :-) There I... well.. kind of got lost. Cpp files, devops tools (python tools, shell scripts, perl scripts), fooclass, barclass ofcourse, valgrind supps, ... all in ~/src :-) So I was curious if I was the only one finding this a bit confusing. If not, I was wondering if you have some clean-up subproject running somewhere. Might be a nice way for me to really familiarize myself with the code base. Let me know, Roald Welcome Roald! I must say I can relate to that feeling, from back when I started working with ceph. But I guess that most of my early confusion was basically due to not being aware where the stuff I wanted was located. Nowadays I find the sources division fairly consistent with their purpose. See, in src/ you will mostly find the daemons and the tools; on mon/ you will find the files that are mostly mon-related, and same goes for most of the other subdirectories (osd/, mds/, crush/,...). There are a couple of exceptions, such as global/ or common/, but those by themselves are pretty self-explanatory ;-) To be honest, the only thing that keeps annoying me is that the build process will end up leaving all the objects, libraries and binaries in src/ after the build, but that's one of the things I've learned to live with without trying to find an alternative (shame!). If you need pointers on how to get familiarized with the code base, or any doubts arise, just let us know and we'd be pleased to point you in the right direction. You might also find it useful to join us on #ceph at OFTC. 
And, fwiw, the 'ceph' tool itself (which is on tools/, btw), won't provide you the insight you're looking for. It mostly sends commands directly to the monitor, and the monitor will send replies the tool will output. What you are looking for, in this case, can be found on mon/OSDMonitor.cc Cheers, -Joao -- To unsubscribe from this list: send the line unsubscribe ceph-devel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Cephfs losing files and corrupting others
So when trawling through the filesystem doing checksum validation these popped up on the files that are filled with null bytes: https://gist.github.com/186ad4c5df816d44f909 Is there any way to fsck today? Looks like feature #86 http://tracker.newdream.net/issues/86 isn't implemented yet. thanks, -n On Thu, Nov 22, 2012 at 11:37 PM, Nathan Howell nathan.d.how...@gmail.com wrote: I upgraded to 0.54 and now there are some hints in the logs. The directories referenced in the log entries are now missing: 2012-11-23 07:28:04.802864 mds.0 [ERR] loaded dup inode 100662f [2,head] v3851654 at /xxx/20120203, but inode 100662f.head v3853093 already exists at ~mds0/stray7/100662f 2012-11-23 07:28:04.802889 mds.0 [ERR] loaded dup inode 1003a4b [2,head] v431518 at /xxx/20120206, but inode 1003a4b.head v3853192 already exists at ~mds0/stray8/1003a4b 2012-11-23 07:28:04.802909 mds.0 [ERR] loaded dup inode 100149e [2,head] v431522 at /xxx/20120207, but inode 100149e.head v3853206 already exists at ~mds0/stray8/100149e 2012-11-23 07:28:04.802927 mds.0 [ERR] loaded dup inode 1000a5f [2,head] v431526 at /xxx/20120208, but inode 1000a5f.head v3853208 already exists at ~mds0/stray8/1000a5f Any ideas? On Thu, Nov 15, 2012 at 11:00 AM, Nathan Howell nathan.d.how...@gmail.com wrote: Yes, successfully written files were disappearing. We switched to ceph-fuse and haven't seen any files truncated since. Older files (written months ago) are still having their entire contents replaced with NULL bytes, seemly at random. I can't yet say for sure this has happened since switching over to fuse... but we think it has. I'm going to test all of the archives over the next few days and restore them from S3, so we should be back in a known-good state after that. In the event more files end up corrupted, is there any logging that I can enable that would help track down the problem? 
thanks, -n On Sat, Nov 3, 2012 at 9:54 AM, Gregory Farnum g...@inktank.com wrote: On Fri, Nov 2, 2012 at 12:30 AM, Nathan Howell nathan.d.how...@gmail.com wrote: On Thu, Nov 1, 2012 at 3:32 PM, Sam Lang sam.l...@inktank.com wrote: Do the writes succeed? I.e. the programs creating the files don't get errors back? Are you seeing any problems with the ceph mds or osd processes crashing? Can you describe your I/O workload during these bulk loads? How many files, how much data, multiple clients writing, etc. As far as I know, there haven't been any fixes to 0.48.2 to resolve problems like yours. You might try the ceph fuse client to see if you get the same behavior. If not, then at least we have narrowed down the problem to the ceph kernel client. Yes, the writes succeed. Wednesday's failure looked like this: 1) rsync 100-200mb tarball directly into ceph from a remote site 2) untar ~500 files from tarball in ceph into a new directory in ceph 3) wait for a while 4) the .tar file and some log files disappeared but the untarred files were fine Just to be clear, you copied a tarball into Ceph and untarred all in Ceph, and the extracted contents were fine but the tarball disappeared? So this looks like a case of successfully-written files disappearing? Did you at any point check the tarball from a machine other than the initial client that copied it in? This truncation sounds like maybe Yan's fix will deal with it. But if you've also seen files with the proper size but be empty or corrupted, that sounds like an OSD bug. Sam, are you aware of any btrfs issues that could cause this? Nathan, you've also seen parts of the filesystem hierarchy get lost? That's rather more concerning; under what circumstances have you seen that? -Greg Total filesystem size is: pgmap v2221244: 960 pgs: 960 active+clean; 2418 GB data, 7293 GB used, 6151 GB / 13972 GB avail Generally our load looks like: Constant trickle of 1-2mb files from 3 machines, about 1GB per day total. 
No file is written to by more than 1 machine, but the files go into shared directories. Grid jobs are running constantly and are doing sequential reads from the filesystem. Compute nodes have the filesystem mounted read-only. They're primarily located at a remote site (~40ms away) and tend to average 1-2 megabits/sec. Nightly data jobs load in ~10GB from a few remote sites in to 10 large files. These are split up into about 1000 smaller files but the originals are also kept. All of this is done on one machine. The journals and osd drives are write saturated while this is going on. On Thu, Nov 1, 2012 at 4:02 PM, Gregory Farnum g...@inktank.com wrote: Are you using hard links, by any chance? No, we are using a handfull of soft links though. Do you have one or many MDS systems? ceph mds stat says: e686: 1/1/1 up {0=xxx=up:active}, 2 up:standby What filesystem are you using on your OSDs? btrfs thanks,
Re: Hangup during scrubbing - possible solutions
On Fri, Nov 23, 2012 at 12:35 AM, Sage Weil s...@inktank.com wrote: On Thu, 22 Nov 2012, Andrey Korolyov wrote: Hi, In the recent versions Ceph introduces some unexpected behavior for the permanent connections (VM or kernel clients) - after crash recovery, I/O will hang on the next planned scrub on the following scenario: - launch a bunch of clients doing non-intensive writes, - lose one or more osd, mark them down, wait for recovery completion, - do a slow scrub, e.g. scrubbing one osd per 5m, inside bash script, or wait for ceph to do the same, - observe a raising number of pgs stuck in the active+clean+scrubbing state (they took a master role from ones which was on killed osd and almost surely they are being written in time of crash), - some time later, clients will hang hardly and ceph log introduce stuck(old) I/O requests. The only one way to return clients back without losing their I/O state is per-osd restart, which also will help to get rid of active+clean+scrubbing pgs. First of all, I`ll be happy to help to solve this problem by providing logs. If you can reproduce this behavior with 'debug osd = 20' and 'debug ms = 1' logging on the OSD, that would be wonderful! I have tested slightly different recovery flow, please see below. 
Since there is no real harm, like frozen I/O, placement groups also was stuck forever on the active+clean+scrubbing state, until I restarted all osds (end of the log): http://xdel.ru/downloads/ceph-log/recover-clients-later-than-osd.txt.gz - start the healthy cluster - start persistent clients - add an another host with pair of OSDs, let them be in the data placement - wait for data to rearrange - [22:06 timestamp] mark OSDs out or simply kill them and wait(since I have an 1/2 hour delay on readjust in such case, I did ``ceph osd out'' manually) - watch for data to rearrange again - [22:51 timestamp] when it ends, start a manual rescrub, with non-zero active+clean+scrubbing-state placement groups at the end of process which `ll stay in this state forever until something happens After that, I can restart osds one per one, if I want to get rid of scrubbing states immediately and then do deep-scrub(if I don`t, those states will return at next ceph self-scrubbing) or do per-osd deep-scrub, if I have a lot of time. The case I have described in the previous message took place when I remove osd from data placement which existed on the moment when client(s) have started and indeed it is more harmful than current one(frozen I/O leads to hanging entire guest, for example). Since testing those flow took a lot of time, I`ll send logs related to this case tomorrow. Second question is not directly related to this problem, but I have thought on for a long time - is there a planned features to control scrub process more precisely, e.g. pg scrub rate or scheduled scrub, instead of current set of timeouts which of course not very predictable on when to run? Not yet. I would be interested in hearing what kind of control/config options/whatever you (and others) would like to see! 
Of course it will be awesome to have any determined scheduler or at least an option to disable automated scrubbing, since it is not very determined in time and deep-scrub eating a lot of I/O if command issued against entire OSD. Rate limiting is not in the first place, at least it may be recreated in external script, but for those who prefer to leave control to Ceph, it may be very useful. Thanks! -- To unsubscribe from this list: send the line unsubscribe ceph-devel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] mds: sort dentries when committing dir fragment
I pushed an alternative approach to wip-tmap. This sorting is an artifact of tmap's crummy implementation, and the mds workaround will need to get reverted when we switch to omap. Instead, fix tmap so that it will tolerate unsorted keys. (Also, drop the ENOENT on rm on missing key.) Eventually we can deprecate and remove tmap entirely... What do you think? sage On Sun, 25 Nov 2012, Yan, Zheng wrote: From: Yan, Zheng zheng.z@intel.com Currently ceph mds uses tmap to store dir fragments. Dentry_key_t's string representation is used as key for the tmap. When writing or updating a tmap, the OSDs expect the keys to be provided in ascending order. Current code encodes dentries by the order of dentry_key_t(s) when committing dir fragment. The problem here is that we may get different results when comparing dentry_key_t(s) and their string representations. So the MDS may send data/commands sorted in the wrong order to the OSDs. It confuses the OSDs and causes corruption. Comparing dentry_key_t(s) and their string representations gives different results only when name string in one dentry_key_t is prefix of name string in another dentry_key_t. So the fix is checking the special case and re-sorting dentries that are in the wrong order. Signed-off-by: Yan, Zheng zheng.z@intel.com --- src/mds/CDir.cc| 154 ++--- src/mds/mdstypes.h | 16 +++--- 2 files changed, 130 insertions(+), 40 deletions(-) diff --git a/src/mds/CDir.cc b/src/mds/CDir.cc index 2b4f7c7..e724e61 100644 --- a/src/mds/CDir.cc +++ b/src/mds/CDir.cc @@ -1720,24 +1720,65 @@ CDir::map_t::iterator CDir::_commit_full(ObjectOperation m, const setsnapid_t ::encode(fnode, header); max_write_size -= header.length(); + /* + * We may get different results when comparing dentry_key_t(s) and their + * string representations. It happens only when name string in one dentry_key_t + * is prefix of name string in another dentry_key_t. Tmap uses dentry_key_t's + * string representation as key. 
When writing or updating a tmap, the osd + * expects the keys to be provided in ascending order. So we need re-sort + * the dentries here. + */ + mapstring, CDentry* pending_items; + map_t::iterator p = items.begin(); - while (p != items.end() bl.length() max_write_size) { -CDentry *dn = p-second; -p++; - -if (dn-linkage.is_null()) - continue; // skip negative entries + while ((p != items.end() || pending_items.size()) bl.length() max_write_size) { +if (p != items.end()) { + CDentry *dn = p-second; + p++; -if (snaps dn-last != CEPH_NOSNAP - try_trim_snap_dentry(dn, *snaps)) - continue; - -n++; + if (dn-linkage.is_null()) + continue; // skip negative entries + + if (snaps dn-last != CEPH_NOSNAP + try_trim_snap_dentry(dn, *snaps)) + continue; + + n++; + + if (pending_items.empty()) { + int len = 0; + if (p != items.end()) + len = min(dn-name.length(), p-second-name.length()); + if (p == items.end() || dn-name.compare(0, len, p-second-name, 0, len) 0) { + _encode_dentry(dn, bl, snaps); + } else { + pending_items[dn-key().str()] = dn; + } + continue; + } + + pending_items[dn-key().str()] = dn; + if (p != items.end()) { + string last_pending = pending_items.rbegin()-second-name; + int len = min(last_pending.length(), p-second-name.length()); + if (last_pending.compare(0, len, p-second-name, 0, len) = 0) + continue; + } +} + +for (mapstring, CDentry*::iterator it = pending_items.begin(); + it != pending_items.end(); it++) { + CDentry *dn = it-second; + _encode_dentry(dn, bl, snaps); +} -_encode_dentry(dn, bl, snaps); +if (bl.length() max_write_size) + break; + +pending_items.clear(); } - if (p != items.end()) { + if (p != items.end() || pending_items.size()) { assert(bl.length() max_write_size); return _commit_partial(m, snaps, max_write_size); } @@ -1790,31 +1831,82 @@ CDir::map_t::iterator CDir::_commit_partial(ObjectOperation m, if(last_committed_dn != map_t::iterator()) p = last_committed_dn; - while (p != items.end() finalbl.length() max_write_size) { -CDentry *dn = 
p-second; -++p; - -if (snaps dn-last != CEPH_NOSNAP - try_trim_snap_dentry(dn, *snaps)) - continue; + // see comments in _commit_full() + map_t::iterator next_dn = p; + mapstring, CDentry* pending_items; -if (!dn-is_dirty()) - continue; // skip clean dentries + while ((p != items.end() || pending_items.size()) finalbl.length() max_write_size) { +if (p != items.end()) { + CDentry *dn = p-second; + ++p; -if (dn-get_linkage()-is_null()) { - dout(10)
Re: [PATCH] mds: sort dentries when committing dir fragment
On 11/26/2012 06:32 AM, Sage Weil wrote: I pushed an alternative approach to wip-tmap. This sorting is an artifact of tmap's crummy implementation, and the mds workaround will need to get reverted when we switch to omap. Instead, fix tmap so that it will tolerate unsorted keys. (Also, drop the ENOENT on rm on missing key.) Eventually we can deprecate and remove tmap entirely... What do you think? This approach is cleaner than mine. But I think your fix isn't enough, because the MDS may provide a tmap that contains misordered items to the TMAPPUT method. Misordered items will confuse future TMAPUP operations. The fix is either to sort the items when handling TMAPPUT, or to search forward for any potentially misordered items when TMAP_SET wants to add a new item or TMAP_RM fails to find an item. Regards, Yan, Zheng sage On Sun, 25 Nov 2012, Yan, Zheng wrote: From: Yan, Zheng <zheng.z@intel.com> Currently ceph mds uses tmap to store dir fragments. Dentry_key_t's string representation is used as key for the tmap. When writing or updating a tmap, the OSDs expect the keys to be provided in ascending order. Current code encodes dentries by the order of dentry_key_t(s) when committing dir fragment. The problem here is that we may get different results when comparing dentry_key_t(s) and their string representations. So the MDS may send data/commands sorted in the wrong order to the OSDs. It confuses the OSDs and causes corruption. Comparing dentry_key_t(s) and their string representations gives different results only when name string in one dentry_key_t is prefix of name string in another dentry_key_t. So the fix is checking the special case and re-sorting dentries that are in the wrong order.
Signed-off-by: Yan, Zheng <zheng.z@intel.com>
---
 src/mds/CDir.cc    | 154 ++++++++++++++++++++++++++++++++++++++++++++---------
 src/mds/mdstypes.h |  16 +++---
 2 files changed, 130 insertions(+), 40 deletions(-)

diff --git a/src/mds/CDir.cc b/src/mds/CDir.cc
index 2b4f7c7..e724e61 100644
--- a/src/mds/CDir.cc
+++ b/src/mds/CDir.cc
@@ -1720,24 +1720,65 @@ CDir::map_t::iterator CDir::_commit_full(ObjectOperation &m, const set<snapid_t>
   ::encode(fnode, header);
   max_write_size -= header.length();
 
+  /*
+   * We may get different results when comparing dentry_key_t(s) and their
+   * string representations. It happens only when name string in one dentry_key_t
+   * is prefix of name string in another dentry_key_t. Tmap uses dentry_key_t's
+   * string representation as key. When writing or updating a tmap, the osd
+   * expects the keys to be provided in ascending order. So we need re-sort
+   * the dentries here.
+   */
+  map<string, CDentry*> pending_items;
+
   map_t::iterator p = items.begin();
-  while (p != items.end() && bl.length() < max_write_size) {
-    CDentry *dn = p->second;
-    p++;
-
-    if (dn->linkage.is_null())
-      continue;  // skip negative entries
+  while ((p != items.end() || pending_items.size()) && bl.length() < max_write_size) {
+    if (p != items.end()) {
+      CDentry *dn = p->second;
+      p++;
 
-    if (snaps && dn->last != CEPH_NOSNAP &&
-        try_trim_snap_dentry(dn, *snaps))
-      continue;
-
-    n++;
+      if (dn->linkage.is_null())
+        continue;  // skip negative entries
+
+      if (snaps && dn->last != CEPH_NOSNAP &&
+          try_trim_snap_dentry(dn, *snaps))
+        continue;
+
+      n++;
+
+      if (pending_items.empty()) {
+        int len = 0;
+        if (p != items.end())
+          len = min(dn->name.length(), p->second->name.length());
+        if (p == items.end() || dn->name.compare(0, len, p->second->name, 0, len) < 0) {
+          _encode_dentry(dn, bl, snaps);
+        } else {
+          pending_items[dn->key().str()] = dn;
+        }
+        continue;
+      }
+
+      pending_items[dn->key().str()] = dn;
+      if (p != items.end()) {
+        string last_pending = pending_items.rbegin()->second->name;
+        int len = min(last_pending.length(), p->second->name.length());
+        if (last_pending.compare(0, len, p->second->name, 0, len) >= 0)
+          continue;
+      }
+    }
+
+    for (map<string, CDentry*>::iterator it = pending_items.begin();
+         it != pending_items.end(); it++) {
+      CDentry *dn = it->second;
+      _encode_dentry(dn, bl, snaps);
+    }
 
-    _encode_dentry(dn, bl, snaps);
+    if (bl.length() > max_write_size)
+      break;
+
+    pending_items.clear();
   }
 
-  if (p != items.end()) {
+  if (p != items.end() || pending_items.size()) {
     assert(bl.length() > max_write_size);
     return _commit_partial(m, snaps, max_write_size);
   }
@@ -1790,31 +1831,82 @@ CDir::map_t::iterator CDir::_commit_partial(ObjectOperation &m,
   if(last_committed_dn != map_t::iterator())
     p = last_committed_dn;
 
-  while (p != items.end() && finalbl.length() < max_write_size) {
-    CDentry *dn = p->second;
-    ++p;
-
-    if (snaps && dn->last != CEPH_NOSNAP &&
-        try_trim_snap_dentry(dn,