[PATCH] mds: sort dentries when committing dir fragment

2012-11-25 Thread Yan, Zheng
From: Yan, Zheng zheng.z@intel.com

Currently the ceph MDS uses a tmap to store each dir fragment. A
dentry_key_t's string representation is used as the key in the tmap.
When writing or updating a tmap, the OSDs expect the keys to be
provided in ascending order. The current code encodes dentries in
dentry_key_t order when committing a dir fragment. The problem is that
comparing dentry_key_t(s) can give a different result than comparing
their string representations, so the MDS may send data/commands to the
OSDs sorted in the wrong order. That confuses the OSDs and causes
corruption.

Comparing dentry_key_t(s) and their string representations gives
different results only when the name string in one dentry_key_t is a
prefix of the name string in another dentry_key_t. So the fix is to
check for this special case and re-sort dentries that are in the wrong
order.
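
For example (an illustration added here rather than part of the patch; it
assumes the tmap key string is the dentry name followed by a snap suffix
such as "_head"):

  #include <cassert>
  #include <string>

  int main() {
    std::string a = "foo", b = "foo1";      // "foo" is a prefix of "foo1"
    // dentry_key_t compares the bare names first, so "foo" sorts before "foo1"
    assert(a < b);
    // the tmap compares the encoded key strings instead
    std::string ka = a + "_head", kb = b + "_head";
    // '1' (0x31) sorts before '_' (0x5f), so the two orderings disagree
    assert(kb < ka);
    return 0;
  }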

Signed-off-by: Yan, Zheng zheng.z@intel.com
---
 src/mds/CDir.cc| 154 ++---
 src/mds/mdstypes.h |  16 +++---
 2 files changed, 130 insertions(+), 40 deletions(-)

diff --git a/src/mds/CDir.cc b/src/mds/CDir.cc
index 2b4f7c7..e724e61 100644
--- a/src/mds/CDir.cc
+++ b/src/mds/CDir.cc
@@ -1720,24 +1720,65 @@ CDir::map_t::iterator CDir::_commit_full(ObjectOperation& m, const set<snapid_t>
   ::encode(fnode, header);
   max_write_size -= header.length();
 
+  /*
+   * We may get different results when comparing dentry_key_t(s) and their
+   * string representations. It happens only when name string in one dentry_key_t
+   * is prefix of name string in another dentry_key_t. Tmap uses dentry_key_t's
+   * string representation as key. When writing or updating a tmap, the osd
+   * expects the keys to be provided in ascending order. So we need re-sort
+   * the dentries here.
+   */
+  map<string, CDentry*> pending_items;
+
   map_t::iterator p = items.begin();
-  while (p != items.end() && bl.length() < max_write_size) {
-    CDentry *dn = p->second;
-    p++;
-
-    if (dn->linkage.is_null())
-      continue;  // skip negative entries
+  while ((p != items.end() || pending_items.size()) && bl.length() < max_write_size) {
+    if (p != items.end()) {
+      CDentry *dn = p->second;
+      p++;
 
-    if (snaps && dn->last != CEPH_NOSNAP &&
-        try_trim_snap_dentry(dn, *snaps))
-      continue;
-
-    n++;
+      if (dn->linkage.is_null())
+        continue;  // skip negative entries
+
+      if (snaps && dn->last != CEPH_NOSNAP &&
+          try_trim_snap_dentry(dn, *snaps))
+        continue;
+
+      n++;
+
+      if (pending_items.empty()) {
+        int len = 0;
+        if (p != items.end())
+          len = min(dn->name.length(), p->second->name.length());
+        if (p == items.end() || dn->name.compare(0, len, p->second->name, 0, len) < 0) {
+          _encode_dentry(dn, bl, snaps);
+        } else {
+          pending_items[dn->key().str()] = dn;
+        }
+        continue;
+      }
+
+      pending_items[dn->key().str()] = dn;
+      if (p != items.end()) {
+        string last_pending = pending_items.rbegin()->second->name;
+        int len = min(last_pending.length(), p->second->name.length());
+        if (last_pending.compare(0, len, p->second->name, 0, len) >= 0)
+          continue;
+      }
+    }
+
+    for (map<string, CDentry*>::iterator it = pending_items.begin();
+         it != pending_items.end(); it++) {
+      CDentry *dn = it->second;
+      _encode_dentry(dn, bl, snaps);
+    }
 
-    _encode_dentry(dn, bl, snaps);
+    if (bl.length() > max_write_size)
+      break;
+
+    pending_items.clear();
   }
 
-  if (p != items.end()) {
+  if (p != items.end() || pending_items.size()) {
     assert(bl.length() > max_write_size);
     return _commit_partial(m, snaps, max_write_size);
   }
@@ -1790,31 +1831,82 @@ CDir::map_t::iterator CDir::_commit_partial(ObjectOperation& m,
   if(last_committed_dn != map_t::iterator())
     p = last_committed_dn;
 
-  while (p != items.end() && finalbl.length() < max_write_size) {
-    CDentry *dn = p->second;
-    ++p;
-
-    if (snaps && dn->last != CEPH_NOSNAP &&
-        try_trim_snap_dentry(dn, *snaps))
-      continue;
+  // see comments in _commit_full()
+  map_t::iterator next_dn = p;
+  map<string, CDentry*> pending_items;
 
-    if (!dn->is_dirty())
-      continue;  // skip clean dentries
+  while ((p != items.end() || pending_items.size()) && finalbl.length() < max_write_size) {
+    if (p != items.end()) {
+      CDentry *dn = p->second;
+      ++p;
 
-    if (dn->get_linkage()->is_null()) {
-      dout(10) << " rm " << dn->name << " " << *dn << dendl;
-      finalbl.append(CEPH_OSD_TMAP_RM);
-      dn->key().encode(finalbl);
-    } else {
-      dout(10) << " set " << dn->name << " " << *dn << dendl;
-      finalbl.append(CEPH_OSD_TMAP_SET);
-      _encode_dentry(dn, finalbl, snaps);
+      if (snaps && dn->last != CEPH_NOSNAP &&
+          try_trim_snap_dentry(dn, *snaps))
+        continue;
+
+      if (!dn->is_dirty())
+        continue;  // skip clean dentries
+
+      if (pending_items.empty()) {
+        int len = 0;
+        if (p != 

Re: BUG: mount failed with IP and hostname mixed

2012-11-25 Thread Sage Weil
Thanks!  Fix is pushed to next branch.

The trigger is a hostname that resolves to the same IP that is also listed explicitly.
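
A hedged sketch of why the assert fires (the names come from the report below;
resolve_addr() and the mon names are placeholders for illustration, and only
the addr_name map that the assert itself mentions is assumed):

  // "150.164.100.218" and "log21" resolve to the same address, so the
  // second add() trips assert(addr_name.count(addr) == 0)
  monmap.add("mon-a", resolve_addr("150.164.100.218"));  // ok, addr_name was empty
  monmap.add("mon-b", resolve_addr("log21"));            // duplicate address -> assert fails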

sage

On Mon, 26 Nov 2012, Drunkard Zhang wrote:

 It's ceph-0.51, easy to reproduce: just mix IP and hostname, and put the IP
 before the hostnames when mounting with ceph-fuse or mount.ceph, like:
 
 ceph-fuse -m 150.164.100.218,log21,log3 /mnt/bc
 mount -t ceph 150.164.100.218,log21,log3 /mnt/bc
 mount.ceph 150.164.100.218,log21,log3 /mnt/bc
 
 Here's failed message using ceph-fuse:
 log2 /tmp # ceph-fuse -m 150.164.100.218,log21,log3 /mnt/bc
 2012-11-26 01:23:38.712704 7f4a4bfc6780 -1 did not load config file,
 using default settings.
 mon/MonMap.h: In function 'void MonMap::add(const std::string, const
 entity_addr_t)' thread 7f4a4bfc6780 time 2012-11-26 01:23:38.713625
 mon/MonMap.h: 97: FAILED assert(addr_name.count(addr) == 0)
  ceph version 0.51 (commit:c03ca95d235c9a072dcd8a77ad5274a52e93ae30)
  1: (MonMap::build_from_host_list(std::string, std::string)+0xb13) [0x55cf03]
  2: (MonMap::build_initial(CephContext*, std::ostream)+0xe7) [0x55e827]
  3: (MonClient::build_initial_monmap()+0x6c) [0x54debc]
  4: (main()+0x1bd) [0x482cad]
  5: (__libc_start_main()+0xfd) [0x7f4a49fcb4bd]
  6: ceph-fuse() [0x4829b9]
  NOTE: a copy of the executable, or `objdump -rdS executable` is
 needed to interpret this.
 2012-11-26 01:23:38.715137 7f4a4bfc6780 -1 mon/MonMap.h: In function
 'void MonMap::add(const std::string, const entity_addr_t)' thread
 7f4a4bfc6780 time 2012-11-26 01:23:38.713625
 mon/MonMap.h: 97: FAILED assert(addr_name.count(addr) == 0)
 
  ceph version 0.51 (commit:c03ca95d235c9a072dcd8a77ad5274a52e93ae30)
  1: (MonMap::build_from_host_list(std::string, std::string)+0xb13) [0x55cf03]
  2: (MonMap::build_initial(CephContext*, std::ostream)+0xe7) [0x55e827]
  3: (MonClient::build_initial_monmap()+0x6c) [0x54debc]
  4: (main()+0x1bd) [0x482cad]
  5: (__libc_start_main()+0xfd) [0x7f4a49fcb4bd]
  6: ceph-fuse() [0x4829b9]
  NOTE: a copy of the executable, or `objdump -rdS executable` is
 needed to interpret this.
 
 -1 2012-11-26 01:23:38.712704 7f4a4bfc6780 -1 did not load config
 file, using default settings.
  0 2012-11-26 01:23:38.715137 7f4a4bfc6780 -1 mon/MonMap.h: In
 function 'void MonMap::add(const std::string, const entity_addr_t)'
 thread 7f4a4bfc6780 time 2012-11-26 01:23:38.713625
 mon/MonMap.h: 97: FAILED assert(addr_name.count(addr) == 0)
 
  ceph version 0.51 (commit:c03ca95d235c9a072dcd8a77ad5274a52e93ae30)
  1: (MonMap::build_from_host_list(std::string, std::string)+0xb13) [0x55cf03]
  2: (MonMap::build_initial(CephContext*, std::ostream)+0xe7) [0x55e827]
  3: (MonClient::build_initial_monmap()+0x6c) [0x54debc]
  4: (main()+0x1bd) [0x482cad]
  5: (__libc_start_main()+0xfd) [0x7f4a49fcb4bd]
  6: ceph-fuse() [0x4829b9]
  NOTE: a copy of the executable, or `objdump -rdS executable` is
 needed to interpret this.
 
 terminate called after throwing an instance of 'ceph::FailedAssertion'
 *** Caught signal (Aborted) **
  in thread 7f4a4bfc6780
  ceph version 0.51 (commit:c03ca95d235c9a072dcd8a77ad5274a52e93ae30)
  1: ceph-fuse() [0x67a904]
  2: (()+0x10410) [0x7f4a4b972410]
  3: (gsignal()+0x35) [0x7f4a49fdea85]
  4: (abort()+0x185) [0x7f4a49fdfd85]
  5: (__gnu_cxx::__verbose_terminate_handler()+0x11d) [0x7f4a4a9197cd]
  6: (()+0xb8976) [0x7f4a4a917976]
  7: (()+0xb89a3) [0x7f4a4a9179a3]
  8: (()+0xb8a9e) [0x7f4a4a917a9e]
  9: (ceph::__ceph_assert_fail(char const*, char const*, int, char
 const*)+0x7a2) [0x59f1d2]
  10: (MonMap::build_from_host_list(std::string, std::string)+0xb13) [0x55cf03]
  11: (MonMap::build_initial(CephContext*, std::ostream)+0xe7) [0x55e827]
  12: (MonClient::build_initial_monmap()+0x6c) [0x54debc]
  13: (main()+0x1bd) [0x482cad]
  14: (__libc_start_main()+0xfd) [0x7f4a49fcb4bd]
  15: ceph-fuse() [0x4829b9]
 2012-11-26 01:23:38.716314 7f4a4bfc6780 -1 *** Caught signal (Aborted) **
  in thread 7f4a4bfc6780
 
  ceph version 0.51 (commit:c03ca95d235c9a072dcd8a77ad5274a52e93ae30)
  1: ceph-fuse() [0x67a904]
  2: (()+0x10410) [0x7f4a4b972410]
  3: (gsignal()+0x35) [0x7f4a49fdea85]
  4: (abort()+0x185) [0x7f4a49fdfd85]
  5: (__gnu_cxx::__verbose_terminate_handler()+0x11d) [0x7f4a4a9197cd]
  6: (()+0xb8976) [0x7f4a4a917976]
  7: (()+0xb89a3) [0x7f4a4a9179a3]
  8: (()+0xb8a9e) [0x7f4a4a917a9e]
  9: (ceph::__ceph_assert_fail(char const*, char const*, int, char
 const*)+0x7a2) [0x59f1d2]
  10: (MonMap::build_from_host_list(std::string, std::string)+0xb13) [0x55cf03]
  11: (MonMap::build_initial(CephContext*, std::ostream)+0xe7) [0x55e827]
  12: (MonClient::build_initial_monmap()+0x6c) [0x54debc]
  13: (main()+0x1bd) [0x482cad]
  14: (__libc_start_main()+0xfd) [0x7f4a49fcb4bd]
  15: ceph-fuse() [0x4829b9]
  NOTE: a copy of the executable, or `objdump -rdS executable` is
 needed to interpret this.
 
  0 2012-11-26 01:23:38.716314 7f4a4bfc6780 -1 *** Caught signal
 (Aborted) **
  in thread 7f4a4bfc6780
 
  ceph version 0.51 (commit:c03ca95d235c9a072dcd8a77ad5274a52e93ae30)
  1: 

bobtail release candidates

2012-11-25 Thread Sage Weil
Hi all,

There are automatic builds of the prerelease bobtail code available under 
the 'next' branch.

For debs,

http://ceph.com/docs/master/install/debian/#add-development-testing-packages

For example, for Ubuntu 12.04 precise,
http://gitbuilder.ceph.com/ceph-deb-precise-x86_64-basic/ref/next/
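
For reference, pointing apt at that gitbuilder repo usually looks something
like the following (hedged: the list-file name is arbitrary and the exact repo
layout should be double-checked against the page above):

  echo deb http://gitbuilder.ceph.com/ceph-deb-precise-x86_64-basic/ref/next precise main \
    | sudo tee /etc/apt/sources.list.d/ceph-next.list
  sudo apt-get update && sudo apt-get install ceph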

And RPMs for el6,

http://gitbuilder.ceph.com/ceph-rpm-centos6-x86_64-basic/ref/next/RPMS/x86_64/

Any testing and early feedback is greatly appreciated.

Thanks!
sage


Statistics / Nagios

2012-11-25 Thread Stefan Priebe

Hello list,

are there any useful statistics for ceph? 'ceph -s' shows just the status,
but I would like to record something which gives me an idea about the
ceph load or usage.


Greets,
Stefan


Re: syncfs via syscall does not work

2012-11-25 Thread Stefan Priebe

On 25.11.2012 00:20, Sage Weil wrote:

On Sat, 24 Nov 2012, Stefan Priebe wrote:

On 24.11.2012 18:16, Sage Weil wrote:

On Sat, 24 Nov 2012, Stefan Priebe - Profihost AG wrote:

At the machine compiling? Yes!


Hmm, I just tested on my wonky wheezy machine (glibc 2.13, 3.2) and
syscall(SYS_syncfs, ..) works for me.  But the #define is in libc6-dev:


But that means your glibc supports it. Then we don't need the syscall at
all.

To me it's only present in the kernel headers (linux-libc-dev) like
this:

[pb64: ~]# grep -r syncfs /usr/include
/usr/include/asm/unistd_64.h:#define __NR_syncfs  306
/usr/include/asm/unistd_64.h:__SYSCALL(__NR_syncfs, sys_syncfs)
/usr/include/asm/unistd_32.h:#define __NR_syncfs 344
/usr/include/asm-generic/unistd.h:#define __NR_syncfs 267
/usr/include/asm-generic/unistd.h:__SYSCALL(__NR_syncfs, sys_syncfs)


Can try wip-syncfs?  It checks for either SYS_syncfs or __NR_syncfs.
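
A minimal sketch of that kind of fallback (not the actual wip-syncfs code; it
only assumes that at least one of the two macros is defined by the installed
headers):

  #define _GNU_SOURCE
  #include <sys/syscall.h>
  #include <unistd.h>

  #if !defined(SYS_syncfs) && defined(__NR_syncfs)
  #define SYS_syncfs __NR_syncfs
  #endif

  static int do_syncfs(int fd)
  {
  #ifdef SYS_syncfs
    return syscall(SYS_syncfs, fd);  /* raw syscall; works even if glibc has no syncfs() wrapper */
  #else
    sync();                          /* fall back to syncing everything */
    return 0;
  #endif
  }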


Since your latest patch it looks good to me:
starting osd.11 at :/0 osd_data /ceph/osd.11/ /dev/sdb1
2012-11-25 19:37:15.686305 7f155e1db780  0 filestore(/ceph/osd.11/) 
mount FIEMAP ioctl is supported and appears to work
2012-11-25 19:37:15.686313 7f155e1db780  0 filestore(/ceph/osd.11/) 
mount FIEMAP ioctl is disabled via 'filestore fiemap' config option
2012-11-25 19:37:15.686500 7f155e1db780  0 filestore(/ceph/osd.11/) 
mount did NOT detect btrfs
2012-11-25 19:37:15.687850 7f155e1db780  0 filestore(/ceph/osd.11/) 
mount syscall(__NR_syncfs, fd) fully supported
2012-11-25 19:37:15.687899 7f155e1db780  0 filestore(/ceph/osd.11/) 
mount found snaps 
2012-11-25 19:37:15.689973 7f155e1db780  0 filestore(/ceph/osd.11/) 
mount: enabling WRITEAHEAD journal mode: btrfs not detected

2012-11-25 19:37:15.700405 7f155e1db780  0 journal  kernel version is 3.6.7
2012-11-25 19:37:15.710407 7f155e1db780  0 journal  kernel version is 3.6.7

YES! No more need to build a custom patched glibc.

Stefan


Re: Redundancy when mount with multiple mon IP

2012-11-25 Thread Gregory Farnum
On Sunday, November 25, 2012 at 9:30 AM, Drunkard Zhang wrote:
 2012/11/26 Sage Weil s...@inktank.com:
  On Sun, 25 Nov 2012, Drunkard Zhang wrote:
   I'm using ceph-0.51. I setup 3 monitors. then mount with 3 mon IP at
   another host with either kernel mode or fuse, neither give me
   redundancy.

   Here's commands I used at client (IP: 10.0.0.2) side;
   mount -t ceph log3,log21,squid86-log12:/ /mnt/bc
   ceph-fuse -m log3,log21,squid86-log12 /mnt/bc

   When mounted, read/write is OK; then add an iptables rule at log3 to
   REJECT/DROP packets from client 10.0.0.2, and operations at the client get
   stuck due to an IO problem.

   Related stuck processes on client look like this:
   log1 ~ # ps auwx |g bc
   root 1325 0.0 0.0 120100 3536 pts/0 Sl 23:10 0:00
   ceph-fuse -m log3,log21,squid86-log12 /mnt/bc
   root 1404 0.0 0.0 16192 700 pts/0 S 23:10 0:00 ls
   --color=auto /mnt/bc
   
   
   
  Can you repeat the test with '--debug-monc 20 --debug-ms 1 --log-file 
  /tmp/foo'
  on the command line and attach the resulting log?
   
  Thanks!
  sage
  
  
 Thanks for the hint, I think the problem is found: I rejected the client
 at mon.log3, which also acts as an mds, and it's up and active; another
 mds is up and standby. So the client cannot connect to the mds server? Is
 it possible to get more than one mds up and active simultaneously, and
 let the client know that?

The active MDS servers partition the namespace between them — clients need to 
be able to communicate with them all. You can configure which MDS is active (at 
start-up) if you like; unfortunately the best reference is still in the wiki: 
http://ceph.com/deprecated/Standby-replay_modes
-Greg
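
(For what it's worth, the knob that controls how many MDS daemons are active
at once is max_mds; a hedged example for this era, worth double-checking
against your version's help output:

  ceph mds set_max_mds 2

Daemons beyond that count stay up:standby, which is the behaviour the wiki
page above describes.)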



Re: Statistics / Nagios

2012-11-25 Thread Gregory Farnum
On Sunday, November 25, 2012 at 10:26 AM, Stefan Priebe wrote:
 Hello list,
 
 are there any useful statistics for ceph? 'ceph -s' shows just the status,
 but I would like to record something which gives me an idea about the
 ceph load or usage.

Each daemon has an admin socket which you can use to extract perfcounters 
(performance counters). It's not well-documented at all right now, 
unfortunately.
ceph --admin-daemon /path/to/admin/socket help
ceph --admin-daemon /path/to/admin/socket perfcounters_dump

ceph --admin-daemon /path/to/admin/socket perfcounters_schema

should get you started.
http://ceph.com/docs/master/radosgw/troubleshooting has some examples of 
troubleshooting the gateway with them.
-Greg



Re: Statistics / Nagios

2012-11-25 Thread Stefan Priebe

Hi Greg,

On 25.11.2012 20:08, Gregory Farnum wrote:

On Sunday, November 25, 2012 at 10:26 AM, Stefan Priebe wrote:

Hello list,

are there any useful statistics for ceph? 'ceph -s' shows just the status,
but I would like to record something which gives me an idea about the
ceph load or usage.


Each daemon has an admin socket which you can use to extract perfcounters 
(performance counters). It's not well-documented at all right now, unfortunately.
ceph --admin-daemon /path/to/admin/socket help
ceph --admin-daemon /path/to/admin/socket perfcounters_dump

ceph --admin-daemon /path/to/admin/socket perfcounters_schema

should get you started.
http://ceph.com/docs/master/radosgw/troubleshooting has some examples of 
troubleshooting the gateway with them.


thanks! I don't use radosgw. Just ceph-mon and ceph-osd with rbd block 
devices.


So I have to query each OSD on its own?

Right now i have
[global]
  admin socket=/var/run/ceph.sock

but that gives:
# ceph --admin-daemon /var/run/ceph.sock perfcounters_schema
{}
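
A hedged sketch of the usual setup (it assumes the $name metavariable is
expanded per daemon, as in the stock ceph.conf examples): give every daemon
its own socket path and query each one separately, e.g.

  [global]
          admin socket = /var/run/ceph/$name.asok

and then, on the host running osd.11:

  ceph --admin-daemon /var/run/ceph/osd.11.asok perfcounters_dump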

Greets,
Stefan


Re: Hi + question

2012-11-25 Thread Joao Eduardo Luis
On 11/25/2012 07:02 PM, Roald van Loon wrote:
 Like I said, I've been toying with ceph for an hour or two now, and one of
 the things I noticed was that setting up my test env and getting to
 know ceph wasn't all that easy to do (gentoo btw). Things like ceph -h
 not showing the setcrushmap option, the 'ceph osd crush set' only
 returning '(22) Invalid argument' etc etc... So, I decided to take a
 real good look at the sources - any dev's number one source for
 documentation :-) There I... well.. kind of got lost. Cpp files,
 devops tools (python tools, shell scripts, perl scripts), fooclass,
  barclass of course, valgrind supps, ... all in ~/src :-)
 
 So I was curious if I was the only one finding this a bit confusing.
 If not, I was wondering if you have some clean-up subproject running
 somewhere. Might be a nice way for me to really familiarize myself
 with the code base.
 
 Let me know,
 
 Roald

Welcome Roald!

I must say I can relate to that feeling, from back when I started
working with ceph. But I guess that most of my early confusion was
basically due to not being aware where the stuff I wanted was located.

Nowadays I find the source layout fairly consistent with its
purpose. See, in src/ you will mostly find the daemons and the tools; in
mon/ you will find the files that are mostly mon-related, and same goes
for most of the other subdirectories (osd/, mds/, crush/,...). There are
a couple of exceptions, such as global/ or common/, but those by
themselves are pretty self-explanatory ;-)

To be honest, the only thing that keeps annoying me is that the build
process will end up leaving all the objects, libraries and binaries in
src/ after the build, but that's one of the things I've learned to live
with without trying to find an alternative (shame!).

If you need pointers on how to get familiarized with the code base, or
any doubts arise, just let us know and we'd be pleased to point you in
the right direction. You might also find it useful to join us on #ceph
at OFTC.

And, fwiw, the 'ceph' tool itself (which is on tools/, btw), won't
provide you the insight you're looking for. It mostly sends commands
directly to the monitor, and the monitor will send replies the tool will
output. What you are looking for, in this case, can be found on
mon/OSDMonitor.cc


Cheers,
  -Joao





Re: Cephfs losing files and corrupting others

2012-11-25 Thread Nathan Howell
So when trawling through the filesystem doing checksum validation
these popped up on the files that are filled with null bytes:
https://gist.github.com/186ad4c5df816d44f909

Is there any way to fsck today? Looks like feature #86
http://tracker.newdream.net/issues/86 isn't implemented yet.

thanks,
-n

On Thu, Nov 22, 2012 at 11:37 PM, Nathan Howell
nathan.d.how...@gmail.com wrote:
 I upgraded to 0.54 and now there are some hints in the logs. The
 directories referenced in the log entries are now missing:

 2012-11-23 07:28:04.802864 mds.0 [ERR] loaded dup inode 100662f
 [2,head] v3851654 at /xxx/20120203, but inode 100662f.head
 v3853093 already exists at ~mds0/stray7/100662f
 2012-11-23 07:28:04.802889 mds.0 [ERR] loaded dup inode 1003a4b
 [2,head] v431518 at /xxx/20120206, but inode 1003a4b.head v3853192
 already exists at ~mds0/stray8/1003a4b
 2012-11-23 07:28:04.802909 mds.0 [ERR] loaded dup inode 100149e
 [2,head] v431522 at /xxx/20120207, but inode 100149e.head v3853206
 already exists at ~mds0/stray8/100149e
 2012-11-23 07:28:04.802927 mds.0 [ERR] loaded dup inode 1000a5f
 [2,head] v431526 at /xxx/20120208, but inode 1000a5f.head v3853208
 already exists at ~mds0/stray8/1000a5f

 Any ideas?

 On Thu, Nov 15, 2012 at 11:00 AM, Nathan Howell
 nathan.d.how...@gmail.com wrote:
 Yes, successfully written files were disappearing. We switched to ceph-fuse
 and haven't seen any files truncated since. Older files (written months ago)
 are still having their entire contents replaced with NULL bytes, seemly at
 random. I can't yet say for sure this has happened since switching over to
 fuse... but we think it has.

 I'm going to test all of the archives over the next few days and restore
 them from S3, so we should be back in a known-good state after that. In the
 event more files end up corrupted, is there any logging that I can enable
 that would help track down the problem?

 thanks,
 -n


 On Sat, Nov 3, 2012 at 9:54 AM, Gregory Farnum g...@inktank.com wrote:

 On Fri, Nov 2, 2012 at 12:30 AM, Nathan Howell
 nathan.d.how...@gmail.com wrote:
  On Thu, Nov 1, 2012 at 3:32 PM, Sam Lang sam.l...@inktank.com wrote:
  Do the writes succeed?  I.e. the programs creating the files don't get
  errors back?  Are you seeing any problems with the ceph mds or osd
  processes
  crashing?  Can you describe your I/O workload during these bulk loads?
  How
  many files, how much data, multiple clients writing, etc.
 
  As far as I know, there haven't been any fixes to 0.48.2 to resolve
  problems
  like yours.  You might try the ceph fuse client to see if you get the
  same
  behavior.  If not, then at least we have narrowed down the problem to
  the
  ceph kernel client.
 
  Yes, the writes succeed. Wednesday's failure looked like this:
 
  1) rsync 100-200mb tarball directly into ceph from a remote site
  2) untar ~500 files from tarball in ceph into a new directory in ceph
  3) wait for a while
  4) the .tar file and some log files disappeared but the untarred files
  were fine

 Just to be clear, you copied a tarball into Ceph and untarred all in
 Ceph, and the extracted contents were fine but the tarball
 disappeared? So this looks like a case of successfully-written files
 disappearing?
 Did you at any point check the tarball from a machine other than the
 initial client that copied it in?

 This truncation sounds like maybe Yan's fix will deal with it. But if
 you've also seen files with the proper size but be empty or corrupted,
 that sounds like an OSD bug. Sam, are you aware of any btrfs issues
 that could cause this?

 Nathan, you've also seen parts of the filesystem hierarchy get lost?
 That's rather more concerning; under what circumstances have you seen
 that?
 -Greg

  Total filesystem size is:
 
  pgmap v2221244: 960 pgs: 960 active+clean; 2418 GB data, 7293 GB used,
  6151 GB / 13972 GB avail
 
  Generally our load looks like:
 
  Constant trickle of 1-2mb files from 3 machines, about 1GB per day
  total. No file is written to by more than 1 machine, but the files go
  into shared directories.
 
  Grid jobs are running constantly and are doing sequential reads from
  the filesystem. Compute nodes have the filesystem mounted read-only.
  They're primarily located at a remote site (~40ms away) and tend to
  average 1-2 megabits/sec.
 
  Nightly data jobs load in ~10GB from a few remote sites in to 10
  large files. These are split up into about 1000 smaller files but the
  originals are also kept. All of this is done on one machine. The
  journals and osd drives are write saturated while this is going on.
 
 
  On Thu, Nov 1, 2012 at 4:02 PM, Gregory Farnum g...@inktank.com wrote:
  Are you using hard links, by any chance?
 
  No, we are using a handfull of soft links though.
 
 
  Do you have one or many MDS systems?
 
  ceph mds stat says: e686: 1/1/1 up {0=xxx=up:active}, 2 up:standby
 
 
  What filesystem are you using on your OSDs?
 
  btrfs
 
 
  thanks,
 

Re: Hangup during scrubbing - possible solutions

2012-11-25 Thread Andrey Korolyov
On Fri, Nov 23, 2012 at 12:35 AM, Sage Weil s...@inktank.com wrote:
 On Thu, 22 Nov 2012, Andrey Korolyov wrote:
 Hi,

 In recent versions Ceph introduces some unexpected behavior for
 permanent connections (VM or kernel clients) - after crash
 recovery, I/O will hang on the next planned scrub in the following
 scenario:

 - launch a bunch of clients doing non-intensive writes,
 - lose one or more osd, mark them down, wait for recovery completion,
 - do a slow scrub, e.g. scrubbing one osd per 5m, inside bash script,
 or wait for ceph to do the same,
 - observe a rising number of pgs stuck in the active+clean+scrubbing
 state (they took a master role from ones which was on killed osd and
 almost surely they are being written in time of crash),
 - some time later, clients will hang hard and the ceph log reports
 stuck (old) I/O requests.

 The only way to bring clients back without losing their I/O state
 is a per-osd restart, which also helps to get rid of
 active+clean+scrubbing pgs.

 First of all, I`ll be happy to help to solve this problem by providing
 logs.

 If you can reproduce this behavior with 'debug osd = 20' and 'debug ms =
 1' logging on the OSD, that would be wonderful!


I have tested a slightly different recovery flow, please see below.
Although this time there is no real harm like frozen I/O, placement groups
were again stuck forever in the active+clean+scrubbing state, until I
restarted all osds (end of the log):

http://xdel.ru/downloads/ceph-log/recover-clients-later-than-osd.txt.gz

- start the healthy cluster
- start persistent clients
- add another host with a pair of OSDs, let them take part in the data placement
- wait for data to rearrange
- [22:06 timestamp] mark OSDs out or simply kill them and wait (since I
have a 1/2 hour readjust delay in that case, I did ``ceph osd
out'' manually)
- watch for data to rearrange again
- [22:51 timestamp] when it ends, start a manual rescrub (a sketch of such
a loop is below); this leaves a non-zero number of placement groups in the
active+clean+scrubbing state at the end of the process, and they will stay in
that state forever until something happens
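
A sketch of the slow rescrub loop mentioned above (hedged: the osd ids and the
five-minute pause are placeholders; only the plain 'ceph osd scrub' command is
assumed):

  #!/bin/bash
  # scrub one osd at a time, pausing five minutes between them
  for osd in 0 1 2 3; do        # list your osd ids here
      ceph osd scrub $osd
      sleep 300
  done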

After that, I can restart osds one by one if I want to get rid of the
scrubbing states immediately and then do a deep-scrub (if I don't, those
states will return at the next ceph self-scrub), or do a per-osd
deep-scrub if I have a lot of time. The case I described in the
previous message took place when I removed an osd from the data placement
that existed at the moment the client(s) started, and indeed it
is more harmful than the current one (frozen I/O leads to the entire
guest hanging, for example). Since testing this flow takes a lot of time, I'll
send the logs related to that case tomorrow.

 A second question is not directly related to this problem, but I
 have thought about it for a long time - are there planned features to
 control the scrub process more precisely, e.g. pg scrub rate or scheduled
 scrub, instead of the current set of timeouts which are of course not very
 predictable as to when they run?

 Not yet.  I would be interested in hearing what kind of control/config
 options/whatever you (and others) would like to see!

Of course it would be awesome to have a deterministic scheduler, or at
least an option to disable automated scrubbing, since it is not very
predictable in time and deep-scrub eats a lot of I/O if the command is
issued against an entire OSD. Rate limiting is not the first priority, since
it can at least be recreated in an external script, but for those who prefer
to leave control to Ceph it may be very useful.

Thanks!


Re: [PATCH] mds: sort dentries when committing dir fragment

2012-11-25 Thread Sage Weil
I pushed an alternative approach to wip-tmap.

This sorting is an artifact of tmap's crummy implementation, and the mds 
workaround will need to get reverted when we switch to omap.  Instead, fix 
tmap so that it will tolerate unsorted keys.  (Also, drop the ENOENT on rm 
on missing key.)

Eventually we can deprecate and remove tmap entirely...

What do you think?
sage


On Sun, 25 Nov 2012, Yan, Zheng wrote:

 [...]

Re: [PATCH] mds: sort dentries when committing dir fragment

2012-11-25 Thread Yan, Zheng
On 11/26/2012 06:32 AM, Sage Weil wrote:
 I pushed an alternative approach to wip-tmap.
 
 This sorting is an artifact of tmap's crummy implementation, and the mds 
 workaround will need to get reverted when we switch to omap.  Instead, fix 
 tmap so that it will tolerate unsorted keys.  (Also, drop the ENOENT on rm 
 on missing key.)
 
 Eventually we can deprecate and remove tmap entirely...
 
 What do you think?

This approach is cleaner than mine. But I think your fix isn't enough, because
the MDS may hand a tmap that contains misordered items to the TMAPPUT method,
and misordered items will confuse future TMAPUP operations. The fix is either
to sort the items when handling TMAPPUT, or to search forward for any
potentially misordered items when TMAP_SET wants to add a new item or TMAP_RM
fails to find an item.
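
A rough sketch of the first option, using plain std types rather than the
actual OSD/bufferlist code (the function name and types here are illustrative
only): decode whatever order the client sent, let a std::map impose the
ordering, then re-encode from the map.

  #include <map>
  #include <string>
  #include <utility>
  #include <vector>

  typedef std::vector<std::pair<std::string, std::string> > kv_list;

  // 'items' is the decoded TMAPPUT payload in whatever order the client sent
  // it. Rebuilding it through a std::map emits the keys in ascending order,
  // so a later TMAPUP can rely on sorted input regardless of what the MDS did.
  kv_list sort_tmap_items(const kv_list &items)
  {
    std::map<std::string, std::string> sorted(items.begin(), items.end());
    return kv_list(sorted.begin(), sorted.end());
  }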

Regards
Yan, Zheng

 sage
 
 
  [...]