osd crash after reboot

2012-12-14 Thread Stefan Priebe

Hello list,

after a reboot of my node i see this on all OSDs of this node after the 
reboot:


2012-12-14 09:03:20.393224 7f8e652f8780 -1 osd/OSD.cc: In function 
'OSDMapRef OSDService::get_map(epoch_t)' thread 7f8e652f8780 time 
2012-12-14 09:03:20.392528

osd/OSD.cc: 4385: FAILED assert(_get_map_bl(epoch, bl))

 ceph version 0.55-239-gc951c27 (c951c270a42b94b6f269992c9001d90f70a2b824)
 1: (OSDService::get_map(unsigned int)+0x918) [0x607f78]
 2: (OSD::load_pgs()+0x13ed) [0x6168ad]
 3: (OSD::init()+0xaff) [0x617a5f]
 4: (main()+0x2de6) [0x55a416]
 5: (__libc_start_main()+0xfd) [0x7f8e63093c8d]
 6: /usr/bin/ceph-osd() [0x557269]
 NOTE: a copy of the executable, or `objdump -rdS executable` is 
needed to interpret this.


--- begin dump of recent events ---
   -29 2012-12-14 09:03:20.266349 7f8e652f8780  5 asok(0x285c000) 
register_command perfcounters_dump hook 0x2850010
   -28 2012-12-14 09:03:20.266366 7f8e652f8780  5 asok(0x285c000) 
register_command 1 hook 0x2850010
   -27 2012-12-14 09:03:20.266369 7f8e652f8780  5 asok(0x285c000) 
register_command perf dump hook 0x2850010
   -26 2012-12-14 09:03:20.266379 7f8e652f8780  5 asok(0x285c000) 
register_command perfcounters_schema hook 0x2850010
   -25 2012-12-14 09:03:20.266383 7f8e652f8780  5 asok(0x285c000) 
register_command 2 hook 0x2850010
   -24 2012-12-14 09:03:20.266386 7f8e652f8780  5 asok(0x285c000) 
register_command perf schema hook 0x2850010
   -23 2012-12-14 09:03:20.266389 7f8e652f8780  5 asok(0x285c000) 
register_command config show hook 0x2850010
   -22 2012-12-14 09:03:20.266392 7f8e652f8780  5 asok(0x285c000) 
register_command config set hook 0x2850010
   -21 2012-12-14 09:03:20.266396 7f8e652f8780  5 asok(0x285c000) 
register_command log flush hook 0x2850010
   -20 2012-12-14 09:03:20.266398 7f8e652f8780  5 asok(0x285c000) 
register_command log dump hook 0x2850010
   -19 2012-12-14 09:03:20.266401 7f8e652f8780  5 asok(0x285c000) 
register_command log reopen hook 0x2850010
   -18 2012-12-14 09:03:20.267686 7f8e652f8780  0 ceph version 
0.55-239-gc951c27 (c951c270a42b94b6f269992c9001d90f70a2b824), process 
ceph-osd, pid 7212
   -17 2012-12-14 09:03:20.268738 7f8e652f8780  1 finished 
global_init_daemonize
   -16 2012-12-14 09:03:20.275957 7f8e652f8780  0 
filestore(/ceph/osd.1/) mount FIEMAP ioctl is supported and appears to work
   -15 2012-12-14 09:03:20.275968 7f8e652f8780  0 
filestore(/ceph/osd.1/) mount FIEMAP ioctl is disabled via 'filestore 
fiemap' config option
   -14 2012-12-14 09:03:20.276177 7f8e652f8780  0 
filestore(/ceph/osd.1/) mount did NOT detect btrfs
   -13 2012-12-14 09:03:20.277051 7f8e652f8780  0 
filestore(/ceph/osd.1/) mount syscall(__NR_syncfs, fd) fully supported
   -12 2012-12-14 09:03:20.277585 7f8e652f8780  0 
filestore(/ceph/osd.1/) mount found snaps 
   -11 2012-12-14 09:03:20.278899 7f8e652f8780  0 
filestore(/ceph/osd.1/) mount: enabling WRITEAHEAD journal mode: btrfs 
not detected
   -10 2012-12-14 09:03:20.290745 7f8e652f8780  0 journal  kernel 
version is 3.6.10
-9 2012-12-14 09:03:20.320728 7f8e652f8780  0 journal  kernel 
version is 3.6.10
-8 2012-12-14 09:03:20.328381 7f8e652f8780  0 
filestore(/ceph/osd.1/) mount FIEMAP ioctl is supported and appears to work
-7 2012-12-14 09:03:20.328391 7f8e652f8780  0 
filestore(/ceph/osd.1/) mount FIEMAP ioctl is disabled via 'filestore 
fiemap' config option
-6 2012-12-14 09:03:20.328574 7f8e652f8780  0 
filestore(/ceph/osd.1/) mount did NOT detect btrfs
-5 2012-12-14 09:03:20.329579 7f8e652f8780  0 
filestore(/ceph/osd.1/) mount syscall(__NR_syncfs, fd) fully supported
-4 2012-12-14 09:03:20.329612 7f8e652f8780  0 
filestore(/ceph/osd.1/) mount found snaps 
-3 2012-12-14 09:03:20.330786 7f8e652f8780  0 
filestore(/ceph/osd.1/) mount: enabling WRITEAHEAD journal mode: btrfs 
not detected
-2 2012-12-14 09:03:20.340711 7f8e652f8780  0 journal  kernel 
version is 3.6.10
-1 2012-12-14 09:03:20.370707 7f8e652f8780  0 journal  kernel 
version is 3.6.10
 0 2012-12-14 09:03:20.393224 7f8e652f8780 -1 osd/OSD.cc: In 
function 'OSDMapRef OSDService::get_map(epoch_t)' thread 7f8e652f8780 
time 2012-12-14 09:03:20.392528

osd/OSD.cc: 4385: FAILED assert(_get_map_bl(epoch, bl))

 ceph version 0.55-239-gc951c27 (c951c270a42b94b6f269992c9001d90f70a2b824)
 1: (OSDService::get_map(unsigned int)+0x918) [0x607f78]
 2: (OSD::load_pgs()+0x13ed) [0x6168ad]
 3: (OSD::init()+0xaff) [0x617a5f]
 4: (main()+0x2de6) [0x55a416]
 5: (__libc_start_main()+0xfd) [0x7f8e63093c8d]
 6: /usr/bin/ceph-osd() [0x557269]
 NOTE: a copy of the executable, or `objdump -rdS executable` is 
needed to interpret this.


Stefan
--
To unsubscribe from this list: send the line unsubscribe ceph-devel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: osd crash after reboot

2012-12-14 Thread Stefan Priebe

same log more verbose:
11 ec=10 les/c 3307/3307 3306/3306/3306) [] r=0 lpr=0 lcod 0'0 mlcod 0'0 
inactive] read_log done
   -11 2012-12-14 09:17:50.648572 7fb6e0d6b780 10 osd.3 pg_epoch: 3996 
pg[3.44b( v 3988'3969 (1379'2968,3988'3969] local-les=3307 n=11 ec=10 
les/c 3307/3307 3306/3306/3306) [3,12] r=0 lpr=0 lcod 0'0 mlcod 0'0 
inactive] handle_loaded
   -10 2012-12-14 09:17:50.648581 7fb6e0d6b780 20 osd.3 pg_epoch: 3996 
pg[3.44b( v 3988'3969 (1379'2968,3988'3969] local-les=3307 n=11 ec=10 
les/c 3307/3307 3306/3306/3306) [3,12] r=0 lpr=0 lcod 0'0 mlcod 0'0 
inactive] exit Initial 0.015080 0 0.00
-9 2012-12-14 09:17:50.648591 7fb6e0d6b780 20 osd.3 pg_epoch: 3996 
pg[3.44b( v 3988'3969 (1379'2968,3988'3969] local-les=3307 n=11 ec=10 
les/c 3307/3307 3306/3306/3306) [3,12] r=0 lpr=0 lcod 0'0 mlcod 0'0 
inactive] enter Reset
-8 2012-12-14 09:17:50.648599 7fb6e0d6b780 20 osd.3 pg_epoch: 3996 
pg[3.44b( v 3988'3969 (1379'2968,3988'3969] local-les=3307 n=11 ec=10 
les/c 3307/3307 3306/3306/3306) [3,12] r=0 lpr=0 lcod 0'0 mlcod 0'0 
inactive] set_last_peering_reset 3996
-7 2012-12-14 09:17:50.648609 7fb6e0d6b780 10 osd.3 4233 load_pgs 
loaded pg[3.44b( v 3988'3969 (1379'2968,3988'3969] local-les=3307 n=11 
ec=10 les/c 3307/3307 3306/3306/3306) [3,12] r=0 lpr=3996 lcod 0'0 mlcod 
0'0 inactive] log(1379'2968,3988'3969]
-6 2012-12-14 09:17:50.648649 7fb6e0d6b780 15 
filestore(/ceph/osd.3/) collection_getattr /ceph/osd.3//current/0.1_head 
'info'
-5 2012-12-14 09:17:50.648664 7fb6e0d6b780 10 
filestore(/ceph/osd.3/) collection_getattr /ceph/osd.3//current/0.1_head 
'info' = 5
-4 2012-12-14 09:17:50.648672 7fb6e0d6b780 20 osd.3 0 get_map 3316 
- loading and decoding 0x2943e00
-3 2012-12-14 09:17:50.648678 7fb6e0d6b780 15 
filestore(/ceph/osd.3/) read meta/a09ec88/osdmap.3316/0//-1 0~0
-2 2012-12-14 09:17:50.648705 7fb6e0d6b780 10 
filestore(/ceph/osd.3/) error opening file 
/ceph/osd.3//current/meta/DIR_8/DIR_8/osdmap.3316__0_0A09EC88__none with 
flags=0 and mode=0: (2) No such file or directory
-1 2012-12-14 09:17:50.648722 7fb6e0d6b780 10 
filestore(/ceph/osd.3/) FileStore::read(meta/a09ec88/osdmap.3316/0//-1) 
open error: (2) No such file or directory
 0 2012-12-14 09:17:50.649586 7fb6e0d6b780 -1 osd/OSD.cc: In 
function 'OSDMapRef OSDService::get_map(epoch_t)' thread 7fb6e0d6b780 
time 2012-12-14 09:17:50.648733

osd/OSD.cc: 4385: FAILED assert(_get_map_bl(epoch, bl))

 ceph version 0.55-239-gc951c27 (c951c270a42b94b6f269992c9001d90f70a2b824)
 1: (OSDService::get_map(unsigned int)+0x918) [0x607f78]
 2: (OSD::load_pgs()+0x13ed) [0x6168ad]
 3: (OSD::init()+0xaff) [0x617a5f]
 4: (main()+0x2de6) [0x55a416]
 5: (__libc_start_main()+0xfd) [0x7fb6deb06c8d]
 6: /usr/bin/ceph-osd() [0x557269]
 NOTE: a copy of the executable, or `objdump -rdS executable` is 
needed to interpret this.


--- logging levels ---
   0/ 5 none
   0/ 0 lockdep
   0/ 0 context
   0/ 0 crush
   1/ 5 mds
   1/ 5 mds_balancer
   1/ 5 mds_locker
   1/ 5 mds_log
   1/ 5 mds_log_expire
   1/ 5 mds_migrator
   0/ 0 buffer
   0/ 0 timer
   0/ 1 filer
   0/ 1 striper
   0/ 1 objecter
   0/ 5 rados
   0/ 5 rbd
   0/20 journaler
   0/ 5 objectcacher
   0/ 5 client
   0/20 osd
   0/ 0 optracker
   0/ 0 objclass
   0/20 filestore
   0/20 journal
   0/ 0 ms
   1/ 5 mon
   0/ 0 monc
   0/ 5 paxos
   0/ 0 tp
   0/ 0 auth
   1/ 5 crypto
   0/ 0 finisher
   0/ 0 heartbeatmap
   0/ 0 perfcounter
   1/ 5 rgw
   1/ 5 hadoop
   1/ 5 javaclient
   0/ 0 asok
   0/ 0 throttle
  -2/-2 (syslog threshold)
  -1/-1 (stderr threshold)
  max_recent10
  max_new 1000
  log_file /var/log/ceph/ceph-osd.3.log
--- end dump of recent events ---
2012-12-14 09:17:50.714676 7fb6e0d6b780 -1 *** Caught signal (Aborted) **
 in thread 7fb6e0d6b780

 ceph version 0.55-239-gc951c27 (c951c270a42b94b6f269992c9001d90f70a2b824)
 1: /usr/bin/ceph-osd() [0x7a1889]
 2: (()+0xeff0) [0x7fb6e0750ff0]
 3: (gsignal()+0x35) [0x7fb6deb1a1b5]
 4: (abort()+0x180) [0x7fb6deb1cfc0]
 5: (__gnu_cxx::__verbose_terminate_handler()+0x115) [0x7fb6df3aedc5]
 6: (()+0xcb166) [0x7fb6df3ad166]
 7: (()+0xcb193) [0x7fb6df3ad193]
 8: (()+0xcb28e) [0x7fb6df3ad28e]
 9: (ceph::__ceph_assert_fail(char const*, char const*, int, char 
const*)+0x7c9) [0x805659]

 10: (OSDService::get_map(unsigned int)+0x918) [0x607f78]
 11: (OSD::load_pgs()+0x13ed) [0x6168ad]
 12: (OSD::init()+0xaff) [0x617a5f]
 13: (main()+0x2de6) [0x55a416]
 14: (__libc_start_main()+0xfd) [0x7fb6deb06c8d]
 15: /usr/bin/ceph-osd() [0x557269]
 NOTE: a copy of the executable, or `objdump -rdS executable` is 
needed to interpret this.


--- begin dump of recent events ---
 0 2012-12-14 09:17:50.714676 7fb6e0d6b780 -1 *** Caught signal 
(Aborted) **

 in thread 7fb6e0d6b780

 ceph version 0.55-239-gc951c27 (c951c270a42b94b6f269992c9001d90f70a2b824)
 1: /usr/bin/ceph-osd() [0x7a1889]
 2: (()+0xeff0) [0x7fb6e0750ff0]
 3: (gsignal()+0x35) [0x7fb6deb1a1b5]
 4: (abort()+0x180) [0x7fb6deb1cfc0]
 5: 

Re: Debian packaging question

2012-12-14 Thread James Page
-BEGIN PGP SIGNED MESSAGE-
Hash: SHA256

On 14/12/12 04:38, Gary Lowell wrote:
 I think that the --debbuildopts '-j8 -b' might be trouncing
 the
 - --binary-arch flag - I'll get pbuilder setup and give it a
 test - I normally use sbuild (for which the packaging changes
 did have the desired effect).
 
 It does appear to have been the --debbuildopts causing the problem.
 The combination of your rules file update and removing
 --debbuildopts  built just the arch dependent packages.

Good - I'd not managed to actually test this out (I was struggling
with pbuilder).

 Thanks for the help.

No problem

Cheers

James

- -- 
James Page
Ubuntu Core Developer
Debian Maintainer
james.p...@ubuntu.com
-BEGIN PGP SIGNATURE-
Version: GnuPG v1.4.11 (GNU/Linux)
Comment: Using GnuPG with undefined - http://www.enigmail.net/

iQIcBAEBCAAGBQJQyutzAAoJEL/srsug59jDr9wQAIMFuAUVeFhp1O+yJshBqnY2
7pY4rz1sw7/qjbklIe9ZSlOKy6tSTLEnW0a2qBhGKgsD/5bvhS8QoSUPeLtWOYwY
Z+om/Z7+voKD1cbbez+qc/sD/uJa6+B8aql3uuSCZwlrESbVUwsRSR+jCsUQwEEh
njipllOmYUZSvpHheFMGEjkbZBsVUz4f84mmEh1vSRXyV/4Xvt9AMHWHnnvkazbK
i/qaRgOF1ux/Jag1z5XowhB6/ugvCiRwhHONuxijchFcWNxQ5j4Y7pfTeZENKO2I
7H6N9W/tucgCyo+EQ3v7+DYpzAxiZK2qJG+ZWSIGUGxUZfcVzmXPAVkibwOCiQ+i
+e7pg0ncdMLlbQoAW1c1prNa5kKFKrnh/lDN+YYOVt1v0ckLHjBev256AWUiSKmw
PoFOewVqb5ESvWepqhjwoM+SnKLsgXvz1NEvFZBx1xKojxmaLtrh1j3IzJiQWY/Z
QG9eyYLkhlxBur2dxksUAk0EwdbiQBWQp32y4hdg2+tUvuZoFisf3rmzu4eZCE7B
FRECb0hZ4K8EN0VOdiCHt/+2abYJ3rZoKDlpVkqrQYHPYML9Wign6O79SU9oGvUn
ZqoV1wnvPUK6Ze92QYDdMkZ1lMXf2YM5ySxgayMjboKeV9wysy6tvns0MO2on9tn
qLO6iJb++3FP4L7wJFOG
=oil3
-END PGP SIGNATURE-
--
To unsubscribe from this list: send the line unsubscribe ceph-devel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] implement librados aio_stat

2012-12-14 Thread Giannakos Filippos

Hi team,

I forgot to include a description (also cc-ing correctly the 
synnefo-devel list).


I am a member of the Synnefo team, where we are experimenting with RADOS 
as a storage backend to host blocks for our volume block storage named 
archipelago.


In this patch I implement aio stat and also export this functionality to 
the C API.


On 12/14/2012 01:18 PM, Filippos Giannakos wrote:

---
  src/include/rados/librados.h   |   14 ++
  src/include/rados/librados.hpp |   15 +-
  src/librados/IoCtxImpl.cc  |   42 
  src/librados/IoCtxImpl.h   |9 +
  src/librados/librados.cc   |   10 ++
  5 files changed, 89 insertions(+), 1 deletion(-)

diff --git a/src/include/rados/librados.h b/src/include/rados/librados.h
index 44d6f71..7f4b5c0 100644
--- a/src/include/rados/librados.h
+++ b/src/include/rados/librados.h
@@ -1444,6 +1444,20 @@ int rados_aio_read(rados_ioctx_t io, const char *oid,
   */
  int rados_aio_flush(rados_ioctx_t io);

+
+/**
+ * Asynchronously get object stats (size/mtime)
+ *
+ * @param io ioctx
+ * @param o object name
+ * @param psize where to store object size
+ * @param pmtime where to store modification time
+ * @returns 0 on success, negative error code on failure
+ */
+int rados_aio_stat(rados_ioctx_t io, const char *o,
+  rados_completion_t completion,
+  uint64_t *psize, time_t *pmtime);
+
  /** @} Asynchronous I/O */

  /**
diff --git a/src/include/rados/librados.hpp b/src/include/rados/librados.hpp
index e50acdb..96bfc15 100644
--- a/src/include/rados/librados.hpp
+++ b/src/include/rados/librados.hpp
@@ -473,9 +473,22 @@ namespace librados
   * other than CEPH_NOSNAP
   */
  int aio_remove(const std::string &oid, AioCompletion *c);
-
+
  int aio_flush();

+/**
+ * Asynchronously get object stats (size/mtime)
+ *
+ * @param io ioctx
+ * @param o object name
+ * @param psize where to store object size
+ * @param pmtime where to store modification time
+ * @returns 0 on success, negative error code on failure
+ */
+int rados_aio_stat(rados_ioctx_t io, const char *o,
+  rados_completion_t completion,
+  uint64_t *psize, time_t *pmtime);
+
  int aio_exec(const std::string &oid, AioCompletion *c, const char *cls, 
const char *method,
 bufferlist &inbl, bufferlist *outbl);

diff --git a/src/librados/IoCtxImpl.cc b/src/librados/IoCtxImpl.cc
index 01b4a94..50aab1e 100644
--- a/src/librados/IoCtxImpl.cc
+++ b/src/librados/IoCtxImpl.cc
@@ -851,6 +851,21 @@ int librados::IoCtxImpl::aio_remove(const object_t &oid, 
AioCompletionImpl *c)
return 0;
  }

+
+int librados::IoCtxImpl::aio_stat(const object_t &oid, AioCompletionImpl *c,
+ uint64_t *psize, time_t *pmtime)
+{
+  c->io = this;
+  C_aio_stat_Ack *onack = new C_aio_stat_Ack(c, pmtime);
+
+  Mutex::Locker l(*lock);
+  objecter->stat(oid, oloc,
+snap_seq, psize, &onack->mtime, 0,
+onack, &c->objver);
+
+  return 0;
+}
+
  int librados::IoCtxImpl::remove(const object_t &oid)
  {
utime_t ut = ceph_clock_now(client->cct);
@@ -1562,6 +1577,33 @@ void librados::IoCtxImpl::C_aio_Ack::finish(int r)
c->put_unlock();
  }

+/ C_aio_stat_Ack 
+
+librados::IoCtxImpl::C_aio_stat_Ack::C_aio_stat_Ack(AioCompletionImpl *_c,
+   time_t *pm)
+   : c(_c), pmtime(pm)
+{
+  c->get();
+}
+
+void librados::IoCtxImpl::C_aio_stat_Ack::finish(int r)
+{
+  c->lock.Lock();
+  c->rval = r;
+  c->ack = true;
+  c->cond.Signal();
+
+  if (r >= 0 && pmtime) {
+*pmtime = mtime.sec();
+  }
+
+  if (c->callback_complete) {
+c->io->client->finisher.queue(new C_AioComplete(c));
+  }
+
+  c->put_unlock();
+}
+
  /// C_aio_sparse_read_Ack //

  
librados::IoCtxImpl::C_aio_sparse_read_Ack::C_aio_sparse_read_Ack(AioCompletionImpl
 *_c,
diff --git a/src/librados/IoCtxImpl.h b/src/librados/IoCtxImpl.h
index feea0e8..55b07ee 100644
--- a/src/librados/IoCtxImpl.h
+++ b/src/librados/IoCtxImpl.h
@@ -144,6 +144,14 @@ struct librados::IoCtxImpl {
  C_aio_Ack(AioCompletionImpl *_c);
  void finish(int r);
};
+
+  struct C_aio_stat_Ack : public Context {
+librados::AioCompletionImpl *c;
+time_t *pmtime;
+utime_t mtime;
+C_aio_stat_Ack(AioCompletionImpl *_c, time_t *pm);
+void finish(int r);
+  };

struct C_aio_sparse_read_Ack : public Context {
  AioCompletionImpl *c;
@@ -177,6 +185,7 @@ struct librados::IoCtxImpl {
   int aio_remove(const object_t &oid, AioCompletionImpl *c);
   int aio_exec(const object_t &oid, AioCompletionImpl *c, const char *cls,
    const char *method, bufferlist &inbl, bufferlist *outbl);
+  int aio_stat(const object_t &oid, AioCompletionImpl *c, uint64_t *psize, 
time_t 

RE: Usage of CEPH FS versa HDFS for Hadoop: TeraSort benchmark performance comparison issue

2012-12-14 Thread Lachfeld, Jutta
Hi Noah, Gregory and Sage,

first of all, thanks for your quick replies. Here are some answers to your 
questions.

Gregory, I have got the output of ceph -s before and after this specific 
TeraSort run, and to me it looks ok; all 30 osds are up:

   health HEALTH_OK
   monmap e1: 1 mons at {0=192.168.111.18:6789/0}, election epoch 0, quorum 0 0
   osdmap e22: 30 osds: 30 up, 30 in
pgmap v13688: 5760 pgs: 5760 active+clean; 1862 GB data, 1868 GB used, 6142 
GB / 8366 GB avail
   mdsmap e4: 1/1/1 up {0=0=up:active}

   health HEALTH_OK
   monmap e1: 1 mons at {0=192.168.111.18:6789/0}, election epoch 0, quorum 0 0
   osdmap e22: 30 osds: 30 up, 30 in
pgmap v19657: 5760 pgs: 5760 active+clean; 1862 GB data, 1868 GB used, 6142 
GB / 8366 GB avail
   mdsmap e4: 1/1/1 up {0=0=up:active}

I do not have the full output of ceph pg dump for that specific TeraSort run, 
but here is a typical output after automatically preparing CEPH for a benchmark 
run
 (removed almost all lines in the long pg_stat table hoping that you do not 
need them):

dumped all in format plain
version 403
last_osdmap_epoch 22
last_pg_scan 1
full_ratio 0.95
nearfull_ratio 0.85
pg_stat objects mip degrunf bytes   log disklog state   
state_stamp v   reportedup  acting  last_scrub  
scrub_stamp
2.314   0   0   0   0   0   0   0   active+clean
2012-12-14 08:31:24.524152  0'0 11'17   [23,7]  [23,7]  0'0 
2012-12-14 08:31:24.524096
0.316   0   0   0   0   0   0   0   active+clean
2012-12-14 08:25:12.780643  0'0 11'19   [23][23]0'0 
2012-12-14 08:24:08.394930
1.317   0   0   0   0   0   0   0   active+clean
2012-12-14 08:27:56.400997  0'0 3'17[11,17] [11,17] 0'0 
2012-12-14 08:27:56.400953
[...]
pool 0  1   0   0   0   4   136 136
pool 1  21  0   0   0   23745   55185518
pool 2  0   0   0   0   0   0   0
 sum22  0   0   0   23749   56545654
osdstat kbused  kbavail kb  hb in   hb out
0   2724279808588   292420608   
[3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29] 
[]
1   2892279808588   292420608   
[3,4,5,6,8,9,11,12,13,14,15,16,17,18,20,22,24,25,26,27,28]  []
2   2844279808588   292420608   
[3,4,5,6,7,8,9,10,11,12,13,15,16,17,18,19,20,22,23,24,25,26,27,29]  []
3   2716279808588   292420608   
[0,1,2,6,7,8,9,10,11,12,13,14,15,16,17,19,20,22,23,24,25,26,27,28,29]   []
4   2556279808588   292420608   
[1,2,7,8,9,12,13,14,15,16,17,18,19,20,21,22,24,25,26,27,28,29]  []
5   2856279808584   292420608   
[0,2,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,28,29]  []
6   2840279808584   292420608   
[0,1,2,3,4,5,9,10,11,12,13,14,15,16,17,18,19,20,22,24,25,26,27,28,29]   []
7   2604279808588   292420608   
[1,2,3,4,5,9,10,11,12,13,15,17,18,19,20,21,23,24,25,26,27,28,29][]
8   2564279808588   292420608   
[1,2,3,4,5,9,10,11,12,14,16,17,18,19,20,21,22,23,24,25,27,28,29][]
9   2804279808588   292420608   
[1,2,3,4,5,6,8,12,13,14,15,16,17,18,19,20,21,22,23,24,26,27,29] []
10  2556279808588   292420608   
[0,1,2,4,5,6,7,8,12,13,14,15,16,17,19,20,21,22,23,24,25,26,27,28]   []
11  3084279808588   292420608   
[0,1,2,3,4,5,6,7,8,12,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29]  []
12  2572279808588   292420608   
[0,1,2,3,4,5,7,8,10,11,15,16,18,20,21,22,23,24,27,28,29][]
13  2912279808560   292420608   
[0,1,2,3,5,6,7,8,9,10,11,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29]  []
14  2992279808584   292420608   
[1,2,3,4,5,6,7,8,9,10,11,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29]  []
15  2652279808588   292420608   
[1,2,3,4,5,6,7,8,9,10,11,13,14,19,20,21,22,23,25,26,27,28,29]   []
16  3028279808588   292420608   
[0,1,2,3,5,6,7,8,9,10,11,12,14,18,20,21,22,24,25,26,27,28,29]   []
17  2772279808588   292420608   
[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,18,19,21,22,23,24,25,26,27,28,29]   []
18  2804279808588   292420608   
[0,1,2,3,5,6,8,9,10,11,12,14,15,16,17,21,22,23,24,25,26,27,29]  []
19  2620279808588   292420608   
[0,1,2,3,4,5,6,7,8,10,11,12,13,14,15,16,17,21,22,23,25,26,27,28,29] []
20  2956279808588   292420608   
[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,21,22,23,24,25,27,29]  []
21  2876279808588   292420608   
[0,1,2,3,4,5,6,8,9,10,12,13,15,16,17,18,19,20,24,25,26,27,29]   []
22  3044279808588   292420608   
[1,2,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,24,25,26,27,28,29][]
23  2752279808584   

Re: osd crash after reboot

2012-12-14 Thread Dennis Jacobfeuerborn
On 12/14/2012 10:14 AM, Stefan Priebe wrote:
 One more IMPORTANT note. This might happen due to the fact that a disk was
 missing (disk failure) after the reboot.
 
 fstab and mountpoint are working with UUIDs so they match but the journal
 block device:
 osd journal  = /dev/sde1
 
 didn't match anymore - as the numbers got renumbered due to the failed disk.
 Is there a way to use some kind of UUIDs here too for journal?

You should be able to use /dev/disk/by-uuid/* instead. That should give you
a stable view of the filesystems.

Regards,
  Dennis

--
To unsubscribe from this list: send the line unsubscribe ceph-devel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Usage of CEPH FS versa HDFS for Hadoop: TeraSort benchmark performance comparison issue

2012-12-14 Thread Mark Nelson

On 12/13/2012 08:54 AM, Lachfeld, Jutta wrote:

Hi all,


Hi!  Sorry to send this a bit late, it looks like the reply I authored 
yesterday from my phone got eaten by vger.




I am currently doing some comparisons between CEPH FS and HDFS as a file system 
for Hadoop using Hadoop's integrated benchmark TeraSort. This benchmark first 
generates the specified amount of data in the file system used by Hadoop, e.g. 
1TB of data, and then sorts the data via the MapReduce framework of Hadoop, 
sending the sorted output again to the file system used by Hadoop.  The 
benchmark measures the elapsed time of a sort run.

I am wondering about my best result achieved with CEPH FS in comparison to the 
ones achieved with HDFS. With CEPH, the runtime of the benchmark is somewhat 
longer, the factor is about 1.2 when comparing with an HDFS run using the 
default HDFS block size of 64MB. When comparing with an HDFS run using an HDFS 
block size of 512MB the factor is even 1.5.

Could you please take a look at the configuration, perhaps some key factor 
already catches your eye, e.g. CEPH version.

OS: SLES 11 SP2


Beyond what the others have said, this could be an issue.  If I recall, 
that's an older version of SLES and won't have syncfs support in glibc 
(you need 2.14+).  In newer versions of Ceph you can still use syncfs if 
your kernel is new enough (2.6.38+), but in 0.48 you need support for it 
in glibc too.  This will have a performance impact, especially if you 
have more than one OSD per server.




CEPH:
OSDs are distributed over several machines.
There is 1 MON and 1 MDS process on yet another machine.

Replication of the data pool is set to 1.
Underlying file systems for data are btrfs.


What kernel are you using?  If it's older, this could also be an issue. 
 We've seen pretty bad btrfs fragmentation on older kernels that seems 
to be related to degradation in performance over time.



Mount options  are only rw,noatime.
For each CEPH OSD, we use a RAM disk of 256MB for the journal.
Package ceph has version 0.48-13.1, package ceph-fuse has version 0.48-13.1.

HDFS:
HDFS is distributed over the same machines.
HDFS name node on yet another machine.

Replication level is set to 1.
HDFS block size is set to  64MB or even 512MB.
Underlying file systems for data are btrfs.
Mount options are only rw,noatime.


The large block size may be an issue (at least with some of our default 
tunable settings).  You might want to try 4 or 16MB and see if it's any 
better or worse.




Hadoop version is 1.0.3.
Applied the CEPH patch for Hadoop that was generated with 0.20.205.0.
The same maximum number of Hadoop map tasks has been used for HDFS and for CEPH 
FS.

The same disk partitions are either formatted for HDFS or for CEPH usage.

CPU usage in both cases is almost 100 percent on all data related nodes.


If you run sysprof, you can probably get an idea of where the time is 
being spent.  perf sort of works but doesn't seem to report ceph-osd 
symbols properly.



There is enough memory on all nodes for the joint load of ceph-osd and Hadoop 
java processes.

Best regards,

Jutta Lachfeld.

--
jutta.lachf...@ts.fujitsu.com, Fujitsu Technology Solutions PBG PDG ESS SWE SOL 4, 
Infrastructure Solutions, MchD 5B, Tel. ..49-89-3222-2705, Company Details: 
http://de.ts.fujitsu.com/imprint

--
To unsubscribe from this list: send the line unsubscribe ceph-devel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html



--
To unsubscribe from this list: send the line unsubscribe ceph-devel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: osd crash after reboot

2012-12-14 Thread Mark Nelson

On 12/14/2012 08:52 AM, Dennis Jacobfeuerborn wrote:

On 12/14/2012 10:14 AM, Stefan Priebe wrote:

One more IMPORTANT note. This might happen due to the fact that a disk was
missing (disk failure) after the reboot.

fstab and mountpoint are working with UUIDs so they match but the journal
block device:
osd journal  = /dev/sde1

didn't match anymore - as the numbers got renumbered due to the failed disk.
Is there a way to use some kind of UUIDs here too for journal?


You should be able to use /dev/disk/by-uuid/* instead. That should give you
a stable view of the filesystems.


I often map partitions to something in /dev/disk/by-partlabel and use 
those in my ceph.conf files.  that way disks can be remapped behind the 
scenes and the ceph configuration doesn't have to change even if disks 
get replaced.




Regards,
   Dennis

--
To unsubscribe from this list: send the line unsubscribe ceph-devel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html



--
To unsubscribe from this list: send the line unsubscribe ceph-devel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: osd crash after reboot

2012-12-14 Thread Stefan Priebe - Profihost AG

Hello Dennis,

Am 14.12.2012 15:52, schrieb Dennis Jacobfeuerborn:

didn't match anymore - as the numbers got renumbered due to the failed disk.
Is there a way to use some kind of UUIDs here too for journal?


You should be able to use /dev/disk/by-uuid/* instead. That should give you
a stable view of the filesystems.


Good idea but there are only listed partitions with UUIDs. When the 
journal is using directly the partition it does not have a UUID.


But this reminded me of /dev/disk/by-id and that works fine. I'm now 
using the wwn Number.


Greets,
Stefan
--
To unsubscribe from this list: send the line unsubscribe ceph-devel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: osd crash after reboot

2012-12-14 Thread Mark Nelson

Hi Stefan,

Here's what I often do when I have a journal and data partition sharing 
a disk:


sudo parted -s -a optimal /dev/$DEV mklabel gpt
sudo parted -s -a optimal /dev/$DEV mkpart osd-device-$i-journal 0% 10G
sudo parted -s -a optimal /dev/$DEV mkpart osd-device-$i-data 10G 100%

Mark

On 12/14/2012 09:11 AM, Stefan Priebe - Profihost AG wrote:

Hi Mark,

but do i set a label for a partition without FS like the journal blockdev?
Am 14.12.2012 16:01, schrieb Mark Nelson:

I often map partitions to something in /dev/disk/by-partlabel and use
those in my ceph.conf files.  that way disks can be remapped behind the
scenes and the ceph configuration doesn't have to change even if disks
get replaced.


Greets,
Stefan


--
To unsubscribe from this list: send the line unsubscribe ceph-devel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: osd crash after reboot

2012-12-14 Thread Stefan Priebe - Profihost AG

Hi Mark,

Am 14.12.2012 16:20, schrieb Mark Nelson:

sudo parted -s -a optimal /dev/$DEV mklabel gpt
sudo parted -s -a optimal /dev/$DEV mkpart osd-device-$i-journal 0% 10G
sudo parted -s -a optimal /dev/$DEV mkpart osd-device-$i-data 10G 100%


My disks are gpt too and i'm also using parted. But i don't want to 
recreate my partitions. I haven't seen a way in parted to set such a 
label later.


Greets,
Stefan
--
To unsubscribe from this list: send the line unsubscribe ceph-devel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


OSDMonitor: don't allow creation of pools with > 65535 pgs

2012-12-14 Thread Jim Schutt
Hi,

I'm looking at commit e3ed28eb2 in the next branch,
and I have a question.

Shouldn't the limit be pg_num > 65536, because
PGs are numbered 0 thru pg_num-1?

If not, what am I missing?

FWIW, up through yesterday I've been using the next branch and this:

  ceph osd pool set data pg_num 65536 --allow-experimental-feature
  ceph osd pool set metadata pg_num 65536 --allow-experimental-feature
  ceph osd pool set data pgp_num 65536 --allow-experimental-feature
  ceph osd pool set metadata pgp_num 65536 --allow-experimental-feature

using cephfs clients, and have seen no trouble with
misdirected ops, etc.

-- Jim

--
To unsubscribe from this list: send the line unsubscribe ceph-devel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: osd crash after reboot

2012-12-14 Thread Stefan Priebe - Profihost AG

Hello Mark,

Am 14.12.2012 16:20, schrieb Mark Nelson:

sudo parted -s -a optimal /dev/$DEV mklabel gpt
sudo parted -s -a optimal /dev/$DEV mkpart osd-device-$i-journal 0% 10G
sudo parted -s -a optimal /dev/$DEV mkpart osd-device-$i-data 10G 100%


Isn't that the part type you're using?
mkpart part-type start-mb end-mb

I like your idea and i think it's a good one but i want to know why this 
works. part-type isn't FS label...


Greets,
Stefan
--
To unsubscribe from this list: send the line unsubscribe ceph-devel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: rbd map command hangs for 15 minutes during system start up

2012-12-14 Thread Alex Elder
On 12/13/2012 01:00 PM, Nick Bartos wrote:
 Here's another log with the kernel debugging enabled:
 https://gist.github.com/raw/4278697/1c9e41d275e614783fbbdee8ca5842680f46c249/rbd-hang-1355424455.log
 
 Note that it hung on the 2nd try.

Just to make sure I'm working with the right code base, can
you confirm that you're using a kernel built with the equivalent
of what's now in the wip-nick-newer branch (commit id 1728893)?


Also, looking at this log I don't think I see any rbd debug output.
Does that make sense to you?

How are you activating debugging to get these messages?
If it includes something like:

echo module libceph +p > /sys/kernel/debug/dynamic_debug/control

it might be that you need to also do:

echo module rbd +p > /sys/kernel/debug/dynamic_debug/control

This information would be helpful in providing some more context
about what rbd is doing that's leading to the various messaging
activity I see in this log.

Please send me a log with that info if you are able to produce
one.  Thanks a lot.

-Alex
--
To unsubscribe from this list: send the line unsubscribe ceph-devel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: rbd map command hangs for 15 minutes during system start up

2012-12-14 Thread Nick Bartos
The kernel is 3.5.7 with the following patches applied (and in the
order specified below):

001-libceph_eliminate_connection_state_DEAD_13_days_ago.patch
002-libceph_kill_bad_proto_ceph_connection_op_13_days_ago.patch
003-libceph_rename_socket_callbacks_13_days_ago.patch
004-libceph_rename_kvec_reset_and_kvec_add_functions_13_days_ago.patch
005-libceph_embed_ceph_messenger_structure_in_ceph_client_13_days_ago.patch
006-libceph_start_separating_connection_flags_from_state_13_days_ago.patch
007-libceph_start_tracking_connection_socket_state_13_days_ago.patch
008-libceph_provide_osd_number_when_creating_osd_13_days_ago.patch
009-libceph_set_CLOSED_state_bit_in_con_init_13_days_ago.patch
010-libceph_embed_ceph_connection_structure_in_mon_client_13_days_ago.patch
011-libceph_drop_connection_refcounting_for_mon_client_13_days_ago.patch
012-libceph_init_monitor_connection_when_opening_13_days_ago.patch
013-libceph_fully_initialize_connection_in_con_init_13_days_ago.patch
014-libceph_tweak_ceph_alloc_msg_13_days_ago.patch
015-libceph_have_messages_point_to_their_connection_13_days_ago.patch
016-libceph_have_messages_take_a_connection_reference_13_days_ago.patch
017-libceph_make_ceph_con_revoke_a_msg_operation_13_days_ago.patch
018-libceph_make_ceph_con_revoke_message_a_msg_op_13_days_ago.patch
019-libceph_fix_overflow_in___decode_pool_names_13_days_ago.patch
020-libceph_fix_overflow_in_osdmap_decode_13_days_ago.patch
021-libceph_fix_overflow_in_osdmap_apply_incremental_13_days_ago.patch
022-libceph_transition_socket_state_prior_to_actual_connect_13_days_ago.patch
023-libceph_fix_NULL_dereference_in_reset_connection_13_days_ago.patch
024-libceph_use_con_get_put_methods_13_days_ago.patch
025-libceph_drop_ceph_con_get_put_helpers_and_nref_member_13_days_ago.patch
026-libceph_encapsulate_out_message_data_setup_13_days_ago.patch
027-libceph_encapsulate_advancing_msg_page_13_days_ago.patch
028-libceph_don_t_mark_footer_complete_before_it_is_13_days_ago.patch
029-libceph_move_init_bio__functions_up_13_days_ago.patch
030-libceph_move_init_of_bio_iter_13_days_ago.patch
031-libceph_don_t_use_bio_iter_as_a_flag_13_days_ago.patch
032-libceph_SOCK_CLOSED_is_a_flag_not_a_state_13_days_ago.patch
033-libceph_don_t_change_socket_state_on_sock_event_13_days_ago.patch
034-libceph_just_set_SOCK_CLOSED_when_state_changes_13_days_ago.patch
035-libceph_don_t_touch_con_state_in_con_close_socket_13_days_ago.patch
036-libceph_clear_CONNECTING_in_ceph_con_close_13_days_ago.patch
037-libceph_clear_NEGOTIATING_when_done_13_days_ago.patch
038-libceph_define_and_use_an_explicit_CONNECTED_state_13_days_ago.patch
039-libceph_separate_banner_and_connect_writes_13_days_ago.patch
040-libceph_distinguish_two_phases_of_connect_sequence_13_days_ago.patch
041-libceph_small_changes_to_messenger.c_13_days_ago.patch
042-libceph_add_some_fine_ASCII_art_13_days_ago.patch
043-libceph_set_peer_name_on_con_open_not_init_13_days_ago.patch
044-libceph_initialize_mon_client_con_only_once_13_days_ago.patch
045-libceph_allow_sock_transition_from_CONNECTING_to_CLOSED_13_days_ago.patch
046-libceph_initialize_msgpool_message_types_13_days_ago.patch
047-libceph_prevent_the_race_of_incoming_work_during_teardown_13_days_ago.patch
048-libceph_report_socket_read_write_error_message_13_days_ago.patch
049-libceph_fix_mutex_coverage_for_ceph_con_close_13_days_ago.patch
050-libceph_resubmit_linger_ops_when_pg_mapping_changes_12_days_ago.patch
051-libceph_re_initialize_bio_iter_on_start_of_message_receive_28_hours_ago.patch
052-libceph_protect_ceph_con_open_with_mutex_28_hours_ago.patch
053-libceph_reset_connection_retry_on_successfully_negotiation_28_hours_ago.patch
054-libceph_fix_fault_locking_close_socket_on_lossy_fault_28_hours_ago.patch
055-libceph_move_msgr_clear_standby_under_con_mutex_protection_28_hours_ago.patch
056-libceph_move_ceph_con_send_closed_check_under_the_con_mutex_28_hours_ago.patch
057-libceph_drop_gratuitous_socket_close_calls_in_con_work_28_hours_ago.patch
058-libceph_close_socket_directly_from_ceph_con_close_28_hours_ago.patch
059-libceph_drop_unnecessary_CLOSED_check_in_socket_state_change_callback_28_hours_ago.patch
060-libceph_replace_connection_state_bits_with_states_28_hours_ago.patch
061-libceph_clean_up_con_flags_28_hours_ago.patch
062-libceph_clear_all_flags_on_con_close_28_hours_ago.patch
063-libceph_fix_handling_of_immediate_socket_connect_failure_28_hours_ago.patch
064-libceph_revoke_mon_client_messages_on_session_restart_28_hours_ago.patch
065-libceph_verify_state_after_retaking_con_lock_after_dispatch_28_hours_ago.patch
066-libceph_avoid_dropping_con_mutex_before_fault_28_hours_ago.patch
067-libceph_change_ceph_con_in_msg_alloc_convention_to_be_less_weird_28_hours_ago.patch
068-libceph_recheck_con_state_after_allocating_incoming_message_28_hours_ago.patch
069-libceph_fix_crypto_key_null_deref_memory_leak_28_hours_ago.patch
070-libceph_delay_debugfs_initialization_until_we_learn_global_id_28_hours_ago.patch

Re: OSDMonitor: don't allow creation of pools with 65535 pgs

2012-12-14 Thread Joao Eduardo Luis

On 12/14/2012 03:41 PM, Jim Schutt wrote:

Hi,

I'm looking at commit e3ed28eb2 in the next branch,
and I have a question.

Shouldn't the limit be pg_num < 65536, because
PGs are numbered 0 thru pg_num-1?

If not, what am I missing?

FWIW, up through yesterday I've been using the next branch and this:

   ceph osd pool set data pg_num 65536 --allow-experimental-feature
   ceph osd pool set metadata pg_num 65536 --allow-experimental-feature
   ceph osd pool set data pgp_num 65536 --allow-experimental-feature
   ceph osd pool set metadata pgp_num 65536 --allow-experimental-feature

using cephfs clients, and have seen no trouble with
misdirected ops, etc.

-- Jim



Hi Jim,

To the best of my knowledge, one of the things that triggered the 
required hard cap on the number of pgs was that the kernel side is still 
limited to 16 bits, despite that on the osd side this is no longer true.


I'm not familiar with what's going on on the kernel side, but if there's 
a slight chance that we are indeed keeping the 'pg_num' on a 16-bit 
variable, then that value must be capped to 65535. If that's not the 
case and we're just limited by the pg's number/id, then I guess that 
accepting up to 65536 would be fine (0..65535).


Just in case I'll look into this and further implications.

Thanks.

  -Joao


--
To unsubscribe from this list: send the line unsubscribe ceph-devel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [EXTERNAL] Re: OSDMonitor: don't allow creation of pools with 65535 pgs

2012-12-14 Thread Jim Schutt
On 12/14/2012 09:59 AM, Joao Eduardo Luis wrote:
 On 12/14/2012 03:41 PM, Jim Schutt wrote:
 Hi,

 I'm looking at commit e3ed28eb2 in the next branch,
 and I have a question.

 Shouldn't the limit be pg_num < 65536, because
 PGs are numbered 0 thru pg_num-1?

 If not, what am I missing?

 FWIW, up through yesterday I've been using the next branch and this:

ceph osd pool set data pg_num 65536 --allow-experimental-feature
ceph osd pool set metadata pg_num 65536 --allow-experimental-feature
ceph osd pool set data pgp_num 65536 --allow-experimental-feature
ceph osd pool set metadata pgp_num 65536 --allow-experimental-feature

 using cephfs clients, and have seen no trouble with
 misdirected ops, etc.

 -- Jim

 
 Hi Jim,
 
 To the best of my knowledge, one of the things that triggered the
 required hard cap on the number of pgs was that the kernel side is
 still limited to 16 bits, despite that on the osd side this is no
 longer true.
 

I believe the culprit is the ps member of struct ceph_pg,
which stores what is eventually used as the PG id as __le16.

 I'm not familiar with what's going on on the kernel side, but if
 there's a slight chance that we are indeed keeping the 'pg_num' on a
 16-bit variable, then that value must be capped to 65535. If that's
 not the case and we're just limited by the pg's number/id, then I
 guess that accepting up to 65536 would be fine (0..65535).

struct ceph_pg_pool in the kernel stores pg_num and friends
as __le32.

 
 Just in case I'll look into this and further implications.

Cool, thanks.

-- Jim

 
 Thanks.
 
   -Joao
 
 
 
 


--
To unsubscribe from this list: send the line unsubscribe ceph-devel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: osd crash after reboot

2012-12-14 Thread Sage Weil
On Fri, 14 Dec 2012, Stefan Priebe wrote:
 One more IMPORTANT note. This might happen due to the fact that a disk was
 missing (disk failure) after the reboot.
 
 fstab and mountpoint are working with UUIDs so they match but the journal
 block device:
 osd journal  = /dev/sde1
 
 didn't match anymore - as the numbers got renumbered due to the failed disk. Is
 there a way to use some kind of UUIDs here too for journal?

I think others have addressed the uuid question, but one note:

The ceph-osd process has an internal uuid/fingerprint on the journal and 
data dir, and will refuse to start if they don't match.

sage


 
 Stefan
 
 Am 14.12.2012 09:22, schrieb Stefan Priebe:
  same log more verbose:
  11 ec=10 les/c 3307/3307 3306/3306/3306) [] r=0 lpr=0 lcod 0'0 mlcod 0'0
  inactive] read_log done
  -11 2012-12-14 09:17:50.648572 7fb6e0d6b780 10 osd.3 pg_epoch: 3996
  pg[3.44b( v 3988'3969 (1379'2968,3988'3969] local-les=3307 n=11 ec=10
  les/c 3307/3307 3306/3306/3306) [3,12] r=0 lpr=0 lcod 0'0 mlcod 0'0
  inactive] handle_loaded
  -10 2012-12-14 09:17:50.648581 7fb6e0d6b780 20 osd.3 pg_epoch: 3996
  pg[3.44b( v 3988'3969 (1379'2968,3988'3969] local-les=3307 n=11 ec=10
  les/c 3307/3307 3306/3306/3306) [3,12] r=0 lpr=0 lcod 0'0 mlcod 0'0
  inactive] exit Initial 0.015080 0 0.00
   -9 2012-12-14 09:17:50.648591 7fb6e0d6b780 20 osd.3 pg_epoch: 3996
  pg[3.44b( v 3988'3969 (1379'2968,3988'3969] local-les=3307 n=11 ec=10
  les/c 3307/3307 3306/3306/3306) [3,12] r=0 lpr=0 lcod 0'0 mlcod 0'0
  inactive] enter Reset
   -8 2012-12-14 09:17:50.648599 7fb6e0d6b780 20 osd.3 pg_epoch: 3996
  pg[3.44b( v 3988'3969 (1379'2968,3988'3969] local-les=3307 n=11 ec=10
  les/c 3307/3307 3306/3306/3306) [3,12] r=0 lpr=0 lcod 0'0 mlcod 0'0
  inactive] set_last_peering_reset 3996
   -7 2012-12-14 09:17:50.648609 7fb6e0d6b780 10 osd.3 4233 load_pgs
  loaded pg[3.44b( v 3988'3969 (1379'2968,3988'3969] local-les=3307 n=11
  ec=10 les/c 3307/3307 3306/3306/3306) [3,12] r=0 lpr=3996 lcod 0'0 mlcod
  0'0 inactive] log(1379'2968,3988'3969]
   -6 2012-12-14 09:17:50.648649 7fb6e0d6b780 15
  filestore(/ceph/osd.3/) collection_getattr /ceph/osd.3//current/0.1_head
  'info'
   -5 2012-12-14 09:17:50.648664 7fb6e0d6b780 10
  filestore(/ceph/osd.3/) collection_getattr /ceph/osd.3//current/0.1_head
  'info' = 5
   -4 2012-12-14 09:17:50.648672 7fb6e0d6b780 20 osd.3 0 get_map 3316
  - loading and decoding 0x2943e00
   -3 2012-12-14 09:17:50.648678 7fb6e0d6b780 15
  filestore(/ceph/osd.3/) read meta/a09ec88/osdmap.3316/0//-1 0~0
   -2 2012-12-14 09:17:50.648705 7fb6e0d6b780 10
  filestore(/ceph/osd.3/) error opening file
  /ceph/osd.3//current/meta/DIR_8/DIR_8/osdmap.3316__0_0A09EC88__none with
  flags=0 and mode=0: (2) No such file or directory
   -1 2012-12-14 09:17:50.648722 7fb6e0d6b780 10
  filestore(/ceph/osd.3/) FileStore::read(meta/a09ec88/osdmap.3316/0//-1)
  open error: (2) No such file or directory
0 2012-12-14 09:17:50.649586 7fb6e0d6b780 -1 osd/OSD.cc: In
  function 'OSDMapRef OSDService::get_map(epoch_t)' thread 7fb6e0d6b780
  time 2012-12-14 09:17:50.648733
  osd/OSD.cc: 4385: FAILED assert(_get_map_bl(epoch, bl))
  
ceph version 0.55-239-gc951c27 (c951c270a42b94b6f269992c9001d90f70a2b824)
1: (OSDService::get_map(unsigned int)+0x918) [0x607f78]
2: (OSD::load_pgs()+0x13ed) [0x6168ad]
3: (OSD::init()+0xaff) [0x617a5f]
4: (main()+0x2de6) [0x55a416]
5: (__libc_start_main()+0xfd) [0x7fb6deb06c8d]
6: /usr/bin/ceph-osd() [0x557269]
NOTE: a copy of the executable, or `objdump -rdS executable` is
  needed to interpret this.
  
  --- logging levels ---
  0/ 5 none
  0/ 0 lockdep
  0/ 0 context
  0/ 0 crush
  1/ 5 mds
  1/ 5 mds_balancer
  1/ 5 mds_locker
  1/ 5 mds_log
  1/ 5 mds_log_expire
  1/ 5 mds_migrator
  0/ 0 buffer
  0/ 0 timer
  0/ 1 filer
  0/ 1 striper
  0/ 1 objecter
  0/ 5 rados
  0/ 5 rbd
  0/20 journaler
  0/ 5 objectcacher
  0/ 5 client
  0/20 osd
  0/ 0 optracker
  0/ 0 objclass
  0/20 filestore
  0/20 journal
  0/ 0 ms
  1/ 5 mon
  0/ 0 monc
  0/ 5 paxos
  0/ 0 tp
  0/ 0 auth
  1/ 5 crypto
  0/ 0 finisher
  0/ 0 heartbeatmap
  0/ 0 perfcounter
  1/ 5 rgw
  1/ 5 hadoop
  1/ 5 javaclient
  0/ 0 asok
  0/ 0 throttle
 -2/-2 (syslog threshold)
 -1/-1 (stderr threshold)
 max_recent10
 max_new 1000
 log_file /var/log/ceph/ceph-osd.3.log
  --- end dump of recent events ---
  2012-12-14 09:17:50.714676 7fb6e0d6b780 -1 *** Caught signal (Aborted) **
in thread 7fb6e0d6b780
  
ceph version 0.55-239-gc951c27 (c951c270a42b94b6f269992c9001d90f70a2b824)
1: /usr/bin/ceph-osd() [0x7a1889]
2: (()+0xeff0) [0x7fb6e0750ff0]
3: (gsignal()+0x35) [0x7fb6deb1a1b5]
4: (abort()+0x180) [0x7fb6deb1cfc0]
5: 

Re: osd crash after reboot

2012-12-14 Thread Stefan Priebe

Hi Sage,

this was just an idea and i need to fix MY uuid problem. But then the 
crash is still a problem of ceph. Have you looked into my log?

Am 14.12.2012 20:42, schrieb Sage Weil:

On Fri, 14 Dec 2012, Stefan Priebe wrote:

One more IMPORTANT note. This might happen due to the fact that a disk was
missing (disk failure) after the reboot.

fstab and mountpoint are working with UUIDs so they match but the journal
block device:
osd journal  = /dev/sde1

didn't match anymore - as the numbers got renumbered due to the failed disk. Is
there a way to use some kind of UUIDs here too for journal?


I think others have addressed the uuid question, but one note:

The ceph-osd process has an internal uuid/fingerprint on the journal and
data dir, and will refuse to start if they don't match.


Stefan
--
To unsubscribe from this list: send the line unsubscribe ceph-devel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


ceph-client/testing branch force-updated again

2012-12-14 Thread Alex Elder
I have updated the testing branch in the ceph-client git
repository again, and you'll find that a forced update is
needed to bring your own repository up to date.

This will probably be necessary again at some point once
we get some reviews done on commits still in this branch,
but we'll try not to make a habit of it...

-Alex
--
To unsubscribe from this list: send the line unsubscribe ceph-devel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/9] rbd: do not allow remove of mounted-on image

2012-12-14 Thread Sage Weil
Reviewed-by: Sage Weil s...@inktank.com

On Thu, 13 Dec 2012, Alex Elder wrote:

 There is no check in rbd_remove() to see if anybody holds open the
 image being removed.  That's not cool.
 
 Add a simple open count that goes up and down with opens and closes
 (releases) of the device, and don't allow an rbd image to be removed
 if the count is non-zero.
 
 Protect the updates of the open count value with ctl_mutex to ensure
 the underlying rbd device doesn't get removed while concurrently
 being opened.
 
 Signed-off-by: Alex Elder el...@inktank.com
 ---
  drivers/block/rbd.c |   13 +
  1 file changed, 13 insertions(+)
 
 diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
 index 842caf4..c7bf961 100644
 --- a/drivers/block/rbd.c
 +++ b/drivers/block/rbd.c
 @@ -235,6 +235,7 @@ struct rbd_device {
 
   /* sysfs related */
   struct device   dev;
 + unsigned long   open_count;
  };
 
  static DEFINE_MUTEX(ctl_mutex);/* Serialize 
 open/close/setup/teardown */
 @@ -309,8 +310,11 @@ static int rbd_open(struct block_device *bdev,
 fmode_t mode)
   if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
   return -EROFS;
 
 + mutex_lock_nested(ctl_mutex, SINGLE_DEPTH_NESTING);
   rbd_get_dev(rbd_dev);
   set_device_ro(bdev, rbd_dev-mapping.read_only);
 + rbd_dev-open_count++;
 + mutex_unlock(ctl_mutex);
 
   return 0;
  }
 @@ -319,7 +323,11 @@ static int rbd_release(struct gendisk *disk,
 fmode_t mode)
  {
   struct rbd_device *rbd_dev = disk-private_data;
 
 + mutex_lock_nested(ctl_mutex, SINGLE_DEPTH_NESTING);
 + rbd_assert(rbd_dev-open_count  0);
 + rbd_dev-open_count--;
   rbd_put_dev(rbd_dev);
 + mutex_unlock(ctl_mutex);
 
   return 0;
  }
 @@ -3745,6 +3753,11 @@ static ssize_t rbd_remove(struct bus_type *bus,
   goto done;
   }
 
 + if (rbd_dev-open_count) {
 + ret = -EBUSY;
 + goto done;
 + }
 +
   rbd_remove_all_snaps(rbd_dev);
   rbd_bus_del_dev(rbd_dev);
 
 -- 
 1.7.9.5
 
 --
 To unsubscribe from this list: send the line unsubscribe ceph-devel in
 the body of a message to majord...@vger.kernel.org
 More majordomo info at  http://vger.kernel.org/majordomo-info.html
 
 
--
To unsubscribe from this list: send the line unsubscribe ceph-devel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 3/9] libceph: avoid using freed osd in __kick_osd_requests()

2012-12-14 Thread Sage Weil
Reviewed-by: Sage Weil s...@inktank.com

On Thu, 13 Dec 2012, Alex Elder wrote:

 If an osd has no requests and no linger requests, __reset_osd()
 will just remove it with a call to __remove_osd().  That drops
 a reference to the osd, and therefore the osd may have been free
 by the time __reset_osd() returns.  That function offers no
 indication this may have occurred, and as a result the osd will
 continue to be used even when it's no longer valid.
 
 Change__reset_osd() so it returns an error (ENODEV) when it
 deletes the osd being reset.  And change __kick_osd_requests() so it
 returns immediately (before referencing osd again) if __reset_osd()
 returns *any* error.
 
 Signed-off-by: Alex Elder el...@inktank.com
 ---
  net/ceph/osd_client.c |3 ++-
  1 file changed, 2 insertions(+), 1 deletion(-)
 
 diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
 index ac7be72..60c74c1 100644
 --- a/net/ceph/osd_client.c
 +++ b/net/ceph/osd_client.c
 @@ -581,7 +581,7 @@ static void __kick_osd_requests(struct
 ceph_osd_client *osdc,
 
   dout(__kick_osd_requests osd%d\n, osd-o_osd);
   err = __reset_osd(osdc, osd);
 - if (err == -EAGAIN)
 + if (err)
   return;
 
   list_for_each_entry(req, osd-o_requests, r_osd_item) {
 @@ -745,6 +745,7 @@ static int __reset_osd(struct ceph_osd_client *osdc,
 struct ceph_osd *osd)
   if (list_empty(osd-o_requests) 
   list_empty(osd-o_linger_requests)) {
   __remove_osd(osdc, osd);
 + ret = -ENODEV;
   } else if (memcmp(osdc-osdmap-osd_addr[osd-o_osd],
 osd-o_con.peer_addr,
 sizeof(osd-o_con.peer_addr)) == 0 
 -- 
 1.7.9.5
 
 --
 To unsubscribe from this list: send the line unsubscribe ceph-devel in
 the body of a message to majord...@vger.kernel.org
 More majordomo info at  http://vger.kernel.org/majordomo-info.html
 
 
--
To unsubscribe from this list: send the line unsubscribe ceph-devel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 4/9] rbd: get rid of RBD_MAX_SEG_NAME_LEN

2012-12-14 Thread Sage Weil
Reviewed-by: Sage Weil s...@inktank.com

On Thu, 13 Dec 2012, Alex Elder wrote:

 RBD_MAX_SEG_NAME_LEN represents the maximum length of an rbd object
 name (i.e., one of the objects providing storage backing an rbd
 image).
 
 Another symbol, MAX_OBJ_NAME_SIZE, is used in the osd client code to
 define the maximum length of any object name in an osd request.
 
 Right now they disagree, with RBD_MAX_SEG_NAME_LEN being too big.
 
 There's no real benefit at this point to defining the rbd object
 name length limit separate from any other object name, so just
 get rid of RBD_MAX_SEG_NAME_LEN and use MAX_OBJ_NAME_SIZE in its
 place.
 
 Signed-off-by: Alex Elder el...@inktank.com
 ---
  drivers/block/rbd.c   |6 +++---
  drivers/block/rbd_types.h |2 --
  2 files changed, 3 insertions(+), 5 deletions(-)
 
 diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
 index c7bf961..ce26b749 100644
 --- a/drivers/block/rbd.c
 +++ b/drivers/block/rbd.c
 @@ -740,13 +740,13 @@ static char *rbd_segment_name(struct rbd_device
 *rbd_dev, u64 offset)
   u64 segment;
   int ret;
 
 - name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
 + name = kmalloc(MAX_OBJ_NAME_SIZE + 1, GFP_NOIO);
   if (!name)
   return NULL;
   segment = offset >> rbd_dev->header.obj_order;
 - ret = snprintf(name, RBD_MAX_SEG_NAME_LEN, %s.%012llx,
 + ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, %s.%012llx,
   rbd_dev-header.object_prefix, segment);
 - if (ret < 0 || ret >= RBD_MAX_SEG_NAME_LEN) {
 + if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) {
   pr_err(error formatting segment name for #%llu (%d)\n,
   segment, ret);
   kfree(name);
 diff --git a/drivers/block/rbd_types.h b/drivers/block/rbd_types.h
 index cbe77fa..49d77cb 100644
 --- a/drivers/block/rbd_types.h
 +++ b/drivers/block/rbd_types.h
 @@ -46,8 +46,6 @@
  #define RBD_MIN_OBJ_ORDER   16
  #define RBD_MAX_OBJ_ORDER   30
 
 -#define RBD_MAX_SEG_NAME_LEN 128
 -
  #define RBD_COMP_NONE0
  #define RBD_CRYPT_NONE   0
 
 -- 
 1.7.9.5
 
 --
 To unsubscribe from this list: send the line unsubscribe ceph-devel in
 the body of a message to majord...@vger.kernel.org
 More majordomo info at  http://vger.kernel.org/majordomo-info.html
 
 
--
To unsubscribe from this list: send the line unsubscribe ceph-devel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 5/9] libceph: init osd-o_node in create_osd()

2012-12-14 Thread Sage Weil
We should drop this one, I think.  See upstream commit 
4c199a93a2d36b277a9fd209a0f2793f8460a215.  When we added the similar call 
on teh request tree it caused some noise in linux-next and then got 
removed.

sage

On Thu, 13 Dec 2012, Alex Elder wrote:

 It turns out to be harmless but the red-black node o_node in the
 ceph osd structure is not initialized in create_osd().  Add a
 call to rb_init_node() initialize it.
 
 Signed-off-by: Alex Elder el...@inktank.com
 ---
  net/ceph/osd_client.c |1 +
  1 file changed, 1 insertion(+)
 
 diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
 index 60c74c1..470816c 100644
 --- a/net/ceph/osd_client.c
 +++ b/net/ceph/osd_client.c
 @@ -642,6 +642,7 @@ static struct ceph_osd *create_osd(struct
 ceph_osd_client *osdc, int onum)
   atomic_set(osd-o_ref, 1);
   osd-o_osdc = osdc;
   osd-o_osd = onum;
 + rb_init_node(osd-o_node);
   INIT_LIST_HEAD(osd-o_requests);
   INIT_LIST_HEAD(osd-o_linger_requests);
   INIT_LIST_HEAD(osd-o_osd_lru);
 -- 
 1.7.9.5
 
 --
 To unsubscribe from this list: send the line unsubscribe ceph-devel in
 the body of a message to majord...@vger.kernel.org
 More majordomo info at  http://vger.kernel.org/majordomo-info.html
 
 
--
To unsubscribe from this list: send the line unsubscribe ceph-devel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] libceph: report connection fault with warning

2012-12-14 Thread Alex Elder
When a connection's socket disconnects, or if there's a protocol
error of some kind on the connection, a fault is signaled and
the connection is reset (closed and reopened, basically).  We
currently get an error message on the log whenever this occurs.

A ceph connection will attempt to reestablish a socket connection
repeatedly if a fault occurs.  This means that these error messages
will get repeatedly added to the log, which is undesirable.

Change the error message to be a warning, so they don't get
logged by default.

Signed-off-by: Alex Elder el...@inktank.com
---
 net/ceph/messenger.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 4b04ccc..4d111fd 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -2377,7 +2377,7 @@ fault:
 static void ceph_fault(struct ceph_connection *con)
__releases(con->mutex)
 {
-   pr_err("%s%lld %s %s\n", ENTITY_NAME(con->peer_name),
+   pr_warning("%s%lld %s %s\n", ENTITY_NAME(con->peer_name),
   ceph_pr_addr(&con->peer_addr.in_addr), con->error_msg);
dout("fault %p state %lu to peer %s\n",
 con, con->state, ceph_pr_addr(&con->peer_addr.in_addr));
-- 
1.7.9.5

--
To unsubscribe from this list: send the line unsubscribe ceph-devel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] libceph: report connection fault with warning

2012-12-14 Thread Sage Weil
Reviewed-by: Sage Weil s...@inktank.com

On Fri, 14 Dec 2012, Alex Elder wrote:

 When a connection's socket disconnects, or if there's a protocol
 error of some kind on the connection, a fault is signaled and
 the connection is reset (closed and reopened, basically).  We
 currently get an error message on the log whenever this occurs.
 
 A ceph connection will attempt to reestablish a socket connection
 repeatedly if a fault occurs.  This means that these error messages
 will get repeatedly added to the log, which is undesirable.
 
 Change the error message to be a warning, so they don't get
 logged by default.
 
 Signed-off-by: Alex Elder el...@inktank.com
 ---
  net/ceph/messenger.c |2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)
 
 diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
 index 4b04ccc..4d111fd 100644
 --- a/net/ceph/messenger.c
 +++ b/net/ceph/messenger.c
 @@ -2377,7 +2377,7 @@ fault:
  static void ceph_fault(struct ceph_connection *con)
   __releases(con->mutex)
  {
 - pr_err("%s%lld %s %s\n", ENTITY_NAME(con->peer_name),
 + pr_warning("%s%lld %s %s\n", ENTITY_NAME(con->peer_name),
  ceph_pr_addr(&con->peer_addr.in_addr), con->error_msg);
   dout("fault %p state %lu to peer %s\n",
con, con->state, ceph_pr_addr(&con->peer_addr.in_addr));
 -- 
 1.7.9.5
 
 --
 To unsubscribe from this list: send the line unsubscribe ceph-devel in
 the body of a message to majord...@vger.kernel.org
 More majordomo info at  http://vger.kernel.org/majordomo-info.html
 
 
--
To unsubscribe from this list: send the line unsubscribe ceph-devel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 6/9] rbd: remove linger unconditionally

2012-12-14 Thread Sage Weil
Reviewed-by: Sage Weil s...@inktank.com

On Thu, 13 Dec 2012, Alex Elder wrote:

 In __unregister_linger_request(), the request is being removed
 from the osd client's req_linger list only when the request
 has a non-null osd pointer.  It should be done whether or not
 the request currently has an osd.
 
 This is most likely a non-issue because I believe the request
 will always have an osd when this function is called.
 
 Signed-off-by: Alex Elder el...@inktank.com
 ---
  net/ceph/osd_client.c |2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)
 
 diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
 index 470816c..b15b475 100644
 --- a/net/ceph/osd_client.c
 +++ b/net/ceph/osd_client.c
 @@ -907,8 +907,8 @@ static void __unregister_linger_request(struct
 ceph_osd_client *osdc,
   struct ceph_osd_request *req)
  {
   dout("__unregister_linger_request %p\n", req);
 + list_del_init(&req->r_linger_item);
   if (req->r_osd) {
 - list_del_init(&req->r_linger_item);
   list_del_init(&req->r_linger_osd);
 
   if (list_empty(&req->r_osd->o_requests) &&
 -- 
 1.7.9.5
 
 --
 To unsubscribe from this list: send the line unsubscribe ceph-devel in
 the body of a message to majord...@vger.kernel.org
 More majordomo info at  http://vger.kernel.org/majordomo-info.html
 
 
--
To unsubscribe from this list: send the line unsubscribe ceph-devel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 9/9] libceph: socket can close in any connection state

2012-12-14 Thread Sage Weil
Reviewed-by: Sage Weil s...@inktank.com

On Thu, 13 Dec 2012, Alex Elder wrote:

 A connection's socket can close for any reason, independent of the
 state of the connection (and irrespective of the connection
 mutex).  As a result, the connection can be in pretty much any state
 at the time its socket is closed.
 
 Handle those other cases at the top of con_work().  Pull this whole
 block of code into a separate function to reduce the clutter.
 
 Signed-off-by: Alex Elder el...@inktank.com
 ---
  net/ceph/messenger.c |   47 ++-
  1 file changed, 30 insertions(+), 17 deletions(-)
 
 diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
 index 1041114..4b04ccc 100644
 --- a/net/ceph/messenger.c
 +++ b/net/ceph/messenger.c
 @@ -2273,6 +2273,35 @@ static void queue_con(struct ceph_connection *con)
   (void) queue_con_delay(con, 0);
  }
 
 +static bool con_sock_closed(struct ceph_connection *con)
 +{
 + if (!test_and_clear_bit(CON_FLAG_SOCK_CLOSED, &con->flags))
 + return false;
 +
 +#define CASE(x) \
 + case CON_STATE_ ## x: \
 + con->error_msg = "socket closed (con state " #x ")"; \
 + break;
 +
 + switch (con->state) {
 + CASE(CLOSED);
 + CASE(PREOPEN);
 + CASE(CONNECTING);
 + CASE(NEGOTIATING);
 + CASE(OPEN);
 + CASE(STANDBY);
 + default:
 + pr_warning("%s con %p unrecognized state %lu\n",
 + __func__, con, con->state);
 + con->error_msg = "unrecognized con state";
 + BUG();
 + break;
 + }
 +#undef CASE
 +
 + return true;
 +}
 +
  /*
   * Do some work on a connection.  Drop a connection ref when we're done.
   */
 @@ -2284,24 +2313,8 @@ static void con_work(struct work_struct *work)
 
   mutex_lock(con-mutex);
  restart:
 - if (test_and_clear_bit(CON_FLAG_SOCK_CLOSED, &con->flags)) {
 - switch (con->state) {
 - case CON_STATE_CONNECTING:
 - con->error_msg = "connection failed";
 - break;
 - case CON_STATE_NEGOTIATING:
 - con->error_msg = "negotiation failed";
 - break;
 - case CON_STATE_OPEN:
 - con->error_msg = "socket closed";
 - break;
 - default:
 - dout("unrecognized con state %d\n", (int)con->state);
 - con->error_msg = "unrecognized con state";
 - BUG();
 - }
 + if (con_sock_closed(con))
   goto fault;
 - }
 
   if (test_and_clear_bit(CON_FLAG_BACKOFF, &con->flags)) {
   dout("con_work %p backing off\n", con);
 -- 
 1.7.9.5
 
 --
 To unsubscribe from this list: send the line unsubscribe ceph-devel in
 the body of a message to majord...@vger.kernel.org
 More majordomo info at  http://vger.kernel.org/majordomo-info.html
 
 
--
To unsubscribe from this list: send the line unsubscribe ceph-devel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html