Re: librbd discard bug problems -> i got it

2012-11-20 Thread Stefan Priebe - Profihost AG

Hi,

here is the patch. In the log is nothing interesting.

Stefan
Am 20.11.2012 01:28, schrieb Josh Durgin:

On 11/19/2012 04:00 PM, Stefan Priebe wrote:

Hi Josh,

i don't get it. Every debug line i print is a positive fine value. But
rbd_aio_bh_cb gets called with these values. As you can see, those are
not many values; i copied all values < 0 from the log for discarding a whole
30GB device.


Could you post the patch of the debug prints you added and the log?
diff --git a/src/librbd/AioCompletion.cc b/src/librbd/AioCompletion.cc
index 082a08e..021da03 100644
--- a/src/librbd/AioCompletion.cc
+++ b/src/librbd/AioCompletion.cc
@@ -19,7 +19,7 @@ namespace librbd {
 
   void AioCompletion::finish_adding_requests(CephContext *cct)
   {
-ldout(cct, 20) << "AioCompletion::finish_adding_requests " << (void*)this << " pending " << pending_count << dendl;
+ldout(cct, 20) << "AioCompletion::finish_adding_requests " << (void*)this << " pending " << pending_count << " rval: " << rval << dendl;
 lock.Lock();
 assert(building);
 building = false;
@@ -58,14 +58,20 @@ namespace librbd {
   {
 ldout(cct, 20) << "AioCompletion::complete_request() "
 		   << (void *)this << " complete_cb=" << (void *)complete_cb
-		   << " pending " << pending_count << dendl;
+		   << " pending " << pending_count << " r: " << r << " rval: " << rval << dendl;
 lock.Lock();
 if (rval >= 0) {
   if (r < 0 && r != -EEXIST)
 	rval = r;
-  else if (r > 0)
+  else if (r > 0) {
 	rval += r;
+  }
 }
+
+ldout(cct, 20) << "AioCompletion::complete_request()22 "
+		   << (void *)this << " complete_cb=" << (void *)complete_cb
+		   << " pending " << pending_count << " r: " << r << " rval: " << rval << dendl;
+
 assert(pending_count);
 int count = --pending_count;
 if (!count && !building) {
@@ -75,6 +81,12 @@ namespace librbd {
 put_unlock();
   }
 
+  void C_AioWrite::finish(int r)
+  {
+  ldout(m_cct, 10) << "C_AioWrite::finish() " << this << " r = " << r << dendl;
+  m_completion->complete_request(m_cct, r);
+  }
+
   void C_AioRead::finish(int r)
   {
 ldout(m_cct, 10) << "C_AioRead::finish() " << this << " r = " << r << dendl;
@@ -94,11 +106,13 @@ namespace librbd {
 			m_req->m_buffer_extents);
   r = m_req->m_object_len;
 }
+ldout(m_cct, 10) << "C_AioRead::finish22() " << this << " r = " << r << dendl;
 m_completion->complete_request(m_cct, r);
   }
 
   void C_CacheRead::finish(int r)
   {
+fprintf(stdout, "C_CacheRead::finish r: %d\n", r);
 m_req->complete(r);
   }
 }
diff --git a/src/librbd/AioCompletion.h b/src/librbd/AioCompletion.h
index 7d3f336..d174140 100644
--- a/src/librbd/AioCompletion.h
+++ b/src/librbd/AioCompletion.h
@@ -178,9 +178,7 @@ namespace librbd {
 C_AioWrite(CephContext *cct, AioCompletion *completion)
   : m_cct(cct), m_completion(completion) {}
 virtual ~C_AioWrite() {}
-virtual void finish(int r) {
-  m_completion->complete_request(m_cct, r);
-}
+virtual void finish(int r);
   private:
 CephContext *m_cct;
 AioCompletion *m_completion;
diff --git a/src/librbd/AioRequest.cc b/src/librbd/AioRequest.cc
index 63424e3..7460799 100644
--- a/src/librbd/AioRequest.cc
+++ b/src/librbd/AioRequest.cc
@@ -47,6 +47,18 @@ namespace librbd {
 }
   }
 
+   void AioRequest::complete(int r)
+{
+ldout(m_ictx->cct, 20) << "AioRequest::complete " << this << " r = " << r << dendl;
+  if (should_complete(r)) {
+if (m_hide_enoent && r == -ENOENT)
+  r = 0;
+ldout(m_ictx->cct, 20) << "AioRequest::complete22 " << this << " r = " << r << dendl;
+m_completion->complete(r);
+delete this;
+  }
+}
+
  void AioRequest::read_from_parent(vector<pair<uint64_t,uint64_t> >& image_extents)
   {
 assert(!m_parent_completion);
@@ -82,6 +94,11 @@ namespace librbd {
   if (r < 0) {
 	assert(0 == "FIXME");
   }
+
+ldout(m_ictx->cct, 20) << "should_complete22 " << this << " " << m_oid << " " << m_object_off << "~" << m_object_len
+			   << " r = " << r << dendl;
+
+
   uint64_t object_overlap = m_ictx->prune_parent_extents(image_extents, image_overlap);
   if (object_overlap) {
 	m_tried_parent = true;
diff --git a/src/librbd/AioRequest.h b/src/librbd/AioRequest.h
index 259af88..a33038c 100644
--- a/src/librbd/AioRequest.h
+++ b/src/librbd/AioRequest.h
@@ -32,23 +32,14 @@ namespace librbd {
 	   bool hide_enoent);
 virtual ~AioRequest();
 
-void complete(int r)
-{
-  if (should_complete(r)) {
-	if (m_hide_enoent && r == -ENOENT)
-	  r = 0;
-	m_completion->complete(r);
-	delete this;
-  }
-}
-
+virtual void complete(int r);
 virtual bool should_complete(int r) = 0;
 virtual int send() = 0;
+ImageCtx *m_ictx;
 
   protected:
void read_from_parent(vector<pair<uint64_t,uint64_t> >& image_extents);
 
-ImageCtx *m_ictx;
 librados::IoCtx m_ioctx;
 std::string m_oid;
 uint64_t m_object_no, m_object_off, m_object_len;
diff --

Re: librbd discard bug problems -> i got it

2012-11-19 Thread Josh Durgin

On 11/19/2012 04:00 PM, Stefan Priebe wrote:

Hi Josh,

i don't get it. Every debug line i print is a positive fine value. But
rbd_aio_bh_cb gets called with these values. As you can see, those are
not many values; i copied all values < 0 from the log for discarding a whole
30GB device.


Could you post the patch of the debug prints you added and the log?
--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html



Re: librbd discard bug problems -> i got it

2012-11-19 Thread Stefan Priebe

Hi Josh,

i don't get it. Every debug line i print is a positive fine value. But
rbd_aio_bh_cb gets called with these values. As you can see, those are
not many values; i copied all values < 0 from the log for discarding a whole
30GB device.


Stefan

Am 20.11.2012 00:47, schrieb Josh Durgin:

On 11/19/2012 03:42 PM, Stefan Priebe wrote:

Am 20.11.2012 00:33, schrieb Josh Durgin:

On 11/19/2012 03:16 PM, Stefan Priebe wrote:

mhm qemu rbd block driver. It always gets these errors back. As
rbd_aio_bh_cb is directly called from librbd the problem must be there.
Strangely i can't find where rbd_aio_bh_cb gets called with -512.

Any further ideas?


Two ideas:

1) Is rbd_finish_aiocb getting this same return value?

Will check this tomorrow.



2) Perhaps it's an issue with the return value wrapping around with
very large discards. Adding some logging of the return values of each
rados operation in AioCompletion::complete_request() might give us a
clue. These large negative return values are suspicious.


Good idea. As r and rval are int, they are limited. But
AioCompletion::complete_request is adding more and more stuff to rval.
What could be a solution? Bump rval to int64? Or wrap around to start
at 0 again?


The final return value is limited to int at a few levels. Probably it's
best to make discard always return 0 on success. aio_discard should
already be doing this, but perhaps it's not in this case.

Josh
--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: librbd discard bug problems -> i got it

2012-11-19 Thread Josh Durgin

On 11/19/2012 03:42 PM, Stefan Priebe wrote:

Am 20.11.2012 00:33, schrieb Josh Durgin:

On 11/19/2012 03:16 PM, Stefan Priebe wrote:

mhm qemu rbd block driver. It always gets these errors back. As
rbd_aio_bh_cb is directly called from librbd the problem must be there.
Strangely i can't find where rbd_aio_bh_cb gets called with -512.

Any further ideas?


Two ideas:

1) Is rbd_finish_aiocb getting this same return value?

Will check this tomorrow.



2) Perhaps it's an issue with the return value wrapping around with
very large discards. Adding some logging of the return values of each
rados operation in AioCompletion::complete_request() might give us a
clue. These large negative return values are suspicious.


Good idea. As r and rval are int, they are limited. But
AioCompletion::complete_request is adding more and more stuff to rval.
What could be a solution? Bump rval to int64? Or wrap around to start
at 0 again?


The final return value is limited to int at a few levels. Probably it's
best to make discard always return 0 on success. aio_discard should
already be doing this, but perhaps it's not in this case.

Josh
--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: librbd discard bug problems -> i got it

2012-11-19 Thread Stefan Priebe

Am 20.11.2012 00:33, schrieb Josh Durgin:

On 11/19/2012 03:16 PM, Stefan Priebe wrote:

mhm qemu rbd block driver. It always gets these errors back. As
rbd_aio_bh_cb is directly called from librbd the problem must be there.
Strangely i can't find where rbd_aio_bh_cb gets called with -512.

Any further ideas?


Two ideas:

1) Is rbd_finish_aiocb getting this same return value?

Will check this tomorrow.



2) Perhaps it's an issue with the return value wrapping around with
very large discards. Adding some logging of the return values of each
rados operation in AioCompletion::complete_request() might give us a
clue. These large negative return values are suspicious.


Good idea. As r and rval are int, they are limited. But
AioCompletion::complete_request is adding more and more stuff to rval.
What could be a solution? Bump rval to int64? Or wrap around to start
at 0 again?


Stefan
--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: librbd discard bug problems -> i got it

2012-11-19 Thread Josh Durgin

On 11/19/2012 03:16 PM, Stefan Priebe wrote:

mhm qemu rbd block driver. It always gets these errors back. As
rbd_aio_bh_cb is directly called from librbd the problem must be there.
Strangely i can't find where rbd_aio_bh_cb gets called with -512.

Any further ideas?


Two ideas:

1) Is rbd_finish_aiocb getting this same return value?

2) Perhaps it's an issue with the return value wrapping around with
very large discards. Adding some logging of the return values of each
rados operation in AioCompletion::complete_request() might give us a
clue. These large negative return values are suspicious.


rbd_aio_bh_cb got error back. Code: -512 Error: 0
rbd_aio_bh_cb got error back. Code: -512 Error: 0
rbd_aio_bh_cb got error back. Code: -512 Error: 0
rbd_aio_bh_cb got error back. Code: -512 Error: 0
rbd_aio_bh_cb got error back. Code: -512 Error: 0
rbd_aio_bh_cb got error back. Code: -512 Error: 0
rbd_aio_bh_cb got error back. Code: -512 Error: 0
rbd_aio_bh_cb got error back. Code: -512 Error: 0
rbd_aio_bh_cb got error back. Code: -512 Error: 0
rbd_aio_bh_cb got error back. Code: -1006628352 Error: 0

Stefan


--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: librbd discard bug problems -> i got it

2012-11-19 Thread Stefan Priebe
mhm qemu rbd block driver. It always gets these errors back. As
rbd_aio_bh_cb is directly called from librbd the problem must be there.
Strangely i can't find where rbd_aio_bh_cb gets called with -512.


Any further ideas?

rbd_aio_bh_cb got error back. Code: -512 Error: 0
rbd_aio_bh_cb got error back. Code: -512 Error: 0
rbd_aio_bh_cb got error back. Code: -512 Error: 0
rbd_aio_bh_cb got error back. Code: -512 Error: 0
rbd_aio_bh_cb got error back. Code: -512 Error: 0
rbd_aio_bh_cb got error back. Code: -512 Error: 0
rbd_aio_bh_cb got error back. Code: -512 Error: 0
rbd_aio_bh_cb got error back. Code: -512 Error: 0
rbd_aio_bh_cb got error back. Code: -512 Error: 0
rbd_aio_bh_cb got error back. Code: -1006628352 Error: 0

Stefan
--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: librbd discard bug problems -> i got it

2012-11-19 Thread Josh Durgin

On 11/19/2012 02:45 PM, Stefan Priebe wrote:

Hi Josh,

how can i log in AioRequest.h?

I need to add some debugging into:
 void complete(int r)
 {
   if (should_complete(r)) {
 if (m_hide_enoent && r == -ENOENT)
   r = 0;
 m_completion->complete(r);
 delete this;
   }
 }


Move the implementation of the method out of the header and into
AioRequest.cc. Then you can use ldout like in the rest of that file.

Josh


Am 19.11.2012 23:38, schrieb Josh Durgin:

On 11/19/2012 01:19 PM, Stefan Priebe wrote:

Hello Josh,

after digging three days around i got it.


That's great!


The problem is in aio_discard in internal.cc. The i/o fails when AioZero
or AioTruncate is used.


Interesting, I wouldn't have expected the bug to be there. Is
it failing with a different error code, or just taking too long
so the guest kernel tries to cancel it?


It works fine with AioRemove. It seems to depend on overlapping.
Hopefully i'm able to provide a patch tonight.


Looking forward to it!

Josh

--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: librbd discard bug problems -> i got it

2012-11-19 Thread Stefan Priebe

Hi Josh,

how can i log in AioRequest.h?

I need to add some debugging into:
void complete(int r)
{
  if (should_complete(r)) {
if (m_hide_enoent && r == -ENOENT)
  r = 0;
m_completion->complete(r);
delete this;
  }
}

Stefan

Am 19.11.2012 23:38, schrieb Josh Durgin:

On 11/19/2012 01:19 PM, Stefan Priebe wrote:

Hello Josh,

after digging three days around i got it.


That's great!


The problem is in aio_discard in internal.cc. The i/o fails when AioZero
or AioTruncate is used.


Interesting, I wouldn't have expected the bug to be there. Is
it failing with a different error code, or just taking too long
so the guest kernel tries to cancel it?


It works fine with AioRemove. It seems to depend on overlapping.
Hopefully i'm able to provide a patch tonight.


Looking forward to it!

Josh

--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: librbd discard bug problems -> i got it

2012-11-19 Thread Josh Durgin

On 11/19/2012 01:19 PM, Stefan Priebe wrote:

Hello Josh,

after digging three days around i got it.


That's great!


The problem is in aio_discard in internal.cc. The i/o fails when AioZero
or AioTruncate is used.


Interesting, I wouldn't have expected the bug to be there. Is
it failing with a different error code, or just taking too long
so the guest kernel tries to cancel it?


It works fine with AioRemove. It seems to depend on overlapping.
Hopefully i'm able to provide a patch tonight.


Looking forward to it!

Josh
--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


librbd discard bug problems -> i got it

2012-11-19 Thread Stefan Priebe

Hello Josh,

after digging three days around i got it.

The problem is in aio_discard in internal.cc. The i/o fails when AioZero 
or AioTruncate is used.


It works fine with AioRemove. It seems to depend on overlapping. 
Hopefully i'm able to provide a patch tonight.


Greets,
Stefan
--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html