Hi,

while I agree that there is corruption in the IB driver (we have seen it
as well in certain cases), and that the problem seems to be related to the
memcache, I'm not sure I agree that the logic of the memcache is flawed.

[disclaimer: haven't looked at bmi_ib in a while, so not certain if what
follows below is actually what the cache code does]

If the cache is keeping track of which pages were already pinned, then it
shouldn't really matter if the user frees the memory and then reallocates
the same pages, as the thing being 'cached' is really the pinning of the
virtual->physical pages, not the contents.

This is assuming that 'malloc' and 'free' only mark the region as free,
not actually return the pages to the operating system.
(i.e. the pages are still part of the address space of the process).

However, for certain sizes, malloc will use mmap and munmap.
In that case, a free will actually unmap the pages from the address space,
possibly also removing the pinned mapping from virtual to physical.

In that case, if another malloc is done which also uses mmap, *and* that
mmap is done at the same virtual address, and the cache thinks the mapping
is still valid, then it might indeed be that corrupted data is being sent
as the pages being sent might not correspond with the pages described by
the virtual address given by the client.

   Dries

* Zhang, Jingwang <jingwang.zh...@emc.com> [2012-11-19 22:22:31]:

> Hi All,

> Here is our use case, and we are using the BMI code in OrangeFS 2.8.6:

> 1.       Malloc a buffer and write something to it.

> 2.       Send the buffer using BMI routines.

> 3.       Free the buffer and goto step 1

> And here we found the following problem:
> We found that the messages captured using ibdump is corrupt in iteration 2. 
> It became a mixture of data from iteration 1 and iteration 2.

> Here is some analysis we did:
> We noticed that when the data corruption occurs, the buffer always point to 
> the same virtual address. And after checking with the BMI code, we found that 
> the memcache code will keep the buffer registered(to ibverbs) and use virtual 
> address to determine whether a registered buffer could be reused or not later.

> However I think the memcache shouldn't keep the buffer registered, because 
> the user might free this buffer, and when the user did free and 
> re-allocate the buffer, there might be a false match which might lead to data 
> corruption.

> So at first, we tested the code with "define ENABLE_MEMCACHE 0" to disable 
> the memcache. And then the test passed, so it is proven that the data 
> corruption is caused by memcache. However, performance will be affected if 
> the memcache is disabled completely.

> Finally we formatted the attached patch to solve the problem. It fixes the 
> broken code in the clauses when memcache is disabled. And it deregisters the 
> buffer whenever its use-count drops to 0 and registers it when it is used 
> again.

> Please feel free to share your thoughts and comments. Thank you very much.

> Best Regards,
> Jingwang.

> [-- mutt.octet.filter file type: "unified diff output, ASCII text" --]

> [-- Statistics (lines words chars):  95 362 3774 
> /home/lts/tmp/fix_memcache_issue.patch --]

> diff --git a/src/io/bmi/bmi_ib/mem.c b/src/io/bmi/bmi_ib/mem.c
> index 1fc8159..8821c92 100644
> --- a/src/io/bmi/bmi_ib/mem.c
> +++ b/src/io/bmi/bmi_ib/mem.c
> @@ -144,6 +144,7 @@ memcache_memalloc(void *md, bmi_size_t len, int 
> eager_limit)
>               qlist_del(&c->list);
>               qlist_add_tail(&c->list, &memcache_device->list);
>               ++c->count;
> +             if (c->count == 1) memcache_device->mem_register(c);
>               buf = c->buf;
>               gen_mutex_unlock(&memcache_device->mutex);
>               goto out;
> @@ -166,6 +167,7 @@ memcache_memalloc(void *md, bmi_size_t len, int 
> eager_limit)
>       c = memcache_lookup_cover(memcache_device, buf, len);
>       if (c) {
>           ++c->count;
> +         if (c->count == 1) memcache_device->mem_register(c);
>           debug(4, "%s: reuse reg, buf %p, count %d", __func__, c->buf,
>                 c->count);
>       } else {
> @@ -207,6 +209,7 @@ memcache_memfree(void *md, void *buf, bmi_size_t len)
>                     __func__, c->buf, lld(c->len), c->count);
>       /* cache it */
>       --c->count;
> +     if (c->count == 0) memcache_device->mem_deregister(c);
>       qlist_del(&c->list);
>       qlist_add(&c->list, &memcache_device->free_chunk_list);
>       gen_mutex_unlock(&memcache_device->mutex);
> @@ -238,6 +241,7 @@ memcache_register(void *md, ib_buflist_t *buflist)
>                                 buflist->len[i]);
>       if (c) {
>           ++c->count;
> +         if (c->count == 1) memcache_device->mem_register(c);
>           debug(2, "%s: hit [%d] %p len %lld (via %p len %lld) refcnt now %d",
>             __func__, i, buflist->buf.send[i], lld(buflist->len[i]), c->buf,
>             lld(c->len), c->count);
> @@ -257,10 +261,9 @@ memcache_register(void *md, ib_buflist_t *buflist)
>       }
>       buflist->memcache[i] = c;
>  #else
> -     memcache_entry_t cp = bmi_ib_malloc(sizeof(*cp));
> +     memcache_entry_t *cp = bmi_ib_malloc(sizeof(*cp));
>       cp->buf = buflist->buf.recv[i];
>       cp->len = buflist->len[i];
> -     cp->type = type;
>       ret = memcache_device->mem_register(cp);
>       if (ret) {
>           free(cp);
> @@ -284,6 +287,7 @@ void memcache_preregister(void *md, const void *buf, 
> bmi_size_t len,
>      memcache_device_t *memcache_device = md;
>      memcache_entry_t *c;

> +    return ; // Can not do this any more.
>      gen_mutex_lock(&memcache_device->mutex);
>      c = memcache_lookup_cover(memcache_device, buf, len);
>      if (c) {
> @@ -316,6 +320,7 @@ memcache_deregister(void *md, ib_buflist_t *buflist)
>  #if ENABLE_MEMCACHE
>       memcache_entry_t *c = buflist->memcache[i];
>       --c->count;
> +     if (c->count == 0) memcache_device->mem_deregister(c);
>       debug(2,
>          "%s: dec refcount [%d] %p len %lld (via %p len %lld) refcnt now %d",
>          __func__, i, buflist->buf.send[i], lld(buflist->len[i]),
> @@ -357,12 +362,12 @@ void memcache_shutdown(void *md)

>      gen_mutex_lock(&memcache_device->mutex);
>      qlist_for_each_entry_safe(c, cn, &memcache_device->list, list) {
> -     memcache_device->mem_deregister(c);
> +        if (c->count > 0) memcache_device->mem_deregister(c);
>       qlist_del(&c->list);
>       free(c);
>      }
>      qlist_for_each_entry_safe(c, cn, &memcache_device->free_chunk_list, 
> list) {
> -     memcache_device->mem_deregister(c);
> +        if (c->count > 0) memcache_device->mem_deregister(c);
>       qlist_del(&c->list);
>       free(c->buf);
>       free(c);
> @@ -384,7 +389,6 @@ void memcache_cache_flush(void *md)
>      qlist_for_each_entry_safe(c, cn, &memcache_device->list, list) {
>          debug(4, "%s: list c->count %x c->buf %p", __func__, c->count, 
> c->buf);
>          if (c->count == 0) {
> -            memcache_device->mem_deregister(c);
>              qlist_del(&c->list);
>              free(c);
>          }
> @@ -393,7 +397,6 @@ void memcache_cache_flush(void *md)
>          debug(4, "%s: free list c->count %x c->buf %p", __func__,
>             c->count, c->buf);
>          if (c->count == 0) {
> -            memcache_device->mem_deregister(c);
>              qlist_del(&c->list);
>              free(c->buf);
>              free(c);

> _______________________________________________
> Pvfs2-developers mailing list
> Pvfs2-developers@beowulf-underground.org
> http://www.beowulf-underground.org/mailman/listinfo/pvfs2-developers

Attachment: pgpuDD7TbeFG0.pgp
Description: PGP signature

_______________________________________________
Pvfs2-developers mailing list
Pvfs2-developers@beowulf-underground.org
http://www.beowulf-underground.org/mailman/listinfo/pvfs2-developers

Reply via email to