Hello Chulmin,

On Mon, Apr 04, 2016 at 05:04:16PM +0900, Chulmin Kim wrote:
> On 2016년 03월 30일 16:12, Minchan Kim wrote:
> >Currently, we rely on class->lock to prevent zspage destruction.
> >It was okay until now because the critical section is short but
> >with run-time migration, it could be long so class->lock is not
> >a good apporach any more.
> >
> >So, this patch introduces [un]freeze_zspage functions which
> >freeze allocated objects in the zspage with pinning tag so
> >user cannot free using object. With those functions, this patch
> >redesign compaction.
> >
> >Those functions will be used for implementing zspage runtime
> >migrations, too.
> >
> >Signed-off-by: Minchan Kim <minc...@kernel.org>
> >---
> >  mm/zsmalloc.c | 393 
> > ++++++++++++++++++++++++++++++++++++++--------------------
> >  1 file changed, 257 insertions(+), 136 deletions(-)
> >
> >diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
> >index b11dcd718502..ac8ca7b10720 100644
> >--- a/mm/zsmalloc.c
> >+++ b/mm/zsmalloc.c
> >@@ -922,6 +922,13 @@ static unsigned long obj_to_head(struct size_class 
> >*class, struct page *page,
> >             return *(unsigned long *)obj;
> >  }
> >
> >+static inline int testpin_tag(unsigned long handle)
> >+{
> >+    unsigned long *ptr = (unsigned long *)handle;
> >+
> >+    return test_bit(HANDLE_PIN_BIT, ptr);
> >+}
> >+
> >  static inline int trypin_tag(unsigned long handle)
> >  {
> >     unsigned long *ptr = (unsigned long *)handle;
> >@@ -949,8 +956,7 @@ static void reset_page(struct page *page)
> >     page->freelist = NULL;
> >  }
> >
> >-static void free_zspage(struct zs_pool *pool, struct size_class *class,
> >-                    struct page *first_page)
> >+static void free_zspage(struct zs_pool *pool, struct page *first_page)
> >  {
> >     struct page *nextp, *tmp, *head_extra;
> >
> >@@ -973,11 +979,6 @@ static void free_zspage(struct zs_pool *pool, struct 
> >size_class *class,
> >     }
> >     reset_page(head_extra);
> >     __free_page(head_extra);
> >-
> >-    zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
> >-                    class->size, class->pages_per_zspage));
> >-    atomic_long_sub(class->pages_per_zspage,
> >-                            &pool->pages_allocated);
> >  }
> >
> >  /* Initialize a newly allocated zspage */
> >@@ -1325,6 +1326,11 @@ static bool zspage_full(struct size_class *class, 
> >struct page *first_page)
> >     return get_zspage_inuse(first_page) == class->objs_per_zspage;
> >  }
> >
> >+static bool zspage_empty(struct size_class *class, struct page *first_page)
> >+{
> >+    return get_zspage_inuse(first_page) == 0;
> >+}
> >+
> >  unsigned long zs_get_total_pages(struct zs_pool *pool)
> >  {
> >     return atomic_long_read(&pool->pages_allocated);
> >@@ -1455,7 +1461,6 @@ static unsigned long obj_malloc(struct size_class 
> >*class,
> >             set_page_private(first_page, handle | OBJ_ALLOCATED_TAG);
> >     kunmap_atomic(vaddr);
> >     mod_zspage_inuse(first_page, 1);
> >-    zs_stat_inc(class, OBJ_USED, 1);
> >
> >     obj = location_to_obj(m_page, obj);
> >
> >@@ -1510,6 +1515,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t 
> >size)
> >     }
> >
> >     obj = obj_malloc(class, first_page, handle);
> >+    zs_stat_inc(class, OBJ_USED, 1);
> >     /* Now move the zspage to another fullness group, if required */
> >     fix_fullness_group(class, first_page);
> >     record_obj(handle, obj);
> >@@ -1540,7 +1546,6 @@ static void obj_free(struct size_class *class, 
> >unsigned long obj)
> >     kunmap_atomic(vaddr);
> >     set_freeobj(first_page, f_objidx);
> >     mod_zspage_inuse(first_page, -1);
> >-    zs_stat_dec(class, OBJ_USED, 1);
> >  }
> >
> >  void zs_free(struct zs_pool *pool, unsigned long handle)
> >@@ -1564,10 +1569,19 @@ void zs_free(struct zs_pool *pool, unsigned long 
> >handle)
> >
> >     spin_lock(&class->lock);
> >     obj_free(class, obj);
> >+    zs_stat_dec(class, OBJ_USED, 1);
> >     fullness = fix_fullness_group(class, first_page);
> >-    if (fullness == ZS_EMPTY)
> >-            free_zspage(pool, class, first_page);
> >+    if (fullness == ZS_EMPTY) {
> >+            zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
> >+                            class->size, class->pages_per_zspage));
> >+            spin_unlock(&class->lock);
> >+            atomic_long_sub(class->pages_per_zspage,
> >+                                    &pool->pages_allocated);
> >+            free_zspage(pool, first_page);
> >+            goto out;
> >+    }
> >     spin_unlock(&class->lock);
> >+out:
> >     unpin_tag(handle);
> >
> >     free_handle(pool, handle);
> >@@ -1637,127 +1651,66 @@ static void zs_object_copy(struct size_class 
> >*class, unsigned long dst,
> >     kunmap_atomic(s_addr);
> >  }
> >
> >-/*
> >- * Find alloced object in zspage from index object and
> >- * return handle.
> >- */
> >-static unsigned long find_alloced_obj(struct size_class *class,
> >-                                    struct page *page, int index)
> >+static unsigned long handle_from_obj(struct size_class *class,
> >+                            struct page *first_page, int obj_idx)
> >  {
> >-    unsigned long head;
> >-    int offset = 0;
> >-    unsigned long handle = 0;
> >-    void *addr = kmap_atomic(page);
> >-
> >-    if (!is_first_page(page))
> >-            offset = page->index;
> >-    offset += class->size * index;
> >-
> >-    while (offset < PAGE_SIZE) {
> >-            head = obj_to_head(class, page, addr + offset);
> >-            if (head & OBJ_ALLOCATED_TAG) {
> >-                    handle = head & ~OBJ_ALLOCATED_TAG;
> >-                    if (trypin_tag(handle))
> >-                            break;
> >-                    handle = 0;
> >-            }
> >+    struct page *page;
> >+    unsigned long offset_in_page;
> >+    void *addr;
> >+    unsigned long head, handle = 0;
> >
> >-            offset += class->size;
> >-            index++;
> >-    }
> >+    objidx_to_page_and_offset(class, first_page, obj_idx,
> >+                    &page, &offset_in_page);
> >
> >+    addr = kmap_atomic(page);
> >+    head = obj_to_head(class, page, addr + offset_in_page);
> >+    if (head & OBJ_ALLOCATED_TAG)
> >+            handle = head & ~OBJ_ALLOCATED_TAG;
> >     kunmap_atomic(addr);
> >+
> >     return handle;
> >  }
> >
> >-struct zs_compact_control {
> >-    /* Source page for migration which could be a subpage of zspage. */
> >-    struct page *s_page;
> >-    /* Destination page for migration which should be a first page
> >-     * of zspage. */
> >-    struct page *d_page;
> >-     /* Starting object index within @s_page which used for live object
> >-      * in the subpage. */
> >-    int index;
> >-};
> >-
> >-static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
> >-                            struct zs_compact_control *cc)
> >+static int migrate_zspage(struct size_class *class, struct page *dst_page,
> >+                            struct page *src_page)
> >  {
> >-    unsigned long used_obj, free_obj;
> >     unsigned long handle;
> >-    struct page *s_page = cc->s_page;
> >-    struct page *d_page = cc->d_page;
> >-    unsigned long index = cc->index;
> >-    int ret = 0;
> >+    unsigned long old_obj, new_obj;
> >+    int i;
> >+    int nr_migrated = 0;
> >
> >-    while (1) {
> >-            handle = find_alloced_obj(class, s_page, index);
> >-            if (!handle) {
> >-                    s_page = get_next_page(s_page);
> >-                    if (!s_page)
> >-                            break;
> >-                    index = 0;
> >+    for (i = 0; i < class->objs_per_zspage; i++) {
> >+            handle = handle_from_obj(class, src_page, i);
> >+            if (!handle)
> >                     continue;
> >-            }
> >-
> >-            /* Stop if there is no more space */
> >-            if (zspage_full(class, d_page)) {
> >-                    unpin_tag(handle);
> >-                    ret = -ENOMEM;
> >+            if (zspage_full(class, dst_page))
> >                     break;
> >-            }
> >-
> >-            used_obj = handle_to_obj(handle);
> >-            free_obj = obj_malloc(class, d_page, handle);
> >-            zs_object_copy(class, free_obj, used_obj);
> >-            index++;
> >+            old_obj = handle_to_obj(handle);
> >+            new_obj = obj_malloc(class, dst_page, handle);
> >+            zs_object_copy(class, new_obj, old_obj);
> >+            nr_migrated++;
> >             /*
> >              * record_obj updates handle's value to free_obj and it will
> >              * invalidate lock bit(ie, HANDLE_PIN_BIT) of handle, which
> >              * breaks synchronization using pin_tag(e,g, zs_free) so
> >              * let's keep the lock bit.
> >              */
> >-            free_obj |= BIT(HANDLE_PIN_BIT);
> >-            record_obj(handle, free_obj);
> >-            unpin_tag(handle);
> >-            obj_free(class, used_obj);
> >+            new_obj |= BIT(HANDLE_PIN_BIT);
> >+            record_obj(handle, new_obj);
> >+            obj_free(class, old_obj);
> >     }
> >-
> >-    /* Remember last position in this iteration */
> >-    cc->s_page = s_page;
> >-    cc->index = index;
> >-
> >-    return ret;
> >-}
> >-
> >-static struct page *isolate_target_page(struct size_class *class)
> >-{
> >-    int i;
> >-    struct page *page;
> >-
> >-    for (i = 0; i < _ZS_NR_FULLNESS_GROUPS; i++) {
> >-            page = class->fullness_list[i];
> >-            if (page) {
> >-                    remove_zspage(class, i, page);
> >-                    break;
> >-            }
> >-    }
> >-
> >-    return page;
> >+    return nr_migrated;
> >  }
> >
> >  /*
> >   * putback_zspage - add @first_page into right class's fullness list
> >- * @pool: target pool
> >   * @class: destination class
> >   * @first_page: target page
> >   *
> >   * Return @first_page's updated fullness_group
> >   */
> >-static enum fullness_group putback_zspage(struct zs_pool *pool,
> >-                    struct size_class *class,
> >-                    struct page *first_page)
> >+static enum fullness_group putback_zspage(struct size_class *class,
> >+                                    struct page *first_page)
> >  {
> >     enum fullness_group fullness;
> >
> >@@ -1768,17 +1721,155 @@ static enum fullness_group putback_zspage(struct 
> >zs_pool *pool,
> >     return fullness;
> >  }
> >
> >+/*
> >+ * freeze_zspage - freeze all objects in a zspage
> >+ * @class: size class of the page
> >+ * @first_page: first page of zspage
> >+ *
> >+ * Freeze all allocated objects in a zspage so objects couldn't be
> >+ * freed until unfreeze objects. It should be called under class->lock.
> >+ *
> >+ * RETURNS:
> >+ * the number of pinned objects
> >+ */
> >+static int freeze_zspage(struct size_class *class, struct page *first_page)
> >+{
> >+    unsigned long obj_idx;
> >+    struct page *obj_page;
> >+    unsigned long offset;
> >+    void *addr;
> >+    int nr_freeze = 0;
> >+
> >+    for (obj_idx = 0; obj_idx < class->objs_per_zspage; obj_idx++) {
> >+            unsigned long head;
> >+
> >+            objidx_to_page_and_offset(class, first_page, obj_idx,
> >+                                    &obj_page, &offset);
> >+            addr = kmap_atomic(obj_page);
> >+            head = obj_to_head(class, obj_page, addr + offset);
> >+            if (head & OBJ_ALLOCATED_TAG) {
> >+                    unsigned long handle = head & ~OBJ_ALLOCATED_TAG;
> >+
> >+                    if (!trypin_tag(handle)) {
> >+                            kunmap_atomic(addr);
> >+                            break;
> >+                    }
> >+                    nr_freeze++;
> >+            }
> >+            kunmap_atomic(addr);
> >+    }
> >+
> >+    return nr_freeze;
> >+}
> >+
> >+/*
> >+ * unfreeze_page - unfreeze objects freezed by freeze_zspage in a zspage
> >+ * @class: size class of the page
> >+ * @first_page: freezed zspage to unfreeze
> >+ * @nr_obj: the number of objects to unfreeze
> >+ *
> >+ * unfreeze objects in a zspage.
> >+ */
> >+static void unfreeze_zspage(struct size_class *class, struct page 
> >*first_page,
> >+                    int nr_obj)
> >+{
> >+    unsigned long obj_idx;
> >+    struct page *obj_page;
> >+    unsigned long offset;
> >+    void *addr;
> >+    int nr_unfreeze = 0;
> >+
> >+    for (obj_idx = 0; obj_idx < class->objs_per_zspage &&
> >+                    nr_unfreeze < nr_obj; obj_idx++) {
> >+            unsigned long head;
> >+
> >+            objidx_to_page_and_offset(class, first_page, obj_idx,
> >+                                    &obj_page, &offset);
> >+            addr = kmap_atomic(obj_page);
> >+            head = obj_to_head(class, obj_page, addr + offset);
> >+            if (head & OBJ_ALLOCATED_TAG) {
> >+                    unsigned long handle = head & ~OBJ_ALLOCATED_TAG;
> >+
> >+                    VM_BUG_ON(!testpin_tag(handle));
> >+                    unpin_tag(handle);
> >+                    nr_unfreeze++;
> >+            }
> >+            kunmap_atomic(addr);
> >+    }
> >+}
> >+
> >+/*
> >+ * isolate_source_page - isolate a zspage for migration source
> >+ * @class: size class of zspage for isolation
> >+ *
> >+ * Returns a zspage which are isolated from list so anyone can
> >+ * allocate a object from that page. As well, freeze all objects
> >+ * allocated in the zspage so anyone cannot access that objects
> >+ * (e.g., zs_map_object, zs_free).
> >+ */
> >  static struct page *isolate_source_page(struct size_class *class)
> >  {
> >     int i;
> >     struct page *page = NULL;
> >
> >     for (i = ZS_ALMOST_EMPTY; i >= ZS_ALMOST_FULL; i--) {
> >+            int inuse, freezed;
> >+
> >             page = class->fullness_list[i];
> >             if (!page)
> >                     continue;
> >
> >             remove_zspage(class, i, page);
> >+
> >+            inuse = get_zspage_inuse(page);
> >+            freezed = freeze_zspage(class, page);
> >+
> >+            if (inuse != freezed) {
> >+                    unfreeze_zspage(class, page, freezed);
> >+                    putback_zspage(class, page);
> >+                    page = NULL;
> >+                    continue;
> >+            }
> >+
> >+            break;
> >+    }
> >+
> >+    return page;
> >+}
> >+
> >+/*
> >+ * isolate_target_page - isolate a zspage for migration target
> >+ * @class: size class of zspage for isolation
> >+ *
> >+ * Returns a zspage which are isolated from list so anyone can
> >+ * allocate a object from that page. As well, freeze all objects
> >+ * allocated in the zspage so anyone cannot access that objects
> >+ * (e.g., zs_map_object, zs_free).
> >+ */
> >+static struct page *isolate_target_page(struct size_class *class)
> >+{
> >+    int i;
> >+    struct page *page;
> >+
> >+    for (i = 0; i < _ZS_NR_FULLNESS_GROUPS; i++) {
> >+            int inuse, freezed;
> >+
> >+            page = class->fullness_list[i];
> >+            if (!page)
> >+                    continue;
> >+
> >+            remove_zspage(class, i, page);
> >+
> >+            inuse = get_zspage_inuse(page);
> >+            freezed = freeze_zspage(class, page);
> >+
> >+            if (inuse != freezed) {
> >+                    unfreeze_zspage(class, page, freezed);
> >+                    putback_zspage(class, page);
> >+                    page = NULL;
> >+                    continue;
> >+            }
> >+
> >             break;
> >     }
> >
> >@@ -1793,9 +1884,11 @@ static struct page *isolate_source_page(struct 
> >size_class *class)
> >  static unsigned long zs_can_compact(struct size_class *class)
> >  {
> >     unsigned long obj_wasted;
> >+    unsigned long obj_allocated, obj_used;
> >
> >-    obj_wasted = zs_stat_get(class, OBJ_ALLOCATED) -
> >-            zs_stat_get(class, OBJ_USED);
> >+    obj_allocated = zs_stat_get(class, OBJ_ALLOCATED);
> >+    obj_used = zs_stat_get(class, OBJ_USED);
> >+    obj_wasted = obj_allocated - obj_used;
> >
> >     obj_wasted /= get_maxobj_per_zspage(class->size,
> >                     class->pages_per_zspage);
> >@@ -1805,53 +1898,81 @@ static unsigned long zs_can_compact(struct 
> >size_class *class)
> >
> >  static void __zs_compact(struct zs_pool *pool, struct size_class *class)
> >  {
> >-    struct zs_compact_control cc;
> >-    struct page *src_page;
> >+    struct page *src_page = NULL;
> >     struct page *dst_page = NULL;
> >
> >-    spin_lock(&class->lock);
> >-    while ((src_page = isolate_source_page(class))) {
> >+    while (1) {
> >+            int nr_migrated;
> >
> >-            if (!zs_can_compact(class))
> >+            spin_lock(&class->lock);
> >+            if (!zs_can_compact(class)) {
> >+                    spin_unlock(&class->lock);
> >                     break;
> >+            }
> >
> >-            cc.index = 0;
> >-            cc.s_page = src_page;
> >+            /*
> >+             * Isolate source page and freeze all objects in a zspage
> >+             * to prevent zspage destroying.
> >+             */
> >+            if (!src_page) {
> >+                    src_page = isolate_source_page(class);
> >+                    if (!src_page) {
> >+                            spin_unlock(&class->lock);
> >+                            break;
> >+                    }
> >+            }
> >
> >-            while ((dst_page = isolate_target_page(class))) {
> >-                    cc.d_page = dst_page;
> >-                    /*
> >-                     * If there is no more space in dst_page, resched
> >-                     * and see if anyone had allocated another zspage.
> >-                     */
> >-                    if (!migrate_zspage(pool, class, &cc))
> >+            /* Isolate target page and freeze all objects in the zspage */
> >+            if (!dst_page) {
> >+                    dst_page = isolate_target_page(class);
> >+                    if (!dst_page) {
> >+                            spin_unlock(&class->lock);
> >                             break;
> >+                    }
> >+            }
> >+            spin_unlock(&class->lock);
> 
> (Sorry to delete individual recipients due to my compliance issues.)
> 
> Hello, Minchan.
> 
> 
> Is it safe to unlock?
> 
> 
> (I assume that the system has 2 cores
> and a swap device is using zsmalloc pool.)
> If a zs compact context scheduled out after this "spin_unlock" line,
> 
> 
>    CPU A (Swap In)                CPU B (zs_free by process killed)
> ---------------------           -------------------------
>                                 ...
>                                 spin_lock(&si->lock)
>                                 ...
>                                # assume it is pinned by zs_compact context.
>                                 pin_tag(handle) --> block
> 
> ...
> spin_lock(&si->lock) --> block
> 
> 
> I think CPU A and CPU B may not be released forever.
> Am I missing something?

You didn't miss anything. It could be dead locked.
The swap_slot_free_notify is always really problem. :(
That's why I want to remove it.
I will think over how to handle it and send fix in next revision.

Thanks for the review!

Reply via email to