Author: delphij
Date: Tue Jul 29 07:08:32 2014
New Revision: 269223
URL: http://svnweb.freebsd.org/changeset/base/269223

Log:
  4873 zvol unmap calls can take a very long time for larger datasets
  Reviewed by: George Wilson <geo...@delphix.com>
  Reviewed by: Matthew Ahrens <mahr...@delphix.com>
  Reviewed by: Paul Dagnelie <paul.dagne...@delphix.com>
  Reviewed by: Basil Crow <basil.c...@delphix.com>
  Reviewed by: Dan McDonald <dan...@omniti.com>
  Approved by: Robert Mustacchi <r...@joyent.com>
  
  illumos/illumos-gate@0f6d88aded0d165f5954688a9b13bac76c38da84

Modified:
  vendor-sys/illumos/dist/common/avl/avl.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/dbuf.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/dnode.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/dnode_sync.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dbuf.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dnode.h
  vendor-sys/illumos/dist/uts/common/sys/avl.h

Changes in other areas also in this revision:
Modified:
  vendor/illumos/dist/common/avl/avl.c

Modified: vendor-sys/illumos/dist/common/avl/avl.c
==============================================================================
--- vendor-sys/illumos/dist/common/avl/avl.c    Tue Jul 29 06:57:13 2014        
(r269222)
+++ vendor-sys/illumos/dist/common/avl/avl.c    Tue Jul 29 07:08:32 2014        
(r269223)
@@ -24,6 +24,10 @@
  */
 
 /*
+ * Copyright (c) 2014 by Delphix. All rights reserved.
+ */
+
+/*
  * AVL - generic AVL tree implementation for kernel use
  *
  * A complete description of AVL trees can be found in many CS textbooks.
@@ -85,6 +89,12 @@
  *       is a modified "avl_node_t *".  The bottom bit (normally 0 for a
  *       pointer) is set to indicate if that the new node has a value greater
  *       than the value of the indicated "avl_node_t *".
+ *
+ * Note - in addition to userland (e.g. libavl and libutil) and the kernel
+ * (e.g. genunix), avl.c is compiled into ld.so and kmdb's genunix module,
+ * which each have their own compilation environments and subsequent
+ * requirements. Each of these environments must be considered when adding
+ * dependencies from avl.c.
  */
 
 #include <sys/types.h>
@@ -864,6 +874,24 @@ avl_update(avl_tree_t *t, void *obj)
        return (B_FALSE);
 }
 
+void
+avl_swap(avl_tree_t *tree1, avl_tree_t *tree2)
+{
+       avl_node_t *temp_node;
+       ulong_t temp_numnodes;
+
+       ASSERT3P(tree1->avl_compar, ==, tree2->avl_compar);
+       ASSERT3U(tree1->avl_offset, ==, tree2->avl_offset);
+       ASSERT3U(tree1->avl_size, ==, tree2->avl_size);
+
+       temp_node = tree1->avl_root;
+       temp_numnodes = tree1->avl_numnodes;
+       tree1->avl_root = tree2->avl_root;
+       tree1->avl_numnodes = tree2->avl_numnodes;
+       tree2->avl_root = temp_node;
+       tree2->avl_numnodes = temp_numnodes;
+}
+
 /*
  * initialize a new AVL tree
  */

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/dbuf.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/dbuf.c    Tue Jul 29 06:57:13 
2014        (r269222)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/dbuf.c    Tue Jul 29 07:08:32 
2014        (r269223)
@@ -69,6 +69,9 @@ dbuf_cons(void *vdb, void *unused, int k
        mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL);
        cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL);
        refcount_create(&db->db_holds);
+
+       db->db_creation = gethrtime();
+
        return (0);
 }
 
@@ -330,7 +333,7 @@ dbuf_verify(dmu_buf_impl_t *db)
                ASSERT3U(db->db_level, <, dn->dn_nlevels);
                ASSERT(db->db_blkid == DMU_BONUS_BLKID ||
                    db->db_blkid == DMU_SPILL_BLKID ||
-                   !list_is_empty(&dn->dn_dbufs));
+                   !avl_is_empty(&dn->dn_dbufs));
        }
        if (db->db_blkid == DMU_BONUS_BLKID) {
                ASSERT(dn != NULL);
@@ -803,18 +806,30 @@ dbuf_unoverride(dbuf_dirty_record_t *dr)
  * receive; see comment below for details.
  */
 void
-dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx)
+dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
+    dmu_tx_t *tx)
 {
-       dmu_buf_impl_t *db, *db_next;
+       dmu_buf_impl_t *db, *db_next, db_search;
        uint64_t txg = tx->tx_txg;
+       avl_index_t where;
 
-       if (end > dn->dn_maxblkid && (end != DMU_SPILL_BLKID))
-               end = dn->dn_maxblkid;
-       dprintf_dnode(dn, "start=%llu end=%llu\n", start, end);
+       if (end_blkid > dn->dn_maxblkid && (end_blkid != DMU_SPILL_BLKID))
+               end_blkid = dn->dn_maxblkid;
+       dprintf_dnode(dn, "start=%llu end=%llu\n", start_blkid, end_blkid);
+
+       db_search.db_level = 0;
+       db_search.db_blkid = start_blkid;
+       db_search.db_creation = 0;
 
        mutex_enter(&dn->dn_dbufs_mtx);
-       if (start >= dn->dn_unlisted_l0_blkid * dn->dn_datablksz) {
+       if (start_blkid >= dn->dn_unlisted_l0_blkid) {
                /* There can't be any dbufs in this range; no need to search. */
+#ifdef DEBUG
+               db = avl_find(&dn->dn_dbufs, &db_search, &where);
+               ASSERT3P(db, ==, NULL);
+               db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
+               ASSERT(db == NULL || db->db_level > 0);
+#endif
                mutex_exit(&dn->dn_dbufs_mtx);
                return;
        } else if (dmu_objset_is_receiving(dn->dn_objset)) {
@@ -828,14 +843,18 @@ dbuf_free_range(dnode_t *dn, uint64_t st
                atomic_inc_64(&zfs_free_range_recv_miss);
        }
 
-       for (db = list_head(&dn->dn_dbufs); db != NULL; db = db_next) {
-               db_next = list_next(&dn->dn_dbufs, db);
+       db = avl_find(&dn->dn_dbufs, &db_search, &where);
+       ASSERT3P(db, ==, NULL);
+       db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
+
+       for (; db != NULL; db = db_next) {
+               db_next = AVL_NEXT(&dn->dn_dbufs, db);
                ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 
-               if (db->db_level != 0)
-                       continue;
-               if (db->db_blkid < start || db->db_blkid > end)
-                       continue;
+               if (db->db_level != 0 || db->db_blkid > end_blkid) {
+                       break;
+               }
+               ASSERT3U(db->db_blkid, >=, start_blkid);
 
                /* found a level 0 buffer in the range */
                mutex_enter(&db->db_mtx);
@@ -1585,7 +1604,7 @@ dbuf_clear(dmu_buf_impl_t *db)
        dn = DB_DNODE(db);
        dndb = dn->dn_dbuf;
        if (db->db_blkid != DMU_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) {
-               list_remove(&dn->dn_dbufs, db);
+               avl_remove(&dn->dn_dbufs, db);
                (void) atomic_dec_32_nv(&dn->dn_dbufs_count);
                membar_producer();
                DB_DNODE_EXIT(db);
@@ -1748,7 +1767,7 @@ dbuf_create(dnode_t *dn, uint8_t level, 
                mutex_exit(&dn->dn_dbufs_mtx);
                return (odb);
        }
-       list_insert_head(&dn->dn_dbufs, db);
+       avl_add(&dn->dn_dbufs, db);
        if (db->db_level == 0 && db->db_blkid >=
            dn->dn_unlisted_l0_blkid)
                dn->dn_unlisted_l0_blkid = db->db_blkid + 1;
@@ -1807,7 +1826,7 @@ dbuf_destroy(dmu_buf_impl_t *db)
                        DB_DNODE_ENTER(db);
                        dn = DB_DNODE(db);
                        mutex_enter(&dn->dn_dbufs_mtx);
-                       list_remove(&dn->dn_dbufs, db);
+                       avl_remove(&dn->dn_dbufs, db);
                        (void) atomic_dec_32_nv(&dn->dn_dbufs_count);
                        mutex_exit(&dn->dn_dbufs_mtx);
                        DB_DNODE_EXIT(db);
@@ -1825,7 +1844,6 @@ dbuf_destroy(dmu_buf_impl_t *db)
        db->db_parent = NULL;
        db->db_buf = NULL;
 
-       ASSERT(!list_link_active(&db->db_link));
        ASSERT(db->db.db_data == NULL);
        ASSERT(db->db_hash_next == NULL);
        ASSERT(db->db_blkptr == NULL);

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/dnode.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/dnode.c   Tue Jul 29 06:57:13 
2014        (r269222)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/dnode.c   Tue Jul 29 07:08:32 
2014        (r269223)
@@ -59,6 +59,43 @@ int zfs_default_ibs = DN_MAX_INDBLKSHIFT
 
 static kmem_cbrc_t dnode_move(void *, void *, size_t, void *);
 
+static int
+dbuf_compare(const void *x1, const void *x2)
+{
+       const dmu_buf_impl_t *d1 = x1;
+       const dmu_buf_impl_t *d2 = x2;
+
+       if (d1->db_level < d2->db_level) {
+               return (-1);
+       } else if (d1->db_level > d2->db_level) {
+               return (1);
+       }
+
+       if (d1->db_blkid < d2->db_blkid) {
+               return (-1);
+       } else if (d1->db_blkid > d2->db_blkid) {
+               return (1);
+       }
+
+       /*
+        * If a dbuf is being evicted while dn_dbufs_mutex is not held, we set
+        * the db_state to DB_EVICTING but do not remove it from dn_dbufs. If
+        * another thread creates a dbuf of the same blkid before the dbuf is
+        * removed from dn_dbufs, we can reach a state where there are two
+        * dbufs of the same blkid and level in db_dbufs. To maintain the avl
+        * invariant that there cannot be duplicate items, we distinguish
+        * between these two dbufs based on the time they were created.
+        */
+       if (d1->db_creation < d2->db_creation) {
+               return (-1);
+       } else if (d1->db_creation > d2->db_creation) {
+               return (1);
+       } else {
+               ASSERT3P(d1, ==, d2);
+               return (0);
+       }
+}
+
 /* ARGSUSED */
 static int
 dnode_cons(void *arg, void *unused, int kmflag)
@@ -113,7 +150,7 @@ dnode_cons(void *arg, void *unused, int 
 
        dn->dn_dbufs_count = 0;
        dn->dn_unlisted_l0_blkid = 0;
-       list_create(&dn->dn_dbufs, sizeof (dmu_buf_impl_t),
+       avl_create(&dn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t),
            offsetof(dmu_buf_impl_t, db_link));
 
        dn->dn_moved = 0;
@@ -166,7 +203,7 @@ dnode_dest(void *arg, void *unused)
 
        ASSERT0(dn->dn_dbufs_count);
        ASSERT0(dn->dn_unlisted_l0_blkid);
-       list_destroy(&dn->dn_dbufs);
+       avl_destroy(&dn->dn_dbufs);
 }
 
 void
@@ -502,7 +539,7 @@ dnode_allocate(dnode_t *dn, dmu_object_t
        ASSERT0(dn->dn_assigned_txg);
        ASSERT(refcount_is_zero(&dn->dn_tx_holds));
        ASSERT3U(refcount_count(&dn->dn_holds), <=, 1);
-       ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL);
+       ASSERT(avl_is_empty(&dn->dn_dbufs));
 
        for (i = 0; i < TXG_SIZE; i++) {
                ASSERT0(dn->dn_next_nblkptr[i]);
@@ -687,8 +724,8 @@ dnode_move_impl(dnode_t *odn, dnode_t *n
        ndn->dn_dirtyctx_firstset = odn->dn_dirtyctx_firstset;
        ASSERT(refcount_count(&odn->dn_tx_holds) == 0);
        refcount_transfer(&ndn->dn_holds, &odn->dn_holds);
-       ASSERT(list_is_empty(&ndn->dn_dbufs));
-       list_move_tail(&ndn->dn_dbufs, &odn->dn_dbufs);
+       ASSERT(avl_is_empty(&ndn->dn_dbufs));
+       avl_swap(&ndn->dn_dbufs, &odn->dn_dbufs);
        ndn->dn_dbufs_count = odn->dn_dbufs_count;
        ndn->dn_unlisted_l0_blkid = odn->dn_unlisted_l0_blkid;
        ndn->dn_bonus = odn->dn_bonus;
@@ -722,7 +759,7 @@ dnode_move_impl(dnode_t *odn, dnode_t *n
         */
        odn->dn_dbuf = NULL;
        odn->dn_handle = NULL;
-       list_create(&odn->dn_dbufs, sizeof (dmu_buf_impl_t),
+       avl_create(&odn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t),
            offsetof(dmu_buf_impl_t, db_link));
        odn->dn_dbufs_count = 0;
        odn->dn_unlisted_l0_blkid = 0;
@@ -1231,7 +1268,8 @@ dnode_setdirty(dnode_t *dn, dmu_tx_t *tx
                return;
        }
 
-       ASSERT(!refcount_is_zero(&dn->dn_holds) || list_head(&dn->dn_dbufs));
+       ASSERT(!refcount_is_zero(&dn->dn_holds) ||
+           !avl_is_empty(&dn->dn_dbufs));
        ASSERT(dn->dn_datablksz != 0);
        ASSERT0(dn->dn_next_bonuslen[txg&TXG_MASK]);
        ASSERT0(dn->dn_next_blksz[txg&TXG_MASK]);
@@ -1304,7 +1342,7 @@ dnode_free(dnode_t *dn, dmu_tx_t *tx)
 int
 dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
 {
-       dmu_buf_impl_t *db, *db_next;
+       dmu_buf_impl_t *db;
        int err;
 
        if (size == 0)
@@ -1327,9 +1365,8 @@ dnode_set_blksz(dnode_t *dn, uint64_t si
                goto fail;
 
        mutex_enter(&dn->dn_dbufs_mtx);
-       for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
-               db_next = list_next(&dn->dn_dbufs, db);
-
+       for (db = avl_first(&dn->dn_dbufs); db != NULL;
+           db = AVL_NEXT(&dn->dn_dbufs, db)) {
                if (db->db_blkid != 0 && db->db_blkid != DMU_BONUS_BLKID &&
                    db->db_blkid != DMU_SPILL_BLKID) {
                        mutex_exit(&dn->dn_dbufs_mtx);

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/dnode_sync.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/dnode_sync.c      Tue Jul 29 
06:57:13 2014        (r269222)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/dnode_sync.c      Tue Jul 29 
07:08:32 2014        (r269223)
@@ -400,16 +400,13 @@ dnode_evict_dbufs(dnode_t *dn)
        int pass = 0;
 
        do {
-               dmu_buf_impl_t *db, marker;
+               dmu_buf_impl_t *db, *db_next;
                int evicting = FALSE;
 
                progress = FALSE;
                mutex_enter(&dn->dn_dbufs_mtx);
-               list_insert_tail(&dn->dn_dbufs, &marker);
-               db = list_head(&dn->dn_dbufs);
-               for (; db != &marker; db = list_head(&dn->dn_dbufs)) {
-                       list_remove(&dn->dn_dbufs, db);
-                       list_insert_tail(&dn->dn_dbufs, db);
+               for (db = avl_first(&dn->dn_dbufs); db != NULL; db = db_next) {
+                       db_next = AVL_NEXT(&dn->dn_dbufs, db);
 #ifdef DEBUG
                        DB_DNODE_ENTER(db);
                        ASSERT3P(DB_DNODE(db), ==, dn);
@@ -429,7 +426,6 @@ dnode_evict_dbufs(dnode_t *dn)
                        }
 
                }
-               list_remove(&dn->dn_dbufs, &marker);
                /*
                 * NB: we need to drop dn_dbufs_mtx between passes so
                 * that any DB_EVICTING dbufs can make progress.
@@ -500,7 +496,7 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *t
 
        dnode_undirty_dbufs(&dn->dn_dirty_records[txgoff]);
        dnode_evict_dbufs(dn);
-       ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL);
+       ASSERT(avl_is_empty(&dn->dn_dbufs));
        ASSERT3P(dn->dn_bonus, ==, NULL);
 
        /*

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dbuf.h
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dbuf.h        Tue Jul 29 
06:57:13 2014        (r269222)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dbuf.h        Tue Jul 29 
07:08:32 2014        (r269223)
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  */
 
@@ -213,11 +213,14 @@ typedef struct dmu_buf_impl {
        /* pointer to most recent dirty record for this buffer */
        dbuf_dirty_record_t *db_last_dirty;
 
+       /* Creation time of dbuf (see comment in dbuf_compare). */
+       hrtime_t db_creation;
+
        /*
         * Our link on the owner dnodes's dn_dbufs list.
         * Protected by its dn_dbufs_mtx.
         */
-       list_node_t db_link;
+       avl_node_t db_link;
 
        /* Data which is unique to data (leaf) blocks: */
 

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dnode.h
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dnode.h       Tue Jul 29 
06:57:13 2014        (r269222)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dnode.h       Tue Jul 29 
07:08:32 2014        (r269223)
@@ -211,7 +211,7 @@ typedef struct dnode {
        refcount_t dn_holds;
 
        kmutex_t dn_dbufs_mtx;
-       list_t dn_dbufs;                /* descendent dbufs */
+       avl_tree_t dn_dbufs;            /* descendent dbufs */
 
        /* protected by dn_struct_rwlock */
        struct dmu_buf_impl *dn_bonus;  /* bonus buffer dbuf */

Modified: vendor-sys/illumos/dist/uts/common/sys/avl.h
==============================================================================
--- vendor-sys/illumos/dist/uts/common/sys/avl.h        Tue Jul 29 06:57:13 
2014        (r269222)
+++ vendor-sys/illumos/dist/uts/common/sys/avl.h        Tue Jul 29 07:08:32 
2014        (r269223)
@@ -23,6 +23,10 @@
  * Use is subject to license terms.
  */
 
+/*
+ * Copyright (c) 2014 by Delphix. All rights reserved.
+ */
+
 #ifndef        _AVL_H
 #define        _AVL_H
 
@@ -260,6 +264,11 @@ extern boolean_t avl_update_lt(avl_tree_
 extern boolean_t avl_update_gt(avl_tree_t *, void *);
 
 /*
+ * Swaps the contents of the two trees.
+ */
+extern void avl_swap(avl_tree_t *tree1, avl_tree_t *tree2);
+
+/*
  * Return the number of nodes in the tree
  */
 extern ulong_t avl_numnodes(avl_tree_t *tree);
_______________________________________________
svn-src-all@freebsd.org mailing list
http://lists.freebsd.org/mailman/listinfo/svn-src-all
To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"

Reply via email to