Author: mm
Date: Mon Feb 25 12:33:31 2013
New Revision: 247265
URL: http://svnweb.freebsd.org/changeset/base/247265

Log:
  MFV v242732:
  
  Merge the ZFS I/O deadman thread from vendor (illumos).
  This feature panics the system when ZFS I/O hangs, which aids debugging
  and allows the failed service to be restored.
  
  The panic behavior can be controlled with the loader-only tunables:
  vfs.zfs.deadman_enabled (enable or disable panic on stalled ZFS I/O)
  vfs.zfs.deadman_synctime (expiration time for stalled ZFS I/O)
  
  The ZFS I/O deadman is enabled by default on amd64 and i386,
  excluding virtual guest machines.
  
  Illumos ZFS issues:
    3246 ZFS I/O deadman thread
  
  References:
    https://www.illumos.org/issues/3246
  
  MFC after:    2 weeks

Modified:
  head/cddl/contrib/opensolaris/cmd/zinject/translate.c
  head/cddl/contrib/opensolaris/cmd/zinject/zinject.c
  head/cddl/contrib/opensolaris/lib/libzpool/common/kernel.c
  head/cddl/contrib/opensolaris/lib/libzpool/common/sys/zfs_context.h
  head/sys/cddl/compat/opensolaris/sys/time.h
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_boot.h
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c
Directory Properties:
  head/cddl/contrib/opensolaris/   (props changed)
  head/sys/cddl/contrib/opensolaris/   (props changed)

Modified: head/cddl/contrib/opensolaris/cmd/zinject/translate.c
==============================================================================
--- head/cddl/contrib/opensolaris/cmd/zinject/translate.c       Mon Feb 25 
11:22:54 2013        (r247264)
+++ head/cddl/contrib/opensolaris/cmd/zinject/translate.c       Mon Feb 25 
12:33:31 2013        (r247265)
@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 #include <libzfs.h>
@@ -455,6 +456,20 @@ translate_device(const char *pool, const
                    &record->zi_guid) == 0);
        }
 
+       /*
+        * Device faults can take on three different forms:
+        * 1). delayed or hanging I/O
+        * 2). zfs label faults
+        * 3). generic disk faults
+        */
+       if (record->zi_timer != 0) {
+               record->zi_cmd = ZINJECT_DELAY_IO;
+       } else if (label_type != TYPE_INVAL) {
+               record->zi_cmd = ZINJECT_LABEL_FAULT;
+       } else {
+               record->zi_cmd = ZINJECT_DEVICE_FAULT;
+       }
+
        switch (label_type) {
        case TYPE_LABEL_UBERBLOCK:
                record->zi_start = offsetof(vdev_label_t, vl_uberblock[0]);

Modified: head/cddl/contrib/opensolaris/cmd/zinject/zinject.c
==============================================================================
--- head/cddl/contrib/opensolaris/cmd/zinject/zinject.c Mon Feb 25 11:22:54 
2013        (r247264)
+++ head/cddl/contrib/opensolaris/cmd/zinject/zinject.c Mon Feb 25 12:33:31 
2013        (r247265)
@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 /*
@@ -603,7 +604,7 @@ main(int argc, char **argv)
        }
 
        while ((c = getopt(argc, argv,
-           ":aA:b:d:f:Fg:qhIc:t:T:l:mr:s:e:uL:p:")) != -1) {
+           ":aA:b:d:D:f:Fg:qhIc:t:T:l:mr:s:e:uL:p:")) != -1) {
                switch (c) {
                case 'a':
                        flags |= ZINJECT_FLUSH_ARC;
@@ -629,6 +630,15 @@ main(int argc, char **argv)
                case 'd':
                        device = optarg;
                        break;
+               case 'D':
+                       record.zi_timer = strtoull(optarg, &end, 10);
+                       if (errno != 0 || *end != '\0') {
+                               (void) fprintf(stderr, "invalid i/o delay "
+                                   "value: '%s'\n", optarg);
+                               usage();
+                               return (1);
+                       }
+                       break;
                case 'e':
                        if (strcasecmp(optarg, "io") == 0) {
                                error = EIO;
@@ -693,6 +703,7 @@ main(int argc, char **argv)
                case 'p':
                        (void) strlcpy(record.zi_func, optarg,
                            sizeof (record.zi_func));
+                       record.zi_cmd = ZINJECT_PANIC;
                        break;
                case 'q':
                        quiet = 1;
@@ -766,13 +777,15 @@ main(int argc, char **argv)
        argc -= optind;
        argv += optind;
 
+       if (record.zi_duration != 0)
+               record.zi_cmd = ZINJECT_IGNORED_WRITES;
+
        if (cancel != NULL) {
                /*
                 * '-c' is invalid with any other options.
                 */
                if (raw != NULL || range != NULL || type != TYPE_INVAL ||
-                   level != 0 || record.zi_func[0] != '\0' ||
-                   record.zi_duration != 0) {
+                   level != 0 || record.zi_cmd != ZINJECT_UNINITIALIZED) {
                        (void) fprintf(stderr, "cancel (-c) incompatible with "
                            "any other options\n");
                        usage();
@@ -804,8 +817,7 @@ main(int argc, char **argv)
                 * for doing injection, so handle it separately here.
                 */
                if (raw != NULL || range != NULL || type != TYPE_INVAL ||
-                   level != 0 || record.zi_func[0] != '\0' ||
-                   record.zi_duration != 0) {
+                   level != 0 || record.zi_cmd != ZINJECT_UNINITIALIZED) {
                        (void) fprintf(stderr, "device (-d) incompatible with "
                            "data error injection\n");
                        usage();
@@ -839,7 +851,7 @@ main(int argc, char **argv)
 
        } else if (raw != NULL) {
                if (range != NULL || type != TYPE_INVAL || level != 0 ||
-                   record.zi_func[0] != '\0' || record.zi_duration != 0) {
+                   record.zi_cmd != ZINJECT_UNINITIALIZED) {
                        (void) fprintf(stderr, "raw (-b) format with "
                            "any other options\n");
                        usage();
@@ -862,13 +874,14 @@ main(int argc, char **argv)
                        return (1);
                }
 
+               record.zi_cmd = ZINJECT_DATA_FAULT;
                if (translate_raw(raw, &record) != 0)
                        return (1);
                if (!error)
                        error = EIO;
-       } else if (record.zi_func[0] != '\0') {
+       } else if (record.zi_cmd == ZINJECT_PANIC) {
                if (raw != NULL || range != NULL || type != TYPE_INVAL ||
-                   level != 0 || device != NULL || record.zi_duration != 0) {
+                   level != 0 || device != NULL) {
                        (void) fprintf(stderr, "panic (-p) incompatible with "
                            "other options\n");
                        usage();
@@ -886,7 +899,7 @@ main(int argc, char **argv)
                if (argv[1] != NULL)
                        record.zi_type = atoi(argv[1]);
                dataset[0] = '\0';
-       } else if (record.zi_duration != 0) {
+       } else if (record.zi_cmd == ZINJECT_IGNORED_WRITES) {
                if (nowrites == 0) {
                        (void) fprintf(stderr, "-s or -g meaningless "
                            "without -I (ignore writes)\n");
@@ -940,6 +953,7 @@ main(int argc, char **argv)
                        return (1);
                }
 
+               record.zi_cmd = ZINJECT_DATA_FAULT;
                if (translate_record(type, argv[0], range, level, &record, pool,
                    dataset) != 0)
                        return (1);

Modified: head/cddl/contrib/opensolaris/lib/libzpool/common/kernel.c
==============================================================================
--- head/cddl/contrib/opensolaris/lib/libzpool/common/kernel.c  Mon Feb 25 
11:22:54 2013        (r247264)
+++ head/cddl/contrib/opensolaris/lib/libzpool/common/kernel.c  Mon Feb 25 
12:33:31 2013        (r247265)
@@ -45,6 +45,9 @@ int aok;
 uint64_t physmem;
 vnode_t *rootdir = (vnode_t *)0xabcd1234;
 char hw_serial[HW_HOSTID_LEN];
+#ifdef illumos
+kmutex_t cpu_lock;
+#endif
 
 struct utsname utsname = {
        "userland", "libzpool", "1", "1", "na"
@@ -842,6 +845,28 @@ ddi_strtoull(const char *str, char **npt
        return (0);
 }
 
+#ifdef illumos
+/* ARGSUSED */
+cyclic_id_t
+cyclic_add(cyc_handler_t *hdlr, cyc_time_t *when)
+{
+       return (1);
+}
+
+/* ARGSUSED */
+void
+cyclic_remove(cyclic_id_t id)
+{
+}
+
+/* ARGSUSED */
+int
+cyclic_reprogram(cyclic_id_t id, hrtime_t expiration)
+{
+       return (1);
+}
+#endif
+
 /*
  * =========================================================================
  * kernel emulation setup & teardown
@@ -875,6 +900,10 @@ kernel_init(int mode)
 
        system_taskq_init();
 
+#ifdef illumos
+       mutex_init(&cpu_lock, NULL, MUTEX_DEFAULT, NULL);
+#endif
+
        spa_init(mode);
 }
 

Modified: head/cddl/contrib/opensolaris/lib/libzpool/common/sys/zfs_context.h
==============================================================================
--- head/cddl/contrib/opensolaris/lib/libzpool/common/sys/zfs_context.h Mon Feb 
25 11:22:54 2013        (r247264)
+++ head/cddl/contrib/opensolaris/lib/libzpool/common/sys/zfs_context.h Mon Feb 
25 12:33:31 2013        (r247265)
@@ -457,6 +457,9 @@ extern vnode_t *rootdir;
 
 extern void delay(clock_t ticks);
 
+#define        SEC_TO_TICK(sec)        ((sec) * hz)
+#define        NSEC_TO_TICK(usec)      ((usec) / (NANOSEC / hz))
+
 #define        gethrestime_sec() time(NULL)
 #define        gethrestime(t) \
        do {\
@@ -624,6 +627,36 @@ typedef    uint32_t        idmap_rid_t;
 #define        ERESTART        (-1)
 #endif
 
+#ifdef illumos
+/*
+ * Cyclic information
+ */
+extern kmutex_t cpu_lock;
+
+typedef uintptr_t cyclic_id_t;
+typedef uint16_t cyc_level_t;
+typedef void (*cyc_func_t)(void *);
+
+#define        CY_LOW_LEVEL    0
+#define        CY_INFINITY     INT64_MAX
+#define        CYCLIC_NONE     ((cyclic_id_t)0)
+
+typedef struct cyc_time {
+       hrtime_t cyt_when;
+       hrtime_t cyt_interval;
+} cyc_time_t;
+
+typedef struct cyc_handler {
+       cyc_func_t cyh_func;
+       void *cyh_arg;
+       cyc_level_t cyh_level;
+} cyc_handler_t;
+
+extern cyclic_id_t cyclic_add(cyc_handler_t *, cyc_time_t *);
+extern void cyclic_remove(cyclic_id_t);
+extern int cyclic_reprogram(cyclic_id_t, hrtime_t);
+#endif /* illumos */
+
 #ifdef __cplusplus
 }
 #endif

Modified: head/sys/cddl/compat/opensolaris/sys/time.h
==============================================================================
--- head/sys/cddl/compat/opensolaris/sys/time.h Mon Feb 25 11:22:54 2013        
(r247264)
+++ head/sys/cddl/compat/opensolaris/sys/time.h Mon Feb 25 12:33:31 2013        
(r247265)
@@ -46,6 +46,9 @@ typedef longlong_t    hrtime_t;
        ((ts)->tv_sec < INT64_MIN || (ts)->tv_sec > INT64_MAX)
 #endif
 
+#define        SEC_TO_TICK(sec)        ((sec) * hz)
+#define        NSEC_TO_TICK(usec)      ((usec) / (NANOSEC / hz))
+
 #ifdef _KERNEL
 static __inline hrtime_t
 gethrtime(void) {

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c   Mon Feb 25 
11:22:54 2013        (r247264)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c   Mon Feb 25 
12:33:31 2013        (r247265)
@@ -22,6 +22,7 @@
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2013 Martin Matuska <m...@freebsd.org>. All rights reserved.
  */
 
 /*
@@ -141,6 +142,10 @@ uint_t             zio_taskq_basedc = 80;          /* base 
 boolean_t      spa_create_process = B_TRUE;    /* no process ==> no sysdc */
 extern int     zfs_sync_pass_deferred_free;
 
+#ifndef illumos
+extern void spa_deadman(void *arg);
+#endif
+
 /*
  * This (illegal) pool name is used when temporarily importing a spa_t in order
  * to get the vdev stats associated with the imported devices.
@@ -6258,6 +6263,17 @@ spa_sync(spa_t *spa, uint64_t txg)
 
        tx = dmu_tx_create_assigned(dp, txg);
 
+       spa->spa_sync_starttime = gethrtime();
+#ifdef illumos
+       VERIFY(cyclic_reprogram(spa->spa_deadman_cycid,
+           spa->spa_sync_starttime + spa->spa_deadman_synctime));
+#else  /* FreeBSD */
+#ifdef _KERNEL
+       callout_reset(&spa->spa_deadman_cycid,
+           hz * spa->spa_deadman_synctime / NANOSEC, spa_deadman, spa);
+#endif
+#endif
+
        /*
         * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg,
         * set spa_deflate if we have no raid-z vdevs.
@@ -6386,6 +6402,14 @@ spa_sync(spa_t *spa, uint64_t txg)
        }
        dmu_tx_commit(tx);
 
+#ifdef illumos
+       VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, CY_INFINITY));
+#else  /* FreeBSD */
+#ifdef _KERNEL
+       callout_drain(&spa->spa_deadman_cycid);
+#endif
+#endif
+
        /*
         * Clear the dirty config list.
         */

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c      Mon Feb 
25 11:22:54 2013        (r247264)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c      Mon Feb 
25 12:33:31 2013        (r247265)
@@ -22,10 +22,12 @@
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012 by Delphix. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
+ * Copyright 2013 Martin Matuska <m...@freebsd.org>. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/spa_impl.h>
+#include <sys/spa_boot.h>
 #include <sys/zio.h>
 #include <sys/zio_checksum.h>
 #include <sys/zio_compress.h>
@@ -253,6 +255,52 @@ TUNABLE_INT("vfs.zfs.recover", &zfs_reco
 SYSCTL_INT(_vfs_zfs, OID_AUTO, recover, CTLFLAG_RDTUN, &zfs_recover, 0,
     "Try to recover from otherwise-fatal errors.");
 
+extern int zfs_txg_synctime_ms;
+
+/*
+ * Expiration time in units of zfs_txg_synctime_ms. This value has two
+ * meanings. First it is used to determine when the spa_deadman logic
+ * should fire. By default the spa_deadman will fire if spa_sync has
+ * not completed in 1000 * zfs_txg_synctime_ms (i.e. 1000 seconds).
+ * Secondly, the value determines if an I/O is considered "hung".
+ * Any I/O that has not completed in zfs_deadman_synctime is considered
+ * "hung" resulting in a system panic.
+ * The default is 1000 zfs_txg_synctime_ms units (i.e. 1000 seconds).
+ */
+uint64_t zfs_deadman_synctime = 1000ULL;
+TUNABLE_QUAD("vfs.zfs.deadman_synctime", &zfs_deadman_synctime);
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, deadman_synctime, CTLFLAG_RDTUN,
+    &zfs_deadman_synctime, 0,
+    "Stalled ZFS I/O expiration time in units of vfs.zfs.txg_synctime_ms");
+
+/*
+ * Default value of -1 for zfs_deadman_enabled is resolved in
+ * zfs_deadman_init()
+ */
+int zfs_deadman_enabled = -1;
+TUNABLE_INT("vfs.zfs.deadman_enabled", &zfs_deadman_enabled);
+SYSCTL_INT(_vfs_zfs, OID_AUTO, deadman_enabled, CTLFLAG_RDTUN,
+    &zfs_deadman_enabled, 0, "Kernel panic on stalled ZFS I/O");
+
+#ifndef illumos
+#ifdef _KERNEL
+static void
+zfs_deadman_init()
+{
+       /*
+        * If we are not i386 or amd64 or in a virtual machine,
+        * disable ZFS deadman thread by default
+        */
+       if (zfs_deadman_enabled == -1) {
+#if defined(__amd64__) || defined(__i386__)
+               zfs_deadman_enabled = (vm_guest == VM_GUEST_NO) ? 1 : 0;
+#else
+               zfs_deadman_enabled = 0;
+#endif
+       }
+}
+#endif /* _KERNEL */
+#endif /* !illumos */
 
 /*
  * ==========================================================================
@@ -422,6 +470,23 @@ spa_lookup(const char *name)
 }
 
 /*
+ * Fires when spa_sync has not completed within zfs_deadman_synctime
+ * (in units of zfs_txg_synctime_ms). If zfs_deadman_enabled is set, it
+ * inspects all vdev queues looking for potentially hung I/Os.
+ */
+void
+spa_deadman(void *arg)
+{
+       spa_t *spa = arg;
+
+       zfs_dbgmsg("slow spa_sync: started %llu seconds ago, calls %llu",
+           (gethrtime() - spa->spa_sync_starttime) / NANOSEC,
+           ++spa->spa_deadman_calls);
+       if (zfs_deadman_enabled)
+               vdev_deadman(spa->spa_root_vdev);
+}
+
+/*
  * Create an uninitialized spa_t with the given name.  Requires
  * spa_namespace_lock.  The caller must ensure that the spa_t doesn't already
  * exist by calling spa_lookup() first.
@@ -431,6 +496,10 @@ spa_add(const char *name, nvlist_t *conf
 {
        spa_t *spa;
        spa_config_dirent_t *dp;
+#ifdef illumos
+       cyc_handler_t hdlr;
+       cyc_time_t when;
+#endif
 
        ASSERT(MUTEX_HELD(&spa_namespace_lock));
 
@@ -462,6 +531,32 @@ spa_add(const char *name, nvlist_t *conf
        spa->spa_proc = &p0;
        spa->spa_proc_state = SPA_PROC_NONE;
 
+#ifdef illumos
+       hdlr.cyh_func = spa_deadman;
+       hdlr.cyh_arg = spa;
+       hdlr.cyh_level = CY_LOW_LEVEL;
+#endif
+
+       spa->spa_deadman_synctime = zfs_deadman_synctime *
+           zfs_txg_synctime_ms * MICROSEC;
+
+#ifdef illumos
+       /*
+        * This determines how often we need to check for hung I/Os after
+        * the cyclic has already fired. Since checking for hung I/Os is
+        * an expensive operation we don't want to check too frequently.
+        * Instead wait for 5 synctimes before checking again.
+        */
+       when.cyt_interval = 5ULL * zfs_txg_synctime_ms * MICROSEC;
+       when.cyt_when = CY_INFINITY;
+       mutex_enter(&cpu_lock);
+       spa->spa_deadman_cycid = cyclic_add(&hdlr, &when);
+       mutex_exit(&cpu_lock);
+#else  /* !illumos */
+#ifdef _KERNEL
+       callout_init(&spa->spa_deadman_cycid, CALLOUT_MPSAFE);
+#endif
+#endif
        refcount_create(&spa->spa_refcount);
        spa_config_lock_init(spa);
 
@@ -544,6 +639,18 @@ spa_remove(spa_t *spa)
        nvlist_free(spa->spa_load_info);
        spa_config_set(spa, NULL);
 
+#ifdef illumos
+       mutex_enter(&cpu_lock);
+       if (spa->spa_deadman_cycid != CYCLIC_NONE)
+               cyclic_remove(spa->spa_deadman_cycid);
+       mutex_exit(&cpu_lock);
+       spa->spa_deadman_cycid = CYCLIC_NONE;
+#else  /* !illumos */
+#ifdef _KERNEL
+       callout_drain(&spa->spa_deadman_cycid);
+#endif
+#endif
+
        refcount_destroy(&spa->spa_refcount);
 
        spa_config_lock_destroy(spa);
@@ -1511,6 +1618,12 @@ spa_prev_software_version(spa_t *spa)
 }
 
 uint64_t
+spa_deadman_synctime(spa_t *spa)
+{
+       return (spa->spa_deadman_synctime);
+}
+
+uint64_t
 dva_get_dsize_sync(spa_t *spa, const dva_t *dva)
 {
        uint64_t asize = DVA_GET_ASIZE(dva);
@@ -1605,7 +1718,9 @@ spa_init(int mode)
        spa_mode_global = mode;
 
 #ifdef illumos
-#ifndef _KERNEL
+#ifdef _KERNEL
+       spa_arch_init();
+#else
        if (spa_mode_global != FREAD && dprintf_find_string("watch")) {
                arc_procfd = open("/proc/self/ctl", O_WRONLY);
                if (arc_procfd == -1) {
@@ -1629,6 +1744,11 @@ spa_init(int mode)
        zpool_feature_init();
        spa_config_load();
        l2arc_start();
+#ifndef illumos
+#ifdef _KERNEL
+       zfs_deadman_init();
+#endif
+#endif /* !illumos */
 }
 
 void

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h       Mon Feb 
25 11:22:54 2013        (r247264)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h       Mon Feb 
25 12:33:31 2013        (r247265)
@@ -599,6 +599,7 @@ extern boolean_t spa_suspended(spa_t *sp
 extern uint64_t spa_bootfs(spa_t *spa);
 extern uint64_t spa_delegation(spa_t *spa);
 extern objset_t *spa_meta_objset(spa_t *spa);
+extern uint64_t spa_deadman_synctime(spa_t *spa);
 
 /* Miscellaneous support routines */
 extern void spa_activate_mos_feature(spa_t *spa, const char *feature);

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_boot.h
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_boot.h  Mon Feb 
25 11:22:54 2013        (r247264)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_boot.h  Mon Feb 
25 12:33:31 2013        (r247265)
@@ -23,6 +23,10 @@
  * Use is subject to license terms.
  */
 
+/*
+ * Copyright (c) 2012 by Delphix. All rights reserved.
+ */
+
 #ifndef _SYS_SPA_BOOT_H
 #define        _SYS_SPA_BOOT_H
 
@@ -35,6 +39,8 @@ extern "C" {
 extern char *spa_get_bootprop(char *prop);
 extern void spa_free_bootprop(char *prop);
 
+extern void spa_arch_init(void);
+
 #ifdef __cplusplus
 }
 #endif

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h  Mon Feb 
25 11:22:54 2013        (r247264)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h  Mon Feb 
25 12:33:31 2013        (r247265)
@@ -22,6 +22,7 @@
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012 by Delphix. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
+ * Copyright 2013 Martin Matuska <m...@freebsd.org>. All rights reserved.
  */
 
 #ifndef _SYS_SPA_IMPL_H
@@ -230,6 +231,16 @@ struct spa {
        uint64_t        spa_feat_for_write_obj; /* required to write to pool */
        uint64_t        spa_feat_for_read_obj;  /* required to read from pool */
        uint64_t        spa_feat_desc_obj;      /* Feature descriptions */
+#ifdef illumos
+       cyclic_id_t     spa_deadman_cycid;      /* cyclic id */
+#else  /* FreeBSD */
+#ifdef _KERNEL
+       struct callout  spa_deadman_cycid;      /* callout id */
+#endif
+#endif /* illumos */
+       uint64_t        spa_deadman_calls;      /* number of deadman calls */
+       uint64_t        spa_sync_starttime;     /* starting time of spa_sync */
+       uint64_t        spa_deadman_synctime;   /* deadman expiration timer */
        /*
         * spa_refcnt & spa_config_lock must be the last elements
         * because refcount_t changes size based on compilation options.

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h      Mon Feb 
25 11:22:54 2013        (r247264)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h      Mon Feb 
25 12:33:31 2013        (r247265)
@@ -80,6 +80,7 @@ extern void vdev_metaslab_fini(vdev_t *v
 extern void vdev_metaslab_set_size(vdev_t *);
 extern void vdev_expand(vdev_t *vd, uint64_t txg);
 extern void vdev_split(vdev_t *vd);
+extern void vdev_deadman(vdev_t *vd);
 
 
 extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs);

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h Mon Feb 
25 11:22:54 2013        (r247264)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h Mon Feb 
25 12:33:31 2013        (r247265)
@@ -104,6 +104,8 @@ struct vdev_queue {
        avl_tree_t      vq_read_tree;
        avl_tree_t      vq_write_tree;
        avl_tree_t      vq_pending_tree;
+       uint64_t        vq_io_complete_ts;
+       uint64_t        vq_io_delta_ts;
        kmutex_t        vq_lock;
 };
 

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h       
Mon Feb 25 11:22:54 2013        (r247264)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h       
Mon Feb 25 12:33:31 2013        (r247265)
@@ -22,6 +22,10 @@
  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
+/*
+ * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
+ */
 
 #ifndef _SYS_ZFS_CONTEXT_H
 #define        _SYS_ZFS_CONTEXT_H
@@ -88,6 +92,11 @@ extern "C" {
 #include <sys/u8_textprep.h>
 #include <sys/fm/util.h>
 #include <sys/sunddi.h>
+#ifdef illumos
+#include <sys/cyclic.h>
+#else  /* FreeBSD */
+#include <sys/callout.h>
+#endif
 
 #include <machine/stdarg.h>
 

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h Mon Feb 
25 11:22:54 2013        (r247264)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h Mon Feb 
25 12:33:31 2013        (r247265)
@@ -246,12 +246,24 @@ typedef struct zinject_record {
        uint32_t        zi_iotype;
        int32_t         zi_duration;
        uint64_t        zi_timer;
+       uint32_t        zi_cmd;
+       uint32_t        zi_pad;
 } zinject_record_t;
 
 #define        ZINJECT_NULL            0x1
 #define        ZINJECT_FLUSH_ARC       0x2
 #define        ZINJECT_UNLOAD_SPA      0x4
 
+typedef enum zinject_type {
+       ZINJECT_UNINITIALIZED,
+       ZINJECT_DATA_FAULT,
+       ZINJECT_DEVICE_FAULT,
+       ZINJECT_LABEL_FAULT,
+       ZINJECT_IGNORED_WRITES,
+       ZINJECT_PANIC,
+       ZINJECT_DELAY_IO,
+} zinject_type_t;
+
 typedef struct zfs_share {
        uint64_t        z_exportdata;
        uint64_t        z_sharedata;

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h       Mon Feb 
25 11:22:54 2013        (r247264)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h       Mon Feb 
25 12:33:31 2013        (r247265)
@@ -443,6 +443,7 @@ struct zio {
 
        uint64_t        io_offset;
        uint64_t        io_deadline;
+       uint64_t        io_timestamp;
        avl_node_t      io_offset_node;
        avl_node_t      io_deadline_node;
        avl_tree_t      *io_vdev_tree;
@@ -596,6 +597,7 @@ extern int zio_handle_fault_injection(zi
 extern int zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error);
 extern int zio_handle_label_injection(zio_t *zio, int error);
 extern void zio_handle_ignored_writes(zio_t *zio);
+extern uint64_t zio_handle_io_delay(zio_t *zio);
 
 /*
  * Checksum ereport functions

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c  Mon Feb 25 
11:22:54 2013        (r247264)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c  Mon Feb 25 
12:33:31 2013        (r247265)
@@ -3173,3 +3173,41 @@ vdev_split(vdev_t *vd)
        }
        vdev_propagate_state(cvd);
 }
+
+void
+vdev_deadman(vdev_t *vd)
+{
+       for (int c = 0; c < vd->vdev_children; c++) {
+               vdev_t *cvd = vd->vdev_child[c];
+
+               vdev_deadman(cvd);
+       }
+
+       if (vd->vdev_ops->vdev_op_leaf) {
+               vdev_queue_t *vq = &vd->vdev_queue;
+
+               mutex_enter(&vq->vq_lock);
+               if (avl_numnodes(&vq->vq_pending_tree) > 0) {
+                       spa_t *spa = vd->vdev_spa;
+                       zio_t *fio;
+                       uint64_t delta;
+
+                       /*
+                        * Look at the head of all the pending queues,
+                        * if any I/O has been outstanding for longer than
+                        * the spa_deadman_synctime we panic the system.
+                        */
+                       fio = avl_first(&vq->vq_pending_tree);
+                       delta = ddi_get_lbolt64() - fio->io_timestamp;
+                       if (delta > NSEC_TO_TICK(spa_deadman_synctime(spa))) {
+                               zfs_dbgmsg("SLOW IO: zio timestamp %llu, "
+                                   "delta %llu, last io %llu",
+                                   fio->io_timestamp, delta,
+                                   vq->vq_io_complete_ts);
+                               fm_panic("I/O to pool '%s' appears to be "
+                                   "hung.", spa_name(spa));
+                       }
+               }
+               mutex_exit(&vq->vq_lock);
+       }
+}

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c    Mon Feb 
25 11:22:54 2013        (r247264)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c    Mon Feb 
25 12:33:31 2013        (r247265)
@@ -23,6 +23,10 @@
  * Use is subject to license terms.
  */
 
+/*
+ * Copyright (c) 2012 by Delphix. All rights reserved.
+ */
+
 #include <sys/zfs_context.h>
 #include <sys/vdev_impl.h>
 #include <sys/zio.h>
@@ -315,6 +319,7 @@ again:
                    zio_buf_alloc(size), size, fio->io_type, ZIO_PRIORITY_AGG,
                    flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE,
                    vdev_queue_agg_io_done, NULL);
+               aio->io_timestamp = fio->io_timestamp;
 
                nio = fio;
                do {
@@ -386,7 +391,8 @@ vdev_queue_io(zio_t *zio)
 
        mutex_enter(&vq->vq_lock);
 
-       zio->io_deadline = (ddi_get_lbolt64() >> zfs_vdev_time_shift) +
+       zio->io_timestamp = ddi_get_lbolt64();
+       zio->io_deadline = (zio->io_timestamp >> zfs_vdev_time_shift) +
            zio->io_priority;
 
        vdev_queue_io_add(vq, zio);
@@ -411,10 +417,16 @@ vdev_queue_io_done(zio_t *zio)
 {
        vdev_queue_t *vq = &zio->io_vd->vdev_queue;
 
+       if (zio_injection_enabled)
+               delay(SEC_TO_TICK(zio_handle_io_delay(zio)));
+
        mutex_enter(&vq->vq_lock);
 
        avl_remove(&vq->vq_pending_tree, zio);
 
+       vq->vq_io_complete_ts = ddi_get_lbolt64();
+       vq->vq_io_delta_ts = vq->vq_io_complete_ts - zio->io_timestamp;
+
        for (int i = 0; i < zfs_vdev_ramp_rate; i++) {
                zio_t *nio = vdev_queue_io_to_issue(vq, zfs_vdev_max_pending);
                if (nio == NULL)

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c    Mon Feb 
25 11:22:54 2013        (r247264)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c    Mon Feb 
25 12:33:31 2013        (r247265)
@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 /*
@@ -147,14 +148,8 @@ zio_handle_fault_injection(zio_t *zio, i
        for (handler = list_head(&inject_handlers); handler != NULL;
            handler = list_next(&inject_handlers, handler)) {
 
-               /* Ignore errors not destined for this pool */
-               if (zio->io_spa != handler->zi_spa)
-                       continue;
-
-               /* Ignore device errors and panic injection */
-               if (handler->zi_record.zi_guid != 0 ||
-                   handler->zi_record.zi_func[0] != '\0' ||
-                   handler->zi_record.zi_duration != 0)
+               if (zio->io_spa != handler->zi_spa ||
+                   handler->zi_record.zi_cmd != ZINJECT_DATA_FAULT)
                        continue;
 
                /* If this handler matches, return EIO */
@@ -197,10 +192,7 @@ zio_handle_label_injection(zio_t *zio, i
                uint64_t start = handler->zi_record.zi_start;
                uint64_t end = handler->zi_record.zi_end;
 
-               /* Ignore device only faults or panic injection */
-               if (handler->zi_record.zi_start == 0 ||
-                   handler->zi_record.zi_func[0] != '\0' ||
-                   handler->zi_record.zi_duration != 0)
+               if (handler->zi_record.zi_cmd != ZINJECT_LABEL_FAULT)
                        continue;
 
                /*
@@ -246,13 +238,7 @@ zio_handle_device_injection(vdev_t *vd, 
        for (handler = list_head(&inject_handlers); handler != NULL;
            handler = list_next(&inject_handlers, handler)) {
 
-               /*
-                * Ignore label specific faults, panic injection
-                * or fake writes
-                */
-               if (handler->zi_record.zi_start != 0 ||
-                   handler->zi_record.zi_func[0] != '\0' ||
-                   handler->zi_record.zi_duration != 0)
+               if (handler->zi_record.zi_cmd != ZINJECT_DEVICE_FAULT)
                        continue;
 
                if (vd->vdev_guid == handler->zi_record.zi_guid) {
@@ -316,10 +302,8 @@ zio_handle_ignored_writes(zio_t *zio)
            handler = list_next(&inject_handlers, handler)) {
 
                /* Ignore errors not destined for this pool */
-               if (zio->io_spa != handler->zi_spa)
-                       continue;
-
-               if (handler->zi_record.zi_duration == 0)
+               if (zio->io_spa != handler->zi_spa ||
+                   handler->zi_record.zi_cmd != ZINJECT_IGNORED_WRITES)
                        continue;
 
                /*
@@ -355,11 +339,8 @@ spa_handle_ignored_writes(spa_t *spa)
        for (handler = list_head(&inject_handlers); handler != NULL;
            handler = list_next(&inject_handlers, handler)) {
 
-               /* Ignore errors not destined for this pool */
-               if (spa != handler->zi_spa)
-                       continue;
-
-               if (handler->zi_record.zi_duration == 0)
+               if (spa != handler->zi_spa ||
+                   handler->zi_record.zi_cmd != ZINJECT_IGNORED_WRITES)
                        continue;
 
                if (handler->zi_record.zi_duration > 0) {
@@ -379,6 +360,34 @@ spa_handle_ignored_writes(spa_t *spa)
        rw_exit(&inject_lock);
 }
 
+uint64_t /* delay, in seconds, to inject for this zio; 0 when none applies */
+zio_handle_io_delay(zio_t *zio)
+{
+       vdev_t *vd = zio->io_vd; /* dereferenced below without a NULL check */
+       inject_handler_t *handler;
+       uint64_t seconds = 0; /* stays 0 if no handler matches */
+
+       if (zio_injection_enabled == 0) /* injection off: return before taking the lock */
+               return (0);
+
+       rw_enter(&inject_lock, RW_READER); /* the handler list is only read here */
+
+       for (handler = list_head(&inject_handlers); handler != NULL;
+           handler = list_next(&inject_handlers, handler)) {
+
+               if (handler->zi_record.zi_cmd != ZINJECT_DELAY_IO) /* skip other handler kinds */
+                       continue;
+
+               if (vd->vdev_guid == handler->zi_record.zi_guid) { /* match on vdev GUID */
+                       seconds = handler->zi_record.zi_timer;
+                       break; /* first matching handler wins */
+               }
+
+       }
+       rw_exit(&inject_lock);
+       return (seconds);
+}
+
 /*
  * Create a new handler for the given record.  We add it to the list, adding
  * a reference to the spa_t in the process.  We increment zio_injection_enabled,
_______________________________________________
svn-src-head@freebsd.org mailing list
http://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"

Reply via email to