Write and flush errors are treated as critical errors: when one
occurs, the device is brought offline and marked as failed. New
write and flush errors are detected using the device error
statistics.

Signed-off-by: Anand Jain <anand.j...@oracle.com>
---
 fs/btrfs/disk-io.c | 43 +++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/volumes.c |  1 +
 fs/btrfs/volumes.h |  4 ++++
 3 files changed, 48 insertions(+)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index d10ef2e..38e0385 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1836,6 +1836,47 @@ sleep:
        return 0;
 }
 
+static void btrfs_check_devices(struct btrfs_fs_devices *fs_devices)
+{
+       struct btrfs_fs_info *fs_info = fs_devices->fs_info;
+       struct btrfs_device *device;
+
+       if (btrfs_fs_closing(fs_info))
+               return;
+
+       /* Fail devices with new write/flush errors since the last check */
+       mutex_lock(&fs_info->volume_mutex);
+       list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) {
+               int c_err;
+
+               /*
+                * TODO: write/flush errors on the dev-replace target are
+                * not handled yet, so skip that device for now.
+                */
+               if (device->is_tgtdev_for_dev_replace)
+                       continue;
+
+               if (!device->dev_stats_valid)
+                       continue;
+
+               c_err = atomic_read(&device->new_critical_errs);
+               atomic_sub(c_err, &device->new_critical_errs);
+               if (c_err) {
+                       rcu_read_lock();
+                       btrfs_warn(fs_info,
+                               "new write errors on device %s",
+                                       rcu_str_deref(device->name));
+                       rcu_read_unlock();
+
+                       /* force-close the device and mark it as failed */
+                       btrfs_force_device_close(device, "failed");
+               }
+       }
+       mutex_unlock(&fs_info->volume_mutex);
+
+       return;
+}
+
 static int transaction_kthread(void *arg)
 {
        struct btrfs_root *root = arg;
@@ -1882,6 +1923,8 @@ static int transaction_kthread(void *arg)
                        btrfs_end_transaction(trans, root);
                }
 sleep:
+               btrfs_check_devices(root->fs_info->fs_devices);
+
                wake_up_process(root->fs_info->cleaner_kthread);
                mutex_unlock(&root->fs_info->transaction_kthread_mutex);
 
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 7492733..b52197b 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -157,6 +157,7 @@ static struct btrfs_device *__alloc_device(void)
        spin_lock_init(&dev->reada_lock);
        atomic_set(&dev->reada_in_flight, 0);
        atomic_set(&dev->dev_stats_ccnt, 0);
+       atomic_set(&dev->new_critical_errs, 0);
        INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_WAIT);
        INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_WAIT);
 
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 1c6107a..827371e 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -167,6 +167,7 @@ struct btrfs_device {
        /* Counter to record the change of device stats */
        atomic_t dev_stats_ccnt;
        atomic_t dev_stat_values[BTRFS_DEV_STAT_VALUES_MAX];
+       atomic_t new_critical_errs;
 };
 
 /*
@@ -518,6 +519,9 @@ static inline void btrfs_dev_stat_inc(struct btrfs_device *dev,
        atomic_inc(dev->dev_stat_values + index);
        smp_mb__before_atomic();
        atomic_inc(&dev->dev_stats_ccnt);
+       if (index == BTRFS_DEV_STAT_WRITE_ERRS ||
+               index == BTRFS_DEV_STAT_FLUSH_ERRS)
+               atomic_inc(&dev->new_critical_errs);
 }
 
 static inline int btrfs_dev_stat_read(struct btrfs_device *dev,
-- 
2.4.1

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to