Oh, sorry, there are some formatting problems in this patch...
Let me resend a new one.

On Thu, 2015-01-15 at 16:53 +0800, Gui Hecheng wrote:
> There is a global list @fs_uuids that keeps an @fs_devices object
> for each created btrfs. But when a btrfs becomes "empty"
> (all devices belonging to it are gone), its @fs_devices remains
> in the @fs_uuids list until module exit.
> If we keep running mkfs.btrfs on the same device again and again,
> all the empty @fs_devices produced are sure to eat up our memory.
> So this case had better be prevented.
> 
> I think that each time we set up btrfs on a device, we should
> check whether we are stealing that device from another btrfs
> seen before. To facilitate the search procedure, we could insert
> all @btrfs_device in a rb_root, one @btrfs_device per physical
> device, with @bdev->bd_dev as key. Each time device stealing happens,
> we should replace the corresponding @btrfs_device in the rb_root with
> an up-to-date version.
> If the stolen device is the last device in its @fs_devices,
> then we have an empty btrfs to be deleted.
> 
> Actually there are 3 ways to steal devices and end up with an empty btrfs:
>         1. mkfs, with -f option
>         2. device add, with -f option
>         3. device replace, with -f option
> We should handle all of these cases.
> 
> Moreover, there are special cases to consider:
> o If there are seed devices, then it is ensured that
>   the devices in cloned @fs_devices are not treated as valid devices.
> o If a device disappears and reappears without being modified, its
>   @bdev->bd_dev may change, so we have to re-insert it into the rb_root.
> 
> Signed-off-by: Gui Hecheng <guihc.f...@cn.fujitsu.com>
> ---
> changelog
>         v1->v2: add handling for the device disappear-and-reappear event
> 
>       *Note*
>       Actually this handles the case when a device disappears and
>       reappears without being modified.
>       We are going to recycle all "dead" btrfs_device objects in another patch.
>       Two events lead to "dead" devices:
>               1) device disappears and never returns again
>               2) device disappears and returns with a new fs on it
>       A shrinker shall kill the "dead" ones.
> ---
>  fs/btrfs/super.c   |   1 +
>  fs/btrfs/volumes.c | 281 
> ++++++++++++++++++++++++++++++++++++++++++-----------
>  fs/btrfs/volumes.h |   6 ++
>  3 files changed, 230 insertions(+), 58 deletions(-)
> 
> diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
> index 60f7cbe..001cba5 100644
> --- a/fs/btrfs/super.c
> +++ b/fs/btrfs/super.c
> @@ -2184,6 +2184,7 @@ static void __exit exit_btrfs_fs(void)
>       btrfs_end_io_wq_exit();
>       unregister_filesystem(&btrfs_fs_type);
>       btrfs_exit_sysfs();
> +     btrfs_cleanup_valid_dev_root();
>       btrfs_cleanup_fs_uuids();
>       btrfs_exit_compress();
>       btrfs_hash_exit();
> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
> index 0144790..228a7e0 100644
> --- a/fs/btrfs/volumes.c
> +++ b/fs/btrfs/volumes.c
> @@ -27,6 +27,7 @@
>  #include <linux/kthread.h>
>  #include <linux/raid/pq.h>
>  #include <linux/semaphore.h>
> +#include <linux/rbtree.h>
>  #include <asm/div64.h>
>  #include "ctree.h"
>  #include "extent_map.h"
> @@ -52,6 +53,126 @@ static void btrfs_dev_stat_print_on_load(struct 
> btrfs_device *device);
>  
>  DEFINE_MUTEX(uuid_mutex);
>  static LIST_HEAD(fs_uuids);
> +static struct rb_root valid_dev_root = RB_ROOT;
> +
> +static struct btrfs_device *insert_valid_device(struct btrfs_device *new_dev)
> +{
> +     struct rb_node **p;
> +     struct rb_node *parent;
> +     struct rb_node *new;
> +     struct btrfs_device *old_dev;
> +
> +     WARN_ON(!mutex_is_locked(&uuid_mutex));
> +
> +     parent = NULL;
> +     new = &new_dev->rb_node;
> +
> +     p = &valid_dev_root.rb_node;
> +     while (*p) {
> +             parent = *p;
> +             old_dev = rb_entry(parent, struct btrfs_device, rb_node);
> +
> +             if (new_dev->devnum < old_dev->devnum)
> +                     p = &parent->rb_left;
> +             else if (new_dev->devnum > old_dev->devnum)
> +                     p = &parent->rb_right;
> +             else {
> +                     rb_replace_node(parent, new, &valid_dev_root);
> +                     RB_CLEAR_NODE(parent);
> +
> +                     goto out;
> +             }
> +     }
> +
> +     old_dev = NULL;
> +     rb_link_node(new, parent, p);
> +     rb_insert_color(new, &valid_dev_root);
> +
> +out:
> +     return old_dev;
> +}
> +
> +static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
> +{
> +     struct btrfs_device *device;
> +     WARN_ON(fs_devices->opened);
> +     while (!list_empty(&fs_devices->devices)) {
> +             device = list_entry(fs_devices->devices.next,
> +                                 struct btrfs_device, dev_list);
> +             list_del(&device->dev_list);
> +             rcu_string_free(device->name);
> +             kfree(device);
> +     }
> +     kfree(fs_devices);
> +}
> +
> +static void remove_empty_fs_if_need(struct btrfs_fs_devices *old_fs)
> +{
> +     struct btrfs_fs_devices *seed_fs;
> +
> +     if (!list_empty(&old_fs->devices))
> +             return;
> +
> +     list_del(&old_fs->list);
> +
> +     /* free the seed clones */
> +     seed_fs = old_fs->seed;
> +     free_fs_devices(old_fs);
> +     while (seed_fs) {
> +             old_fs = seed_fs;
> +             seed_fs = seed_fs->seed;
> +             free_fs_devices(old_fs);
> +     }
> +
> +}
> +
> +static void free_invalid_device(struct btrfs_device *invalid_dev)
> +{
> +     struct btrfs_fs_devices *old_fs;
> +
> +     old_fs = invalid_dev->fs_devices;
> +     mutex_lock(&old_fs->device_list_mutex);
> +     list_del(&invalid_dev->dev_list);
> +     rcu_string_free(invalid_dev->name);
> +     kfree(invalid_dev);
> +     mutex_unlock(&old_fs->device_list_mutex);
> +
> +     remove_empty_fs_if_need(old_fs);
> +}
> +
> +static void replace_invalid_device(struct btrfs_device *new_dev)
> +{
> +     struct btrfs_device *invalid_dev;
> +
> +     WARN_ON(!mutex_is_locked(&uuid_mutex));
> +
> +     invalid_dev = insert_valid_device(new_dev);
> +     if (!invalid_dev)
> +             return;
> +
> +     free_invalid_device(invalid_dev);
> +}
> +
> +static void remove_valid_device(struct btrfs_device *old_dev)
> +{
> +     WARN_ON(!mutex_is_locked(&uuid_mutex));
> +
> +     if (!RB_EMPTY_NODE(&old_dev->rb_node)) {
> +             rb_erase(&old_dev->rb_node, &valid_dev_root);
> +             RB_CLEAR_NODE(&old_dev->rb_node);
> +     }
> +}
> +
> +void btrfs_cleanup_valid_dev_root(void)
> +{
> +     struct rb_node *rb_node;
> +
> +     rb_node = rb_first(&valid_dev_root);
> +     while (rb_node) {
> +             rb_erase(rb_node, &valid_dev_root);
> +             rb_node = rb_first(&valid_dev_root);
> +     }
> +}
>  
>  static struct btrfs_fs_devices *__alloc_fs_devices(void)
>  {
> @@ -96,20 +217,6 @@ static struct btrfs_fs_devices *alloc_fs_devices(const u8 
> *fsid)
>       return fs_devs;
>  }
>  
> -static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
> -{
> -     struct btrfs_device *device;
> -     WARN_ON(fs_devices->opened);
> -     while (!list_empty(&fs_devices->devices)) {
> -             device = list_entry(fs_devices->devices.next,
> -                                 struct btrfs_device, dev_list);
> -             list_del(&device->dev_list);
> -             rcu_string_free(device->name);
> -             kfree(device);
> -     }
> -     kfree(fs_devices);
> -}
> -
>  static void btrfs_kobject_uevent(struct block_device *bdev,
>                                enum kobject_action action)
>  {
> @@ -155,6 +262,8 @@ static struct btrfs_device *__alloc_device(void)
>       INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_WAIT);
>       INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_WAIT);
>  
> +     RB_CLEAR_NODE(&dev->rb_node);
> +
>       return dev;
>  }
>  
> @@ -451,7 +560,7 @@ static void pending_bios_fn(struct btrfs_work *work)
>   * < 0 - error
>   */
>  static noinline int device_list_add(const char *path,
> -                        struct btrfs_super_block *disk_super,
> +                        struct btrfs_super_block *disk_super, dev_t devnum,
>                          u64 devid, struct btrfs_fs_devices **fs_devices_ret)
>  {
>       struct btrfs_device *device;
> @@ -499,53 +608,65 @@ static noinline int device_list_add(const char *path,
>  
>               ret = 1;
>               device->fs_devices = fs_devices;
> -     } else if (!device->name || strcmp(device->name->str, path)) {
> -             /*
> -              * When FS is already mounted.
> -              * 1. If you are here and if the device->name is NULL that
> -              *    means this device was missing at time of FS mount.
> -              * 2. If you are here and if the device->name is different
> -              *    from 'path' that means either
> -              *      a. The same device disappeared and reappeared with
> -              *         different name. or
> -              *      b. The missing-disk-which-was-replaced, has
> -              *         reappeared now.
> -              *
> -              * We must allow 1 and 2a above. But 2b would be a spurious
> -              * and unintentional.
> -              *
> -              * Further in case of 1 and 2a above, the disk at 'path'
> -              * would have missed some transaction when it was away and
> -              * in case of 2a the stale bdev has to be updated as well.
> -              * 2b must not be allowed at all time.
> -              */
> +             device->devnum = devnum;
> +             replace_invalid_device(device);
> +     } else {
> +             if (!device->name || strcmp(device->name->str, path)) {
> +                     /*
> +                      * When FS is already mounted.
> +                      * 1. If you are here and if the device->name is NULL 
> that
> +                      *    means this device was missing at time of FS mount.
> +                      * 2. If you are here and if the device->name is 
> different
> +                      *    from 'path' that means either
> +                      *      a. The same device disappeared and reappeared 
> with
> +                      *         different name. or
> +                      *      b. The missing-disk-which-was-replaced, has
> +                      *         reappeared now.
> +                      *
> +                      * We must allow 1 and 2a above. But 2b would be a 
> spurious
> +                      * and unintentional.
> +                      *
> +                      * Further in case of 1 and 2a above, the disk at 'path'
> +                      * would have missed some transaction when it was away 
> and
> +                      * in case of 2a the stale bdev has to be updated as 
> well.
> +                      * 2b must not be allowed at all time.
> +                      */
>  
> -             /*
> -              * For now, we do allow update to btrfs_fs_device through the
> -              * btrfs dev scan cli after FS has been mounted.  We're still
> -              * tracking a problem where systems fail mount by subvolume id
> -              * when we reject replacement on a mounted FS.
> -              */
> -             if (!fs_devices->opened && found_transid < device->generation) {
>                       /*
> -                      * That is if the FS is _not_ mounted and if you
> -                      * are here, that means there is more than one
> -                      * disk with same uuid and devid.We keep the one
> -                      * with larger generation number or the last-in if
> -                      * generation are equal.
> +                      * For now, we do allow update to btrfs_fs_device 
> through the
> +                      * btrfs dev scan cli after FS has been mounted.  We're 
> still
> +                      * tracking a problem where systems fail mount by 
> subvolume id
> +                      * when we reject replacement on a mounted FS.
>                        */
> -                     return -EEXIST;
> -             }
> +                     if (!fs_devices->opened && found_transid < 
> device->generation) {
> +                             /*
> +                              * That is if the FS is _not_ mounted and if you
> +                              * are here, that means there is more than one
> +                              * disk with same uuid and devid.We keep the one
> +                              * with larger generation number or the last-in 
> if
> +                              * generation are equal.
> +                              */
> +                             return -EEXIST;
> +                     }
>  
> -             name = rcu_string_strdup(path, GFP_NOFS);
> -             if (!name)
> -                     return -ENOMEM;
> -             rcu_string_free(device->name);
> -             rcu_assign_pointer(device->name, name);
> -             if (device->missing) {
> -                     fs_devices->missing_devices--;
> -                     device->missing = 0;
> +                     name = rcu_string_strdup(path, GFP_NOFS);
> +                     if (!name)
> +                             return -ENOMEM;
> +                     rcu_string_free(device->name);
> +                     rcu_assign_pointer(device->name, name);
> +                     if (device->missing) {
> +                             fs_devices->missing_devices--;
> +                             device->missing = 0;
> +                     }
>               }
> +
> +             /*
> +              * device may reappear with new devnum,
> +              * re-insert to keep it up-to-date
> +              */
> +             rb_erase(&device->rb_node, &valid_dev_root);
> +             device->devnum = devnum;
> +             insert_valid_device(device);
>       }
>  
>       /*
> @@ -599,6 +720,7 @@ static struct btrfs_fs_devices *clone_fs_devices(struct 
> btrfs_fs_devices *orig)
>  
>               list_add(&device->dev_list, &fs_devices->devices);
>               device->fs_devices = fs_devices;
> +             device->devnum = orig_dev->devnum;
>               fs_devices->num_devices++;
>       }
>       mutex_unlock(&orig->device_list_mutex);
> @@ -609,6 +731,15 @@ error:
>       return ERR_PTR(-ENOMEM);
>  }
>  
> +/*
> + * If @fs_devices is not in global list @fs_uuids,
> + * then it is a cloned btrfs_fs_devices for seeding
> + */
> +static int is_cloned_fs_devices(struct btrfs_fs_devices *fs_devices)
> +{
> +     return list_empty(&fs_devices->list);
> +}
> +
>  void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info,
>                              struct btrfs_fs_devices *fs_devices, int step)
>  {
> @@ -655,6 +786,10 @@ again:
>                               fs_devices->rw_devices--;
>               }
>               list_del_init(&device->dev_list);
> +
> +             /* skip cloned fs_devices which act as seed devices*/
> +             if (!is_cloned_fs_devices(fs_devices))
> +                     remove_valid_device(device);
>               fs_devices->num_devices--;
>               rcu_string_free(device->name);
>               kfree(device);
> @@ -730,6 +865,11 @@ static int __btrfs_close_devices(struct btrfs_fs_devices 
> *fs_devices)
>  
>               list_replace_rcu(&device->dev_list, &new_device->dev_list);
>               new_device->fs_devices = device->fs_devices;
> +             new_device->devnum = device->devnum;
> +
> +             /* skip cloned fs_devices which act as seed devices*/
> +             if (!is_cloned_fs_devices(device->fs_devices))
> +                     insert_valid_device(new_device);
>  
>               call_rcu(&device->rcu, free_device);
>       }
> @@ -942,7 +1082,8 @@ int btrfs_scan_one_device(const char *path, fmode_t 
> flags, void *holder,
>       transid = btrfs_super_generation(disk_super);
>       total_devices = btrfs_super_num_devices(disk_super);
>  
> -     ret = device_list_add(path, disk_super, devid, fs_devices_ret);
> +     ret = device_list_add(path, disk_super, bdev->bd_dev,
> +                             devid, fs_devices_ret);
>       if (ret > 0) {
>               if (disk_super->label[0]) {
>                       if (disk_super->label[BTRFS_LABEL_SIZE - 1])
> @@ -1678,6 +1819,7 @@ int btrfs_rm_device(struct btrfs_root *root, char 
> *device_path)
>        */
>  
>       cur_devices = device->fs_devices;
> +     remove_valid_device(device);
>       mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
>       list_del_rcu(&device->dev_list);
>  
> @@ -1825,6 +1967,8 @@ void btrfs_rm_dev_replace_remove_srcdev(struct 
> btrfs_fs_info *fs_info,
>  
>       if (srcdev->bdev)
>               fs_devices->open_devices--;
> +
> +     remove_valid_device(srcdev);
>  }
>  
>  void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info,
> @@ -1879,6 +2023,7 @@ void btrfs_destroy_dev_replace_tgtdev(struct 
> btrfs_fs_info *fs_info,
>       if (tgtdev->bdev == fs_info->fs_devices->latest_bdev)
>               fs_info->fs_devices->latest_bdev = next_device->bdev;
>       list_del_rcu(&tgtdev->dev_list);
> +     remove_valid_device(tgtdev);
>  
>       call_rcu(&tgtdev->rcu, free_device);
>  
> @@ -1971,12 +2116,22 @@ static int btrfs_prepare_sprout(struct btrfs_root 
> *root)
>               return PTR_ERR(old_devices);
>       }
>  
> +     /*
> +      * Here @old_devices represents the fs_devices that will be linked
> +      * into fs_uuids, so the devices in it should be valid.
> +      * All devices in @fs_devices will be moved into @seed_devices,
> +      * where they just act as clones. So replace those clones, which sit
> +      * in @valid_dev_root for now, with the valid devices in @old_devices.
> +      */
> +     list_for_each_entry(device, &old_devices->devices, dev_list)
> +             insert_valid_device(device);
>       list_add(&old_devices->list, &fs_uuids);
>  
>       memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
>       seed_devices->opened = 1;
>       INIT_LIST_HEAD(&seed_devices->devices);
>       INIT_LIST_HEAD(&seed_devices->alloc_list);
> +     INIT_LIST_HEAD(&seed_devices->list);
>       mutex_init(&seed_devices->device_list_mutex);
>  
>       mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
> @@ -2174,6 +2329,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char 
> *device_path)
>       }
>  
>       device->fs_devices = root->fs_info->fs_devices;
> +     device->devnum = bdev->bd_dev;
>  
>       mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
>       lock_chunks(root);
> @@ -2273,6 +2429,10 @@ int btrfs_init_new_device(struct btrfs_root *root, 
> char *device_path)
>               ret = btrfs_commit_transaction(trans, root);
>       }
>  
> +     mutex_lock(&uuid_mutex);
> +     replace_invalid_device(device);
> +     mutex_unlock(&uuid_mutex);
> +
>       /* Update ctime/mtime for libblkid */
>       update_dev_time(device_path);
>       return ret;
> @@ -2374,11 +2534,16 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_root 
> *root, char *device_path,
>       device->dev_stats_valid = 1;
>       set_blocksize(device->bdev, 4096);
>       device->fs_devices = fs_info->fs_devices;
> +     device->devnum = bdev->bd_dev;
>       list_add(&device->dev_list, &fs_info->fs_devices->devices);
>       fs_info->fs_devices->num_devices++;
>       fs_info->fs_devices->open_devices++;
>       mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
>  
> +     mutex_lock(&uuid_mutex);
> +     replace_invalid_device(device);
> +     mutex_unlock(&uuid_mutex);
> +
>       *device_out = device;
>       return ret;
>  
> diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
> index d6fe73c..7f5c7ea 100644
> --- a/fs/btrfs/volumes.h
> +++ b/fs/btrfs/volumes.h
> @@ -80,6 +80,11 @@ struct btrfs_device {
>       seqcount_t data_seqcount;
>  #endif
>  
> +     struct rb_node rb_node;
> +
> +     /* node key in valid_dev_root */
> +     dev_t devnum;
> +
>       /* the internal btrfs device id */
>       u64 devid;
>  
> @@ -426,6 +431,7 @@ struct btrfs_device *btrfs_alloc_device(struct 
> btrfs_fs_info *fs_info,
>                                       const u64 *devid,
>                                       const u8 *uuid);
>  int btrfs_rm_device(struct btrfs_root *root, char *device_path);
> +void btrfs_cleanup_valid_dev_root(void);
>  void btrfs_cleanup_fs_uuids(void);
>  int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len);
>  int btrfs_grow_device(struct btrfs_trans_handle *trans,


--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to