Cyrill Gorcunov <gorcu...@virtuozzo.com> writes:

> From: Jan Kara <j...@suse.cz>
>
> The following race can lead to a loss of i_disksize update from truncate
> thus resulting in a wrong inode size if the inode size isn't updated
> again before inode is reclaimed:
>
> ext4_setattr()                                mpage_map_and_submit_extent()
>   EXT4_I(inode)->i_disksize = attr->ia_size;
>   ...                                   ...
>                                         disksize = ((loff_t)mpd->first_page) << PAGE_CACHE_SHIFT
>                                         /* False because i_size isn't
>                                          * updated yet */
>                                         if (disksize > i_size_read(inode))
>                                         /* True, because i_disksize is
>                                          * already truncated */
>                                         if (disksize > EXT4_I(inode)->i_disksize)
>                                           /* Overwrite i_disksize
>                                            * update from truncate */
>                                           ext4_update_i_disksize()
>   i_size_write(inode, attr->ia_size);
>
> For other places updating i_disksize such race cannot happen because
> i_mutex prevents these races. Writeback is the only place where we do
> not hold i_mutex and we cannot grab it there because of lock ordering.
>
> We fix the race by doing both i_disksize and i_size update in truncate
> atomically under i_data_sem and in mpage_map_and_submit_extent() we move
> the check against i_size under i_data_sem as well.
>
> gorcunov@:
>  - ML 90e775b71ac4e685898c7995756fe58c135adaa6
>  - https://jira.sw.ru/browse/PSBM-34383
>
> Signed-off-by: Jan Kara <j...@suse.cz>
> Signed-off-by: "Theodore Ts'o" <ty...@mit.edu>
> Signed-off-by: Cyrill Gorcunov <gorcu...@virtuozzo.com>
ACK
> ---
>  fs/ext4/ext4.h  |   24 ++++++++++++++++++++----
>  fs/ext4/inode.c |   15 ++++++++++++---
>  2 files changed, 32 insertions(+), 7 deletions(-)
>
> Index: linux-pcs7.git/fs/ext4/ext4.h
> ===================================================================
> --- linux-pcs7.git.orig/fs/ext4/ext4.h
> +++ linux-pcs7.git/fs/ext4/ext4.h
> @@ -2400,16 +2400,32 @@ do {                                                  \
>  #define EXT4_FREECLUSTERS_WATERMARK 0
>  #endif
>  
> +/* Update i_disksize. Requires i_mutex to avoid races with truncate */
>  static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize)
>  {
> -     /*
> -      * XXX: replace with spinlock if seen contended -bzzz
> -      */
> +     WARN_ON_ONCE(S_ISREG(inode->i_mode) &&
> +                  !mutex_is_locked(&inode->i_mutex));
>       down_write(&EXT4_I(inode)->i_data_sem);
>       if (newsize > EXT4_I(inode)->i_disksize)
>               EXT4_I(inode)->i_disksize = newsize;
>       up_write(&EXT4_I(inode)->i_data_sem);
> -     return ;
> +}
> +
> +/*
> + * Update i_disksize after writeback has been started. Races with truncate
> + * are avoided by checking i_size under i_data_sem.
> + */
> +static inline void ext4_wb_update_i_disksize(struct inode *inode, loff_t newsize)
> +{
> +     loff_t i_size;
> +
> +     down_write(&EXT4_I(inode)->i_data_sem);
> +     i_size = i_size_read(inode);
> +     if (newsize > i_size)
> +             newsize = i_size;
> +     if (newsize > EXT4_I(inode)->i_disksize)
> +             EXT4_I(inode)->i_disksize = newsize;
> +     up_write(&EXT4_I(inode)->i_data_sem);
>  }
>  
>  struct ext4_group_info {
> Index: linux-pcs7.git/fs/ext4/inode.c
> ===================================================================
> --- linux-pcs7.git.orig/fs/ext4/inode.c
> +++ linux-pcs7.git/fs/ext4/inode.c
> @@ -1788,7 +1788,7 @@ static void mpage_da_map_and_submit(stru
>       if (disksize > i_size_read(mpd->inode))
>               disksize = i_size_read(mpd->inode);
>       if (disksize > EXT4_I(mpd->inode)->i_disksize) {
> -             ext4_update_i_disksize(mpd->inode, disksize);
> +             ext4_wb_update_i_disksize(mpd->inode, disksize);
>               err = ext4_mark_inode_dirty(handle, mpd->inode);
>               if (err)
>                       ext4_error(mpd->inode->i_sb,
> @@ -4831,18 +4831,27 @@ int ext4_setattr(struct dentry *dentry,
>                               error = ext4_orphan_add(handle, inode);
>                               orphan = 1;
>                       }
> +                     down_write(&EXT4_I(inode)->i_data_sem);
>                       EXT4_I(inode)->i_disksize = attr->ia_size;
>                       rc = ext4_mark_inode_dirty(handle, inode);
>                       if (!error)
>                               error = rc;
> +                     /*
> +                      * We have to update i_size under i_data_sem together
> +                      * with i_disksize to avoid races with writeback code
> +                      * running ext4_wb_update_i_disksize().
> +                      */
> +                     if (!error)
> +                             i_size_write(inode, attr->ia_size);
> +                     up_write(&EXT4_I(inode)->i_data_sem);
>                       ext4_journal_stop(handle);
>                       if (error) {
>                               ext4_orphan_del(NULL, inode);
>                               goto err_out;
>                       }
> -             }
> +             } else
> +                     i_size_write(inode, attr->ia_size);
>  
> -             i_size_write(inode, attr->ia_size);
>               /*
>                * Blocks are going to be removed from the inode. Wait
>                * for dio in flight.  Temporarily disable

Attachment: signature.asc
Description: PGP signature

_______________________________________________
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

Reply via email to