On Tue, Jun 06, 2017 at 04:45:31PM -0700, Omar Sandoval wrote: > From: Omar Sandoval <osan...@fb.com> > > The total_bytes_pinned counter is completely broken when accounting > delayed refs: > > - If two drops for the same extent are merged, we will decrement > total_bytes_pinned twice but only increment it once. > - If an add is merged into a drop or vice versa, we will decrement the > total_bytes_pinned counter but never increment it. > - If multiple references to an extent are dropped, we will account it > multiple times, potentially vastly over-estimating the number of bytes > that will be freed by a commit and doing unnecessary work when we're > close to ENOSPC. > > The last issue is relatively minor, but the first two make the > total_bytes_pinned counter leak or underflow very often. These > accounting issues were introduced in b150a4f10d87 ("Btrfs: use a percpu > to keep track of possibly pinned bytes"), but they were papered over by > zeroing out the counter on every commit until d288db5dc011 ("Btrfs: fix > race of using total_bytes_pinned"). > > We need to make sure that an extent is accounted as pinned exactly once > if and only if we will drop references to it when when the transaction > is committed. Ideally we would only add to total_bytes_pinned when the > *last* reference is dropped, but this information isn't readily > available for data extents. Again, this over-estimation can lead to > extra commits when we're close to ENOSPC, but it's not as bad as before. > > The fix implemented here is to increment total_bytes_pinned when the > total refmod count for an extent goes negative and decrement it if the > refmod count goes back to non-negative or after we've run all of the > delayed refs for that extent. >
The patch could be cleaner if we inc/dec %pinned inside delayed_ref.c. The idea looks good to me. Reviewed-by: Liu Bo <bo.li....@oracle.com> -liubo > Signed-off-by: Omar Sandoval <osan...@fb.com> > --- > fs/btrfs/extent-tree.c | 41 ++++++++++++++++++++++++++++++++--------- > 1 file changed, 32 insertions(+), 9 deletions(-) > > diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c > index 6dce7abafe84..75ad24f8d253 100644 > --- a/fs/btrfs/extent-tree.c > +++ b/fs/btrfs/extent-tree.c > @@ -2112,6 +2112,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle > *trans, > u64 bytenr, u64 num_bytes, u64 parent, > u64 root_objectid, u64 owner, u64 offset) > { > + int old_ref_mod, new_ref_mod; > int ret; > > BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID && > @@ -2122,14 +2123,18 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle > *trans, > num_bytes, parent, > root_objectid, (int)owner, > BTRFS_ADD_DELAYED_REF, NULL, > - NULL, NULL); > + &old_ref_mod, &new_ref_mod); > } else { > ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, > num_bytes, parent, > root_objectid, owner, offset, > - 0, BTRFS_ADD_DELAYED_REF, NULL, > - NULL); > + 0, BTRFS_ADD_DELAYED_REF, > + &old_ref_mod, &new_ref_mod); > } > + > + if (ret == 0 && old_ref_mod < 0 && new_ref_mod >= 0) > + add_pinned_bytes(fs_info, -num_bytes, owner, root_objectid); > + > return ret; > } > > @@ -2433,6 +2438,16 @@ static int run_one_delayed_ref(struct > btrfs_trans_handle *trans, > head = btrfs_delayed_node_to_head(node); > trace_run_delayed_ref_head(fs_info, node, head, node->action); > > + if (head->total_ref_mod < 0) { > + struct btrfs_block_group_cache *cache; > + > + cache = btrfs_lookup_block_group(fs_info, node->bytenr); > + ASSERT(cache); > + > percpu_counter_add(&cache->space_info->total_bytes_pinned, > + -node->num_bytes); > + btrfs_put_block_group(cache); > + } > + > if (insert_reserved) { > btrfs_pin_extent(fs_info, node->bytenr, > node->num_bytes, 1); > @@ -6269,6 +6284,8 @@ static int update_block_group(struct btrfs_trans_handle > *trans, > trace_btrfs_space_reservation(info, "pinned", > cache->space_info->flags, > num_bytes, 1); > + > percpu_counter_add(&cache->space_info->total_bytes_pinned, > + num_bytes); > set_extent_dirty(info->pinned_extents, > bytenr, bytenr + num_bytes - 1, > GFP_NOFS | __GFP_NOFAIL); > @@ -7038,8 +7055,6 @@ static int __btrfs_free_extent(struct > btrfs_trans_handle *trans, > goto out; > } > } > - add_pinned_bytes(info, -num_bytes, owner_objectid, > - root_objectid); > } else { > if (found_extent) { > BUG_ON(is_data && refs_to_drop != > @@ -7171,13 +7186,16 @@ void btrfs_free_tree_block(struct btrfs_trans_handle > *trans, > int ret; > > if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { > + int old_ref_mod, new_ref_mod; > + > ret = btrfs_add_delayed_tree_ref(fs_info, trans, buf->start, > buf->len, parent, > root->root_key.objectid, > btrfs_header_level(buf), > BTRFS_DROP_DELAYED_REF, NULL, > - NULL, NULL); > + &old_ref_mod, &new_ref_mod); > BUG_ON(ret); /* -ENOMEM */ > + pin = old_ref_mod >= 0 && new_ref_mod < 0; > } > > if (last_ref && btrfs_header_generation(buf) == trans->transid) { > @@ -7226,12 +7244,12 @@ int btrfs_free_extent(struct btrfs_trans_handle > *trans, > u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, > u64 owner, u64 offset) > { > + int old_ref_mod, new_ref_mod; > int ret; > > if (btrfs_is_testing(fs_info)) > return 0; > > - add_pinned_bytes(fs_info, num_bytes, owner, root_objectid); > > /* > * tree log blocks never actually go into the extent allocation > @@ -7241,20 +7259,25 @@ int btrfs_free_extent(struct btrfs_trans_handle > *trans, > WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID); > /* unlocks the pinned mutex */ > btrfs_pin_extent(fs_info, bytenr, num_bytes, 1); > + old_ref_mod = new_ref_mod = 0; > ret = 0; > } else if (owner < BTRFS_FIRST_FREE_OBJECTID) { > ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, > num_bytes, parent, > root_objectid, (int)owner, > BTRFS_DROP_DELAYED_REF, NULL, > - NULL, NULL); > + &old_ref_mod, &new_ref_mod); > } else { > ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, > num_bytes, parent, > root_objectid, owner, offset, > 0, BTRFS_DROP_DELAYED_REF, > - NULL, NULL); > + &old_ref_mod, &new_ref_mod); > } > + > + if (ret == 0 && old_ref_mod >= 0 && new_ref_mod < 0) > + add_pinned_bytes(fs_info, num_bytes, owner, root_objectid); > + > return ret; > } > > -- > 2.13.0 > -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html