[PATCH v2 1/4] Btrfs: fix wrong send_in_progress accounting

2014-01-07 Thread Wang Shilong
Steps to reproduce:
 # mkfs.btrfs -f /dev/sda8
 # mount /dev/sda8 /mnt
 # btrfs sub snapshot -r /mnt /mnt/snap1
 # btrfs sub snapshot -r /mnt /mnt/snap2
 # btrfs send /mnt/snap1 -p /mnt/snap2 -f /mnt/1
 # dmesg

The problem is that we sort the clone roots (which include @send_root); the
sort may move @send_root earlier in the array, so its @send_in_progress
counter ends up being decreased twice.
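
To make the failure mode concrete, here is a minimal userspace sketch
(hypothetical helper names, not the kernel code) of how the cleanup path ends
up decrementing @send_root twice once the sort has placed it inside the
clone-roots array:

#include <assert.h>

struct root { int send_in_progress; };

void dec_send_in_progress(struct root *r)
{
	r->send_in_progress--;
}

void cleanup(struct root **clone_roots, int n, struct root *send_root)
{
	int i;

	/* after sorting, clone_roots[] may already contain send_root ... */
	for (i = 0; i < n; i++)
		dec_send_in_progress(clone_roots[i]);
	/* ... so this unconditional decrement hits send_root a second time */
	dec_send_in_progress(send_root);
}

int main(void)
{
	struct root send_root = { .send_in_progress = 1 };
	struct root *clone_roots[] = { &send_root };

	cleanup(clone_roots, 1, &send_root);
	assert(send_root.send_in_progress == -1);	/* decremented twice */
	return 0;
}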

Cc: David Sterba dste...@suse.cz
Signed-off-by: Wang Shilong wangsl.f...@cn.fujitsu.com
---
Changelog v1->v2:
take the right approach to fix the problem (thanks to David).
---
 fs/btrfs/send.c | 16 +---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index bff0b1a..5b69785 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -4752,6 +4752,7 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
 	u32 i;
 	u64 *clone_sources_tmp = NULL;
 	int clone_sources_to_rollback = 0;
+	int sort_clone_roots = 0;
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
@@ -4942,6 +4943,7 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
 	sort(sctx->clone_roots, sctx->clone_roots_cnt,
 			sizeof(*sctx->clone_roots), __clone_root_cmp_sort,
 			NULL);
+	sort_clone_roots = 1;
 
 	ret = send_subvol(sctx);
 	if (ret < 0)
@@ -4957,11 +4959,19 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
 	}
 
 out:
-	for (i = 0; sctx && i < clone_sources_to_rollback; i++)
-		btrfs_root_dec_send_in_progress(sctx->clone_roots[i].root);
+	if (sort_clone_roots) {
+		for (i = 0; i < sctx->clone_roots_cnt; i++)
+			btrfs_root_dec_send_in_progress(
+					sctx->clone_roots[i].root);
+	} else {
+		for (i = 0; sctx && i < clone_sources_to_rollback; i++)
+			btrfs_root_dec_send_in_progress(
+					sctx->clone_roots[i].root);
+
+		btrfs_root_dec_send_in_progress(send_root);
+	}
 	if (sctx && !IS_ERR_OR_NULL(sctx->parent_root))
 		btrfs_root_dec_send_in_progress(sctx->parent_root);
-	btrfs_root_dec_send_in_progress(send_root);
 
 	kfree(arg);
 	vfree(clone_sources_tmp);
-- 
1.8.3.1



[PATCH v2 2/4] Btrfs: fix protection between send and root deletion

2014-01-07 Thread Wang Shilong
We should guarantee that parent and clone roots cannot be destroyed
during send. We had two ideas for this:

1. Hold @subvol_sem. This could be a nightmare, because it would
block all subvolume deletions for a long time.

2. Miao pointed out that we can reuse @send_in_progress: snapshot
deletion is skipped while a send on that root is in progress.

Here we adopt the second approach, since it won't block other subvolume
deletions for a long time.

Besides, in btrfs_clean_one_deleted_snapshot() we only check the first
dead root; if that root is involved in a send, we return directly rather
than continuing to check the rest (a minimal sketch of the idea follows
below). There are several reasons for this:

1. This case happens rarely.
2. After the send finishes, the cleaner thread can continue to drop that root.
3. It keeps the code simple.
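
A minimal sketch of that idea (simplified types and hypothetical helper
names, not the actual kernel code):

struct root {
	int send_in_progress;	/* protected by root_item_lock in the real code */
};

/* send side: hold a reference on every root used as clone source or parent */
void send_get_root(struct root *r) { r->send_in_progress++; }
void send_put_root(struct root *r) { r->send_in_progress--; }

/* cleaner side: back off instead of dropping a root that is being sent */
int clean_one_deleted_snapshot(struct root *r)
{
	if (r->send_in_progress)
		return 0;	/* try again later, after the send has finished */
	/* ... actually drop the dead root here ... */
	return 1;
}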

Cc: David Sterba dste...@suse.cz
Signed-off-by: Wang Shilong wangsl.f...@cn.fujitsu.com
Reviewed-by: Miao Xie mi...@cn.fujitsu.com
---
Changelog v1->v2:
use the right root for the check (pointed out by David).
---
 fs/btrfs/send.c| 16 
 fs/btrfs/transaction.c | 13 +
 2 files changed, 29 insertions(+)

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 5b69785..4e2461b 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -4753,6 +4753,7 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
 	u64 *clone_sources_tmp = NULL;
 	int clone_sources_to_rollback = 0;
 	int sort_clone_roots = 0;
+	int index;
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
@@ -4893,8 +4894,12 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
 		key.objectid = clone_sources_tmp[i];
 		key.type = BTRFS_ROOT_ITEM_KEY;
 		key.offset = (u64)-1;
+
+		index = srcu_read_lock(&fs_info->subvol_srcu);
+
 		clone_root = btrfs_read_fs_root_no_name(fs_info, &key);
 		if (IS_ERR(clone_root)) {
+			srcu_read_unlock(&fs_info->subvol_srcu, index);
 			ret = PTR_ERR(clone_root);
 			goto out;
 		}
@@ -4903,10 +4908,13 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
 		clone_root->send_in_progress++;
 		if (!btrfs_root_readonly(clone_root)) {
 			spin_unlock(&clone_root->root_item_lock);
+			srcu_read_unlock(&fs_info->subvol_srcu, index);
 			ret = -EPERM;
 			goto out;
 		}
 		spin_unlock(&clone_root->root_item_lock);
+		srcu_read_unlock(&fs_info->subvol_srcu, index);
+
 		sctx->clone_roots[i].root = clone_root;
 	}
 	vfree(clone_sources_tmp);
@@ -4917,19 +4925,27 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
 		key.objectid = arg->parent_root;
 		key.type = BTRFS_ROOT_ITEM_KEY;
 		key.offset = (u64)-1;
+
+		index = srcu_read_lock(&fs_info->subvol_srcu);
+
 		sctx->parent_root = btrfs_read_fs_root_no_name(fs_info, &key);
 		if (IS_ERR(sctx->parent_root)) {
+			srcu_read_unlock(&fs_info->subvol_srcu, index);
 			ret = PTR_ERR(sctx->parent_root);
 			goto out;
 		}
+
 		spin_lock(&sctx->parent_root->root_item_lock);
 		sctx->parent_root->send_in_progress++;
 		if (!btrfs_root_readonly(sctx->parent_root)) {
 			spin_unlock(&sctx->parent_root->root_item_lock);
+			srcu_read_unlock(&fs_info->subvol_srcu, index);
 			ret = -EPERM;
 			goto out;
 		}
 		spin_unlock(&sctx->parent_root->root_item_lock);
+
+		srcu_read_unlock(&fs_info->subvol_srcu, index);
 	}
 
/*
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 30d11bb..1ef8e28 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1971,6 +1971,19 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root)
 	}
 	root = list_first_entry(&fs_info->dead_roots,
 			struct btrfs_root, root_list);
+	/*
+	 * Make sure root is not involved in send,
+	 * if we fail with first root, we return
+	 * directly rather than continue.
+	 */
+	spin_lock(&root->root_item_lock);
+	if (root->send_in_progress) {
+		spin_unlock(&fs_info->trans_lock);
+		spin_unlock(&root->root_item_lock);
+		return 0;
+	}
+	spin_unlock(&root->root_item_lock);
+
 	list_del_init(&root->root_list);
 	spin_unlock(&fs_info->trans_lock);
 
-- 
1.8.3.1


[PATCH v2 4/4] Btrfs: handle EAGAIN case properly in btrfs_drop_snapshot()

2014-01-07 Thread Wang Shilong
We may return early (-EAGAIN) from btrfs_drop_snapshot(); we shouldn't
call btrfs_std_error() in that case. Fix it.

Cc: sta...@vger.kernel.org
Signed-off-by: Wang Shilong wangsl.f...@cn.fujitsu.com
---
Changelog v1->v2: cc stable
---
 fs/btrfs/extent-tree.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 4fa1437..81650cf 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -8272,7 +8272,7 @@ out:
 	 */
 	if (!for_reloc && root_dropped == false)
 		btrfs_add_dead_root(root);
-	if (err)
+	if (err && err != -EAGAIN)
 		btrfs_std_error(root->fs_info, err);
 	return err;
 }
 }
-- 
1.8.3.1



[PATCH v2 3/4] Btrfs: remove unnecessary transaction commit before send

2014-01-07 Thread Wang Shilong
We finish orphan cleanups during snapshot creation, so we don't
have to commit a transaction here.

Signed-off-by: Wang Shilong wangsl.f...@cn.fujitsu.com
Reviewed-by: Miao Xie mi...@cn.fujitsu.com
---
Changelog v1->v2: none
---
 fs/btrfs/send.c | 29 -
 1 file changed, 29 deletions(-)

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 4e2461b..591063d 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -4776,35 +4776,6 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
 	WARN_ON(send_root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE);
 
 	/*
-	 * If we just created this root we need to make sure that the orphan
-	 * cleanup has been done and committed since we search the commit root,
-	 * so check its commit root transid with our otransid and if they match
-	 * commit the transaction to make sure everything is updated.
-	 */
-	down_read(&send_root->fs_info->extent_commit_sem);
-	if (btrfs_header_generation(send_root->commit_root) ==
-	    btrfs_root_otransid(&send_root->root_item)) {
-		struct btrfs_trans_handle *trans;
-
-		up_read(&send_root->fs_info->extent_commit_sem);
-
-		trans = btrfs_attach_transaction_barrier(send_root);
-		if (IS_ERR(trans)) {
-			if (PTR_ERR(trans) != -ENOENT) {
-				ret = PTR_ERR(trans);
-				goto out;
-			}
-			/* ENOENT means theres no transaction */
-		} else {
-			ret = btrfs_commit_transaction(trans, send_root);
-			if (ret)
-				goto out;
-		}
-	} else {
-		up_read(&send_root->fs_info->extent_commit_sem);
-	}
-
-	/*
 	 * Userspace tools do the checks and warn the user if it's
 	 * not RO.
 	 */
-- 
1.8.3.1



Re: [RFC] lib: raid: New RAID library supporting up to six parities

2014-01-07 Thread Andrea Mazzoleni
Hi Neil,

On 01/07, NeilBrown wrote:
  To do the same with up to six failures, it's now required some kind of sort
  function.
 
 So I would probably just make sure we always process the blocks in the right
 order.  Then sorting would be irrelevant.
 But as I say, I haven't fiddled with the code, so maybe that would end up
 being more complex.

If the async_tx and btrfs layers can always provide the indexes in order,
this sort function can certainly be removed. I agree with you that it would be
a better design.

In fact, the raid library never uses it directly. It's just provided to help
callers that for whatever reason cannot provide the indexes in order.
Seeing the swap operations between faila/failb in both btrfs and async_tx
made me assume that the order is not always correct.

 I don't see the above as sorting faila and failb, but rather determining
 which one is first.  Once you know which one is first, the remainder follow
 in order.

The new raid library, like the existing raid6 code, requires the indexes of
the failed blocks to be in order. With only two indexes faila/failb this means
faila < failb, and sorting is equivalent to finding the first one.

But with up to six failures, it is like having six fail variables:
faila/failb/failc/faild/faile/failf, and the raid library requires them
to be ordered as: faila < failb < failc < faild < faile < failf.
In this general case a sort function is the only thing that guarantees
this requirement is met, whatever the original order.
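
For illustration only (hypothetical names, not the library code), the
requirement boils down to something like the following insertion sort over
the failed-disk indexes:

#include <assert.h>

/*
 * Order the failed-disk indexes so that fails[0] < fails[1] < ...,
 * which is what the recovery code expects. An insertion sort is
 * plenty for at most six entries.
 */
void sort_fail_indexes(int *fails, int nr)
{
	int i, j;

	for (i = 1; i < nr; i++) {
		int v = fails[i];

		for (j = i - 1; j >= 0 && fails[j] > v; j--)
			fails[j + 1] = fails[j];
		fails[j + 1] = v;
	}
}

int main(void)
{
	int fails[] = { 5, 0, 3 };	/* e.g. reported out of order by the caller */

	sort_fail_indexes(fails, 3);
	assert(fails[0] == 0 && fails[1] == 3 && fails[2] == 5);
	return 0;
}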

Ciao,
Andrea



Re: btrfs-transaction blocked for more than 120 seconds

2014-01-07 Thread Brendan Hide

On 2014/01/06 12:57 AM, Roman Mamedov wrote:

Did you align your partitions to accommodate for the 4K sector of the EARS?
I had, yes. I had to do a lot of research to get the array working 
optimally. I didn't need to repartition the spare so this carried over 
to its being used as an OS disk.


I actually lost the Green array twice - and learned some valuable lessons:

1. I had an 8-port SCSI card which was dropping the disks due to the 
timeout issue mentioned by Chris. That caused the first array failure. 
Technically all the data was on the disks - but temporarily 
irrecoverable as disks were constantly being dropped. I made a mistake 
during ddrescue which simultaneously destroyed two disks' data, meaning 
that the recovery operation was finally for nought. The only consolation 
was that I had very little data at the time and none of it was 
irreplaceable.


2. After replacing the SCSI card with two 4-port SATA cards, a few 
months later I still had a double-failure (the second failure being 
during the RAID5 rebuild). This time it was only due to bad disks and a 
lack of scrubbing/early warning - clearly my own fault.


Having learnt these lessons, I'm now a big fan of scrubbing and backups. ;)

I'm also pushing for RAID15 wherever data is mission-critical. I simply 
don't trust the reliability of disks any more, and I also better 
understand how, by having more and/or larger disks in a RAID5/6 array, 
the overall reliability of that array plummets.


--
__
Brendan Hide
http://swiftspirit.co.za/
http://www.webafrica.co.za/?AFF1E97



Re: [RFC v2 2/2] fs: btrfs: Extends btrfs/raid56 to support up to six parities

2014-01-07 Thread Andrea Mazzoleni
Hi Chris,

On 01/06, Chris Mason wrote:
 Neat.  The faila/failb were always my least favorite part of the btrfs
 code ;)  Did you test just raid5/6 or also the higher parity counts?
At this stage no real testing was made with btrfs.

The intention of this btrfs patch is mainly to get feedback on the new raid
interface, to see if it matches the needs of btrfs, and whether it results in
cleaner code than before.

And besides the removal of the faila/failb variables, something else that
you will likely appreciate is the removal of all the P/Q logic from btrfs,
which is now replaced with a single raid_rec() call.

After the raid interface stabilizes, this patch can be used as the starting
point for a real btrfs patch. For now it is just meant to show some example
code of the kind of modification btrfs will need.

Ciao,
Andrea


Re: Is anyone using btrfs send/receive howto?

2014-01-07 Thread Marc MERLIN
I read different howtos on the wiki and oracle docs, but I can't get it
to work:

legolas:/mnt/btrfs_pool1# btrfs subvolume snapshot -r tmp tmp_read_only_new
Create a readonly snapshot of 'tmp' in './tmp_read_only_new'
legolas:/mnt/btrfs_pool1# sync
legolas:/mnt/btrfs_pool1# btrfs send tmp_read_only_new | btrfs receive 
/mnt/btrfs_pool2/
At subvol tmp_read_only_new
At subvol tmp_read_only_new

# Make a new snapshot later and try to sync it:
legolas:/mnt/btrfs_pool1# mv tmp_read_only_new tmp_read_only
legolas:/mnt/btrfs_pool1# btrfs subvolume snapshot -r tmp tmp_read_only_new
Create a readonly snapshot of 'tmp' in './tmp_read_only_new'
legolas:/mnt/btrfs_pool1# btrfs send -p tmp_read_only  tmp_read_only_new | 
btrfs receive /mnt/btrfs_pool2/
At subvol tmp_read_only_new
At snapshot tmp_read_only_new
ERROR: creating snapshot tmp_read_only_new -> tmp_read_only_new failed. File 
exists

This is where I get stuck:
Obviously /mnt/btrfs_pool2/tmp_read_only_new already exists since it's the 
reference backup volume.

What am I doing wrong?

Thanks,
Marc
-- 
A mouse is a device used to point at the xterm you want to type in - A.S.R.
Microsoft is to operating systems 
   what McDonalds is to gourmet cooking
Home page: http://marc.merlins.org/ | PGP 1024R/763BE901


Re: Is anyone using btrfs send/receive howto?

2014-01-07 Thread Hugo Mills
On Tue, Jan 07, 2014 at 02:49:51AM -0800, Marc MERLIN wrote:
 I read different howtos on the wiki and oracle docs, but I can't get it
 to work:
 
 legolas:/mnt/btrfs_pool1# btrfs subvolume snapshot -r tmp tmp_read_only_new
 Create a readonly snapshot of 'tmp' in './tmp_read_only_new'
 legolas:/mnt/btrfs_pool1# sync
 legolas:/mnt/btrfs_pool1# btrfs send tmp_read_only_new | btrfs receive 
 /mnt/btrfs_pool2/
 At subvol tmp_read_only_new
 At subvol tmp_read_only_new
 
 # Make a new snapshot later and try to sync it:
 legolas:/mnt/btrfs_pool1# mv tmp_read_only_new tmp_read_only
 legolas:/mnt/btrfs_pool1# btrfs subvolume snapshot -r tmp tmp_read_only_new
 Create a readonly snapshot of 'tmp' in './tmp_read_only_new'
 legolas:/mnt/btrfs_pool1# btrfs send -p tmp_read_only  tmp_read_only_new | 
 btrfs receive /mnt/btrfs_pool2/
 At subvol tmp_read_only_new
 At snapshot tmp_read_only_new
 ERROR: creating snapshot tmp_read_only_new -> tmp_read_only_new failed. File 
 exists
 
 This is where I get stuck:
 Obviously /mnt/btrfs_pool2/tmp_read_only_new already exists since it's the 
 reference backup volume.
 
 What am I doing wrong?

   You need to move /mnt/btrfs_pool2/tmp_read_only_new to a different
name as well. The send stream contains the name of the subvolume it
wants to create, so it's trying to create a subvolume called
tmp_read_only_new in /mnt/btrfs_pool2, and there's one there already.

   Hugo.

-- 
=== Hugo Mills: hugo@... carfax.org.uk | darksatanic.net | lug.org.uk ===
  PGP key: 65E74AC0 from wwwkeys.eu.pgp.net or http://www.carfax.org.uk
   --- It's against my programming to impersonate a deity! ---   




Re: [RFC v2 0/2] New RAID library supporting up to six parities

2014-01-07 Thread Andrea Mazzoleni
On 01/06, joystick wrote:
 Just by looking at the Subjects, it seems patch number 0/1 is
 missing. It might have not gotten through to the lists, or be a
 numbering mistake.
The patch files can also be downloaded from: 
http://snapraid.sourceforge.net/linux/v2/

Sorry about that,

 Does your code also support (shortcut) RMW as opposed to RCW, for
 all parities?
At the moment, no. But it can easily be extended to add the parity computation
of a single disk into a set of already computed parities, and then provide the
required support for RMW.

With SSSE3 this can be implemented in a very efficient way.
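
As a rough illustration of what such an RMW shortcut looks like for the first
(XOR) parity only - this is not the library's API, just the textbook identity
P_new = P_old xor D_old xor D_new; the higher parities follow the same pattern
but each delta additionally gets multiplied by the disk's GF(2^8) coefficient,
which is where SSSE3 helps:

#include <stddef.h>

/* Update the XOR parity in place when a single data block changes. */
void p_update(unsigned char *p, const unsigned char *d_old,
	      const unsigned char *d_new, size_t len)
{
	size_t i;

	for (i = 0; i < len; i++)
		p[i] ^= d_old[i] ^ d_new[i];
}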

Ciao,
Andrea



Re: [RFC v2 0/2] New RAID library supporting up to six parities

2014-01-07 Thread Andrea Mazzoleni
Hi,

It seems that the patch was too big for some of the linux lists.
If you are missing some patch files, you can also download them at:

http://snapraid.sourceforge.net/linux/v2/

Sorry about that.

Ciao,
Andrea


[RFC PATCH v4] Btrfs: add support for inode properties

2014-01-07 Thread Filipe David Borba Manana
This change adds infrastructure to allow for generic properties for
inodes. Properties are name/value pairs that can be associated with
inodes for different purposes. They are stored as xattrs with the
prefix btrfs.

Properties can be inherited - this means when a directory inode has
inheritable properties set, these are added to new inodes created
under that directory. Further, subvolumes can also have properties
associated with them, and they can be inherited from their parent
subvolume. Naturally, directory properties have priority over subvolume
properties (in practice a subvolume property is just a regular
property associated with the root inode, objectid 256, of the
subvolume's fs tree).

This change also adds one specific property implementation, named
"compression", whose values can be "lzo" or "zlib"; it is an
inheritable property.
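
As a userspace illustration of the storage model only (the xattr name
"btrfs.compression" is an assumption based on the "btrfs." prefix described
above; normally the new btrfs-progs property command would be used instead):

#include <stdio.h>
#include <string.h>
#include <sys/xattr.h>

int main(void)
{
	const char *dir = "/mnt/testdir";	/* hypothetical btrfs directory */
	const char *value = "lzo";

	/* A property is just an xattr in the btrfs. namespace, so setting the
	 * (assumed) btrfs.compression name on a directory is equivalent to
	 * setting the inheritable compression property on it. */
	if (setxattr(dir, "btrfs.compression", value, strlen(value), 0) != 0) {
		perror("setxattr");
		return 1;
	}
	return 0;
}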

The corresponding changes to btrfs-progs were also implemented.
A patch with xfstests for this feature will follow once there's
agreement on this change/feature.

Further, the script at the bottom of this commit message was used to
do some benchmarks to measure any performance penalties of this feature.

Basically the tests correspond to:

Test 1 - create a filesystem and mount it with compress-force=lzo,
then sequentially create N files of 64Kb each, measure how long it took
to create the files, unmount the filesystem, mount the filesystem and
perform an 'ls -lha' against the test directory holding the N files, and
report the time the command took.

Test 2 - create a filesystem and don't use any compression option when
mounting it - instead set the compression property of the subvolume's
root to 'lzo'. Then create N files of 64Kb, and report the time it took.
Then unmount the filesystem, mount it again and perform an 'ls -lha' like
in the former test. This means every single file ends up with a property
(xattr) associated to it.

Test 3 - same as test 2, but uses 4 properties - 3 are duplicates of the
compression property, have no real effect other than adding more work
when inheriting properties and taking more btree leaf space.

Test 4 - same as test 3 but with 10 properties per file.

Results (in seconds, and averages of 5 runs each), for different N
numbers of files follow.

* Without properties (test 1)

                    file creation time   ls -lha time
10 000 files                      3.49           0.76
100 000 files                    47.19           8.37
1 000 000 files                 518.51         107.06

* With 1 property (compression property set to lzo - test 2)

                    file creation time   ls -lha time
10 000 files                      3.63           0.93
100 000 files                    48.56           9.74
1 000 000 files                 537.72         125.11

* With 4 properties (test 3)

                    file creation time   ls -lha time
10 000 files                      3.94           1.20
100 000 files                    52.14          11.48
1 000 000 files                 572.70         142.13

* With 10 properties (test 4)

                    file creation time   ls -lha time
10 000 files                      4.61           1.35
100 000 files                    58.86          13.83
1 000 000 files                 656.01         177.61

The increased latencies with properties are essentially because of:

*) When creating an inode, we now synchronously write 1 more item
   (an xattr item) for each property inherited from the parent dir
   (or subvolume). This could be done in an asynchronous way, as we
   do for dir index items (delayed-inode.c), which could help
   reduce the file creation latency;

*) With properties, we now have larger fs trees. For this particular
   test each xattr item uses 75 bytes of leaf space in the fs tree.
   This could be less by using a new item type for xattr items, instead of
   the current btrfs_dir_item, since we could cut the 'location' and
   'type' fields (saving 18 bytes) and maybe 'transid' too (saving a
   total of 26 bytes per xattr item) from the btrfs_dir_item type.

I also tried batching the xattr insertions (ignoring proper hash
collision handling, since it didn't exist) when creating files that
inherit properties from their parent inode/subvolume, but the end
results were (surprisingly) essentially the same.

Test script:

$ cat test.pl
  #!/usr/bin/perl -w

  use strict;
  use Time::HiRes qw(time);
  use constant NUM_FILES => 10_000;
  use constant FILE_SIZES => (64 * 1024);
  use constant DEV => '/dev/sdb4';
  use constant MNT_POINT => '/home/fdmanana/btrfs-tests/dev';
  use constant TEST_DIR => (MNT_POINT . '/testdir');

  system("mkfs.btrfs", "-l", "16384", "-f", DEV) == 0 or die "mkfs.btrfs failed!";

  # following line for testing without properties
  #system("mount", "-o", "compress-force=lzo", DEV, MNT_POINT) == 0 or die "mount failed!";

  # following 2 lines for testing with properties
  system("mount", DEV, MNT_POINT) == 0 or die "mount failed!";
  system("btrfs", "prop", 

Re: [PATCH 0/5] Add support for object properties

2014-01-07 Thread Filipe David Manana
On Sat, Nov 23, 2013 at 12:52 AM, David Sterba dste...@suse.cz wrote:
 On Tue, Nov 12, 2013 at 01:41:41PM +, Filipe David Borba Manana wrote:
 This is a revised version of the original proposal/work from Alexander Block
 to introduce a generic framework to set properties on btrfs filesystem 
 objects
 (inodes, subvolumes, filesystems, devices).

 Currently the command group looks like this:
 btrfs prop set [-t type] /path/to/object name value
 btrfs prop get [-t type] /path/to/object [name] (omitting name 
 dumps all)
 btrfs prop list [-t type] /path/to/object (lists properties with 
 description)

 The type is used to explicitly specify what type of object you mean. 
 This is
 necessary in case the object+property combination is ambiguous.  For 
 example
 '/path/to/fs/root' could mean the root subvolume, the directory 
 inode or the
 filesystem itself. Normally, btrfs-progs will try to detect the type
 automatically.

 The generic commandline UI still looks ok to me, storing properties as xattr’s
 is also ok (provided that we can capture the “btrfs.” xattr namespace).

 I’ll dump my thoughts and questions about the rest.

 1) Where are stored properties that are not directly attached to an inode? Ie.
whole-filesystem and device. How can I access props for a subvolume that is
not currently reachable in the directory tree?

 A fs or device props must be accessible any time, eg. no matter which 
 subvolume
 is currently mounted. This should be probably a special case where the 
 property
 can be queried from any inode but will be internally routed to eg. toplevel
 subvolume that will store the respective xattrs.


 2) if a property’s object can be ambiguous, how is that distinguished in the
xattrs?

 We don’t have a list of props yet, so I’m trying to use one that hopefully
 makes some sense. The example here can be persistent-mount-options that are
 attached to fs and a subvolume. The fs-wide props will apply when a different
 subvolume is explicitly mounted.

 Assuming that the xattrs are stored with the toplevel subvolume, the fs-wide
 and per-subvolume property must have a differnt xattr name (another option is
 to store fs-wide elsewhere). So the question is, if we should encode the
 property object into the xattr name directly. Eg.:

   btrfs.fs.persistent_mount
   btrfs.subvol.persistent_mount

 or if the fs-wide uses a reserved naming scheme that would appear as xattr
 named

   btrfs.persistent_mount

 but the value would differ if queried with ‘-t fs’ or ‘-t subvolume’.


 3) property precedence, interaction with mount options

 The precedence should follow the rule of the least surprise, ie. if I set eg. 
 a
 compression of a file to zlib, set the subvolume compression type to ‘none’ 
 and
 have fs-wide mount the filesystem with compress=lzo, I’m expecting that the
 file will use ‘zlib’.

 The generic rule says that mount options (that have a corresponding property)
 take precedence. There may be exceptions.

 What if there are no specific mount options, but the subvolume has eg.
 compression set? Should new files inherit that automatically or only if
 explicitly marked as such?

 The background concern is how far do I need to look and whether this could be a
 potential performance problem. The lookup chain can be long: inode -> dir ->
 subvol -> fs, plus mount options along the path where applicable.

 If the xattr values are cached in structs that are accessed anyway (containing
 subvol, fs_info) and updated when property xattr changes, this could work.


 4) behaviour of tools that see the btrfs-specific props

 What would cp or rsync do when copying the xattrs to a different filesystem?
 They may simply ignore it and just try to copy the user. namespace, I haven’t
 done any research here.


 5) properties to consider, naming, xattr name namespaces, inheritable props

 Here’s a list of properties that I’ve collected so far:

 filesystem-wide
 - persistent mount options
 - default raid allocation profile
 - default compression parameters
 - label
 - scrub ioprio

 subvolume
 - persistent mount options
 - default raid allocation profile
 - default compression parameters
 - writable

 device
 - writable/ro/dead
 - hot-spare
 - preferred for metadata
 - preferred data reads
 - preferred data writes
 - no new allocations if possible
 - speed profile, io priorities
 - allocation group - from project idea Chunk allocation groups
 - ssd cache, L2ARC
 - scrub ioprio

 file, directory
 - compression parameters
 - raid allocation profile

 details on compression parameters:
 - compression algo
 - compression level
 - desired compression ratio target
 - file compressibility hint - from project idea “Compression updates”
 - compression container type - dtto

 The props should be basically self explanatory. This is not a complete or
 finalized list, feel free to suggest what you think should/not be here.

 The number of 

Re: correct way to rollback a root filesystem?

2014-01-07 Thread Sander
Jim Salter wrote (ao):
 I tried a kernel upgrade with moderately disastrous
 (non-btrfs-related) results this morning; after the kernel upgrade
 Xorg was completely borked beyond my ability to get it working
 properly again through any normal means. I do have hourly snapshots
 being taken by cron, though, so I'm successfully X'ing again on the
 machine in question right now.
 
 It was quite a fight getting back to where I started even so, though
 - I'm embarassed to admit I finally ended up just doing a cp
 --reflink=all /mnt/@/.snapshots/snapshotname /mnt/@/ from the
 initramfs BusyBox prompt.  Which WORKED well enough, but obviously
 isn't ideal.
 
 I tried the btrfs sub set-default command - again from BusyBox - and
 it didn't seem to want to work for me; I got an inappropriate ioctl
 error (which may be because I tried to use / instead of /mnt, where
 the root volume was CURRENTLY mounted, as an argument?). Before
 that, I'd tried setting subvol=@root (which is the writeable
 snapshot I created from the original read-only hourly snapshot I
 had) in GRUB and in fstab... but that's what landed me in BusyBox to
 begin with.
 
 When I DID mount the filesystem in BusyBox on /mnt, I saw that @ and
 @home were listed under /mnt, but no other directories were -
 which explains why mounting -o subvol=@root didn't work. I guess the
 question is, WHY couldn't I see @root in there, since I had a
 working, readable, writeable snapshot which showed its own name as
 root when doing a btrfs sub show /.snapshots/root ?

I don't quite get how your setup is.

In my setup, all subvolumes and snapshots are under /.root/

# cat /etc/fstab
LABEL=panda  /         btrfs  subvol=rootvolume,space_cache,inode_cache,compress=lzo,ssd  0  0
LABEL=panda  /home     btrfs  subvol=home       0  0
LABEL=panda  /root     btrfs  subvol=root       0  0
LABEL=panda  /var      btrfs  subvol=var        0  0
LABEL=panda  /holding  btrfs  subvol=.holding   0  0
LABEL=panda  /.root    btrfs  subvolid=0        0  0
/Varlib      /var/lib  none   bind              0  0


In case of an OS upgrade gone wrong, I would mount subvolid=0, move
subvolume 'rootvolume' out of the way, and move (rename) the last known
good snapshot to 'rootvolume'.

Not sure if that works though. Never tried.

Sander


Re: [PATCH 00/21] Consolidate Posix ACL implementation V3

2014-01-07 Thread Christoph Hellwig
On Fri, Dec 20, 2013 at 05:16:35AM -0800, Christoph Hellwig wrote:
 This series consolidates the various cut'n'pasted Posix ACL implementations
 into a single common one based on the ->get_acl method Linus added a while
 ago and a new ->set_acl counterpart.
 
 This removes ~1800 lines of code and provides a single place to implement
 various nasty little gems of the semantics.

Any more comments?  Al, what's the best way to make sure this doesn't
miss 3.14?



Re: [PATCH 2/3] Btrfs: rework qgroup accounting

2014-01-07 Thread Josef Bacik


On 12/21/2013 03:56 AM, Wang Shilong wrote:

Hello Josef,

Though I know there are still problems related to qgroups (for example, removing a
snapshot will break qgroup accounting), I did a simple test with your patch:


# btrfs quota enable /mnt
# dd if=/dev/zero of=/mnt/data bs=4k count=102400 oflag=direct
# btrfs sub snapshot /mnt/ /mnt/snap1
# btrfs sub snapshot /mnt /mnt/snap2
# btrfs sub delete /mnt/snap1 /mnt/snap2
# sync
# rm -rf /mnt/data
# sync
# dmesg
# btrfs qgroup show /mnt


Firstly, qgroup accounting is wrong. This is maybe expected because of the
efficient fs tree removal.
However, from dmesg, i get the  WARNING:

WARNING: CPU: 1 PID: 2650 at fs/btrfs/qgroup.c:1486 
btrfs_delayed_qgroup_accounting

I did not take a deep look at the code, but I think you will be interested in 
taking a look at this. ^_^


Ok I finally sat down to look at this and it is because subvol deletion 
screws up quota accounting no matter what.  I just added this WARN_ON() 
to catch actual mistakes, it just so happens it gets tripped when the 
snapshot deletion stuff happens too.  I'm going to put this on the I'll 
deal with it later list since it is an existing issue with or without 
my patch.  Thanks,


Josef


Re: [PATCH] xfstests: kill lib/random.c

2014-01-07 Thread Ben Myers
Hey Gents,

On Mon, Jan 06, 2014 at 03:46:58PM -0600, Eric Sandeen wrote:
 On 1/6/14, 3:42 PM, Josef Bacik wrote:
  
  On 01/06/2014 04:32 PM, Eric Sandeen wrote:
  On 1/6/14, 1:58 PM, Josef Bacik wrote:
  I was trying to reproduce something with fsx and I noticed that no matter 
  what
  seed I set I was getting the same file.  Come to find out we are 
  overloading
  random() with our own custom horribleness for some unknown reason.  So 
  nuke the
  damn thing from orbit and rely on glibc's random().  With this fix the -S 
  option
  actually does something with fsx.  Thanks,
  Hm, old comments seem to indicate that this was done handwave to make 
  random
  behave the same on different architectures (i.e. same result from same 
  seed,
  I guess?)  I . . . don't know if that is true of glibc's random(), is it?
 
  I'd like to dig into the history just a bit before we yank this, just to
  be sure.
  
  I think that if we need the output to match based on a predictable
  random() output then we've lost already. We shouldn't be checking for
  specific output (like inode numbers or sizes etc) that are dependant
  on random()'s behaviour, and if we are we need to fix those tests. So
  even if that is why it was put in place originally I'd say it is high
  time we ripped it out and fixed up any tests that rely on this
  behaviour. Thanks,
 
 Yeah, you're probably right.  And the ancient xfstests history seems to
 be lost in the mists of time, at least as far as I can see.  So I'm ok
 with this but let's let Dave & SGI chime in too just to be certain.

I did not have success locating the history prior to what we have posted on
oss.  I agree that it was likely added so that tests that expose output from
random into golden output files will have the same results across arches.
Maybe this is still of concern for folks who use a different c library with the
kernel.  

Looks there are quite a few callers.  IMO if we're going to remove this we
should fix the tests first.

-Ben


Re: [PATCH] xfstests: kill lib/random.c

2014-01-07 Thread Eric Sandeen
On 1/7/14, 2:01 PM, Ben Myers wrote:
 Hey Gents,
 
 On Mon, Jan 06, 2014 at 03:46:58PM -0600, Eric Sandeen wrote:
 On 1/6/14, 3:42 PM, Josef Bacik wrote:

 On 01/06/2014 04:32 PM, Eric Sandeen wrote:
 On 1/6/14, 1:58 PM, Josef Bacik wrote:
 I was trying to reproduce something with fsx and I noticed that no matter 
 what
 seed I set I was getting the same file.  Come to find out we are 
 overloading
 random() with our own custom horribleness for some unknown reason.  So 
 nuke the
 damn thing from orbit and rely on glibc's random().  With this fix the -S 
 option
 actually does something with fsx.  Thanks,
 Hm, old comments seem to indicate that this was done handwave to make 
 random
 behave the same on different architectures (i.e. same result from same 
 seed,
 I guess?)  I . . . don't know if that is true of glibc's random(), is it?

 I'd like to dig into the history just a bit before we yank this, just to
 be sure.

 I think that if we need the output to match based on a predictable
 random() output then we've lost already. We shouldn't be checking for
 specific output (like inode numbers or sizes etc) that are dependant
 on random()'s behaviour, and if we are we need to fix those tests. So
 even if that is why it was put in place originally I'd say it is high
 time we ripped it out and fixed up any tests that rely on this
 behaviour. Thanks,

 Yeah, you're probably right.  And the ancient xfstests history seems to
 be lost in the mists of time, at least as far as I can see.  So I'm ok
 with this but let's let Dave & SGI chime in too just to be certain.
 
 I did not have success locating the history prior to what we have posted on
 oss.  I agree that it was likely added so that tests that expose output from
 random into golden output files will have the same results across arches.
 Maybe this is still of concern for folks who use a different c library with 
 the
 kernel.  
 
 Looks there are quite a few callers.  IMO if we're going to remove this we
 should fix the tests first.

Or first, determine if they really need fixing.  Did you find tests which
actually contain the random results in the golden output?

-Eric

 -Ben
 



Re: [PATCH] xfstests: kill lib/random.c

2014-01-07 Thread Eric Sandeen
On 1/7/14, 2:10 PM, Eric Sandeen wrote:
 On 1/7/14, 2:01 PM, Ben Myers wrote:
 Hey Gents,

 On Mon, Jan 06, 2014 at 03:46:58PM -0600, Eric Sandeen wrote:
 On 1/6/14, 3:42 PM, Josef Bacik wrote:

 On 01/06/2014 04:32 PM, Eric Sandeen wrote:
 On 1/6/14, 1:58 PM, Josef Bacik wrote:
 I was trying to reproduce something with fsx and I noticed that no 
 matter what
 seed I set I was getting the same file.  Come to find out we are 
 overloading
 random() with our own custom horribleness for some unknown reason.  So 
 nuke the
 damn thing from orbit and rely on glibc's random().  With this fix the 
 -S option
 actually does something with fsx.  Thanks,
 Hm, old comments seem to indicate that this was done handwave to make 
 random
 behave the same on different architectures (i.e. same result from same 
 seed,
 I guess?)  I . . . don't know if that is true of glibc's random(), is it?

 I'd like to dig into the history just a bit before we yank this, just to
 be sure.

 I think that if we need the output to match based on a predictable
 random() output then we've lost already. We shouldn't be checking for
 specific output (like inode numbers or sizes etc) that are dependant
 on random()'s behaviour, and if we are we need to fix those tests. So
 even if that is why it was put in place originally I'd say it is high
 time we ripped it out and fixed up any tests that rely on this
 behaviour. Thanks,

 Yeah, you're probably right.  And the ancient xfstests history seems to
 be lost in the mists of time, at least as far as I can see.  So I'm ok
 with this but let's let Dave & SGI chime in too just to be certain.

 I did not have success locating the history prior to what we have posted on
 oss.  I agree that it was likely added so that tests that expose output from
 random into golden output files will have the same results across arches.
 Maybe this is still of concern for folks who use a different c library with 
 the
 kernel.  

 Looks there are quite a few callers.  IMO if we're going to remove this we
 should fix the tests first.
 
 Or first, determine if they really need fixing.  Did you find tests which
 actually contain the random results in the golden output?

This should be easy enough to test by just hacking the lib/random.c with a
new starting seed, right?

-Eric



Re: [block:for-3.14/core] kernel BUG at fs/bio.c:1748

2014-01-07 Thread Muthu Kumar
Thanks Fengguang. Final patch with added comment. BTW, fengguang
mentioned that git-am has trouble with the inline patch and quilt
import worked fine for him...


In btrfs_end_bio(), we increment bi_remaining only if is_orig_bio. If not,
we restore the orig_bio but fail to increment bi_remaining for the
orig_bio, which triggers a BUG_ON later when bio_endio() is called. The fix
is to increment bi_remaining when we restore the orig bio as well.
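
For context, a tiny userspace model (not the block layer code) of the
bi_remaining contract involved here: each bio_endio() call consumes one
completion reference, so a path that will complete the same bio again must
add a reference first:

#include <assert.h>

struct bio_model { int bi_remaining; };

/* models bio_endio(): consume one completion reference */
void endio_model(struct bio_model *b)
{
	assert(b->bi_remaining > 0);	/* corresponds to the BUG_ON in fs/bio.c */
	b->bi_remaining--;
}

int main(void)
{
	struct bio_model orig = { .bi_remaining = 1 };

	/* about to end the original bio one extra time, so bump the count first */
	orig.bi_remaining++;
	endio_model(&orig);	/* chained completion */
	endio_model(&orig);	/* final completion */
	return 0;
}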

Reported-and-Tested-by: Fengguang wu fengguang...@intel.com
CC: Kent Overstreet k...@daterainc.com
CC: Jens Axboe ax...@kernel.dk
CC: Chris Mason c...@fb.com
Signed-off-by: Muthukumar Ratty mut...@gmail.com

---
 fs/btrfs/volumes.c |8 ++--
 1 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 37972d5..34aba2b 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -5297,9 +5297,13 @@ static void btrfs_end_bio(struct bio *bio, int err)
 	if (!is_orig_bio) {
 		bio_put(bio);
 		bio = bbio->orig_bio;
-	} else {
-		atomic_inc(&bio->bi_remaining);
 	}
+	/*
+	 * We have original bio now. So increment bi_remaining to
+	 * account for it in endio
+	 */
+	atomic_inc(&bio->bi_remaining);
+
 	bio->bi_private = bbio->private;
 	bio->bi_end_io = bbio->end_io;
 	btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;

-


Re: [PATCH] xfstests: kill lib/random.c

2014-01-07 Thread Josef Bacik


On 01/07/2014 03:10 PM, Eric Sandeen wrote:

On 1/7/14, 2:01 PM, Ben Myers wrote:

Hey Gents,

On Mon, Jan 06, 2014 at 03:46:58PM -0600, Eric Sandeen wrote:

On 1/6/14, 3:42 PM, Josef Bacik wrote:

On 01/06/2014 04:32 PM, Eric Sandeen wrote:

On 1/6/14, 1:58 PM, Josef Bacik wrote:

I was trying to reproduce something with fsx and I noticed that no matter what
seed I set I was getting the same file.  Come to find out we are overloading
random() with our own custom horribleness for some unknown reason.  So nuke the
damn thing from orbit and rely on glibc's random().  With this fix the -S option
actually does something with fsx.  Thanks,

Hm, old comments seem to indicate that this was done handwave to make random
behave the same on different architectures (i.e. same result from same seed,
I guess?)  I . . . don't know if that is true of glibc's random(), is it?

I'd like to dig into the history just a bit before we yank this, just to
be sure.

I think that if we need the output to match based on a predictable
random() output then we've lost already. We shouldn't be checking for
specific output (like inode numbers or sizes etc) that are dependant
on random()'s behaviour, and if we are we need to fix those tests. So
even if that is why it was put in place originally I'd say it is high
time we ripped it out and fixed up any tests that rely on this
behaviour. Thanks,

Yeah, you're probably right.  And the ancient xfstests history seems to
be lost in the mists of time, at least as far as I can see.  So I'm ok
with this but let's let Dave & SGI chime in too just to be certain.

I did not have success locating the history prior to what we have posted on
oss.  I agree that it was likely added so that tests that expose output from
random into golden output files will have the same results across arches.
Maybe this is still of concern for folks who use a different c library with the
kernel.

Looks there are quite a few callers.  IMO if we're going to remove this we
should fix the tests first.

Or first, determine if they really need fixing.  Did you find tests which
actually contain the random results in the golden output?



I looked through stuff when I ripped it out and I couldn't find anything 
that relied on consistent numbers, we tend to filter all that stuff 
out.  I think ripping it out and waiting to see if somebody complains is 
a good way to figure out if things need fixing, but that may be less 
than friendly ;).  Thanks,


Josef


[PATCH] Btrfs-progs: deal with invalid key orderings and bad orphan items V2

2014-01-07 Thread Josef Bacik
A user had a fs where the objectid of an orphan item was not the actual orphan
item objectid. This screwed up fsck because the block has keys in the wrong
order; the fs scanning stuff will also freak out because we have an inode with
nlink 0 and no orphan item. So this patch is pretty big, but it is all related.

1) Deal with bad key ordering.  We can easily fix this up, so fix the checking
code to tell us exactly what it found when it says there is a problem.  Then,
if it's bad key ordering, we can reorder the keys and restart the scan (a small
illustrative sketch of the ordering check follows below).

2) Deal with bad keys.  If we find an orphan item with the wrong objectid it's
likely to screw with stuff, so keep track of these sorts of things with a
bad_item list and just run through and delete any objects that don't make sense.
So far we only do this for orphan items, but we could extend it as new problems
pop up.

3) Deal with missing orphan items.  This is easy: if we have a file with i_nlink
set to 0 and no orphan item, we can just add an orphan item.

4) Add the infrastructure to corrupt actual key values.  I needed this to create
a test image to verify I was fixing things properly.

This patch fixes the corrupt image I'm adding and passes the other make test
tests.  Thanks,
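
A small self-contained sketch of the ordering check mentioned in 1)
(simplified key struct, not the btrfs-progs code): keys in a block must be
non-decreasing in (objectid, type, offset) order, and a block that violates
this is what the reordering path repairs:

struct key {
	unsigned long long objectid;
	unsigned char type;
	unsigned long long offset;
};

/* compare in btrfs key order: objectid, then type, then offset */
int key_cmp(const struct key *a, const struct key *b)
{
	if (a->objectid != b->objectid)
		return a->objectid < b->objectid ? -1 : 1;
	if (a->type != b->type)
		return a->type < b->type ? -1 : 1;
	if (a->offset != b->offset)
		return a->offset < b->offset ? -1 : 1;
	return 0;
}

/* returns 1 if any adjacent pair of keys is out of order */
int keys_out_of_order(const struct key *keys, int nr)
{
	int i;

	for (i = 1; i < nr; i++)
		if (key_cmp(&keys[i - 1], &keys[i]) > 0)
			return 1;
	return 0;
}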

Signed-off-by: Josef Bacik jba...@fb.com
---
V1-V2: Fix btrfs-corrupt-block so it will actually compile.

 btrfs-corrupt-block.c   | 109 ++-
 cmds-check.c| 331 
 ctree.c | 135 +++--
 ctree.h |  32 ++-
 file-item.c |   2 +-
 tests/fsck-tests/003-bad-orphan-key.img | Bin 0 -> 4096 bytes
 6 files changed, 509 insertions(+), 100 deletions(-)
 create mode 100644 tests/fsck-tests/003-bad-orphan-key.img

diff --git a/btrfs-corrupt-block.c b/btrfs-corrupt-block.c
index f0c14a9..10cae00 100644
--- a/btrfs-corrupt-block.c
+++ b/btrfs-corrupt-block.c
@@ -105,6 +105,8 @@ static void print_usage(void)
 		"specify -i for the inode and -f for the field to corrupt)\n");
 	fprintf(stderr, "\t-m The metadata block to corrupt (must also "
 		"specify -f for the field to corrupt)\n");
+	fprintf(stderr, "\t-K The key to corrupt in the format "
+		"<num>,<num>,<num> (must also specify -f for the field)\n");
 	fprintf(stderr, "\t-f The field in the item to corrupt\n");
 	exit(1);
 }
@@ -306,6 +308,13 @@ enum btrfs_metadata_block_field {
 	BTRFS_METADATA_BLOCK_BAD,
 };
 
+enum btrfs_key_field {
+	BTRFS_KEY_OBJECTID,
+	BTRFS_KEY_TYPE,
+	BTRFS_KEY_OFFSET,
+	BTRFS_KEY_BAD,
+};
+
 static enum btrfs_inode_field convert_inode_field(char *field)
 {
 	if (!strncmp(field, "isize", FIELD_BUF_LEN))
@@ -328,6 +337,17 @@ convert_metadata_block_field(char *field)
 	return BTRFS_METADATA_BLOCK_BAD;
 }
 
+static enum btrfs_key_field convert_key_field(char *field)
+{
+	if (!strncmp(field, "objectid", FIELD_BUF_LEN))
+		return BTRFS_KEY_OBJECTID;
+	if (!strncmp(field, "type", FIELD_BUF_LEN))
+		return BTRFS_KEY_TYPE;
+	if (!strncmp(field, "offset", FIELD_BUF_LEN))
+		return BTRFS_KEY_OFFSET;
+	return BTRFS_KEY_BAD;
+}
+
 static u64 generate_u64(u64 orig)
 {
 	u64 ret;
@@ -337,6 +357,73 @@ static u64 generate_u64(u64 orig)
 	return ret;
 }
 
+static u8 generate_u8(u8 orig)
+{
+	u8 ret;
+	do {
+		ret = rand();
+	} while (ret == orig);
+	return ret;
+}
+
+static int corrupt_key(struct btrfs_root *root, struct btrfs_key *key,
+		       char *field)
+{
+	enum btrfs_key_field corrupt_field = convert_key_field(field);
+	struct btrfs_path *path;
+	struct btrfs_trans_handle *trans;
+	int ret;
+
+	root = root->fs_info->fs_root;
+	if (corrupt_field == BTRFS_KEY_BAD) {
+		fprintf(stderr, "Invalid field %s\n", field);
+		return -EINVAL;
+	}
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	trans = btrfs_start_transaction(root, 1);
+	if (IS_ERR(trans)) {
+		btrfs_free_path(path);
+		return PTR_ERR(trans);
+	}
+
+	ret = btrfs_search_slot(trans, root, key, path, 0, 1);
+	if (ret < 0)
+		goto out;
+	if (ret > 0) {
+		fprintf(stderr, "Couldn't find the key to corrupt\n");
+		ret = -ENOENT;
+		goto out;
+	}
+
+	switch (corrupt_field) {
+	case BTRFS_KEY_OBJECTID:
+		key->objectid = generate_u64(key->objectid);
+		break;
+	case BTRFS_KEY_TYPE:
+		key->type = generate_u8(key->type);
+		break;
+	case BTRFS_KEY_OFFSET:
+		key->offset = generate_u64(key->objectid);
+		break;
+	default:
+		fprintf(stderr, "Invalid field %s, %d\n", field,
+ 

Re: [block:for-3.14/core] kernel BUG at fs/bio.c:1748

2014-01-07 Thread Chris Mason
On Tue, 2014-01-07 at 12:15 -0800, Muthu Kumar wrote:
 Thanks Fengguang. Final patch with added comment. BTW, fengguang
 mentioned that git-am has trouble with the inline patch and quilt
 import worked fine for him...
 
 
 In btrfs_end_bio(), we increment bi_remaining if is_orig_bio. If not,
 we restore the orig_bio but failed to increment bi_remaining for
 orig_bio, which triggers a BUG_ON later when bio_endio is called. Fix
 is to increment bi_remaining when we restore the orig bio as well.
 

Hi everyone,

Which git tree is this against?  Just Jens' tree, or some extra code too?

I'll run some tests here.  My original patch is below (it's slightly
different from Muthu's).

Btrfs is sometimes calling bio_endio twice on the same bio while
we chain things.  This makes sure we don't trip over new assertions in
fs/bio.c

Signed-off-by: Chris Mason c...@fb.com

diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 7fcac70..5b30360 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -2289,6 +2289,10 @@ static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status)
 		block = next_block;
 	} while (NULL != block);
 
+	/*
+	 * since we're not using bio_endio here, we don't need to worry about
+	 * the remaining count
+	 */
 	bp->bi_end_io(bp, bio_error_status);
 }
 
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 62176ad..786ddac 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1684,7 +1684,7 @@ static void end_workqueue_fn(struct btrfs_work *work)
 	bio->bi_private = end_io_wq->private;
 	bio->bi_end_io = end_io_wq->end_io;
 	kfree(end_io_wq);
-	bio_endio(bio, error);
+	bio_endio_nodec(bio, error);
 }
 
 static int cleaner_kthread(void *arg)
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index ef48947..a31448f 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -5284,9 +5284,17 @@ static void btrfs_end_bio(struct bio *bio, int err)
 		}
 	}
 
-	if (bio == bbio->orig_bio)
+	if (bio == bbio->orig_bio) {
 		is_orig_bio = 1;
 
+		/*
+		 * eventually we will call the bi_endio for the original bio,
+		 * make sure that we've properly bumped bi_remaining to reflect
+		 * our chain of endios here
+		 */
+		atomic_inc(&bio->bi_remaining);
+	}
+
 	if (atomic_dec_and_test(&bbio->stripes_pending)) {
 		if (!is_orig_bio) {
 			bio_put(bio);


[PATCH] Btrfs-progs: bail if we find errors in the extent tree

2014-01-07 Thread Josef Bacik
For some reason we weren't exiting if there were errors in the extent tree,
which means we could have errors in just the extent tree and things like
xfstests would keep going because we completely throw away the return value.
Thanks,

Signed-off-by: Josef Bacik jba...@fb.com
---
 cmds-check.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/cmds-check.c b/cmds-check.c
index 689fe6c..cbabfdc 100644
--- a/cmds-check.c
+++ b/cmds-check.c
@@ -6409,8 +6409,10 @@ int cmd_check(int argc, char **argv)
 		goto out;
 	}
 	ret = check_chunks_and_extents(root);
-	if (ret)
+	if (ret) {
 		fprintf(stderr, "Errors found in extent allocation tree or chunk allocation\n");
+		goto out;
+	}
 
 	fprintf(stderr, "checking free space cache\n");
 	ret = check_space_cache(root);
-- 
1.8.3.1



Re: btrfsck does not fix

2014-01-07 Thread Hendrik Friedel

Hello,

 I ran btrfsck on my volume with the repair option. When I re-run 
it, I get the same errors as before.

It mounts without errors? So why then btrfsck/btrfs repair? What precipitated 
the repair?


I don't know what caused the damage, but a check revealed this:

Checking filesystem on /dev/sdb1
UUID: 989306aa-d291-4752-8477-0baf94f8c42f
checking extents

Extent back ref already exists for 2994950590464 parent 863072366592 root 0
Extent back ref already exists for 2994950836224 parent 863072366592 root 0
Extent back ref already exists for 862762737664 parent 863072366592 root 0
Extent back ref already exists for 2994950877184 parent 863072366592
[...]
Incorrect global backref count on 2995767250944 found 1 wanted 2
backpointer mismatch on [2995767250944 4096]
ref mismatch on [2995767304192 4096] extent item 1, found 2
Incorrect global backref count on 2995767304192 found 1 wanted 2
backpointer mismatch on [2995767304192 4096]
ref mismatch on [2995768258560 4096] extent item 1, found 2
Incorrect global backref count on 2995768258560 found 1 wanted 2
backpointer mismatch on [2995768258560 4096]
ref mismatch on [2995768459264 4096] extent item 1, found 2
Incorrect global backref count on 2995768459264 found 1 wanted 2
backpointer mismatch on [2995768459264 4096]
Errors found in extent allocation tree or chunk allocation

ref mismatch on [2995768459264 4096] extent item 1, found 2
Incorrect global backref count on 2995768459264 found 1 wanted 2
backpointer mismatch on [2995768459264 4096]
Errors found in extent allocation tree or chunk allocation
checking free space cache
checking fs roots
root 256 inode 9579 errors 100, file extent discount
root 256 inode 9580 errors 100, file extent discount
root 256 inode 14258 errors 100, file extent discount
root 256 inode 14259 errors 100, file extent discount
root  inode 9579 errors 100, file extent discount
root  inode 9580 errors 100, file extent discount
root  inode 14258 errors 100, file extent discount
root  inode 14259 errors 100, file extent discount
found 1993711951581 bytes used err is 1
total csum bytes: 4560615360
total tree bytes: 5643403264
total fs tree bytes: 139776000
total extent tree bytes: 263602176
btree space waste bytes: 504484726
file data blocks allocated: 6557032402944
 referenced 6540949323776
Btrfs v3.12

This made me run btrfsck with the repair option:


Extent back ref already exists for 2994950590464 parent 863072366592 root 0

ref mismatch on [32935936 4096] extent item 1, found 2
repair deleting extent record: key 32935936 168 4096
adding new tree backref on start 32935936 len 4096 parent 2994784206848 
root 2994784206848

Incorrect global backref count on 32935936 found 1 wanted 2
backpointer mismatch on [32935936 4096]
ref mismatch on [32997376 4096] extent item 1, found 2
repair deleting extent record: key 32997376 168 4096
adding new tree backref on start 32997376 len 4096 parent 2994824708096 
root 2994824708096

Incorrect global backref count on 32997376 found 1 wanted 2
backpointer mismatch on [32997376 4096]

Incorrect global backref count on 8988365651968 found 1 wanted 0
backpointer mismatch on [8988365651968 4096]
repaired damaged extent references
checking free space cache
checking fs roots
root 256 inode 9579 errors 100, file extent discount
root 256 inode 9580 errors 100, file extent discount
root 256 inode 14258 errors 100, file extent discount
root 256 inode 14259 errors 100, file extent discount
root  inode 9579 errors 100, file extent discount
root  inode 9580 errors 100, file extent discount
root  inode 14258 errors 100, file extent discount
root  inode 14259 errors 100, file extent discount
enabling repair mode
Checking filesystem on /dev/sdc1
UUID: 989306aa-d291-4752-8477-0baf94f8c42f
cache and super generation don't match, space cache will be invalidated
found 827360733827 bytes used err is 1
total csum bytes: 4446455380
total tree bytes: 5506977792
total fs tree bytes: 137293824
total extent tree bytes: 258691072
btree space waste bytes: 496921489
file data blocks allocated: 6440132583424
 referenced 6424163344384
Btrfs v3.12


After this, I ran a check without the repair option again and the same 
errors persist.


Greetings,
Hendrik


--
Hendrik Friedel
Auf dem Brink 12
28844 Weyhe
Mobil 0178 1874363
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] xfstests: kill lib/random.c

2014-01-07 Thread Ben Myers
On Tue, Jan 07, 2014 at 02:10:15PM -0600, Eric Sandeen wrote:
 On 1/7/14, 2:01 PM, Ben Myers wrote:
  Hey Gents,
  
  On Mon, Jan 06, 2014 at 03:46:58PM -0600, Eric Sandeen wrote:
  On 1/6/14, 3:42 PM, Josef Bacik wrote:
 
  On 01/06/2014 04:32 PM, Eric Sandeen wrote:
  On 1/6/14, 1:58 PM, Josef Bacik wrote:
  I was trying to reproduce something with fsx and I noticed that no 
  matter what
  seed I set I was getting the same file.  Come to find out we are 
  overloading
  random() with our own custom horribleness for some unknown reason.  So 
  nuke the
  damn thing from orbit and rely on glibc's random().  With this fix the 
  -S option
  actually does something with fsx.  Thanks,
  Hm, old comments seem to indicate that this was done handwave to make 
  random
  behave the same on different architectures (i.e. same result from same 
  seed,
  I guess?)  I . . . don't know if that is true of glibc's random(), is it?
 
  I'd like to dig into the history just a bit before we yank this, just to
  be sure.
 
  I think that if we need the output to match based on a predictable
  random() output then we've lost already. We shouldn't be checking for
  specific output (like inode numbers or sizes etc) that are dependent
  on random()'s behaviour, and if we are we need to fix those tests. So
  even if that is why it was put in place originally I'd say it is high
  time we ripped it out and fixed up any tests that rely on this
  behaviour. Thanks,
 
  Yeah, you're probably right.  And the ancient xfstests history seems to
  be lost in the mists of time, at least as far as I can see.  So I'm ok
  with this but let's let Dave & SGI chime in too just to be certain.
  
  I did not have success locating the history prior to what we have posted on
  oss.  I agree that it was likely added so that tests that expose output from
  random into golden output files will have the same results across arches.
  Maybe this is still of concern for folks who use a different C library with
  the kernel.
  
  Looks there are quite a few callers.  IMO if we're going to remove this we
  should fix the tests first.
 
 Or first, determine if they really need fixing.  Did you find tests which
 actually contain the random results in the golden output?

At one point random.c was modified because it was returning different test
results on i386 and ia64 with test 007.  Looks like nametest.c is a good
candidate.

-Ben
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] Btrfs-progs: bail if we find errors in the extent tree

2014-01-07 Thread Josef Bacik


On 01/07/2014 03:34 PM, Josef Bacik wrote:

For some reason we weren't exiting if there were errors in the extent tree,
which means we could have errors in just the extent tree and things like
xfstests would keep going because we completely throw away the return value.
Thanks,

Signed-off-by: Josef Bacik jba...@fb.com
---
  cmds-check.c | 4 +++-
  1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/cmds-check.c b/cmds-check.c
index 689fe6c..cbabfdc 100644
--- a/cmds-check.c
+++ b/cmds-check.c
@@ -6409,8 +6409,10 @@ int cmd_check(int argc, char **argv)
goto out;
}
ret = check_chunks_and_extents(root);
-   if (ret)
+   if (ret) {
 		fprintf(stderr, "Errors found in extent allocation tree or chunk allocation\n");
+   goto out;
+   }
  
  	fprintf(stderr, "checking free space cache\n");

ret = check_space_cache(root);
Sigh, ignore this; it is causing other problems, and I'm going to have to 
figure out a better solution.  Thanks,


Josef
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 5/6] Btrfs: use flags instead of the bool variants in delayed node

2014-01-07 Thread David Sterba
On Tue, Jan 07, 2014 at 11:59:18AM +0800, Miao Xie wrote:
 But I read a discussion about the use of the boolean type; some developers
 suggested we use bitfields instead of bool, because bitfields can work better
 and are more flexible and less prone to misuse than bool.
   https://lkml.org/lkml/2013/9/1/154

I was searching for pros/cons of the bools but haven't found this one
nor anything useful, though not all of the advantages of bitfields apply
in our case.

I've compared the generated assembly with just this patch, and the
difference is effectively only the lock prefix for set/clear, the read
side has no significant penalty:

old:
  mov    0x128(%rbx),%esi

new:
  mov    0x120(%rbx),%rax
  test   $0x1,%al


old set:
  movb   $0x1,0x120(%rbx)

new:
  lock orb $0x1,0x120(%rbx)

The delayed_node structure is relatively short lived, is reused
frequently and I haven't seen any contention points where the lock
prefix would hurt.

So, ok to merge it.
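
To make that trade-off concrete, here is a minimal userspace sketch (C11
atomics, not the btrfs code; the structure and flag names are made up) that
contrasts a plain bool store with an atomic read-modify-write on a shared
flags word -- the latter is what the lock-prefixed orb above corresponds to,
while the read side stays an ordinary load plus a test:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define NODE_FLAG_EXAMPLE 0x1UL          /* hypothetical flag bit */

struct node_bools { bool busy; };        /* old style: one bool per state  */
struct node_flags { atomic_ulong flags; };/* new style: bit flags in a word */

int main(void)
{
	struct node_bools a = { .busy = false };
	struct node_flags b;

	atomic_init(&b.flags, 0);

	a.busy = true;                           /* plain byte store (movb)     */
	atomic_fetch_or(&b.flags, NODE_FLAG_EXAMPLE); /* locked RMW, like lock orb */

	/* Read side is just a load plus a test, as in the quoted disassembly. */
	printf("bool=%d flag=%d\n", a.busy,
	       !!(atomic_load(&b.flags) & NODE_FLAG_EXAMPLE));
	return 0;
}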
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] xfstests: kill lib/random.c

2014-01-07 Thread Josef Bacik


On 01/07/2014 03:40 PM, Ben Myers wrote:

On Tue, Jan 07, 2014 at 02:10:15PM -0600, Eric Sandeen wrote:

On 1/7/14, 2:01 PM, Ben Myers wrote:

Hey Gents,

On Mon, Jan 06, 2014 at 03:46:58PM -0600, Eric Sandeen wrote:

On 1/6/14, 3:42 PM, Josef Bacik wrote:

On 01/06/2014 04:32 PM, Eric Sandeen wrote:

On 1/6/14, 1:58 PM, Josef Bacik wrote:

I was trying to reproduce something with fsx and I noticed that no matter what
seed I set I was getting the same file.  Come to find out we are overloading
random() with our own custom horribleness for some unknown reason.  So nuke the
damn thing from orbit and rely on glibc's random().  With this fix the -S option
actually does something with fsx.  Thanks,

Hm, old comments seem to indicate that this was done handwave to make random
behave the same on different architectures (i.e. same result from same seed,
I guess?)  I . . . don't know if that is true of glibc's random(), is it?

I'd like to dig into the history just a bit before we yank this, just to
be sure.

I think that if we need the output to match based on a predictable
random() output then we've lost already. We shouldn't be checking for
specific output (like inode numbers or sizes etc) that are dependent
on random()'s behaviour, and if we are we need to fix those tests. So
even if that is why it was put in place originally I'd say it is high
time we ripped it out and fixed up any tests that rely on this
behaviour. Thanks,

Yeah, you're probably right.  And the ancient xfstests history seems to
be lost in the mists of time, at least as far as I can see.  So I'm ok
with this but let's let Dave & SGI chime in too just to be certain.

I did not have success locating the history prior to what we have posted on
oss.  I agree that it was likely added so that tests that expose output from
random into golden output files will have the same results across arches.
Maybe this is still of concern for folks who use a different C library with the
kernel.

Looks there are quite a few callers.  IMO if we're going to remove this we
should fix the tests first.

Or first, determine if they really need fixing.  Did you find tests which
actually contain the random results in the golden output?

At one point random.c was modified because it was returning different test
results on i386 and ia64 with test 007.  Looks like nametest.c is a good
candidate.



Ugh you're right.  Just ignore this patch for now, I'll be in the corner 
banging my head against the wall.  Thanks,


Josef
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] xfstests: kill lib/random.c

2014-01-07 Thread Chris Mason
On Tue, 2014-01-07 at 16:17 -0500, Josef Bacik wrote:
 On 01/07/2014 03:40 PM, Ben Myers wrote:
  On Tue, Jan 07, 2014 at 02:10:15PM -0600, Eric Sandeen wrote:
  On 1/7/14, 2:01 PM, Ben Myers wrote:
  Hey Gents,
 
  On Mon, Jan 06, 2014 at 03:46:58PM -0600, Eric Sandeen wrote:
  On 1/6/14, 3:42 PM, Josef Bacik wrote:
  On 01/06/2014 04:32 PM, Eric Sandeen wrote:
  On 1/6/14, 1:58 PM, Josef Bacik wrote:
  I was trying to reproduce something with fsx and I noticed that no 
  matter what
  seed I set I was getting the same file.  Come to find out we are 
  overloading
  random() with our own custom horribleness for some unknown reason.  
  So nuke the
  damn thing from orbit and rely on glibc's random().  With this fix 
  the -S option
  actually does something with fsx.  Thanks,
  Hm, old comments seem to indicate that this was done handwave to 
  make random
  behave the same on different architectures (i.e. same result from same 
  seed,
  I guess?)  I . . . don't know if that is true of glibc's random(), is 
  it?
 
  I'd like to dig into the history just a bit before we yank this, just 
  to
  be sure.
  I think that if we need the output to match based on a predictable
  random() output then we've lost already. We shouldn't be checking for
  specific output (like inode numbers or sizes etc) that are dependent
  on random()'s behaviour, and if we are we need to fix those tests. So
  even if that is why it was put in place originally I'd say it is high
  time we ripped it out and fixed up any tests that rely on this
  behaviour. Thanks,
  Yeah, you're probably right.  And the ancient xfstests history seems to
  be lost in the mists of time, at least as far as I can see.  So I'm ok
  with this but let's let Dave & SGI chime in too just to be certain.
  I did not have success locating the history prior to what we have posted 
  on
  oss.  I agree that it was likely added so that tests that expose output 
  from
  random into golden output files will have the same results across arches.
  Maybe this is still of concern for folks who use a different C library 
  with the
  kernel.
 
  Looks there are quite a few callers.  IMO if we're going to remove this we
  should fix the tests first.
  Or first, determine if they really need fixing.  Did you find tests which
  actually contain the random results in the golden output?
  At one point random.c was modified because it was returning different test
  results on i386 and ia64 with test 007.  Looks like nametest.c is a good
  candidate.
 
 
 Ugh you're right.  Just ignore this patch for now, I'll be in the corner 
 banging my head against the wall.  Thanks,

For now we can just use srandom?

-chris
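
For what it's worth, a minimal sketch of that suggestion (a hypothetical
stand-in for fsx's -S handling, not the xfstests code): seed glibc's
generator explicitly so that different seeds actually produce different
operation streams, with the caveat from the thread that the sequence is only
guaranteed to repeat when the same C library is used:

#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
	/* Hypothetical -S handling: seed glibc's random() so each seed
	 * yields a distinct, repeatable sequence on this libc. */
	unsigned int seed = (argc > 1) ? (unsigned int)strtoul(argv[1], NULL, 0) : 1;

	srandom(seed);
	for (int i = 0; i < 4; i++)
		printf("%ld\n", random());
	return 0;
}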

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] Skip non-regular files in recursive defrag

2014-01-07 Thread Pascal VITOUX
On Mon, Jan 6, 2014 at 4:36 PM, David Sterba dste...@suse.cz wrote:

 On Wed, Jan 01, 2014 at 03:10:25PM +0100, Pascal VITOUX wrote:
  ---
   cmds-filesystem.c | 2 +-
   1 file changed, 1 insertion(+), 1 deletion(-)
 
  diff --git a/cmds-filesystem.c b/cmds-filesystem.c
  index 1c1926b..979dbd9 100644
  --- a/cmds-filesystem.c
  +++ b/cmds-filesystem.c
  @@ -646,7 +646,7 @@ static int defrag_callback(const char *fpath, const struct stat *sb,
int e = 0;
int fd = 0;
 
  - if (typeflag == FTW_F) {
  + if ((typeflag == FTW_F) && S_ISREG(sb->st_mode)) {

 The ftw(3) documentation states that FTW_F gets the regular files; do you
 have a reproducer where this does not work as expected?

Yes, this can be tested with the example code provided in ftw(3) and
compiled with glibc 2.18.

In fact, the ftw(3) page also states in the NOTES part that, under
Linux, FTW_F is used for all objects that are not a directory. Some
libc versions are specified (libc4, libc5 and glibc 2.0.6), but it's
still true in glibc 2.18.

The ftw man page is less precise than the libc manual, as you can
see here:
http://www.gnu.org/software/libc/manual/html_node/Working-with-Directory-Trees.html
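
To illustrate the behaviour under discussion, a small standalone sketch (not
the btrfs-progs code) of an nftw(3) callback that acts only on regular files,
since, depending on the libc, FTW_F can be reported for other non-directory
objects as well:

#define _XOPEN_SOURCE 500
#include <ftw.h>
#include <stdio.h>
#include <sys/stat.h>

static int defrag_like_cb(const char *fpath, const struct stat *sb,
                          int typeflag, struct FTW *ftwbuf)
{
	(void)ftwbuf;
	/* FTW_F alone is not enough: check the mode before treating the
	 * entry as something worth processing. */
	if (typeflag == FTW_F && S_ISREG(sb->st_mode))
		printf("would defrag: %s\n", fpath);
	return 0;	/* anything else: skip it and keep walking */
}

int main(int argc, char **argv)
{
	return nftw(argc > 1 ? argv[1] : ".", defrag_like_cb, 20, 0);
}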

 I think the check could be placed into cmd_defrag, there's a st_mode
 check for directories already. Defragmenting makes most sense for
 regular files (though there's the option to defrag portions of the
 metadata b-trees), so it's fine to handle just directories and regular
 files in userspace.

I think so, too. And for a single-file defrag, the check should return
an explicit error message to the user instead of the 'defrag range
ioctl not supported' error.

 If you're going to resend the patch, please add the 'btrfs-progs: ' string
 to the subject, write a short changelog and add your Signed-off-by line.

 thanks,
 david

Ok, thanks for these instructions. I will resend it soon.

Pascal.
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html



Re: btrfs-transaction blocked for more than 120 seconds

2014-01-07 Thread Marc MERLIN
On Fri, Jan 03, 2014 at 09:34:10PM +, Duncan wrote:
 IIRC someone also mentioned problems with autodefrag and an about 3/4 gig 
 systemd journal.  My gut feeling (IOW, *NOT* benchmarked!) is that double-
 digit MiB files should /normally/ be fine, but somewhere in the lower 
 triple digits, write-magnification could well become an issue, depending 
 of course on exactly how much active writing the app is doing into the 
 file.
 
When I defrag'ed my 83GB vm file with 156222 extents, it was not in use
or being written to.

 As I said there's more work going into tuning autodefrag ATM, but as it 
 is, I couldn't really recommend making it a global default... tho maybe a 
 distro could enable it by default on a no-VM desktop system (as opposed 
 to a server).  Certainly I'd recommend most desktop types enable it.

I use VMs on my desktop :) but point taken.

On Sun, Jan 05, 2014 at 10:09:38AM -0700, Chris Murphy wrote:
  gandalfthegreat:/var/local/nobck/VirtualBox VMs/Win7# filefrag Win7.vdi 
  Win7.vdi: 156222 extents found
  
  Considering how virtualbox works, that's hardly surprising.
 
 I haven't read anything so far indicating defrag applies to the VM container 
 use case, rather nodatacow via xattr +C is the way to go. At least for now.
 
Yep, I'll convert the file, but since I found a pretty severe
performance problem, does anyone care to get details off my system
before I make the problem go away for me?
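
As an aside on the +C suggestion, a minimal sketch of setting the NOCOW
attribute programmatically -- the same bit chattr +C toggles -- assuming a
Linux system with <linux/fs.h>; note that on btrfs it only takes effect on an
empty file, or on a directory so that newly created files inherit it:

#include <fcntl.h>
#include <linux/fs.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	int fd, attr = 0;

	if (argc < 2) {
		fprintf(stderr, "usage: %s <empty-file-or-dir>\n", argv[0]);
		return 1;
	}
	fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* Read the current inode flags, then OR in the NOCOW bit. */
	if (ioctl(fd, FS_IOC_GETFLAGS, &attr) == 0) {
		attr |= FS_NOCOW_FL;		/* nodatacow, i.e. chattr +C */
		if (ioctl(fd, FS_IOC_SETFLAGS, &attr) != 0)
			perror("FS_IOC_SETFLAGS");
	} else {
		perror("FS_IOC_GETFLAGS");
	}
	close(fd);
	return 0;
}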

 It's better than a panic or corrupt data. So far the best combination

To be honest, I'd have taken a panic; it would have saved me 2 hours of
waiting for a laptop to recover when it was never going to recover :(
Data corruption, sure, obviously :)

 I've found, open to other suggestions though, is +C xattr on

So you're saying that defragmentation has known performance problems
that can't be fixed for now, and that the solution is to avoid getting
fragmented, or to recreate the relevant files.
If so, I'll go ahead, I just wanted to make sure I didn't have useful
debug state before clearing my problem.

 This may already be a known problem but it's worth sysrq+w, and then dmesg 
 and posting those results if you haven't already.

No, I had not yet, but I'll do this.

On Sun, Jan 05, 2014 at 01:44:25PM -0700, Duncan wrote:
 [I normally try to reply directly to list but don't believe I've seen
 this there yet, but got it direct-mailed so will reply-all in response.]
 
I like direct Cc on replies, makes my filter and mutt coloring happier
:)
Dupes with the same message-id are what procmail and others were written
for :)

 I now believe the lockup must be due to processing the hundreds of
 thousands of extents on all those snapshots, too, in addition to doing

That's a good call. I do have this:
gandalfthegreat:/mnt/btrfs_pool1# ls var
var/  var_hourly_20140105_16:00:01/
var_daily_20140102_00:01:01/  var_hourly_20140105_17:00:26/
var_daily_20140103_00:59:28/  var_weekly_20131208_00:02:02/
var_daily_20140104_00:01:01/  var_weekly_20131215_00:02:01/
var_daily_20140105_00:33:14/  var_weekly_20131229_00:02:02/
var_hourly_20140105_05:00:01/ var_weekly_20140105_00:33:14/

 it on the main volume.  I don't actually make very extensive use of
 snapshots here anyway, so I didn't think about that aspect originally,
 but that's gotta be what's throwing the real spanner in the works,
 turning a possibly long but workable normal defrag (O(1)) into a lockup
 scenario (O(n)) where virtually no progress is made as currently
 coded.

That is indeed what I'm seeing, so it's very possible you're right.

Marc
-- 
A mouse is a device used to point at the xterm you want to type in - A.S.R.
Microsoft is to operating systems 
   what McDonalds is to gourmet cooking
Home page: http://marc.merlins.org/ | PGP 1024R/763BE901
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v4 00/18] Replace btrfs_workers with kernel workqueue based btrfs_workqueue

2014-01-07 Thread Qu Wenruo

On Mon, 23 Dec 2013 10:35:04 +0800, Qu Wenruo wrote:

On Fri, 20 Dec 2013 05:30:48 -0800, Josef Bacik wrote:


On 12/19/2013 07:08 PM, Qu Wenruo wrote:

I'm sorry, but I failed to reproduce the problem.
Btrfs/012 in xfstests has been run for several hours but nothing 
happened.


Would you please give me some more details about the environment or 
the panic backtrace?




Ok so it wasn't that test, it was just ./check -g auto. It would 
sometimes die at btrfs/012, other times it would make it as far as 
generic/083 before it keeled over. I bisected it down to


btrfs: Replace fs_info-workers with btrfs_workqueue

which of course is just the first patch that you start using the new 
code which isn't helpful. My dmesg from one of my runs is here


http://ur1.ca/g867b

All of the initial panics looked exactly the same. I'm just going to 
kick the series out for now while you track down the problem. Thanks,


Josef
--
To unsubscribe from this list: send the line unsubscribe 
linux-btrfs in

the body of a message to majord...@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html


Thanks for your dmesg a lot.

It's quite sad that I still failed to reproduce the same bug in any of my 
test environments during the weekend.


But according to the dmesg, which has expired by now, I made the 
following assumption:

There is a possibility that out-of-order execution caused the work->wq 
= wq store to be executed after the queue_work() call, while 
normal_work_helper ran instantly on another CPU, which caused the problem.


--
static inline void __btrfs_queue_work(struct __btrfs_workqueue *wq,
                                      struct btrfs_work *work)
{
        unsigned long flags;

        work->wq = wq;
        /* There is no memory barrier to ensure the work->wq = wq store is
         * executed before queue_work() */
        thresh_queue_hook(wq);
        if (work->ordered_func) {
                spin_lock_irqsave(&wq->list_lock, flags);
                list_add_tail(&work->ordered_list, &wq->ordered_list);
                spin_unlock_irqrestore(&wq->list_lock, flags);
        }
        queue_work(wq->normal_wq, &work->normal_work);
}
--

The memory-ordering problem involves the compiler or even the CPU hardware 
(AMD CPUs seem to have a weaker memory order than Intel), so to simulate the 
bug I used the following code to try to reproduce it, and the result is 
pretty much the same as your dmesg.

--
static inline void __btrfs_queue_work(struct __btrfs_workqueue *wq,
                                      struct btrfs_work *work)
{
        unsigned long flags;
        int in_order = get_random_int() % 1000;

        if (in_order)
                work->wq = wq;
        thresh_queue_hook(wq);
        if (work->ordered_func) {
                spin_lock_irqsave(&wq->list_lock, flags);
                list_add_tail(&work->ordered_list, &wq->ordered_list);
                spin_unlock_irqrestore(&wq->list_lock, flags);
        }
        queue_work(wq->normal_wq, &work->normal_work);

        /* Try to make the code out of order, since the Intel CPU seems
         * to obey the memory order quite well */
        if (!in_order)
                work->wq = wq;
}
--

The fix is as easy as just adding an smp_wmb() right after the work->wq = 
wq assignment, but since I failed to reproduce the bug,
I can't confirm whether the fix is good or not.
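
Not a substitute for testing the actual patch, but as a userspace analogue of
the ordering requirement described above (illustrative names, C11 atomics in
place of the kernel primitives): the producer must publish work->wq before
the work item becomes visible to the worker, which is the role the smp_wmb()
before queue_work() -- or a release/acquire pairing -- would play:

#include <pthread.h>
#include <stdatomic.h>
#include <stddef.h>
#include <stdio.h>

struct workqueue { const char *name; };
struct work      { struct workqueue *wq; };

static struct work w;
static _Atomic(struct work *) pending = NULL;	/* stands in for queue_work() */

static void *worker(void *arg)
{
	struct work *item;

	(void)arg;
	/* Acquire load pairs with the producer's release store, so item->wq
	 * is guaranteed to be initialized by the time we read it. */
	while ((item = atomic_load_explicit(&pending, memory_order_acquire)) == NULL)
		;
	printf("worker sees wq=%s\n", item->wq ? item->wq->name : "(NULL!)");
	return NULL;
}

int main(void)
{
	static struct workqueue q = { "normal_wq" };
	pthread_t t;

	pthread_create(&t, NULL, worker, NULL);

	w.wq = &q;			/* publish the field first...           */
	atomic_store_explicit(&pending, &w, memory_order_release);
	/* ...then hand the item over; the release store is the userspace
	 * counterpart of ordering the store before queue_work(). */

	pthread_join(t, NULL);
	return 0;
}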

I know it's impolite, but would you please try the smp_wmb() fix 
first to confirm it?


If the fix is good I'll send the formal V5 patchset.

Thanks

Qu

Happy new year!

Very sorry for disturbing you, but it has been some time since my last reply
and there has been no feedback since then (mainly because of the new year holiday).

If it is convenient for you, would you please give some feedback about the 
smp_wmb() fix?


Also, in case smp_wmb() doesn't work, it would be very nice of you to
also try the smp_mb() fix.

Thanks
Qu
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html