[PATCH] Btrfs: make some functions return void

2011-08-25 Thread Tsutomu Itoh
The type of some functions that return only 0 is changed to 'void'.
In addition, the check on the return value in the caller of these
functions becomes unnecessary. So, these check is removed.

Signed-off-by: Tsutomu Itoh 
---
 fs/btrfs/async-thread.c |   17 
 fs/btrfs/async-thread.h |4 +-
 fs/btrfs/compression.c  |   14 --
 fs/btrfs/delayed-ref.c  |   61 ---
 fs/btrfs/disk-io.c  |   41 +++
 fs/btrfs/disk-io.h  |2 +-
 fs/btrfs/extent-tree.c  |   11 +++-
 fs/btrfs/extent_io.c|   32 ++--
 fs/btrfs/extent_io.h|2 +-
 fs/btrfs/locking.c  |3 +-
 fs/btrfs/locking.h  |2 +-
 fs/btrfs/ordered-data.c |   30 --
 fs/btrfs/ordered-data.h |   16 ++--
 fs/btrfs/root-tree.c|4 +--
 fs/btrfs/transaction.c  |9 ++
 fs/btrfs/transaction.h  |4 +-
 fs/btrfs/tree-log.c |   21 ++--
 fs/btrfs/tree-log.h |8 +++---
 fs/btrfs/volumes.c  |   11 +++-
 fs/btrfs/volumes.h  |2 +-
 20 files changed, 118 insertions(+), 176 deletions(-)

diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 7ec1409..5e9998a 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -177,11 +177,11 @@ out:
spin_unlock_irqrestore(&workers->lock, flags);
 }
 
-static noinline int run_ordered_completions(struct btrfs_workers *workers,
-   struct btrfs_work *work)
+static noinline void run_ordered_completions(struct btrfs_workers *workers,
+struct btrfs_work *work)
 {
if (!workers->ordered)
-   return 0;
+   return;
 
set_bit(WORK_DONE_BIT, &work->flags);
 
@@ -219,7 +219,6 @@ static noinline int run_ordered_completions(struct 
btrfs_workers *workers,
}
 
spin_unlock(&workers->order_lock);
-   return 0;
 }
 
 static void put_worker(struct btrfs_worker_thread *worker)
@@ -405,7 +404,7 @@ again:
 /*
  * this will wait for all the worker threads to shutdown
  */
-int btrfs_stop_workers(struct btrfs_workers *workers)
+void btrfs_stop_workers(struct btrfs_workers *workers)
 {
struct list_head *cur;
struct btrfs_worker_thread *worker;
@@ -433,7 +432,6 @@ int btrfs_stop_workers(struct btrfs_workers *workers)
put_worker(worker);
}
spin_unlock_irq(&workers->lock);
-   return 0;
 }
 
 /*
@@ -618,14 +616,14 @@ found:
  * it was taken from.  It is intended for use with long running work functions
  * that make some progress and want to give the cpu up for others.
  */
-int btrfs_requeue_work(struct btrfs_work *work)
+void btrfs_requeue_work(struct btrfs_work *work)
 {
struct btrfs_worker_thread *worker = work->worker;
unsigned long flags;
int wake = 0;
 
if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags))
-   goto out;
+   return;
 
spin_lock_irqsave(&worker->lock, flags);
if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags))
@@ -652,9 +650,6 @@ int btrfs_requeue_work(struct btrfs_work *work)
if (wake)
wake_up_process(worker->task);
spin_unlock_irqrestore(&worker->lock, flags);
-out:
-
-   return 0;
 }
 
 void btrfs_set_work_high_prio(struct btrfs_work *work)
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index 5077746..6a9d3c1 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -111,9 +111,9 @@ struct btrfs_workers {
 
 int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work);
 int btrfs_start_workers(struct btrfs_workers *workers, int num_workers);
-int btrfs_stop_workers(struct btrfs_workers *workers);
+void btrfs_stop_workers(struct btrfs_workers *workers);
 void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max,
struct btrfs_workers *async_starter);
-int btrfs_requeue_work(struct btrfs_work *work);
+void btrfs_requeue_work(struct btrfs_work *work);
 void btrfs_set_work_high_prio(struct btrfs_work *work);
 #endif
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 8ec5d86..9a9f138 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -225,8 +225,8 @@ out:
  * Clear the writeback bits on all of the file
  * pages for a compressed write
  */
-static noinline int end_compressed_writeback(struct inode *inode, u64 start,
-unsigned long ram_size)
+static noinline void end_compressed_writeback(struct inode *inode, u64 start,
+ unsigned long ram_size)
 {
unsigned long index = start >> PAGE_CACHE_SHIFT;
unsigned long end_index = (start + ram_size - 1) >> PAGE_CACHE_SHIFT;
@@ -252,7 +252,6 @@ static noinline int end_compressed_writeback(struct inode 
*inode, u64 start,
   

Re: BTRFS and power loss ~= corruption?

2011-08-25 Thread Arne Jansen
On 26.08.2011 01:01, Gregory Maxwell wrote:
> On Wed, Aug 24, 2011 at 9:11 AM, Berend Dekens  wrote:
> [snip]
>> I thought the idea of COW was that whatever happens, you can always mount in
>> a semi-consistent state?
> [snip]
> 
> 
> It seems to me that if someone created a block device which recorded
> all write operations a rather excellent test could be constructed
> where a btrfs filesystem is recorded under load and then every partial
> replay is mounted and checked for corruption/data loss.
> 
> This would result in high confidence that no power loss event could
> destroy data given the offered load assuming well behaved
> (non-reordering hardware).  If it recorded barrier operations the a
> tool could also try many (but probably not all) permissible
> reorderings at every truncation offset.
> 

I like the idea. Some more thoughts:
 - instead of trying all reorderings it might be enough to just always
   deliver the oldest possible copy
 - the order in which btrfs writes the data probably depends on the
   order in which the device acknowledges the request. You might need
   to add some reordering there, too
 - you need to produce a wide variety of workloads, as problems might
   only occur at a special kind of it (directIO, fsync, snapshots...)
 - if there really is a regression somewhere, it would be good to also
   include the full block layer into the test, as the regression might
   not be in btrfs at all
 - as a first small step one could just use blktrace to record the write
   order and analyze the order on mount as well

> It seems to me that the existence of this kind of testing is something
> that should be expected of a modern filesystem before it sees
> widescale production use.
> --
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] xfstests 255: add a seek_data/seek_hole tester

2011-08-25 Thread Marco Stornelli
2011/8/26 Dave Chinner :
> On Thu, Aug 25, 2011 at 12:51:56AM -0600, Andreas Dilger wrote:
>> On 2011-08-25, at 12:40 AM, Dave Chinner wrote:
>> > On Thu, Aug 25, 2011 at 02:06:32AM -0400, Christoph Hellwig wrote:
>> >> On Tue, Jun 28, 2011 at 11:33:19AM -0400, Josef Bacik wrote:
>> >>> This is a test to make sure seek_data/seek_hole is acting like it does on
>> >>> Solaris.  It will check to see if the fs supports finding a hole or not 
>> >>> and will adjust as necessary.
>> >>
>> >> Can you resend this with any updates that happened in the meantime?
>> >>
>> >> Dave also still had some comments about semantics, so it might be worth
>> >> to incorporate that as well.
>> >
>> > The main questions I had when looking at this was how we should
>> > handle unwritten extents - the only answer I got was along the lines
>> > of "we'll deal with that once filesystems have implemented
>> > something". That's a bit of a chicken-and-egg situation, and doesn't
>> > help me decide what is the best thing to do. I don't want to have to
>> > re-implement this code when it's decided i did the wrong thing
>> > initially.
>>
>> Let's first clarify what you mean by an unwritten extent?  Do you mean a
>> preallocated extent that returns 0 when read,
>
> Exactly that.
>
>> or do you mean a delayed
>> allocation extent that was written by the application that is still in
>> memory but not yet written to disk?
>
> That's not an unwritten extent - that's a delayed allocation extent ;)
>
>> Unfortunately, ZFS has no concept of preallocated extents, so we can't
>> look to it for precedent, but it definitely has delayed allocation.
>> Possibly if UFS on Solaris has SEEK_HOLE and also preallocated extents
>> (I have no idea) it could be tested?
>>
>> > The most basic question I really want answered is this:
>> >
>> >     - is preallocated space a HOLE, or is it DATA?
>> >
>> > Whatever the answer, I think it should be consistently
>> > presented by all filesystems that support preallocation, and it
>> > should be encoded into the generic SEEK_HOLE/SEEK_DATA tests
>>
>> My thought would be that a preallocated extent is still a HOLE, because
>> it doesn't contain data that an application actually cares about.  On
>> the other hand, a delalloc extent is DATA because it has something that
>> an application cares about.
>
> OK, that's the way I'd expect to treat both preallocated and
> delalloc space.
>
>> > Answering that question then helps answer the more complex questions
>> > I had, like:
>> >
>> >     - what does SEEK_DATA return when you have a file layout
>> >       like "hole-prealloc-data"?
>>
>> I would think only the "data" part, since that is what the definition
>> of "SEEK_DATA" is IMHO.
>
> Agreed, that's the way I'd interpret it, too. So perhaps we need to
> ensure that this interpretation is actually tested by this test?
>
> How about some definitions to work by:
>
> Data: a range of the file that contains valid data, regardless of
> whether it exists in memory or on disk. The valid data can be
> preceeded and/or followed by an arbitrary number of zero bytes
> dependent on the underlying implementation of hole detection.
>
> Hole: a range of the file that contains no data or is made up
> entirely of  NULL (zero) data. Holes include preallocated ranges of
> files that have not had actual data written to them.
>
> Does that make sense? It has sufficient flexibility in it for the

No for me. A hole is made up of zero data? It's a strange definition
for me. A hole is simply no data. With this definition a hole can be
start from a block with data but partially filled, and fs should check
instead of presence of a block, the data content. I don't like it very
much. For the fallocate case, we need only a simple rule: no check is
done beyond eof. At this point it's clear that we can check the file
block allocation till i_size, don't care if there are other blocks
allocated. I don't know if it's applicable even for unwritten extents
case.

Marco
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC] Btrfs design defect in extent backref ?

2011-08-25 Thread Li Zefan
Yan, Zheng wrote:
> On Fri, Aug 26, 2011 at 10:00 AM, Li Zefan  wrote:
>> Yan, Zheng wrote:
>>> On Thu, Aug 25, 2011 at 3:56 PM, Li Zefan  wrote:
 We have an offset in file extent to indicate its position in the
 corresponding extent item in extent tree. We also have an offset in
 extent item to indicate the start position of the file extent that
 uses this item.

 The math is:

extent_item.extent_data_ref.offset = file_pos - 
 file_extent.extent_offset.

   e1
 disk extents:|--|
 ^
 |  e2
 |  |-|
 |  |   ^
 |  |   |
 v  v   |
 file extents:|- f1 -|- f2 -|

 So it looks like e2.offset points to f1 not f2. Therefore given an extent 
 item,
 we'll have to search through all the file extents in an inode to find the
 relative file extent in the worst case, which makes this field somewhat 
 useless.

>>>
>>> The reason for this is reducing number of file extent backref itmes.
>>
>> It seems to me a rare case, which isn't worth the complexity and 
>> inconvenience
>> it brings, and it requires an extra field (.count).
>>
> Random write workload isn't a rare case.
> 

Ah, I was thinking about the clone ioctl, and ignoring other situations.

Thanks for your clarification.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC] Btrfs design defect in extent backref ?

2011-08-25 Thread Yan, Zheng
On Fri, Aug 26, 2011 at 10:00 AM, Li Zefan  wrote:
> Yan, Zheng wrote:
>> On Thu, Aug 25, 2011 at 3:56 PM, Li Zefan  wrote:
>>> We have an offset in file extent to indicate its position in the
>>> corresponding extent item in extent tree. We also have an offset in
>>> extent item to indicate the start position of the file extent that
>>> uses this item.
>>>
>>> The math is:
>>>
>>>    extent_item.extent_data_ref.offset = file_pos - 
>>> file_extent.extent_offset.
>>>
>>>                       e1
>>> disk extents:    |--|
>>>                 ^
>>>                 |                  e2
>>>                 |          |-|
>>>                 |          |   ^
>>>                 |          |   |
>>>                 v          v   |
>>> file extents:    |- f1 -|- f2 -|
>>>
>>> So it looks like e2.offset points to f1 not f2. Therefore given an extent 
>>> item,
>>> we'll have to search through all the file extents in an inode to find the
>>> relative file extent in the worst case, which makes this field somewhat 
>>> useless.
>>>
>>
>> The reason for this is reducing number of file extent backref itmes.
>
> It seems to me a rare case, which isn't worth the complexity and inconvenience
> it brings, and it requires an extra field (.count).
>
Random write workload isn't a rare case.

>> we don't have to search all the file extents because the file extent size
>> is limited and we have extent_data_ref.count.
>
> Yes we have to, and for a big file with many small file extents, the extent
> number is not trivial.
>
Max file extent size is 128M, so only need to scan a 128M range in the
worst case.

>>
>>> What makes things worse is the above fomula can make the offset a negative
>>> value (cast to u64):
>>>
>>>    # touch /mnt/dst
>>>    # clone_range -s 8192 -d 4096 /mnt/src /mnt/dst
>>>    # umount /mnt
>>>    # btrfs-debug-tree /dev/sda7
>>>    ...
>>>        item 2 key (12582912 EXTENT_ITEM 49152) itemoff 3865 itemsize 82
>>>                extent refs 2 gen 8 flags 1
>>>                extent data backref root 5 objectid 258 offset 
>>> 18446744073709543424 count 1
>>>                extent data backref root 5 objectid 257 offset 0 count 1
>>>    ...
>>>
>>> and relocation won't work in this case:
>>>
>>>    # mount /dev/sda7 /mnt
>>>    # rm /mnt/src
>>>    # sync
>>>    # btrfs fi bal /mnt
>>>    (kernel warning !!)
>>>    (hung up !!)
>>>
>>> I don't see the necessity or benefit of the substraction in the fomula,
>>> and I think the correct one is:
>>>
>>>    extent_item.extent_data_ref.offset = file_pos
>>>
>>> (As a side effect thereafter we don't need extent_data_ref.count)
>>>
>>> That's what this patch does. Unfornately it is an incompatable change
>>> in disk format.
>>>
>>> So I think we have to live with this defect, just fix relocation for
>>> the negative offset case ?
>>
>> I prefer fixing relocation.
>>
>
> Sure, though I would prefer the alternative if not for the stablity of
> disk format.
>
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC] Btrfs design defect in extent backref ?

2011-08-25 Thread Li Zefan
Yan, Zheng wrote:
> On Thu, Aug 25, 2011 at 3:56 PM, Li Zefan  wrote:
>> We have an offset in file extent to indicate its position in the
>> corresponding extent item in extent tree. We also have an offset in
>> extent item to indicate the start position of the file extent that
>> uses this item.
>>
>> The math is:
>>
>>extent_item.extent_data_ref.offset = file_pos - file_extent.extent_offset.
>>
>>   e1
>> disk extents:|--|
>> ^
>> |  e2
>> |  |-|
>> |  |   ^
>> |  |   |
>> v  v   |
>> file extents:|- f1 -|- f2 -|
>>
>> So it looks like e2.offset points to f1 not f2. Therefore given an extent 
>> item,
>> we'll have to search through all the file extents in an inode to find the
>> relative file extent in the worst case, which makes this field somewhat 
>> useless.
>>
> 
> The reason for this is reducing number of file extent backref itmes.

It seems to me a rare case, which isn't worth the complexity and inconvenience
it brings, and it requires an extra field (.count).

> we don't have to search all the file extents because the file extent size
> is limited and we have extent_data_ref.count.

Yes we have to, and for a big file with many small file extents, the extent
number is not trivial.

> 
>> What makes things worse is the above fomula can make the offset a negative
>> value (cast to u64):
>>
>># touch /mnt/dst
>># clone_range -s 8192 -d 4096 /mnt/src /mnt/dst
>># umount /mnt
>># btrfs-debug-tree /dev/sda7
>>...
>>item 2 key (12582912 EXTENT_ITEM 49152) itemoff 3865 itemsize 82
>>extent refs 2 gen 8 flags 1
>>extent data backref root 5 objectid 258 offset 
>> 18446744073709543424 count 1
>>extent data backref root 5 objectid 257 offset 0 count 1
>>...
>>
>> and relocation won't work in this case:
>>
>># mount /dev/sda7 /mnt
>># rm /mnt/src
>># sync
>># btrfs fi bal /mnt
>>(kernel warning !!)
>>(hung up !!)
>>
>> I don't see the necessity or benefit of the substraction in the fomula,
>> and I think the correct one is:
>>
>>extent_item.extent_data_ref.offset = file_pos
>>
>> (As a side effect thereafter we don't need extent_data_ref.count)
>>
>> That's what this patch does. Unfornately it is an incompatable change
>> in disk format.
>>
>> So I think we have to live with this defect, just fix relocation for
>> the negative offset case ?
> 
> I prefer fixing relocation.
> 

Sure, though I would prefer the alternative if not for the stablity of
disk format.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] xfstests 255: add a seek_data/seek_hole tester

2011-08-25 Thread Dave Chinner
On Thu, Aug 25, 2011 at 12:51:56AM -0600, Andreas Dilger wrote:
> On 2011-08-25, at 12:40 AM, Dave Chinner wrote:
> > On Thu, Aug 25, 2011 at 02:06:32AM -0400, Christoph Hellwig wrote:
> >> On Tue, Jun 28, 2011 at 11:33:19AM -0400, Josef Bacik wrote:
> >>> This is a test to make sure seek_data/seek_hole is acting like it does on
> >>> Solaris.  It will check to see if the fs supports finding a hole or not 
> >>> and will adjust as necessary.
> >> 
> >> Can you resend this with any updates that happened in the meantime?
> >> 
> >> Dave also still had some comments about semantics, so it might be worth
> >> to incorporate that as well.
> > 
> > The main questions I had when looking at this was how we should
> > handle unwritten extents - the only answer I got was along the lines
> > of "we'll deal with that once filesystems have implemented
> > something". That's a bit of a chicken-and-egg situation, and doesn't
> > help me decide what is the best thing to do. I don't want to have to
> > re-implement this code when it's decided i did the wrong thing
> > initially.
> 
> Let's first clarify what you mean by an unwritten extent?  Do you mean a
> preallocated extent that returns 0 when read,

Exactly that.

> or do you mean a delayed
> allocation extent that was written by the application that is still in
> memory but not yet written to disk?

That's not an unwritten extent - that's a delayed allocation extent ;)

> Unfortunately, ZFS has no concept of preallocated extents, so we can't
> look to it for precedent, but it definitely has delayed allocation.
> Possibly if UFS on Solaris has SEEK_HOLE and also preallocated extents
> (I have no idea) it could be tested?
> 
> > The most basic question I really want answered is this:
> > 
> > - is preallocated space a HOLE, or is it DATA?
> > 
> > Whatever the answer, I think it should be consistently
> > presented by all filesystems that support preallocation, and it
> > should be encoded into the generic SEEK_HOLE/SEEK_DATA tests
> 
> My thought would be that a preallocated extent is still a HOLE, because
> it doesn't contain data that an application actually cares about.  On
> the other hand, a delalloc extent is DATA because it has something that
> an application cares about.

OK, that's the way I'd expect to treat both preallocated and
delalloc space.

> > Answering that question then helps answer the more complex questions
> > I had, like:
> > 
> > - what does SEEK_DATA return when you have a file layout
> >   like "hole-prealloc-data"?
> 
> I would think only the "data" part, since that is what the definition
> of "SEEK_DATA" is IMHO.

Agreed, that's the way I'd interpret it, too. So perhaps we need to
ensure that this interpretation is actually tested by this test?

How about some definitions to work by:

Data: a range of the file that contains valid data, regardless of
whether it exists in memory or on disk. The valid data can be
preceeded and/or followed by an arbitrary number of zero bytes
dependent on the underlying implementation of hole detection.

Hole: a range of the file that contains no data or is made up
entirely of  NULL (zero) data. Holes include preallocated ranges of
files that have not had actual data written to them.

Does that make sense? It has sufficient flexibility in it for the
existing generic "non-implementation", allows for filesystems to
define their own hole detection boundaries (e.g. filesystem block
size), and effectively defines how preallocated ranges from
fallocate() should be treated (i.e. as holes). If we can agree on
those definitions, I think that we should document them in both the
kernel and the man page that defines SEEK_HOLE/SEEK_DATA so everyone
is on the same page...

Cheers,

Dave.
-- 
Dave Chinner
da...@fromorbit.com
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Honest timeline for btrfsck

2011-08-25 Thread Yalonda Gishtaka
Hugo Mills  carfax.org.uk> writes:
> 
>If this is in decent shape, it's probably worth it to release it in
> its current form anyway (and possibly request a moratorium on extra
> patches until you've finished the optimisation). I suspect that
> there's a number of people out there who wouldn't mind the speed
> issues just to get a filesystem back.

I second this.  I'd rather have a slow btrfsck tool than just the promise of one
that rivals the Duke Nukem Forever release schedule.



--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: BTRFS and power loss ~= corruption?

2011-08-25 Thread Gregory Maxwell
On Wed, Aug 24, 2011 at 9:11 AM, Berend Dekens  wrote:
[snip]
> I thought the idea of COW was that whatever happens, you can always mount in
> a semi-consistent state?
[snip]


It seems to me that if someone created a block device which recorded
all write operations a rather excellent test could be constructed
where a btrfs filesystem is recorded under load and then every partial
replay is mounted and checked for corruption/data loss.

This would result in high confidence that no power loss event could
destroy data given the offered load assuming well behaved
(non-reordering hardware).  If it recorded barrier operations the a
tool could also try many (but probably not all) permissible
reorderings at every truncation offset.

It seems to me that the existence of this kind of testing is something
that should be expected of a modern filesystem before it sees
widescale production use.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: BTRFS and power loss ~= corruption?

2011-08-25 Thread Maciej Marcin Piechotka
On Thu, 2011-08-25 at 19:55 +0200, Martin Steigerwald wrote:
> That said I also do not have any issues with BTRFS on a ThinkPad T23
> for / 
> and /home. But then the machine has an hibernate-to-disk-and-resume
> uptime 
> of almost 120 days, so it didn´t see a power loss for a long time.
> Thats 
> still with 2.6.38.4.
> 

Which method of hibernation are you using?

I got enormous problems with btrfs+toi including:

 - Freezes resulting in umountable partition (twice so far, 2 results in
google including one message I sent to list)
 - Sometimes a random program (Skype, Firefox) cannot be frozen and
stacktracke includes btrfs AIO.

regards


signature.asc
Description: This is a digitally signed message part


WARNING in -rc3-00023-g14c62e7

2011-08-25 Thread Diego Calleja
This is what I got in my dmesg...I have lots of these warnings, didn't
notice it until now.

[  235.705766] [ cut here ]
[  235.705772] WARNING: at fs/inode.c:1309 iput+0x1ed/0x210()
[  235.705773] Hardware name: 
[  235.705774] Modules linked in: ipt_REJECT xt_tcpudp nf_conntrack_ipv4 
nf_defrag_ipv4 xt_state nf_conntrack iptable_filter ip_tables x_tables 
snd_hda_codec_hdmi snd_hda_codec_realtek wacom snd_hda_intel snd_hda_codec 
snd_pcm snd_timer snd soundcore e1000e i2c_i801 snd_page_alloc virtio_net 
virtio virtio_ring kvm_intel kvm [last unloaded: scsi_wait_scan]
[  235.705788] Pid: 3203, comm: btrfs-cleaner Not tainted 
3.1.0-rc3-00023-g14c62e7 #4
[  235.705789] Call Trace:
[  235.705794]  [] warn_slowpath_common+0x7f/0xc0
[  235.705796]  [] warn_slowpath_null+0x1a/0x20
[  235.705797]  [] iput+0x1ed/0x210
[  235.705800]  [] btrfs_iget+0x1cc/0x460
[  235.705804]  [] ? _raw_spin_unlock+0x16/0x40
[  235.705807]  [] btrfs_run_defrag_inodes+0x12e/0x210
[  235.705809]  [] cleaner_kthread+0x18b/0x1a0
[  235.705811]  [] ? _raw_spin_unlock_irqrestore+0x2a/0x60
[  235.705813]  [] ? transaction_kthread+0x290/0x290
[  235.705815]  [] kthread+0x8c/0xa0
[  235.705818]  [] kernel_thread_helper+0x4/0x10
[  235.705820]  [] ? finish_task_switch+0x4e/0xe0
[  235.705822]  [] ? _raw_spin_unlock_irq+0x1c/0x40
[  235.705824]  [] ? retint_restore_args+0xe/0xe
[  235.705826]  [] ? kthread_worker_fn+0x190/0x190
[  235.705828]  [] ? gs_change+0xb/0xb
[  235.705829] ---[ end trace 522a9ae26c065662 ]---
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: BTRFS and power loss ~= corruption?

2011-08-25 Thread Martin Steigerwald
Am Donnerstag, 25. August 2011 schrieb Anand Jain:
>   anyways, solutions containing disk-write-cache disabled and SSD
>   is quite popular now a days. And in terms of random synchronous
>   write performance they are awesome.

There are SSD with capacitors such as Intel SSD 320. These according to 
the vendor should write out all remaining writes that made it to the disk 
cache should a power loss occur.

I did not have any issues with BTRFS on / with a ThinkPad T520 and an 
Intel SSD 320. /home is still Ext4, as I want a fsck first. Thats with 
Linux 3.0.0-2 amd64 debian package.

That said I also do not have any issues with BTRFS on a ThinkPad T23 for / 
and /home. But then the machine has an hibernate-to-disk-and-resume uptime 
of almost 120 days, so it didn´t see a power loss for a long time. Thats 
still with 2.6.38.4.

-- 
Martin 'Helios' Steigerwald - http://www.Lichtvoll.de
GPG: 03B0 0D6C 0040 0710 4AFA  B82F 991B EAAC A599 84C7
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Honest timeline for btrfsck

2011-08-25 Thread Michael Cronenworth

Chris Mason wrote:

There are always bugs to fix, and I have #1 and #2 mostly ready.  I had
hoped to get #1 out the door before I left on vacation and I still might
post it tonight.


Hi Chris,

Is there an update on this? I don't see any new code for 
btrfs-progs-unstable, but I might be looking in the wrong place.


Will this fsck tool be able to handle problems such as:
"parent transid verify failed on # wanted # found #" ?

Thanks,
Michael
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] Btrfs: fix error handling of create_reloc_root() and btrfs_copy_root()

2011-08-25 Thread Tsutomu Itoh
If kmalloc() or btrfs_copy_root() in create_reloc_root() failed,
error is returned to the caller instead of BUG_ON().
and, error handling of btrfs_copy_root() is corrected properly.

Signed-off-by: Tsutomu Itoh 
---
 fs/btrfs/ctree.c   |5 -
 fs/btrfs/relocation.c  |   16 +---
 fs/btrfs/transaction.c |3 ++-
 3 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 011cab3..837ff58 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -265,8 +265,11 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
else
ret = btrfs_inc_ref(trans, root, cow, 0);
 
-   if (ret)
+   if (ret) {
+   btrfs_tree_unlock(cow);
+   free_extent_buffer(cow);
return ret;
+   }
 
btrfs_mark_buffer_dirty(cow);
*cow_ret = cow;
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 59bb176..57b6a57 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1268,7 +1268,8 @@ static struct btrfs_root *create_reloc_root(struct 
btrfs_trans_handle *trans,
int ret;
 
root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
-   BUG_ON(!root_item);
+   if (!root_item)
+   return ERR_PTR(-ENOMEM);
 
root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
root_key.type = BTRFS_ROOT_ITEM_KEY;
@@ -1278,7 +1279,8 @@ static struct btrfs_root *create_reloc_root(struct 
btrfs_trans_handle *trans,
/* called by btrfs_init_reloc_root */
ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
  BTRFS_TREE_RELOC_OBJECTID);
-   BUG_ON(ret);
+   if (ret)
+   goto out;
 
btrfs_set_root_last_snapshot(&root->root_item,
 trans->transid - 1);
@@ -1292,7 +1294,8 @@ static struct btrfs_root *create_reloc_root(struct 
btrfs_trans_handle *trans,
 */
ret = btrfs_copy_root(trans, root, root->node, &eb,
  BTRFS_TREE_RELOC_OBJECTID);
-   BUG_ON(ret);
+   if (ret)
+   goto out;
}
 
memcpy(root_item, &root->root_item, sizeof(*root_item));
@@ -1320,6 +1323,10 @@ static struct btrfs_root *create_reloc_root(struct 
btrfs_trans_handle *trans,
BUG_ON(IS_ERR(reloc_root));
reloc_root->last_trans = trans->transid;
return reloc_root;
+
+out:
+   kfree(root_item);
+   return ERR_PTR(ret);
 }
 
 /*
@@ -1348,6 +1355,8 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle 
*trans,
clear_rsv = 1;
}
reloc_root = create_reloc_root(trans, root, root->root_key.objectid);
+   BUG_ON(IS_ERR(reloc_root));
+
if (clear_rsv)
trans->block_rsv = NULL;
 
@@ -4403,6 +4412,7 @@ void btrfs_reloc_post_snapshot(struct btrfs_trans_handle 
*trans,
new_root = pending->snap;
reloc_root = create_reloc_root(trans, root->reloc_root,
   new_root->root_key.objectid);
+   BUG_ON(IS_ERR(reloc_root));
 
__add_reloc_root(reloc_root);
new_root->reloc_root = reloc_root;
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 7dc36fa..bdcc320 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -972,9 +972,10 @@ static noinline int create_pending_snapshot(struct 
btrfs_trans_handle *trans,
btrfs_cow_block(trans, root, old, NULL, 0, &old);
btrfs_set_lock_blocking(old);
 
-   btrfs_copy_root(trans, root, old, &tmp, objectid);
+   ret = btrfs_copy_root(trans, root, old, &tmp, objectid);
btrfs_tree_unlock(old);
free_extent_buffer(old);
+   BUG_ON(ret);
 
btrfs_set_root_node(new_root_item, tmp);
/* record when the snapshot was created in key.offset */

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC] Btrfs design defect in extent backref ?

2011-08-25 Thread Yan, Zheng
On Thu, Aug 25, 2011 at 3:56 PM, Li Zefan  wrote:
> We have an offset in file extent to indicate its position in the
> corresponding extent item in extent tree. We also have an offset in
> extent item to indicate the start position of the file extent that
> uses this item.
>
> The math is:
>
>    extent_item.extent_data_ref.offset = file_pos - file_extent.extent_offset.
>
>                       e1
> disk extents:    |--|
>                 ^
>                 |                  e2
>                 |          |-|
>                 |          |   ^
>                 |          |   |
>                 v          v   |
> file extents:    |- f1 -|- f2 -|
>
> So it looks like e2.offset points to f1 not f2. Therefore given an extent 
> item,
> we'll have to search through all the file extents in an inode to find the
> relative file extent in the worst case, which makes this field somewhat 
> useless.
>

The reason for this is reducing number of file extent backref itmes.
we don't have to search all the file extents because the file extent size
is limited and we have extent_data_ref.count.

> What makes things worse is the above fomula can make the offset a negative
> value (cast to u64):
>
>    # touch /mnt/dst
>    # clone_range -s 8192 -d 4096 /mnt/src /mnt/dst
>    # umount /mnt
>    # btrfs-debug-tree /dev/sda7
>    ...
>        item 2 key (12582912 EXTENT_ITEM 49152) itemoff 3865 itemsize 82
>                extent refs 2 gen 8 flags 1
>                extent data backref root 5 objectid 258 offset 
> 18446744073709543424 count 1
>                extent data backref root 5 objectid 257 offset 0 count 1
>    ...
>
> and relocation won't work in this case:
>
>    # mount /dev/sda7 /mnt
>    # rm /mnt/src
>    # sync
>    # btrfs fi bal /mnt
>    (kernel warning !!)
>    (hung up !!)
>
> I don't see the necessity or benefit of the substraction in the fomula,
> and I think the correct one is:
>
>    extent_item.extent_data_ref.offset = file_pos
>
> (As a side effect thereafter we don't need extent_data_ref.count)
>
> That's what this patch does. Unfornately it is an incompatable change
> in disk format.
>
> So I think we have to live with this defect, just fix relocation for
> the negative offset case ?

I prefer fixing relocation.

>
> Signed-off-by: Li Zefan 
> ---
>  fs/btrfs/extent-tree.c |    1 -
>  fs/btrfs/file.c        |   11 +--
>  fs/btrfs/inode.c       |    7 +++
>  fs/btrfs/ioctl.c       |    2 +-
>  fs/btrfs/relocation.c  |    1 -
>  5 files changed, 9 insertions(+), 13 deletions(-)
>
> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
> index f5be06a..3924e03 100644
> --- a/fs/btrfs/extent-tree.c
> +++ b/fs/btrfs/extent-tree.c
> @@ -2578,7 +2578,6 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle 
> *trans,
>                                continue;
>
>                        num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
> -                       key.offset -= btrfs_file_extent_offset(buf, fi);
>                        ret = process_func(trans, root, bytenr, num_bytes,
>                                           parent, ref_root, key.objectid,
>                                           key.offset);
> diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
> index e7872e4..7f65a27 100644
> --- a/fs/btrfs/file.c
> +++ b/fs/btrfs/file.c
> @@ -678,7 +678,7 @@ next_slot:
>                                                disk_bytenr, num_bytes, 0,
>                                                root->root_key.objectid,
>                                                new_key.objectid,
> -                                               start - extent_offset);
> +                                               start);
>                                BUG_ON(ret);
>                                *hint_byte = disk_bytenr;
>                        }
> @@ -752,8 +752,7 @@ next_slot:
>                                ret = btrfs_free_extent(trans, root,
>                                                disk_bytenr, num_bytes, 0,
>                                                root->root_key.objectid,
> -                                               key.objectid, key.offset -
> -                                               extent_offset);
> +                                               key.objectid, key.offset);
>                                BUG_ON(ret);
>                                inode_sub_bytes(inode,
>                                                extent_end - key.offset);
> @@ -962,7 +961,7 @@ again:
>
>                ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0,
>                                           root->root_key.objectid,
> -                                          ino, orig_offset);
> +                                          ino, split);
>                BUG_ON(ret);
>
>                if (split == start) {
> @@ -989,7 +988,7 @@ again:
>                del_nr++;
>                ret = btrfs_f

[RFC] Btrfs design defect in extent backref ?

2011-08-25 Thread Li Zefan
We have an offset in file extent to indicate its position in the
corresponding extent item in extent tree. We also have an offset in
extent item to indicate the start position of the file extent that
uses this item.

The math is:

extent_item.extent_data_ref.offset = file_pos - file_extent.extent_offset.

   e1
disk extents:|--|
 ^ 
 |  e2
 |  |-|
 |  |   ^
 |  |   |
 v  v   |
file extents:|- f1 -|- f2 -|

So it looks like e2.offset points to f1 not f2. Therefore given an extent item,
we'll have to search through all the file extents in an inode to find the
relative file extent in the worst case, which makes this field somewhat useless.

What makes things worse is the above fomula can make the offset a negative
value (cast to u64):

# touch /mnt/dst
# clone_range -s 8192 -d 4096 /mnt/src /mnt/dst
# umount /mnt
# btrfs-debug-tree /dev/sda7
...
item 2 key (12582912 EXTENT_ITEM 49152) itemoff 3865 itemsize 82
extent refs 2 gen 8 flags 1
extent data backref root 5 objectid 258 offset 
18446744073709543424 count 1
extent data backref root 5 objectid 257 offset 0 count 1
...

and relocation won't work in this case:

# mount /dev/sda7 /mnt
# rm /mnt/src
# sync
# btrfs fi bal /mnt
(kernel warning !!)
(hung up !!)

I don't see the necessity or benefit of the substraction in the fomula,
and I think the correct one is:

extent_item.extent_data_ref.offset = file_pos

(As a side effect thereafter we don't need extent_data_ref.count)

That's what this patch does. Unfornately it is an incompatable change
in disk format.

So I think we have to live with this defect, just fix relocation for
the negative offset case ?

Signed-off-by: Li Zefan 
---
 fs/btrfs/extent-tree.c |1 -
 fs/btrfs/file.c|   11 +--
 fs/btrfs/inode.c   |7 +++
 fs/btrfs/ioctl.c   |2 +-
 fs/btrfs/relocation.c  |1 -
 5 files changed, 9 insertions(+), 13 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index f5be06a..3924e03 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2578,7 +2578,6 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle 
*trans,
continue;
 
num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
-   key.offset -= btrfs_file_extent_offset(buf, fi);
ret = process_func(trans, root, bytenr, num_bytes,
   parent, ref_root, key.objectid,
   key.offset);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index e7872e4..7f65a27 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -678,7 +678,7 @@ next_slot:
disk_bytenr, num_bytes, 0,
root->root_key.objectid,
new_key.objectid,
-   start - extent_offset);
+   start);
BUG_ON(ret);
*hint_byte = disk_bytenr;
}
@@ -752,8 +752,7 @@ next_slot:
ret = btrfs_free_extent(trans, root,
disk_bytenr, num_bytes, 0,
root->root_key.objectid,
-   key.objectid, key.offset -
-   extent_offset);
+   key.objectid, key.offset);
BUG_ON(ret);
inode_sub_bytes(inode,
extent_end - key.offset);
@@ -962,7 +961,7 @@ again:
 
ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0,
   root->root_key.objectid,
-  ino, orig_offset);
+  ino, split);
BUG_ON(ret);
 
if (split == start) {
@@ -989,7 +988,7 @@ again:
del_nr++;
ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
0, root->root_key.objectid,
-   ino, orig_offset);
+   ino, other_start);
BUG_ON(ret);
}
other_start = 0;
@@ -1006,7 +1005,7 @@ again:
del_nr++;
ret = btrfs_free_extent(trans, root, bytenr, num_bytes,