RE: [PATCH] btrfs/ioctl.c: quiet sparse warnings

2011-09-23 Thread H Hartley Sweeten
On Friday, September 23, 2011 11:16 AM, Joe Perches wrote:
> On Fri, 2011-09-23 at 11:07 -0700, H Hartley Sweeten wrote:
>> Quiet the following sparse warnings:
> []
>> diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
> []
>> @@ -2705,7 +2705,7 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, 
>> void __user *arg)
>>  up_read(&info->groups_sem);
>>  }
>>  
>> -user_dest = (struct btrfs_ioctl_space_info *)
>> +user_dest = (struct btrfs_ioctl_space_info __user *)
>> (arg + sizeof(struct btrfs_ioctl_space_args));
>
>   user_dest = arg;
>   user_dest++;
>
> ?

That produces a new sparse warning:

fs/btrfs/ioctl.c: In function ‘btrfs_ioctl_space_info’:
fs/btrfs/ioctl.c:2708: warning: ‘user_dest’ may be used uninitialized in this 
function

I guess user_dest could be set at the start of the function.  This would
also remove the cast of arg in the first copy_from_user.

Something like this:

--

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 970977a..9e7e5dc 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -2599,13 +2599,13 @@ static void get_block_group_info(struct list_head 
*groups_list,
}
 }
 
-long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
+static long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
 {
struct btrfs_ioctl_space_args space_args;
struct btrfs_ioctl_space_info space;
struct btrfs_ioctl_space_info *dest;
struct btrfs_ioctl_space_info *dest_orig;
-   struct btrfs_ioctl_space_info __user *user_dest;
+   struct btrfs_ioctl_space_info __user *user_dest = arg;
struct btrfs_space_info *info;
u64 types[] = {BTRFS_BLOCK_GROUP_DATA,
   BTRFS_BLOCK_GROUP_SYSTEM,
@@ -2617,9 +2617,7 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void 
__user *arg)
u64 slot_count = 0;
int i, c;
 
-   if (copy_from_user(&space_args,
-  (struct btrfs_ioctl_space_args __user *)arg,
-  sizeof(space_args)))
+   if (copy_from_user(&space_args, user_dest, sizeof(space_args)))
return -EFAULT;
 
for (i = 0; i < num_types; i++) {
@@ -2705,8 +2703,7 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void 
__user *arg)
up_read(&info->groups_sem);
}
 
-   user_dest = (struct btrfs_ioctl_space_info *)
-   (arg + sizeof(struct btrfs_ioctl_space_args));
+   user_dest++;
 
if (copy_to_user(user_dest, dest_orig, alloc_size))
ret = -EFAULT;



Re: [PATCH] btrfs/extent_map.c: quiet sparse noise

2011-09-23 Thread Valdis . Kletnieks
On Thu, 22 Sep 2011 18:45:56 PDT, H Hartley Sweeten said:
> Quiet the sparse noise:
> 
> warning: symbol '__lookup_extent_mapping' was not declared. Should it be 
> static?

Patch itself is correct, changelog is bad.  You're not quieting sparse noise,
you're making a declaration static because it doesn't need external visibility.


pgp680pMsCuLa.pgp
Description: PGP signature


[PATCH] btrfs/delayed-inode.c: quiet sparse noise

2011-09-23 Thread H Hartley Sweeten
Quiet the following sparse noise:

warning: symbol 'btrfs_first_delayed_node' was not declared. Should it be 
static?
warning: symbol 'btrfs_next_delayed_node' was not declared. Should it be static?
warning: symbol 'btrfs_first_prepared_delayed_node' was not declared. Should it 
be static?
warning: symbol 'btrfs_alloc_delayed_item' was not declared. Should it be 
static?
warning: symbol '__btrfs_lookup_delayed_insertion_item' was not declared. 
Should it be static?
warning: symbol '__btrfs_lookup_delayed_deletion_item' was not declared. Should 
it be static?
warning: symbol '__btrfs_search_delayed_insertion_item' was not declared. 
Should it be static?
warning: symbol '__btrfs_search_delayed_deletion_item' was not declared. Should 
it be static?
warning: symbol '__btrfs_first_delayed_insertion_item' was not declared. Should 
it be static?
warning: symbol '__btrfs_first_delayed_deletion_item' was not declared. Should 
it be static?
warning: symbol '__btrfs_next_delayed_item' was not declared. Should it be 
static?

The functions __btrfs_lookup_delayed_deletion_item,
__btrfs_search_delayed_insertion_item, and __btrfs_search_delayed_deletion_item
are not currently being used by the code. They may be eventually, so make them
inline to for to now make sparse happy and allow the linker to discard them.

Signed-off-by: H Hartley Sweeten 
Cc: Chris Mason 

---

diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index b52c672..5d3c397 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -199,8 +199,8 @@ static void btrfs_dequeue_delayed_node(struct 
btrfs_delayed_root *root,
spin_unlock(&root->lock);
 }
 
-struct btrfs_delayed_node *btrfs_first_delayed_node(
-   struct btrfs_delayed_root *delayed_root)
+static struct btrfs_delayed_node *
+btrfs_first_delayed_node(struct btrfs_delayed_root *delayed_root)
 {
struct list_head *p;
struct btrfs_delayed_node *node = NULL;
@@ -218,8 +218,8 @@ out:
return node;
 }
 
-struct btrfs_delayed_node *btrfs_next_delayed_node(
-   struct btrfs_delayed_node *node)
+static struct btrfs_delayed_node *
+btrfs_next_delayed_node(struct btrfs_delayed_node *node)
 {
struct btrfs_delayed_root *delayed_root;
struct list_head *p;
@@ -279,8 +279,8 @@ static inline void btrfs_release_delayed_node(struct 
btrfs_delayed_node *node)
__btrfs_release_delayed_node(node, 0);
 }
 
-struct btrfs_delayed_node *btrfs_first_prepared_delayed_node(
-   struct btrfs_delayed_root *delayed_root)
+static struct btrfs_delayed_node *
+btrfs_first_prepared_delayed_node(struct btrfs_delayed_root *delayed_root)
 {
struct list_head *p;
struct btrfs_delayed_node *node = NULL;
@@ -305,7 +305,7 @@ static inline void btrfs_release_prepared_delayed_node(
__btrfs_release_delayed_node(node, 1);
 }
 
-struct btrfs_delayed_item *btrfs_alloc_delayed_item(u32 data_len)
+static struct btrfs_delayed_item *btrfs_alloc_delayed_item(u32 data_len)
 {
struct btrfs_delayed_item *item;
item = kmalloc(sizeof(*item) + data_len, GFP_NOFS);
@@ -380,9 +380,9 @@ static struct btrfs_delayed_item 
*__btrfs_lookup_delayed_item(
return NULL;
 }
 
-struct btrfs_delayed_item *__btrfs_lookup_delayed_insertion_item(
-   struct btrfs_delayed_node *delayed_node,
-   struct btrfs_key *key)
+static struct btrfs_delayed_item *
+__btrfs_lookup_delayed_insertion_item(struct btrfs_delayed_node *delayed_node,
+ struct btrfs_key *key)
 {
struct btrfs_delayed_item *item;
 
@@ -391,9 +391,9 @@ struct btrfs_delayed_item 
*__btrfs_lookup_delayed_insertion_item(
return item;
 }
 
-struct btrfs_delayed_item *__btrfs_lookup_delayed_deletion_item(
-   struct btrfs_delayed_node *delayed_node,
-   struct btrfs_key *key)
+static inline struct btrfs_delayed_item *
+__btrfs_lookup_delayed_deletion_item(struct btrfs_delayed_node *delayed_node,
+struct btrfs_key *key)
 {
struct btrfs_delayed_item *item;
 
@@ -402,9 +402,9 @@ struct btrfs_delayed_item 
*__btrfs_lookup_delayed_deletion_item(
return item;
 }
 
-struct btrfs_delayed_item *__btrfs_search_delayed_insertion_item(
-   struct btrfs_delayed_node *delayed_node,
-   struct btrfs_key *key)
+static inline struct btrfs_delayed_item *
+__btrfs_search_delayed_insertion_item(struct btrfs_delayed_node *delayed_node,
+ struct btrfs_key *key)
 {
struct btrfs_delayed_item *item, *next;
 
@@ -416,9 +416,9 @@ struct btrfs_delayed_item 
*__btrfs_search_delayed_insertion_item(
return item;
 }
 
-struct btrfs_delayed_item *__btrfs_search_

[PATCH] btrfs/compression.c: quiet sparse noise

2011-09-23 Thread H Hartley Sweeten
Quiet the sparse warning:

warning: symbol 'btrfs_compress_op' was not declared. Should it be static?

Signed-off-by: H Hartley Sweeten 
Cc: Chris Mason  

---

diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 8ec5d86..3e8c133 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -726,7 +726,7 @@ static int comp_num_workspace[BTRFS_COMPRESS_TYPES];
 static atomic_t comp_alloc_workspace[BTRFS_COMPRESS_TYPES];
 static wait_queue_head_t comp_workspace_wait[BTRFS_COMPRESS_TYPES];
 
-struct btrfs_compress_op *btrfs_compress_op[] = {
+static struct btrfs_compress_op *btrfs_compress_op[] = {
&btrfs_zlib_compress,
&btrfs_lzo_compress,
 };
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] btrfs/free-space-cache.c: quiet sparse warnings

2011-09-23 Thread H Hartley Sweeten
Quiet the following sparse warnings:

warning: symbol '__create_free_space_inode' was not declared. Should it be 
static?
warning: symbol '__load_free_space_cache' was not declared. Should it be static?
warning: symbol '__btrfs_write_out_cache' was not declared. Should it be static?
warning: symbol '__btrfs_remove_free_space_cache_locked' was not declared. 
Should it be static?
warning: context imbalance in 'insert_into_bitmap' - unexpected unlock

Signed-off-by: H Hartley Sweeten 
Cc: Chris Mason  (maintainer:BTRFS FILE SYSTEM)

---

diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 6a265b9..bdd5c0e 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -113,7 +113,7 @@ struct inode *lookup_free_space_inode(struct btrfs_root 
*root,
return inode;
 }
 
-int __create_free_space_inode(struct btrfs_root *root,
+static int __create_free_space_inode(struct btrfs_root *root,
  struct btrfs_trans_handle *trans,
  struct btrfs_path *path, u64 ino, u64 offset)
 {
@@ -238,7 +238,7 @@ static int readahead_cache(struct inode *inode)
return 0;
 }
 
-int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
+static int __load_free_space_cache(struct btrfs_root *root, struct inode 
*inode,
struct btrfs_free_space_ctl *ctl,
struct btrfs_path *path, u64 offset)
 {
@@ -526,7 +526,7 @@ out:
return ret;
 }
 
-int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
+static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode 
*inode,
struct btrfs_free_space_ctl *ctl,
struct btrfs_block_group_cache *block_group,
struct btrfs_trans_handle *trans,
@@ -1433,6 +1433,8 @@ static struct btrfs_free_space_op free_space_op = {
 
 static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl,
  struct btrfs_free_space *info)
+   __releases(&ctl->tree_lock)
+   __acquires(&ctl->tree_lock)
 {
struct btrfs_free_space *bitmap_info;
struct btrfs_block_group_cache *block_group = NULL;
@@ -1842,7 +1844,8 @@ out:
return 0;
 }
 
-void __btrfs_remove_free_space_cache_locked(struct btrfs_free_space_ctl *ctl)
+static void
+__btrfs_remove_free_space_cache_locked(struct btrfs_free_space_ctl *ctl)
 {
struct btrfs_free_space *info;
struct rb_node *node;
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] btrfs/ioctl.c: quiet sparse warnings

2011-09-23 Thread Joe Perches
On Fri, 2011-09-23 at 11:07 -0700, H Hartley Sweeten wrote:
> Quiet the following sparse warnings:
[]
> diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
[]
> @@ -2705,7 +2705,7 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, 
> void __user *arg)
>   up_read(&info->groups_sem);
>   }
>  
> - user_dest = (struct btrfs_ioctl_space_info *)
> + user_dest = (struct btrfs_ioctl_space_info __user *)
> (arg + sizeof(struct btrfs_ioctl_space_args));

user_dest = arg;
user_dest++;

?


--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] btrfs/ioctl.c: quiet sparse warnings

2011-09-23 Thread H Hartley Sweeten
Quiet the following sparse warnings:

warning: cast removes address space of expression
warning: incorrect type in assignment (different address spaces)
   expected struct btrfs_ioctl_space_info [noderef] *user_dest
   got struct btrfs_ioctl_space_info *
warning: symbol 'btrfs_ioctl_space_info' was not declared. Should it be static?

Signed-off-by: H Hartley Sweeten 
Cc: Chris Mason 

---

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 970977a..a001af4 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -2599,7 +2599,7 @@ static void get_block_group_info(struct list_head 
*groups_list,
}
 }
 
-long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
+static long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
 {
struct btrfs_ioctl_space_args space_args;
struct btrfs_ioctl_space_info space;
@@ -2705,7 +2705,7 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void 
__user *arg)
up_read(&info->groups_sem);
}
 
-   user_dest = (struct btrfs_ioctl_space_info *)
+   user_dest = (struct btrfs_ioctl_space_info __user *)
(arg + sizeof(struct btrfs_ioctl_space_args));
 
if (copy_to_user(user_dest, dest_orig, alloc_size))
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Btrfs bug with g38867a2 and a question

2011-09-23 Thread Mathieu Chouquet-Stringer
On Fri, Sep 23, 2011 at 11:34:40AM -0400, Josef Bacik wrote:
> Yeah this is a different problem that's fixed upstream, so reboot into
> your other newer kernel with -o clear_cache.  Thanks,

Ok I'm back in business now, thanks...

Now I'll try to understand why my pc sometimes hangs doing write IOs...
-- 
Mathieu Chouquet-Stringer math...@csetco.com
The sun itself sees not till heaven clears.
 -- William Shakespeare --
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Btrfs bug with g38867a2 and a question

2011-09-23 Thread Josef Bacik
On 09/23/2011 11:31 AM, Mathieu Chouquet-Stringer wrote:
> On Fri, Sep 23, 2011 at 10:49:22AM -0400, Josef Bacik wrote:
>> Ok I have no idea how this could happen.  Can you mount -o clear_cache
>> and see if it's just the cache that's bad?  Thanks,
> 
> Did that and got this (it's a never ending story, this is from a F16
> alpha boot cd hence stack trace could be different):
> 

Yeah this is a different problem that's fixed upstream, so reboot into
your other newer kernel with -o clear_cache.  Thanks,

Josef
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Btrfs bug with g38867a2 and a question

2011-09-23 Thread Mathieu Chouquet-Stringer
On Fri, Sep 23, 2011 at 10:49:22AM -0400, Josef Bacik wrote:
> Ok I have no idea how this could happen.  Can you mount -o clear_cache
> and see if it's just the cache that's bad?  Thanks,

Did that and got this (it's a never ending story, this is from a F16
alpha boot cd hence stack trace could be different):

[  512.455253] [ cut here ]
[  512.455464] kernel BUG at fs/btrfs/inode.c:4586!
[  512.455662] invalid opcode:  [#1] SMP 
[  512.455874] CPU 1 
[  512.455879] Modules linked in: btrfs zlib_deflate libcrc32c xts lrw gf128mul 
sha256_generic dm_crypt dm_round_robin dm_multipath linear raid10 raid456 
async_raid6_recov async_pq raid6_pq async_xor xor async_memcpy async_tx raid1 
raid0 iscsi_ibft iscsi_boot_sysfs pcspkr edd iscsi_tcp libiscsi_tcp libiscsi 
scsi_transport_iscsi cramfs arc4 firewire_ohci firewire_core sdhci_pci sdhci 
yenta_socket crc_itu_t mmc_core iwl4965 iwl_legacy mac80211 cfg80211 rfkill 
nouveau ttm drm_kms_helper drm i2c_algo_bit i2c_core mxm_wmi wmi video e1000e 
squashfs
[  512.456018] 
[  512.456018] Pid: 1349, comm: mount Not tainted 3.0.0-1.fc16.x86_64 #1 LENOVO 
6458V5C/6458V5C
[  512.456018] RIP: 0010:[]  [] 
btrfs_add_link+0x123/0x17c [btrfs]
[  512.456018] RSP: 0018:880123e09848  EFLAGS: 00010282
[  512.456018] RAX: ffef RBX: 8801194c9d78 RCX: 0046
[  512.456018] RDX: 0127 RSI: 88012aa7aaf0 RDI: 0282
[  512.456018] RBP: 880123e098b8 R08: 8801228ac000 R09: 005a
[  512.456018] R10: 880123e096d8 R11: 88013b4023c0 R12: 8801194ca610
[  512.456018] R13: 8801165d2090 R14: 000b R15: 8801181fd630
[  512.456018] FS:  7f7bb600a820() GS:88013bc0() 
knlGS:
[  512.456018] CS:  0010 DS:  ES:  CR0: 8005003b
[  512.456018] CR2: 7f7235313768 CR3: 000113f09000 CR4: 06e0
[  512.456018] DR0:  DR1:  DR2: 
[  512.456018] DR3:  DR6: 0ff0 DR7: 0400
[  512.456018] Process mount (pid: 1349, threadinfo 880123e08000, task 
88012aa7a3e0)
[  512.456018] Stack:
[  512.456018]  88010001 00018028 880123e09888 
8801194bb000
[  512.456018]   5aff8801165b3000 01001537 

[  512.456018]  1000 8801194ba1b0 8801194ca610 
880123e099d7
[  512.456018] Call Trace:
[  512.456018]  [] add_inode_ref+0x2e6/0x37c [btrfs]
[  512.456018]  [] ? read_extent_buffer+0xc3/0xe3 [btrfs]
[  512.456018]  [] replay_one_buffer+0x197/0x212 [btrfs]
[  512.456018]  [] walk_up_log_tree+0xe4/0x1aa [btrfs]
[  512.456018]  [] ? replay_one_dir_item+0xbd/0xbd [btrfs]
[  512.456018]  [] walk_log_tree+0x9e/0x19e [btrfs]
[  512.456018]  [] btrfs_recover_log_trees+0x28b/0x298 [btrfs]
[  512.456018]  [] ? replay_one_dir_item+0xbd/0xbd [btrfs]
[  512.456018]  [] open_ctree+0x11aa/0x14b8 [btrfs]
[  512.456018]  [] btrfs_mount+0x233/0x498 [btrfs]
[  512.456018]  [] ? free_pages+0x47/0x4c
[  512.456018]  [] mount_fs+0x69/0x155
[  512.456018]  [] ? __alloc_percpu+0x10/0x12
[  512.456018]  [] vfs_kern_mount+0x63/0xa0
[  512.456018]  [] do_kern_mount+0x4d/0xdf
[  512.456018]  [] do_mount+0x63c/0x69f
[  512.456018]  [] sys_mount+0x88/0xc2
[  512.456018]  [] system_call_fastpath+0x16/0x1b
[  512.456018] Code: 89 f1 4c 89 fa 4c 89 ee 48 89 44 24 08 41 8b 04 24 66 c1 
e8 0c 83 e0 0f 0f b6 80 b8 bd 39 a0 89 04 24 e8 db cf fe ff 85 c0 74 02 <0f> 0b 
45 01 f6 4d 63 f6 4c 03 b3 a0 01 00 00 4c 89 b3 a0 01 00 
[  512.456018] RIP  [] btrfs_add_link+0x123/0x17c [btrfs]
[  512.456018]  RSP 
[  512.485056] ---[ end trace cea880cef8a5d83b ]---

-- 
Mathieu Chouquet-Stringer math...@csetco.com
The sun itself sees not till heaven clears.
 -- William Shakespeare --
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Btrfs bug with g38867a2 and a question

2011-09-23 Thread Josef Bacik
On 09/23/2011 08:55 AM, Mathieu Chouquet-Stringer wrote:
> On Thu, Sep 22, 2011 at 10:32:13PM +0200, Mathieu Chouquet-Stringer wrote:
>> On Thu, Sep 22, 2011 at 09:30:07PM +0200, Mathieu Chouquet-Stringer wrote:
>>> On Thu, Sep 22, 2011 at 03:00:03PM -0400, Josef Bacik wrote:
 Oh wow sorry I sent you the completely wrong patch, I wish I had caught
 your reply earlier.  Can you run with this patch, which is the one I
 meant to give you :).  Thanks,
>>>
>>> No worries, I've applied your patch (seems your thunderbird mangled line
>>> returns) and I'm rebooting...
>>
>> There:
>> http://mathieu.csetco.com/btrfs/btrfs-screenshots.tar
> 
> Just realised I truncated the first line!  It should have been:
> 
> Couldn't find in bitmap offset=20737413120, bytes=57344, search=20765691904, 
> search_bytes=65536, ret=0

Ok I have no idea how this could happen.  Can you mount -o clear_cache
and see if it's just the cache that's bad?  Thanks,

Josef
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[patch 2/2/4] mm: try to distribute dirty pages fairly across zones

2011-09-23 Thread Johannes Weiner
The maximum number of dirty pages that exist in the system at any time
is determined by a number of pages considered dirtyable and a
user-configured percentage of those, or an absolute number in bytes.

This number of dirtyable pages is the sum of memory provided by all
the zones in the system minus their lowmem reserves and high
watermarks, so that the system can retain a healthy number of free
pages without having to reclaim dirty pages.

But there is a flaw in that we have a zoned page allocator which does
not care about the global state but rather the state of individual
memory zones.  And right now there is nothing that prevents one zone
from filling up with dirty pages while other zones are spared, which
frequently leads to situations where kswapd, in order to restore the
watermark of free pages, does indeed have to write pages from that
zone's LRU list.  This can interfere so badly with IO from the flusher
threads that major filesystems (btrfs, xfs, ext4) mostly ignore write
requests from reclaim already, taking away the VM's only possibility
to keep such a zone balanced, aside from hoping the flushers will soon
clean pages from that zone.

Enter per-zone dirty limits.  They are to a zone's dirtyable memory
what the global limit is to the global amount of dirtyable memory, and
try to make sure that no single zone receives more than its fair share
of the globally allowed dirty pages in the first place.  As the number
of pages considered dirtyable exclude the zones' lowmem reserves and
high watermarks, the maximum number of dirty pages in a zone is such
that the zone can always be balanced without requiring page cleaning.

As this is a placement decision in the page allocator and pages are
dirtied only after the allocation, this patch allows allocators to
pass __GFP_WRITE when they know in advance that the page will be
written to and become dirty soon.  The page allocator will then
attempt to allocate from the first zone of the zonelist - which on
NUMA is determined by the task's NUMA memory policy - that has not
exceeded its dirty limit.

At first glance, it would appear that the diversion to lower zones can
increase pressure on them, but this is not the case.  With a full high
zone, allocations will be diverted to lower zones eventually, so it is
more of a shift in timing of the lower zone allocations.  Workloads
that previously could fit their dirty pages completely in the higher
zone may be forced to allocate from lower zones, but the amount of
pages that 'spill over' are limited themselves by the lower zones'
dirty constraints, and thus unlikely to become a problem.

For now, the problem of unfair dirty page distribution remains for
NUMA configurations where the zones allowed for allocation are in sum
not big enough to trigger the global dirty limits, wake up the flusher
threads and remedy the situation.  Because of this, an allocation that
could not succeed on any of the considered zones is allowed to ignore
the dirty limits before going into direct reclaim or even failing the
allocation, until a future patch changes the global dirty throttling
and flusher thread activation so that they take individual zone states
into account.

Signed-off-by: Johannes Weiner 
---
 include/linux/gfp.h   |4 ++-
 include/linux/writeback.h |1 +
 mm/page-writeback.c   |   83 +
 mm/page_alloc.c   |   29 
 4 files changed, 116 insertions(+), 1 deletions(-)

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 3a76faf..50efc7e 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -36,6 +36,7 @@ struct vm_area_struct;
 #endif
 #define ___GFP_NO_KSWAPD   0x40u
 #define ___GFP_OTHER_NODE  0x80u
+#define ___GFP_WRITE   0x100u
 
 /*
  * GFP bitmasks..
@@ -85,6 +86,7 @@ struct vm_area_struct;
 
 #define __GFP_NO_KSWAPD((__force gfp_t)___GFP_NO_KSWAPD)
 #define __GFP_OTHER_NODE ((__force gfp_t)___GFP_OTHER_NODE) /* On behalf of 
other node */
+#define __GFP_WRITE((__force gfp_t)___GFP_WRITE)   /* Allocator intends to 
dirty page */
 
 /*
  * This may seem redundant, but it's a way of annotating false positives vs.
@@ -92,7 +94,7 @@ struct vm_area_struct;
  */
 #define __GFP_NOTRACK_FALSE_POSITIVE (__GFP_NOTRACK)
 
-#define __GFP_BITS_SHIFT 24/* Room for N __GFP_FOO bits */
+#define __GFP_BITS_SHIFT 25/* Room for N __GFP_FOO bits */
 #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
 
 /* This equals 0, but use constants in case they ever change */
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index a5f495f..c96ee0c 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -104,6 +104,7 @@ void laptop_mode_timer_fn(unsigned long data);
 static inline void laptop_sync_completion(void) { }
 #endif
 void throttle_vm_writeout(gfp_t gfp_mask);
+bool zone_dirty_ok(struct zone *zone);
 
 extern unsigned long global_dirty_limit;
 
d

[patch 1/2/4] mm: writeback: cleanups in preparation for per-zone dirty limits

2011-09-23 Thread Johannes Weiner
On Thu, Sep 22, 2011 at 10:52:42AM +0200, Johannes Weiner wrote:
> On Wed, Sep 21, 2011 at 04:02:26PM -0700, Andrew Morton wrote:
> > Should we rename determine_dirtyable_memory() to
> > global_dirtyable_memory(), to get some sense of its relationship with
> > zone_dirtyable_memory()?
> 
> Sounds good.

---

The next patch will introduce per-zone dirty limiting functions in
addition to the traditional global dirty limiting.

Rename determine_dirtyable_memory() to global_dirtyable_memory()
before adding the zone-specific version, and fix up its documentation.

Also, move the functions to determine the dirtyable memory and the
function to calculate the dirty limit based on that together so that
their relationship is more apparent and that they can be commented on
as a group.

Signed-off-by: Johannes Weiner 
---
 mm/page-writeback.c |   92 +-
 1 files changed, 46 insertions(+), 46 deletions(-)

diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index c8acf8a..78604a6 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -186,12 +186,12 @@ static unsigned long highmem_dirtyable_memory(unsigned 
long total)
 }
 
 /**
- * determine_dirtyable_memory - amount of memory that may be used
+ * global_dirtyable_memory - number of globally dirtyable pages
  *
- * Returns the numebr of pages that can currently be freed and used
- * by the kernel for direct mappings.
+ * Returns the global number of pages potentially available for dirty
+ * page cache.  This is the base value for the global dirty limits.
  */
-static unsigned long determine_dirtyable_memory(void)
+static unsigned long global_dirtyable_memory(void)
 {
unsigned long x;
 
@@ -205,6 +205,47 @@ static unsigned long determine_dirtyable_memory(void)
 }
 
 /*
+ * global_dirty_limits - background-writeback and dirty-throttling thresholds
+ *
+ * Calculate the dirty thresholds based on sysctl parameters
+ * - vm.dirty_background_ratio  or  vm.dirty_background_bytes
+ * - vm.dirty_ratio or  vm.dirty_bytes
+ * The dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and
+ * real-time tasks.
+ */
+void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
+{
+   unsigned long background;
+   unsigned long dirty;
+   unsigned long uninitialized_var(available_memory);
+   struct task_struct *tsk;
+
+   if (!vm_dirty_bytes || !dirty_background_bytes)
+   available_memory = global_dirtyable_memory();
+
+   if (vm_dirty_bytes)
+   dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
+   else
+   dirty = (vm_dirty_ratio * available_memory) / 100;
+
+   if (dirty_background_bytes)
+   background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE);
+   else
+   background = (dirty_background_ratio * available_memory) / 100;
+
+   if (background >= dirty)
+   background = dirty / 2;
+   tsk = current;
+   if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) {
+   background += background / 4;
+   dirty += dirty / 4;
+   }
+   *pbackground = background;
+   *pdirty = dirty;
+   trace_global_dirty_state(background, dirty);
+}
+
+/*
  * couple the period to the dirty_ratio:
  *
  *   period/2 ~ roundup_pow_of_two(dirty limit)
@@ -216,7 +257,7 @@ static int calc_period_shift(void)
if (vm_dirty_bytes)
dirty_total = vm_dirty_bytes / PAGE_SIZE;
else
-   dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) /
+   dirty_total = (vm_dirty_ratio * global_dirtyable_memory()) /
100;
return 2 + ilog2(dirty_total - 1);
 }
@@ -416,47 +457,6 @@ static unsigned long hard_dirty_limit(unsigned long thresh)
return max(thresh, global_dirty_limit);
 }
 
-/*
- * global_dirty_limits - background-writeback and dirty-throttling thresholds
- *
- * Calculate the dirty thresholds based on sysctl parameters
- * - vm.dirty_background_ratio  or  vm.dirty_background_bytes
- * - vm.dirty_ratio or  vm.dirty_bytes
- * The dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and
- * real-time tasks.
- */
-void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
-{
-   unsigned long background;
-   unsigned long dirty;
-   unsigned long uninitialized_var(available_memory);
-   struct task_struct *tsk;
-
-   if (!vm_dirty_bytes || !dirty_background_bytes)
-   available_memory = determine_dirtyable_memory();
-
-   if (vm_dirty_bytes)
-   dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
-   else
-   dirty = (vm_dirty_ratio * available_memory) / 100;
-
-   if (dirty_background_bytes)
-   background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE);
-   else
-   background = (dirty_background_ratio * available_mem

[patch 1/4 v2] mm: exclude reserved pages from dirtyable memory

2011-09-23 Thread Johannes Weiner
The amount of dirtyable pages should not include the full number of
free pages: there is a number of reserved pages that the page
allocator and kswapd always try to keep free.

The closer (reclaimable pages - dirty pages) is to the number of
reserved pages, the more likely it becomes for reclaim to run into
dirty pages:

   +--+ ---
   |   anon   |  |
   +--+  |
   |  |  |
   |  |  -- dirty limit new-- flusher new
   |   file   |  | |
   |  |  | |
   |  |  -- dirty limit old-- flusher old
   |  ||
   +--+   --- reclaim
   | reserved |
   +--+
   |  kernel  |
   +--+

This patch introduces a per-zone dirty reserve that takes both the
lowmem reserve as well as the high watermark of the zone into account,
and a global sum of those per-zone values that is subtracted from the
global amount of dirtyable pages.  The lowmem reserve is unavailable
to page cache allocations and kswapd tries to keep the high watermark
free.  We don't want to end up in a situation where reclaim has to
clean pages in order to balance zones.

Not treating reserved pages as dirtyable on a global level is only a
conceptual fix.  In reality, dirty pages are not distributed equally
across zones and reclaim runs into dirty pages on a regular basis.

But it is important to get this right before tackling the problem on a
per-zone level, where the distance between reclaim and the dirty pages
is mostly much smaller in absolute numbers.

Signed-off-by: Johannes Weiner 
---
 include/linux/mmzone.h |6 ++
 include/linux/swap.h   |1 +
 mm/page-writeback.c|6 --
 mm/page_alloc.c|   19 +++
 4 files changed, 30 insertions(+), 2 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 1ed4116..37a61e7 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -317,6 +317,12 @@ struct zone {
 */
unsigned long   lowmem_reserve[MAX_NR_ZONES];
 
+   /*
+* This is a per-zone reserve of pages that should not be
+* considered dirtyable memory.
+*/
+   unsigned long   dirty_balance_reserve;
+
 #ifdef CONFIG_NUMA
int node;
/*
diff --git a/include/linux/swap.h b/include/linux/swap.h
index b156e80..9021453 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -209,6 +209,7 @@ struct swap_list_t {
 /* linux/mm/page_alloc.c */
 extern unsigned long totalram_pages;
 extern unsigned long totalreserve_pages;
+extern unsigned long dirty_balance_reserve;
 extern unsigned int nr_free_buffer_pages(void);
 extern unsigned int nr_free_pagecache_pages(void);
 
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index da6d263..c8acf8a 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -170,7 +170,8 @@ static unsigned long highmem_dirtyable_memory(unsigned long 
total)
&NODE_DATA(node)->node_zones[ZONE_HIGHMEM];
 
x += zone_page_state(z, NR_FREE_PAGES) +
-zone_reclaimable_pages(z);
+zone_reclaimable_pages(z) -
+zone->dirty_balance_reserve;
}
/*
 * Make sure that the number of highmem pages is never larger
@@ -194,7 +195,8 @@ static unsigned long determine_dirtyable_memory(void)
 {
unsigned long x;
 
-   x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages();
+   x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages() -
+   dirty_balance_reserve;
 
if (!vm_highmem_is_dirtyable)
x -= highmem_dirtyable_memory(x);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 1dba05e..f8cba89 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -96,6 +96,14 @@ EXPORT_SYMBOL(node_states);
 
 unsigned long totalram_pages __read_mostly;
 unsigned long totalreserve_pages __read_mostly;
+/*
+ * When calculating the number of globally allowed dirty pages, there
+ * is a certain number of per-zone reserves that should not be
+ * considered dirtyable memory.  This is the sum of those reserves
+ * over all existing zones that contribute dirtyable memory.
+ */
+unsigned long dirty_balance_reserve __read_mostly;
+
 int percpu_pagelist_fraction;
 gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
 
@@ -5076,8 +5084,19 @@ static void calculate_totalreserve_pages(void)
if (max > zone->present_pages)
max = zone->present_pages;
reserve_pages += max;
+   /*
+* Lowmem reserves are not available to
+* GFP_HIGHUSER page cache allocations and
+* kswapd tries to balance zones to their high
+* watermark.  As a result, neither

Re: [PATCH] xfstests: add new getdents test

2011-09-23 Thread Alex Elder
On Fri, 2011-09-23 at 16:03 +0300, Grazvydas Ignotas wrote:
> On Thu, Sep 22, 2011 at 11:18 PM, Alex Elder  wrote:
> > On Mon, 2011-09-12 at 03:19 +0300, Grazvydas Ignotas wrote:
> >> The test checks if no duplicate d_off values are returned and that
> >> those values are seekable to the right inodes.
> >>
> >> Signed-off-by: Grazvydas Ignotas 
> >
> > I have two minor comments on the C program below,
> > but even if you don't want to address them this
> > looks good.
> >
> > Reviewed-by: Alex Elder 
> >

. . .

> >> +static uint64_t d_off_histoty[HISTORY_LEN];
> >> +static uint64_t d_ino_histoty[HISTORY_LEN];
> >
> > Is "histoty" intentional or a typo?
> 
> whoops, it's a typo. I might send a patch for this later.

I will change this for you before committing.

> >
> >> +int
> >> +main(int argc, char *argv[])

. . .

> >> + nread = syscall(SYS_getdents64, fd, buf, BUF_SIZE);
> >
> > You could just use sizeof (struct linux_dirent_64) rather than
> > BUF_SIZE here.  I suppose it doesn't hurt but there's no real
> > sense in reading more than the one you're going to look at.
> 
> I'm not sure if reading partial entry is allowed, manpage says it may
> fail with EINVAL if buffer size is too small..

I will keep this as-is.  I was wrong about the size you should
pass, but my point still stands...  For a single entry you need
to pass a buffer big enough to hold a linux_dirent64 structure
*plus* the maximum-sized name that entry could contain.  That
appears to be 256 bytes, though POSIX allows more.

Anyway, if you or anyone else wants to try to change this
in the future to read in a smaller amount here, that's fine
but it's not necessary now.

Bottom line, I'll make the change I mentioned above and
will commit the result.

-Alex


--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Honest timeline for btrfsck

2011-09-23 Thread Erik Jensen
Chris,

Now that you're back from vacation, I was wondering if you would be
able to provide a revised estimate on how long this will take. Also,
of the four parts, which will be necessary to correct a 'parent
transid verify failed' error?

Thank you for your time,
--Erik

On Thu, Aug 18, 2011 at 1:50 PM, Chris Mason  wrote:
> Excerpts from Yalonda Gishtaka's message of 2011-08-17 21:09:37 -0400:
>> Chris Mason  oracle.com> writes:
>>
>> >
>> > Aside from making sure the kernel code is stable, btrfsck is all I'm
>> > working on right now.  I do expect a release in the next two weeks that
>> > can recover your data (and many others).
>> >
>> > Thanks,
>> > Chris
>> > --
>>
>>
>> Chris,
>>
>> We're all on the edge of our seats.  Can you provide an updated ETA on the
>> release of the first functional btrfsck tool?  No pressure or anything ;)
>
> Hi everyone,
>
> I've been working non-stop on this.  Currently fsck has four parts:
>
> 1) mount -o recovery mode.  I've posted smaller forms of these patches
> in the past that bypass log tree replay.  The new versions have code to
> create stub roots for trees that can't be read (like the extent
> allocation tree) and will allow the mount to proceed.
>
> 2) fsck that scans for older roots.  This takes advantage of older
> copies of metadata to look for consistent tree roots on disk.  The
> downside is that it is currently very slow.  I'm trying to speed it up
> by limiting the search to only the metadata block groups and a few other
> tricks.
>
> 3) fsck that fixes the extent allocation tree and the chunk tree.  This
> is where I've been spending most of my time.  The problem is that it
> tends to recover some filesystems and badly break others.  While I'm
> fixing up the corner cases that work poorly, I'm adding an undo log to
> the fsck code so that you can get the FS back into its original state if
> you don't like the result of the fsck.
>
> 4) The rest of the corruptions can be dealt with fairly well from the
> kernel.  I have a series of patches to make the extent allocation tree
> less strict about reference counts and other rules, basically allowing
> the FS to limp along instead of crash.
>
> These four things together are basically my minimal set of features
> required for fedora and our own internal projects at Oracle to start
> treating us as production filesystem.
>
> There are always bugs to fix, and I have #1 and #2 mostly ready.  I had
> hoped to get #1 out the door before I left on vacation and I still might
> post it tonight.
>
> -chris
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] xfstests: add new getdents test

2011-09-23 Thread Grazvydas Ignotas
On Thu, Sep 22, 2011 at 11:18 PM, Alex Elder  wrote:
> On Mon, 2011-09-12 at 03:19 +0300, Grazvydas Ignotas wrote:
>> The test checks if no duplicate d_off values are returned and that
>> those values are seekable to the right inodes.
>>
>> Signed-off-by: Grazvydas Ignotas 
>
> I have two minor comments on the C program below,
> but even if you don't want to address them this
> looks good.
>
> Reviewed-by: Alex Elder 
>
> . . .
>
>> +#include 
>> +
>> +struct linux_dirent64 {
>> +     uint64_t        d_ino;
>> +     uint64_t        d_off;
>> +     unsigned short  d_reclen;
>> +     unsigned char   d_type;
>> +     char            d_name[0];
>> +};
>> +
>> +#define BUF_SIZE 4096
>> +#define HISTORY_LEN 1024
>> +
>> +static uint64_t d_off_histoty[HISTORY_LEN];
>> +static uint64_t d_ino_histoty[HISTORY_LEN];
>
> Is "histoty" intentional or a typo?

whoops, it's a typo. I might send a patch for this later.

>
>> +int
>> +main(int argc, char *argv[])
>> +{
>> +     int fd, nread;
>> +     char buf[BUF_SIZE];
>
> . . .
>
>> +
>> +     /* check if seek works correctly */
>> +     d = (struct linux_dirent64 *)buf;
>> +     for (i = total - 1; i >= 0; i--)
>> +     {
>> +             lret = lseek(fd, i > 0 ? d_off_histoty[i - 1] : 0, SEEK_SET);
>> +             if (lret == -1) {
>> +                     perror("lseek");
>> +                     exit(EXIT_FAILURE);
>> +             }
>> +
>> +             nread = syscall(SYS_getdents64, fd, buf, BUF_SIZE);
>
> You could just use sizeof (struct linux_dirent_64) rather than
> BUF_SIZE here.  I suppose it doesn't hurt but there's no real
> sense in reading more than the one you're going to look at.

I'm not sure if reading partial entry is allowed, manpage says it may
fail with EINVAL if buffer size is too small..


-- 
Gražvydas
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Btrfs bug with g38867a2 and a question

2011-09-23 Thread Mathieu Chouquet-Stringer
On Thu, Sep 22, 2011 at 10:32:13PM +0200, Mathieu Chouquet-Stringer wrote:
> On Thu, Sep 22, 2011 at 09:30:07PM +0200, Mathieu Chouquet-Stringer wrote:
> > On Thu, Sep 22, 2011 at 03:00:03PM -0400, Josef Bacik wrote:
> > > Oh wow sorry I sent you the completely wrong patch, I wish I had caught
> > > your reply earlier.  Can you run with this patch, which is the one I
> > > meant to give you :).  Thanks,
> > 
> > No worries, I've applied your patch (seems your thunderbird mangled line
> > returns) and I'm rebooting...
> 
> There:
> http://mathieu.csetco.com/btrfs/btrfs-screenshots.tar

Just realised I truncated the first line!  It should have been:

Couldn't find in bitmap offset=20737413120, bytes=57344, search=20765691904, 
search_bytes=65536, ret=0
-- 
Mathieu Chouquet-Stringer math...@csetco.com
The sun itself sees not till heaven clears.
 -- William Shakespeare --
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html