[-mm PATCH 2/9] Memory controller containers setup (v6)

2007-08-17 Thread Balbir Singh

Changelog
1. use depends instead of select in init/Kconfig
2. Port to v11
3. Clean up the usage of names (container files) for v11

Setup the memory container and add basic hooks and controls to integrate
and work with the container.

Signed-off-by: <[EMAIL PROTECTED]>
---

 include/linux/container_subsys.h |6 +
 include/linux/memcontrol.h   |   21 ++
 init/Kconfig |7 ++
 mm/Makefile  |1 
 mm/memcontrol.c  |  127 +++
 5 files changed, 162 insertions(+)

diff -puN include/linux/container_subsys.h~mem-control-setup 
include/linux/container_subsys.h
--- linux-2.6.23-rc2-mm2/include/linux/container_subsys.h~mem-control-setup 
2007-08-17 13:14:19.0 +0530
+++ linux-2.6.23-rc2-mm2-balbir/include/linux/container_subsys.h
2007-08-17 13:14:19.0 +0530
@@ -30,3 +30,9 @@ SUBSYS(ns)
 #endif
 
 /* */
+
+#ifdef CONFIG_CONTAINER_MEM_CONT
+SUBSYS(mem_container)
+#endif
+
+/* */
diff -puN /dev/null include/linux/memcontrol.h
--- /dev/null   2007-06-01 20:42:04.0 +0530
+++ linux-2.6.23-rc2-mm2-balbir/include/linux/memcontrol.h  2007-08-17 
13:14:19.0 +0530
@@ -0,0 +1,21 @@
+/* memcontrol.h - Memory Controller
+ *
+ * Copyright IBM Corporation, 2007
+ * Author Balbir Singh <[EMAIL PROTECTED]>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef _LINUX_MEMCONTROL_H
+#define _LINUX_MEMCONTROL_H
+
+#endif /* _LINUX_MEMCONTROL_H */
+
diff -puN init/Kconfig~mem-control-setup init/Kconfig
--- linux-2.6.23-rc2-mm2/init/Kconfig~mem-control-setup 2007-08-17 
13:14:19.0 +0530
+++ linux-2.6.23-rc2-mm2-balbir/init/Kconfig2007-08-17 13:14:19.0 
+0530
@@ -364,6 +364,13 @@ config SYSFS_DEPRECATED
  If you are using a distro that was released in 2006 or later,
  it should be safe to say N here.
 
+config CONTAINER_MEM_CONT
+   bool "Memory controller for containers"
+   depends on CONTAINERS && RESOURCE_COUNTERS
+   help
+ Provides a memory controller that manages both page cache and
+ RSS memory.
+
 config PROC_PID_CPUSET
bool "Include legacy /proc/<pid>/cpuset file"
depends on CPUSETS
diff -puN mm/Makefile~mem-control-setup mm/Makefile
--- linux-2.6.23-rc2-mm2/mm/Makefile~mem-control-setup  2007-08-17 
13:14:19.0 +0530
+++ linux-2.6.23-rc2-mm2-balbir/mm/Makefile 2007-08-17 13:14:19.0 
+0530
@@ -31,4 +31,5 @@ obj-$(CONFIG_FS_XIP) += filemap_xip.o
 obj-$(CONFIG_MIGRATION) += migrate.o
 obj-$(CONFIG_SMP) += allocpercpu.o
 obj-$(CONFIG_QUICKLIST) += quicklist.o
+obj-$(CONFIG_CONTAINER_MEM_CONT) += memcontrol.o
 
diff -puN /dev/null mm/memcontrol.c
--- /dev/null   2007-06-01 20:42:04.0 +0530
+++ linux-2.6.23-rc2-mm2-balbir/mm/memcontrol.c 2007-08-17 13:14:19.0 
+0530
@@ -0,0 +1,127 @@
+/* memcontrol.c - Memory Controller
+ *
+ * Copyright IBM Corporation, 2007
+ * Author Balbir Singh <[EMAIL PROTECTED]>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/res_counter.h>
+#include <linux/memcontrol.h>
+#include <linux/container.h>
+
+struct container_subsys mem_container_subsys;
+
+/*
+ * The memory controller data structure. The memory controller controls both
+ * page cache and RSS per container. We would eventually like to provide
+ * statistics based on the statistics developed by Rik Van Riel for clock-pro,
+ * to help the administrator determine what knobs to tune.
+ *
+ * TODO: Add a water mark for the memory controller. Reclaim will begin when
+ * we hit the water mark.
+ */
+struct mem_container {
+   struct container_subsys_state css;
+   /*
+* the counter to account for memory usage
+*/
+   struct res_counter res;
+};
+
+/*
+ * A page_container page is associated with every page descriptor. The
+ * page_container helps us identify information about the container
+ */
+struct page_container {
+   struct list_head lru;   /* per container LRU list */
+   struct page *page;
+   struct mem_container *mem_container;
+};
+
+
+static inline
+st

[-mm PATCH 0/9] Memory controller introduction (v6)

2007-08-17 Thread Balbir Singh
Here's version 6 of the memory controller (against 2.6.23-rc2-mm2).

The tests that were run have been included in the _Test Results section_ below.

Changelog since version 5

1. Ported to 2.6.23-rc2-mm2
2. Added a css_put() in the case of race between allocation of page_containers
   (YAMAMOTO Takashi)

Changelog since version 4

1. Renamed meta_page to page_container (Nick Piggin)
2. Moved locking from page flags to last bit of the page_container pointer
   (Nick Piggin)
3. Fixed a rare race in mem_container_isolate_pages (YAMAMOTO Takashi)

Changelog since version 3

1. Ported to v11 of the containers patchset (2.6.23-rc1-mm1). Paul Menage
   helped immensely with a detailed review of v3
2. Reclaim is retried to allow reclaim of pages coming in as a result
   of mapped pages reclaim (swap cache growing as a result of RSS reclaim)
3. page_referenced() is now container aware. During container reclaim,
   references from other containers do not prevent a page from being
   reclaimed from a non-referencing container
4. Fixed a possible race condition spotted by YAMAMOTO Takashi

Changelog since version 2

1. Improved error handling in mm/memory.c (spotted by YAMAMOTO Takashi)
2. Test results included
3. try_to_free_mem_container_pages() bug fix (sc->may_writepage is now
   set to !laptop_mode)

Changelog since version 1

1. Fixed some compile time errors (in mm/migrate.c from Vaidyanathan S)
2. Fixed a panic seen when LIST_DEBUG is enabled
3. Added a mechanism to control whether we track page cache or both
   page cache and mapped pages (as requested by Pavel)
4. Dave Hansen provided detail review comments on the code.

This patchset implements another version of the memory controller. These
patches have been through a big churn, the first set of patches were posted
last year and earlier this year at
http://lkml.org/lkml/2007/2/19/10

This patchset draws from the patches listed above and from some of the
contents of the patches posted by Vaidyanathan for page cache control.
http://lkml.org/lkml/2007/6/20/92

At OLS, the resource management BOF, it was discussed that we need to manage
RSS and unmapped page cache together. This patchset is a step towards that

TODO's

1. Add memory controller water mark support. Reclaim on high water mark
2. Add support for shrinking on limit change
3. Add per zone per container LRU lists (this is being actively worked
   on by Pavel Emelianov)
4. Figure out a better CLUI for the controller
5. Add better statistics
6. Explore using read_uint64() as recommended by Paul Menage
   (NOTE: read_ulong() would also be nice to have)

In case you have been using/testing the RSS controller, you'll find that
this controller works slower than the RSS controller. The reason being
that both swap cache and page cache is accounted for, so pages do go
out to swap upon reclaim (they cannot live in the swap cache).

Any test output, feedback, comments, suggestions are welcome! I am committed
to fixing any bugs and improving the performance of the memory controller.
Do not hesitate to send any fixes, request for fixes that is required.

Using the patches

1. Enable Memory controller configuration
2. Compile and boot the new kernel
3. mount -t container container -o memory /container
   will mount the memory controller to the /container mount point
4. mkdir /container/a
5. echo $$ > /container/a/tasks (add tasks to the new container)
6. echo -n <limit in pages> > /container/a/memory.limit
   example
   echo -n 204800 > /container/a/memory.limit, sets the limit to 800 MB
   on a system with 4K page size
7. run tasks, see the memory controller work
8. Report results, provide feedback
9. Develop/use new patches and go to step 1

Test Results

Results for version 3 of the patch were posted at
http://lwn.net/Articles/242554/

The code was also tested on a power box with regular machine usage scenarios,
the config disabled and with a stress suite that touched all the memory
in the system and was limited in a container.

Dhaval ran several tests on v6 and gave his thumbs up to the controller
(a hard to achieve goal :-) ).

Run kernbench stress

Three simultaneously and with one inside a container of 800 MB.


Kernbench results running within the container of 800 MB


Thu Aug 16 22:34:59 IST 2007
2.6.23-rc2-mm2-mem-v6
Average Half load -j 4 Run (std deviation):
Elapsed Time 466.548 (47.6014)
User Time 876.598 (10.5273)
System Time 223.136 (1.29247)
Percent CPU 237.2 (23.2744)
Context Switches 146351 (6539.91)
Sleeps 174003 (5031.94)

Average Optimal load -j 32 Run (std deviation):
Elapsed Time 423.496 (60.625)
User Time 897.285 (23.0391)
System Time 228.836 (6.11205)
Percent CPU 257.1 (40.9022)
Context Switches 262134 (123397)
Sleeps 270815 (103597)


Kernbench results running within the default container
--

Thu Aug 16 22:34:33 IST 2007
2.6.23-rc2-mm2-mem-v6
Average Half load -

[-mm PATCH 4/9] Memory controller memory accounting (v6)

2007-08-17 Thread Balbir Singh


Changelog for v6
1. Do a css_put() in the case of a race in allocating page containers
   (YAMAMOTO Takashi)

Changelog for v5
1. Rename meta_page to page_container
2. Remove PG_metapage and use the lower bit of the page_container pointer
   for locking

Changelog for v3

1. Fix a probable leak with meta_page's (pointed out by Paul Menage)
2. Introduce a wrapper around mem_container_uncharge for uncharging pages
   mem_container_uncharge_page()

Changelog

1. Improved error handling, uncharge on errors and check to see if we are
   leaking pages (review by YAMAMOTO Takashi)

Add the accounting hooks. The accounting is carried out for RSS and Page
Cache (unmapped) pages. There is now a common limit and accounting for both.
The RSS accounting is accounted at page_add_*_rmap() and page_remove_rmap()
time. Page cache is accounted at add_to_page_cache(),
__remove_from_page_cache(). Swap cache is also accounted for.

Each page's page_container is protected with the last bit of the page_container
pointer, this makes handling of race conditions involving simultaneous
mappings of a page easier. A reference count is kept in the page_container to
deal with cases where a page might be unmapped from the RSS of all tasks, but
still lives in the page cache.

Credits go to Vaidyanathan Srinivasan for helping with reference counting work
of the page container. Almost all of the page cache accounting code has help
from Vaidyanathan Srinivasan.

Signed-off-by: Vaidyanathan Srinivasan <[EMAIL PROTECTED]>
Signed-off-by: <[EMAIL PROTECTED]>
---

 include/linux/memcontrol.h |   20 +
 mm/filemap.c   |   12 ++-
 mm/memcontrol.c|  166 -
 mm/memory.c|   43 ++-
 mm/migrate.c   |6 +
 mm/page_alloc.c|3 
 mm/rmap.c  |   17 
 mm/swap_state.c|   12 ++-
 mm/swapfile.c  |   41 ++-
 9 files changed, 293 insertions(+), 27 deletions(-)

diff -puN include/linux/memcontrol.h~mem-control-accounting 
include/linux/memcontrol.h
--- linux-2.6.23-rc2-mm2/include/linux/memcontrol.h~mem-control-accounting  
2007-08-17 13:14:19.0 +0530
+++ linux-2.6.23-rc2-mm2-balbir/include/linux/memcontrol.h  2007-08-17 
13:14:19.0 +0530
@@ -30,6 +30,13 @@ extern void mm_free_container(struct mm_
 extern void page_assign_page_container(struct page *page,
struct page_container *pc);
 extern struct page_container *page_get_page_container(struct page *page);
+extern int mem_container_charge(struct page *page, struct mm_struct *mm);
+extern void mem_container_uncharge(struct page_container *pc);
+
+static inline void mem_container_uncharge_page(struct page *page)
+{
+   mem_container_uncharge(page_get_page_container(page));
+}
 
 #else /* CONFIG_CONTAINER_MEM_CONT */
 static inline void mm_init_container(struct mm_struct *mm,
@@ -51,6 +58,19 @@ static inline struct page_container *pag
return NULL;
 }
 
+static inline int mem_container_charge(struct page *page, struct mm_struct *mm)
+{
+   return 0;
+}
+
+static inline void mem_container_uncharge(struct page_container *pc)
+{
+}
+
+static inline void mem_container_uncharge_page(struct page *page)
+{
+}
+
 #endif /* CONFIG_CONTAINER_MEM_CONT */
 
 #endif /* _LINUX_MEMCONTROL_H */
diff -puN include/linux/page-flags.h~mem-control-accounting 
include/linux/page-flags.h
diff -puN mm/filemap.c~mem-control-accounting mm/filemap.c
--- linux-2.6.23-rc2-mm2/mm/filemap.c~mem-control-accounting2007-08-17 
13:14:19.0 +0530
+++ linux-2.6.23-rc2-mm2-balbir/mm/filemap.c2007-08-17 13:14:19.0 
+0530
@@ -31,6 +31,7 @@
 #include 
 #include 
#include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
+#include <linux/memcontrol.h>
 #include "internal.h"
 
 /*
@@ -116,6 +117,7 @@ void __remove_from_page_cache(struct pag
 {
struct address_space *mapping = page->mapping;
 
+   mem_container_uncharge_page(page);
radix_tree_delete(&mapping->page_tree, page->index);
page->mapping = NULL;
mapping->nrpages--;
@@ -442,6 +444,11 @@ int add_to_page_cache(struct page *page,
int error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
 
if (error == 0) {
+
+   error = mem_container_charge(page, current->mm);
+   if (error)
+   goto out;
+
write_lock_irq(&mapping->tree_lock);
error = radix_tree_insert(&mapping->page_tree, offset, page);
if (!error) {
@@ -451,10 +458,13 @@ int add_to_page_cache(struct page *page,
page->index = offset;
mapping->nrpages++;
__inc_zone_page_state(page, NR_FILE_PAGES);
-   }
+   } else
+   mem_container_uncharge_page(page);
+
write_unlock_irq(&mapping->tree_lock);
radix_tree_preload_end();
   

[-mm PATCH 1/9] Memory controller resource counters (v6)

2007-08-17 Thread Balbir Singh

From: Pavel Emelianov <[EMAIL PROTECTED]>

Introduce generic structures and routines for resource accounting.

Each resource accounting container is supposed to aggregate it,
container_subsystem_state and its resource-specific members within.

Signed-off-by: Pavel Emelianov <[EMAIL PROTECTED]>
Signed-off-by: <[EMAIL PROTECTED]>
---

 include/linux/res_counter.h |  102 +
 init/Kconfig|7 ++
 kernel/Makefile |1 
 kernel/res_counter.c|  120 
 4 files changed, 230 insertions(+)

diff -puN /dev/null include/linux/res_counter.h
--- /dev/null   2007-06-01 20:42:04.0 +0530
+++ linux-2.6.23-rc2-mm2-balbir/include/linux/res_counter.h 2007-08-17 
13:14:18.0 +0530
@@ -0,0 +1,102 @@
+#ifndef __RES_COUNTER_H__
+#define __RES_COUNTER_H__
+
+/*
+ * Resource Counters
+ * Contain common data types and routines for resource accounting
+ *
+ * Copyright 2007 OpenVZ SWsoft Inc
+ *
+ * Author: Pavel Emelianov <[EMAIL PROTECTED]>
+ *
+ */
+
+#include <linux/container.h>
+
+/*
+ * The core object. the container that wishes to account for some
+ * resource may include this counter into its structures and use
+ * the helpers described beyond
+ */
+
+struct res_counter {
+   /*
+* the current resource consumption level
+*/
+   unsigned long usage;
+   /*
+* the limit that usage cannot exceed
+*/
+   unsigned long limit;
+   /*
+* the number of unsuccessful attempts to consume the resource
+*/
+   unsigned long failcnt;
+   /*
+* the lock to protect all of the above.
+* the routines below consider this to be IRQ-safe
+*/
+   spinlock_t lock;
+};
+
+/*
+ * Helpers to interact with userspace
+ * res_counter_read/_write - put/get the specified fields from the
+ * res_counter struct to/from the user
+ *
+ * @counter: the counter in question
+ * @member:  the field to work with (see RES_xxx below)
+ * @buf: the buffer to operate on,...
+ * @nbytes:  its size...
+ * @pos: and the offset.
+ */
+
+ssize_t res_counter_read(struct res_counter *counter, int member,
+   const char __user *buf, size_t nbytes, loff_t *pos);
+ssize_t res_counter_write(struct res_counter *counter, int member,
+   const char __user *buf, size_t nbytes, loff_t *pos);
+
+/*
+ * the field descriptors. one for each member of res_counter
+ */
+
+enum {
+   RES_USAGE,
+   RES_LIMIT,
+   RES_FAILCNT,
+};
+
+/*
+ * helpers for accounting
+ */
+
+void res_counter_init(struct res_counter *counter);
+
+/*
+ * charge - try to consume more resource.
+ *
+ * @counter: the counter
+ * @val: the amount of the resource. each controller defines its own
+ *   units, e.g. numbers, bytes, Kbytes, etc
+ *
+ * returns 0 on success and <0 if the counter->usage will exceed the
+ * counter->limit _locked call expects the counter->lock to be taken
+ */
+
+int res_counter_charge_locked(struct res_counter *counter, unsigned long val);
+int res_counter_charge(struct res_counter *counter, unsigned long val);
+
+/*
+ * uncharge - tell that some portion of the resource is released
+ *
+ * @counter: the counter
+ * @val: the amount of the resource
+ *
+ * these calls check for usage underflow and show a warning on the console
+ * _locked call expects the counter->lock to be taken
+ */
+
+void res_counter_uncharge_locked(struct res_counter *counter, unsigned long 
val);
+void res_counter_uncharge(struct res_counter *counter, unsigned long val);
+
+#endif
diff -puN init/Kconfig~res_counters_infra init/Kconfig
--- linux-2.6.23-rc2-mm2/init/Kconfig~res_counters_infra2007-08-17 
13:14:18.0 +0530
+++ linux-2.6.23-rc2-mm2-balbir/init/Kconfig2007-08-17 13:14:18.0 
+0530
@@ -337,6 +337,13 @@ config CPUSETS
 
  Say N if unsure.
 
+config RESOURCE_COUNTERS
+   bool "Resource counters"
+   help
+ This option enables controller independent resource accounting
+  infrastructure that works with containers
+   depends on CONTAINERS
+
 config SYSFS_DEPRECATED
bool "Create deprecated sysfs files"
default y
diff -puN kernel/Makefile~res_counters_infra kernel/Makefile
--- linux-2.6.23-rc2-mm2/kernel/Makefile~res_counters_infra 2007-08-17 
13:14:18.0 +0530
+++ linux-2.6.23-rc2-mm2-balbir/kernel/Makefile 2007-08-17 13:14:18.0 
+0530
@@ -58,6 +58,7 @@ obj-$(CONFIG_RELAY) += relay.o
 obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
 obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
 obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
+obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o
 
 ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <[EMAIL PROTECTED]>, the -fno-omit-frame-pointer is
diff -puN /dev/null kernel/res_counter.c
--- /dev/null   2007-06-01 20:42:04.0 +0530
+++ linux-2.6.23-rc2-mm2-balbir/kernel/res_counter.c2007-08-

[-mm PATCH 5/9] Memory controller task migration (v6)

2007-08-17 Thread Balbir Singh

Allow tasks to migrate from one container to the other. We migrate
mm_struct's mem_container only when the thread group id migrates.


Signed-off-by: <[EMAIL PROTECTED]>
---

 mm/memcontrol.c |   35 +++
 1 file changed, 35 insertions(+)

diff -puN mm/memcontrol.c~mem-control-task-migration mm/memcontrol.c
--- linux-2.6.23-rc2-mm2/mm/memcontrol.c~mem-control-task-migration 
2007-08-17 13:14:19.0 +0530
+++ linux-2.6.23-rc2-mm2-balbir/mm/memcontrol.c 2007-08-17 13:14:19.0 
+0530
@@ -326,11 +326,46 @@ static int mem_container_populate(struct
ARRAY_SIZE(mem_container_files));
 }
 
+static void mem_container_move_task(struct container_subsys *ss,
+   struct container *cont,
+   struct container *old_cont,
+   struct task_struct *p)
+{
+   struct mm_struct *mm;
+   struct mem_container *mem, *old_mem;
+
+   mm = get_task_mm(p);
+   if (mm == NULL)
+   return;
+
+   mem = mem_container_from_cont(cont);
+   old_mem = mem_container_from_cont(old_cont);
+
+   if (mem == old_mem)
+   goto out;
+
+   /*
+* Only thread group leaders are allowed to migrate, the mm_struct is
+* in effect owned by the leader
+*/
+   if (p->tgid != p->pid)
+   goto out;
+
+   css_get(&mem->css);
+   rcu_assign_pointer(mm->mem_container, mem);
+   css_put(&old_mem->css);
+
+out:
+   mmput(mm);
+   return;
+}
+
 struct container_subsys mem_container_subsys = {
.name = "memory",
.subsys_id = mem_container_subsys_id,
.create = mem_container_create,
.destroy = mem_container_destroy,
.populate = mem_container_populate,
+   .attach = mem_container_move_task,
.early_init = 1,
 };
_

-- 
Warm Regards,
Balbir Singh
Linux Technology Center
IBM, ISTL
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures

2007-08-17 Thread Nick Piggin

Satyam Sharma wrote:


On Fri, 17 Aug 2007, Herbert Xu wrote:



On Fri, Aug 17, 2007 at 01:43:27PM +1000, Paul Mackerras wrote:

BTW, the sort of missing barriers that triggered this thread
aren't that subtle.  It'll result in a simple lock-up if the
loop condition holds upon entry.  At which point it's fairly
straightforward to find the culprit.



Not necessarily. A barrier-less buggy code such as below:

atomic_set(&v, 0);

... /* some initial code */

while (atomic_read(&v))
;

... /* code that MUST NOT be executed unless v becomes non-zero */

(where v->counter has no volatile access semantics)

could be generated by the compiler to simply *elide* or *do away* with
the loop itself, thereby making the:

"/* code that MUST NOT be executed unless v becomes non-zero */"

to be executed even when v is zero! That is subtle indeed, and causes
no hard lockups.


Then I presume you mean

while (!atomic_read(&v))
;

Which is just the same old infinite loop bug solved with cpu_relax().
These are pretty trivial to audit and fix, and also to debug, I would
think.



Granted, the above IS buggy code. But, the stated objective is to avoid
heisenbugs.


Anyway, why are you making up code snippets that are buggy in other
ways in order to support this assertion being made that lots of kernel
code supposedly depends on volatile semantics. Just reference the
actual code.



And we have driver / subsystem maintainers such as Stefan
coming up and admitting that often a lot of code that's written to use
atomic_read() does assume the read will not be elided by the compiler.


So these are broken on i386 and x86-64?

Are they definitely safe on SMP and weakly ordered machines with
just a simple compiler barrier there? Because I would not be
surprised if there are a lot of developers who don't really know
what to assume when it comes to memory ordering issues.

This is not a dig at driver writers: we still have memory ordering
problems in the VM too (and probably most of the subtle bugs in
lockless VM code are memory ordering ones). Let's not make up a
false sense of security and hope that sprinkling volatile around
will allow people to write bug-free lockless code. If a writer
can't be bothered reading API documentation and learning the Linux
memory model, they can still be productive writing safely locked
code.

--
SUSE Labs, Novell Inc.
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: how can I get an account on master.kernel.org machine

2007-08-17 Thread ye janboe
Randy, Eric,
Thanks you very much!

There is only diffs in http://ftp.arm.linux.org.uk/pub/armlinux/kernel/git-cur/.
What's these diffs base on, Linus' kernel tree?

2007/8/16, eric miao <[EMAIL PROTECTED]>:
> And if you purpose is only to download that tree, my suggestion is that
> some of the git trees are not public like linux-arm so the owner can do
> whatever he likes in the tree, and don't ever try to have access to it.
>
> for linux-arm, it will be constantly pulled by Linus, unless you really
> want the _most_ up-to-date patches, which you might find them at
>
>   http://ftp.arm.linux.org.uk/pub/armlinux/kernel/git-cur/
>
> that folder will be constantly refreshed by the maintainer.
>
> - eric
>
> Randy Dunlap wrote:
> > On Thu, 16 Aug 2007 10:58:41 +0800 ye janboe wrote:
> >
> >> I want to down linux-arm git repos, but I do not have an account on
> >> master machine.
> >
> > see http://www.kernel.org/faq/#account
> >
> > ---
> > ~Randy
> > *** Remember to use Documentation/SubmitChecklist when testing your code ***
> > -
> > To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> > the body of a message to [EMAIL PROTECTED]
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > Please read the FAQ at  http://www.tux.org/lkml/
> >
>
>
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures

2007-08-17 Thread Satyam Sharma


On Fri, 17 Aug 2007, Nick Piggin wrote:

> Satyam Sharma wrote:
> 
> > #define atomic_read_volatile(v) \
> > ({  \
> > forget((v)->counter);   \
> > ((v)->counter); \
> > })
> > 
> > where:
> 
> *vomit* :)

I wonder if this'll generate smaller and better code than _both_ the
other atomic_read_volatile() variants. Would need to build allyesconfig
on lots of diff arch's etc to test the theory though.


> Not only do I hate the keyword volatile, but the barrier is only a
> one-sided affair so its probable this is going to have slightly
> different allowed reorderings than a real volatile access.

True ...


> Also, why would you want to make these insane accessors for atomic_t
> types? Just make sure everybody knows the basics of barriers, and they
> can apply that knowledge to atomic_t and all other lockless memory
> accesses as well.

Code that looks like:

while (!atomic_read(&v)) {
...
cpu_relax_no_barrier();
forget(v.counter);

}

would be uglier. Also think about code such as:

a = atomic_read();
if (!a)
do_something();

forget();
a = atomic_read();
... /* some code that depends on value of a, obviously */

forget();
a = atomic_read();
...

So much explicit sprinkling of "forget()" looks ugly.

atomic_read_volatile()

on the other hand, looks neater. The "_volatile()" suffix makes it also
no less explicit than an explicit barrier-like macro that this primitive
is something "special", for code clarity purposes.


> > #define forget(a)   __asm__ __volatile__ ("" :"=m" (a) :"m" (a))
> 
> I like order(x) better, but it's not the most perfect name either.

forget(x) is just a stupid-placeholder-for-a-better-name. order(x) sounds
good but we could leave quibbling about function or macro names for later,
this thread is noisy as it is :-)
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH/RFC 3/4]Introduce "account modifiers" mechanism

2007-08-17 Thread Rusty Russell
On Fri, 2007-08-17 at 09:35 +0200, Laurent Vivier wrote:
> Rusty Russell wrote:
> > Hi Laurent,
> 
> Hi Rusty,
> how are your puppies ?

They're getting a little fat, actually.  Too many features ...

> - remove PATCH 3, and add in task_struct a "ktime vtime" where we accumulate
> guest time (by calling something like guest_enter() and guest_exit() from the
> virtualization engine), and when in account_system_time() we have cputime >
> vtime we substrate vtime from cputime and add vtime to user time and guest 
> time.
> But doing like this we freeze in kernel/sched.c the link between system time,
> user time and guest time (i.e. system time = system time - vtime, user time =
> user time + vtime and guest time = guest time + vtime).

Actually, I think we can set a per-cpu "in_guest" flag for the scheduler
code, which then knows to add the tick to the guest time.  That seems
the simplest possible solution.

lguest or kvm would set the flag before running the guest (which is done
with preempt disabled or using preemption hooks), and reset it
afterwards.

Thoughts?
Rusty.

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures

2007-08-17 Thread Satyam Sharma


On Fri, 17 Aug 2007, Paul Mackerras wrote:

> Herbert Xu writes:
> 
> > On Fri, Aug 17, 2007 at 03:09:57PM +1000, Paul Mackerras wrote:
> > > Herbert Xu writes:
> > > 
> > > > Can you find an actual atomic_read code snippet there that is
> > > > broken without the volatile modifier?
> > > 
> > > There are some in arch-specific code, for example line 1073 of
> > > arch/mips/kernel/smtc.c.  On mips, cpu_relax() is just barrier(), so
> > > the empty loop body is ok provided that atomic_read actually does the
> > > load each time around the loop.
> > 
> > A barrier() is all you need to force the compiler to reread
> > the value.
> > 
> > The people advocating volatile in this thread are talking
> > about code that doesn't use barrier()/cpu_relax().
> 
> Did you look at it?  Here it is:
> 
>   /* Someone else is initializing in parallel - let 'em finish */
>   while (atomic_read(&idle_hook_initialized) < 1000)
>   ;


Honestly, this thread is suffering from HUGE communication gaps.

What Herbert (obviously) meant there was that "this loop could've
been okay _without_ using volatile-semantics-atomic_read() also, if
only it used cpu_relax()".

That does work, because cpu_relax() is _at least_ barrier() on all
archs (on some it also emits some arch-dependent "pause" kind of
instruction).

Now, saying that "MIPS does not have such an instruction so I won't
use cpu_relax() for arch-dependent-busy-while-loops in arch/mips/"
sounds like a wrong argument, because: tomorrow, such arch's _may_
introduce such an instruction, so naturally, at that time we'd
change cpu_relax() appropriately (in reality, we would actually
*re-define* cpu_relax() and ensure that the correct version gets
pulled in depending on whether the callsite code is legacy or only
for the newer such CPUs of said arch, whatever), but loops such as
this would remain un-changed, because they never used cpu_relax()!

OTOH an argument that said the following would've made a stronger case:

"I don't want to use cpu_relax() because that's a full memory
clobber barrier() and I have loop-invariants / other variables
around in that code that I *don't* want the compiler to forget
just because it used cpu_relax(), and hence I will not use
cpu_relax() but instead make my atomic_read() itself have
"volatility" semantics. Not just that, but I will introduce a
cpu_relax_no_barrier() on MIPS, that would be a no-op #define
for now, but which may not be so forever, and continue to use
that in such busy loops."

In general, please read the thread-summary I've tried to do at:
http://lkml.org/lkml/2007/8/17/25
Feel free to continue / comment / correct stuff from there, there's
too much confusion and circular-arguments happening on this thread
otherwise.

[ I might've made an incorrect statement there about
  "volatile" w.r.t. cache on non-x86 archs, I think. ]


Satyam
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH]: proc: export a processes resource limits via proc/

2007-08-17 Thread Valdis . Kletnieks
On Thu, 16 Aug 2007 08:35:38 EDT, Neil Horman said:
> Hey again-
>   Andrew requested that I repost this cleanly, after running the patch
> through checkpatch.  As requested here it is with the changelog.
> 
> Currently, there exists no method for a process to query the resource
> limits of another process.  They can be inferred via some mechanisms but they
> cannot be explicitly determined.  Given that this information can be useful 
to
> know during the debugging of an application, I've written this patch which
> exports all of a process's limits via /proc/<pid>/limits.  
> 
> Tested successfully by myself on x86 on top of 2.6.23-rc2-mm1.

I had only one comment the first time around, and Neil addressed it.

I've also tested on x86_64 23-rc2-mm1, and it works here too.  I saw where this
uses units of 'bytes' while the shell 'ulimit' uses 1024-byte units in some
places, but (a) this lists the units and (b) it's consistent with setrlimit().
Testing with values >4G show it's 64-bit clean as well.

One question:  Are the units milliseconds or seconds here:

+   [RLIMIT_CPU] = {"Max cpu time", "ms"},

Other than that, feel free to stick either/both of these on:

Reviewed-By: Valdis Kletnieks <[EMAIL PROTECTED]>
Tested-By: Valdis Kletnieks <[EMAIL PROTECTED]>


pgpqnQgPc3e9C.pgp
Description: PGP signature


Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures

2007-08-17 Thread Nick Piggin

Stefan Richter wrote:

Nick Piggin wrote:


I don't know why people would assume volatile of atomics. AFAIK, most
of the documentation is pretty clear that all the atomic stuff can be
reordered etc. except for those that modify and return a value.



Which documentation is there?


Documentation/atomic_ops.txt



For driver authors, there is LDD3.  It doesn't specifically cover
effects of optimization on accesses to atomic_t.

For architecture port authors, there is Documentation/atomic_ops.txt.
Driver authors also can learn something from that document, as it
indirectly documents the atomic_t and bitops APIs.



"Semantics and Behavior of Atomic and Bitmask Operations" is
pretty direct :)

Sure, it says that it's for arch maintainers, but there is no
reason why users can't make use of it.



Prompted by this thread, I reread this document, and indeed, the
sentence "Unlike the above routines, it is required that explicit memory
barriers are performed before and after [atomic_{inc,dec}_return]"
indicates that atomic_read (one of the "above routines") is very
different from all other atomic_t accessors that return values.

This is strange.  Why is it that atomic_read stands out that way?  IMO


It is not just atomic_read of course. It is atomic_add,sub,inc,dec,set.



this API imbalance is quite unexpected by many people.  Wouldn't it be
beneficial to change the atomic_read API to behave the same like all
other atomic_t accessors that return values?


It is very consistent and well defined. Operations which both modify
the data _and_ return something are defined to have full barriers
before and after.

What do you want to add to the other atomic accessors? Full memory
barriers? Only compiler barriers? It's quite likely that if you think
some barriers will fix bugs, then there are other bugs lurking there
anyway.

Just use spinlocks if you're not absolutely clear about potential
races and memory ordering issues -- they're pretty cheap and simple.



OK, it is also different from the other accessors that return data in so
far as it doesn't modify the data.  But as driver "author", i.e. user of
the API, I can't see much use of an atomic_read that can be reordered
and, more importantly, can be optimized away by the compiler.


It will return to you an atomic snapshot of the data (loaded from
memory at some point since the last compiler barrier). All you have
to be aware of compiler barriers and the Linux SMP memory ordering
model, which should be a given if you are writing lockless code.



Sure, now
that I learned of these properties I can start to audit code and insert
barriers where I believe they are needed, but this simply means that
almost all occurrences of atomic_read will get barriers (unless there
already are implicit but more or less obvious barriers like msleep).


You might find that these places that appear to need barriers are
buggy for other reasons anyway. Can you point to some in-tree code
we can have a look at?

--
SUSE Labs, Novell Inc.
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] sysfs: don't warn on removal of a nonexistent binary file

2007-08-17 Thread Cornelia Huck
On Fri, 17 Aug 2007 10:19:24 +0900,
Tejun Heo <[EMAIL PROTECTED]> wrote:

> Alan Stern wrote:
> > This patch (as960) removes the error message and stack dump logged by
> > sysfs_remove_bin_file() when someone tries to remove a nonexistent
> > file.  The warning doesn't seem to be needed, since none of the other
> > file-, symlink-, or directory-removal routines in sysfs complain in a
> > comparable way.
> > 
> > Signed-off-by: Alan Stern <[EMAIL PROTECTED]>
> 
> Acked-by: Tejun Heo <[EMAIL PROTECTED]>

Acked-by: Cornelia Huck <[EMAIL PROTECTED]>
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] lockdep: annotate rcu_read_{,un}lock()

2007-08-17 Thread Peter Zijlstra
On Thu, 2007-08-16 at 09:01 -0700, Paul E. McKenney wrote:
> On Thu, Aug 16, 2007 at 04:25:07PM +0200, Peter Zijlstra wrote:
> > 
> > There seem to be some unbalanced rcu_read_{,un}lock() issues of late,
> > how about doing something like this:
> 
> This will break when rcu_read_lock() and rcu_read_unlock() are invoked
> from NMI/SMI handlers -- the raw_local_irq_save() in lock_acquire() will
> not mask NMIs or SMIs.
> 
> One approach would be to check for being in an NMI/SMI handler, and
> to avoid calling lock_acquire() and lock_release() in those cases.

It seems:

#define nmi_enter() do { lockdep_off(); __irq_enter(); } while (0)
#define nmi_exit()  do { __irq_exit(); lockdep_on(); } while (0)

Should make it all work out just fine. (for NMIs at least, /me fully
ignorant of the workings of SMIs)

> Another approach would be to use sparse, which has checks for
> rcu_read_lock()/rcu_read_unlock() nesting.

Yeah, but one more method can never hurt, no? :-)


-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: Need help with modules loading

2007-08-17 Thread Kay Sievers
On 8/17/07, Larry Finger <[EMAIL PROTECTED]> wrote:
> A new driver for the Broadcom BCM43xx devices has been written that uses 
> mac80211, rather than
> softmac. The newest versions of the Broadcom firmware does not support all 
> the BCM devices.
> Accordingly, a separate driver is being prepared that will use an older 
> version of the firmware and
> support these legacy devices. Unfortunately, there is not a clean separation 
> based on PCI id's;
> however, the revision level of the 802.11 wireless core can be used to 
> determine which driver should
> be used. The scheme works on most systems, but not mine and I need some help 
> to discover why.

> The 'MODALIAS=ssb:v4243id0812rev0A' line is correct for my device. In fact 
> issuing a modprobe
> "ssb:v4243id0812rev0A" command results in the loading of the module. For some 
> reason, this does not
> happen automatically.
>
> Initially, I suspected that my version of udev (103-13) was too old; however, 
> upgrading to version
> 114 did not help. My module-init-tools are V 3.2.2 and my distro is the 
> x86_64 version of openSUSE 10.2.

openSUSE 10.2 used a whitelist of buses which trigger module loading.
It's in the udev sysconfig. rules and /sbin/hwup.

The easiest is probably to add a rule for that bus:
  ACTION=="add", SUBSYSTEM=="ssb", ENV{MODALIAS}=="?*",
RUN+="/sbin/modprobe $env{MODALIAS}"

openSUSE 10.3 will call modprobe directly, the whitelist and the whole
hwup logic is removed in the meantime.

Kay
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: nanosleep() accuracy

2007-08-17 Thread Jan Engelhardt

On Aug 17 2007 11:44, GolovaSteek wrote:
>> How do you measure this?
>> If you want to have something done every 300 microseconds, you must not
>> sleep for 300 microseconds in each iteration, because you'd accumulate
>> errors. Use a periodic timer or use the current time to compute how long
>> to sleep in each iteration. Take a look how cyclictest does it.
>
>no. I just want my program to go to sleep sometimes and wake up at the correct time.

Would it be acceptable to use an optimistic strategy, like the one below?

Let's say that the following tasks happen at each time: A at 0, B at 300, C at
600, D at 900, E at 1200, F at 1500. Assume sleeping takes 500 µs.
Then B and C could be run at 500, D at 1000 and E,F at 1500.


Jan
-- 

Re: Early printk behaviour

2007-08-17 Thread Gerd Hoffmann
Mike Frysinger wrote:
>> Hmm, sort of, although I didn't think about the case of no real console
>> replacing the early console.  The intention of the patch is to have a
>> smooth handover from the boot console to the real console.  And, yes, if
>> no real console is ever registered the boot console keeps running ...
> 
> i think it also occurs in the case where real console != early console

No.  At least not if the boot console has the CON_BOOT flag set as it
should.  The last message you'll see on the boot console is the handover
printk, telling you which real console device prints the following
messages.  Whether or not the early and real consoles go to the same
physical device doesn't matter.

>> So you can either let it running and *not* mark it __init, so it can
>> keep on going without breaking.  Or you can explicitly unregister your
>> boot console at some point, maybe using a late_initcall.
> 
> wouldn't a common kernel late_initcall() be more appropriate?  if
> early console hasn't switched over (for whatever reason), then kill it

Hmm, yes, should be doable in generic code.  Check whether the current
console has CON_BOOT set and if so unregister it.

cheers,

  Gerd

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] Only initialize hvc_console if needed, cleanup Kconfig help

2007-08-17 Thread Stephen Rothwell
On Fri, 17 Aug 2007 11:25:46 +1000 Rusty Russell <[EMAIL PROTECTED]>
wrote:
>
> diff -r 0730da2377be drivers/char/Kconfig
> --- a/drivers/char/KconfigTue Aug 14 12:46:08 2007 +1000
> +++ b/drivers/char/KconfigFri Aug 17 09:05:12 2007 +1000
> @@ -568,10 +568,10 @@ config HVC_DRIVER
>  config HVC_DRIVER
>   bool
>   help
> -   Users of pSeries machines that want to utilize the hvc console 
> front-end
> -   module for their backend console driver should select this option.
> -   It will automatically be selected if one of the back-end console 
> drivers
> -   is selected.
> +   Generic "hypervisor virtual console" infrastructure for various
> +   hypervisors (pSeries, Xen, lguest).
  ^^^
You left out iSeries.  Probably better to just not enumerate them as they
may increase over time.

Otherwise looks good (though I haven't actually built/booted it).
-- 
Cheers,
Stephen Rothwell[EMAIL PROTECTED]
http://www.canb.auug.org.au/~sfr/


pgpLa3V67ZBdQ.pgp
Description: PGP signature


Re: nanosleep() accuracy

2007-08-17 Thread GolovaSteek
2007/8/17, Michal Schmidt <[EMAIL PROTECTED]>:
> GolovaSteek skrev:
> > Hello!
> > I need to use sleep with accurate timing.
> > I use 2.6.21 with the rt-preempt patch,
> > with enabled rt_preempt, dyn_ticks, and local_apic
> > But
> >
> > req.tv_nsec = 30;
> > req.tv_sec = 0;
> > nanosleep(&req,NULL)
> >
> > make pause around 310-330 microseconds.
>
> How do you measure this?
> If you want to have something done every 300 microseconds, you must not
> sleep for 300 microseconds in each iteration, because you'd accumulate
> errors. Use a periodic timer or use the current time to compute how long
> to sleep in each iteration. Take a look how cyclictest does it.

no. I just want my program to go to sleep sometimes and wake up at the correct time.

> > I tried to understand how nanosleep() works, but it does not depend on
> > jiffies or on smp_apic_timer_interrupt.
> >
> > When can accuracy be lost?
> > And how are process waked up?
> >
> >
> > GolovaSteek
>
> Don't forget the process will always have non-zero wakeup latency. It
> takes some time to process an interrupt, wakeup the process and schedule
> it to run on the CPU. 10-30 microseconds is not unreasonable.

But 2 operations can be done in 10 microseconds?
and why is there that inconsistency? Why sometimes 10 and sometimes 30?
At which points in the implementation does it happen?

GolovaSteek
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [Cluster-devel] Re: [gfs2][RFC] readdir caused ls process into D (uninterruptible) state, under testing with Samba 3.0.25

2007-08-17 Thread rae l
On 8/16/07, Steven Whitehouse <[EMAIL PROTECTED]> wrote:
> Hi,
>
> On Thu, 2007-08-16 at 16:20 +0800, 程任全 wrote:
> > It seems that gfs2 cannot work well with Samba,
> >
> > I'm using the gfs2 and the new cluster suite(cman with openais),
> >
> > 1. the testing environment is that 1 iscsi target and 2 cluster node,
> > 2. the two nodes both used iscsi initiator connect to the target,
> > 3. they're using the same physical iscsi disk,
> > 4. run LVM2 on top of the same iscsi disk,
> > 5. on the same lv (logical volume), I created a gfs2 filesystem,
> > 6. mount the gfs2 system to a same path under 2 nodes,
> > 7. start samba to shared the gfs2 mounting pointer on the 2 nodes,
> >
> > now test with windows client, when two or above clients connects to the 
> > samba,
> > everything is still normal; but when heavy writers or readers start,
> > the samba server daemon changed to D state, that's uninterruptible in
> > the kernel,
> > I wonder that's a problem of gfs2?
> >
> Which version of gfs2 are you using? GFS2 doesn't support leases which I
> know that Samba uses, however only relatively recent kernels have been
> able to report that fact via the VFS.
>
> > then I start a simple ls command on the gfs2 mouting point:
> > $ ls /mnt/gfs2
> > the ls process is also changed to D state,
> >
> > I think it's problems about readdir implementation in gfs2, and I want
> > to fix it, someone could give me some pointers?
> >
> Can you get a stack trace? echo 't' >/proc/sysrq-trigger
> That should show where Samba is getting stuck,
>
> Steve.
the stack trace of the 'D' state `ls`:

 ===
lsD F89B83F8  2200 12018  1 (NOTLB)
   f3eeadd4 0082 f6a425c0 f89b83f8 f3eead9c f6a425d4 f6f32d80 f573a93c
   0001 f89b83f3  c40a2030 c3fa9fa0 c40aaa70 c40aab7c 0e89
   b2a4b036 02e4 c40a2030 f3eeae1c  c3f85e98 f8e11e09 f8e11e0e
Call Trace:
 [] gdlm_bast+0x0/0x93 [lock_dlm]
 [] gdlm_ast+0x0/0x5 [lock_dlm]
 [] holder_wait+0x0/0x8 [gfs2]
 [] holder_wait+0x5/0x8 [gfs2]
 [] __wait_on_bit+0x2c/0x51
 [] out_of_line_wait_on_bit+0x6f/0x77
 [] holder_wait+0x0/0x8 [gfs2]
 [] wake_bit_function+0x0/0x3c
 [] wake_bit_function+0x0/0x3c
 [] wait_on_holder+0x3c/0x40 [gfs2]
 [] glock_wait_internal+0x81/0x1a3 [gfs2]
 [] gfs2_glock_nq+0x5e/0x79 [gfs2]
 [] gfs2_getattr+0x72/0xb5 [gfs2]
 [] gfs2_getattr+0x6b/0xb5 [gfs2]
 [] do_path_lookup+0x17a/0x1c3
 [] gfs2_getattr+0x0/0xb5 [gfs2]
 [] vfs_getattr+0x3e/0x51
 [] vfs_lstat_fd+0x2b/0x3d
 [] do_path_lookup+0x17a/0x1c3
 [] mntput_no_expire+0x11/0x6e
 [] sys_lstat64+0xf/0x23
 [] sys_symlinkat+0x81/0xb5
 [] sysenter_past_esp+0x5d/0x81
 [] __ipv6_addr_type+0x88/0xb8

the system is still running, so the normal 'R' and 'S' state processes
are ignored. But it turns out that it's not readdir's fault, judging from
this call trace, but rather gdlm_bast's problem in the lock_dlm module.

>
>
>


-- 
Denis Cheng
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: Fork Bombing Patch

2007-08-17 Thread Petr Tesarik
-BEGIN PGP SIGNED MESSAGE-
Hash: SHA1

Paul Jackson wrote:
> Petr wrote:
>> Please do not add comments inside functions.
> 
> I find this advice a bit odd.  I am not aware of
> any prohibition of comments inside functions.
> 
> As with comments outside functions, they should
> serve a worthwhile purpose, of course.  One might
> debate whether this particular comment added by
> Anand was sufficiently valuable to be worth
> having.
> 
> But I don't agree to a blanket prohibition on
> comments inside functions.

I'm not saying that comments inside functions should be prohibited, but
comments inside functions often lead to over-commenting. There must be a
good reason for adding such a comment. See CodingStyle, chapter 8:
Commenting:

Also, try to avoid putting comments inside a function body.

(Before somebody starts arguing with this one sentence, please also read
the rest of the chapter; it is not long and you'll understand the
author's intention better.)

Kind regards,
Petr Tesarik
-BEGIN PGP SIGNATURE-
Version: GnuPG v1.4.6 (GNU/Linux)
Comment: Using GnuPG with Mozilla - http://enigmail.mozdev.org

iD8DBQFGxVFLjpY2ODFi2ogRAoLUAJwI+cywi9iKHWlx4yora0+WJfCEawCglyrf
xyucPIB3W63sbM1dw/Nsv2Y=
=SL8f
-END PGP SIGNATURE-
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH/RFC 3/4]Introduce "account modifiers" mechanism

2007-08-17 Thread Laurent Vivier
Rusty Russell wrote:
> On Thu, 2007-08-16 at 17:58 +0200, Laurent Vivier wrote:
>> [PATCH 3/3] introduce "account modifiers" mechanism in the kernel allowing a
>> module to modify the collected accounting for a given task. This 
>> implementation
>> is based on the "preempt_notifier". "account_system_time()" and
>> "account_user_time()" can call functions registered by a module to modify the
>> cputime value.
>>
>> Signed-off-by: Laurent Vivier <[EMAIL PROTECTED]>
> 
> 
> Hi Laurent,

Hi Rusty,
how are your puppies ?
And thank you for your comment.

>   This seems a little like overkill.  Why not just add an
> "account_guest_time" which subtracts the given amount of time from
> system time (if available) and adds it to guest time?  Then kvm (and
> lguest) should just need to call this at the right times.

We can. I did something like this before.
By doing like that, I think there is a major issue: system time can be
decreasing (as we substract a value from it), and thus we can have negative
value in a tool like top. It's why I play with the cputime to add to system time
and not directly with the system time.

BUT I'm very open, my only goal is to be able to compute guest time; "how" is not
very important...

what we can do:

- keep PATCHES 1 and 2, because we need to store guest time for cpu and tasks.
It is very generic.

- remove PATCH 3, and add in task_struct a "ktime vtime" where we accumulate
guest time (by calling something like guest_enter() and guest_exit() from the
virtualization engine), and when in account_system_time() we have cputime >
vtime we subtract vtime from cputime and add vtime to user time and guest time.
But doing like this we freeze in kernel/sched.c the link between system time,
user time and guest time (i.e. system time = system time - vtime, user time =
user time + vtime and guest time = guest time + vtime).

- modify PATCH 4 to use new PATCH 3.

Do you agree ? Anybody doesn't agree ?

Laurent
-- 
- [EMAIL PROTECTED]  --
  "Software is hard" - Donald Knuth



signature.asc
Description: OpenPGP digital signature


Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures

2007-08-17 Thread Stefan Richter
Nick Piggin wrote:
> I don't know why people would assume volatile of atomics. AFAIK, most
> of the documentation is pretty clear that all the atomic stuff can be
> reordered etc. except for those that modify and return a value.

Which documentation is there?

For driver authors, there is LDD3.  It doesn't specifically cover
effects of optimization on accesses to atomic_t.

For architecture port authors, there is Documentation/atomic_ops.txt.
Driver authors also can learn something from that document, as it
indirectly documents the atomic_t and bitops APIs.

Prompted by this thread, I reread this document, and indeed, the
sentence "Unlike the above routines, it is required that explicit memory
barriers are performed before and after [atomic_{inc,dec}_return]"
indicates that atomic_read (one of the "above routines") is very
different from all other atomic_t accessors that return values.

This is strange.  Why is it that atomic_read stands out that way?  IMO
this API imbalance is quite unexpected by many people.  Wouldn't it be
beneficial to change the atomic_read API to behave the same like all
other atomic_t accessors that return values?

OK, it is also different from the other accessors that return data in so
far as it doesn't modify the data.  But as driver "author", i.e. user of
the API, I can't see much use of an atomic_read that can be reordered
and, more importantly, can be optimized away by the compiler.  Sure, now
that I learned of these properties I can start to audit code and insert
barriers where I believe they are needed, but this simply means that
almost all occurrences of atomic_read will get barriers (unless there
already are implicit but more or less obvious barriers like msleep).
-- 
Stefan Richter
-=-=-=== =--- =---=
http://arcgraph.de/sr/
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures

2007-08-17 Thread Satyam Sharma


On Thu, 16 Aug 2007, Paul E. McKenney wrote:

> On Fri, Aug 17, 2007 at 07:59:02AM +0800, Herbert Xu wrote:
> > On Thu, Aug 16, 2007 at 09:34:41AM -0700, Paul E. McKenney wrote:
> > >
> > > The compiler can also reorder non-volatile accesses.  For an example
> > > patch that cares about this, please see:
> > > 
> > >   http://lkml.org/lkml/2007/8/7/280
> > > 
> > > This patch uses an ORDERED_WRT_IRQ() in rcu_read_lock() and
> > > rcu_read_unlock() to ensure that accesses aren't reordered with respect
> > > to interrupt handlers and NMIs/SMIs running on that same CPU.
> > 
> > Good, finally we have some code to discuss (even though it's
> > not actually in the kernel yet).
> 
> There was some earlier in this thread as well.

Hmm, I never quite got what all this interrupt/NMI/SMI handling and
RCU business you mentioned earlier was all about, but now that you've
pointed to the actual code and issues with it ...


> > First of all, I think this illustrates that what you want
> > here has nothing to do with atomic ops.  The ORDERED_WRT_IRQ
> > macro occurs a lot more times in your patch than atomic
> > reads/sets.  So *assuming* that it was necessary at all,
> > then having an ordered variant of the atomic_read/atomic_set
> > ops could do just as well.
> 
> Indeed.  If I could trust atomic_read()/atomic_set() to cause the compiler
> to maintain ordering, then I could just use them instead of having to
> create an  ORDERED_WRT_IRQ().  (Or ACCESS_ONCE(), as it is called in a
> different patch.)

+#define WHATEVER(x)(*(volatile typeof(x) *)&(x))

I suppose one could want volatile access semantics for stuff that's
a bit-field too, no?

Also, this gives *zero* "re-ordering" guarantees that your code wants
as you've explained it below) -- neither w.r.t. CPU re-ordering (which
probably you don't care about) *nor* w.r.t. compiler re-ordering
(which you definitely _do_ care about).


> > However, I still don't know which atomic_read/atomic_set in
> > your patch would be broken if there were no volatile.  Could
> > you please point them out?
> 
> Suppose I tried replacing the ORDERED_WRT_IRQ() calls with
> atomic_read() and atomic_set().  Starting with __rcu_read_lock():
> 
> o If "ORDERED_WRT_IRQ(__get_cpu_var(rcu_flipctr)[idx])++"
>   was ordered by the compiler after
>   "ORDERED_WRT_IRQ(me->rcu_read_lock_nesting) = nesting + 1", then
>   suppose an NMI/SMI happened after the rcu_read_lock_nesting but
>   before the rcu_flipctr.
> 
>   Then if there was an rcu_read_lock() in the SMI/NMI
>   handler (which is perfectly legal), the nested rcu_read_lock()
>   would believe that it could take the then-clause of the
>   enclosing "if" statement.  But because the rcu_flipctr per-CPU
>   variable had not yet been incremented, an RCU updater would
>   be within its rights to assume that there were no RCU reads
>   in progress, thus possibly yanking a data structure out from
>   under the reader in the SMI/NMI function.
> 
>   Fatal outcome.  Note that only one CPU is involved here
>   because these are all either per-CPU or per-task variables.

Ok, so you don't care about CPU re-ordering. Still, I should let you know
that your ORDERED_WRT_IRQ() -- bad name, btw -- is still buggy. What you
want is a full compiler optimization barrier().

[ Your code probably works now, and emits correct code, but that's
  just because of gcc did what it did. Nothing in any standard,
  or in any documented behaviour of gcc, or anything about the real
  (or expected) semantics of "volatile" is protecting the code here. ]


> o If "ORDERED_WRT_IRQ(me->rcu_read_lock_nesting) = nesting + 1"
>   was ordered by the compiler to follow the
>   "ORDERED_WRT_IRQ(me->rcu_flipctr_idx) = idx", and an NMI/SMI
>   happened between the two, then an __rcu_read_lock() in the NMI/SMI
>   would incorrectly take the "else" clause of the enclosing "if"
>   statement.  If some other CPU flipped the rcu_ctrlblk.completed
>   in the meantime, then the __rcu_read_lock() would (correctly)
>   write the new value into rcu_flipctr_idx.
> 
>   Well and good so far.  But the problem arises in
>   __rcu_read_unlock(), which then decrements the wrong counter.
>   Depending on exactly how subsequent events played out, this could
>   result in either prematurely ending grace periods or never-ending
>   grace periods, both of which are fatal outcomes.
> 
> And the following are not needed in the current version of the
> patch, but will be in a future version that either avoids disabling
> irqs or that dispenses with the smp_read_barrier_depends() that I
> have 99% convinced myself is unneeded:
> 
> o nesting = ORDERED_WRT_IRQ(me->rcu_read_lock_nesting);
> 
> o idx = ORDERED_WRT_IRQ(rcu_ctrlblk.completed) & 0x1;
> 
> Furthermore, in that future version, irq handlers can cause the same
> mischief that SMI/NMI handlers can in this version.
> 
> Next, looking 

Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures

2007-08-17 Thread Nick Piggin

Satyam Sharma wrote:


#define atomic_read_volatile(v) \
({  \
forget((v)->counter);\
((v)->counter);  \
})

where:


*vomit* :)

Not only do I hate the keyword volatile, but the barrier is only a
one-sided affair so its probable this is going to have slightly
different allowed reorderings than a real volatile access.

Also, why would you want to make these insane accessors for atomic_t
types? Just make sure everybody knows the basics of barriers, and they
can apply that knowledge to atomic_t and all other lockless memory
accesses as well.



#define forget(a)   __asm__ __volatile__ ("" :"=m" (a) :"m" (a))


I like order(x) better, but it's not the most perfect name either.

--
SUSE Labs, Novell Inc.
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: nanosleep() accuracy

2007-08-17 Thread Michal Schmidt

GolovaSteek skrev:

Hello!
I need to use sleep with accurate timing.
I use 2.6.21 with the rt-preempt patch.
with enabled rt_preempt, dyn_ticks, and local_apic
But

req.tv_nsec = 30;
req.tv_sec = 0;
nanosleep(&req,NULL)

makes a pause of around 310-330 microseconds.


How do you measure this?
If you want to have something done every 300 microseconds, you must not 
sleep for 300 microseconds in each iteration, because you'd accumulate 
errors. Use a periodic timer or use the current time to compute how long 
to sleep in each iteration. Take a look how cyclictest does it.



I tried to understand how nanosleep() works, but it does not depend on
jiffies or on smp_apic_timer_interrupt.

When can accuracy be lost?
And how are processes woken up?


GolovaSteek


Don't forget the process will always have non-zero wakeup latency. It 
takes some time to process an interrupt, wakeup the process and schedule 
it to run on the CPU. 10-30 microseconds is not unreasonable.


Michal
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 000 of 6] A few block-layer tidy-up patches.

2007-08-17 Thread Neil Brown
On Friday August 17, [EMAIL PROTECTED] wrote:
> > > Please inspect the #block-2.6.24 branch to see the result.
> > 
> > I don't know where to look for this.  I checked
> >   http://git.kernel.org/?p=linux/kernel/git/axboe/linux-2.6-block.git
> > but they don't seem to be there.
> > ??
> 
> That's where it is, but the kernel.org mirroring is just horrible slow.
> Just checked now and it's there.

I discovered I was looking in the wrong place - not being very
familiar with git terminology.
I found them and it looks right.

I had a bit of a look at removing bio_data and ->buffer ...  the
usages outside of drivers/ide are fairly easy to deal with - I might
send a patch for that.  The drivers/ide stuff looks like a fairly
substantial rewrite is in order.
e.g.  idefloppy_packet_command_s seems to duplicate a lot of 
  fields from 'struct request', and it should probably use the request
  struct directly.

But a number of times ->buffer points to ->cmd, and there is no bio.
I guess we should use bio_map_kern to make a bio?

I'll see if I can come up with something — testing it might be
awkward.  I have an ide cdrom I can test on.  Maybe an ide disk, but
not an ide floppy :-)

NeilBrown
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 00/23] per device dirty throttling -v9

2007-08-17 Thread Peter Zijlstra
On Thu, 2007-08-16 at 14:29 -0700, Christoph Lameter wrote:
> Is there any way to make the global limits on which the dirty rate 
> calculations are based cpuset specific?
> 
> A process is part of a cpuset and that cpuset has only a fraction of 
> memory of the whole system. 
> 
> And only a fraction of that fraction can be dirtied. We do not currently 
> enforce such limits which can cause the amount of dirty pages in 
> cpusets to become excessively high. I have posted several patchsets that 
> deal with that issue. See http://lkml.org/lkml/2007/1/16/5
> 
> It seems that limiting dirty pages in cpusets may be much easier to 
> realize in the context of this patchset. The tracking of the dirty pages 
> per node is not necessary if one would calculate the maximum amount of 
> dirtyable pages in a cpuset and use that as a base, right?


Currently we do: 
  dirty = total_dirty * bdi_completions_p * task_dirty_p

As dgc pointed out before, there is the issue of bdi/task correlation,
that is, we do not track task dirty rates per bdi, so now a task that
heavily dirties on one bdi will also get penalised on the others (and
similar issues).

If we were to change it so:
  dirty = cpuset_dirty * bdi_completions_p * task_dirty_p

We get additional correlation issues: cpuset/bdi, cpuset/task.
Which could yield surprising results if some bdis are strictly per
cpuset.

The cpuset/task correlation has a strict mapping and could be solved by
keeping the vm_dirties counter per cpuset. However, this would seriously
complicate the code and I'm not sure if it would gain us much.

Anyway, things to ponder. But overall it should be quite doable.



signature.asc
Description: This is a digitally signed message part


Re: Fork Bombing Patch

2007-08-17 Thread Paul Jackson
Petr wrote:
> Please do not add comments inside functions.

I find this advice a bit odd.  I am not aware of
any prohibition of comments inside functions.

As with comments outside functions, they should
serve a worthwhile purpose, of course.  One might
debate whether this particular comment added by
Anand was sufficiently valuable to be worth
having.

But I don't agree to a blanket prohibition on
comments inside functions.

-- 
  I won't rest till it's the best ...
  Programmer, Linux Scalability
  Paul Jackson <[EMAIL PROTECTED]> 1.925.600.0401
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[Announce] RHEL5 LSPP/EAL4 Certification Testsuite has been released

2007-08-17 Thread Subrata Modak

Dear All,

The Red Hat Enterprise Linux 5 LSPP/EAL4 Certification Testsuite  has 
been released and can be found at http://ltp.sourceforge.net/ or 
http://sourceforge.net/project/showfiles.php?group_id=3382. You can find 
more details about this testsuite from 'lspp/README' contained in the 
package.


Regards & Thanks--
Subrata Modak

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 3/6] writeback: remove pages_skipped accounting in __block_write_full_page()

2007-08-17 Thread Fengguang Wu
On Mon, Aug 13, 2007 at 06:30:00PM +0800, Fengguang Wu wrote:
> > On Sun, Aug 12, 2007 at 05:11:23PM +0800, Fengguang Wu wrote:
> > > Miklos Szeredi <[EMAIL PROTECTED]> and me identified a writeback bug:
> > > Basically they are
> > > - during the dd: ~16M 
> > > - after 30s:  ~4M
> > > - after 5s:   ~4M
> > > - after 5s: ~176M
> > > 
> > > The box has 2G memory.
> > > 
> > > Question 1:
> > > How come the 5s delays? I run 4 tests in total, 2 of which have such 5s 
> > > delays.
> > 
> > pdflush runs every five seconds, so that is indicative of the inode being
> > written once for 1024 pages, and then delayed to the next pdflush run 5s 
> > later.
> > perhaps the inodes aren't moving between the lists exactly the way you
> > think they are...
> 
> Now I figured out the exact situation. When the scan of s_io finishes
> with some small inodes, nr_to_write will be positive, fooling kupdate
> to quit prematurely. But in fact the big dirty file is on s_more_io
> waiting for more io... The attached patch fixes it.
> 
> Fengguang
> ===
> 
> Subject: writeback: introduce writeback_control.more_io to indicate more io
> 
> After making dirty a 100M file, the normal behavior is to
> start the writeback for all data after 30s delays. But
> sometimes the following happens instead:
> 
>   - after 30s:~4M
>   - after 5s: ~4M
>   - after 5s: all remaining 92M
> 
> Some analysis shows that the internal io dispatch queues go like this:
> 
>   s_ios_more_io
>   -
>   1)  100M,1K 0
>   2)  1K  96M
>   3)  0   96M
> 
> 1) initial state with a 100M file and a 1K file
> 2) 4M written, nr_to_write <= 0, so write more
> 3) 1K written, nr_to_write > 0, no more writes(BUG)
> 
> nr_to_write > 0 in 3) fools the upper layer into thinking that all data have
> been written out. But the big dirty file is still sitting in s_more_io. We cannot
> simply splice s_more_io back to s_io as soon as s_io becomes empty, and let 
> the
> loop in generic_sync_sb_inodes() continue: this may starve newly expired 
> inodes
> in s_dirty.  It is also not an option to draw inodes from both s_more_io and
s_dirty, and let the loop go on: this might lead to live locks, and might also
> starve other superblocks in sync time(well kupdate may still starve some
> superblocks, that's another bug).
> 
> So we have to return when a full scan of s_io completes. So nr_to_write > 0 
> does
> not necessarily mean that "all data are written". This patch introduces a flag
> writeback_control.more_io to indicate this situation. With it the big dirty 
> file
> no longer has to wait for the next kupdate invocation 5s later.

Sorry, this patch turns out to be dangerous. It locks up my desktop
on heavy I/O: kupdate *immediately* returns to push the file in
s_more_io for writeback, but it *could* still be unable to make
progress (locks etc.). Now kupdate ends up *busy looping*.  That could
be fixed by waiting for, say, 100ms and retrying the io. Should we do
it? (Or: is a 5s interval considered too long a wait?)

> Signed-off-by: Fengguang Wu <[EMAIL PROTECTED]>
> ---
>  fs/fs-writeback.c |2 ++
>  include/linux/writeback.h |1 +
>  mm/page-writeback.c   |9 ++---
>  3 files changed, 9 insertions(+), 3 deletions(-)
> 
> --- linux-2.6.23-rc2-mm2.orig/fs/fs-writeback.c
> +++ linux-2.6.23-rc2-mm2/fs/fs-writeback.c
> @@ -560,6 +560,8 @@ int generic_sync_sb_inodes(struct super_
>   if (wbc->nr_to_write <= 0)
>   break;
>   }
> + if (!list_empty(&sb->s_more_io))
> + wbc->more_io = 1;
>   spin_unlock(&inode_lock);
>   return ret; /* Leave any unwritten inodes on s_io */
>  }
> --- linux-2.6.23-rc2-mm2.orig/include/linux/writeback.h
> +++ linux-2.6.23-rc2-mm2/include/linux/writeback.h
> @@ -61,6 +61,7 @@ struct writeback_control {
>   unsigned for_reclaim:1; /* Invoked from the page allocator */
>   unsigned for_writepages:1;  /* This is a writepages() call */
>   unsigned range_cyclic:1;/* range_start is cyclic */
> + unsigned more_io:1; /* more io to be dispatched */
>  
>   void *fs_private;   /* For use by ->writepages() */
>  };
> --- linux-2.6.23-rc2-mm2.orig/mm/page-writeback.c
> +++ linux-2.6.23-rc2-mm2/mm/page-writeback.c
> @@ -382,6 +382,7 @@ static void background_writeout(unsigned
>   global_page_state(NR_UNSTABLE_NFS) < background_thresh
>   && min_pages <= 0)
>   break;
> + wbc.more_io = 0;
>   wbc.encountered_congestion = 0;
>   wbc.nr_to_write = MAX_WRITEBACK_PAGES;
>   wbc.pages_skipped = 0;
> @@ -389,8 +390,9 @@ static void background_writeout(unsigned
>   min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
>   if (wbc.nr_to_write > 0 || 

Re: [PATCH 002 of 6] Introduce rq_for_each_segment replacing rq_for_each_bio

2007-08-17 Thread Geert Uytterhoeven
On Thu, 16 Aug 2007, NeilBrown wrote:
> Every usage of rq_for_each_bio wraps a usage of
> bio_for_each_segment, so these can be combined into
> rq_for_each_segment.
> 
> We define "struct req_iterator" to hold the 'bio' and 'index' that
> are needed for the double iteration.

> --- .prev/drivers/block/ps3disk.c 2007-08-16 20:37:26.0 +1000
> +++ ./drivers/block/ps3disk.c 2007-08-16 20:50:07.0 +1000
> @@ -91,30 +91,30 @@ static void ps3disk_scatter_gather(struc
>  struct request *req, int gather)
>  {
>   unsigned int offset = 0;
> - struct bio *bio;
> - sector_t sector;
> + struct req_iterator iter;
>   struct bio_vec *bvec;
> - unsigned int i = 0, j;
> + unsigned int i = 0;
>   size_t size;
>   void *buf;
>  
> - rq_for_each_bio(bio, req) {
> - sector = bio->bi_sector;
> + rq_for_each_segment(bvec, req, iter) {
> + unsigned long flags;
>   dev_dbg(&dev->sbd.core,
>   "%s:%u: bio %u: %u segs %u sectors from %lu\n",
> - __func__, __LINE__, i, bio_segments(bio),
> - bio_sectors(bio), sector);
> - bio_for_each_segment(bvec, bio, j) {
> + __func__, __LINE__, i, bio_segments(iter.bio),
> + bio_sectors(iter.bio),
> + (unsigned long)iter.bio->bi_sector);
^^^
Superfluous cast: PS3 is 64-bit only, and casts are evil.

> +
>   size = bvec->bv_len;
> - buf = __bio_kmap_atomic(bio, j, KM_IRQ0);
> + buf = bvec_kmap_irq(bvec, flags);

As you already mentioned in the preamble of the patch series, this changes
functionality.

>   if (gather)
>   memcpy(dev->bounce_buf+offset, buf, size);
>   else
>   memcpy(buf, dev->bounce_buf+offset, size);
>   offset += size;
>   flush_kernel_dcache_page(bio_iovec_idx(bio, 
> j)->bv_page);
> - __bio_kunmap_atomic(bio, KM_IRQ0);
> - }
> + bio_kunmap_bvec(bvec, flags);
> +
>   i++;
>   }
>  }

With kind regards,
 
Geert Uytterhoeven
Software Architect

Sony Network and Software Technology Center Europe
The Corporate Village · Da Vincilaan 7-D1 · B-1935 Zaventem · Belgium
 
Phone:+32 (0)2 700 8453 
Fax:  +32 (0)2 700 8622 
E-mail:   [EMAIL PROTECTED] 
Internet: http://www.sony-europe.com/

Sony Network and Software Technology Center Europe  
A division of Sony Service Centre (Europe) N.V. 
Registered office: Technologielaan 7 · B-1840 Londerzeel · Belgium  
VAT BE 0413.825.160 · RPR Brussels  
Fortis Bank Zaventem · Swift GEBABEBB08A · IBAN BE39001382358619

<    1   2   3   4