Re: [PATCH] swap: redirty page if page write fails on swap file

2013-05-01 Thread Simon Jeons

Ping, ;-)
On 04/18/2013 08:13 AM, Simon Jeons wrote:

Hi Jerome,
On 04/17/2013 08:11 PM, Jerome Marchand wrote:

Since commit 62c230b, swap_writepage() calls direct_IO on swap files.
However, in that case page isn't redirtied if I/O fails, and is therefore
handled afterwards as if it has been successfully written to the swap
file, leading to memory corruption when the page is eventually swapped
back in.
This patch sets the page dirty when direct_IO() fails. It fixes a memory


Does the swapfile have an associated page cache that caches its contents
in memory? That is not necessary, correct?



corruption that happened while using swap-over-NFS.

Signed-off-by: Jerome Marchand <jmarc...@redhat.com>
---
  mm/page_io.c |2 ++
  1 files changed, 2 insertions(+), 0 deletions(-)

diff --git a/mm/page_io.c b/mm/page_io.c
index 78eee32..04ca00d 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -222,6 +222,8 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
 	if (ret == PAGE_SIZE) {
 		count_vm_event(PSWPOUT);
 		ret = 0;
+	} else {
+		set_page_dirty(page);
 	}
 	return ret;
 }

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majord...@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: em...@kvack.org




--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 1/2] Make the batch size of the percpu_counter configurable

2013-04-30 Thread Simon Jeons

Hi Tim,
On 04/30/2013 01:12 AM, Tim Chen wrote:

Currently, there is a single, global, variable (percpu_counter_batch) that
controls the batch sizes for every 'struct percpu_counter' on the system.

However, there are some applications, e.g. memory accounting where it is
more appropriate to scale the batch size according to the memory size.
This patch adds the infrastructure to be able to change the batch sizes
for each individual instance of 'struct percpu_counter'.

I have chosen to implement the added batch field as a pointer
(by default pointing to percpu_counter_batch) instead
of a static value.  The reason is that the percpu_counter initialization
can be called when we only have the boot cpu and not all cpus are online,

What does "boot cpu" mean here? Do you mean CPU 0?


and the percpu_counter_batch value has yet to be updated with a
call to the compute_batch_value function.

Thanks to Dave Hansen and Andi Kleen for their comments and suggestions.

Signed-off-by: Tim Chen <tim.c.c...@linux.intel.com>
---
  include/linux/percpu_counter.h | 20 +++-
  lib/percpu_counter.c   | 23 ++-
  2 files changed, 41 insertions(+), 2 deletions(-)

diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h
index d5dd465..5ca7df5 100644
--- a/include/linux/percpu_counter.h
+++ b/include/linux/percpu_counter.h
@@ -22,6 +22,7 @@ struct percpu_counter {
struct list_head list;  /* All percpu_counters are on a list */
  #endif
s32 __percpu *counters;
+   int *batch ____cacheline_aligned_in_smp;
  };
  
  extern int percpu_counter_batch;

@@ -40,11 +41,22 @@ void percpu_counter_destroy(struct percpu_counter *fbc);
  void percpu_counter_set(struct percpu_counter *fbc, s64 amount);
  void __percpu_counter_add(struct percpu_counter *fbc, s64 amount, s32 batch);
  s64 __percpu_counter_sum(struct percpu_counter *fbc);
+void __percpu_counter_batch_resize(struct percpu_counter *fbc, int *batch);
  int percpu_counter_compare(struct percpu_counter *fbc, s64 rhs);
  
+static inline int percpu_counter_and_batch_init(struct percpu_counter *fbc,
+   s64 amount, int *batch)
+{
+   int ret = percpu_counter_init(fbc, amount);
+
+   if (batch && !ret)
+   __percpu_counter_batch_resize(fbc, batch);
+   return ret;
+}
+
  static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount)
  {
-   __percpu_counter_add(fbc, amount, percpu_counter_batch);
+   __percpu_counter_add(fbc, amount, *fbc->batch);
  }
  
  static inline s64 percpu_counter_sum_positive(struct percpu_counter *fbc)

@@ -95,6 +107,12 @@ static inline int percpu_counter_init(struct percpu_counter *fbc, s64 amount)
return 0;
  }
  
+static inline int percpu_counter_and_batch_init(struct percpu_counter *fbc,
+   s64 amount, int *batch)
+{
+   return percpu_counter_init(fbc, amount);
+}
+
  static inline void percpu_counter_destroy(struct percpu_counter *fbc)
  {
  }
diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c
index ba6085d..a75951e 100644
--- a/lib/percpu_counter.c
+++ b/lib/percpu_counter.c
@@ -116,6 +116,7 @@ int __percpu_counter_init(struct percpu_counter *fbc, s64 
amount,
 	lockdep_set_class(&fbc->lock, key);
fbc->count = amount;
fbc->counters = alloc_percpu(s32);
+	fbc->batch = &percpu_counter_batch;
if (!fbc->counters)
return -ENOMEM;
  
@@ -131,6 +132,26 @@ int __percpu_counter_init(struct percpu_counter *fbc, s64 amount,

  }
  EXPORT_SYMBOL(__percpu_counter_init);
  
+void __percpu_counter_batch_resize(struct percpu_counter *fbc, int *batch)
+{
+   unsigned long flags;
+   int cpu;
+
+   if (!batch)
+   return;
+
+	raw_spin_lock_irqsave(&fbc->lock, flags);
+   for_each_online_cpu(cpu) {
+   s32 *pcount = per_cpu_ptr(fbc->counters, cpu);
+   fbc->count += *pcount;
+   *pcount = 0;
+   }
+   *batch = max(*batch, percpu_counter_batch);
+   fbc->batch = batch;
+	raw_spin_unlock_irqrestore(&fbc->lock, flags);
+}
+EXPORT_SYMBOL(__percpu_counter_batch_resize);
+
  void percpu_counter_destroy(struct percpu_counter *fbc)
  {
if (!fbc->counters)
@@ -196,7 +217,7 @@ int percpu_counter_compare(struct percpu_counter *fbc, s64 rhs)
  
  	count = percpu_counter_read(fbc);

/* Check to see if rough count will be sufficient for comparison */
-   if (abs(count - rhs) > (percpu_counter_batch*num_online_cpus())) {
+   if (abs(count - rhs) > ((*fbc->batch)*num_online_cpus())) {
if (count > rhs)
return 1;
else
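
As a usage illustration only (not part of the quoted patch): a minimal,
hypothetical caller of the proposed API, assuming the SMP variant of
percpu_counter_and_batch_init()/__percpu_counter_batch_resize() shown above.
The batch variable must outlive the counter, since fbc->batch points at it.

#include <linux/init.h>
#include <linux/percpu_counter.h>

/* Hypothetical example: a counter with its own batch size. */
static struct percpu_counter my_counter;
static int my_counter_batch = 64;

static int __init my_counter_setup(void)
{
	/* Note: __percpu_counter_batch_resize() raises the value to at
	 * least the global percpu_counter_batch.  Passing NULL instead
	 * of &my_counter_batch keeps the global default. */
	return percpu_counter_and_batch_init(&my_counter, 0, &my_counter_batch);
}

static void my_counter_teardown(void)
{
	percpu_counter_destroy(&my_counter);
}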


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] mm: cond_resched in tlb_flush_mmu to fix soft lockups on !CONFIG_PREEMPT

2013-04-27 Thread Simon Jeons

Hi Michal,
On 12/19/2012 12:11 AM, Michal Hocko wrote:

Since e303297 (mm: extended batches for generic mmu_gather) we are batching
pages to be freed until either tlb_next_batch cannot allocate a new batch or we
are done.


Is there any material that introduces mmu_gather? (See the brief sketch after the quoted patch below.)



This works just fine most of the time but we can get in troubles with
non-preemptible kernel (CONFIG_PREEMPT_NONE or CONFIG_PREEMPT_VOLUNTARY) on
large machines where too aggressive batching might lead to soft lockups during
process exit path (exit_mmap) because there are no scheduling points down the
free_pages_and_swap_cache path and so the freeing can take long enough to
trigger the soft lockup.

The lockup is harmless except when the system is setup to panic on
softlockup which is not that unusual.

The simplest way to work around this issue is to explicitly cond_resched per
batch in tlb_flush_mmu (1020 pages on x86_64).

The following lockup has been reported for a 3.0 kernel with a huge process
(on the order of hundreds of gigs, but I don't know any more details).

[65674.040540] BUG: soft lockup - CPU#56 stuck for 22s! [kernel:31053]
[65674.040544] Modules linked in: af_packet nfs lockd fscache auth_rpcgss 
nfs_acl sunrpc mptctl mptbase autofs4 binfmt_misc dm_round_robin dm_multipath 
bonding cpufreq_conservative cpufreq_userspace cpufreq_powersave pcc_cpufreq 
mperf microcode fuse loop osst sg sd_mod crc_t10dif st qla2xxx 
scsi_transport_fc scsi_tgt netxen_nic i7core_edac iTCO_wdt joydev e1000e 
serio_raw pcspkr edac_core iTCO_vendor_support acpi_power_meter rtc_cmos hpwdt 
hpilo button container usbhid hid dm_mirror dm_region_hash dm_log linear 
uhci_hcd ehci_hcd usbcore usb_common scsi_dh_emc scsi_dh_alua scsi_dh_hp_sw 
scsi_dh_rdac scsi_dh dm_snapshot pcnet32 mii edd dm_mod raid1 ext3 mbcache jbd 
fan thermal processor thermal_sys hwmon cciss scsi_mod
[65674.040602] Supported: Yes
[65674.040604] CPU 56
[65674.040639] Pid: 31053, comm: kernel Not tainted 3.0.31-0.9-default #1 HP 
ProLiant DL580 G7
[65674.040643] RIP: 0010:[81443a88]  [81443a88] _raw_spin_unlock_irqrestore+0x8/0x10
[65674.040656] RSP: 0018:883ec1037af0  EFLAGS: 0206
[65674.040657] RAX: 0e00 RBX: ea01a0817e28 RCX: 88803ffd9e80
[65674.040659] RDX: 0200 RSI: 0206 RDI: 0206
[65674.040661] RBP: 0002 R08: 0001 R09: 887ec724a400
[65674.040663] R10:  R11: dead00200200 R12: 8144c26e
[65674.040665] R13: 0030 R14: 0297 R15: 000e
[65674.040667] FS:  7ed834282700() GS:88c03f20() 
knlGS:
[65674.040669] CS:  0010 DS:  ES:  CR0: 8005003b
[65674.040671] CR2: 0068b240 CR3: 003ec13c5000 CR4: 06e0
[65674.040673] DR0:  DR1:  DR2: 
[65674.040675] DR3:  DR6: 0ff0 DR7: 0400
[65674.040678] Process kernel (pid: 31053, threadinfo 883ec1036000, task 
883ebd5d4100)
[65674.040680] Stack:
[65674.042972]  810fc935 88a9f1e182b0 0206 
0009
[65674.042978]   ea01a0817e60 ea0211d3a808 
ea0211d3a840
[65674.042983]  ea01a0827a28 ea01a0827a60 ea0288a598c0 
ea0288a598f8
[65674.042989] Call Trace:
[65674.045765]  [810fc935] release_pages+0xc5/0x260
[65674.045779]  [811289dd] free_pages_and_swap_cache+0x9d/0xc0
[65674.045786]  [81115d6c] tlb_flush_mmu+0x5c/0x80
[65674.045791]  [8111628e] tlb_finish_mmu+0xe/0x50
[65674.045796]  [8111c65d] exit_mmap+0xbd/0x120
[65674.045805]  [810582d9] mmput+0x49/0x120
[65674.045813]  [8105cbb2] exit_mm+0x122/0x160
[65674.045818]  [8105e95a] do_exit+0x17a/0x430
[65674.045824]  [8105ec4d] do_group_exit+0x3d/0xb0
[65674.045831]  [8106f7c7] get_signal_to_deliver+0x247/0x480
[65674.045840]  [81002931] do_signal+0x71/0x1b0
[65674.045845]  [81002b08] do_notify_resume+0x98/0xb0
[65674.045853]  [8144bb60] int_signal+0x12/0x17
[65674.046737] DWARF2 unwinder stuck at int_signal+0x12/0x17

Signed-off-by: Michal Hocko <mho...@suse.cz>
Cc: sta...@vger.kernel.org # 3.0 and higher
---
  mm/memory.c |1 +
  1 file changed, 1 insertion(+)

diff --git a/mm/memory.c b/mm/memory.c
index 1f6cae4..bcd3d5c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -239,6 +239,7 @@ void tlb_flush_mmu(struct mmu_gather *tlb)
 	for (batch = &tlb->local; batch; batch = batch->next) {
free_pages_and_swap_cache(batch->pages, batch->nr);
batch->nr = 0;
+   cond_resched();
}
 	tlb->active = &tlb->local;
  }
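
In response to the question above about introductory material on mmu_gather:
a rough sketch of the generic batching structures that tlb_flush_mmu() walks,
reconstructed from memory of include/asm-generic/tlb.h around v3.0 (field
details may differ slightly), just to show what tlb->local, tlb->active,
batch->pages and batch->next in the patch refer to.

#define MMU_GATHER_BUNDLE 8

struct mmu_gather_batch {
	struct mmu_gather_batch	*next;	/* next (page-sized) batch in the chain */
	unsigned int		nr;	/* pages queued in this batch */
	unsigned int		max;	/* capacity of pages[] */
	struct page		*pages[0];
};

struct mmu_gather {
	struct mm_struct	*mm;
	unsigned int		need_flush;	/* TLB flush still pending */
	struct mmu_gather_batch	*active;	/* batch currently being filled */
	struct mmu_gather_batch	local;		/* small bootstrap batch... */
	struct page		*__pages[MMU_GATHER_BUNDLE]; /* ...backed by this array */
};

During unmap, pages to be freed are queued into these batches; tlb_flush_mmu()
then flushes the TLB and frees each batch via free_pages_and_swap_cache(),
which is where the added cond_resched() provides a scheduling point.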


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 0/10] Reduce system disruption due to kswapd V2

2013-04-22 Thread Simon Jeons

Hi Zlatko,
On 04/22/2013 02:54 PM, Zlatko Calusic wrote:

On 22.04.2013 08:43, Simon Jeons wrote:

Hi Zlatko,
On 04/22/2013 02:37 PM, Zlatko Calusic wrote:

On 12.04.2013 22:07, Zlatko Calusic wrote:

On 12.04.2013 21:40, Mel Gorman wrote:

On Thu, Apr 11, 2013 at 10:55:13PM +0200, Zlatko Calusic wrote:

On 09.04.2013 13:06, Mel Gorman wrote:


- The only slightly negative thing I observed is that with the patch
applied kswapd burns 10x - 20x more CPU. So instead of about 15
seconds, it has now spent more than 4 minutes on one particular
machine with a quite steady load (after about 12 days of uptime).
Admittedly, that's still nothing too alarming, but...



Would you happen to know what circumstances trigger the higher CPU
usage?



Really nothing special. The server is lightly loaded, but it does enough
reading from the disk so that pagecache is mostly populated and page
reclaiming is active. So, kswapd is no doubt using CPU time gradually,
nothing extraordinary.

When I sent my reply yesterday, the server uptime was 12 days, and
kswapd had accumulated 4:28 CPU time. Now, approx 24 hours later (13
days uptime):

root        23  0.0  0.0      0     0 ?        S    Mar30   4:52 [kswapd0]

I will apply your v3 series soon and see if there's any improvement wrt
CPU usage, although as I said I don't see that as a big issue. It's
still only 0.013% of available CPU resources (dual core CPU).



JFTR, v3 kswapd uses about 15% more CPU time than v2. 2:50 kswapd CPU
time after 6 days 14h uptime.

And find attached another debugging graph that shows how ANON pages
are privileged in the ZONE_NORMAL on a 4GB machine. Take notice that
the number of pages in the ZONE_DMA32 is scaled (/5) to fit the graph
nicely.



Could you tell me how you drew this graph?



It's a home made server monitoring system. I just added the code 
needed to graph the size of active + inactive LRU lists, per zone and 
per type. Check out http://oss.oetiker.ch/rrdtool/


Thanks Zlatko, I installed it successfully. Could you tell me which options you used?
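
Zlatko's exact setup isn't shown in the thread, but as a hypothetical
illustration, the per-zone active/inactive LRU sizes graphed above can be
sampled from /proc/zoneinfo with a small helper like this (feeding the
samples into rrdtool is omitted):

#include <stdio.h>

/* Hypothetical sampler: print per-zone active/inactive LRU counters
 * (in pages) parsed from /proc/zoneinfo. */
int main(void)
{
	char line[256], zone[64] = "?";
	unsigned long val;
	FILE *f = fopen("/proc/zoneinfo", "r");

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f)) {
		if (sscanf(line, "Node %*d, zone %63s", zone) == 1)
			continue;
		if (sscanf(line, " nr_active_anon %lu", &val) == 1)
			printf("%s nr_active_anon %lu\n", zone, val);
		else if (sscanf(line, " nr_inactive_anon %lu", &val) == 1)
			printf("%s nr_inactive_anon %lu\n", zone, val);
		else if (sscanf(line, " nr_active_file %lu", &val) == 1)
			printf("%s nr_active_file %lu\n", zone, val);
		else if (sscanf(line, " nr_inactive_file %lu", &val) == 1)
			printf("%s nr_inactive_file %lu\n", zone, val);
	}
	fclose(f);
	return 0;
}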


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 0/10] Reduce system disruption due to kswapd V2

2013-04-22 Thread Simon Jeons

Hi Zlatko,
On 04/22/2013 02:37 PM, Zlatko Calusic wrote:

On 12.04.2013 22:07, Zlatko Calusic wrote:

On 12.04.2013 21:40, Mel Gorman wrote:

On Thu, Apr 11, 2013 at 10:55:13PM +0200, Zlatko Calusic wrote:

On 09.04.2013 13:06, Mel Gorman wrote:


- The only slightly negative thing I observed is that with the patch
applied kswapd burns 10x - 20x more CPU. So instead of about 15
seconds, it has now spent more than 4 minutes on one particular
machine with a quite steady load (after about 12 days of uptime).
Admittedly, that's still nothing too alarming, but...



Would you happen to know what circumstances trigger the higher CPU
usage?



Really nothing special. The server is lightly loaded, but it does enough
reading from the disk so that pagecache is mostly populated and page
reclaiming is active. So, kswapd is no doubt using CPU time gradually,
nothing extraordinary.

When I sent my reply yesterday, the server uptime was 12 days, and
kswapd had accumulated 4:28 CPU time. Now, approx 24 hours later (13
days uptime):

root        23  0.0  0.0      0     0 ?        S    Mar30   4:52 [kswapd0]


I will apply your v3 series soon and see if there's any improvement wrt
CPU usage, although as I said I don't see that as a big issue. It's
still only 0.013% of available CPU resources (dual core CPU).



JFTR, v3 kswapd uses about 15% more CPU time than v2. 2:50 kswapd CPU 
time after 6 days 14h uptime.


And find attached another debugging graph that shows how ANON pages 
are privileged in the ZONE_NORMAL on a 4GB machine. Take notice that 
the number of pages in the ZONE_DMA32 is scaled (/5) to fit the graph 
nicely.




Could you tell me how you drew this graph?

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC PATCH v2 00/15][Sorted-buddy] mm: Memory Power Management

2013-04-18 Thread Simon Jeons

Hi Srivatsa,
On 04/10/2013 05:45 AM, Srivatsa S. Bhat wrote:

[I know, this cover letter is a little too long, but I wanted to clearly
explain the overall goals and the high-level design of this patchset in
detail. I hope this helps more than it annoys, and makes it easier for
reviewers to relate to the background and the goals of this patchset.]


Overview of Memory Power Management and its implications to the Linux MM


Today, we are increasingly seeing computer systems sporting larger and larger
amounts of RAM, in order to meet workload demands. However, memory consumes a
significant amount of power, potentially upto more than a third of total system
power on server systems. So naturally, memory becomes the next big target for
power management - on embedded systems and smartphones, and all the way upto
large server systems.

Power-management capabilities in modern memory hardware:
---

Modern memory hardware such as DDR3 support a number of power management
capabilities - for instance, the memory controller can automatically put


The memory controller is integrated into the CPU on NUMA systems and mounted
on PCI-E on UMA systems, correct? How can the memory controller know which
memory DIMMs/banks it will control?



memory DIMMs/banks into content-preserving low-power states, if it detects
that that *entire* memory DIMM/bank has not been referenced for a threshold
amount of time, thus reducing the energy consumption of the memory hardware.
We term these power-manageable chunks of memory as "Memory Regions".

Exporting memory region info of the platform to the OS:
--

The OS needs to know about the granularity at which the hardware can perform
automatic power-management of the memory banks (i.e., the address boundaries
of the memory regions). On ARM platforms, the bootloader can be modified to
pass on this info to the kernel via the device-tree. On x86 platforms, the
new ACPI 5.0 spec has added support for exporting the power-management
capabilities of the memory hardware to the OS in a standard way[5].

Estimate of power-savings from power-aware Linux MM:
---

Once the firmware/bootloader exports the required info to the OS, it is upto
the kernel's MM subsystem to make the best use of these capabilities and manage
memory power-efficiently. It had been demonstrated on a Samsung Exynos board
(with 2 GB RAM) that upto 6 percent of total system power can be saved by
making the Linux kernel MM subsystem power-aware[4]. (More savings can be
expected on systems with larger amounts of memory, and perhaps improved further
using better MM designs).


How do we know that 6 percent of total system power can be saved by
making the Linux kernel MM subsystem power-aware?





Role of the Linux MM in enhancing memory power savings:
--

Often, this simply translates to having the Linux MM understand the granularity
at which RAM modules can be power-managed, and keeping the memory allocations
and references consolidated to a minimum no. of these power-manageable
"memory regions". It is of particular interest to note that most of these memory
hardware have the intelligence to automatically save power, by putting memory
banks into (content-preserving) low-power states when not referenced for a


How does the hardware know that a DIMM/bank is not being referenced?


threshold amount of time. All that the kernel has to do, is avoid wrecking
the power-savings logic by scattering its allocations and references all over
the system memory. (The kernel/MM doesn't have to perform the actual power-state
transitions; its mostly done in the hardware automatically, and this is OK
because these are *content-preserving* low-power states).

So we can summarize the goals for the Linux MM as:

o Consolidate memory allocations and/or references such that they are not
spread across the entire memory address space.  Basically the area of memory
that is not being referenced can reside in low power state.

o Support light-weight targetted memory compaction/reclaim, to evacuate
lightly-filled memory regions. This helps avoid memory references to
those regions, thereby allowing them to reside in low power states.


Assumptions and goals of this patchset:
--

In this patchset, we don't handle the part of getting the region boundary info
from the firmware/bootloader and populating it in the kernel data-structures.
The aim of this patchset is to propose and brainstorm on a power-aware design
of the Linux MM which can *use* the region boundary info to influence the MM
at various places such as page allocation, reclamation/compaction etc, thereby
contributing to memory power savings. (This patchset is very much an RFC at
the moment and is not intended for mainline-inclusion yet).

So, in 


Re: [RFC Patch 0/2] mm: Add parameters to make kernel behavior at memory error on dirty cache selectable

2013-04-17 Thread Simon Jeons
Hi Naoya,
On 04/17/2013 10:55 PM, Naoya Horiguchi wrote:
> On Wed, Apr 17, 2013 at 03:14:36PM +0800, Simon Jeons wrote:
>> Hi Naoya,
>> On 04/11/2013 03:11 PM, Naoya Horiguchi wrote:
>>> Hi Tanino-san,
>>>
>>> On Thu, Apr 11, 2013 at 12:26:19PM +0900, Mitsuhiro Tanino wrote:
>>> ...
>>>> Solution
>>>> -
>>>> The patch proposes a new sysctl interface, vm.memory_failure_dirty_panic,
>>>> in order to prevent data corruption comes from data lost problem.
>>>> Also this patch displays information of affected file such as device name,
>>>> inode number, file offset and file type if the file is mapped on a memory
>>>> and the page is dirty cache.
>>>>
>>>> When SRAO machine check occurs on a dirty page cache, corresponding
>>>> data cannot be recovered any more. Therefore, the patch proposes a kernel
>>>> option to keep a system running or force system panic in order
>>>> to avoid further trouble such as data corruption problem of application.
>>>>
>>>> System administrator can select an error action using this option
>>>> according to characteristics of target system.
>>> Can we do this in userspace?
>>> mcelog can trigger scripts when a MCE which matches the user-configurable
>>> conditions happens, so I think that we can trigger a kernel panic by
>>> chekcing kernel messages from the triggered script.
>>> For that purpose, I recently fixed the dirty/clean messaging in commit
>>> ff604cf6d4 "mm: hwpoison: fix action_result() to print out dirty/clean".
>> In your commit ff604cf6d4, you mentioned that "because when we check
>> PageDirty in action_result() it was cleared after page isolation even if
>> it's dirty before error handling." Could you point out where the page
>> isolation and the PageDirty clearing happen? I don't think it is in isolate_lru_pages.
> Here is the result of ftracing of memory_failure().
> cancel_dirty_page() is called inside me_pagecache_dirty(), that's it.

Cool! Which options did you use for this ftrace?

>
>mceinj.sh-7662  [000] 154195.857024: funcgraph_entry:             |memory_failure() {
>mceinj.sh-7662  [000] 154195.857024: funcgraph_entry: 0.283 us    |  PageHuge();
>mceinj.sh-7662  [000] 154195.857025: funcgraph_entry: 0.321 us    |  _cond_resched();
>mceinj.sh-7662  [000] 154195.857025: funcgraph_entry: 0.348 us    |  hwpoison_filter();
>mceinj.sh-7662  [000] 154195.857026: funcgraph_entry: 0.323 us    |  PageHuge();
>mceinj.sh-7662  [000] 154195.857027: funcgraph_entry: 0.264 us    |  PageHuge();
>mceinj.sh-7662  [000] 154195.857027: funcgraph_entry:             |  kmem_cache_alloc_trace() {
>mceinj.sh-7662  [000] 154195.857028: funcgraph_entry: 0.254 us    |_cond_resched();
>mceinj.sh-7662  [000] 154195.857028: funcgraph_exit:  0.905 us    |  }
>mceinj.sh-7662  [000] 154195.857029: funcgraph_entry: 0.308 us    |  _read_lock();
>mceinj.sh-7662  [000] 154195.857029: funcgraph_entry: 0.326 us    |  _spin_lock();
>mceinj.sh-7662  [000] 154195.857057: funcgraph_entry:             |  kfree() {
>mceinj.sh-7662  [000] 154195.857057: funcgraph_entry: 0.252 us    |__phys_addr();
>mceinj.sh-7662  [000] 154195.857058: funcgraph_exit:  1.000 us    |  }
>mceinj.sh-7662  [000] 154195.857058: funcgraph_entry:             |  try_to_unmap() {
>mceinj.sh-7662  [000] 154195.857058: funcgraph_entry:             |try_to_unmap_file() {
>mceinj.sh-7662  [000] 154195.857059: funcgraph_entry: 0.430 us    |  _spin_lock();
>mceinj.sh-7662  [000] 154195.857060: funcgraph_entry: 0.719 us    |  vma_prio_tree_next();
>mceinj.sh-7662  [000] 154195.857061: funcgraph_entry:             |  try_to_unmap_one() {
>mceinj.sh-7662  [000] 154195.857061: funcgraph_entry:             |page_check_address() {
>mceinj.sh-7662  [000] 154195.857061: funcgraph_entry: 0.256 us    |  PageHuge();
>mceinj.sh-7662  [000] 154195.857062: funcgraph_entry: 0.419 us    |  _spin_lock();
>mcein

Re: [PATCH] swap: redirty page if page write fails on swap file

2013-04-17 Thread Simon Jeons

Hi Jerome,
On 04/17/2013 08:11 PM, Jerome Marchand wrote:

Since commit 62c230b, swap_writepage() calls direct_IO on swap files.
However, in that case page isn't redirtied if I/O fails, and is therefore
handled afterwards as if it has been successfully written to the swap
file, leading to memory corruption when the page is eventually swapped
back in.
This patch sets the page dirty when direct_IO() fails. It fixes a memory


Does the swapfile have an associated page cache that caches its contents
in memory? That is not necessary, correct?



corruption that happened while using swap-over-NFS.

Signed-off-by: Jerome Marchand <jmarc...@redhat.com>
---
  mm/page_io.c |2 ++
  1 files changed, 2 insertions(+), 0 deletions(-)

diff --git a/mm/page_io.c b/mm/page_io.c
index 78eee32..04ca00d 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -222,6 +222,8 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
if (ret == PAGE_SIZE) {
count_vm_event(PSWPOUT);
ret = 0;
+   } else {
+   set_page_dirty(page);
}
return ret;
}

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majord...@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: em...@kvack.org


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] mm: mmu_notifier: re-fix freed page still mapped in secondary MMU

2013-04-17 Thread Simon Jeons

Hi Robin,
On 04/16/2013 05:31 PM, Robin Holt wrote:

On Tue, Apr 16, 2013 at 02:39:49PM +0800, Xiao Guangrong wrote:

The commit 751efd8610d3 (mmu_notifier_unregister NULL Pointer deref
and multiple ->release()) breaks the fix:
 3ad3d901bbcfb15a5e4690e55350db0899095a68
 (mm: mmu_notifier: fix freed page still mapped in secondary MMU)

Can you describe how the page is still mapped?  I thought I had all
cases covered.  Whichever call hits first, I thought we had one callout
to the registered notifiers.  Are you saying we need multiple callouts?

Also, shouldn't you be asking for a revert commit and then supply a
subsequent commit for the real fix?  I thought that was the process for
doing a revert.


mmu_notifier is used to keep normal ptes and sptes in sync, correct?
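
For context on the question above: a minimal sketch of how a secondary-MMU
user hooks into mmu_notifier. The mmu_notifier_ops callbacks and
mmu_notifier_register() are the real API of that era; the my_* handlers are
hypothetical.

#include <linux/mmu_notifier.h>

/* Hypothetical handlers: a secondary MMU (e.g. shadow/EPT page tables)
 * reacts to changes in the primary page tables of 'mm'. */
static void my_release(struct mmu_notifier *mn, struct mm_struct *mm)
{
	/* mm is being torn down: drop all secondary mappings (sptes)
	 * before the underlying pages are freed. */
}

static void my_invalidate_range_start(struct mmu_notifier *mn,
				      struct mm_struct *mm,
				      unsigned long start, unsigned long end)
{
	/* primary ptes in [start, end) are about to change: flush the
	 * corresponding sptes so stale translations are not used. */
}

static const struct mmu_notifier_ops my_ops = {
	.release		= my_release,
	.invalidate_range_start	= my_invalidate_range_start,
};

static struct mmu_notifier my_notifier = { .ops = &my_ops };

static int my_attach(struct mm_struct *mm)
{
	/* registers the callbacks; mmu_notifier_unregister() and
	 * ->release() are the paths the patch below is concerned with */
	return mmu_notifier_register(&my_notifier, mm);
}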



Thanks,
Robin Holt


This patch reverts the commit and simply fix the bug spotted
by that patch

This bug is spotted by commit 751efd8610d3:
==
There is a race condition between mmu_notifier_unregister() and
__mmu_notifier_release().

Assume two tasks, one calling mmu_notifier_unregister() as a result of a
filp_close() ->flush() callout (task A), and the other calling
mmu_notifier_release() from an mmput() (task B).

 A   B
t1  srcu_read_lock()
t2  if (!hlist_unhashed())
t3  srcu_read_unlock()
t4  srcu_read_lock()
t5  hlist_del_init_rcu()
t6  synchronize_srcu()
t7  srcu_read_unlock()
t8  hlist_del_rcu()  <--- NULL pointer deref.
==

This can be fixed by using hlist_del_init_rcu instead of hlist_del_rcu.

The another issue spotted in the commit is
"multiple ->release() callouts", we needn't care it too much because
it is really rare (e.g, can not happen on kvm since mmu-notify is unregistered
after exit_mmap()) and the later call of multiple ->release should be
fast since all the pages have already been released by the first call.

Signed-off-by: Xiao Guangrong 
---
  mm/mmu_notifier.c |   81 +++--
  1 files changed, 41 insertions(+), 40 deletions(-)

diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index be04122..606777a 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -40,48 +40,45 @@ void __mmu_notifier_release(struct mm_struct *mm)
int id;

/*
-* srcu_read_lock() here will block synchronize_srcu() in
-* mmu_notifier_unregister() until all registered
-* ->release() callouts this function makes have
-* returned.
+* SRCU here will block mmu_notifier_unregister until
+* ->release returns.
 */
 	id = srcu_read_lock(&srcu);
+	hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist)
+   /*
+* if ->release runs before mmu_notifier_unregister it
+* must be handled as it's the only way for the driver
+* to flush all existing sptes and stop the driver
+* from establishing any more sptes before all the
+* pages in the mm are freed.
+*/
+   if (mn->ops->release)
+   mn->ops->release(mn, mm);
+	srcu_read_unlock(&srcu, id);
+
 	spin_lock(&mm->mmu_notifier_mm->lock);
 	while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) {
mn = hlist_entry(mm->mmu_notifier_mm->list.first,
 struct mmu_notifier,
 hlist);
-
/*
-* Unlink.  This will prevent mmu_notifier_unregister()
-* from also making the ->release() callout.
+* We arrived before mmu_notifier_unregister so
+* mmu_notifier_unregister will do nothing other than
+* to wait ->release to finish and
+* mmu_notifier_unregister to return.
 */
 		hlist_del_init_rcu(&mn->hlist);
-		spin_unlock(&mm->mmu_notifier_mm->lock);
-
-   /*
-* Clear sptes. (see 'release' description in mmu_notifier.h)
-*/
-   if (mn->ops->release)
-   mn->ops->release(mn, mm);
-
-		spin_lock(&mm->mmu_notifier_mm->lock);
}
 	spin_unlock(&mm->mmu_notifier_mm->lock);

/*
-* All callouts to ->release() which we have done are complete.
-* Allow synchronize_srcu() in mmu_notifier_unregister() to complete
-*/
-	srcu_read_unlock(&srcu, id);
-
-   /*
-* mmu_notifier_unregister() may have unlinked a notifier and may
-* still be calling out to it.  Additionally, other notifiers
-* may have been active via vmtruncate() et. al. Block here
-* to ensure that all notifier callouts for this mm have been
-* 

Re: [RFC Patch 0/2] mm: Add parameters to make kernel behavior at memory error on dirty cache selectable

2013-04-17 Thread Simon Jeons
Hi Naoya,
On 04/11/2013 03:11 PM, Naoya Horiguchi wrote:
> Hi Tanino-san,
>
> On Thu, Apr 11, 2013 at 12:26:19PM +0900, Mitsuhiro Tanino wrote:
> ...
>> Solution
>> -
>> The patch proposes a new sysctl interface, vm.memory_failure_dirty_panic,
>> in order to prevent data corruption comes from data lost problem.
>> Also this patch displays information of affected file such as device name,
>> inode number, file offset and file type if the file is mapped on a memory
>> and the page is dirty cache.
>>
>> When SRAO machine check occurs on a dirty page cache, corresponding
>> data cannot be recovered any more. Therefore, the patch proposes a kernel
>> option to keep a system running or force system panic in order
>> to avoid further trouble such as data corruption problem of application.
>>
>> System administrator can select an error action using this option
>> according to characteristics of target system.
> Can we do this in userspace?
> mcelog can trigger scripts when a MCE which matches the user-configurable
> conditions happens, so I think that we can trigger a kernel panic by
> chekcing kernel messages from the triggered script.
> For that purpose, I recently fixed the dirty/clean messaging in commit
> ff604cf6d4 "mm: hwpoison: fix action_result() to print out dirty/clean".

In your commit ff604cf6d4, you mentioned that "because when we check
PageDirty in action_result() it was cleared after page isolation even if
it's dirty before error handling." Could you point out where the page
isolation and the PageDirty clearing happen? I don't think it is in isolate_lru_pages.

>
>> Use Case
>> -
>> This option is intended to be adopted in KVM guest because it is
>> supposed that Linux on KVM guest operates customers business and
>> it is big impact to lost or corrupt customers data by memory failure.
>>
>> On the other hand, this option does not recommend to apply KVM host
>> as following reasons.
>>
>> - Making KVM host panic has a big impact because all virtual guests are
>>   affected by their host panic. Affected virtual guests are forced to stop
>>   and have to be restarted on the other hypervisor.
> In this reasoning, you seem to assume that important data (business data)
> are only handled on guest OS. That's true in most cases, but not always.
> I think that the more general approach for this use case is that
> we trigger kernel panic if memory errors happened on dirty pagecaches
> used by 'important' processes (for example by adding process flags
> controlled by prctl(),) and set it on qemu processes.
>
>> - If disk cached model of qemu is set to "none", I/O type of virtual
>>   guests becomes O_DIRECT and KVM host does not cache guest's disk I/O.
>>   Therefore, if SRAO machine check is reported on a dirty page cache
>>   in KVM host, its virtual machines are not affected by the machine check.
>>   So the host is expected to keep operating instead of kernel panic.
> What to do if there're multiple guests, and some have "none" cache and
> others have other types?
> I think that we need more flexible settings for this use case.
>
>> Past discussion
>> 
>> This problem was previously discussed in the kernel community, 
>> (refer: mail threads pertaining to
>> http://marc.info/?l=linux-kernel&m=135187403804934&w=4).
>>
 - I worry that if a hardware error occurs, it might affect a large
   amount of memory all at the same time.  For example, if a 4G memory
   block goes bad, this message will be printed a million times?
>> As Andrew mentioned in the above threads, if 4GB memory blocks goes bad,
>> error messages will be printed a million times and this behavior loses
>> a system reliability.
> Maybe "4G memory block goes bad" is not a MCE SRAO but a MCE with higher
> severity, so we have no choice but to make kernel panic.
>
> Thanks,
> Naoya Horiguchi
>
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majord...@kvack.org.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: em...@kvack.org

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC Patch 0/2] mm: Add parameters to make kernel behavior at memory error on dirty cache selectable

2013-04-17 Thread Simon Jeons
Hi Naoya,
On 04/11/2013 11:23 PM, Naoya Horiguchi wrote:
> On Thu, Apr 11, 2013 at 03:49:16PM +0200, Andi Kleen wrote:
>>> As a result, if the dirty cache includes user data, the data is lost,
>>> and data corruption occurs if an application uses old data.
>> The application cannot use old data, the kernel code kills it if it
>> would do that. And if it's IO data there is an EIO triggered.
>>
>> iirc the only concern in the past was that the application may miss
>> the asynchronous EIO because it's cleared on any fd access. 
>>
>> This is a general problem not specific to memory error handling, 
>> as these asynchronous IO errors can happen due to other reason
>> (bad disk etc.) 
>>
>> If you're really concerned about this case I think the solution
>> is to make the EIO more sticky so that there is a higher chance
>> than it gets returned.  This will make your data much more safe,
>> as it will cover all kinds of IO errors, not just the obscure memory
>> errors.
> I'm interested in this topic, and in previous discussion, what I was told
> is that we can't expect user applications to change their behaviors when
> they get EIO, so globally changing EIO's stickiness is not a great approach.

Will the user applications get EIO first or SIGKILL first?
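
For reference, the "asynchronous EIO" being discussed is the writeback error
that userspace only sees on a later fsync()/close(); whether the process
instead receives a signal depends on how and where the poisoned page was
mapped. A minimal userspace sketch of checking for the error:

#include <errno.h>
#include <stdio.h>
#include <unistd.h>

/* Minimal sketch: surface a pending writeback error on this fd.
 * Per the discussion above, such an EIO is not sticky on older
 * kernels and may be consumed by whichever access sees it first. */
static int flush_and_check(int fd)
{
	if (fsync(fd) == -1 && errno == EIO) {
		fprintf(stderr, "data previously written to fd %d may be lost\n", fd);
		return -1;
	}
	return 0;
}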

> I'm working on a new pagecache tag based mechanism to solve this.
> But it needs time and more discussions.
> So I guess Tanino-san suggests giving up on dirty pagecache errors
> as a quick solution.
>
> Thanks,
> Naoya
>
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majord...@kvack.org.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
 Don't email: em...@kvack.org

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC Patch 0/2] mm: Add parameters to make kernel behavior at memory error on dirty cache selectable

2013-04-17 Thread Simon Jeons
Hi Naoya,
On 04/11/2013 03:11 PM, Naoya Horiguchi wrote:
 Hi Tanino-san,

 On Thu, Apr 11, 2013 at 12:26:19PM +0900, Mitsuhiro Tanino wrote:
 ...
 Solution
 -
 The patch proposes a new sysctl interface, vm.memory_failure_dirty_panic,
 in order to prevent data corruption comes from data lost problem.
 Also this patch displays information of affected file such as device name,
 inode number, file offset and file type if the file is mapped on a memory
 and the page is dirty cache.

 When SRAO machine check occurs on a dirty page cache, corresponding
 data cannot be recovered any more. Therefore, the patch proposes a kernel
 option to keep a system running or force system panic in order
 to avoid further trouble such as data corruption problem of application.

 System administrator can select an error action using this option
 according to characteristics of target system.
 Can we do this in userspace?
 mcelog can trigger scripts when a MCE which matches the user-configurable
 conditions happens, so I think that we can trigger a kernel panic by
 chekcing kernel messages from the triggered script.
 For that purpose, I recently fixed the dirty/clean messaging in commit
 ff604cf6d4 mm: hwpoison: fix action_result() to print out dirty/clean.

In your commit ff604cf6d4, you mentioned that when we check PageDirty in
action_result() it has already been cleared after page isolation, even if the
page was dirty before error handling. Could you point out where the page is
isolated and PageDirty is cleared? I don't think it is isolate_lru_pages().


 Use Case
 -
 This option is intended to be adopted in KVM guest because it is
 supposed that Linux on KVM guest operates customers business and
 it is big impact to lost or corrupt customers data by memory failure.

 On the other hand, this option does not recommend to apply KVM host
 as following reasons.

 - Making KVM host panic has a big impact because all virtual guests are
   affected by their host panic. Affected virtual guests are forced to stop
   and have to be restarted on the other hypervisor.
 In this reasoning, you seem to assume that important data (business data)
 are only handled on guest OS. That's true in most cases, but not always.
 I think that the more general approach for this use case is that
 we trigger kernel panic if memory errors happened on dirty pagecaches
 used by 'important' processes (for example by adding process flags
 controlled by prctl(),) and set it on qemu processes.

 - If disk cached model of qemu is set to none, I/O type of virtual
   guests becomes O_DIRECT and KVM host does not cache guest's disk I/O.
   Therefore, if SRAO machine check is reported on a dirty page cache
   in KVM host, its virtual machines are not affected by the machine check.
   So the host is expected to keep operating instead of kernel panic.
 What to do if there're multiple guests, and some have none cache and
 others have other types?
 I think that we need more flexible settings for this use case.

 Past discussion
 
 This problem was previously discussed in the kernel community, 
 (refer: mail threads pertaining to
 http://marc.info/?l=linux-kernel&m=135187403804934&w=4). 

 - I worry that if a hardware error occurs, it might affect a large
   amount of memory all at the same time.  For example, if a 4G memory
   block goes bad, this message will be printed a million times?
 As Andrew mentioned in the above threads, if 4GB memory blocks goes bad,
 error messages will be printed a million times and this behavior loses
 a system reliability.
 Maybe 4G memory block goes bad is not a MCE SRAO but a MCE with higher
 severity, so we have no choice but to make kernel panic.

 Thanks,
 Naoya Horiguchi

 --
 To unsubscribe, send a message with 'unsubscribe linux-mm' in
 the body to majord...@kvack.org.  For more info on Linux MM,
 see: http://www.linux-mm.org/ .
 Don't email: em...@kvack.org

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] mm: mmu_notifier: re-fix freed page still mapped in secondary MMU

2013-04-17 Thread Simon Jeons

Hi Robin,
On 04/16/2013 05:31 PM, Robin Holt wrote:

On Tue, Apr 16, 2013 at 02:39:49PM +0800, Xiao Guangrong wrote:

The commit 751efd8610d3 (mmu_notifier_unregister NULL Pointer deref
and multiple ->release()) breaks the fix:
 3ad3d901bbcfb15a5e4690e55350db0899095a68
 (mm: mmu_notifier: fix freed page still mapped in secondary MMU)

Can you describe how the page is still mapped?  I thought I had all
cases covered.  Whichever call hits first, I thought we had one callout
to the registered notifiers.  Are you saying we need multiple callouts?

Also, shouldn't you be asking for a revert commit and then supply a
subsequent commit for the real fix?  I thought that was the process for
doing a revert.


mmu_notifier is used to sync normal ptes and sptes, correct?



Thanks,
Robin Holt


This patch reverts the commit and simply fix the bug spotted
by that patch

This bug is spotted by commit 751efd8610d3:
==
There is a race condition between mmu_notifier_unregister() and
__mmu_notifier_release().

Assume two tasks, one calling mmu_notifier_unregister() as a result of a
filp_close() ->flush() callout (task A), and the other calling
mmu_notifier_release() from an mmput() (task B).

 A   B
t1  srcu_read_lock()
t2  if (!hlist_unhashed())
t3  srcu_read_unlock()
t4  srcu_read_lock()
t5  hlist_del_init_rcu()
t6  synchronize_srcu()
t7  srcu_read_unlock()
t8  hlist_del_rcu()  <--- NULL pointer deref.
==

This can be fixed by using hlist_del_init_rcu instead of hlist_del_rcu.
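
For reference, here is roughly what the suggested helper does (a paraphrase of
hlist_del_init_rcu() from include/linux/rculist.h, with a _sketch suffix added
for illustration; not part of this patch):

#include <linux/list.h>

/* A second caller finds the node already unhashed (pprev == NULL) and
 * does nothing, instead of dereferencing the LIST_POISON value that
 * hlist_del_rcu() would have left behind. */
static inline void hlist_del_init_rcu_sketch(struct hlist_node *n)
{
	if (!hlist_unhashed(n)) {
		__hlist_del(n);
		n->pprev = NULL;	/* re-initialize rather than poison */
	}
}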

Another issue spotted in the commit is
multiple ->release() callouts; we needn't care about it too much because
it is really rare (e.g., it cannot happen on kvm since mmu-notify is unregistered
after exit_mmap()) and the later call of multiple ->release should be
fast since all the pages have already been released by the first call.

Signed-off-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com
---
  mm/mmu_notifier.c |   81 +++--
  1 files changed, 41 insertions(+), 40 deletions(-)

diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index be04122..606777a 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -40,48 +40,45 @@ void __mmu_notifier_release(struct mm_struct *mm)
int id;

/*
-* srcu_read_lock() here will block synchronize_srcu() in
-* mmu_notifier_unregister() until all registered
-* ->release() callouts this function makes have
-* returned.
+* SRCU here will block mmu_notifier_unregister until
+* ->release returns.
 */
	id = srcu_read_lock(&srcu);
+   hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist)
+   /*
+* if ->release runs before mmu_notifier_unregister it
+* must be handled as it's the only way for the driver
+* to flush all existing sptes and stop the driver
+* from establishing any more sptes before all the
+* pages in the mm are freed.
+*/
+   if (mn->ops->release)
+   mn->ops->release(mn, mm);
+   srcu_read_unlock(&srcu, id);
+
	spin_lock(&mm->mmu_notifier_mm->lock);
	while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) {
		mn = hlist_entry(mm->mmu_notifier_mm->list.first,
 struct mmu_notifier,
 hlist);
-
/*
-* Unlink.  This will prevent mmu_notifier_unregister()
-* from also making the ->release() callout.
+* We arrived before mmu_notifier_unregister so
+* mmu_notifier_unregister will do nothing other than
+* to wait ->release to finish and
+* mmu_notifier_unregister to return.
 */
		hlist_del_init_rcu(&mn->hlist);
-   spin_unlock(&mm->mmu_notifier_mm->lock);
-
-   /*
-* Clear sptes. (see 'release' description in mmu_notifier.h)
-*/
-   if (mn->ops->release)
-   mn->ops->release(mn, mm);
-
-   spin_lock(&mm->mmu_notifier_mm->lock);
}
	spin_unlock(&mm->mmu_notifier_mm->lock);

/*
-* All callouts to ->release() which we have done are complete.
-* Allow synchronize_srcu() in mmu_notifier_unregister() to complete
-*/
-   srcu_read_unlock(&srcu, id);
-
-   /*
-* mmu_notifier_unregister() may have unlinked a notifier and may
-* still be calling out to it.  Additionally, other notifiers
-* may have been active via vmtruncate() et. al. Block here
-* to ensure that all notifier callouts for 

Re: [PATCH] swap: redirty page if page write fails on swap file

2013-04-17 Thread Simon Jeons

Hi Jerome,
On 04/17/2013 08:11 PM, Jerome Marchand wrote:

Since commit 62c230b, swap_writepage() calls direct_IO on swap files.
However, in that case page isn't redirtied if I/O fails, and is therefore
handled afterwards as if it has been successfully written to the swap
file, leading to memory corruption when the page is eventually swapped
back in.
This patch sets the page dirty when direct_IO() fails. It fixes a memory


Does the swapfile have an associated page cache that caches its contents in
memory? That shouldn't be necessary, correct?



corruption that happened while using swap-over-NFS.

Signed-off-by: Jerome Marchand jmarc...@redhat.com
---
  mm/page_io.c |2 ++
  1 files changed, 2 insertions(+), 0 deletions(-)

diff --git a/mm/page_io.c b/mm/page_io.c
index 78eee32..04ca00d 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -222,6 +222,8 @@ int swap_writepage(struct page *page, struct 
writeback_control *wbc)
if (ret == PAGE_SIZE) {
count_vm_event(PSWPOUT);
ret = 0;
+   } else {
+   set_page_dirty(page);
}
return ret;
}

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majord...@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: em...@kvack.org


--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC Patch 0/2] mm: Add parameters to make kernel behavior at memory error on dirty cache selectable

2013-04-17 Thread Simon Jeons
Hi Naoya,
On 04/17/2013 10:55 PM, Naoya Horiguchi wrote:
 On Wed, Apr 17, 2013 at 03:14:36PM +0800, Simon Jeons wrote:
 Hi Naoya,
 On 04/11/2013 03:11 PM, Naoya Horiguchi wrote:
 Hi Tanino-san,

 On Thu, Apr 11, 2013 at 12:26:19PM +0900, Mitsuhiro Tanino wrote:
 ...
 Solution
 -
 The patch proposes a new sysctl interface, vm.memory_failure_dirty_panic,
 in order to prevent data corruption comes from data lost problem.
 Also this patch displays information of affected file such as device name,
 inode number, file offset and file type if the file is mapped on a memory
 and the page is dirty cache.

 When SRAO machine check occurs on a dirty page cache, corresponding
 data cannot be recovered any more. Therefore, the patch proposes a kernel
 option to keep a system running or force system panic in order
 to avoid further trouble such as data corruption problem of application.

 System administrator can select an error action using this option
 according to characteristics of target system.
 Can we do this in userspace?
 mcelog can trigger scripts when a MCE which matches the user-configurable
 conditions happens, so I think that we can trigger a kernel panic by
 chekcing kernel messages from the triggered script.
 For that purpose, I recently fixed the dirty/clean messaging in commit
 ff604cf6d4 mm: hwpoison: fix action_result() to print out dirty/clean.
 In your commit ff604cf6d4, you mentioned that because when we check
 PageDirty in action_result() it was cleared after page isolation even if
 it's dirty before error handling. Could you point out where page
 isolation and clear PageDirty? I don't think is isolate_lru_pages.
 Here is the result of ftracing of memory_failure().
 cancel_dirty_page() is called inside me_pagecache_dirty(), that's it.

Cool! Which options did you use for this ftrace?


mceinj.sh-7662  [000] 154195.857024: funcgraph_entry:  
  |memory_failure() {
mceinj.sh-7662  [000] 154195.857024: funcgraph_entry:0.283 us  
  |  PageHuge();
mceinj.sh-7662  [000] 154195.857025: funcgraph_entry:0.321 us  
  |  _cond_resched();
mceinj.sh-7662  [000] 154195.857025: funcgraph_entry:0.348 us  
  |  hwpoison_filter();
mceinj.sh-7662  [000] 154195.857026: funcgraph_entry:0.323 us  
  |  PageHuge();
mceinj.sh-7662  [000] 154195.857027: funcgraph_entry:0.264 us  
  |  PageHuge();
mceinj.sh-7662  [000] 154195.857027: funcgraph_entry:  
  |  kmem_cache_alloc_trace() {
mceinj.sh-7662  [000] 154195.857028: funcgraph_entry:0.254 us  
  |_cond_resched();
mceinj.sh-7662  [000] 154195.857028: funcgraph_exit: 0.905 us  
  |  }
mceinj.sh-7662  [000] 154195.857029: funcgraph_entry:0.308 us  
  |  _read_lock();
mceinj.sh-7662  [000] 154195.857029: funcgraph_entry:0.326 us  
  |  _spin_lock();
mceinj.sh-7662  [000] 154195.857057: funcgraph_entry:  
  |  kfree() {
mceinj.sh-7662  [000] 154195.857057: funcgraph_entry:0.252 us  
  |__phys_addr();
mceinj.sh-7662  [000] 154195.857058: funcgraph_exit: 1.000 us  
  |  }
mceinj.sh-7662  [000] 154195.857058: funcgraph_entry:  
  |  try_to_unmap() {
mceinj.sh-7662  [000] 154195.857058: funcgraph_entry:  
  |try_to_unmap_file() {
mceinj.sh-7662  [000] 154195.857059: funcgraph_entry:0.430 us  
  |  _spin_lock();
mceinj.sh-7662  [000] 154195.857060: funcgraph_entry:0.719 us  
  |  vma_prio_tree_next();
mceinj.sh-7662  [000] 154195.857061: funcgraph_entry:  
  |  try_to_unmap_one() {
mceinj.sh-7662  [000] 154195.857061: funcgraph_entry:  
  |page_check_address() {
mceinj.sh-7662  [000] 154195.857061: funcgraph_entry:0.256 us  
  |  PageHuge();
mceinj.sh-7662  [000] 154195.857062: funcgraph_entry:0.419 us  
  |  _spin_lock();
mceinj.sh-7662  [000] 154195.857063: funcgraph_exit: 1.812 us  
  |}
mceinj.sh-7662  [000] 154195.857063: funcgraph_entry:  
  |flush_tlb_page() {
mceinj.sh-7662  [000] 154195.857064: funcgraph_entry:  
  |  native_flush_tlb_others() {
mceinj.sh-7662  [000] 154195.857064: funcgraph_entry:0.286 us  
  |is_uv_system();
mceinj.sh-7662  [000] 154195.857065: funcgraph_entry:  
  |flush_tlb_others_ipi() {
mceinj.sh-7662  [000

Re: [RFC Patch 0/2] mm: Add parameters to make kernel behavior at memory error on dirty cache selectable

2013-04-16 Thread Simon Jeons

Hi Mitsuhiro,
On 04/12/2013 09:43 PM, Mitsuhiro Tanino wrote:

(2013/04/11 22:00), Ric Mason wrote:

Hi Mitsuhiro,
On 04/11/2013 08:51 PM, Mitsuhiro Tanino wrote:

(2013/04/11 12:53), Simon Jeons wrote:

One question against mce instead of the patchset. ;-)

When check memory is bad? Before memory access? Is there a process scan it 
period?

Hi Simon-san,

Yes, there is a process to scan memory periodically.

At Intel Nehalem-EX and CPUs after Nehalem-EX generation, MCA recovery
is supported. MCA recovery provides error detection and isolation
features to work together with OS.
One of the MCA Recovery features is Memory Scrubbing. It periodically
checks memory in the background of OS.

Is Memory Scrubbing a kernel thread? Where is the code for memory scrubbing?

Hi Ric,

No. One of the MCA Recovery features is Memory Scrubbing.


Is Memory Scrubbing a process running in the CPU?


And Memory Scrubbing is a hardware feature of Intel CPU.

OS has a hwpoison feature which is included at mm/memory-failure.c.
A main function is memory_failure().

If Memory Scrubbing finds a memory error, MCA recovery notifies SRAO error
into OS and OS handles the SRAO error using hwpoison function.



If Memory Scrubbing find an uncorrectable error on a memory before
OS accesses the memory bit, MCA recovery notifies SRAO error into OS

It may not find the memory error in time if it is not running when the error
occurs; can this case happen?

Memory Scrubbing seems to be operated periodically but I don't have
information about how oftern it is executed.


If Memory Scrubbing doesn't catch the memory error in time, who will send 
the SRAR into the OS?




Regards,
Mitsuhiro Tanino

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majord...@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: em...@kvack.org


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC Patch 0/2] mm: Add parameters to make kernel behavior at memory error on dirty cache selectable

2013-04-16 Thread Simon Jeons

On 04/11/2013 09:49 PM, Andi Kleen wrote:

As a result, if the dirty cache includes user data, the data is lost,
and data corruption occurs if an application uses old data.


Hi Andi,

Could you give me the link of your mce testcase?


The application cannot use old data, the kernel code kills it if it
would do that. And if it's IO data there is an EIO triggered.

iirc the only concern in the past was that the application may miss
the asynchronous EIO because it's cleared on any fd access.
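
To make the "cleared on any fd access" behaviour concrete, here is a rough
sketch of how the asynchronous error bit gets consumed (a simplified
illustration of the fdatawait-style check, not code from this thread; the
function name is made up):

#include <linux/fs.h>
#include <linux/pagemap.h>

/* AS_ENOSPC/AS_EIO in mapping->flags are test-and-cleared, so only the
 * first caller that checks them ever sees the error. */
static int consume_async_writeback_error(struct address_space *mapping)
{
	int ret = 0;

	if (test_and_clear_bit(AS_ENOSPC, &mapping->flags))
		ret = -ENOSPC;
	if (test_and_clear_bit(AS_EIO, &mapping->flags))
		ret = -EIO;
	return ret;
}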

This is a general problem not specific to memory error handling,
as these asynchronous IO errors can happen due to other reason
(bad disk etc.)

If you're really concerned about this case I think the solution
is to make the EIO more sticky so that there is a higher chance
than it gets returned.  This will make your data much more safe,
as it will cover all kinds of IO errors, not just the obscure memory
errors.

Or maybe have a panic knob on any IO error for any case if you don't
trust your application to check IO syscalls. But I would rather
have better EIO reporting than just giving up like this.

The problem of tying it just to any dirty data for memory errors
is that most anonymous data is dirty and it doesn't have this problem
at all (because the signals handle this and they cannot be lost)

And that is a far more common case than this relatively unlikely
case of dirty IO data.

So just doing it for "dirty" is not the right knob.

Basically I'm saying if you worry about unreliable IO error reporting
fix IO error reporting, don't add random unnecessary panics to
the memory error handling.

BTW my suspicion is that if you approach this from a data driven
perspective: that is measure how much such dirty data is typically
around in comparison to other data it will be unlikely. Such
a study can be done with the "page-types" program in tools/vm

-Andi

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majord...@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: em...@kvack.org


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [BUG][s390x] mm: system crashed

2013-04-16 Thread Simon Jeons

Hi Heiko,
On 04/16/2013 03:50 PM, Heiko Carstens wrote:

On Mon, Apr 15, 2013 at 02:16:55PM +0800, Zhouping Liu wrote:

On 04/15/2013 01:56 PM, Heiko Carstens wrote:

On Sun, Apr 14, 2013 at 11:28:40PM -0400, Zhouping Liu wrote:

[16109.346170] Call Trace:
[16109.346179] ([<00100920>] show_trace+0x128/0x12c)
[16109.346195]  [<001cd320>] rcu_check_callbacks+0x458/0xccc
[16109.346209]  [<00140f2e>] update_process_times+0x4a/0x74
[16109.346222]  [<00199452>] tick_sched_handle.isra.12+0x5e/0x70
[16109.346235]  [<001995aa>] tick_sched_timer+0x6a/0x98
[16109.346247]  [<0015c1ea>] __run_hrtimer+0x8e/0x200
[16109.346381]  [<0015d1b2>] hrtimer_interrupt+0x212/0x2b0
[16109.346385]  [<001040f6>] clock_comparator_work+0x4a/0x54
[16109.346390]  [<0010d658>] do_extint+0x158/0x15c
[16109.346396]  [<0062aa24>] ext_skip+0x38/0x3c
[16109.346404]  [<001153c8>] smp_yield_cpu+0x44/0x48
[16109.346412] ([<03d10051aec0>] 0x3d10051aec0)
[16109.346457]  [<0024206a>] __page_check_address+0x16a/0x170
[16109.346466]  [<002423a2>] page_referenced_one+0x3e/0xa0
[16109.346501]  [<0024427c>] page_referenced+0x32c/0x41c
[16109.346510]  [<0021b1dc>] shrink_page_list+0x380/0xb9c
[16109.346521]  [<0021c0a6>] shrink_inactive_list+0x1c6/0x56c
[16109.346532]  [<0021c69e>] shrink_lruvec+0x252/0x56c
[16109.346542]  [<0021ca44>] shrink_zone+0x8c/0x1bc
[16109.346553]  [<0021d080>] balance_pgdat+0x50c/0x658
[16109.346564]  [<0021d318>] kswapd+0x14c/0x470
[16109.346576]  [<00158292>] kthread+0xda/0xe4
[16109.346656]  [<0062a5de>] kernel_thread_starter+0x6/0xc
[16109.346682]  [<0062a5d8>] kernel_thread_starter+0x0/0xc
[-- MARK -- Fri Apr 12 06:15:00 2013]
[16289.386061] INFO: rcu_sched self-detected stall on CPU { 0}  (t=42010 
jiffies
  g=89766 c=89765 q=10627)

Did the system really crash or did you just see the rcu related warning(s)?

I just check it again, actually at first the system didn't really
crash, but the system is very slow in response.
and the reproducer process can't be killed, after I did some common
actions such as 'ls' 'vim' etc, the system
seemed to be really crashed, no any response.

also in the previous testing, I can remember that the system would
be no any response for a long time, just only
repeatedly print out the such above 'Call Trace' into console.

Ok, thanks.
Just a couple of more questions: did you see this also on other archs, or just
s390 (if you tried other platforms at all).

If you have some time, could you please repeat your test with the kernel
command line option " user_mode=home "?


What's the meaning of this command line option? I can't find it in 
Documentation/kernel-parameters.txt.




As far as I can tell there was only one s390 patch merged that was
mmap related: 486c0a0bc80d370471b21662bf03f04fbb37cdc6 "s390/mm: Fix crst
upgrade of mmap with MAP_FIXED".
Even though I don't think it explains the bug you've seen it might be worth
to try to revert it.

And at last, can you share your kernel config?

Thanks,
Heiko

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majord...@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: em...@kvack.org


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC Patch 0/2] mm: Add parameters to make kernel behavior at memory error on dirty cache selectable

2013-04-10 Thread Simon Jeons

Hi Mitsuhiro,
On 04/11/2013 11:26 AM, Mitsuhiro Tanino wrote:

Hi All,
Please find a patch set that introduces these new sysctl interfaces,
to handle a case when an memory error is detected on dirty page cache.

- vm.memory_failure_dirty_panic
- vm.memory_failure_print_ratelimit
- vm.memory_failure_print_ratelimit_burst

Problem
-
Recently, it is common that enterprise servers likely have a large
amount of memory, especially for cloud environment. This means that
possibility of memory failures is increased.

To handle memory failure, Linux has a hwpoison feature. When a memory
error is detected by memory scrub, the error is reported as machine
check, uncorrected recoverable (UCR), to OS. Then OS isolates the memory
region with memory failure if the memory page can be isolated.
The hwpoison handles it according to the memory region, such as kernel,
dirty cache, clean cache. If the memory region can be isolated, the
page is marked "hwpoison" and it is not used again.

When SRAO machine check is reported on a page which is included dirty
page cache, the page is truncated because the memory is corrupted and
data of the page cannot be written to a disk any more.

As a result, if the dirty cache includes user data, the data is lost,
and data corruption occurs if an application uses old data.


One question against mce instead of the patchset. ;-)

When is memory checked for errors? Before memory is accessed? Is there a 
process that scans it periodically?





Solution
-
The patch proposes a new sysctl interface, vm.memory_failure_dirty_panic,
in order to prevent data corruption comes from data lost problem.
Also this patch displays information of affected file such as device name,
inode number, file offset and file type if the file is mapped on a memory
and the page is dirty cache.

When SRAO machine check occurs on a dirty page cache, corresponding
data cannot be recovered any more. Therefore, the patch proposes a kernel
option to keep a system running or force system panic in order
to avoid further trouble such as data corruption problem of application.

System administrator can select an error action using this option
according to characteristics of target system.



Use Case
-
This option is intended to be adopted in KVM guest because it is
supposed that Linux on KVM guest operates customers business and
it is big impact to lost or corrupt customers data by memory failure.

On the other hand, this option does not recommend to apply KVM host
as following reasons.

- Making KVM host panic has a big impact because all virtual guests are
   affected by their host panic. Affected virtual guests are forced to stop
   and have to be restarted on the other hypervisor.

- If disk cached model of qemu is set to "none", I/O type of virtual
   guests becomes O_DIRECT and KVM host does not cache guest's disk I/O.
   Therefore, if SRAO machine check is reported on a dirty page cache
   in KVM host, its virtual machines are not affected by the machine check.
   So the host is expected to keep operating instead of kernel panic.


Past discussion

This problem was previously discussed in the kernel community,
(refer: mail threads pertaining to
http://marc.info/?l=linux-kernel&m=135187403804934&w=4).


- I worry that if a hardware error occurs, it might affect a large
   amount of memory all at the same time.  For example, if a 4G memory
   block goes bad, this message will be printed a million times?


As Andrew mentioned in the above threads, if 4GB memory blocks goes bad,
error messages will be printed a million times and this behavior loses
a system reliability.

Therefore, the second patch introduces two sysctl parameters for
__ratelimit() which is used at mce_notify_irq() in order to notify
occurrence of machine check event to system administrator.
The use of __ratelimit(), this patch can limit quantity of messages
per interval to be output at syslog or terminal console.

If system administrator needs to limit quantity of messages,
these parameters are available.

- vm.memory_failure_print_ratelimit:
   Specifies the minimum length of time between messages.
   By default the rate limiting is disabled.

- vm.memory_failure_print_ratelimit_burst:
   Specifies the number of messages we can send before rate limiting.
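
As a rough sketch of the __ratelimit() mechanism referred to above, with
placeholder values standing in for the two proposed sysctls (the state name,
interval and burst below are illustrative assumptions, not taken from the
patch):

#include <linux/jiffies.h>
#include <linux/printk.h>
#include <linux/ratelimit.h>

/* 'interval' (in jiffies) and 'burst' play the role of the proposed
 * sysctls: at most 'burst' messages are emitted per 'interval'. */
static DEFINE_RATELIMIT_STATE(mce_notify_ratelimit, 30 * HZ, 10);

static void notify_admin_of_mce_event(void)
{
	if (__ratelimit(&mce_notify_ratelimit))
		pr_err("Machine check events logged\n");
}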



Test Results
-
These patches are tested on 3.8.1 kernel(FC18) using software pseudo MCE
injection from KVM host to guest.


 Host OS Screen logs(SRAO Machine Check injection) 
Inject software pseudo MCE into guest qemu process.

(1) Load mce-inject module
# modprobe mce-inject

(2) Find a PID of target qemu-kvm and page struct
# ps -C qemu-kvm -o pid=
  8176

(3) Edit software pseudo MCE data
Choose a offset of page struct and insert the offset to ADDR line in mce-file.

#  ./page-types -p 8176 -LN | grep "___UDlAMa_b___"
voffset offset  flags
...
7fd25eb77   344d77  ___UDlAMa_b___
7fd25eb78   344d78  

Re: [PATCH 2/3] mm, slub: count freed pages via rcu as this task's reclaimed_slab

2013-04-10 Thread Simon Jeons

Hi Christoph,
On 04/10/2013 09:54 PM, Christoph Lameter wrote:

On Wed, 10 Apr 2013, Simon Jeons wrote:


It seems that you misunderstand my question. I don't doubt slab/slub can use
high order pages. However, what I focus on is why slab/slub can use compound
page, PageCompound() just on behalf of hugetlbfs pages or thp pages which
should used by apps, isn't it?

I am not entirely clear on what you are asking for. The following gives a
couple of answers to what I guess the question was.

THP pages and user pages are on the lru and are managed differently.
The slab allocators cannot work with those pages.

Slab allocators *can* allocate higher order pages therefore they could
allocate a page of the same order as huge pages and manage it that way.

However there is no way that these pages could be handled like THP pages
since they cannot be broken up (unless we add the capability to move slab
objects which I wanted to do for a long time).


You can boot a Linux system that uses huge pages for slab allocation
by specifying the following parameter on the kernel command line.

slub_min_order=9

The slub allocator will start using huge pages for all its storage
needs. You need a large number of huge pages to do this. Lots of memory
is going to be lost due to fragmentation but its going to be fast since
the slowpaths are rarely used. OOMs due to reclaim failure become much
more likely ;-).
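
To illustrate the distinction (an assumed example, not lifted from the slub
code): any multi-order allocation made with __GFP_COMP yields a compound page,
which is what SLUB requests for its multi-page slabs, yet such a page is never
on the LRU the way THP or hugetlbfs pages are.

#include <linux/gfp.h>
#include <linux/mm.h>

static struct page *alloc_slab_like_compound_page(void)
{
	/* order-3 allocation flagged __GFP_COMP, as SLUB does for its
	 * multi-page slabs: compound, but not an LRU/THP page */
	struct page *page = alloc_pages(GFP_KERNEL | __GFP_COMP, 3);

	if (page) {
		WARN_ON(!PageCompound(page));
		WARN_ON(PageLRU(page));
	}
	return page;
}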



It seems that I need to simplify my question:
are all pages of order >= 1 compound pages?


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] x86: add phys addr validity check for /dev/mem mmap

2013-04-10 Thread Simon Jeons

Hi H.Peter,
On 04/11/2013 10:48 AM, H. Peter Anvin wrote:

On 04/10/2013 07:40 PM, Simon Jeons wrote:

Hi H.Peter,
On 04/04/2013 09:13 AM, H. Peter Anvin wrote:

On 04/03/2013 06:11 PM, Simon Jeons wrote:

Why we consider boot_cpu_data.x86_phys_bits instead of e820 map here?


Because x86_phys_bits is what controls how much address space the
processor has.  e820 tells us how much *RAM* the machine has, or
specifically, how much RAM the machine had on boot.

I have 8GB memory in my machine, but when I accumulated every e820
ranges which dump in dmesg, there are 25MB memory less then 8GB(1024*8)
memory, why 25MB miss?


For whatever reason your BIOS is stealing some memory, possibly for video.


Thanks for your quick response. ;-)
My machine is new and has an i7 CPU. How much memory does video need? 8MB? 
Why am I missing 25MB?




-hpa




--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] x86: add phys addr validity check for /dev/mem mmap

2013-04-10 Thread Simon Jeons

Hi H.Peter,
On 04/04/2013 09:13 AM, H. Peter Anvin wrote:

On 04/03/2013 06:11 PM, Simon Jeons wrote:

Why we consider boot_cpu_data.x86_phys_bits instead of e820 map here?


Because x86_phys_bits is what controls how much address space the
processor has.  e820 tells us how much *RAM* the machine has, or
specifically, how much RAM the machine had on boot.
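
As a hedged sketch of the kind of check the patch in the subject is about (an
illustration against boot_cpu_data.x86_phys_bits, not the actual hunk from the
patch):

#include <linux/types.h>
#include <asm/processor.h>

/* Reject /dev/mem ranges beyond what the CPU can physically address,
 * regardless of how much RAM e820 reported at boot. */
static inline bool phys_range_is_addressable(u64 addr, u64 count)
{
	return (addr + count) <= (1ULL << boot_cpu_data.x86_phys_bits);
}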


I have 8GB of memory in my machine, but when I add up all the e820 ranges 
dumped in dmesg, the total is 25MB less than 8GB (1024*8 MB). 
Why are 25MB missing?




-hpa



--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 2/3] mm, slub: count freed pages via rcu as this task's reclaimed_slab

2013-04-09 Thread Simon Jeons

Hi Christoph,
On 04/09/2013 10:32 PM, Christoph Lameter wrote:

On Tue, 9 Apr 2013, Simon Jeons wrote:


+   int pages = 1 << compound_order(page);

One question irrelevant this patch. Why slab cache can use compound
page(hugetlbfs pages/thp pages)? They are just used by app to optimize tlb
miss, is it?

Slab caches can use any order pages because these pages are never on
the LRU and are not part of the page cache. Large continuous physical
memory means that objects can be arranged in a more efficient way in the
page. This is particularly useful for larger objects where we might use a
lot of memory because objects do not fit well into a 4k page.

It also reduces the slab page management if higher order pages are used.
In the case of slub the page size also determines the number of objects
that can be allocated/freed without the need for some form of
synchronization.


It seems that you misunderstood my question. I don't doubt that slab/slub can 
use high-order pages. However, what I am focusing on is why slab/slub can use 
compound pages; doesn't PageCompound() only stand for hugetlbfs or THP pages, 
which should be used by applications?






--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v8 3/3] mm: reinititalise user and admin reserves if memory is added or removed

2013-04-09 Thread Simon Jeons

On 04/10/2013 08:11 AM, Andrew Shewmaker wrote:

On Tue, Apr 9, 2013 at 6:05 PM, Simon Jeons  wrote:

Hi Andrew,

On 04/10/2013 07:56 AM, Andrew Shewmaker wrote:

On Tue, Apr 9, 2013 at 4:19 PM, Andrew Morton 
wrote:

On Mon, 8 Apr 2013 17:00:40 -0400 Andrew Shewmaker 
wrote:


Should I add the memory notifier code to mm/nommu.c too?
I'm guessing that if a system doesn't have an mmu that it also
won't be hotplugging memory.

I doubt if we need to worry about memory hotplug on nommu machines,
so just do the minimum which is required to get nommu to compile
and link.  That's probably "nothing".

I haven't gotten myself set up to compile a nommu architecture, so I'll
post
my next version, and work on verifying it compiles and links later. But I
I probably won't be able to get to that for a week and a half ... I'm
leaving
on my honeymoon in the next couple days :)


How to compile a  nommu architecture? just config in menu config or a
physical machine?

I was going to set up a qemu arm guest. Please, anyone, let me know if
there's an easier way to test nommu builds on x86.


AFAIK, ARM7 is nommu.


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v8 3/3] mm: reinititalise user and admin reserves if memory is added or removed

2013-04-09 Thread Simon Jeons

Hi Andrew,
On 04/10/2013 07:56 AM, Andrew Shewmaker wrote:

On Tue, Apr 9, 2013 at 4:19 PM, Andrew Morton  wrote:

On Mon, 8 Apr 2013 17:00:40 -0400 Andrew Shewmaker  wrote:


Should I add the memory notifier code to mm/nommu.c too?
I'm guessing that if a system doesn't have an mmu that it also
won't be hotplugging memory.

I doubt if we need to worry about memory hotplug on nommu machines,
so just do the minimum which is required to get nommu to compile
and link.  That's probably "nothing".

I haven't gotten myself set up to compile a nommu architecture, so I'll post
my next version, and work on verifying it compiles and links later. But I
I probably won't be able to get to that for a week and a half ... I'm leaving
on my honeymoon in the next couple days :)


How do I compile for a nommu architecture? Is it just a config option in 
menuconfig, or does it require a physical machine?



--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] mm: page_alloc: Avoid marking zones full prematurely after zone_reclaim()

2013-04-09 Thread Simon Jeons

Hi Michal,
On 04/09/2013 06:14 PM, Michal Hocko wrote:

On Tue 09-04-13 18:05:30, Simon Jeons wrote:
[...]

I try this in v3.9-rc5:
dd if=/dev/sda of=/dev/null bs=1MB
14813+0 records in
14812+0 records out
1481200 bytes (15 GB) copied, 105.988 s, 140 MB/s

free -m -s 1

   total   used   free shared buffers
cached
Mem:  7912   1181   6731  0 663239
-/+ buffers/cache:277   7634
Swap: 8011  0   8011

It seems that almost 15GB copied before I stop dd, but the used
pages which I monitor during dd always around 1200MB. Weird, why?


Sorry for waste your time, but the test result is weird, is it?

I am not sure which values you have been watching but you have to
realize that you are reading a _partition_ not a file and those pages
go into buffers rather than the page chache.


The buffer cache is contained in the page cache, isn't it? Which value should I watch?


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] mm: page_alloc: Avoid marking zones full prematurely after zone_reclaim()

2013-04-09 Thread Simon Jeons

Hi Michal,
On 04/05/2013 02:31 PM, Simon Jeons wrote:

Hi Michal,
On 03/21/2013 04:19 PM, Michal Hocko wrote:

On Thu 21-03-13 10:33:07, Simon Jeons wrote:

Hi Mel,
On 03/21/2013 02:19 AM, Mel Gorman wrote:

The following problem was reported against a distribution kernel when
zone_reclaim was enabled but the same problem applies to the mainline
kernel. The reproduction case was as follows

1. Run numactl -m +0 dd if=largefile of=/dev/null
This allocates a large number of clean pages in node 0

I confuse why this need allocate a large number of clean pages?

It reads from file and puts pages into the page cache. The pages are not
modified so they are clean. Output file is /dev/null so no pages are
written. dd doesn't call fadvise(POSIX_FADV_DONTNEED) on the input file
by default so pages from the file stay in the page cache
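
For illustration, a minimal userspace sketch of the fadvise call being
described (an assumed example of what a reader could add; dd does not do this
by default):

#define _XOPEN_SOURCE 600
#include <fcntl.h>

/* Ask the kernel to drop this file's cached pages;
 * len == 0 means "from offset to end of file". */
static int drop_input_cache(int fd)
{
	return posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED);
}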


I try this in v3.9-rc5:
dd if=/dev/sda of=/dev/null bs=1MB
14813+0 records in
14812+0 records out
1481200 bytes (15 GB) copied, 105.988 s, 140 MB/s

free -m -s 1

   total   used   free shared buffers 
cached

Mem:  7912   1181   6731  0 663239
-/+ buffers/cache:277   7634
Swap: 8011  0   8011

It seems that almost 15GB copied before I stop dd, but the used pages 
which I monitor during dd always around 1200MB. Weird, why?




Sorry to waste your time, but the test result is weird, isn't it?

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 2/3] mm, slub: count freed pages via rcu as this task's reclaimed_slab

2013-04-09 Thread Simon Jeons

Hi Joonsoo,
On 04/09/2013 09:21 AM, Joonsoo Kim wrote:

Currently, freed pages via rcu is not counted for reclaimed_slab, because
it is freed in rcu context, not current task context. But, this free is
initiated by this task, so counting this into this task's reclaimed_slab
is meaningful to decide whether we continue reclaim, or not.
So change code to count these pages for this task's reclaimed_slab.

Cc: Christoph Lameter 
Cc: Pekka Enberg 
Cc: Matt Mackall 
Signed-off-by: Joonsoo Kim 

diff --git a/mm/slub.c b/mm/slub.c
index 4aec537..16fd2d5 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1409,8 +1409,6 @@ static void __free_slab(struct kmem_cache *s, struct page 
*page)
  
  	memcg_release_pages(s, order);

page_mapcount_reset(page);
-   if (current->reclaim_state)
-   current->reclaim_state->reclaimed_slab += pages;
__free_memcg_kmem_pages(page, order);
  }
  
@@ -1431,6 +1429,8 @@ static void rcu_free_slab(struct rcu_head *h)
  
  static void free_slab(struct kmem_cache *s, struct page *page)

  {
+   int pages = 1 << compound_order(page);


One question unrelated to this patch: why can a slab cache use compound 
pages (hugetlbfs/THP pages)? Aren't those just used by applications to 
optimize away TLB misses?



+
if (unlikely(s->flags & SLAB_DESTROY_BY_RCU)) {
struct rcu_head *head;
  
@@ -1450,6 +1450,9 @@ static void free_slab(struct kmem_cache *s, struct page *page)

call_rcu(head, rcu_free_slab);
} else
__free_slab(s, page);
+
+   if (current->reclaim_state)
+   current->reclaim_state->reclaimed_slab += pages;
  }
  
  static void discard_slab(struct kmem_cache *s, struct page *page)


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 08/10] mm: vmscan: Have kswapd shrink slab only once per priority

2013-04-09 Thread Simon Jeons

Hi Joonsoo,
On 04/09/2013 02:53 PM, Joonsoo Kim wrote:

Hello, Mel.
Sorry for too late question.

On Sun, Mar 17, 2013 at 01:04:14PM +, Mel Gorman wrote:

If kswaps fails to make progress but continues to shrink slab then it'll
either discard all of slab or consume CPU uselessly scanning shrinkers.
This patch causes kswapd to only call the shrinkers once per priority.

Signed-off-by: Mel Gorman 
---
  mm/vmscan.c | 28 +---
  1 file changed, 21 insertions(+), 7 deletions(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 7d5a932..84375b2 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2661,9 +2661,10 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int 
order, long remaining,
   */
  static bool kswapd_shrink_zone(struct zone *zone,
   struct scan_control *sc,
-  unsigned long lru_pages)
+  unsigned long lru_pages,
+  bool shrinking_slab)
  {
-   unsigned long nr_slab;
+   unsigned long nr_slab = 0;
struct reclaim_state *reclaim_state = current->reclaim_state;
struct shrink_control shrink = {
.gfp_mask = sc->gfp_mask,
@@ -2673,9 +2674,15 @@ static bool kswapd_shrink_zone(struct zone *zone,
sc->nr_to_reclaim = max(SWAP_CLUSTER_MAX, high_wmark_pages(zone));
shrink_zone(zone, sc);
  
-	reclaim_state->reclaimed_slab = 0;

-   nr_slab = shrink_slab(, sc->nr_scanned, lru_pages);
-   sc->nr_reclaimed += reclaim_state->reclaimed_slab;
+   /*
+* Slabs are shrunk for each zone once per priority or if the zone
+* being balanced is otherwise unreclaimable
+*/
+   if (shrinking_slab || !zone_reclaimable(zone)) {
+   reclaim_state->reclaimed_slab = 0;
+   nr_slab = shrink_slab(, sc->nr_scanned, lru_pages);
+   sc->nr_reclaimed += reclaim_state->reclaimed_slab;
+   }
  
  	if (nr_slab == 0 && !zone_reclaimable(zone))

zone->all_unreclaimable = 1;

Why is shrink_slab() called here?
I think that outside of the zone loop would be a better place to run shrink_slab(),
because shrink_slab() is not directly related to a specific zone.


True.



And this is a question not related to this patch.
Why is nr_slab used here to decide zone->all_unreclaimable?
nr_slab is not directly related to whether a specific zone is reclaimable
or not, and, moreover, nr_slab is not directly related to the number of
reclaimed pages. It just says that some objects in the system were freed.

This question comes from my ignorance, so please enlighten me.


Good question, I also want to know. ;-)



Thanks.


@@ -2713,6 +2720,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int 
order,
int end_zone = 0;   /* Inclusive.  0 = ZONE_DMA */
unsigned long nr_soft_reclaimed;
unsigned long nr_soft_scanned;
+   bool shrinking_slab = true;
struct scan_control sc = {
.gfp_mask = GFP_KERNEL,
.priority = DEF_PRIORITY,
@@ -2861,7 +2869,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int 
order,
 * already being scanned that that high
 * watermark would be met at 100% efficiency.
 */
-   if (kswapd_shrink_zone(zone, , lru_pages))
+   if (kswapd_shrink_zone(zone, ,
+   lru_pages, shrinking_slab))
raise_priority = false;
  
  nr_to_reclaim += sc.nr_to_reclaim;

@@ -2900,6 +2909,9 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int 
order,
pfmemalloc_watermark_ok(pgdat))
wake_up(>pfmemalloc_wait);
  
+		/* Only shrink slab once per priority */

+   shrinking_slab = false;
+
/*
 * Fragmentation may mean that the system cannot be rebalanced
 * for high-order allocations in all zones. If twice the
@@ -2925,8 +2937,10 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int 
order,
 * Raise priority if scanning rate is too low or there was no
 * progress in reclaiming pages
 */
-   if (raise_priority || !this_reclaimed)
+   if (raise_priority || !this_reclaimed) {
sc.priority--;
+   shrinking_slab = true;
+   }
} while (sc.priority >= 1 &&
 !pgdat_balanced(pgdat, order, *classzone_idx));
  
--

1.8.1.4

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majord...@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: em...@kvack.org

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majord...@kvack.org.  



Re: [PATCH v8 3/3] mm: reinititalise user and admin reserves if memory is added or removed

2013-04-09 Thread Simon Jeons

Hi Andrew,
On 04/10/2013 07:56 AM, Andrew Shewmaker wrote:

On Tue, Apr 9, 2013 at 4:19 PM, Andrew Morton a...@linux-foundation.org wrote:

On Mon, 8 Apr 2013 17:00:40 -0400 Andrew Shewmaker ags...@gmail.com wrote:


Should I add the memory notifier code to mm/nommu.c too?
I'm guessing that if a system doesn't have an mmu that it also
won't be hotplugging memory.

I doubt if we need to worry about memory hotplug on nommu machines,
so just do the minimum which is required to get nommu to compile
and link.  That's probably nothing.

I haven't gotten myself set up to compile a nommu architecture, so I'll post
my next version, and work on verifying it compiles and links later. But I
I probably won't be able to get to that for a week and a half ... I'm leaving
on my honeymoon in the next couple days :)


How do I build a kernel for a nommu architecture? Is it just a matter of the 
kernel config in menuconfig, or does it need a physical machine?



--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v8 3/3] mm: reinititalise user and admin reserves if memory is added or removed

2013-04-09 Thread Simon Jeons

On 04/10/2013 08:11 AM, Andrew Shewmaker wrote:

On Tue, Apr 9, 2013 at 6:05 PM, Simon Jeons simon.je...@gmail.com wrote:

Hi Andrew,

On 04/10/2013 07:56 AM, Andrew Shewmaker wrote:

On Tue, Apr 9, 2013 at 4:19 PM, Andrew Morton a...@linux-foundation.org
wrote:

On Mon, 8 Apr 2013 17:00:40 -0400 Andrew Shewmaker ags...@gmail.com
wrote:


Should I add the memory notifier code to mm/nommu.c too?
I'm guessing that if a system doesn't have an mmu that it also
won't be hotplugging memory.

I doubt if we need to worry about memory hotplug on nommu machines,
so just do the minimum which is required to get nommu to compile
and link.  That's probably nothing.

I haven't gotten myself set up to compile a nommu architecture, so I'll
post
my next version, and work on verifying it compiles and links later. But I
I probably won't be able to get to that for a week and a half ... I'm
leaving
on my honeymoon in the next couple days :)


How to compile a  nommu architecture? just config in menu config or a
physical machine?

I was going to set up a qemu arm guest. Please, anyone, let me know if
there's an easier way to test nommu builds on x86.


AFAIK, ARM7 is nommu.
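
In case it is useful: one low-effort way to get a nommu build for
compile-testing, without any physical board, is to cross-compile one of the
ColdFire (m68k nommu) defconfigs. This is only a rough sketch -- the defconfig
name and toolchain prefix are assumptions that may differ on your tree:

  # assumes a kernel.org crosstool m68k toolchain is in $PATH
  make ARCH=m68k CROSS_COMPILE=m68k-linux- m5208evb_defconfig
  make ARCH=m68k CROSS_COMPILE=m68k-linux- -j8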


--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 2/3] mm, slub: count freed pages via rcu as this task's reclaimed_slab

2013-04-09 Thread Simon Jeons

Hi Christoph,
On 04/09/2013 10:32 PM, Christoph Lameter wrote:

On Tue, 9 Apr 2013, Simon Jeons wrote:


+   int pages = 1 << compound_order(page);

One question irrelevant this patch. Why slab cache can use compound
page(hugetlbfs pages/thp pages)? They are just used by app to optimize tlb
miss, is it?

Slab caches can use any order pages because these pages are never on
the LRU and are not part of the page cache. Large continuous physical
memory means that objects can be arranged in a more efficient way in the
page. This is particularly useful for larger objects where we might use a
lot of memory because objects do not fit well into a 4k page.

It also reduces the slab page management if higher order pages are used.
In the case of slub the page size also determines the number of objects
that can be allocated/freed without the need for some form of
synchronization.


It seems that you misunderstood my question. I don't doubt that slab/slub can 
use high-order pages. However, what I'm focusing on is why slab/slub can use 
compound pages -- doesn't PageCompound() just stand for hugetlbfs or THP 
pages, which are meant to be used by apps?
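
For illustration, a minimal, untested sketch of how any kernel code -- not
just hugetlbfs or THP -- can end up with a PageCompound() page; it is simply a
property the page allocator sets when asked for it via __GFP_COMP:

  #include <linux/gfp.h>
  #include <linux/mm.h>

  static struct page *grab_compound_example(void)
  {
          /* order-3 (32KB with 4KB pages); __GFP_COMP makes it a compound page */
          struct page *page = alloc_pages(GFP_KERNEL | __GFP_COMP, 3);

          if (page)
                  WARN_ON(!PageCompound(page) || compound_order(page) != 3);
          return page;
  }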






--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 1/3] mm/page_alloc: factor out setting of pcp->high and pcp->batch.

2013-04-08 Thread Simon Jeons

Hi Cody,
On 04/09/2013 01:39 AM, Cody P Schafer wrote:

On 04/06/2013 06:37 PM, Simon Jeons wrote:

Hi Cody,
On 04/06/2013 04:33 AM, Cody P Schafer wrote:

Creates pageset_set_batch() for use in setup_pageset().
pageset_set_batch() imitates the functionality of
setup_pagelist_highmark(), but uses the boot time
(percpu_pagelist_fraction == 0) calculations for determining ->high


Why do pcp->high and pcp->batch need to be adjusted while the system is running? What's the
requirement?



There is currently a sysctl (which I patch later in this series) which 
allows adjusting the ->high mark (and, indirectly, ->batch). 
Additionally, memory hotplug changes ->high and ->batch due to the 
zone size changing (essentially, zone->managed_pages and 
zone->present_pages have changed) , meaning that zone_batchsize(), 
which is used at boot to set ->batch and (indirectly) ->high has a 
different output.


Thanks for your explanation. I'm curious about this sysctl: when do ->high 
and ->batch need to be adjusted while the system is running, apart from 
memory hotplug changing the zone size?




Note that in addition to the 2 users of this functionality mentioned 
here, I'm currently working on another resizer of zones (runtime NUMA 
reconfiguration).
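
For reference, the sysctl being discussed is the existing
/proc/sys/vm/percpu_pagelist_fraction knob; the value below is only
illustrative:

  # set each zone's pcp->high to managed_pages/8, on every online cpu
  echo 8 > /proc/sys/vm/percpu_pagelist_fraction
  # or equivalently
  sysctl -w vm.percpu_pagelist_fraction=8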




--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/




Re: [RFC] mm: remove swapcache page early

2013-04-07 Thread Simon Jeons

On 04/08/2013 09:48 AM, Minchan Kim wrote:

Hello Simon,

On Sun, Apr 07, 2013 at 03:26:12PM +0800, Simon Jeons wrote:

Ping Minchan.
On 04/02/2013 09:40 PM, Simon Jeons wrote:

Hi Hugh,
On 03/28/2013 05:41 AM, Hugh Dickins wrote:

On Wed, 27 Mar 2013, Minchan Kim wrote:


Swap subsystem does lazy swap slot free with expecting the page
would be swapped out again so we can't avoid unnecessary write.

  so we can avoid unnecessary write.

If page can be swap out again, which codes can avoid unnecessary
write? Could you point out to me? Thanks in advance. ;-)

Look at shrink_page_list.

1) PageAnon(page) && !PageSwapCache()
2) add_to_swap's SetPageDirty
3) __remove_mapping

P.S)
It seems you are misunderstanding. This isn't the proper place to ask
questions just to further your understanding of the code. As far as I know,
there are projects (e.g., kernelnewbies) and books for studying and sharing
knowledge of the Linux kernel.

I recommend Mel's "Understand the Linux Virtual Memory Manager".
It's rather outdated but will be very helpful for understanding the VM of
the Linux kernel. You can get it freely, but I hope you pay for it.
So if the author becomes a billionaire from having the best-selling book on
Amazon, he might print a second edition which covers all of the new VM
features and may satisfy all of your curiosity.

It would be another way to contribute to an open source project. :)

I believe talented developers like you can catch up by reading the code
thoroughly and find more bonus knowledge along the way. I think that's why
our senior developers yell RTFM, and I follow them.


What's the meaning of RTFM?



Cheers!




--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC] mm: remove swapcache page early

2013-04-07 Thread Simon Jeons

Ping Minchan.
On 04/02/2013 09:40 PM, Simon Jeons wrote:

Hi Hugh,
On 03/28/2013 05:41 AM, Hugh Dickins wrote:

On Wed, 27 Mar 2013, Minchan Kim wrote:


Swap subsystem does lazy swap slot free with expecting the page
would be swapped out again so we can't avoid unnecessary write.

  so we can avoid unnecessary write.


If the page can be swapped out again, which code avoids the unnecessary 
write? Could you point it out to me? Thanks in advance. ;-)



But the problem in in-memory swap is that it consumes memory space
until vm_swap_full(ie, used half of all of swap device) condition
meet. It could be bad if we use multiple swap device, small 
in-memory swap

and big storage swap or in-memory swap alone.

That is a very good realization: it's surprising that none of us
thought of it before - no disrespect to you, well done, thank you.

And I guess swap readahead is utterly unhelpful in this case too.


This patch changes vm_swap_full logic slightly so it could free
swap slot early if the backed device is really fast.
For it, I used SWP_SOLIDSTATE but It might be controversial.

But I strongly disagree with almost everything in your patch :)
I disagree with addressing it in vm_swap_full(), I disagree that
it can be addressed by device, I disagree that it has anything to
do with SWP_SOLIDSTATE.

This is not a problem with swapping to /dev/ram0 or to /dev/zram0,
is it?  In those cases, a fixed amount of memory has been set aside
for swap, and it works out just like with disk block devices. The
memory set aside may be wasted, but that is accepted upfront.

Similarly, this is not a problem with swapping to SSD.  There might
or might not be other reasons for adjusting the vm_swap_full() logic
for SSD or generally, but those have nothing to do with this issue.

The problem here is peculiar to frontswap, and the variably sized
memory behind it, isn't it?  We are accustomed to using swap to free
up memory by transferring its data to some other, cheaper but slower
resource.

But in the case of frontswap and zmem (I'll say that to avoid thinking
through which backends are actually involved), it is not a cheaper and
slower resource, but the very same memory we are trying to save: swap
is stolen from the memory under reclaim, so any duplication becomes
counter-productive (if we ignore cpu compression/decompression costs:
I have no idea how fair it is to do so, but anyone who chooses zmem
is prepared to pay some cpu price for that).

And because it's a frontswap thing, we cannot decide this by device:
frontswap may or may not stand in front of each device.  There is no
problem with swapcache duplicated on disk (until that area approaches
being full or fragmented), but at the higher level we cannot see what
is in zmem and what is on disk: we only want to free up the zmem dup.

I believe the answer is for frontswap/zmem to invalidate the frontswap
copy of the page (to free up the compressed memory when possible) and
SetPageDirty on the PageUptodate PageSwapCache page when swapping in
(setting page dirty so nothing will later go to read it from the
unfreed location on backing swap disk, which was never written).

We cannot rely on freeing the swap itself, because in general there
may be multiple references to the swap, and we only satisfy the one
which has faulted.  It may or may not be a good idea to use rmap to
locate the other places to insert pte in place of swap entry, to
resolve them all at once; but we have chosen not to do so in the
past, and there's no need for that, if the zmem gets invalidated
and the swapcache page set dirty.

Hugh


So let's add Ccing Shaohua and Hugh.
If it's a problem for SSD, I'd like to create new type SWP_INMEMORY
or something for z* family.

Other problem is zram is block device so that it can set SWP_INMEMORY
or SWP_SOLIDSTATE easily(ie, actually, zram is already done) but
I have no idea to use it for frontswap.

Any idea?

Other optimize point is we remove it unconditionally when we
found it's exclusive when swap in happen.
It could help frontswap family, too.
What do you think about it?

Cc: Hugh Dickins 
Cc: Dan Magenheimer 
Cc: Seth Jennings 
Cc: Nitin Gupta 
Cc: Konrad Rzeszutek Wilk 
Cc: Shaohua Li 
Signed-off-by: Minchan Kim 
---
  include/linux/swap.h | 11 ---
  mm/memory.c  |  3 ++-
  mm/swapfile.c| 11 +++
  mm/vmscan.c  |  2 +-
  4 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 2818a12..1f4df66 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -359,9 +359,14 @@ extern struct page 
*swapin_readahead(swp_entry_t, gfp_t,

  extern atomic_long_t nr_swap_pages;
  extern long total_swap_pages;
  -/* Swap 50% full? Release swapcache more aggressively.. */
-static inline bool vm_swap_full(void)
+/*
+ * Swap 50% full or fast backed device?
+ * Release swapcache more aggressively.
+ */
+static inline bool vm_swap_full(struct swap_info_struct *si

Re: [PATCH] mm: page_alloc: Avoid marking zones full prematurely after zone_reclaim()

2013-04-07 Thread Simon Jeons

Ping!
On 04/05/2013 02:31 PM, Simon Jeons wrote:

Hi Michal,
On 03/21/2013 04:19 PM, Michal Hocko wrote:

On Thu 21-03-13 10:33:07, Simon Jeons wrote:

Hi Mel,
On 03/21/2013 02:19 AM, Mel Gorman wrote:

The following problem was reported against a distribution kernel when
zone_reclaim was enabled but the same problem applies to the mainline
kernel. The reproduction case was as follows

1. Run numactl -m +0 dd if=largefile of=/dev/null
This allocates a large number of clean pages in node 0

I confuse why this need allocate a large number of clean pages?

It reads from file and puts pages into the page cache. The pages are not
modified so they are clean. Output file is /dev/null so no pages are
written. dd doesn't call fadvise(POSIX_FADV_DONTNEED) on the input file
by default so pages from the file stay in the page cache


I try this in v3.9-rc5:
dd if=/dev/sda of=/dev/null bs=1MB
14813+0 records in
14812+0 records out
1481200 bytes (15 GB) copied, 105.988 s, 140 MB/s

free -m -s 1

   total   used   free shared buffers 
cached

Mem:  7912   1181   6731  0 663239
-/+ buffers/cache:277   7634
Swap: 8011  0   8011

It seems that almost 15GB copied before I stop dd, but the used pages 
which I monitor during dd always around 1200MB. Weird, why?




--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 3/3] mm: when handling percpu_pagelist_fraction, use on_each_cpu() to set percpu pageset fields.

2013-04-06 Thread Simon Jeons

Hi Cody,
On 04/06/2013 04:33 AM, Cody P Schafer wrote:

In free_hot_cold_page(), we rely on pcp->batch remaining stable.
Updating it without being on the cpu owning the percpu pageset
potentially destroys this stability.


If a cpu is offline, can its pcp pageset still be used in free_hot_cold_page()?



Change for_each_cpu() to on_each_cpu() to fix.

Signed-off-by: Cody P Schafer 
---
  mm/page_alloc.c | 21 +++--
  1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 48f2faa..507db31 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5475,30 +5475,31 @@ int lowmem_reserve_ratio_sysctl_handler(ctl_table 
*table, int write,
return 0;
  }
  
+static void _zone_set_pageset_highmark(void *data)

+{
+   struct zone *zone = data;
+   unsigned long  high;
+   high = zone->managed_pages / percpu_pagelist_fraction;
+   setup_pagelist_highmark(
+   per_cpu_ptr(zone->pageset, smp_processor_id()), high);
+}
+
  /*
   * percpu_pagelist_fraction - changes the pcp->high for each zone on each
   * cpu.  It is the fraction of total pages in each zone that a hot per cpu 
pagelist
   * can have before it gets flushed back to buddy allocator.
   */
-
  int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos)
  {
struct zone *zone;
-   unsigned int cpu;
int ret;
  
  	ret = proc_dointvec_minmax(table, write, buffer, length, ppos);

if (!write || (ret < 0))
return ret;
-   for_each_populated_zone(zone) {
-   for_each_possible_cpu(cpu) {
-   unsigned long  high;
-   high = zone->managed_pages / percpu_pagelist_fraction;
-   setup_pagelist_highmark(
-   per_cpu_ptr(zone->pageset, cpu), high);
-   }
-   }
+   for_each_populated_zone(zone)
+   on_each_cpu(_zone_set_pageset_highmark, zone, true);
return 0;
  }
  


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 1/3] mm/page_alloc: factor out setting of pcp->high and pcp->batch.

2013-04-06 Thread Simon Jeons

Hi Cody,
On 04/06/2013 04:33 AM, Cody P Schafer wrote:

Creates pageset_set_batch() for use in setup_pageset().
pageset_set_batch() imitates the functionality of
setup_pagelist_highmark(), but uses the boot time
(percpu_pagelist_fraction == 0) calculations for determining ->high


Why do pcp->high and pcp->batch need to be adjusted while the system is 
running? What's the requirement?



based on ->batch.

Signed-off-by: Cody P Schafer 
---
  mm/page_alloc.c | 12 +---
  1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 8fcced7..5877cf0 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4004,6 +4004,14 @@ static int __meminit zone_batchsize(struct zone *zone)
  #endif
  }
  
+/* a companion to setup_pagelist_highmark() */

+static void pageset_set_batch(struct per_cpu_pageset *p, unsigned long batch)
+{
+   struct per_cpu_pages *pcp = >pcp;
+   pcp->high = 6 * batch;
+   pcp->batch = max(1UL, 1 * batch);
+}
+
  static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
  {
struct per_cpu_pages *pcp;
@@ -4013,8 +4021,7 @@ static void setup_pageset(struct per_cpu_pageset *p, 
unsigned long batch)
  
  	pcp = >pcp;

pcp->count = 0;
-   pcp->high = 6 * batch;
-   pcp->batch = max(1UL, 1 * batch);
+   pageset_set_batch(p, batch);
for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++)
INIT_LIST_HEAD(>lists[migratetype]);
  }
@@ -4023,7 +4030,6 @@ static void setup_pageset(struct per_cpu_pageset *p, 
unsigned long batch)
   * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist
   * to the value high for the pageset p.
   */
-
  static void setup_pagelist_highmark(struct per_cpu_pageset *p,
unsigned long high)
  {


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v4, part3 00/15] accurately calculate memory statisitic information

2013-04-06 Thread Simon Jeons

Hi Jiang,
On 04/06/2013 09:54 PM, Jiang Liu wrote:

The original goal of this patchset is to fix the bug reported by
https://bugzilla.kernel.org/show_bug.cgi?id=53501
Now it has also been expanded to reduce common code used by memory
initialization.

This is the third part, previous two patch sets could be accessed at:
http://marc.info/?l=linux-mm=136289696323825=2
http://marc.info/?l=linux-mm=136290291524901=2

This patchset applies to
git://git.cmpxchg.org/linux-mmotm.git fc374c1f9d7bdcfb851b15b86e58ac5e1f645e32
which is based on mmotm-2013-03-26-15-09.

V2->V4:
1) rebase to git://git.cmpxchg.org/linux-mmotm.git
2) fix some build warnings and other minor bugs of previous patches

We have only tested this patchset on x86 platforms, and have done basic
compilation tests using cross-compilers from ftp.kernel.org. That means
some code may not pass compilation on some architectures, so any help
testing this patchset is welcome!

Patch 1-7:
Bugfixes and more work for part1 and part2
Patch 8-9:
Fix typo and minor bugs in mm core
Patch 10-14:
Enhance the way to manage totalram_pages, totalhigh_pages and
zone->managed_pages.
Patch 15:
Report available pages within the node as "MemTotal" for sysfs
interface /sys/.../node/nodex/meminfo

Jiang Liu (15):
   mm: fix build warnings caused by free_reserved_area()
   mm: enhance free_reserved_area() to support poisoning memory with
 zero
   mm/ARM64: kill poison_init_mem()
   mm/x86: use free_reserved_area() to simplify code
   mm/tile: use common help functions to free reserved pages
   mm, powertv: use free_reserved_area() to simplify code
   mm, acornfb: use free_reserved_area() to simplify code
   mm: fix some trivial typos in comments
   mm: use managed_pages to calculate default zonelist order
   mm: accurately calculate zone->managed_pages for highmem zones
   mm: use a dedicated lock to protect totalram_pages and
 zone->managed_pages
   mm: make __free_pages_bootmem() only available at boot time
   mm: correctly update zone->mamaged_pages
   mm: concentrate modification of totalram_pages into the mm core
   mm: report available pages as "MemTotal" for each NUMA node


What I'm interested in is how you test the different platforms. I don't think 
you can have all of the physical hardware.
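
If it is mostly about build coverage, a per-architecture defconfig build with
the ftp.kernel.org cross-compilers is usually enough. A rough sketch -- the
install path and toolchain prefix below are only assumptions:

  # assumes a kernel.org crosstool toolchain unpacked under /opt/cross
  export PATH=/opt/cross/gcc-4.6.3-nolibc/powerpc64-linux/bin:$PATH
  make ARCH=powerpc CROSS_COMPILE=powerpc64-linux- defconfig
  make ARCH=powerpc CROSS_COMPILE=powerpc64-linux- -j8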




  arch/alpha/kernel/sys_nautilus.c  |2 +-
  arch/alpha/mm/init.c  |6 ++--
  arch/alpha/mm/numa.c  |2 +-
  arch/arc/mm/init.c|2 +-
  arch/arm/mm/init.c|   13 
  arch/arm64/mm/init.c  |   15 ++---
  arch/avr32/mm/init.c  |6 ++--
  arch/blackfin/mm/init.c   |6 ++--
  arch/c6x/mm/init.c|6 ++--
  arch/cris/mm/init.c   |4 +--
  arch/frv/mm/init.c|6 ++--
  arch/h8300/mm/init.c  |6 ++--
  arch/hexagon/mm/init.c|3 +-
  arch/ia64/mm/init.c   |4 +--
  arch/m32r/mm/init.c   |6 ++--
  arch/m68k/mm/init.c   |8 ++---
  arch/metag/mm/init.c  |   11 ---
  arch/microblaze/mm/init.c |6 ++--
  arch/mips/mm/init.c   |2 +-
  arch/mips/powertv/asic/asic_devices.c |   13 ++--
  arch/mips/sgi-ip27/ip27-memory.c  |2 +-
  arch/mn10300/mm/init.c|2 +-
  arch/openrisc/mm/init.c   |6 ++--
  arch/parisc/mm/init.c |8 ++---
  arch/powerpc/kernel/kvm.c |2 +-
  arch/powerpc/mm/mem.c |7 ++---
  arch/s390/mm/init.c   |4 +--
  arch/score/mm/init.c  |2 +-
  arch/sh/mm/init.c |6 ++--
  arch/sparc/mm/init_32.c   |3 +-
  arch/sparc/mm/init_64.c   |2 +-
  arch/tile/mm/init.c   |9 ++
  arch/um/kernel/mem.c  |4 +--
  arch/unicore32/mm/init.c  |6 ++--
  arch/x86/mm/highmem_32.c  |6 
  arch/x86/mm/init.c|   14 ++---
  arch/x86/mm/init_32.c |2 +-
  arch/x86/mm/init_64.c |   25 +++
  arch/xtensa/mm/init.c |6 ++--
  drivers/video/acornfb.c   |   28 ++---
  drivers/virtio/virtio_balloon.c   |8 +++--
  drivers/xen/balloon.c |   23 +++---
  include/linux/bootmem.h   |1 +
  include/linux/mm.h|   17 +-
  include/linux/mmzone.h|   14 ++---
  mm/bootmem.c  |   41 +++-
  mm/hugetlb.c  |2 +-
  mm/memory_hotplug.c   |   33 
  mm/nobootmem.c|   35 -
  mm/page_alloc.c   

Re: [PATCH 0/3] mm: fixup changers of per cpu pageset's ->high and ->batch

2013-04-06 Thread Simon Jeons

Hi Cody,
On 04/06/2013 04:33 AM, Cody P Schafer wrote:

In one case, while modifying the ->high and ->batch fields of per cpu pagesets,
we're needlessly using stop_machine() (patches 1 & 2), and in another we don't
have any synchronization at all (patch 3).


Do you mean stop_machine() is used for synchronization between all of the 
online cpus?




This patchset fixes both of them.

Note that it results in a change to the behavior of zone_pcp_update(), which is
used by memory_hotplug. I _think_ that I've diserned (and preserved) the
essential behavior (changing ->high and ->batch), and only eliminated unneeded
actions (draining the per cpu pages), but this may not be the case.

--
  mm/page_alloc.c | 63 +++--
  1 file changed, 30 insertions(+), 33 deletions(-)

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majord...@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: em...@kvack.org


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v7 2/2] mm: replace hardcoded 3% with admin_reserve_pages knob

2013-04-06 Thread Simon Jeons

Hi Andrew,
On 04/05/2013 11:02 PM, Andrew Shewmaker wrote:

On Wed, Apr 3, 2013 at 9:50 PM, Simon Jeons  wrote:

FAQ


...

   * How do you calculate a minimum useful reserve?

 A user or the admin needs enough memory to login and perform
 recovery operations, which includes, at a minimum:

 sshd or login + bash (or some other shell) + top (or ps, kill, etc.)

 For overcommit 'guess', we can sum resident set sizes (RSS).
 On x86_64 this is about 8MB.

 For overcommit 'never', we can take the max of their virtual sizes
(VSZ)
 and add the sum of their RSS.
 On x86_64 this is about 128MB.


1. Why is this different between 'guess' and 'never'?

The default, overcommit 'guess' mode, only needs a reserve for
what the recovery programs will typically use. Overcommit 'never'
mode will only successfully launch an app when it can fulfill all of
its requested memory allocations--even if the app only uses a
fraction of what it asks for.


Doesn't VSZ already cover RSS? Why account for RSS again?
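
For what it's worth, a rough way to reproduce those numbers on a given box
(the command names are just the recovery tools mentioned in the FAQ) is:

  # overcommit 'guess': sum of resident set sizes, in kB
  ps -o rss= -C sshd,bash,top | awk '{s += $1} END {print s}'
  # overcommit 'never': largest virtual size, in kB, added to the RSS sum above
  ps -o vsz= -C sshd,bash,top | sort -n | tail -1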




2. You only tested x86/x86_64, but other platforms will also use memory overcommit.
Did you test them?

No, I haven't. Unfortunately, I don't currently have any other platforms to test
with. I'll see what I can do.

Thanks,

Andrew


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC][PATCH 0/9] extend hugepage migration

2013-04-06 Thread Simon Jeons

Hi Michal,
On 04/05/2013 05:30 PM, Michal Hocko wrote:

On Fri 05-04-13 17:00:58, Simon Jeons wrote:

Hi Michal,
On 04/05/2013 04:08 PM, Michal Hocko wrote:

On Fri 05-04-13 09:14:58, Simon Jeons wrote:

Hi Michal,
On 03/22/2013 04:15 PM, Michal Hocko wrote:

[getting off-list]

On Fri 22-03-13 07:46:32, Simon Jeons wrote:

Hi Michal,
On 03/21/2013 08:56 PM, Michal Hocko wrote:

On Thu 21-03-13 07:49:48, Simon Jeons wrote:
[...]

When I hacking arch/x86/mm/hugetlbpage.c like this,
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index ae1aa71..87f34ee 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -354,14 +354,13 @@ hugetlb_get_unmapped_area(struct file *file,
unsigned long addr,

#endif /*HAVE_ARCH_HUGETLB_UNMAPPED_AREA*/

-#ifdef CONFIG_X86_64
static __init int setup_hugepagesz(char *opt)
{
unsigned long ps = memparse(opt, );
if (ps == PMD_SIZE) {
hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT);
- } else if (ps == PUD_SIZE && cpu_has_gbpages) {
- hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
+ } else if (ps == PUD_SIZE) {
+ hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT+4);
} else {
printk(KERN_ERR "hugepagesz: Unsupported page size %lu M\n",
ps >> 20);

I set boot=hugepagesz=1G hugepages=10, then I got 10 32MB huge pages.
What's the difference between these pages which I hacking and normal
huge pages?

How is this related to the patch set?
Please _stop_ distracting discussion to unrelated topics!

Nothing personal but this is just wasting our time.

Sorry kindly Michal, my bad.
Btw, could you explain this question for me? very sorry waste your time.

Your CPU has to support GB pages. You have removed cpu_has_gbpages test
and added a hstate for order 13 pages which is a weird number on its
own (32MB) because there is no page table level to support them.

But after hacking, there is /sys/kernel/mm/hugepages/hugepages-*,
and have equal number of 32MB huge pages which I set up in boot
parameter.

because hugetlb_add_hstate creates hstate for those pages and
hugetlb_init_hstates allocates them later on.


If there is no page table level to support them, how can
them present?

Because hugetlb hstate handling code doesn't care about page tables and
the way how those pages are going to be mapped _at all_. Or put it in
another way. Nobody prevents you to allocate order-5 page for a single
pte but that would be a pure waste. Page fault code expects that pages
with a proper size are allocated.

Do you mean 32MB pages will map to one pmd which should map 2MB pages?


Please refer to hugetlb_fault for more information.


Thanks for pointing that out. So is my assumption correct? Can a PMD, which 
maps 2MB, really work well mapping 32MB pages?
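
On a side note, whether the CPU actually supports 1GB pages (the
cpu_has_gbpages test removed in the hack above) shows up as the pdpe1gb flag
in /proc/cpuinfo, e.g.:

  grep -q pdpe1gb /proc/cpuinfo && echo "1GB pages supported"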






--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC][PATCH 0/9] extend hugepage migration

2013-04-06 Thread Simon Jeons

Hi Michal,
On 04/05/2013 05:30 PM, Michal Hocko wrote:

On Fri 05-04-13 17:00:58, Simon Jeons wrote:

Hi Michal,
On 04/05/2013 04:08 PM, Michal Hocko wrote:

On Fri 05-04-13 09:14:58, Simon Jeons wrote:

Hi Michal,
On 03/22/2013 04:15 PM, Michal Hocko wrote:

[getting off-list]

On Fri 22-03-13 07:46:32, Simon Jeons wrote:

Hi Michal,
On 03/21/2013 08:56 PM, Michal Hocko wrote:

On Thu 21-03-13 07:49:48, Simon Jeons wrote:
[...]

When I hacking arch/x86/mm/hugetlbpage.c like this,
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index ae1aa71..87f34ee 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -354,14 +354,13 @@ hugetlb_get_unmapped_area(struct file *file,
unsigned long addr,

#endif /*HAVE_ARCH_HUGETLB_UNMAPPED_AREA*/

-#ifdef CONFIG_X86_64
static __init int setup_hugepagesz(char *opt)
{
unsigned long ps = memparse(opt, opt);
if (ps == PMD_SIZE) {
hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT);
- } else if (ps == PUD_SIZE  cpu_has_gbpages) {
- hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
+ } else if (ps == PUD_SIZE) {
+ hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT+4);
} else {
printk(KERN_ERR hugepagesz: Unsupported page size %lu M\n,
ps  20);

I set boot=hugepagesz=1G hugepages=10, then I got 10 32MB huge pages.
What's the difference between these pages which I hacking and normal
huge pages?

How is this related to the patch set?
Please _stop_ distracting discussion to unrelated topics!

Nothing personal but this is just wasting our time.

Sorry kindly Michal, my bad.
Btw, could you explain this question for me? very sorry waste your time.

Your CPU has to support GB pages. You have removed cpu_has_gbpages test
and added a hstate for order 13 pages which is a weird number on its
own (32MB) because there is no page table level to support them.

But after hacking, there is /sys/kernel/mm/hugepages/hugepages-*,
and have equal number of 32MB huge pages which I set up in boot
parameter.

because hugetlb_add_hstate creates hstate for those pages and
hugetlb_init_hstates allocates them later on.


If there is no page table level to support them, how can
them present?

Because hugetlb hstate handling code doesn't care about page tables and
the way how those pages are going to be mapped _at all_. Or put it in
another way. Nobody prevents you to allocate order-5 page for a single
pte but that would be a pure waste. Page fault code expects that pages
with a proper size are allocated.

Do you mean 32MB pages will map to one pmd which should map 2MB pages?


Please refer to hugetlb_fault for more information.


Thanks for your pointing out. So my assume is correct, is it? Can pmd 
which support 2MB map 32MB pages work well?






--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v7 2/2] mm: replace hardcoded 3% with admin_reserve_pages knob

2013-04-06 Thread Simon Jeons

Hi Andrew,
On 04/05/2013 11:02 PM, Andrew Shewmaker wrote:

On Wed, Apr 3, 2013 at 9:50 PM, Simon Jeons simon.je...@gmail.com wrote:

FAQ


...

   * How do you calculate a minimum useful reserve?

 A user or the admin needs enough memory to login and perform
 recovery operations, which includes, at a minimum:

 sshd or login + bash (or some other shell) + top (or ps, kill, etc.)

 For overcommit 'guess', we can sum resident set sizes (RSS).
 On x86_64 this is about 8MB.

 For overcommit 'never', we can take the max of their virtual sizes
(VSZ)
 and add the sum of their RSS.
 On x86_64 this is about 128MB.


1. Why is this different between guess and never?

The default, overcommit 'guess' mode, only needs a reserve for
what the recovery programs will typically use. Overcommit 'never'
mode will only successfully launch an app when it can fulfill all of
its requested memory allocations--even if the app only uses a
fraction of what it asks for.


VSZ already covers RSS, doesn't it? Why account for RSS again?




2. You only tested x86/x86_64; other platforms will also use memory overcommit.
Did you test them?

No, I haven't. Unfortunately, I don't currently have any other platforms to test
with. I'll see what I can do.

Thanks,

Andrew
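
A minimal userspace sketch of the reserve arithmetic quoted above (illustrative
only, not part of the patch; the program name and the choice of recovery-tool
PIDs are assumptions):

#include <stdio.h>
#include <string.h>

/* Read a "VmRSS:" or "VmSize:" value (in kB) from /proc/<pid>/status. */
static long status_kb(const char *pid, const char *key)
{
	char path[64], line[256];
	long val = 0;
	FILE *f;

	snprintf(path, sizeof(path), "/proc/%s/status", pid);
	f = fopen(path, "r");
	if (!f)
		return 0;
	while (fgets(line, sizeof(line), f)) {
		if (!strncmp(line, key, strlen(key))) {
			sscanf(line + strlen(key), "%ld", &val);
			break;
		}
	}
	fclose(f);
	return val;
}

int main(int argc, char **argv)
{
	long rss_sum = 0, vsz_max = 0;
	int i;

	for (i = 1; i < argc; i++) {
		long vsz = status_kb(argv[i], "VmSize:");

		rss_sum += status_kb(argv[i], "VmRSS:");
		if (vsz > vsz_max)
			vsz_max = vsz;
	}
	printf("guess reserve: ~%ld kB (sum of RSS)\n", rss_sum);
	printf("never reserve: ~%ld kB (max VSZ + sum of RSS)\n",
	       vsz_max + rss_sum);
	return 0;
}

Run it as e.g. ./reserve $(pidof sshd) $(pidof bash) $(pidof top) to get a
rough figure for the machine at hand.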


--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 0/3] mm: fixup changers of per cpu pageset's ->high and ->batch

2013-04-06 Thread Simon Jeons

Hi Cody,
On 04/06/2013 04:33 AM, Cody P Schafer wrote:

In one case, while modifying the ->high and ->batch fields of per cpu pagesets
we're needlessly using stop_machine() (patches 1 & 2), and in another we don't
have any synchronization at all (patch 3).


Do you mean stop_machine() is used for synchronization between the 
online cpus?




This patchset fixes both of them.

Note that it results in a change to the behavior of zone_pcp_update(), which is
used by memory_hotplug. I _think_ that I've discerned (and preserved) the
essential behavior (changing ->high and ->batch), and only eliminated unneeded
actions (draining the per cpu pages), but this may not be the case.

--
  mm/page_alloc.c | 63 +++--
  1 file changed, 30 insertions(+), 33 deletions(-)

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majord...@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: em...@kvack.org


--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v4, part3 00/15] accurately calculate memory statistic information

2013-04-06 Thread Simon Jeons

Hi Jiang,
On 04/06/2013 09:54 PM, Jiang Liu wrote:

The original goal of this patchset is to fix the bug reported by
https://bugzilla.kernel.org/show_bug.cgi?id=53501
Now it has also been expanded to reduce common code used by memory
initializion.

This is the third part, previous two patch sets could be accessed at:
http://marc.info/?l=linux-mm&m=136289696323825&w=2
http://marc.info/?l=linux-mm&m=136290291524901&w=2

This patchset applies to
git://git.cmpxchg.org/linux-mmotm.git fc374c1f9d7bdcfb851b15b86e58ac5e1f645e32
which is based on mmotm-2013-03-26-15-09.

V2-V4:
1) rebase to git://git.cmpxchg.org/linux-mmotm.git
2) fix some build warnings and other minor bugs of previous patches

We have only tested this patchset on x86 platforms, and have done basic
compilation tests using cross-compilers from ftp.kernel.org. That means
some code may not pass compilation on some architectures. So any help
to test this patchset is welcome!

Patch 1-7:
Bugfixes and more work for part1 and part2
Patch 8-9:
Fix typo and minor bugs in mm core
Patch 10-14:
Enhance the way to manage totalram_pages, totalhigh_pages and
  zone->managed_pages.
Patch 15:
Report available pages within the node as MemTotal for sysfs
interface /sys/.../node/nodex/meminfo

Jiang Liu (15):
   mm: fix build warnings caused by free_reserved_area()
   mm: enhance free_reserved_area() to support poisoning memory with
 zero
   mm/ARM64: kill poison_init_mem()
   mm/x86: use free_reserved_area() to simplify code
   mm/tile: use common help functions to free reserved pages
   mm, powertv: use free_reserved_area() to simplify code
   mm, acornfb: use free_reserved_area() to simplify code
   mm: fix some trivial typos in comments
   mm: use managed_pages to calculate default zonelist order
   mm: accurately calculate zone->managed_pages for highmem zones
   mm: use a dedicated lock to protect totalram_pages and
 zone->managed_pages
   mm: make __free_pages_bootmem() only available at boot time
   mm: correctly update zone->managed_pages
   mm: concentrate modification of totalram_pages into the mm core
   mm: report available pages as MemTotal for each NUMA node


What I am interested in is how you test different platforms? I don't think 
you can have all the physical platforms.




  arch/alpha/kernel/sys_nautilus.c  |2 +-
  arch/alpha/mm/init.c  |6 ++--
  arch/alpha/mm/numa.c  |2 +-
  arch/arc/mm/init.c|2 +-
  arch/arm/mm/init.c|   13 
  arch/arm64/mm/init.c  |   15 ++---
  arch/avr32/mm/init.c  |6 ++--
  arch/blackfin/mm/init.c   |6 ++--
  arch/c6x/mm/init.c|6 ++--
  arch/cris/mm/init.c   |4 +--
  arch/frv/mm/init.c|6 ++--
  arch/h8300/mm/init.c  |6 ++--
  arch/hexagon/mm/init.c|3 +-
  arch/ia64/mm/init.c   |4 +--
  arch/m32r/mm/init.c   |6 ++--
  arch/m68k/mm/init.c   |8 ++---
  arch/metag/mm/init.c  |   11 ---
  arch/microblaze/mm/init.c |6 ++--
  arch/mips/mm/init.c   |2 +-
  arch/mips/powertv/asic/asic_devices.c |   13 ++--
  arch/mips/sgi-ip27/ip27-memory.c  |2 +-
  arch/mn10300/mm/init.c|2 +-
  arch/openrisc/mm/init.c   |6 ++--
  arch/parisc/mm/init.c |8 ++---
  arch/powerpc/kernel/kvm.c |2 +-
  arch/powerpc/mm/mem.c |7 ++---
  arch/s390/mm/init.c   |4 +--
  arch/score/mm/init.c  |2 +-
  arch/sh/mm/init.c |6 ++--
  arch/sparc/mm/init_32.c   |3 +-
  arch/sparc/mm/init_64.c   |2 +-
  arch/tile/mm/init.c   |9 ++
  arch/um/kernel/mem.c  |4 +--
  arch/unicore32/mm/init.c  |6 ++--
  arch/x86/mm/highmem_32.c  |6 
  arch/x86/mm/init.c|   14 ++---
  arch/x86/mm/init_32.c |2 +-
  arch/x86/mm/init_64.c |   25 +++
  arch/xtensa/mm/init.c |6 ++--
  drivers/video/acornfb.c   |   28 ++---
  drivers/virtio/virtio_balloon.c   |8 +++--
  drivers/xen/balloon.c |   23 +++---
  include/linux/bootmem.h   |1 +
  include/linux/mm.h|   17 +-
  include/linux/mmzone.h|   14 ++---
  mm/bootmem.c  |   41 +++-
  mm/hugetlb.c  |2 +-
  mm/memory_hotplug.c   |   33 
  mm/nobootmem.c|   35 -
  mm/page_alloc.c   

Re: [PATCH 1/3] mm/page_alloc: factor out setting of pcp->high and pcp->batch.

2013-04-06 Thread Simon Jeons

Hi Cody,
On 04/06/2013 04:33 AM, Cody P Schafer wrote:

Creates pageset_set_batch() for use in setup_pageset().
pageset_set_batch() imitates the functionality of
setup_pagelist_highmark(), but uses the boot time
(percpu_pagelist_fraction == 0) calculations for determining ->high


Why do we need to adjust pcp->high and pcp->batch while the system is running? What's the 
requirement?



based on -batch.

Signed-off-by: Cody P Schafer c...@linux.vnet.ibm.com
---
  mm/page_alloc.c | 12 +---
  1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 8fcced7..5877cf0 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4004,6 +4004,14 @@ static int __meminit zone_batchsize(struct zone *zone)
  #endif
  }
  
+/* a companion to setup_pagelist_highmark() */

+static void pageset_set_batch(struct per_cpu_pageset *p, unsigned long batch)
+{
+   struct per_cpu_pages *pcp = p->pcp;
+   pcp->high = 6 * batch;
+   pcp->batch = max(1UL, 1 * batch);
+}
+
  static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
  {
struct per_cpu_pages *pcp;
@@ -4013,8 +4021,7 @@ static void setup_pageset(struct per_cpu_pageset *p, 
unsigned long batch)
  
  	pcp = p->pcp;

	pcp->count = 0;
-   pcp->high = 6 * batch;
-   pcp->batch = max(1UL, 1 * batch);
+   pageset_set_batch(p, batch);
	for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++)
		INIT_LIST_HEAD(pcp->lists[migratetype]);
  }
@@ -4023,7 +4030,6 @@ static void setup_pageset(struct per_cpu_pageset *p, 
unsigned long batch)
   * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist
   * to the value high for the pageset p.
   */
-
  static void setup_pagelist_highmark(struct per_cpu_pageset *p,
unsigned long high)
  {


--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 3/3] mm: when handling percpu_pagelist_fraction, use on_each_cpu() to set percpu pageset fields.

2013-04-06 Thread Simon Jeons

Hi Cody,
On 04/06/2013 04:33 AM, Cody P Schafer wrote:

In free_hot_cold_page(), we rely on pcp->batch remaining stable.
Updating it without being on the cpu owning the percpu pageset
potentially destroys this stability.


If a cpu is offline, can its pcp pageset still be used in free_hot_cold_page()?



Change for_each_cpu() to on_each_cpu() to fix.

Signed-off-by: Cody P Schafer c...@linux.vnet.ibm.com
---
  mm/page_alloc.c | 21 +++--
  1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 48f2faa..507db31 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5475,30 +5475,31 @@ int lowmem_reserve_ratio_sysctl_handler(ctl_table 
*table, int write,
return 0;
  }
  
+static void _zone_set_pageset_highmark(void *data)

+{
+   struct zone *zone = data;
+   unsigned long  high;
+   high = zone->managed_pages / percpu_pagelist_fraction;
+   setup_pagelist_highmark(
+   per_cpu_ptr(zone->pageset, smp_processor_id()), high);
+}
+
  /*
   * percpu_pagelist_fraction - changes the pcp->high for each zone on each
   * cpu.  It is the fraction of total pages in each zone that a hot per cpu 
pagelist
   * can have before it gets flushed back to buddy allocator.
   */
-
  int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos)
  {
struct zone *zone;
-   unsigned int cpu;
int ret;
  
  	ret = proc_dointvec_minmax(table, write, buffer, length, ppos);

	if (!write || (ret < 0))
return ret;
-   for_each_populated_zone(zone) {
-   for_each_possible_cpu(cpu) {
-   unsigned long  high;
-   high = zone->managed_pages / percpu_pagelist_fraction;
-   setup_pagelist_highmark(
-   per_cpu_ptr(zone->pageset, cpu), high);
-   }
-   }
+   for_each_populated_zone(zone)
+   on_each_cpu(_zone_set_pageset_highmark, zone, true);
return 0;
  }
  


--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC][PATCH 0/9] extend hugepage migration

2013-04-05 Thread Simon Jeons

Hi Michal,
On 04/05/2013 04:08 PM, Michal Hocko wrote:

On Fri 05-04-13 09:14:58, Simon Jeons wrote:

Hi Michal,
On 03/22/2013 04:15 PM, Michal Hocko wrote:

[getting off-list]

On Fri 22-03-13 07:46:32, Simon Jeons wrote:

Hi Michal,
On 03/21/2013 08:56 PM, Michal Hocko wrote:

On Thu 21-03-13 07:49:48, Simon Jeons wrote:
[...]

When I hacking arch/x86/mm/hugetlbpage.c like this,
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index ae1aa71..87f34ee 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -354,14 +354,13 @@ hugetlb_get_unmapped_area(struct file *file,
unsigned long addr,

#endif /*HAVE_ARCH_HUGETLB_UNMAPPED_AREA*/

-#ifdef CONFIG_X86_64
static __init int setup_hugepagesz(char *opt)
{
unsigned long ps = memparse(opt, &opt);
if (ps == PMD_SIZE) {
hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT);
- } else if (ps == PUD_SIZE && cpu_has_gbpages) {
- hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
+ } else if (ps == PUD_SIZE) {
+ hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT+4);
} else {
printk(KERN_ERR "hugepagesz: Unsupported page size %lu M\n",
ps >> 20);

I set boot=hugepagesz=1G hugepages=10, then I got 10 32MB huge pages.
What's the difference between these pages which I hacking and normal
huge pages?

How is this related to the patch set?
Please _stop_ distracting discussion to unrelated topics!

Nothing personal but this is just wasting our time.

Sorry Michal, my bad.
Btw, could you explain this question for me? Very sorry to waste your time.

Your CPU has to support GB pages. You have removed cpu_has_gbpages test
and added a hstate for order 13 pages which is a weird number on its
own (32MB) because there is no page table level to support them.

But after hacking, there is /sys/kernel/mm/hugepages/hugepages-*,
and have equal number of 32MB huge pages which I set up in boot
parameter.

because hugetlb_add_hstate creates hstate for those pages and
hugetlb_init_hstates allocates them later on.


If there is no page table level to support them, how can
them present?

Because hugetlb hstate handling code doesn't care about page tables and
the way how those pages are going to be mapped _at all_. Or put it in
another way. Nobody prevents you from allocating an order-5 page for a single
pte, but that would be a pure waste. Page fault code expects that pages
with a proper size are allocated.

Do you mean 32MB pages will map to one pmd which should map 2MB pages?
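
For reference, the x86_64 arithmetic behind the point above, as a tiny
userspace program (the shift values are the usual 4-level x86_64 ones and are
an assumption of this sketch, not taken from the patch):

#include <stdio.h>

int main(void)
{
	const int page_shift = 12;	/* 4 KB base pages          */
	const int pmd_shift  = 21;	/* one PMD entry maps 2 MB  */
	const int pud_shift  = 30;	/* one PUD entry maps 1 GB  */

	printf("PMD hstate: order %d = %lu KB per huge page\n",
	       pmd_shift - page_shift, (1UL << (pmd_shift - page_shift)) * 4);
	printf("PUD hstate: order %d = %lu KB per huge page\n",
	       pud_shift - page_shift, (1UL << (pud_shift - page_shift)) * 4);
	/* The hacked hstate is order 13 = 32 MB, which no single PMD or PUD
	 * entry can map, so the hugetlb fault path has no entry size that
	 * fits such a page. */
	printf("hacked hstate: order 13 = %lu KB per huge page\n",
	       (1UL << 13) * 4);
	return 0;
}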

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 0/6] mm/hugetlb: gigantic hugetlb page pools shrink supporting

2013-04-05 Thread Simon Jeons

Hi Michal,
On 04/05/2013 04:12 PM, Michal Hocko wrote:

On Fri 05-04-13 07:41:23, Wanpeng Li wrote:

On Thu, Apr 04, 2013 at 06:17:46PM +0200, Michal Hocko wrote:

On Thu 04-04-13 17:09:08, Wanpeng Li wrote:

order >= MAX_ORDER pages are only allocated at boot stage using the
bootmem allocator with the "hugepages=xxx" option. These pages are never
freed after boot by default since it would be a one-way street (>= MAX_ORDER
pages cannot be allocated later), but if the administrator confirms that
these gigantic pages are no longer needed, the pinned pages waste memory
since other users can't grab free pages from the gigantic hugetlb pool even
under OOM; it's not flexible.  The patchset adds support for shrinking the
gigantic hugetlb page pools. The administrator can enable a knob exported
via sysctl to permit shrinking the gigantic hugetlb pool.

I am not sure I see why the new knob is needed.
/sys/kernel/mm/hugepages/hugepages-*/nr_hugepages is root interface so
an additional step to allow writing to the file doesn't make much sense
to me to be honest.

Support for shrinking gigantic huge pages makes some sense to me but I
would be interested in the real world example. GB pages are usually used
in very specific environments where the amount is usually well known.

Gigantic huge pages in hugetlb means h->order >= MAX_ORDER instead of GB
pages. ;-)

Yes, I am aware of that but the question remains the same (and
unanswered). What is the use case?


As the patch description says, "if administrator confirm not to use these 
gigantic pages any more, these pinned pages will waste memory since 
other users can't grab free pages from gigantic hugetlb pool even if OOM".
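
For readers following the thread, a sketch of what "gigantic" means here;
hstate_is_gigantic() is a hypothetical helper name (huge_page_order() and
MAX_ORDER are the existing symbols), not something this patchset necessarily
adds:

/* A hstate is "gigantic" when its pages are too big for the buddy allocator
 * (order >= MAX_ORDER), so they can only come from the bootmem allocator at
 * boot and, without this patchset, are never given back. */
static inline bool hstate_is_gigantic(struct hstate *h)
{
	return huge_page_order(h) >= MAX_ORDER;
}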






--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCHv2, RFC 20/30] ramfs: enable transparent huge page cache

2013-04-05 Thread Simon Jeons

Hi Minchan,
On 04/03/2013 09:11 AM, Minchan Kim wrote:

On Tue, Apr 02, 2013 at 03:15:23PM -0700, Hugh Dickins wrote:

On Tue, 2 Apr 2013, Kirill A. Shutemov wrote:

Kirill A. Shutemov wrote:

From: "Kirill A. Shutemov" 

ramfs is the most simple fs from page cache point of view. Let's start
transparent huge page cache enabling here.

For now we allocate only non-movable huge page. It's not yet clear if
movable page is safe here and what need to be done to make it safe.

Signed-off-by: Kirill A. Shutemov 
---
  fs/ramfs/inode.c |6 +-
  1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index c24f1e1..da30b4f 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -61,7 +61,11 @@ struct inode *ramfs_get_inode(struct super_block *sb,
inode_init_owner(inode, dir, mode);
	inode->i_mapping->a_ops = &ramfs_aops;
	inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info;
-   mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
+   /*
+* TODO: what should be done to make movable safe?
+*/
+   mapping_set_gfp_mask(inode->i_mapping,
+   GFP_TRANSHUGE & ~__GFP_MOVABLE);

Hugh, I've found old thread with the reason why we have GFP_HIGHUSER here, not
GFP_HIGHUSER_MOVABLE:

http://lkml.org/lkml/2006/11/27/156

It seems the original reason is no longer valid, correct?

Incorrect, I believe: so far as I know, the original reason remains
valid - though it would only require a couple of good small changes
to reverse that - or perhaps you have already made these changes?

The original reason is that ramfs pages are not migratable,
therefore they should be allocated from an unmovable area.

As I understand it (and I would have preferred to run a test to check
my understanding before replying, but don't have time for that), ramfs
pages cannot be migrated for two reasons, neither of them a good reason.

One reason (okay, it wouldn't have been quite this way in 2006) is that
ramfs (rightly) calls mapping_set_unevictable(), so its pages will fail
the page_evictable() test, so they will be marked PageUnevictable, so
__isolate_lru_page() will refuse to isolate them for migration (except
for CMA).

True.


I am strongly in favour of removing that limitation from
__isolate_lru_page() (and the thread you pointed - thank you - shows Mel
and Christoph were both in favour too); and note that there is no such
restriction in the confusingly similar but different isolate_lru_page().

Some people do worry that migrating Mlocked pages would introduce the
occasional possibility of a minor fault (with migration_entry_wait())
on an Mlocked region which never faulted before.  I tend to dismiss
that worry, but maybe I'm wrong to do so: maybe there should be a
tunable for realtimey people to set, to prohibit page migration from
mlocked areas; but the default should be to allow it.

I agree.
Just FYI for mlocked page migration

I tried migratioin of mlocked page and Johannes and Mel had a concern
about that.
http://lkml.indiana.edu/hypermail/linux/kernel/1109.0/00175.html

But later, Peter already acked it and I guess by reading the thread that
Hugh was in favour when page migration was merged first time.

http://marc.info/?l=linux-mm&m=133697873414205&w=2
http://marc.info/?l=linux-mm&m=133700341823358&w=2

Many people said mlock means memory-resident, NOT pinning so it could
allow minor fault while Mel still had a concern except CMA.
http://marc.info/?l=linux-mm&m=133674219714419&w=2


How about adding a knob?


(Of course, we could separate ramfs's mapping_unevictable case from
the Mlocked case; but I'd prefer to continue to treat them the same.)

Fair enough.


The other reason it looks as if ramfs pages cannot be migrated, is
that it does not set a suitable ->migratepage method, so would be
handled by fallback_migrate_page(), whose PageDirty test will end
up failing the migration with -EBUSY or -EINVAL - if I read it
correctly.

True.


Perhaps other such reasons would surface once those are fixed.
But until ramfs pages can be migrated, they should not be allocated
with __GFP_MOVABLE.  (I've been writing about the migratability of
small pages: I expect you have the migratability of THPages in flux.)

Agreed.


Hugh
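
A sketch of Hugh's second point above: what ramfs's address_space_operations
could look like with a migratepage method, assuming the generic migrate_page()
helper is sufficient because ramfs pages carry no fs-private data (the
unevictable check in __isolate_lru_page() would still need relaxing
separately; this is not code from the patch series):

static const struct address_space_operations ramfs_aops = {
	.readpage	= simple_readpage,
	.write_begin	= simple_write_begin,
	.write_end	= simple_write_end,
	.set_page_dirty	= __set_page_dirty_no_writeback,
	/* hypothetical addition: migrate dirty ramfs pages with the generic
	 * helper instead of failing in fallback_migrate_page() */
	.migratepage	= migrate_page,
};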

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majord...@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: em...@kvack.org


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] mm: page_alloc: Avoid marking zones full prematurely after zone_reclaim()

2013-04-05 Thread Simon Jeons

Hi Michal,
On 03/21/2013 04:19 PM, Michal Hocko wrote:

On Thu 21-03-13 10:33:07, Simon Jeons wrote:

Hi Mel,
On 03/21/2013 02:19 AM, Mel Gorman wrote:

The following problem was reported against a distribution kernel when
zone_reclaim was enabled but the same problem applies to the mainline
kernel. The reproduction case was as follows

1. Run numactl -m +0 dd if=largefile of=/dev/null
This allocates a large number of clean pages in node 0

I am confused why this needs to allocate a large number of clean pages?

It reads from file and puts pages into the page cache. The pages are not
modified so they are clean. Output file is /dev/null so no pages are
written. dd doesn't call fadvise(POSIX_FADV_DONTNEED) on the input file
by default so pages from the file stay in the page cache


I try this in v3.9-rc5:
dd if=/dev/sda of=/dev/null bs=1MB
14813+0 records in
14812+0 records out
14812000000 bytes (15 GB) copied, 105.988 s, 140 MB/s

free -m -s 1

   total   used   free shared buffers 
cached

Mem:  7912   1181   6731  0 663239
-/+ buffers/cache:277   7634
Swap: 8011  0   8011

It seems that almost 15GB was copied before I stopped dd, but the used pages 
which I monitored during dd stayed around 1200MB. Weird, why?
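
A small illustration of Mel's point about fadvise (not dd itself): read a file
the way dd does, then explicitly drop its clean pages from the page cache.
dd omits the posix_fadvise() call, so its input pages stay cached:

#define _XOPEN_SOURCE 600
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	static char buf[1 << 16];
	int fd;

	if (argc < 2)
		return 1;
	fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	while (read(fd, buf, sizeof(buf)) > 0)
		;	/* clean pages accumulate in the page cache */
	/* Without this call the cached pages stay until reclaim wants them. */
	if (posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED) != 0)
		fprintf(stderr, "posix_fadvise failed\n");
	close(fd);
	return 0;
}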


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/










Re: [RFC][PATCH 0/9] extend hugepage migration

2013-04-04 Thread Simon Jeons

Hi Michal,
On 03/22/2013 04:15 PM, Michal Hocko wrote:

[getting off-list]

On Fri 22-03-13 07:46:32, Simon Jeons wrote:

Hi Michal,
On 03/21/2013 08:56 PM, Michal Hocko wrote:

On Thu 21-03-13 07:49:48, Simon Jeons wrote:
[...]

When I hacking arch/x86/mm/hugetlbpage.c like this,
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index ae1aa71..87f34ee 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -354,14 +354,13 @@ hugetlb_get_unmapped_area(struct file *file,
unsigned long addr,

#endif /*HAVE_ARCH_HUGETLB_UNMAPPED_AREA*/

-#ifdef CONFIG_X86_64
static __init int setup_hugepagesz(char *opt)
{
unsigned long ps = memparse(opt, &opt);
if (ps == PMD_SIZE) {
hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT);
- } else if (ps == PUD_SIZE && cpu_has_gbpages) {
- hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
+ } else if (ps == PUD_SIZE) {
+ hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT+4);
} else {
printk(KERN_ERR "hugepagesz: Unsupported page size %lu M\n",
ps >> 20);

I set boot=hugepagesz=1G hugepages=10, then I got 10 32MB huge pages.
What's the difference between these pages which I hacking and normal
huge pages?

How is this related to the patch set?
Please _stop_ distracting discussion to unrelated topics!

Nothing personal but this is just wasting our time.

Sorry Michal, my bad.
Btw, could you explain this question for me? Very sorry to waste your time.

Your CPU has to support GB pages. You have removed cpu_has_gbpages test
and added a hstate for order 13 pages which is a weird number on its
own (32MB) because there is no page table level to support them.


But after this hack, /sys/kernel/mm/hugepages/hugepages-* is present and 
has the same number of 32MB huge pages which I set up in the boot parameter. 
If there is no page table level to support them, how can they be present? I 
can apply this hack successfully in Ubuntu, but not in Fedora.



--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH, RFC 00/16] Transparent huge page cache

2013-04-04 Thread Simon Jeons

Hi Hugh,
On 01/31/2013 10:12 AM, Hugh Dickins wrote:

On Tue, 29 Jan 2013, Kirill A. Shutemov wrote:

Hugh Dickins wrote:

On Mon, 28 Jan 2013, Kirill A. Shutemov wrote:

From: "Kirill A. Shutemov" 

Here's first steps towards huge pages in page cache.

The intent of the work is to get the code ready to enable transparent huge page
cache for the simplest fs -- ramfs.

It's not yet near feature-complete. It only provides basic infrastructure.
At the moment we can read, write and truncate file on ramfs with huge pages in
page cache. The most interesting part, mmap(), is not yet there. For now
we split huge page on mmap() attempt.

I can't say that I see the whole picture. I'm not sure if I understand the locking
model around split_huge_page(). Probably not.
Andrea, could you check if it looks correct?

Next steps (not necessary in this order):
  - mmap();
  - migration (?);
  - collapse;
  - stats, knobs, etc.;
  - tmpfs/shmem enabling;
  - ...

Kirill A. Shutemov (16):
   block: implement add_bdi_stat()
   mm: implement zero_huge_user_segment and friends
   mm: drop actor argument of do_generic_file_read()
   radix-tree: implement preload for multiple contiguous elements
   thp, mm: basic defines for transparent huge page cache
   thp, mm: rewrite add_to_page_cache_locked() to support huge pages
   thp, mm: rewrite delete_from_page_cache() to support huge pages
   thp, mm: locking tail page is a bug
   thp, mm: handle tail pages in page_cache_get_speculative()
   thp, mm: implement grab_cache_huge_page_write_begin()
   thp, mm: naive support of thp in generic read/write routines
   thp, libfs: initial support of thp in
 simple_read/write_begin/write_end
   thp: handle file pages in split_huge_page()
   thp, mm: truncate support for transparent huge page cache
   thp, mm: split huge page on mmap file page
   ramfs: enable transparent huge page cache

  fs/libfs.c  |   54 +---
  fs/ramfs/inode.c|6 +-
  include/linux/backing-dev.h |   10 +++
  include/linux/huge_mm.h |8 ++
  include/linux/mm.h  |   15 
  include/linux/pagemap.h |   14 ++-
  include/linux/radix-tree.h  |3 +
  lib/radix-tree.c|   32 +--
  mm/filemap.c|  204 +++
  mm/huge_memory.c|   62 +++--
  mm/memory.c |   22 +
  mm/truncate.c   |   12 +++
  12 files changed, 375 insertions(+), 67 deletions(-)

Interesting.

I was starting to think about Transparent Huge Pagecache a few
months ago, but then got washed away by incoming waves as usual.

Certainly I don't have a line of code to show for it; but my first
impression of your patches is that we have very different ideas of
where to start.

A second impression confirms that we have very different ideas of
where to start.  I don't want to be dismissive, and please don't let
me discourage you, but I just don't find what you have very interesting.

I'm sure you'll agree that the interesting part, and the difficult part,
comes with mmap(); and there's no point whatever to THPages without mmap()
(of course, I'm including exec and brk and shm when I say mmap there).

(There may be performance benefits in working with larger page cache
size, which Christoph Lameter explored a few years back, but that's a
different topic: I think 2MB - if I may be x86_64-centric - would not be
the unit of choice for that, unless SSD erase block were to dominate.)

I'm interested to get to the point of prototyping something that does
support mmap() of THPageCache: I'm pretty sure that I'd then soon learn
a lot about my misconceptions, and have to rework for a while (or give
up!); but I don't see much point in posting anything without that.
I don't know if we have 5 or 50 places which "know" that a THPage
must be Anon: some I'll spot in advance, some I sadly won't.

It's not clear to me that the infrastructural changes you make in this
series will be needed or not, if I pursue my approach: some perhaps as
optimizations on top of the poorly performing base that may emerge from
going about it my way.  But for me it's too soon to think about those.

Something I notice that we do agree upon: the radix_tree holding the
4k subpages, at least for now.  When I first started thinking towards
THPageCache, I was fascinated by how we could manage the hugepages in
the radix_tree, cutting out unnecessary levels etc; but after a while
I realized that although there's probably nice scope for cleverness
there (significantly constrained by RCU expectations), it would only
be about optimization.  Let's be simple and stupid about radix_tree
for now, the problems that need to be worked out lie elsewhere.


Perhaps that's good complementarity, or perhaps I'll disagree with
your approach.  I'll be taking a look at yours in the coming days,
and trying to summon back up my own ideas to summarize them for you.

Yeah, it would be nice to see alternative design ideas. 



Re: [RFC][PATCH 0/9] extend hugepage migration

2013-04-04 Thread Simon Jeons

Hi Michal,
On 03/22/2013 04:15 PM, Michal Hocko wrote:

[getting off-list]

On Fri 22-03-13 07:46:32, Simon Jeons wrote:

Hi Michal,
On 03/21/2013 08:56 PM, Michal Hocko wrote:

On Thu 21-03-13 07:49:48, Simon Jeons wrote:
[...]

When I hacking arch/x86/mm/hugetlbpage.c like this,
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index ae1aa71..87f34ee 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -354,14 +354,13 @@ hugetlb_get_unmapped_area(struct file *file,
unsigned long addr,

#endif /*HAVE_ARCH_HUGETLB_UNMAPPED_AREA*/

-#ifdef CONFIG_X86_64
static __init int setup_hugepagesz(char *opt)
{
unsigned long ps = memparse(opt, opt);
if (ps == PMD_SIZE) {
hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT);
- } else if (ps == PUD_SIZE  cpu_has_gbpages) {
- hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
+ } else if (ps == PUD_SIZE) {
+ hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT+4);
} else {
printk(KERN_ERR hugepagesz: Unsupported page size %lu M\n,
ps  20);

I set boot=hugepagesz=1G hugepages=10, then I got 10 32MB huge pages.
What's the difference between these pages which I hacking and normal
huge pages?

How is this related to the patch set?
Please _stop_ distracting discussion to unrelated topics!

Nothing personal but this is just wasting our time.

Sorry kindly Michal, my bad.
Btw, could you explain this question for me? very sorry waste your time.

Your CPU has to support GB pages. You have removed cpu_has_gbpages test
and added a hstate for order 13 pages which is a weird number on its
own (32MB) because there is no page table level to support them.


But after the hack, /sys/kernel/mm/hugepages/hugepages-* exists and shows
the same number of 32MB huge pages that I set up in the boot parameter.
If there is no page table level to support them, how can they be present? I
can do this hack successfully on Ubuntu, but not on Fedora.
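
For reference, the arithmetic behind the hack quoted above (editor's note, assuming x86_64 with 4k pages and the usual PMD_SHIFT=21, PUD_SHIFT=30):

	/*
	 * PMD_SHIFT - PAGE_SHIFT     = 21 - 12 = 9   -> 2^9  * 4kB =  2MB (pmd-mapped)
	 * PUD_SHIFT - PAGE_SHIFT     = 30 - 12 = 18  -> 2^18 * 4kB =  1GB (pud-mapped)
	 * PMD_SHIFT - PAGE_SHIFT + 4 = 13            -> 2^13 * 4kB = 32MB (no level)
	 *
	 * hugetlb will happily create the order-13 hstate and reserve the
	 * pages at boot, which is why the hugepages-32768kB directory shows
	 * up in sysfs, but no pmd or pud entry on x86 can map a 32MB unit,
	 * which is Michal's point above.
	 */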





Re: [PATCH] x86: add phys addr validity check for /dev/mem mmap

2013-04-03 Thread Simon Jeons

Hi H.Peter,
On 04/04/2013 09:32 AM, H. Peter Anvin wrote:

On 04/03/2013 06:17 PM, Simon Jeons wrote:

e820 also contain mmio, correct?

No.


How to check which address is used by mmio? /proc/iomem, correct?




So cpu should not access address beyond
e820 map(RAM+MMIO).

No.

-hpa






Re: [PATCH 1/9] migrate: add migrate_entry_wait_huge()

2013-04-03 Thread Simon Jeons
Ping!
On 03/21/2013 07:36 AM, Simon Jeons wrote:
> Hi Naoya,
> On 03/21/2013 05:53 AM, Naoya Horiguchi wrote:
>> On Wed, Mar 20, 2013 at 07:57:32AM +0800, Simon Jeons wrote:
>>> Hi Naoya,
>>> On 02/22/2013 03:41 AM, Naoya Horiguchi wrote:
>>>> When we have a page fault for the address which is backed by a hugepage
>>>> under migration, the kernel can't wait correctly until the migration
>>>> finishes. This is because pte_offset_map_lock() can't get a correct
>>> It seems that current hugetlb_fault still wait hugetlb page under
>>> migration, how can it work without lock 2MB memory?
>> Hugetlb_fault() does call migration_entry_wait(), but returns immediately.
> Could you point out to me which code in function migration_entry_wait()
> lead to return immediately?
>
>> So page fault happens over and over again until the migration completes.
>> IOW, migration_entry_wait() is now broken for hugepage and doesn't work
>> as expected.
>>
>> Thanks,
>> Naoya



Re: [PATCH v7 2/2] mm: replace hardcoded 3% with admin_reserve_pages knob

2013-04-03 Thread Simon Jeons
guess  yes  1   5419/5419   no     -      yes     8MB    yes
guess  yes  4   5436/5436   1      -      yes     8MB    yes
guess  no   1   5440/5440   *      -      yes     8MB    yes
guess  no   4   -           crash  -      no      8MB    no

* process would successfully mlock, then the oom killer would pick it

never  yes  1   5446/5446   no     10MB   yes     20MB   yes
never  yes  4   5456/5456   no     10MB   yes     20MB   yes
never  no   1   5387/5429   no     128MB  no      8MB    barely
never  no   1   5323/5428   no     226MB  barely  8MB    barely
never  no   1   5323/5428   no     226MB  barely  8MB    barely

never  no   1   5359/5448   no     10MB   no      10MB   barely

never  no   1   5323/5428   no     0MB    no      10MB   barely
never  no   1   5332/5428   no     0MB    no      50MB   yes
never  no   1   5293/5429   no     0MB    no      90MB   yes

never  no   1   5001/5427   no     230MB  yes     338MB  yes
never  no   4*  4998/5424   no     230MB  yes     338MB  yes

* more memtesters were launched, able to allocate approximately another 100MB
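
A rough sketch of how the two kbytes knobs behind these numbers are meant to be applied (the editor's approximation of the idea described in the changelog below, not the patch itself; the sysctl_* variable names are my guess):

	/* convert a kbytes knob back to pages */
	unsigned long reserve_pages(unsigned long reserve_kbytes)
	{
		return reserve_kbytes >> (PAGE_SHIFT - 10);	/* kB -> pages */
	}

	/* in OVERCOMMIT_NEVER accounting, approximately: */
	if (!cap_sys_admin)	/* keep room for root to log in and kill things */
		allowed -= reserve_pages(sysctl_admin_reserve_kbytes);
	if (mm)			/* keep room for the user to recover (kill, bash, top) */
		allowed -= min(mm->total_vm / 32,
			       reserve_pages(sysctl_user_reserve_kbytes));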


Future Work

  - Test larger memory systems.

  - Test an embedded image.

  - Time malloc microbenchmarks.

  - Would it be useful to be able to set overcommit policy for
each memory cgroup?

  - Some lines are slightly above 80 chars.
Perhaps define a macro to convert between pages and kb?
Other places in the kernel do this.


Signed-off-by: Andrew Shewmaker 

---

Patch Changelog

v7:
  * Rebased onto v3.9-rc3-mmotm-2013-03-22-15-21

  * Removed sysctl.h include. It wasn't needed since I removed my
custom handler in v5

  * Ran checkpatch.pl and cleaned up whitespace errors.
A couple lines exceed 80 chars, but that seems common in
nearby code.

  * Added future work section

v6:
  * Rebased onto v3.9-rc1-mmotm-2013-03-07-15-45

  * Replace user_reserve_pages with user_reserve_kbytes

  * Replace admin_reserve_pages with admin_reserve_kbytes

  * Increase verbosity of patch changelog

  * Add background, motivation, risks, alternatives, and testing

  * Add Alan Cox's example of sparse arrays to the
documentation of the 'always' overcommit mode

  * Add note in overcommit_memory documentation that
user_reserve_kbytes affects 'never' mode

  * Improve wording of user_reserve_kbytes documentation

  * Clearly document risk of root-cant-log-in
in admin_reserve_kbytes documentation

v5:
  * Change nontunable k in min(3% process size, k) into
user_reserve_pages knob

  * user_reserve_pages defaults to min(3% free pages, 128MB)
previous k=8MB wasn't enough for OVERCOMMIT_NEVER mode
and 128MB worked when I tested it

  * 128MB from taking max VSZ of sshd, login, bash, and top
and adding the RSS of each

  * Custom sysctl handler was unnecessary. Now using
proc_doulongvec_minmax()

  v5 discussion:
   * Request for more complete changelog with detailed motivation,
 problems, alternatives, and discussion. -Andrew Morton

   * How is the root-cant-login problem addressed?
   * What happens if user_reserve_pages is set to 0?
   * What happens if admin_reserve_pages is set to 0?
   * Clearly describe risks in documentation
 -Andrew Morton

 As long as  admin_reserve_pages is set to at least 8MB for
 OVERCOMMIT_GUESS or above 128MB for OVERCOMMIT_NEVER, I was able to
 log in as root and kill processes. The root-cant-log-in problem
 cannot be hit if user_reserve_pages is set to 0 because that
 reserve only exists in OVERCOMMIT_NEVER mode.

   * Exported interfaces which deal in "pages" are considered harmful.
 PAGE_SIZE can vary by a factor of 16 depending upon config (ie:
 architecture). The risk is that a setup script which works nicely on
 4k x86_64 will waste memory when executed on a 64k PAGE_SIZE powerpc
 box. A smart programmer will recognize this and will adapt the setting
 using getpagesize(2), but if we define these things in "bytes" rather
 than "pages" then dumb programmers can use it too.
 -Andrew Morton

v4:
  * Rebased onto v3.8-mmotm-2013-03-01-15-50

  * No longer assumes 4kb pages

  * Code duplicated for nommu

  v4 discussion:
   * "Please add changelog, otherwise it's for other guys to review."
 -Simon Jeons

 Sorry, I'll be sure to include one in the future. And it
 looks like I do need a v5 ... I think this needs to
 be tunable like the admin reserve. The user_reserve_pages default
 certainly needs to be higher since this reserve is only for
 OVERCOMMIT_NEVER mode and 8MB is too little to allow
 the user to recover. I was thinking of OVERCOMMIT_GUESS
 mode when I chose that size.

v3:
  * New patch summary because it wasn't unique
    New is "mm: limit growth of 3% hardcoded other user reserve"
    Old was "mm: tuning"

Re: [PATCH] THP: Use explicit memory barrier

2013-04-03 Thread Simon Jeons

Hi Minchan,
On 04/01/2013 07:45 AM, Minchan Kim wrote:

__do_huge_pmd_anonymous_page depends on page_add_new_anon_rmap's
spinlock to make sure that the clear_huge_page write becomes visible
after the set_pmd_at() write.


1. There is no pte modification, so why take page_table_lock here?
2. What is the meaning of "clear_huge_page write becomes visible after the 
set_pmd_at() write"?




But lru_cache_add_lru uses a pagevec, so it can easily skip that spinlock;
the above rule is then broken and the user may see inconsistent data.

This patch fixes it by using an explicit barrier rather than depending
on the lru spinlock.
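
An illustration of the ordering rule in question (editor's sketch, not taken from the patch):

	/*
	 * The requirement is that the zeroing of the huge page is visible to
	 * other CPUs no later than the pmd that publishes it:
	 *
	 *   clear_huge_page(page, haddr, HPAGE_PMD_NR);   zero the 2MB page
	 *   smp_wmb();                                     order zeroes before the pmd
	 *   set_pmd_at(mm, haddr, pmd, entry);             publish the mapping
	 *
	 * Otherwise a racing fault could observe the new pmd but still read
	 * stale, non-zero data through it.  The old code relied on the
	 * lru_lock taken inside page_add_new_anon_rmap() for this ordering;
	 * the patch below makes the barrier explicit.
	 */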

Cc: Mel Gorman 
Cc: Andrea Arcangeli 
Cc: Hugh Dickins 
Signed-off-by: Minchan Kim 
---
  mm/huge_memory.c | 7 +++
  1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index bfa142e..fad800e 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -725,11 +725,10 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct 
*mm,
pmd_t entry;
entry = mk_huge_pmd(page, vma);
/*
-* The spinlocking to take the lru_lock inside
-* page_add_new_anon_rmap() acts as a full memory
-* barrier to be sure clear_huge_page writes become
-* visible after the set_pmd_at() write.
+* clear_huge_page write become visible after the
+* set_pmd_at() write.
 */
+   smp_wmb();
page_add_new_anon_rmap(page, vma, haddr);
set_pmd_at(mm, haddr, pmd, entry);
pgtable_trans_huge_deposit(mm, pgtable);




Re: [PATCH] x86: add phys addr validity check for /dev/mem mmap

2013-04-03 Thread Simon Jeons

On 04/04/2013 10:14 AM, H. Peter Anvin wrote:

Because git didn't exist before then?


Oh, I see, thanks! :-)



Simon Jeons  wrote:


On 04/04/2013 09:32 AM, H. Peter Anvin wrote:

On 04/03/2013 06:17 PM, Simon Jeons wrote:

e820 also contain mmio, correct?

No.


So cpu should not access address beyond
e820 map(RAM+MMIO).

No.

-hpa



One offline question: why can't I check the git log before 2005?




Re: [PATCH] x86: add phys addr validity check for /dev/mem mmap

2013-04-03 Thread Simon Jeons

On 04/04/2013 09:32 AM, H. Peter Anvin wrote:

On 04/03/2013 06:17 PM, Simon Jeons wrote:

e820 also contain mmio, correct?

No.


So cpu should not access address beyond
e820 map(RAM+MMIO).

No.

-hpa




One offline question: why can't I check the git log before 2005?


Re: [PATCH] x86: add phys addr validity check for /dev/mem mmap

2013-04-03 Thread Simon Jeons

Hi H.Peter,
On 04/04/2013 09:13 AM, H. Peter Anvin wrote:

On 04/03/2013 06:11 PM, Simon Jeons wrote:

Why do we consider boot_cpu_data.x86_phys_bits instead of the e820 map here?


Because x86_phys_bits is what controls how much address space the
processor has.  e820 tells us how much *RAM* the machine has, or
specifically, how much RAM the machine had on boot.


e820 also contain mmio, correct? So cpu should not access address beyond 
e820 map(RAM+MMIO).




-hpa





Re: [PATCH] x86: add phys addr validity check for /dev/mem mmap

2013-04-03 Thread Simon Jeons

Hi H.Peter,
On 04/03/2013 02:48 AM, H. Peter Anvin wrote:

On 04/02/2013 05:28 AM, Frantisek Hrbata wrote:

diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index d8e8eef..39607c6 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -242,6 +242,10 @@ static inline void flush_write_buffers(void)
  #endif
  }
  
+#define ARCH_HAS_VALID_PHYS_ADDR_RANGE

+extern int valid_phys_addr_range(phys_addr_t addr, size_t count);
+extern int valid_mmap_phys_addr_range(unsigned long pfn, size_t count);
+
  #endif /* __KERNEL__ */
  
  extern void native_io_delay(void);

diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index 845df68..92ec31c 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -31,6 +31,8 @@
  #include <linux/sched.h>
  #include <asm/elf.h>
  
+#include "physaddr.h"

+
  struct __read_mostly va_alignment va_align = {
.flags = -1,
  };
@@ -122,3 +124,14 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
mm->unmap_area = arch_unmap_area_topdown;
}
  }
+
+int valid_phys_addr_range(phys_addr_t addr, size_t count)
+{
+   return addr + count <= __pa(high_memory);
+}
+
+int valid_mmap_phys_addr_range(unsigned long pfn, size_t count)
+{
+   resource_size_t addr = (pfn << PAGE_SHIFT) + count;
+   return phys_addr_valid(addr);
+}
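
Editor's note: phys_addr_valid() itself is not in this hunk. Roughly (this is from memory of arch/x86/mm/physaddr.h, so treat it as an approximation) it bounds the address by the CPU's physical address width rather than by the e820 map:

	/* 64-bit case */
	static inline int phys_addr_valid(resource_size_t addr)
	{
		return !(addr >> boot_cpu_data.x86_phys_bits);
	}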



Why do we consider boot_cpu_data.x86_phys_bits instead of the e820 map here?



-hpa





Re: [PATCH] mm, x86: no zeroing of hugetlbfs pages at boot

2013-04-03 Thread Simon Jeons

On 03/07/2013 05:50 AM, Cliff Wickman wrote:

From: Cliff Wickman 

Allocating a large number of 1GB hugetlbfs pages at boot takes a
very long time.

Large system sites would at times like to allocate a very large amount of
memory as 1GB pages.  They would put this on the kernel boot line:
default_hugepagesz=1G hugepagesz=1G hugepages=4096
[Dynamic allocation of 1G pages is not an option, as zone pages only go
  up to MAX_ORDER, and MAX_ORDER cannot exceed the section size.]

Each page is zeroed as it is allocated, and all allocation is done by
cpu 0, as this path is early in boot:


How do you confirm they are done by cpu 0? Does only cpu 0 work during boot?


   start_kernel
 kernel_init
   do_pre_smp_initcalls
 hugetlb_init
   hugetlb_init_hstates
 hugetlb_hstate_alloc_pages

Zeroing remote (offnode) memory takes ~1GB/sec (and most memory is offnode
on large numa systems).
This estimate is approximate (it depends on core frequency & number of hops
to remote memory) but should be within a factor of 2 on most systems.
A benchmark attempting to reserve a TB for 1GB pages would thus require
~1000 seconds of boot time just for this allocating.  32TB would take 8 hours.
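
Spelling that estimate out (editor's arithmetic, using the same ~1GB/sec figure):

	/*
	 *   1 TB / (1 GB/s) ~=  1024 s  (~17 minutes spent zeroing)
	 *  32 TB / (1 GB/s) ~= 32768 s  (~9 hours; the "8 hours" above is the
	 *                               same figure within the stated
	 *                               factor-of-2 uncertainty)
	 */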

I propose passing a flag to the early allocator to indicate that no zeroing
of a page should be done.  The 'no zeroing' flag would have to be passed
down this code path:

   hugetlb_hstate_alloc_pages
 alloc_bootmem_huge_page
   __alloc_bootmem_node_nopanic NO_ZERO  (nobootmem.c)
 __alloc_memory_core_early  NO_ZERO
  if (!(flags & NO_ZERO))
 memset(ptr, 0, size);

Or this path if CONFIG_NO_BOOTMEM is not set:

   hugetlb_hstate_alloc_pages
 alloc_bootmem_huge_page
   __alloc_bootmem_node_nopanic  NO_ZERO  (bootmem.c)
 alloc_bootmem_core  NO_ZERO
  if (!(flags & NO_ZERO))
 memset(region, 0, size);
 __alloc_bootmem_nopanic NO_ZERO
   ___alloc_bootmem_nopanic  NO_ZERO
 alloc_bootmem_core  NO_ZERO
  if (!(flags & NO_ZERO))
 memset(region, 0, size);

Signed-off-by: Cliff Wickman 

---
  arch/x86/kernel/setup_percpu.c |4 ++--
  include/linux/bootmem.h|   23 ---
  mm/bootmem.c   |   12 +++-
  mm/hugetlb.c   |3 ++-
  mm/nobootmem.c |   41 
+++--
  mm/page_cgroup.c   |2 +-
  mm/sparse.c|2 +-
  7 files changed, 52 insertions(+), 35 deletions(-)

Index: linux/include/linux/bootmem.h
===
--- linux.orig/include/linux/bootmem.h
+++ linux/include/linux/bootmem.h
@@ -8,6 +8,11 @@
  #include <asm/dma.h>
  
  /*

+ * allocation flags
+ */
+#define NO_ZERO0x0001
+
+/*
   *  simple boot-time physical memory area allocator.
   */
  
@@ -79,7 +84,8 @@ extern void *__alloc_bootmem(unsigned lo

 unsigned long goal);
  extern void *__alloc_bootmem_nopanic(unsigned long size,
 unsigned long align,
-unsigned long goal);
+unsigned long goal,
+u32 flags);
  extern void *__alloc_bootmem_node(pg_data_t *pgdat,
  unsigned long size,
  unsigned long align,
@@ -91,12 +97,14 @@ void *__alloc_bootmem_node_high(pg_data_
  extern void *__alloc_bootmem_node_nopanic(pg_data_t *pgdat,
  unsigned long size,
  unsigned long align,
- unsigned long goal);
+ unsigned long goal,
+ u32 flags);
  void *___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
  unsigned long size,
  unsigned long align,
  unsigned long goal,
- unsigned long limit);
+ unsigned long limit,
+ u32 flags);
  extern void *__alloc_bootmem_low(unsigned long size,
 unsigned long align,
 unsigned long goal);
@@ -120,19 +128,20 @@ extern void *__alloc_bootmem_low_node(pg
  #define alloc_bootmem_align(x, align) \
__alloc_bootmem(x, align, BOOTMEM_LOW_LIMIT)
  #define alloc_bootmem_nopanic(x) \
-   __alloc_bootmem_nopanic(x, SMP_CACHE_BYTES, BOOTMEM_LOW_LIMIT)
+   __alloc_bootmem_nopanic(x, SMP_CACHE_BYTES, BOOTMEM_LOW_LIMIT, 0)
  #define alloc_bootmem_pages(x) \
__alloc_bootmem(x, PAGE_SIZE, BOOTMEM_LOW_LIMIT)
  #define alloc_bootmem_pages_nopanic(x) \
-   __alloc_bootmem_nopanic(x, PAGE_SIZE, BOOTMEM_LOW_LIMIT)
+   
