Re: [PATCH 00/05] input: RMI4 Synaptics RMI4 Touchscreen Driver

2014-02-03 Thread Linus Walleij
On Sat, Jan 19, 2013 at 2:12 AM, Christopher Heiny  wrote:

> This patchset implements changes based on the synaptics-rmi4 branch of
> Dmitry's input tree.

What is happening to the RMI4 driver stuff?

Has this development stalled? The branch in Dmitry's git tree
seems to be maintained, but not much appears to be happening.
Is there any progress?

Yours,
Linus Walleij
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v2 2/3] clocksource: tegra: Define timer bases in header file

2014-02-03 Thread Daniel Lezcano

On 02/04/2014 01:17 AM, Andrew Chew wrote:

Added timers that are present in tegra30 and later, that are NOT in tegra20.

Also, some of these timer bases are needed in the tegra watchdog driver, so
separate them out into a header file that both the clocksource driver and
the watchdog driver can share them.

Signed-off-by: Andrew Chew 


When reading patch 3/3, I don't see any define reused from this
header except TEGRA30_TIMER_WDT_BASE, which is only used for the
watchdog. Maybe I missed something, but I don't see any definition
being shared, and thus I don't see the point of creating this header file.



---
  drivers/clocksource/tegra20_timer.c | 15 ++---
  include/clocksource/tegra_timer.h   | 43 +
  2 files changed, 49 insertions(+), 9 deletions(-)
  create mode 100644 include/clocksource/tegra_timer.h

diff --git a/drivers/clocksource/tegra20_timer.c 
b/drivers/clocksource/tegra20_timer.c
index 73cfa56..2c49643 100644
--- a/drivers/clocksource/tegra20_timer.c
+++ b/drivers/clocksource/tegra20_timer.c
@@ -28,6 +28,8 @@
  #include 
  #include 

+#include 
+
  #include 
  #include 

@@ -39,11 +41,6 @@
  #define TIMERUS_USEC_CFG 0x14
  #define TIMERUS_CNTR_FREEZE 0x4c

-#define TIMER1_BASE 0x0
-#define TIMER2_BASE 0x8
-#define TIMER3_BASE 0x50
-#define TIMER4_BASE 0x58
-
  #define TIMER_PTV 0x0
  #define TIMER_PCR 0x4

@@ -64,7 +61,7 @@ static int tegra_timer_set_next_event(unsigned long cycles,
u32 reg;

reg = 0x8000 | ((cycles > 1) ? (cycles-1) : 0);
-   timer_writel(reg, TIMER3_BASE + TIMER_PTV);
+   timer_writel(reg, TEGRA20_TIMER3_BASE + TIMER_PTV);

return 0;
  }
@@ -74,12 +71,12 @@ static void tegra_timer_set_mode(enum clock_event_mode mode,
  {
u32 reg;

-   timer_writel(0, TIMER3_BASE + TIMER_PTV);
+   timer_writel(0, TEGRA20_TIMER3_BASE + TIMER_PTV);

switch (mode) {
case CLOCK_EVT_MODE_PERIODIC:
reg = 0xC000 | ((100/HZ)-1);
-   timer_writel(reg, TIMER3_BASE + TIMER_PTV);
+   timer_writel(reg, TEGRA20_TIMER3_BASE + TIMER_PTV);
break;
case CLOCK_EVT_MODE_ONESHOT:
break;
@@ -142,7 +139,7 @@ static void tegra_read_persistent_clock(struct timespec *ts)
  static irqreturn_t tegra_timer_interrupt(int irq, void *dev_id)
  {
struct clock_event_device *evt = (struct clock_event_device *)dev_id;
-   timer_writel(1<<30, TIMER3_BASE + TIMER_PCR);
+   timer_writel(1<<30, TEGRA20_TIMER3_BASE + TIMER_PCR);
evt->event_handler(evt);
return IRQ_HANDLED;
  }
diff --git a/include/clocksource/tegra_timer.h 
b/include/clocksource/tegra_timer.h
new file mode 100644
index 000..ea0bc8b
--- /dev/null
+++ b/include/clocksource/tegra_timer.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright (C) 2010 Google, Inc.
+ *
+ * Author:
+ * Colin Cross 


 ^^

+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#ifndef __CLOCKSOURCE_TEGRA_TIMER_H
+#define __CLOCKSOURCE_TEGRA_TIMER_H
+
+/* Tegra 20 timers */
+#define TEGRA20_TIMER1_BASE 0x0
+#define TEGRA20_TIMER2_BASE 0x8
+#define TEGRA20_TIMER3_BASE 0x50
+#define TEGRA20_TIMER4_BASE 0x58
+
+/* Tegra 30 timers */
+#define TEGRA30_TIMER1_BASE TEGRA20_TIMER1_BASE
+#define TEGRA30_TIMER2_BASE TEGRA20_TIMER2_BASE
+#define TEGRA30_TIMER3_BASE TEGRA20_TIMER3_BASE
+#define TEGRA30_TIMER4_BASE TEGRA20_TIMER4_BASE
+#define TEGRA30_TIMER5_BASE 0x60
+#define TEGRA30_TIMER6_BASE 0x68
+#define TEGRA30_TIMER7_BASE 0x70
+#define TEGRA30_TIMER8_BASE 0x78
+#define TEGRA30_TIMER9_BASE 0x80
+#define TEGRA30_TIMER0_BASE 0x88
+
+/* Used by the tegra watchdog timer */
+#define TEGRA30_TIMER_WDT_BASE TEGRA30_TIMER5_BASE
+#define TEGRA30_TIMER_WDT_ID   5
+
+#endif /* __CLOCKSOURCE_TEGRA_TIMER_H */




--
  Linaro.org │ Open source software for ARM SoCs

Follow Linaro:   Facebook |
 Twitter |
 Blog

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH] memcg, slab: cleanup memcg cache creation

2014-02-03 Thread Vladimir Davydov
This patch cleans up the memcg cache creation path as follows:
 - Move memcg cache name creation to a separate function to be called
   from kmem_cache_create_memcg(). This allows us to get rid of the
   mutex protecting the temporary buffer used for the name formatting,
   because the whole cache creation path is protected by the slab_mutex.
 - Get rid of memcg_create_kmem_cache(). This function serves as a proxy
   to kmem_cache_create_memcg(). After separating the cache name
   creation path, it would be reduced to a function call, so let's
   inline it.

Signed-off-by: Vladimir Davydov 
---
 include/linux/memcontrol.h |9 +
 mm/memcontrol.c|   89 
 mm/slab_common.c   |5 ++-
 3 files changed, 54 insertions(+), 49 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index abd0113b6620..84e4801fc36c 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -497,6 +497,9 @@ void __memcg_kmem_commit_charge(struct page *page,
 void __memcg_kmem_uncharge_pages(struct page *page, int order);
 
 int memcg_cache_id(struct mem_cgroup *memcg);
+
+char *memcg_create_cache_name(struct mem_cgroup *memcg,
+ struct kmem_cache *root_cache);
 int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s,
 struct kmem_cache *root_cache);
 void memcg_free_cache_params(struct kmem_cache *s);
@@ -641,6 +644,12 @@ static inline int memcg_cache_id(struct mem_cgroup *memcg)
return -1;
 }
 
+static inline char *memcg_create_cache_name(struct mem_cgroup *memcg,
+   struct kmem_cache *root_cache)
+{
+   return NULL;
+}
+
 static inline int memcg_alloc_cache_params(struct mem_cgroup *memcg,
struct kmem_cache *s, struct kmem_cache *root_cache)
 {
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 53385cd4e6f0..43e08b7bb365 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3193,6 +3193,32 @@ int memcg_update_cache_size(struct kmem_cache *s, int 
num_groups)
return 0;
 }
 
+char *memcg_create_cache_name(struct mem_cgroup *memcg,
+ struct kmem_cache *root_cache)
+{
+   static char *buf = NULL;
+
+   /*
+* We need a mutex here to protect the shared buffer. Since this is
+* expected to be called only on cache creation, we can employ the
+* slab_mutex for that purpose.
+*/
+   lockdep_assert_held(&slab_mutex);
+
+   if (!buf) {
+   buf = kmalloc(PATH_MAX, GFP_KERNEL);
+   if (!buf)
+   return NULL;
+   }
+
+   rcu_read_lock();
+   snprintf(buf, PATH_MAX, "%s(%d:%s)", root_cache->name,
+memcg_cache_id(memcg), cgroup_name(memcg->css.cgroup));
+   rcu_read_unlock();
+
+   return kstrdup(buf, GFP_KERNEL);
+}
+
 int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s,
 struct kmem_cache *root_cache)
 {
@@ -3397,44 +3423,6 @@ void mem_cgroup_destroy_cache(struct kmem_cache *cachep)
schedule_work(>memcg_params->destroy);
 }
 
-static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
- struct kmem_cache *s)
-{
-   struct kmem_cache *new = NULL;
-   static char *tmp_name = NULL;
-   static DEFINE_MUTEX(mutex); /* protects tmp_name */
-
-   BUG_ON(!memcg_can_account_kmem(memcg));
-
-   mutex_lock();
-   /*
-* kmem_cache_create_memcg duplicates the given name and
-* cgroup_name for this name requires RCU context.
-* This static temporary buffer is used to prevent from
-* pointless shortliving allocation.
-*/
-   if (!tmp_name) {
-   tmp_name = kmalloc(PATH_MAX, GFP_KERNEL);
-   if (!tmp_name)
-   goto out;
-   }
-
-   rcu_read_lock();
-   snprintf(tmp_name, PATH_MAX, "%s(%d:%s)", s->name,
-memcg_cache_id(memcg), cgroup_name(memcg->css.cgroup));
-   rcu_read_unlock();
-
-   new = kmem_cache_create_memcg(memcg, tmp_name, s->object_size, s->align,
- (s->flags & ~SLAB_PANIC), s->ctor, s);
-   if (new)
-   new->allocflags |= __GFP_KMEMCG;
-   else
-   new = s;
-out:
-   mutex_unlock();
-   return new;
-}
-
 void kmem_cache_destroy_memcg_children(struct kmem_cache *s)
 {
struct kmem_cache *c;
@@ -3481,12 +3469,6 @@ void kmem_cache_destroy_memcg_children(struct kmem_cache 
*s)
mutex_unlock(_kmem_mutex);
 }
 
-struct create_work {
-   struct mem_cgroup *memcg;
-   struct kmem_cache *cachep;
-   struct work_struct work;
-};
-
 static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg)
 {
struct kmem_cache *cachep;
@@ -3504,13 +3486,24 @@ 

Re: [PATCH] kernel: kprobe: move all *kretprobe* generic implementation to CONFIG_KRETPROBES enabled area

2014-02-03 Thread Masami Hiramatsu
(2014/02/04 14:16), Chen Gang wrote:
> When CONFIG_KRETPROBES disabled, all *kretprobe* generic implementation
> are useless, so need move them to CONFIG_KPROBES enabled area.
> 
> Now, *kretprobe* generic implementation are all implemented in 2 files:
> 
>  - in "include/linux/kprobes.h":
> 
>  move inline kretprobe*() to CONFIG_KPROBES area and dummy outside.
>  move some *kprobe() declarations which kretprobe*() call, to front.
>  not touch kretprobe_blacklist[] which is architecture's variable.
> 
>  - in "kernel/kprobes.c":
> 
>  move all kretprobe* to CONFIG_KPROBES area and dummy outside.
>  define kretprobe_flush_task() to let kprobe_flush_task() call.
>  define init_kretprobes() to let init_kprobes() call.
> 
> The patch passes compiling (get "kernel/kprobes.o" and "kernel/built-
> in.o") under avr32 and x86_64 allmodconfig, and passes building (get
> bzImage and Modpost modules) under x86_64 defconfig.

Thanks for the fix! and I have some comments below.

> Signed-off-by: Chen Gang 
> ---
>  include/linux/kprobes.h |  58 +
>  kernel/kprobes.c| 328 
> +++-
>  2 files changed, 222 insertions(+), 164 deletions(-)
> 
> diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
> index 925eaf2..c0d1212 100644
> --- a/include/linux/kprobes.h
> +++ b/include/linux/kprobes.h
> @@ -223,10 +223,36 @@ static inline int kprobes_built_in(void)
>   return 1;
>  }
>  
> +int disable_kprobe(struct kprobe *kp);
> +int enable_kprobe(struct kprobe *kp);
> +
> +void dump_kprobe(struct kprobe *kp);
> +
> +extern struct kretprobe_blackpoint kretprobe_blacklist[];
> +
>  #ifdef CONFIG_KRETPROBES
>  extern void arch_prepare_kretprobe(struct kretprobe_instance *ri,
>  struct pt_regs *regs);
>  extern int arch_trampoline_kprobe(struct kprobe *p);
> +static inline void kretprobe_assert(struct kretprobe_instance *ri,
> + unsigned long orig_ret_address, unsigned long trampoline_address)
> +{
> + if (!orig_ret_address || (orig_ret_address == trampoline_address)) {
> + printk(KERN_ERR
> + "kretprobe BUG!: Processing kretprobe %p @ %p\n",
> + ri->rp, ri->rp->kp.addr);
> + BUG();
> + }
> +}
> +static inline int disable_kretprobe(struct kretprobe *rp)
> +{
> + return disable_kprobe(>kp);
> +}
> +static inline int enable_kretprobe(struct kretprobe *rp)
> +{
> + return enable_kprobe(>kp);
> +}
> +
>  #else /* CONFIG_KRETPROBES */
>  static inline void arch_prepare_kretprobe(struct kretprobe *rp,
>   struct pt_regs *regs)
> @@ -236,19 +262,20 @@ static inline int arch_trampoline_kprobe(struct kprobe 
> *p)
>  {
>   return 0;
>  }
> -#endif /* CONFIG_KRETPROBES */
> -
> -extern struct kretprobe_blackpoint kretprobe_blacklist[];
> -
>  static inline void kretprobe_assert(struct kretprobe_instance *ri,
>   unsigned long orig_ret_address, unsigned long trampoline_address)
>  {
> - if (!orig_ret_address || (orig_ret_address == trampoline_address)) {
> - printk("kretprobe BUG!: Processing kretprobe %p @ %p\n",
> - ri->rp, ri->rp->kp.addr);
> - BUG();
> - }
>  }
> +static inline int disable_kretprobe(struct kretprobe *rp)
> +{
> + return 0;
> +}
> +static inline int enable_kretprobe(struct kretprobe *rp)
> +{
> + return 0;
> +}

No, these should return -EINVAL or -ENOSYS, since these are user APIs.
Anyway, I don't think those inlined functions need to be changed, because
most of them are internal functions. If CONFIG_KRETPROBES=n, they are just
ignored.

So, I think you don't need to change kprobes.h.


> +
> +#endif /* CONFIG_KRETPROBES */
>  
>  #ifdef CONFIG_KPROBES_SANITY_TEST
>  extern int init_test_probes(void);
> @@ -379,11 +406,6 @@ void unregister_kretprobes(struct kretprobe **rps, int 
> num);
>  void kprobe_flush_task(struct task_struct *tk);
>  void recycle_rp_inst(struct kretprobe_instance *ri, struct hlist_head *head);
>  
> -int disable_kprobe(struct kprobe *kp);
> -int enable_kprobe(struct kprobe *kp);
> -
> -void dump_kprobe(struct kprobe *kp);
> -
>  #else /* !CONFIG_KPROBES: */
>  
>  static inline int kprobes_built_in(void)
> @@ -459,14 +481,6 @@ static inline int enable_kprobe(struct kprobe *kp)
>   return -ENOSYS;
>  }
>  #endif /* CONFIG_KPROBES */
> -static inline int disable_kretprobe(struct kretprobe *rp)
> -{
> - return disable_kprobe(>kp);
> -}
> -static inline int enable_kretprobe(struct kretprobe *rp)
> -{
> - return enable_kprobe(>kp);
> -}
>  static inline int disable_jprobe(struct jprobe *jp)
>  {
>   return disable_kprobe(>kp);
> diff --git a/kernel/kprobes.c b/kernel/kprobes.c
> index ceeadfc..e305a81 100644
> --- a/kernel/kprobes.c
> +++ b/kernel/kprobes.c
[...]
> @@ -1936,8 +1955,44 @@ static int __kprobes pre_handler_kretprobe(struct 
> kprobe *p,
>   return 0;
>  }
>  
> 

Re: Need help in bug in isolate_migratepages_range

2014-02-03 Thread Holger Kiehl

On Mon, 3 Feb 2014, David Rientjes wrote:


On Mon, 3 Feb 2014, Vlastimil Babka wrote:


It seems to come from balloon_page_movable() and its test page_count(page) ==
1.



Hmm, I think it might be because compound_head() == NULL here.  Holger,
this looks like a race condition when allocating a compound page, did you
only see it once or is it actually reproducible?


No, this only happened once. It is not reproducible; the system was running
for four days without problems. And before this kernel, five years without
any problems.

Thanks,
Holger
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC][PATCH v2 5/5] mutex: Give spinners a chance to spin_on_owner if need_resched() triggered while queued

2014-02-03 Thread Jason Low
On Mon, 2014-02-03 at 20:25 +0100, Peter Zijlstra wrote:

> +void m_spin_unlock(struct m_spinlock **lock)
> +{
> + struct m_spinlock *node = this_cpu_ptr(_node);
> + struct m_spinlock *next;
> +
> + if (likely(cmpxchg(lock, node, NULL) == node))
> + return;

At this current point, (node->next != NULL) is a likely scenario.
Perhaps we can also add the following code here:

next = xchg(&node->next, NULL);
if (next) {
ACCESS_ONCE(next->locked) = 1;
return;
}

> + next = m_spin_wait_next(lock, node, NULL);
> + if (next)
> + ACCESS_ONCE(next->locked) = 1;
> +}


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v3 5/6] fat: permit to return phy block number by fibmap in fallocated region

2014-02-03 Thread Namjae Jeon
2014-02-04, OGAWA Hirofumi :
> Namjae Jeon  writes:
>
>> 2014-02-04, OGAWA Hirofumi :
>>> Namjae Jeon  writes:
>>>
>>  /* fat_get_cluster() assumes the requested blocknr isn't 
>> truncated.
>> */
>>  down_read(_I(mapping->host)->truncate_lock);
>> +/* To get block number beyond file size in fallocated region */
>> +atomic_set(_I(mapping->host)->beyond_isize, 1);
>>  blocknr = generic_block_bmap(mapping, block, fat_get_block);
>> +atomic_set(_I(mapping->host)->beyond_isize, 0);
>>  up_read(_I(mapping->host)->truncate_lock);
>
> This is racy. While user is using bmap, kernel can allocate new
> blocks.
> We should use another function for this.
 I understand that fat can map fallocated blocks in read case while
 user is using bmap.
 But I can not find the case allocate new blocks.
 If I am missing something, Could you please elaborate more ?
 Is it a case of _bmap request returning the block number for block
 allocated in parallel write path ?
>>>
>>> ->beyond_size is global for inode. So, write(2) path on same inode with
>>> bmap() also can see 1 set by bmap() while another process is using
>>> bmap().
>> 'create' flag  will be 1 in write(2) path. ->beyond_isize will only be
>> checked when 'create' flag is 0. Is there any case to be racy by
>> beyond_isize in write(2) path ?
>
> Ah, so instead of write, it will assign physical address to buffers
> beyond i_size for simple read if race?  In this case, it is still wrong.
Right. I will fix this case.
Thanks for review!
> --
> OGAWA Hirofumi 
>
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH] staging: cxt1e1: fix pointer-integer size mismatch warning

2014-02-03 Thread SeongJae Park
Fix the pointer-integer size mismatch warning below:
drivers/staging/cxt1e1/functions.c: In function ‘VMETRO_TRACE’:
drivers/staging/cxt1e1/functions.c:268:21: warning: cast from pointer
to integer of different size [-Wpointer-to-int-cast]
 u_int32_t   y = (u_int32_t) x;
 ^

Signed-off-by: SeongJae Park 
---
 drivers/staging/cxt1e1/functions.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/staging/cxt1e1/functions.c 
b/drivers/staging/cxt1e1/functions.c
index 95218e2..8f19a39 100644
--- a/drivers/staging/cxt1e1/functions.c
+++ b/drivers/staging/cxt1e1/functions.c
@@ -265,7 +265,7 @@ extern ci_t *CI;/* dummy pointer to board 
ZERO's data */
 void
 VMETRO_TRACE (void *x)
 {
-u_int32_t   y = (u_int32_t) x;
+u_int32_t   y = (u_int32_t)(uintptr_t) x;
 
 pci_write_32 ((u_int32_t *) >cpldbase->leds, y);
 }
-- 
1.8.3.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH] staging: bcm: fix pointer-integer size mismatch warnings

2014-02-03 Thread SeongJae Park
Fix the pointer-integer size mismatch warnings below:
drivers/staging/bcm/CmHost.c: In function 
‘StoreCmControlResponseMessage’:
drivers/staging/bcm/CmHost.c:1387:39: warning: cast to pointer from
integer of different size [-Wint-to-pointer-cast]
  pstAddIndication->psfAuthorizedSet = (struct bcm_connect_mgr_params 
*)ntohl((ULONG)pstAddIndication->psfAuthorizedSet);
   ^
drivers/staging/bcm/CmHost.c:1426:37: warning: cast to pointer from
integer of different size [-Wint-to-pointer-cast]
  pstAddIndication->psfAdmittedSet = (struct bcm_connect_mgr_params 
*)ntohl((ULONG)pstAddIndication->psfAdmittedSet);
 ^
drivers/staging/bcm/CmHost.c:1440:35: warning: cast to pointer from
integer of different size [-Wint-to-pointer-cast]
  pstAddIndication->psfActiveSet = (struct bcm_connect_mgr_params 
*)ntohl((ULONG)pstAddIndication->psfActiveSet);
   ^

Signed-off-by: SeongJae Park 
---
 drivers/staging/bcm/CmHost.c | 9 ++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/drivers/staging/bcm/CmHost.c b/drivers/staging/bcm/CmHost.c
index cc91b5e..dd8f8f7 100644
--- a/drivers/staging/bcm/CmHost.c
+++ b/drivers/staging/bcm/CmHost.c
@@ -1384,7 +1384,8 @@ ULONG StoreCmControlResponseMessage(struct 
bcm_mini_adapter *Adapter, PVOID pvBu
}
 
/* this can't possibly be right */
-   pstAddIndication->psfAuthorizedSet = (struct bcm_connect_mgr_params 
*)ntohl((ULONG)pstAddIndication->psfAuthorizedSet);
+   pstAddIndication->psfAuthorizedSet = (struct bcm_connect_mgr_params *)
+   (uintptr_t)ntohl((ULONG)pstAddIndication->psfAuthorizedSet);
 
if (pstAddIndicationAlt->u8Type == DSA_REQ) {
struct bcm_add_request AddRequest;
@@ -1423,7 +1424,8 @@ ULONG StoreCmControlResponseMessage(struct 
bcm_mini_adapter *Adapter, PVOID pvBu
return 0;
}
 
-   pstAddIndication->psfAdmittedSet = (struct bcm_connect_mgr_params 
*)ntohl((ULONG)pstAddIndication->psfAdmittedSet);
+   pstAddIndication->psfAdmittedSet = (struct bcm_connect_mgr_params *)
+   (uintptr_t)ntohl((ULONG)pstAddIndication->psfAdmittedSet);
 
/* ACTIVE SET */
pstAddIndication->psfActiveSet = (struct bcm_connect_mgr_params *)
@@ -1437,7 +1439,8 @@ ULONG StoreCmControlResponseMessage(struct 
bcm_mini_adapter *Adapter, PVOID pvBu
return 0;
}
 
-   pstAddIndication->psfActiveSet = (struct bcm_connect_mgr_params 
*)ntohl((ULONG)pstAddIndication->psfActiveSet);
+   pstAddIndication->psfActiveSet = (struct bcm_connect_mgr_params *)
+   (uintptr_t)ntohl((ULONG)pstAddIndication->psfActiveSet);
 
(*puBufferLength) = sizeof(struct bcm_add_indication);
*(struct bcm_add_indication *)pvBuffer = *pstAddIndication;
-- 
1.8.3.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [BISECTED] Linux 3.12.7 introduces page map handling regression

2014-02-03 Thread Elena Ufimtseva
On Sun, Jan 26, 2014 at 1:02 PM, Elena Ufimtseva  wrote:
> On Fri, Jan 24, 2014 at 8:38 AM, Mel Gorman  wrote:
>> On Thu, Jan 23, 2014 at 11:23:37AM -0500, Elena Ufimtseva wrote:
>>> >> >> 
>>> >> >>
>>> >> >> This dump doesn't look dramatically different, either.
>>> >> >>
>>> >> >>>
>>> >> >>> The other question is - how is AutoNUMA running when it is not 
>>> >> >>> enabled?
>>> >> >>> Shouldn't those _PAGE_NUMA ops be nops when AutoNUMA hasn't even been
>>> >> >>> turned on?
>>> >> >>
>>> >> >>
>>> >> >> Well, NUMA_BALANCING is enabled in the kernel config[1], but I 
>>> >> >> presume you
>>> >> >> mean not enabled at runtime?
>>> >> >>
>>> >> >> [1]
>>> >> >> http://git.uplinklabs.net/snoonan/projects/archlinux/ec2/ec2-packages.git/tree/linux-ec2/config.x86_64
>>> >>
>>> >>
>>> >>
>>> >> --
>>> >> Elena
>>>
>>> I was able to reproduce this consistently, also with the latest mm
>>> patches from yesterday.
>>> Can you please try this:
>>>
>>
>> Thanks Elena,
>>
>>> diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
>>> index ce563be..76dcf96 100644
>>> --- a/arch/x86/xen/mmu.c
>>> +++ b/arch/x86/xen/mmu.c
>>> @@ -365,7 +365,7 @@ void xen_ptep_modify_prot_commit(struct mm_struct
>>> *mm, unsigned long addr,
>>>  /* Assume pteval_t is equivalent to all the other *val_t types. */
>>>  static pteval_t pte_mfn_to_pfn(pteval_t val)
>>>  {
>>> -   if (val & _PAGE_PRESENT) {
>>> +   if ((val & _PAGE_PRESENT) || ((val &
>>> (_PAGE_NUMA|_PAGE_PRESENT)) == _PAGE_NUMA)) {
>>> unsigned long mfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
>>> unsigned long pfn = mfn_to_pfn(mfn);
>>>
>>> @@ -381,7 +381,7 @@ static pteval_t pte_mfn_to_pfn(pteval_t val)
>>>
>>>  static pteval_t pte_pfn_to_mfn(pteval_t val)
>>>  {
>>> -   if (val & _PAGE_PRESENT) {
>>> +   if ((val & _PAGE_PRESENT) || ((val &
>>> (_PAGE_NUMA|_PAGE_PRESENT)) == _PAGE_NUMA)) {
>>> unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
>>> pteval_t flags = val & PTE_FLAGS_MASK;
>>> unsigned long mfn;
>>
>> Would reusing pte_present be an option? Ordinarily I expect that
>> PAGE_NUMA/PAGE_PROTNONE is only set if PAGE_PRESENT is not set and 
>> pte_present
>> is defined as
>>
>> static inline int pte_present(pte_t a)
>> {
>> return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE |
>>_PAGE_NUMA);
>> }
>>
>> So it looks like it work work. Of course it would need to be split to
>> reuse it within xen if pte_present was split to have a pteval_present
>> helper like so
>>
>> static inline int pteval_present(pteval_t val)
>> {
>> /*
>>  * Yes Linus, _PAGE_PROTNONE == _PAGE_NUMA. Expressing it this
>>  * way clearly states that the intent is that a protnone and numa
>>  * hinting ptes are considered present for the purposes of
>>  * pagetable operations like zapping, protection changes, gup etc.
>>  */
>> return val & (_PAGE_PRESENT | _PAGE_PROTNONE | _PAGE_NUMA);
>> }
>>
>> static inline int pte_present(pte_t pte)
>> {
>> return pteval_present(pte_flags(pte))
>> }
>>
>> If Xen is doing some other tricks with _PAGE_PRESENT then it might be
>> ruled out as an option. If so, then maybe it could still be made a
>> little clearer for future reference?
>
> Yes, sure, it should work, I tried it.
> Thank you Mel.
>
>>
>>
>> diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
>> index c1d406f..ff621de 100644
>> --- a/arch/x86/xen/mmu.c
>> +++ b/arch/x86/xen/mmu.c
>> @@ -365,7 +365,7 @@ void xen_ptep_modify_prot_commit(struct mm_struct *mm, 
>> unsigned long addr,
>>  /* Assume pteval_t is equivalent to all the other *val_t types. */
>>  static pteval_t pte_mfn_to_pfn(pteval_t val)
>>  {
>> -   if (val & _PAGE_PRESENT) {
>> +   if ((val & _PAGE_PRESENT) || pteval_numa(val)) {
>> unsigned long mfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
>> unsigned long pfn = mfn_to_pfn(mfn);
>>
>> @@ -381,7 +381,7 @@ static pteval_t pte_mfn_to_pfn(pteval_t val)
>>
>>  static pteval_t pte_pfn_to_mfn(pteval_t val)
>>  {
>> -   if (val & _PAGE_PRESENT) {
>> +   if ((val & _PAGE_PRESENT) || pteval_numa(val)) {
>> unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
>> pteval_t flags = val & PTE_FLAGS_MASK;
>> unsigned long mfn;
>> diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
>> index 8e4f41d..693fe00 100644
>> --- a/include/asm-generic/pgtable.h
>> +++ b/include/asm-generic/pgtable.h
>> @@ -654,10 +654,14 @@ static inline int pmd_trans_unstable(pmd_t *pmd)
>>   * (because _PAGE_PRESENT is not set).
>>   */
>>  #ifndef pte_numa
>> +static inline int pteval_numa(pteval_t pteval)
>> +{
>> +   return (pteval & (_PAGE_NUMA|_PAGE_PRESENT)) == _PAGE_NUMA;
>> +}
>> +
>>  static inline int pte_numa(pte_t pte)
>>  {
>> -   return (pte_flags(pte) &
>> -   

[PATCH] spi: fix pointer-integer size mismatch warning

2014-02-03 Thread SeongJae Park
Fix the pointer-integer size mismatch warning below:
drivers/spi/spi-gpio.c: In function ‘spi_gpio_setup’:
drivers/spi/spi-gpio.c:252:8: warning: cast from pointer to integer of
different size [-Wpointer-to-int-cast]
   cs = (unsigned int) spi->controller_data;
^

Signed-off-by: SeongJae Park 
---
 drivers/spi/spi-gpio.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/spi/spi-gpio.c b/drivers/spi/spi-gpio.c
index cfc9fb3..406bbd7 100644
--- a/drivers/spi/spi-gpio.c
+++ b/drivers/spi/spi-gpio.c
@@ -249,7 +249,7 @@ static int spi_gpio_setup(struct spi_device *spi)
/*
 * ... otherwise, take it from spi->controller_data
 */
-   cs = (unsigned int) spi->controller_data;
+   cs = (unsigned int)(uintptr_t) spi->controller_data;
}
 
if (!spi->controller_state) {
-- 
1.8.3.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v3 5/6] fat: permit to return phy block number by fibmap in fallocated region

2014-02-03 Thread OGAWA Hirofumi
Namjae Jeon  writes:

> 2014-02-04, OGAWA Hirofumi :
>> Namjae Jeon  writes:
>>
>   /* fat_get_cluster() assumes the requested blocknr isn't truncated.
> */
>   down_read(_I(mapping->host)->truncate_lock);
> + /* To get block number beyond file size in fallocated region */
> + atomic_set(_I(mapping->host)->beyond_isize, 1);
>   blocknr = generic_block_bmap(mapping, block, fat_get_block);
> + atomic_set(_I(mapping->host)->beyond_isize, 0);
>   up_read(_I(mapping->host)->truncate_lock);

 This is racy. While user is using bmap, kernel can allocate new blocks.
 We should use another function for this.
>>> I understand that fat can map fallocated blocks in read case while
>>> user is using bmap.
>>> But I can not find the case allocate new blocks.
>>> If I am missing something, Could you please elaborate more ?
>>> Is it a case of _bmap request returning the block number for block
>>> allocated in parallel write path ?
>>
>> ->beyond_size is global for inode. So, write(2) path on same inode with
>> bmap() also can see 1 set by bmap() while another process is using bmap().
> 'create' flag  will be 1 in write(2) path. ->beyond_isize will only be
> checked when 'create' flag is 0. Is there any case to be racy by
> beyond_isize in write(2) path ?

Ah, so instead of write, it will assign physical address to buffers
beyond i_size for simple read if race?  In this case, it is still wrong.
-- 
OGAWA Hirofumi 
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH] drivers/mfd: fix pointer-integer size mismatch warnings

2014-02-03 Thread SeongJae Park
Fix the pointer-integer size mismatch warnings below:
drivers/mfd/wm8994-core.c: In function ‘wm8994_i2c_probe’:
mfd/wm8994-core.c:639:19: warning: cast from pointer to integer of
different size [-Wpointer-to-int-cast]
wm8994->type = (int)of_id->data;
   ^
drivers/mfd/max8997.c: In function ‘max8997_i2c_get_driver_data’:
drivers/mfd/max8997.c:173:10: warning: cast from pointer to integer of
different size [-Wpointer-to-int-cast]
   return (int)match->data;
  ^

Signed-off-by: SeongJae Park 
---
 drivers/mfd/max8997.c | 2 +-
 drivers/mfd/wm8994-core.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/mfd/max8997.c b/drivers/mfd/max8997.c
index be88a3b..768bcb1 100644
--- a/drivers/mfd/max8997.c
+++ b/drivers/mfd/max8997.c
@@ -170,7 +170,7 @@ static inline int max8997_i2c_get_driver_data(struct 
i2c_client *i2c,
if (IS_ENABLED(CONFIG_OF) && i2c->dev.of_node) {
const struct of_device_id *match;
match = of_match_node(max8997_pmic_dt_match, i2c->dev.of_node);
-   return (int)match->data;
+   return (int)(long)match->data;
}
return (int)id->driver_data;
 }
diff --git a/drivers/mfd/wm8994-core.c b/drivers/mfd/wm8994-core.c
index ba04f1b..1b25335 100644
--- a/drivers/mfd/wm8994-core.c
+++ b/drivers/mfd/wm8994-core.c
@@ -636,7 +636,7 @@ static int wm8994_i2c_probe(struct i2c_client *i2c,
if (i2c->dev.of_node) {
of_id = of_match_device(wm8994_of_match, &i2c->dev);
if (of_id)
-   wm8994->type = (int)of_id->data;
+   wm8994->type = (int)(long)of_id->data;
} else {
wm8994->type = id->driver_data;
}
-- 
1.8.3.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v7] xen/grant-table: Avoid m2p_override during mapping

2014-02-03 Thread Matt Wilson
On Mon, Feb 03, 2014 at 01:24:58PM +, Zoltan Kiss wrote:
> The grant mapping API does m2p_override unnecessarily: only gntdev needs it,
> for blkback and future netback patches it just cause a lock contention, as
> those pages never go to userspace. Therefore this series does the following:
> - the original functions were renamed to __gnttab_[un]map_refs, with a new
>   parameter m2p_override
> - based on m2p_override either they follow the original behaviour, or just set
>   the private flag and call set_phys_to_machine
> - gnttab_[un]map_refs are now a wrapper to call __gnttab_[un]map_refs with
>   m2p_override false
> - a new function gnttab_[un]map_refs_userspace provides the old behaviour
> 
> It also removes a stray space from page.h and change ret to 0 if
> XENFEAT_auto_translated_physmap, as that is the only possible return value
> there.
> 
> v2:
> - move the storing of the old mfn in page->index to gnttab_map_refs
> - move the function header update to a separate patch
> 
> v3:
> - a new approach to retain old behaviour where it needed
> - squash the patches into one
> 
> v4:
> - move out the common bits from m2p* functions, and pass pfn/mfn as parameter
> - clear page->private before doing anything with the page, so 
> m2p_find_override
>   won't race with this
> 
> v5:
> - change return value handling in __gnttab_[un]map_refs
> - remove a stray space in page.h
> - add detail why ret = 0 now at some places
> 
> v6:
> - don't pass pfn to m2p* functions, just get it locally
> 
> v7:
> - the previous version broke build on ARM, as there is no need for those p2m
>   changes. I've put them into arch specific functions, which are stubs on arm
>
> Signed-off-by: Zoltan Kiss 
> Suggested-by: David Vrabel 

You're still forgetting that this was originally proposed by Anthony
Liguori .

https://lkml.kernel.org/r/1384307336-5328-1-git-send-email-anth...@codemonkey.ws

--msw
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 2/4] staging/lustre/obdclass: read jobid from proc

2014-02-03 Thread Oleg Drokin
Hello!

On Wed, Oct 30, 2013 at 06:21:01AM -0700, Greg Kroah-Hartman wrote:
> > - * stored in between the "env_start" & "env_end" of task struct.
> > +static char *self_environ_file = "/proc/self/environ";
> 
> Heh, no, that's not ok at all.
> 
> This is a _huge_ sign that you are doing something wrong in your driver
> if you need something that isn't exported, or that you have to dig out
> of proc.
> 
> Sorry, I can't take this, please fix the underlying problems that would
> even think that you need access to the environment from within a kernel
> driver.

I took a stab at this.
This is not a final patch, I know there's still some number of checkpatch
warnings and the proc layout is not finalized yet for example.

But before I spend any more time in polishing this, can you please take a look
and advise if this direction would be acceptable for you when driven to
completion?

Thanks.


>From 6a5b58657cc32163738d4a8c210e8683159b582f Mon Sep 17 00:00:00 2001
From: Oleg Drokin 
Date: Tue, 4 Feb 2014 00:32:12 -0500
Subject: [PATCH] staging/lustre: Obtain jobid information via upcall

Replace lustre jobid information fetching directly from
process env variable with either node-wide jobid obtained via
a proc file, or through an upcall that would provide the jobid
if more fine-grained operations are necessary.

Signed-off-by: Oleg Drokin 
---
 .../staging/lustre/include/linux/libcfs/curproc.h  |   1 -
 .../staging/lustre/include/linux/libcfs/lucache.h  |   6 +
 .../staging/lustre/lustre/include/lprocfs_status.h |   1 +
 .../lustre/lustre/libcfs/linux/linux-curproc.c | 152 -
 drivers/staging/lustre/lustre/obdclass/Makefile|   2 +-
 drivers/staging/lustre/lustre/obdclass/class_obd.c |  67 
 .../staging/lustre/lustre/obdclass/jobid_cache.c   | 181 +
 .../lustre/lustre/obdclass/jobid_internal.h|  10 ++
 .../lustre/lustre/obdclass/linux/linux-module.c| 114 +
 9 files changed, 345 insertions(+), 189 deletions(-)
 create mode 100644 drivers/staging/lustre/lustre/obdclass/jobid_cache.c
 create mode 100644 drivers/staging/lustre/lustre/obdclass/jobid_internal.h

diff --git a/drivers/staging/lustre/include/linux/libcfs/curproc.h 
b/drivers/staging/lustre/include/linux/libcfs/curproc.h
index 507d16b..cf1f26b 100644
--- a/drivers/staging/lustre/include/linux/libcfs/curproc.h
+++ b/drivers/staging/lustre/include/linux/libcfs/curproc.h
@@ -63,7 +63,6 @@ intcfs_curproc_groups_nr(void);
 /* check if task is running in compat mode.*/
 #define current_pid()  (current->pid)
 #define current_comm() (current->comm)
-int cfs_get_environ(const char *key, char *value, int *val_len);
 
 typedef __u32 cfs_cap_t;
 
diff --git a/drivers/staging/lustre/include/linux/libcfs/lucache.h 
b/drivers/staging/lustre/include/linux/libcfs/lucache.h
index 9668b39..f8361d7 100644
--- a/drivers/staging/lustre/include/linux/libcfs/lucache.h
+++ b/drivers/staging/lustre/include/linux/libcfs/lucache.h
@@ -82,6 +82,11 @@ struct md_identity {
struct md_perm  *mi_perms;
 };
 
+struct jobid_cache_entry {
+   struct upcall_cache_entry *jce_uc_entry;
+   char  *jce_jobid;
+};
+
 struct upcall_cache_entry {
struct list_head  ue_hash;
__u64  ue_key;
@@ -92,6 +97,7 @@ struct upcall_cache_entry {
cfs_time_tue_expire;
union {
struct md_identity identity;
+   struct jobid_cache_entry jobid;
} u;
 };
 
diff --git a/drivers/staging/lustre/lustre/include/lprocfs_status.h 
b/drivers/staging/lustre/lustre/include/lprocfs_status.h
index 428e3e4..3c99dcf 100644
--- a/drivers/staging/lustre/lustre/include/lprocfs_status.h
+++ b/drivers/staging/lustre/lustre/include/lprocfs_status.h
@@ -369,6 +369,7 @@ static inline void s2dhms(struct dhms *ts, time_t secs)
 #define JOBSTATS_JOBID_VAR_MAX_LEN 20
 #define JOBSTATS_DISABLE   "disable"
 #define JOBSTATS_PROCNAME_UID  "procname_uid"
+#define JOBSTATS_NODELOCAL "nodelocal"
 
 extern int lprocfs_write_frac_helper(const char *buffer, unsigned long count,
 int *val, int mult);
diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-curproc.c 
b/drivers/staging/lustre/lustre/libcfs/linux/linux-curproc.c
index a2ef64c..7c48601 100644
--- a/drivers/staging/lustre/lustre/libcfs/linux/linux-curproc.c
+++ b/drivers/staging/lustre/lustre/libcfs/linux/linux-curproc.c
@@ -140,158 +140,6 @@ int cfs_capable(cfs_cap_t cap)
return capable(cfs_cap_unpack(cap));
 }
 
-static int cfs_access_process_vm(struct task_struct *tsk, unsigned long addr,
-void *buf, int len, int write)
-{
-   /* Just copied from kernel for the kernels which doesn't
-* have access_process_vm() exported */
-   struct mm_struct *mm;
-   struct vm_area_struct *vma;
-   struct page *page;
-   void 

Re: [Xen-devel] xen-blkback: bug fixes

2014-02-03 Thread Matt Wilson
On Tue, Jan 28, 2014 at 03:38:37PM -0400, Konrad Rzeszutek Wilk wrote:
> On Tue, Jan 28, 2014 at 06:43:32PM +0100, Roger Pau Monne wrote:
> > blkback bug fixes for memory leaks (patches 1 and 2) and a race 
> > (patch 3).
> 
> They all look OK to me. I've stuck them in my 'stable/for-jens-3.14'
> branch and are testing them now (hadn't pushed it yet).
> 
> Matt and Matt,
> 
> Could you take a look at the other two patches as well?

Sure, though somehow you didn't address your message to us, so I
didn't see it until today.

Matt Rushton did some review and testing on an earlier version that
came out fine. We'll give the final series a test since there was
still a bit of rework.

--msw

> David, Boris,
> 
> Are you OK with pushing those patches out to Jens Axboe if nobody
> gives an NACK by Friday?
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v2 2/7] memcg, slab: cleanup memcg cache name creation

2014-02-03 Thread Vladimir Davydov
On 02/04/2014 02:08 AM, Andrew Morton wrote:
> On Mon, 3 Feb 2014 19:54:37 +0400 Vladimir Davydov  
> wrote:
>
>> The way memcg_create_kmem_cache() creates the name for a memcg cache
>> looks rather strange: it first formats the name in the static buffer
>> tmp_name protected by a mutex, then passes the pointer to the buffer to
>> kmem_cache_create_memcg(), which finally duplicates it to the cache
>> name.
>>
>> Let's clean this up by moving memcg cache name creation to a separate
>> function to be called by kmem_cache_create_memcg(), and estimating the
>> length of the name string before copying anything to it so that we won't
>> need a temporary buffer.
>>
>> --- a/mm/memcontrol.c
>> +++ b/mm/memcontrol.c
>> @@ -3193,6 +3193,37 @@ int memcg_update_cache_size(struct kmem_cache *s, int 
>> num_groups)
>>  return 0;
>>  }
>>  
>> +static int memcg_print_cache_name(char *buf, size_t size,
>> +struct mem_cgroup *memcg, struct kmem_cache *root_cache)
>> +{
>> +int ret;
>> +
>> +rcu_read_lock();
>> +ret = snprintf(buf, size, "%s(%d:%s)", root_cache->name,
>> +   memcg_cache_id(memcg), cgroup_name(memcg->css.cgroup));
>> +rcu_read_unlock();
>> +return ret;
>> +}
>> +
>> +char *memcg_create_cache_name(struct mem_cgroup *memcg,
>> +  struct kmem_cache *root_cache)
>> +{
>> +int len;
>> +char *name;
>> +
>> +/*
>> + * We cannot use kasprintf() here, because cgroup_name() must be called
>> + * under RCU protection.
>> + */
>> +len = memcg_print_cache_name(NULL, 0, memcg, root_cache);
>> +
>> +name = kmalloc(len + 1, GFP_KERNEL);
>> +if (name)
>> +memcg_print_cache_name(name, len + 1, memcg, root_cache);
> but but but this assumes that cgroup_name(memcg->css.cgroup) did not
> change between the two calls to memcg_print_cache_name().  If that is
> the case then the locking was unneeded anyway.

Oops, I missed that. Thank you for pointing that out. It seems the usage
of the temporary buffer is inevitable. However, a dedicated mutex
protecting it can be removed, because we already hold the slab_mutex
while calling this function. Will rework.

Thanks.

>
>> +return name;
>> +}
>> +
>>  int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s,
>>   struct kmem_cache *root_cache)
>>  {
>> @@ -3397,44 +3428,6 @@ void mem_cgroup_destroy_cache(struct kmem_cache 
>> *cachep)
>>  schedule_work(&cachep->memcg_params->destroy);
>>  }
>>  

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 7/8] Add 32 bit VDSO time support for 32 bit kernel

2014-02-03 Thread Stefani Seibold
Am Montag, den 03.02.2014, 14:04 -0800 schrieb Andy Lutomirski:
> On Mon, Feb 3, 2014 at 2:01 PM, Stefani Seibold  wrote:
> > Am Montag, den 03.02.2014, 08:36 -0800 schrieb Andy Lutomirski:
> >> On Sun, Feb 2, 2014 at 11:44 PM, Stefani Seibold  
> >> wrote:
> >> > Am Sonntag, den 02.02.2014, 16:12 -0800 schrieb Andy Lutomirski:
> >> >> On Sun, Feb 2, 2014 at 1:39 PM, Stefani Seibold  
> >> >> wrote:
> >> >> > Am Sonntag, den 02.02.2014, 08:46 -0800 schrieb Andy Lutomirski:
> >> >> >> On Sun, Feb 2, 2014 at 3:27 AM,   wrote:
> >> >> >> > From: Stefani Seibold 
> >> >> >> >
> >> >> >> > This patch add the time support for 32 bit a VDSO to a 32 bit 
> >> >> >> > kernel.
> >> >> >>
> >> >> >> [...]
> >> >> >>
> >> >> >> Can you address the review comments from last time around?  For
> >> >> >> example, this still seems to have redundant vvar and hpet mappings, 
> >> >> >> it
> >> >> >> doesn't use the VVAR macro, it moves the 32-bit compat vDSO, etc.
> >> >> >>
> >> >> >
> >> >> > I will address the compat VDSO issue.
> >> >> >
> >> >> > But the VVAR macro will be not a part of this patch set. If you depend
> >> >> > on this, feel free to create one. From my point of view this is not
> >> >> > feasible without a macro hacking, because the address accessing the 
> >> >> > vvar
> >> >> > area differs in kernel and VDSO user mode.
> >> >>
> >> >> Sorry, but "I will make the code messier for no apparent reason and I
> >> >> will not offer to fix it in the same series" gets my NAK.
> >> >>
> >> >> Hint: I'm talking about two or three lines of code in vvar.h.
> >> >>
> >> >
> >> > A hint back: if you threaten me with a NAK for a requested code sequence
> >> > which currently has no user, this is far away from professional. I am not
> >> > your trainee.
> >> >
> >> > BTW: If it is so easy, send me the two or three lines and i will merge
> >> > it ;-)
> >>
> >> Something to the effect of:
> >>
> >> #elif defined(BUILD_VDSO32)
> >> #define VVAR(name) (*vvar_ ## name)
> >> #endif
> >>
> >> Should do the trick.
> >
> > You are wrong...
> >
> > #ifdef BUILD_VDSO32
> >
> > #define DECLARE_VVAR(offset, type, name) \
> > extern type vvar_ ## name __attribute__((visibility("hidden")));
> >
> > #define VVAR(name) (vvar_ ## name)
> >
> > #else
> >
> > /* Base address of vvars.  This is not ABI. */
> > #ifdef CONFIG_X86_64
> > #define VVAR_ADDRESS (-10*1024*1024 - 4096)
> > #else
> > extern char __vvar_page;
> >
> > #define VVAR_ADDRESS (&__vvar_page)
> > #endif
> >
> > This would do the trick!
> >
> > But for the 64 bit to 32 bit conversion layer in vclock_gettime.c there is
> > still a
> >
> > struct arch_vsyscall_gtod_data arch_vvar_vsyscall_gtod_data
> > __attribute__((visibility("hidden")));
> > #define gtod (_vvar_vsyscall_gtod_data)
> >
> > needed, because vvar_vsyscall_gtod_data is the 32 bit version, which
> > would result in incorrect access of the struct members. So my code can't
> > use this VVAR macro.
> 
> For 32-on-64, I must have read your code wrong.  Are you sticking two
> copies of the same struct with different layout into the vvar page?
> If so, wouldn't it be better to only have the variant with a common
> layout and use it for all versions of the vdso?
> 

No, only one. But depending on 32/64 bit the layout differs.

We discussed this topic some days ago, so please have a look at the
code and the previous posts.

- Stefani


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] Make math_state_restore() save and restore the interrupt flag

2014-02-03 Thread Suresh Siddha
On Mon, 2014-02-03 at 10:20 -0800, Linus Torvalds wrote:
> Thinking about it some more, this patch is *almost* not needed at all.
> 
> I'm wondering if you should just change the first patch to just always
> initialize the fpu when it is allocated, and at execve() time (ie in
> flush_thread()).
> 

We already do this for eager-fpu case, in eager_fpu_init() during boot
and in drop_init_fpu() during flush_thread().

> If we do that, then this:
> 
> +   if (!tsk_used_math(tsk))
> +   init_fpu(tsk);
> 
> can be dropped entirely from math_state_restore(). 

yeah, probably for eager-fpu, but:

> And quite frankly,
> at that point, I think all the changes to __kernel_fpu_end() can go
> away, because at that point math_state_restore() really does the right
> thing - all the allocations are gone, and all the async task state
> games are gone, only the "restore state" remains.
> 
> Hmm? So the only thing needed would be to add that "init_fpu()" to the
> initial bootmem allocation path and to change flush_thread() (it
> currently does "drop_init_fpu()", let's just make it initialize the
> FPU state using fpu_finit()), and then we could remove the whole
> "used_math" bit entirely, and just say that the FPU is always
> initialized.
> 
> What do you guys think?

No. As I mentioned in the changelog, there is one more path which does
drop_fpu() and we still depend on this used_math bit for eager-fpu.

in signal restore path for 32-bit app, where we copy the sig-context
state from the user stack to the kernel manually (because of legacy
reasons where fsave state is followed by fxsave state etc in the 32-bit
signal handler context and we have to go through convert_to_fxsr() etc).

from __restore_xstate_sig() :

/*
 * Drop the current fpu which clears used_math(). This ensures
 * that any context-switch during the copy of the new state,
 * avoids the intermediate state from getting restored/saved.
 * Thus avoiding the new restored state from getting corrupted.
 * We will be ready to restore/save the state only after
 * set_used_math() is again set.
 */
drop_fpu(tsk);


thanks,
suresh

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] spi: rspi: fix build error when CONFIG_OF is not set

2014-02-03 Thread Shimoda, Yoshihiro
Hi Geert-san,

(2014/02/03 17:47), Geert Uytterhoeven wrote:
>   Hi Shimoda-san,
> 
> On Mon, 3 Feb 2014, Shimoda, Yoshihiro wrote:
>> This patch fixes an issue that the following build error happens when
>> the CONFIG_OF is not set:
>>
>> drivers/spi/spi-rspi.c: In function 'rspi_probe':
>> drivers/spi/spi-rspi.c:1203:26: error: 'rspi_of_match' undeclared (first use 
>> in this function)
>>
>> Signed-off-by: Yoshihiro Shimoda 
>> ---
>>  This patch is based on the latest origin/topic/rspi branch in the spi.git.
>>
>>  drivers/spi/spi-rspi.c |1 +
>>  1 files changed, 1 insertions(+), 0 deletions(-)
>>
>> diff --git a/drivers/spi/spi-rspi.c b/drivers/spi/spi-rspi.c
>> index 34ad4bc..e5cfc3d 100644
>> --- a/drivers/spi/spi-rspi.c
>> +++ b/drivers/spi/spi-rspi.c
>> @@ -1164,6 +1164,7 @@ static int rspi_parse_dt(struct device *dev, struct 
>> spi_master *master)
>>  return 0;
>>  }
>>  #else
>> +#define rspi_of_match   NULL
>>  static inline int rspi_parse_dt(struct device *dev, struct spi_master 
>> *master)
>>  {
>>  return -EINVAL;
>> -- 
>> 1.7.1
> 
> Thanks, obviously I missed that of_match_device() still uses the ID table
> parameter if CONFIG_OF=n :-(
> 
> Below I have two alternative solutions:
>   1. Uses rspi_of_match() to nullify the ID table pointer, like is done in
>  the platform_driver structure,
>   2. Fixes it at the OF subsystem level, by nullifying the ID table pointer
>  inside of_match_device().
> 
> If 2 is accepted, drivers don't have to care about this anymore.
> 
> What do you think?

Thank you for the reply.
I think the 2nd one is a nice idea.
If I applied it without my patch, the build error disappeared.

Best regards,
Yoshihiro Shimoda
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH] regulator: core: Make regulator object reflect configured voltage

2014-02-03 Thread Bjorn Andersson
In the case when a regulator is initialized from DT with equal min and max
voltages the voltage is applied on initialization and future calls to
regulator_set_voltage fail. This behaviour is different than if the regulator
is configured to be a span and therefore requires logic to handle this
difference in the consumer driver.

Eliminate this difference by populating the min_uV and max_uV of the newly
created regulator from the constraints so that calls to regulator_set_voltage
are considered no-ops and not failures.

Signed-off-by: Bjorn Andersson 
---
 drivers/regulator/core.c |   10 ++
 1 file changed, 10 insertions(+)

diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c
index d85f313..9c82d37 100644
--- a/drivers/regulator/core.c
+++ b/drivers/regulator/core.c
@@ -1209,6 +1209,16 @@ static struct regulator *create_regulator(struct 
regulator_dev *rdev,
_regulator_is_enabled(rdev))
regulator->always_on = true;
 
+   /*
+* Make the regulator reflect the configured voltage selected in
+* machine_constraints_voltage()
+*/
+   if (rdev->constraints->apply_uV &&
+   rdev->constraints->min_uV == rdev->constraints->max_uV) {
+   regulator->min_uV = rdev->constraints->min_uV;
+   regulator->max_uV = rdev->constraints->min_uV;
+   }
+
mutex_unlock(&rdev->mutex);
return regulator;
 overflow_err:
-- 
1.7.9.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 0/4] w1: refcnt fix, skip non-error send, docs

2014-02-03 Thread David Fries
On Tue, Feb 04, 2014 at 03:59:38AM +0400, z...@ioremap.net wrote:
> Hi
> 
> 03.02.2014, 05:15, "David Fries" :
> 
> >  I could submit these patches as is, which would require the previous
> >  set, or I could merge the documentation into the previous set and
> >  resubmit them all since they haven't made it into the kernel tree yet.
> >  Opinions?
> >
> >  Here's a small refcnt fix, skipping sending non-error messages, and
> >  documentation and comment updates.
> >
> >  non-error error messages:
> >  Currently every master or slave command is sending a response with
> >  w1_netlink_send_error no matter if there is an error or not.  This
> >  makes commands like list slaves W1_CMD_LIST_SLAVES or W1_CMD_READ
> >  return two messages, one with data and one without.  That is a problem
> >  with the list slaves because they are identical except for one having
> >  data and one not, and since there could be no slaves known to the
> >  kernel you can't just discard the no data case, unless the program
> >  were to expect two replies.  So I propose only sending the error reply
> >  if there is an error, in which case there wouldn't be a normal reply
> >  (such as read).  This would mean commands like write would no longer
> >  return a response unless there was an error.  If an application wanted
> >  to verify the kernel received the write message it could follow it by
> >  a read to verify the data or just that read came after write and had a
> >  response so write must have completed without error.  I think it is
> >  safe to do away with the extra replies.  If someone sees a big enough
> >  need for this, I could modify it so all commands return one response,
> >  with commands like write always calling send error even if there
> >  wasn't one.
> 
> I created this protocol to handle cases where nothing is returned, but yet 
> userspace knows
> the operation has been completed. Also, you cannot really change it at this 
> time - there are
> already userspace applications which may depend on the last ack to find out 
> that their request completed.
> 
> Reference counter fix is correct, please submit it in the separate patch.

Help me understand what the protocol is supposed to be.  Assuming
there aren't any errors, is there supposed to be a
w1_netlink_send_error generated reply per netlink packet (cn_msg), per
w1_netlink_msg, or per w1_netlink_cmd?

What about the cn_msg seq and ack values?  I assume the kernel
response should carry the same seq number as the request, but what
should the ack be set to?

-- 
David Fries PGP pub CB1EE8F0
http://fries.net/~david/
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v2] dma: Add Xilinx AXI Video Direct Memory Access Engine driver support

2014-02-03 Thread Vinod Koul
On Fri, Jan 31, 2014 at 12:22:52PM +0530, Srikanth Thokala wrote:
> >>> >> [...]
> >>> >>> +/**
> >>> >>> + * xilinx_vdma_device_control - Configure DMA channel of the device
> >>> >>> + * @dchan: DMA Channel pointer
> >>> >>> + * @cmd: DMA control command
> >>> >>> + * @arg: Channel configuration
> >>> >>> + *
> >>> >>> + * Return: '0' on success and failure value on error
> >>> >>> + */
> >>> >>> +static int xilinx_vdma_device_control(struct dma_chan *dchan,
> >>> >>> +   enum dma_ctrl_cmd cmd, unsigned 
> >>> >>> long arg)
> >>> >>> +{
> >>> >>> + struct xilinx_vdma_chan *chan = to_xilinx_chan(dchan);
> >>> >>> +
> >>> >>> + switch (cmd) {
> >>> >>> + case DMA_TERMINATE_ALL:
> >>> >>> + xilinx_vdma_terminate_all(chan);
> >>> >>> + return 0;
> >>> >>> + case DMA_SLAVE_CONFIG:
> >>> >>> + return xilinx_vdma_slave_config(chan,
> >>> >>> + (struct xilinx_vdma_config 
> >>> >>> *)arg);
> >>> >>
> >>> >> You really shouldn't be overloading the generic API with your own 
> >>> >> semantics.
> >>> >> DMA_SLAVE_CONFIG should take a dma_slave_config and nothing else.
> >>> >
> >>> > Ok.  The driver needs few additional configuration from the slave
> >>> > device like Vertical
> >>> > Size, Horizontal Size,  Stride etc., for the DMA transfers, in that 
> >>> > case do you
> >>> > suggest me to define a separate dma_ctrl_cmd like the one 
> >>> > FSLDMA_EXTERNAL_START
> >>> > defined for Freescale drivers?
> >>>
> >>> In my opinion it is not a good idea to have driver implement a generic 
> >>> API,
> >>> but at the same time let the driver have custom semantics for those API
> >>> calls. It's a bit like having a gpio driver that expects 23 and 42 as the
> >>> values passed to gpio_set_value instead of 0 and 1. It completely defeats
> >>> the purpose of a generic API, namely that you are able to write generic 
> >>> code
> >>> that makes use of the API without having to know about which 
> >>> implementation
> >>> API it is talking to. The dmaengine framework provides the
> >>> dmaengine_prep_interleaved_dma() function to setup two dimensional
> >>> transfers, e.g. take a look at sirf-dma.c or imx-dma.c.
> >>
> >> The question here, I think, would be what this device supports? Is the 
> >> hardware
> >> capable of doing interleaved transfers, then would make sense.
> >>
> >> While we do try to get users use dma_slave_config, but there will always be
> >> someone who have specfic params. If we can generalize then we might want 
> >> to add
> >> to the dma_slave_config as well
> >
> > There are many configuration parameters which are specific to IP and I
> > would like to
> > give an overview of some of parameteres here:
> >
> > 1) Park Mode ('cfg->park'): In Park mode, engine will park on frame
> > referenced by
> > 'cfg->park_frm', so user will have control on each frame in this mode.
> >
> > 2) Interrupt Coalesce ('cfg->coalesce'):  Used for setting interrupt
> > threshold. This value
> >determines the number of frame buffers to process. To use this feature,
> >'cfg->frm_cnt_en' should be set.
> >
> > 3) Frame Synchronization Source ('cfg->ext_fsync'):  Can be an
> > external/internal frame
> > synchronization source. Used to synchronize one channel (MM2S/S2MM) with
> > another (S2MM/MM2S) channel.
> >
> > 4) Genlock Synchronization ('cfg->genlock'): Used to avoid mismatch rate 
> > between
> > master and slave.  In master mode (cfg->master), frames are not dropped 
> > and
> > slave can drop frames to adjust to master frame rate.
> >
> > And in future, this Engine being a soft IP, we could expect some more 
> > additional
> > parameters.  Isn't a good idea to have a private member in dma_slave_config 
> > for
> > sharing additional configuration between slave device and dma engine? Or a 
> > new
> > dma_ctrl_cmd like FSLDMA_EXTERNAL_START?

The idea of a generic API is that we can use it for most of the controllers. 
Even
if you are planning to support a family of controllers

ATM, let's not discuss the possibility of a private member and try to exhaust all
possible options. Worst case you can embed the dma_slave_config in
xilinx_dma_slave_config and retrieve it in dmac driver

-- 
~Vinod
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH] fdtable: Avoid triggering OOMs from alloc_fdmem

2014-02-03 Thread Eric W. Biederman

Recently due to a spike in connections per second memcached on 3
separate boxes triggered the OOM killer from accept.  At the time the
OOM killer was triggered there was 4GB out of 36GB free in zone 1. The
problem was that alloc_fdtable was allocating an order 3 page (32KiB) to
hold a bitmap, and there was sufficient fragmentation that the largest
page available was 8KiB.

I find the logic that PAGE_ALLOC_COSTLY_ORDER can't fail pretty dubious
but I do agree that order 3 allocations are very likely to succeed.

There are always pathologies where order > 0 allocations can fail when
there are copious amounts of free memory available.  Using the pigeon
hole principle it is easy to show that it requires 1 page more than 50%
of the pages being free to guarantee an order 1 (8KiB) allocation will
succeed, 1 page more than 75% of the pages being free to guarantee an
order 2 (16KiB) allocation will succeed and 1 page more than 87.5% of
the pages being free to guarantee an order 3 allocation will succeed.

A server churning memory with a lot of small requests and replies like
memcached is a common case that, if anything can, will skew the odds
against large pages being available.

Therefore let's not give external applications a practical way to kill
linux server applications, and specify __GFP_NORETRY to the kmalloc in
alloc_fdmem.  Unless I am misreading the code, by the time the code
reaches should_alloc_retry in __alloc_pages_slowpath (where
__GFP_NORETRY becomes significant), we have already tried everything
reasonable to allocate a page and the only thing left to do is wait.  So
not waiting and falling back to vmalloc immediately seems like the
reasonable thing to do even if there wasn't a chance of triggering the
OOM killer.

Cc: sta...@vger.kernel.org
Signed-off-by: "Eric W. Biederman" 
---
 fs/file.c |2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/fs/file.c b/fs/file.c
index 771578b33fb6..db25c2bdfe46 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -34,7 +34,7 @@ static void *alloc_fdmem(size_t size)
 * vmalloc() if the allocation size will be considered "large" by the 
VM.
 */
if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
-   void *data = kmalloc(size, GFP_KERNEL|__GFP_NOWARN);
+   void *data = kmalloc(size, 
GFP_KERNEL|__GFP_NOWARN|__GFP_NORETRY);
if (data != NULL)
return data;
}
-- 
1.7.5.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH] kernel: kprobe: move all *kretprobe* generic implementation to CONFIG_KRETPROBES enabled area

2014-02-03 Thread Chen Gang
When CONFIG_KRETPROBES is disabled, all *kretprobe* generic implementations
are useless, so move them to the CONFIG_KRETPROBES enabled area.

Now, *kretprobe* generic implementation are all implemented in 2 files:

 - in "include/linux/kprobes.h":

 move inline kretprobe*() to the CONFIG_KRETPROBES area, with dummies outside.
 move some *kprobe() declarations which kretprobe*() call, to front.
 not touch kretprobe_blacklist[] which is architecture's variable.

 - in "kernel/kprobes.c":

 move all kretprobe* to the CONFIG_KRETPROBES area, with dummies outside.
 define kretprobe_flush_task() to let kprobe_flush_task() call.
 define init_kretprobes() to let init_kprobes() call.

The patch passes compiling (get "kernel/kprobes.o" and "kernel/built-
in.o") under avr32 and x86_64 allmodconfig, and passes building (get
bzImage and Modpost modules) under x86_64 defconfig.


Signed-off-by: Chen Gang 
---
 include/linux/kprobes.h |  58 +
 kernel/kprobes.c| 328 +++-
 2 files changed, 222 insertions(+), 164 deletions(-)

diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index 925eaf2..c0d1212 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -223,10 +223,36 @@ static inline int kprobes_built_in(void)
return 1;
 }
 
+int disable_kprobe(struct kprobe *kp);
+int enable_kprobe(struct kprobe *kp);
+
+void dump_kprobe(struct kprobe *kp);
+
+extern struct kretprobe_blackpoint kretprobe_blacklist[];
+
 #ifdef CONFIG_KRETPROBES
 extern void arch_prepare_kretprobe(struct kretprobe_instance *ri,
   struct pt_regs *regs);
 extern int arch_trampoline_kprobe(struct kprobe *p);
+static inline void kretprobe_assert(struct kretprobe_instance *ri,
+   unsigned long orig_ret_address, unsigned long trampoline_address)
+{
+   if (!orig_ret_address || (orig_ret_address == trampoline_address)) {
+   printk(KERN_ERR
+   "kretprobe BUG!: Processing kretprobe %p @ %p\n",
+   ri->rp, ri->rp->kp.addr);
+   BUG();
+   }
+}
+static inline int disable_kretprobe(struct kretprobe *rp)
+{
+   return disable_kprobe(&rp->kp);
+}
+static inline int enable_kretprobe(struct kretprobe *rp)
+{
+   return enable_kprobe(&rp->kp);
+}
+
 #else /* CONFIG_KRETPROBES */
 static inline void arch_prepare_kretprobe(struct kretprobe *rp,
struct pt_regs *regs)
@@ -236,19 +262,20 @@ static inline int arch_trampoline_kprobe(struct kprobe *p)
 {
return 0;
 }
-#endif /* CONFIG_KRETPROBES */
-
-extern struct kretprobe_blackpoint kretprobe_blacklist[];
-
 static inline void kretprobe_assert(struct kretprobe_instance *ri,
unsigned long orig_ret_address, unsigned long trampoline_address)
 {
-   if (!orig_ret_address || (orig_ret_address == trampoline_address)) {
-   printk("kretprobe BUG!: Processing kretprobe %p @ %p\n",
-   ri->rp, ri->rp->kp.addr);
-   BUG();
-   }
 }
+static inline int disable_kretprobe(struct kretprobe *rp)
+{
+   return 0;
+}
+static inline int enable_kretprobe(struct kretprobe *rp)
+{
+   return 0;
+}
+
+#endif /* CONFIG_KRETPROBES */
 
 #ifdef CONFIG_KPROBES_SANITY_TEST
 extern int init_test_probes(void);
@@ -379,11 +406,6 @@ void unregister_kretprobes(struct kretprobe **rps, int 
num);
 void kprobe_flush_task(struct task_struct *tk);
 void recycle_rp_inst(struct kretprobe_instance *ri, struct hlist_head *head);
 
-int disable_kprobe(struct kprobe *kp);
-int enable_kprobe(struct kprobe *kp);
-
-void dump_kprobe(struct kprobe *kp);
-
 #else /* !CONFIG_KPROBES: */
 
 static inline int kprobes_built_in(void)
@@ -459,14 +481,6 @@ static inline int enable_kprobe(struct kprobe *kp)
return -ENOSYS;
 }
 #endif /* CONFIG_KPROBES */
-static inline int disable_kretprobe(struct kretprobe *rp)
-{
-   return disable_kprobe(&rp->kp);
-}
-static inline int enable_kretprobe(struct kretprobe *rp)
-{
-   return enable_kprobe(&rp->kp);
-}
 static inline int disable_jprobe(struct jprobe *jp)
 {
return disable_kprobe(&jp->kp);
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index ceeadfc..e305a81 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -69,7 +69,6 @@
 
 static int kprobes_initialized;
 static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE];
-static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
 
 /* NOTE: change this value only with kprobe_mutex held */
 static bool kprobes_all_disarmed;
@@ -77,14 +76,6 @@ static bool kprobes_all_disarmed;
 /* This protects kprobe_table and optimizing_list */
 static DEFINE_MUTEX(kprobe_mutex);
 static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
-static struct {
-   raw_spinlock_t lock cacheline_aligned_in_smp;
-} kretprobe_table_locks[KPROBE_TABLE_SIZE];
-
-static raw_spinlock_t *kretprobe_table_lock_ptr(unsigned long hash)
-{
-   return 

[PATCH 4/4] power_supply: bq24261 charger driver

2014-02-03 Thread Jenny TC
This patch introduces BQ24261 charger driver. The driver makes use of power
supply charging driver to setup charging. So the driver does hardware
abstraction and handles h/w specific corner cases. The charging logic resides
with power supply charging driver

Signed-off-by: Jenny TC 
---
 drivers/power/Kconfig |   10 +
 drivers/power/Makefile|1 +
 drivers/power/bq24261-charger.c   | 1358 +
 include/linux/power/bq24261-charger.h |   25 +
 4 files changed, 1394 insertions(+)
 create mode 100644 drivers/power/bq24261-charger.c
 create mode 100644 include/linux/power/bq24261-charger.h

diff --git a/drivers/power/Kconfig b/drivers/power/Kconfig
index 913ec36..a1c2780 100644
--- a/drivers/power/Kconfig
+++ b/drivers/power/Kconfig
@@ -409,6 +409,16 @@ config BATTERY_GOLDFISH
  Say Y to enable support for the battery and AC power in the
  Goldfish emulator.
 
+config CHARGER_BQ24261
+   tristate "BQ24261 charger driver"
+   select POWER_SUPPLY_CHARGER
+   depends on I2C
+   help
+ Say Y to include support for BQ24261 Charger driver. This driver
+ makes use of power supply charging driver. So the driver gives
+ the charger hardware abstraction only. Charging logic is abstracted
+ in the power supply charging driver.
+
 source "drivers/power/reset/Kconfig"
 
 endif # POWER_SUPPLY
diff --git a/drivers/power/Makefile b/drivers/power/Makefile
index 77535fd..9dde895 100644
--- a/drivers/power/Makefile
+++ b/drivers/power/Makefile
@@ -59,4 +59,5 @@ obj-$(CONFIG_CHARGER_BQ24735) += bq24735-charger.o
 obj-$(CONFIG_POWER_AVS)+= avs/
 obj-$(CONFIG_CHARGER_SMB347)   += smb347-charger.o
 obj-$(CONFIG_CHARGER_TPS65090) += tps65090-charger.o
+obj-$(CONFIG_CHARGER_BQ24261)  += bq24261-charger.o
 obj-$(CONFIG_POWER_RESET)  += reset/
diff --git a/drivers/power/bq24261-charger.c b/drivers/power/bq24261-charger.c
new file mode 100644
index 000..a87d1cc
--- /dev/null
+++ b/drivers/power/bq24261-charger.c
@@ -0,0 +1,1358 @@
+/*
+ * bq24261-charger.c - BQ24261 Charger I2C client driver
+ *
+ * Copyright (C) 2011 Intel Corporation
+ *
+ * ~~
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * ~~
+ * Author: Jenny TC 
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+
+#define DEV_NAME "bq24261_charger"
+#define DEV_MANUFACTURER "TI"
+#define MODEL_NAME_SIZE 8
+#define DEV_MANUFACTURER_NAME_SIZE 4
+
+#define EXCEPTION_MONITOR_DELAY (60 * HZ)
+#define WDT_RESET_DELAY (15 * HZ)
+
+/* BQ24261 registers */
+#define BQ24261_STAT_CTRL0_ADDR0x00
+#define BQ24261_CTRL_ADDR  0x01
+#define BQ24261_BATT_VOL_CTRL_ADDR 0x02
+#define BQ24261_VENDOR_REV_ADDR0x03
+#define BQ24261_TERM_FCC_ADDR  0x04
+#define BQ24261_VINDPM_STAT_ADDR   0x05
+#define BQ24261_ST_NTC_MON_ADDR0x06
+
+#define BQ24261_RESET_MASK (0x01 << 7)
+#define BQ24261_RESET_ENABLE   (0x01 << 7)
+
+#define BQ24261_FAULT_MASK 0x07
+#define BQ24261_STAT_MASK  (0x03 << 4)
+#define BQ24261_BOOST_MASK (0x01 << 6)
+#define BQ24261_TMR_RST_MASK   (0x01 << 7)
+#define BQ24261_TMR_RST(0x01 << 7)
+
+#define BQ24261_ENABLE_BOOST   (0x01 << 6)
+
+#define BQ24261_VOVP   0x01
+#define BQ24261_LOW_SUPPLY 0x02
+#define BQ24261_THERMAL_SHUTDOWN   0x03
+#define BQ24261_BATT_TEMP_FAULT0x04
+#define BQ24261_TIMER_FAULT0x05
+#define BQ24261_BATT_OVP   0x06
+#define BQ24261_NO_BATTERY 0x07
+#define BQ24261_STAT_READY 0x00
+
+#define BQ24261_STAT_CHRG_PRGRSS   (0x01 << 4)
+#define BQ24261_STAT_CHRG_DONE (0x02 << 4)
+#define BQ24261_STAT_FAULT (0x03 << 4)
+
+#define BQ24261_CE_MASK(0x01 << 1)
+#define BQ24261_CE_DISABLE (0x01 << 1)
+
+#define BQ24261_HZ_MASK(0x01)
+#define BQ24261_HZ_ENABLE  (0x01)
+
+#define BQ24261_ICHRG_MASK (0x1F << 3)
+#define BQ24261_MIN_CC 500 /* 500mA */
+#define BQ24261_MAX_CC 3000 /* 3A */
+
+#define BQ24261_ITERM_MASK (0x03)
+#define 

[PATCH 3/4] power_supply: Introduce PSE compliant algorithm

2014-02-03 Thread Jenny TC
As per Product Safety Engineering (PSE) specification for battery charging, the
battery characteristics and thereby the charging rates can vary on different
temperature zones. This patch introduces a PSE compliant charging algorithm with
maintenance charging support. The algorithm can be selected by the power supply
charging driver based on the type of the battery charging profile.

Signed-off-by: Jenny TC 
---
 drivers/power/Kconfig  |   13 ++
 drivers/power/Makefile |1 +
 drivers/power/charging_algo_pse.c  |  198 
 include/linux/power/power_supply_charger.h |   48 +++
 4 files changed, 260 insertions(+)
 create mode 100644 drivers/power/charging_algo_pse.c

diff --git a/drivers/power/Kconfig b/drivers/power/Kconfig
index f679f82..913ec36 100644
--- a/drivers/power/Kconfig
+++ b/drivers/power/Kconfig
@@ -22,6 +22,19 @@ config POWER_SUPPLY_CHARGER
  drivers to keep the charging logic outside and the charger driver
  just need to abstract the charger hardware.
 
+config POWER_SUPPLY_CHARGING_ALGO_PSE
+   bool "PSE compliant charging algorithm"
+   help
+ Say Y here to select Product Safety Engineering (PSE) compliant
+ charging algorithm. As per PSE standard the battery characteristics
+ and thereby the charging rates can vary on different temperature
+ zones. This config will enable PSE compliant charging algorithm with
+ maintenance charging support. At runtime the algorithm will be
+ selected by the psy charger driver based on the type of the battery
+ charging profile.
+
+   depends on POWER_SUPPLY_CHARGER
+
 config PDA_POWER
tristate "Generic PDA/phone power driver"
depends on !S390
diff --git a/drivers/power/Makefile b/drivers/power/Makefile
index 405f0f4..77535fd 100644
--- a/drivers/power/Makefile
+++ b/drivers/power/Makefile
@@ -8,6 +8,7 @@ obj-$(CONFIG_POWER_SUPPLY)  += power_supply.o
 obj-$(CONFIG_GENERIC_ADC_BATTERY)  += generic-adc-battery.o
 
 obj-$(CONFIG_POWER_SUPPLY_CHARGER) += power_supply_charger.o
+obj-$(CONFIG_POWER_SUPPLY_CHARGING_ALGO_PSE) += charging_algo_pse.o
 obj-$(CONFIG_PDA_POWER)+= pda_power.o
 obj-$(CONFIG_APM_POWER)+= apm_power.o
 obj-$(CONFIG_MAX8925_POWER)+= max8925_power.o
diff --git a/drivers/power/charging_algo_pse.c 
b/drivers/power/charging_algo_pse.c
new file mode 100644
index 000..0a0130a
--- /dev/null
+++ b/drivers/power/charging_algo_pse.c
@@ -0,0 +1,198 @@
+/*
+ * Copyright (C) 2012 Intel Corporation
+ *
+ * ~~
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * ~~
+ * Author: Jenny TC 
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include "power_supply.h"
+#include "power_supply_charger.h"
+
+/* 98% of CV is considered as voltage to detect Full */
+#define FULL_CV_MIN 98
+
+/* Offset to exit from maintenance charging. In maintenance charging
+*  if the voltage is less than the (maintenance_lower_threshold -
+*  MAINT_EXIT_OFFSET) then system can switch to normal charging
+*/
+#define MAINT_EXIT_OFFSET 50  /* mV */
+
+static int get_tempzone(struct psy_pse_chrg_prof *pse_mod_bprof,
+   int temp)
+{
+
+   int i = 0;
+   int temp_range_cnt = min_t(u16, pse_mod_bprof->temp_mon_ranges,
+   BATT_TEMP_NR_RNG);
+
+   if ((temp < pse_mod_bprof->temp_low_lim) ||
+   (temp > pse_mod_bprof->temp_mon_range[0].temp_up_lim))
+   return -EINVAL;
+
+   for (i = 0; i < temp_range_cnt; ++i)
+   if (temp > pse_mod_bprof->temp_mon_range[i].temp_up_lim)
+   break;
+   return i-1;
+}
+
+static inline bool __is_battery_full
+   (long volt, long cur, long iterm, unsigned long cv)
+{
+   pr_devel("%s:current=%ld pse_mod_bprof->chrg_term_mA =%ld 
voltage_now=%ld full_cond=%ld",
+   __func__, cur, iterm, volt * 100, (FULL_CV_MIN * cv));
+
+   return (cur > 0) && (cur <= iterm) &&
+   ((volt * 100)  >= (FULL_CV_MIN * cv));
+
+}
+
+static inline bool is_battery_full(struct psy_batt_props bat_prop,
+   struct psy_pse_chrg_prof *pse_mod_bprof, unsigned long cv)
+{
+
+   int i;
+   /* Software full detection. Check the battery charge current to detect
+   *  battery Full. The 

Re: [PATCH] cpufreq: cpu0: make THERMAL_CPU support optional

2014-02-03 Thread Viresh Kumar
On 3 February 2014 21:44, Rob Herring  wrote:
> That's certainly fine by me, but I don't know which platforms those are.

Probably OMAP as the author came from TI and has tested it on OMAPs

https://lkml.org/lkml/2013/9/26/787

> BTW, REGULATOR could probably be dropped as well. It certainly works
> w/o a regulator as highbank does not define one.

Sure. Get that out as well in the same patch.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 1/4] power_supply: Add inlmt,iterm, min/max temp props

2014-02-03 Thread Jenny TC
Add new power supply properties for input current, charge termination
current, min and max temperature

POWER_SUPPLY_PROP_TEMP_MIN - minimum operatable temperature
POWER_SUPPLY_PROP_TEMP_MAX - maximum operatable temperature

POWER_SUPPLY_PROP_INPUT_CURRENT_LIMIT - input current limit programmed by
charger. Indicates the input current for a charging source.

POWER_SUPPLY_PROP_CHARGE_TERM_CURRENT - charge termination current used to
detect the end of charge condition

Signed-off-by: Jenny TC 
---
 Documentation/power/power_supply_class.txt |6 ++
 drivers/power/power_supply_sysfs.c |4 
 include/linux/power_supply.h   |4 
 3 files changed, 14 insertions(+)

diff --git a/Documentation/power/power_supply_class.txt 
b/Documentation/power/power_supply_class.txt
index 89a8816..48cff88 100644
--- a/Documentation/power/power_supply_class.txt
+++ b/Documentation/power/power_supply_class.txt
@@ -118,6 +118,10 @@ relative, time-based measurements.
 CONSTANT_CHARGE_CURRENT - constant charge current programmed by charger.
 CONSTANT_CHARGE_CURRENT_MAX - maximum charge current supported by the
 power supply object.
+INPUT_CURRENT_LIMIT - input current limit programmed by charger. Indicates
+the current drawn from a charging source.
+CHARGE_TERM_CURRENT - Charge termination current used to detect the end of 
charge
+condition.
 
 CONSTANT_CHARGE_VOLTAGE - constant charge voltage programmed by charger.
 CONSTANT_CHARGE_VOLTAGE_MAX - maximum charge voltage supported by the
@@ -140,6 +144,8 @@ TEMP_ALERT_MAX - maximum battery temperature alert.
 TEMP_AMBIENT - ambient temperature.
 TEMP_AMBIENT_ALERT_MIN - minimum ambient temperature alert.
 TEMP_AMBIENT_ALERT_MAX - maximum ambient temperature alert.
+TEMP_MIN - minimum operatable temperature
+TEMP_MAX - maximum operatable temperature
 
 TIME_TO_EMPTY - seconds left for battery to be considered empty (i.e.
 while battery powers a load)
diff --git a/drivers/power/power_supply_sysfs.c 
b/drivers/power/power_supply_sysfs.c
index 44420d1..750a202 100644
--- a/drivers/power/power_supply_sysfs.c
+++ b/drivers/power/power_supply_sysfs.c
@@ -167,6 +167,7 @@ static struct device_attribute power_supply_attrs[] = {
POWER_SUPPLY_ATTR(constant_charge_voltage_max),
POWER_SUPPLY_ATTR(charge_control_limit),
POWER_SUPPLY_ATTR(charge_control_limit_max),
+   POWER_SUPPLY_ATTR(input_current_limit),
POWER_SUPPLY_ATTR(energy_full_design),
POWER_SUPPLY_ATTR(energy_empty_design),
POWER_SUPPLY_ATTR(energy_full),
@@ -178,6 +179,8 @@ static struct device_attribute power_supply_attrs[] = {
POWER_SUPPLY_ATTR(capacity_alert_max),
POWER_SUPPLY_ATTR(capacity_level),
POWER_SUPPLY_ATTR(temp),
+   POWER_SUPPLY_ATTR(temp_max),
+   POWER_SUPPLY_ATTR(temp_min),
POWER_SUPPLY_ATTR(temp_alert_min),
POWER_SUPPLY_ATTR(temp_alert_max),
POWER_SUPPLY_ATTR(temp_ambient),
@@ -189,6 +192,7 @@ static struct device_attribute power_supply_attrs[] = {
POWER_SUPPLY_ATTR(time_to_full_avg),
POWER_SUPPLY_ATTR(type),
POWER_SUPPLY_ATTR(scope),
+   POWER_SUPPLY_ATTR(charge_term_current),
/* Properties of type `const char *' */
POWER_SUPPLY_ATTR(model_name),
POWER_SUPPLY_ATTR(manufacturer),
diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h
index c9dc4e0..0278600 100644
--- a/include/linux/power_supply.h
+++ b/include/linux/power_supply.h
@@ -120,6 +120,7 @@ enum power_supply_property {
POWER_SUPPLY_PROP_CONSTANT_CHARGE_VOLTAGE_MAX,
POWER_SUPPLY_PROP_CHARGE_CONTROL_LIMIT,
POWER_SUPPLY_PROP_CHARGE_CONTROL_LIMIT_MAX,
+   POWER_SUPPLY_PROP_INPUT_CURRENT_LIMIT,
POWER_SUPPLY_PROP_ENERGY_FULL_DESIGN,
POWER_SUPPLY_PROP_ENERGY_EMPTY_DESIGN,
POWER_SUPPLY_PROP_ENERGY_FULL,
@@ -131,6 +132,8 @@ enum power_supply_property {
POWER_SUPPLY_PROP_CAPACITY_ALERT_MAX, /* in percents! */
POWER_SUPPLY_PROP_CAPACITY_LEVEL,
POWER_SUPPLY_PROP_TEMP,
+   POWER_SUPPLY_PROP_TEMP_MAX,
+   POWER_SUPPLY_PROP_TEMP_MIN,
POWER_SUPPLY_PROP_TEMP_ALERT_MIN,
POWER_SUPPLY_PROP_TEMP_ALERT_MAX,
POWER_SUPPLY_PROP_TEMP_AMBIENT,
@@ -142,6 +145,7 @@ enum power_supply_property {
POWER_SUPPLY_PROP_TIME_TO_FULL_AVG,
POWER_SUPPLY_PROP_TYPE, /* use power_supply.type instead */
POWER_SUPPLY_PROP_SCOPE,
+   POWER_SUPPLY_PROP_CHARGE_TERM_CURRENT,
/* Properties of type `const char *' */
POWER_SUPPLY_PROP_MODEL_NAME,
POWER_SUPPLY_PROP_MANUFACTURER,
-- 
1.7.9.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v5 0/4] power_supply: Introduce power supply charging driver

2014-02-03 Thread Jenny TC
v1: introduced feature as a framework within power supply class driver with
separate files for battid framework and charging framework
v2: fixed review comments, moved macros and inline functions to power_supply.h
v3: moved the feature as a separate driver, combined battid framework and
charging framework inside the power supply charging driver. Moved
charger specific properties to power_supply_charger.h and plugged the
driver with power supply subsystem using power_supply_notifier
introduced in my previous patch. Also a sample charger chip driver
(bq24261) patch added to give more idea on the psy charging driver
usage
v4: Fixed review comments, no major design changes.
v5: Fixed makefile inconsistencies, removed unused pdata callbacks

The Power Supply charging driver connects multiple subsystems
to do charging in a generic way. The subsystems involved are the power_supply,
thermal and battery communication (1wire) subsystems. With this the charging is
handled in a generic way.

The driver makes use of different new features - Battery Identification
interfaces, pluggable charging algorithms, charger cable arbitrations etc.
The patch also introduces generic interface for charger cable notifications.
Charger cable events and capabilities can be notified using the generic
power_supply_notifier chain.

Overall this driver removes the charging logic out of the charger chip driver
and the charger chip driver can just listen to the request from the power
supply charging driver to set the charger properties. This can be implemented
by exposing get_property and set_property callbacks.

Jenny TC (4):
  power_supply: Add inlmt,iterm, min/max temp props
  power_supply: Introduce generic psy charging driver
  power_supply: Introduce PSE compliant algorithm
  power_supply: bq24261 charger driver

 Documentation/power/power_supply_charger.txt |  339 +++
 Documentation/power/power_supply_class.txt   |6 +
 drivers/power/Kconfig|   31 +
 drivers/power/Makefile   |3 +
 drivers/power/bq24261-charger.c  | 1364 ++
 drivers/power/charging_algo_pse.c|  198 
 drivers/power/power_supply_charger.c | 1196 ++
 drivers/power/power_supply_charger.h |  218 
 drivers/power/power_supply_core.c|3 +
 drivers/power/power_supply_sysfs.c   |4 +
 include/linux/power/bq24261-charger.h|   25 +
 include/linux/power/power_supply_charger.h   |  237 +
 include/linux/power_supply.h |  164 
 13 files changed, 3788 insertions(+)
 create mode 100644 Documentation/power/power_supply_charger.txt
 create mode 100644 drivers/power/bq24261-charger.c
 create mode 100644 drivers/power/charging_algo_pse.c
 create mode 100644 drivers/power/power_supply_charger.c
 create mode 100644 drivers/power/power_supply_charger.h
 create mode 100644 include/linux/power/bq24261-charger.h
 create mode 100644 include/linux/power/power_supply_charger.h

-- 
1.7.9.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


linux-next: Tree for Feb 4

2014-02-03 Thread Stephen Rothwell
Hi all,

This tree fails (more than usual) the powerpc allyesconfig build.

Changes since 20140203:

Dropped tree: parisc-hd

Undropped tree: btrfs

The parisc-hd tree gained conflicts against its rebased version in Linus'
tree, so I dropped it for today.

The powerpc tree still had its build failure.

The btrfs tree lost its conflicts against Linus' tree so it is back.

The init tree lost a patch.

Non-merge commits (relative to Linus' tree): 1151
 1636 files changed, 25791 insertions(+), 10541 deletions(-)



I have created today's linux-next tree at
git://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git
(patches at http://www.kernel.org/pub/linux/kernel/next/ ).  If you
are tracking the linux-next tree using git, you should not use "git pull"
to do so as that will try to merge the new linux-next release with the
old one.  You should use "git fetch" as mentioned in the FAQ on the wiki
(see below).

You can see which trees have been included by looking in the Next/Trees
file in the source.  There are also quilt-import.log and merge.log files
in the Next directory.  Between each merge, the tree was built with
a ppc64_defconfig for powerpc and an allmodconfig for x86_64 and a
multi_v7_defconfig for arm. After the final fixups (if any), it is also
built with powerpc allnoconfig (32 and 64 bit), ppc44x_defconfig and
allyesconfig (minus CONFIG_PROFILE_ALL_BRANCHES - this fails its final
link) and i386, sparc, sparc64 and arm defconfig. These builds also have
CONFIG_ENABLE_WARN_DEPRECATED, CONFIG_ENABLE_MUST_CHECK and
CONFIG_DEBUG_INFO disabled when necessary.

Below is a summary of the state of the merge.

I am currently merging 208 trees (counting Linus' and 28 trees of patches
pending for Linus' tree).

Stats about the size of the tree over time can be seen at
http://neuling.org/linux-next-size.html .

Status of my local build tests will be at
http://kisskb.ellerman.id.au/linux-next .  If maintainers want to give
advice about cross compilers/configs that work, we are always open to add
more builds.

Thanks to Randy Dunlap for doing many randconfig builds.  And to Paul
Gortmaker for triage and bug fixes.

There is a wiki covering stuff to do with linux-next at
http://linux.f-seidel.de/linux-next/pmwiki/ .  Thanks to Frank Seidel.

-- 
Cheers,
Stephen Rothwells...@canb.auug.org.au

$ git checkout master
$ git reset --hard stable
Merging origin/master (38dbfb59d117 Linus 3.14-rc1)
Merging fixes/master (b0031f227e47 Merge tag 's2mps11-build' of 
git://git.kernel.org/pub/scm/linux/kernel/git/broonie/regulator)
Merging kbuild-current/rc-fixes (19514fc665ff arm, kbuild: make "make install" 
not depend on vmlinux)
Merging arc-current/for-curr (7e22e91102c6 Linux 3.13-rc8)
Merging arm-current/fixes (d326b65c57d6 ARM: fix building with gcc 4.6.4)
Merging m68k-current/for-linus (56931d73697c m68k/mac: Make SCC reset work more 
reliably)
Merging metag-fixes/fixes (3b2f64d00c46 Linux 3.11-rc2)
Merging powerpc-merge/merge (b3084f4db3ae powerpc/thp: Fix crash on mremap)
Merging sparc/master (9b0cd304f26b Merge branch 'drm-next' of 
git://people.freedesktop.org/~airlied/linux)
Merging net/master (b045d37bd68c ip_tunnel: fix panic in ip_tunnel_xmit())
Merging ipsec/master (965cdea82569 dccp: catch failed request_module call in 
dccp_probe init)
Merging sound-current/for-linus (4fa71c1550a8 ALSA: usb-audio: Add missing 
kconfig dependecy)
Merging pci-current/for-linus (38dbfb59d117 Linus 3.14-rc1)
Merging wireless/master (53d8ab29f8f6 Merge branch 'for-3.14/drivers' of 
git://git.kernel.dk/linux-block)
Merging driver-core.current/driver-core-linus (90804ed61f24 Merge branch 
'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/jack/linux-fs)
Merging tty.current/tty-linus (413541dd66d5 Linux 3.13-rc5)
Merging usb.current/usb-linus (90804ed61f24 Merge branch 'for_linus' of 
git://git.kernel.org/pub/scm/linux/kernel/git/jack/linux-fs)
Merging staging.current/staging-linus (77d143de7581 Merge branch 'for-linus' of 
git://git.kernel.org/pub/scm/linux/kernel/git/rw/uml)
Merging char-misc.current/char-misc-linus (90804ed61f24 Merge branch 
'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/jack/linux-fs)
Merging input-current/for-linus (55df811f2066 Merge branch 'next' into 
for-linus)
Merging md-current/for-linus (d47648fcf061 raid5: avoid finding "discard" 
stripe)
Merging crypto-current/master (ee97dc7db4cb crypto: s390 - fix des and des3_ede 
ctr concurrency issue)
Merging ide/master (9b0cd304f26b Merge branch 'drm-next' of 
git://people.freedesktop.org/~airlied/linux)
Merging dwmw2/master (5950f0803ca9 pcmcia: remove RPX board stuff)
Merging devicetree-current/devicetree/merge (6f041e99fc7b of: Fix NULL 
dereference in unflatten_and_copy())
Merging rr-fixes/fixes (7122c3e9154b scripts/link-vmlinux.sh: only filter 
kernel symbols for arm)
Merging mfd-fixes/mas

Re: [PATCH] Clarify CONFIG_DEBUG_INFO's bloaty nature

2014-02-03 Thread Borislav Petkov
On Mon, Feb 03, 2014 at 02:57:18PM -0800, Andrew Morton wrote:
> On Mon, 3 Feb 2014 14:47:15 -0800 Linus Torvalds 
>  wrote:
> 
> > On Mon, Feb 3, 2014 at 2:00 PM, David Rientjes  wrote:
> > >
> > > How do you define "huge bloat" if the size of vmlinux doesn't increase?
> > 
> > Don't be silly. The size of all the object files increase *hugely*.
> 
> yup, I disable this in my allmodconfig testing, to great effect.
> 
> That being said, I do think the text should make clear that the bloat
> is a compile-time impact and not a runtime one.  Something like
> 
> --- 
> a/lib/Kconfig.debug~lib-kconfigdebug-clarify-config_debug_infos-bloaty-nature-fix
> +++ a/lib/Kconfig.debug
> @@ -128,9 +128,9 @@ config DEBUG_INFO
> tools like crash, kgdb, LKCD, gdb, etc on the kernel.
>  
> If you only want to have resolved symbols in kernel traces and
> -   are not going to need support for those tools above, you don't need
> -   to enable this as it is a huge bloat and build slowdown;
> -   enable CONFIG_KALLSYMS instead.
> +   are not going to need support for the above tools, you don't need
> +   to enable this.  It hugely bloats object files' on-disk sizes and slows
> +   the build.  Enable CONFIG_KALLSYMS instead.

Yes, this is better.

Andrew, can you add that, or do you want me to send a new version?

Thanks.

-- 
Regards/Gruss,
Boris.

Sent from a fat crate under my desk. Formatting is fine.
--
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 2/2] f2fs: clean up with a macro

2014-02-03 Thread Jaegeuk Kim
This patch adds GET_BLKOFF_FROM_SEG0 to clean up some code.

Signed-off-by: Jaegeuk Kim 
---
 fs/f2fs/recovery.c |  3 +--
 fs/f2fs/segment.c  | 11 ---
 fs/f2fs/segment.h  |  3 +++
 3 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index f1b0b89..bda04a0 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -218,8 +218,7 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info 
*sbi,
 {
struct seg_entry *sentry;
unsigned int segno = GET_SEGNO(sbi, blkaddr);
-   unsigned short blkoff = GET_SEGOFF_FROM_SEG0(sbi, blkaddr) &
-   (sbi->blocks_per_seg - 1);
+   unsigned short blkoff = GET_BLKOFF_FROM_SEG0(sbi, blkaddr);
struct f2fs_summary sum;
nid_t ino, nid;
void *kaddr;
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index fba510b..e87946a 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -405,7 +405,7 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, 
block_t blkaddr, int del)
 
se = get_seg_entry(sbi, segno);
new_vblocks = se->valid_blocks + del;
-   offset = GET_SEGOFF_FROM_SEG0(sbi, blkaddr) & (sbi->blocks_per_seg - 1);
+   offset = GET_BLKOFF_FROM_SEG0(sbi, blkaddr);
 
f2fs_bug_on((new_vblocks >> (sizeof(unsigned short) << 3) ||
(new_vblocks > sbi->blocks_per_seg)));
@@ -987,8 +987,7 @@ void recover_data_page(struct f2fs_sb_info *sbi,
change_curseg(sbi, type, true);
}
 
-   curseg->next_blkoff = GET_SEGOFF_FROM_SEG0(sbi, new_blkaddr) &
-   (sbi->blocks_per_seg - 1);
+   curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr);
__add_sum_entry(sbi, type, sum);
 
refresh_sit_entry(sbi, old_blkaddr, new_blkaddr);
@@ -1026,8 +1025,7 @@ void rewrite_node_page(struct f2fs_sb_info *sbi,
curseg->next_segno = segno;
change_curseg(sbi, type, true);
}
-   curseg->next_blkoff = GET_SEGOFF_FROM_SEG0(sbi, new_blkaddr) &
-   (sbi->blocks_per_seg - 1);
+   curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr);
__add_sum_entry(sbi, type, sum);
 
/* change the current log to the next block addr in advance */
@@ -1035,8 +1033,7 @@ void rewrite_node_page(struct f2fs_sb_info *sbi,
curseg->next_segno = next_segno;
change_curseg(sbi, type, true);
}
-   curseg->next_blkoff = GET_SEGOFF_FROM_SEG0(sbi, next_blkaddr) &
-   (sbi->blocks_per_seg - 1);
+   curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, next_blkaddr);
 
/* rewrite node page */
set_page_writeback(page);
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index 5731682..4024546 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -57,6 +57,9 @@
((blk_addr) - SM_I(sbi)->seg0_blkaddr)
 #define GET_SEGNO_FROM_SEG0(sbi, blk_addr) \
(GET_SEGOFF_FROM_SEG0(sbi, blk_addr) >> sbi->log_blocks_per_seg)
+#define GET_BLKOFF_FROM_SEG0(sbi, blk_addr)\
+   (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) & (sbi->blocks_per_seg - 1))
+
 #define GET_SEGNO(sbi, blk_addr)   \
(((blk_addr == NULL_ADDR) || (blk_addr == NEW_ADDR)) ?  \
NULL_SEGNO : GET_L2R_SEGNO(FREE_I(sbi), \
-- 
1.8.4.474.g128a96c

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 1/2] f2fs: fix the potential mismatch between dir's i_size and i_blocks

2014-02-03 Thread Jaegeuk Kim
This is the erroneous scenario.

                      i_size    on-disk i_size    i_blocks
__f2fs_add_link()     4096      4096              2
 get_new_data_page    8192      4096              3
 -ENOSPC = init_inode_metadata
 checkpoint           -         4096              3
 POR and reboot

__f2fs_add_link()     4096      4096              3
 page = get_new_data_page (page->index = 1 by NEW_ADDR)
 add a dentry to the page successfully

f2fs_rmdir()
 f2fs_empty_dir()     4096      4096              3
 f2fs_unlink() goes, since there is no valid dentry due to i_size = 4096.
 But, still there is one dentry in page->index = 1.

So this patch moves the code to write dir->i_size into on-disk i_size in order
to sync dir's i_size, on-disk i_size, and its i_blocks.

Signed-off-by: Jaegeuk Kim 
---
 fs/f2fs/dir.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index 2b7c255..bfcb4ae 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -395,9 +395,6 @@ static void update_parent_metadata(struct inode *dir, 
struct inode *inode,
set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR);
}
 
-   if (is_inode_flag_set(F2FS_I(dir), FI_UPDATE_DIR))
-   update_inode_page(dir);
-
if (is_inode_flag_set(F2FS_I(inode), FI_INC_LINK))
clear_inode_flag(F2FS_I(inode), FI_INC_LINK);
 }
@@ -511,7 +508,10 @@ add_dentry:
 
update_parent_metadata(dir, inode, current_depth);
 fail:
-   clear_inode_flag(F2FS_I(dir), FI_UPDATE_DIR);
+   if (is_inode_flag_set(F2FS_I(dir), FI_UPDATE_DIR)) {
+   update_inode_page(dir);
+   clear_inode_flag(F2FS_I(dir), FI_UPDATE_DIR);
+   }
kunmap(dentry_page);
f2fs_put_page(dentry_page, 1);
return err;
-- 
1.8.4.474.g128a96c

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: Do we really need curr_target in signal_struct ?

2014-02-03 Thread Rakib Mullick
On Mon, Feb 3, 2014 at 10:39 PM, Oleg Nesterov  wrote:
> On 02/02, Rakib Mullick wrote:
>
>> > As I already said it caches the last wants_signal(t) thread?
>> Yes, you said. But, gets messed up at exit path, not useful everytime.
>
> Yes.
>
>> If fails, p gets checked twice.
>
> Yes, but this is minor, I think.
>
Right.

>> >> I took a look and found that using while_each_thread()
>> >> can make things better than current.
>> >
>> > Why?
>> >
>> using while_each_thread() we can start thread traversing from p, which
>> is a likely
>> pick and also gets away from redundant checking of p.
>
> Heh. We always check "p" first. And (in general) we do not want to start
> from "p" if we want to find a wants_signal() thread, and ->curr_target can
> help to avoid this.
>
>> >> What do you think?
>> >
>> > The patch is technically wrong, a group-wide signal doesn't check all
>> > threads after this change.
>> If group is empty, we don't need to check other than t.
>
> I didn't meant the thread_group_empty() case. Please look at your code:
>
>
> if (!group || thread_group_empty(p)) {
> if (wants_signal(sig, t))
> goto found;
> } else {
> while_each_thread(p, t) {
> if (wants_signal(sig, t))
> goto found;
> }
> }
>
> Suppose that group == T, thread_group_empty(p) == F. Suppose that all
> sub-threads except "p" blocked this signal. With this change "p" (and
> thus the whole thread group) won't be notified. IOW, with your change
> we do not check "p" at all. This is wrong.
>
Oh, sorry, my bad. That was wrong.

> The only user of ->curr_target is complete_signal(), you have found it.
>
Indeed.

>
> I can only read the current code. I do not know the original intent.
>
This is where things are confusing.


> Really?
>
Sometimes, 100% correct (!group case) ;-).

>
> Yes (except a thread can't be killed), so what? Obviously, if ->curr_target
> exits we should update this pointer. We could even nullify it.
>
That makes ->curr_target less useful, that's what I meant.

>
> Yes, "p" can be checked twice. I don't think this is that bad, and I
> do not think this particular "problem" should be fixed.
>
Yes, it's minor.

>
> I simply can't understand. Why? I do not think so.
>
Cause, want_signal logic checks these thread attributes to find whether it's
eligible or not.

>> We can achieve the same without ->curr_signal
>> by traversing thread group from the lastly created thread.
>
> We certainly can't "achieve the same" this way, although I am not sure
> what this "the same" actually means.
>
>> So, this is what I think. Let me know if these reason's looks reasonable to 
>> you,
>
> No. Contrary, whatever I personally think about ->curr_signal, I feel
> that you do not understand the code you are trying to change. Sorry,
> I can be wrong. But I still do not see any argument.
>
Yes, right. I do not fully understand this code, also how it exactly puts impact
on signaling subsystems. And, therefore, I think I should not make any
changes in this code.

>> cause before Ingo or Andrew taking it, it requires your ack.
>
> Not really. And of course I'll review the patch correctness-wise, and
> I already sent the change in complete_signal() which looks right to me.
>
> But I am not going to ack the behaviour change, simply because I have
> no idea how this can impact the existing applications. Perhaps nobody
> will notice this change, but we can't know this.
>
Yes, I'm not also sure about the behavior change and it's impact over
existing applications, so, I'm skipping it.

I usually try to make small fixes, cleanup; cause it's less error-prone and
requires less follow-up. Since the things here becoming sort of "don't know"
thing, I think I should stop. But, thank you for helping and replying in this
thread.

Again thanks,
Rakib.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 3/8] mm/swap: prevent concurrent swapon on the same S_ISBLK blockdev

2014-02-03 Thread Hugh Dickins
On Mon, 3 Feb 2014, Andrew Morton wrote:
> On Mon, 27 Jan 2014 18:03:04 +0800 Weijie Yang  
> wrote:
> 
> > When swapon the same S_ISBLK blockdev concurrent, the allocated two
> > swap_info could hold the same block_device, because claim_swapfile()
> > allow the same holder(here, it is sys_swapon function).
> > 
> > To prevent this situation, This patch adds swap_lock protect to ensure
> > we can find this situation and return -EBUSY for one swapon call.
> > 
> > As for S_ISREG swapfile, claim_swapfile() already prevent this scenario
> > by holding inode->i_mutex.
> > 
> > This patch is just for a rare scenario, aim to correct of code.
> > 
> 
> hm, OK.  Would it be saner to pass a unique `holder' to
> claim_swapfile()?  Say, `p'?
> 
> Truly, I am fed up with silly swapon/swapoff races.  How often does
> anyone call these things?  Let's slap a huge lock around the whole
> thing and be done with it?

That answer makes me sad: we can't be bothered to get it right,
even when Weijie goes to the trouble of presenting a series to do so.
But I sure don't deserve a vote until I've actually looked through it.

Hugh
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 00/16] [RFC] Staging updates from the Android tree

2014-02-03 Thread Greg KH
On Mon, Feb 03, 2014 at 10:16:12AM -0800, John Stultz wrote:
> I recently went through the AOSP common.git android/3.10 tree to
> try to pull fixes that haven't been submitted upstream. I've
> cherry picked those patches and wanted to submit them here for
> review, and for hopeful inclusion into staging for 3.15.
> 
> In most cases the patches cherry-picked right over. In a few cases,
> there were collisions due to trivial changes and cleanups like
> spelling fixes. However, the "ion: Move shrinker out of heaps"
> patch required more complicated merge, due to the shrinker api
> change upstream in 3.12.  Things build and appear to work, but
> I'd appreciate extra review there.
> 
> Anyway, please let me know if there's any feedback or suggestions. 

As this series is ordered, I can't take any of them for 3.14-final
(patch 1 is a cleanup patch, not for 3.14.)

Care to make 2 series, one for things you feel should be in 3.14 (i.e.
bugfixes), and the other for what can wait for 3.15 (i.e. uapi header
file stuff)?

thanks,

greg k-h
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 2/8] mm/swap: fix race on swap_info reuse between swapoff and swapon

2014-02-03 Thread Hugh Dickins
On Mon, 3 Feb 2014, Andrew Morton wrote:
> On Mon, 3 Feb 2014 15:23:40 -0800 Andrew Morton  
> wrote:
> > On Mon, 27 Jan 2014 18:03:04 +0800 Weijie Yang  
> > wrote:
> > 
> > > swapoff clear swap_info's SWP_USED flag prematurely and free its resources
> > > after that. A concurrent swapon will reuse this swap_info while its 
> > > previous
> > > resources are not cleared completely.
> > > 
> > > These late freed resources are:
> > >  - p->percpu_cluster
> > >  - swap_cgroup_ctrl[type]
> > >  - block_device setting
> > >  - inode->i_flags &= ~S_SWAPFILE
> > > 
> > > This patch clear SWP_USED flag after all its resources freed, so that 
> > > swapon
> > > can reuse this swap_info by alloc_swap_info() safely.
> > > 
> > > This patch is just for a rare scenario, aim to correct of code.
> > 
> > I believe that
> > http://ozlabs.org/~akpm/mmots/broken-out/mm-swap-fix-race-on-swap_info-reuse-between-swapoff-and-swapon.patch
> > makes this patch redundant?
> > 
> 
> oop, hang on.  This patch *is* a stealth-updated version of
> http://ozlabs.org/~akpm/mmots/broken-out/mm-swap-fix-race-on-swap_info-reuse-between-swapoff-and-swapon.patch.
> 
> Undocumented removals of si->swap_map have been added.  What's going on
> there?
> 
> I think I'll stick with the original patch for now.  If you see
> additional optimisations or changes, let's address that separately?

Correct decision, thanks: I explained in an answer when Acking the
previous version why I dislike this version (it would prevent you
from watching the slow progress of swapoff).

Hugh
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v3 5/6] fat: permit to return phy block number by fibmap in fallocated region

2014-02-03 Thread Namjae Jeon
2014-02-04, OGAWA Hirofumi :
> Namjae Jeon  writes:
>
/* fat_get_cluster() assumes the requested blocknr isn't truncated.
 */
down_read(&MSDOS_I(mapping->host)->truncate_lock);
 +  /* To get block number beyond file size in fallocated region */
 +  atomic_set(&MSDOS_I(mapping->host)->beyond_isize, 1);
blocknr = generic_block_bmap(mapping, block, fat_get_block);
 +  atomic_set(&MSDOS_I(mapping->host)->beyond_isize, 0);
up_read(&MSDOS_I(mapping->host)->truncate_lock);
>>>
>>> This is racy. While user is using bmap, kernel can allocate new blocks.
>>> We should use another function for this.
>> I understand that fat can map fallocated blocks in read case while
>> user is using bmap.
>> But I can not find the case allocate new blocks.
>> If I am missing something, Could you please elaborate more ?
>> Is it a case of _bmap request returning the block number for block
>> allocated in parallel write path ?
>
> ->beyond_isize is global for inode. So, write(2) path on same inode with
> bmap() also can see 1 set by bmap() while another process is using bmap().
'create' flag  will be 1 in write(2) path. ->beyond_isize will only be
checked when 'create' flag is 0. Is there any case to be racy by
beyond_isize in write(2) path ?

Thanks.
> --
> OGAWA Hirofumi 
>
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: mm: BUG in do_huge_pmd_wp_page

2014-02-03 Thread Hugh Dickins
On Mon, 3 Feb 2014, Sasha Levin wrote:
> On 04/25/2013 10:01 PM, Dave Jones wrote:
> > On Thu, Apr 25, 2013 at 08:51:27PM -0400, Sasha Levin wrote:
> >   > On 04/24/2013 06:46 PM, Andrew Morton wrote:
> >   > > Guys, did this get fixed?
> >   >
> >   > I've stopped seeing that during fuzzing, so I guess that it got fixed
> > somehow...
> > 
> > We've had reports of users hitting this in 3.8
> > 
> > eg:
> > https://bugzilla.redhat.com/show_bug.cgi?id=947985
> > https://bugzilla.redhat.com/show_bug.cgi?id=956730
> > 
> > I'm sure there are other reports of it too.
> > 
> > Would be good if we can figure out what fixed it (if it is actually fixed)
> > for backporting to stable
> 
> It's been a while (7 months?), but this one is back...
> 
> Just hit it again with today's -next:
> 
> [  762.701278] BUG: unable to handle kernel paging request at
> 88009eae6000
> [  762.702462] IP: [] copy_page_rep+0x5/0x10
> [  762.703369] PGD 84bb067 PUD 22fa81067 PMD 22f98b067 PTE 80009eae6060
> [  762.704411] Oops:  [#1] PREEMPT SMP DEBUG_PAGEALLOC
> [  762.705873] Dumping ftrace buffer:
> [  762.707606](ftrace buffer empty)
> [  762.708311] Modules linked in:
> [  762.708762] CPU: 16 PID: 17920 Comm: trinity-c16 Tainted: GW
> 3.13.0-next-2
> 0140203-sasha-7-gf4985e2 #23
> [  762.710135] task: 8801ac358000 ti: 880199234000 task.ti:
> 880199234000
> [  762.710135] RIP: 0010:[]  []
> copy_page_rep+0x5/0x
> 10
> [  762.710135] RSP: 0018:880199235c90  EFLAGS: 00010286
> [  762.710135] RAX: 8002 RBX: 056db980 RCX:
> 0200
> [  762.710135] RDX: 8801ac358000 RSI: 88009eae6000 RDI:
> 88015b6e6000
> [  762.710135] RBP: 880199235cd8 R08:  R09:
> 
> [  762.710135] R10: 0001 R11:  R12:
> 027ab980
> [  762.710135] R13: 0200 R14: 00e6 R15:
> 8800
> [  762.710135] FS:  7fb0804e1700() GS:88003da0()
> knlGS:0
> 000
> [  762.710135] CS:  0010 DS:  ES:  CR0: 8005003b
> [  762.710135] CR2: 88009eae6000 CR3: 000199225000 CR4:
> 06e0
> [  762.710135] Stack:
> [  762.710135]  81298995 8801a841ae00 88003d084520
> 880199227090
> [  762.710135]  80009ea008e5 8801a841ae00 ea00027a8000
> 880199227090
> [  762.710135]  ea00056d8000 880199235d58 812d7260
> 880199235cf8
> [  762.710135] Call Trace:
> [  762.710135]  [] ? copy_user_huge_page+0x1a5/0x210
> [  762.710135]  [] do_huge_pmd_wp_page+0x3d0/0x650
> [  762.710135]  [] ? put_lock_stats+0xe/0x30
> [  762.710135]  [] __handle_mm_fault+0x2b1/0x3d0
> [  762.710135]  [] handle_mm_fault+0x133/0x1c0
> [  762.710135]  [] __get_user_pages+0x438/0x630
> [  762.710135]  [] ? put_lock_stats+0xe/0x30
> [  762.710135]  [] __mlock_vma_pages_range+0xd4/0xe0
> [  762.710135]  [] __mm_populate+0x110/0x190
> [  762.710135]  [] SyS_mlockall+0x160/0x1b0
> [  762.710135]  [] tracesys+0xdd/0xe2
> [  762.710135] Code: 90 90 90 90 90 90 9c fa 65 48 3b 06 75 14 65 48 3b 56 08
> 75 0d 65 48 89 1e 65 48 89 4e 08 9d b0 01 c3 9d 30 c0 c3 b9 00 02 00 00 
> 48 a5 c3 0f 1f 80 00
> 00 00 00 eb ee 66 66 66 90 66 66 66 90
> [  762.710135] RIP  [] copy_page_rep+0x5/0x10
> [  762.710135]  RSP 
> [  762.710135] CR2: 88009eae6000

Here's what I suggested about that one in eecc1e426d68
"thp: fix copy_page_rep GPF by testing is_huge_zero_pmd once only":
Note: this is not the same issue as trinity's DEBUG_PAGEALLOC BUG
in copy_page_rep with RSI: 88009c422000, reported by Sasha Levin
in https://lkml.org/lkml/2013/3/29/103.  I believe that one is due
to the source page being split, and a tail page freed, while copy
is in progress; and not a problem without DEBUG_PAGEALLOC, since
the pmd_same check will prevent a miscopy from being made visible.

It could be fixed by additional locking, or by taking an additional
reference on every tail page, in the DEBUG_PAGEALLOC case (we wouldn't
want to add to the overhead in the normal case).  I didn't feel very
motivated to uglify the code in that way just for DEBUG_PAGEALLOC and
trinity: if it only comes up once in seven months, I'm inclined to
live with it myself, but you may have a different perspective.

Hugh
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC 10/16] drm/nouveau/timer: skip calibration on GK20A

2014-02-03 Thread Ben Skeggs
On Sat, Feb 1, 2014 at 1:16 PM, Alexandre Courbot  wrote:
> GK20A's timer is directly attached to the system timer and cannot be
> calibrated. Skip the calibration phase on that chip since the
> corresponding registers do not exist.
Just a curiosity:  What timer resolution does the HW initialise at?

>
> Signed-off-by: Alexandre Courbot 
> ---
>  drivers/gpu/drm/nouveau/core/subdev/timer/nv04.c | 19 +--
>  1 file changed, 13 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/gpu/drm/nouveau/core/subdev/timer/nv04.c 
> b/drivers/gpu/drm/nouveau/core/subdev/timer/nv04.c
> index c0bdd10..822fe0d 100644
> --- a/drivers/gpu/drm/nouveau/core/subdev/timer/nv04.c
> +++ b/drivers/gpu/drm/nouveau/core/subdev/timer/nv04.c
> @@ -185,6 +185,10 @@ nv04_timer_init(struct nouveau_object *object)
> if (ret)
> return ret;
>
> +   /* gk20a does not have the calibration registers */
> +   if (device->chipset == 0xea)
> +   goto skip_clk_init;
> +
> /* aim for 31.25MHz, which gives us nanosecond timestamps */
> d = 1000000 / 32;
>
> @@ -235,20 +239,23 @@ nv04_timer_init(struct nouveau_object *object)
> d >>= 1;
> }
>
> -   /* restore the time before suspend */
> -   lo = priv->suspend_time;
> -   hi = (priv->suspend_time >> 32);
> -
> nv_debug(priv, "input frequency : %dHz\n", f);
> nv_debug(priv, "input multiplier: %d\n", m);
> nv_debug(priv, "numerator   : 0x%08x\n", n);
> nv_debug(priv, "denominator : 0x%08x\n", d);
> nv_debug(priv, "timer frequency : %dHz\n", (f * m) * d / n);
> -   nv_debug(priv, "time low: 0x%08x\n", lo);
> -   nv_debug(priv, "time high   : 0x%08x\n", hi);
>
> nv_wr32(priv, NV04_PTIMER_NUMERATOR, n);
> nv_wr32(priv, NV04_PTIMER_DENOMINATOR, d);
> +
> +skip_clk_init:
> +   /* restore the time before suspend */
> +   lo = priv->suspend_time;
> +   hi = (priv->suspend_time >> 32);
> +
> +   nv_debug(priv, "time low: 0x%08x\n", lo);
> +   nv_debug(priv, "time high   : 0x%08x\n", hi);
> +
> nv_wr32(priv, NV04_PTIMER_INTR_0, 0x);
> nv_wr32(priv, NV04_PTIMER_INTR_EN_0, 0x);
> nv_wr32(priv, NV04_PTIMER_TIME_1, hi);
> --
> 1.8.5.3
>
> ___
> dri-devel mailing list
> dri-de...@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/dri-devel
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] bio_integrity_add_page: check for BIO_POOL_NONE before determining nr_vecs on slab

2014-02-03 Thread Martin K. Petersen
> "David" == David Milburn  writes:

David> When enabling DIX T10-DIF-TYPE1-IP protection you can hit the
David> bip_vec full condition which fails to attach the integrity
David> metadata and returns 0 back to bio_integrity_prep()

Looks like Kent accidentally broke this when he changed the bvec pool
setup.

David> - if (bip->bip_vcnt >= bvec_nr_vecs(bip->bip_slab)) {
David> + if (bip->bip_slab != BIO_POOL_NONE &&
David> + bip->bip_vcnt >= bvec_nr_vecs(bip->bip_slab)) {
David>  printk(KERN_ERR "%s: bip_vec full\n", __func__);
David>  return 0;
David>  }

We still need to check that the page will actually fit, though:


block: Fix nr_vecs for inline integrity vectors

Commit 9f060e2231ca changed the way we handle allocations for the
integrity vectors. When the vectors are inline there is no associated
slab and consequently bvec_nr_vecs() returns 0. Ensure that we check
against BIP_INLINE_VECS in that case.

Reported-by: David Milburn 
Signed-off-by: Martin K. Petersen 

diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index fc60b31453ee..6dea2b90b4d5 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -114,6 +114,14 @@ void bio_integrity_free(struct bio *bio)
 }
 EXPORT_SYMBOL(bio_integrity_free);
 
+static inline unsigned int bip_integrity_vecs(struct bio_integrity_payload 
*bip)
+{
+   if (bip->bip_slab == BIO_POOL_NONE)
+   return BIP_INLINE_VECS;
+
+   return bvec_nr_vecs(bip->bip_slab);
+}
+
 /**
  * bio_integrity_add_page - Attach integrity metadata
  * @bio:   bio to update
@@ -129,7 +137,7 @@ int bio_integrity_add_page(struct bio *bio, struct page 
*page,
struct bio_integrity_payload *bip = bio->bi_integrity;
struct bio_vec *iv;
 
-   if (bip->bip_vcnt >= bvec_nr_vecs(bip->bip_slab)) {
+   if (bip->bip_vcnt >= bip_integrity_vecs(bip)) {
printk(KERN_ERR "%s: bip_vec full\n", __func__);
return 0;
}
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC 07/16] drm/nouveau/bar/nvc0: support chips without BAR3

2014-02-03 Thread Ben Skeggs
On Sat, Feb 1, 2014 at 1:16 PM, Alexandre Courbot  wrote:
> Adapt the NVC0 BAR driver to make it able to support chips that do not
> expose a BAR3. When this happens, BAR1 is then used for USERD mapping
> and the BAR alloc() functions is disabled, making GPU objects unable
> to rely on BAR for data access and falling back to PRAMIN.
>
> Signed-off-by: Alexandre Courbot 
> ---
>  drivers/gpu/drm/nouveau/core/subdev/bar/nvc0.c | 115 
> +
>  1 file changed, 61 insertions(+), 54 deletions(-)
>
> diff --git a/drivers/gpu/drm/nouveau/core/subdev/bar/nvc0.c 
> b/drivers/gpu/drm/nouveau/core/subdev/bar/nvc0.c
> index 3f30db6..c2bb0e5 100644
> --- a/drivers/gpu/drm/nouveau/core/subdev/bar/nvc0.c
> +++ b/drivers/gpu/drm/nouveau/core/subdev/bar/nvc0.c
> @@ -79,87 +79,88 @@ nvc0_bar_unmap(struct nouveau_bar *bar, struct 
> nouveau_vma *vma)
>  }
>
>  static int
> -nvc0_bar_ctor(struct nouveau_object *parent, struct nouveau_object *engine,
> - struct nouveau_oclass *oclass, void *data, u32 size,
> - struct nouveau_object **pobject)
> +nvc0_bar_init_vm(struct nvc0_bar_priv *priv, int nr, int bar)
>  {
> -   struct nouveau_device *device = nv_device(parent);
> -   struct nvc0_bar_priv *priv;
> +   struct nouveau_device *device = nv_device(>base);
> struct nouveau_gpuobj *mem;
> struct nouveau_vm *vm;
> +   resource_size_t bar_len;
> int ret;
>
> -   ret = nouveau_bar_create(parent, engine, oclass, );
> -   *pobject = nv_object(priv);
> -   if (ret)
> -   return ret;
> -
> -   /* BAR3 */
> ret = nouveau_gpuobj_new(nv_object(priv), NULL, 0x1000, 0, 0,
> -   >bar[0].mem);
> -   mem = priv->bar[0].mem;
> +   >bar[nr].mem);
> +   mem = priv->bar[nr].mem;
> if (ret)
> return ret;
>
> ret = nouveau_gpuobj_new(nv_object(priv), NULL, 0x8000, 0, 0,
> -   >bar[0].pgd);
> +   >bar[nr].pgd);
> if (ret)
> return ret;
>
> -   ret = nouveau_vm_new(device, 0, nv_device_resource_len(device, 3), 0, 
> );
> +   bar_len = nv_device_resource_len(device, bar);
> +
> +   ret = nouveau_vm_new(device, 0, bar_len, 0, );
> if (ret)
> return ret;
>
> atomic_inc(>engref[NVDEV_SUBDEV_BAR]);
>
> -   ret = nouveau_gpuobj_new(nv_object(priv), NULL,
> -(nv_device_resource_len(device, 3) >> 12) * 
> 8,
> -0x1000, NVOBJ_FLAG_ZERO_ALLOC,
> ->pgt[0].obj[0]);
> -   vm->pgt[0].refcount[0] = 1;
> -   if (ret)
> -   return ret;
> +   /*
> +* Bootstrap page table lookup.
> +*/
> +   if (bar == 3) {
> +   ret = nouveau_gpuobj_new(nv_object(priv), NULL,
> +(bar_len >> 12) * 8, 0x1000,
> +NVOBJ_FLAG_ZERO_ALLOC,
> +   >pgt[0].obj[0]);
> +   vm->pgt[0].refcount[0] = 1;
> +   if (ret)
> +   return ret;
> +   }
>
> -   ret = nouveau_vm_ref(vm, >bar[0].vm, priv->bar[0].pgd);
> +   ret = nouveau_vm_ref(vm, >bar[nr].vm, priv->bar[nr].pgd);
> nouveau_vm_ref(NULL, , NULL);
> if (ret)
> return ret;
>
> -   nv_wo32(mem, 0x0200, lower_32_bits(priv->bar[0].pgd->addr));
> -   nv_wo32(mem, 0x0204, upper_32_bits(priv->bar[0].pgd->addr));
> -   nv_wo32(mem, 0x0208, lower_32_bits(nv_device_resource_len(device, 3) 
> - 1));
> -   nv_wo32(mem, 0x020c, upper_32_bits(nv_device_resource_len(device, 3) 
> - 1));
> +   nv_wo32(mem, 0x0200, lower_32_bits(priv->bar[nr].pgd->addr));
> +   nv_wo32(mem, 0x0204, upper_32_bits(priv->bar[nr].pgd->addr));
> +   nv_wo32(mem, 0x0208, lower_32_bits(bar_len - 1));
> +   nv_wo32(mem, 0x020c, upper_32_bits(bar_len - 1));
>
> -   /* BAR1 */
> -   ret = nouveau_gpuobj_new(nv_object(priv), NULL, 0x1000, 0, 0,
> -   >bar[1].mem);
> -   mem = priv->bar[1].mem;
> -   if (ret)
> -   return ret;
> +   return 0;
> +}
>
> -   ret = nouveau_gpuobj_new(nv_object(priv), NULL, 0x8000, 0, 0,
> -   >bar[1].pgd);
> -   if (ret)
> -   return ret;
> +static int
> +nvc0_bar_ctor(struct nouveau_object *parent, struct nouveau_object *engine,
> + struct nouveau_oclass *oclass, void *data, u32 size,
> + struct nouveau_object **pobject)
> +{
> +   struct nouveau_device *device = nv_device(parent);
> +   struct nvc0_bar_priv *priv;
> +   bool has_bar3 = nv_device_resource_len(device, 3) != 0;
> +   int ret;
>
> -   ret = nouveau_vm_new(device, 0, nv_device_resource_len(device, 1), 0, 
> );
> +   ret = 

Re: [RFC 00/16] drm/nouveau: initial support for GK20A (Tegra K1)

2014-02-03 Thread Ben Skeggs
On Sat, Feb 1, 2014 at 1:16 PM, Alexandre Courbot  wrote:
> Hello everyone,
Hey Alex,

The series looks pretty good to me.  I'll reply to the relevant
patches with any minor nit-picks on top of what's already been said by
others.

Thank you, and welcome to Nouveau :)

Ben.

>
> GK20A is the Kepler-based GPU used in the upcoming Tegra K1 chips. The 
> following
> patches perform architectural changes to Nouveau that are necessary to support
> non-PCI GPUs and add initial support for GK20A. Although the support is still
> very basic and more user-space changes will be needed to make the full 
> graphics
> stack run on top of it, we were able to successfully open channels and run
> simple pushbuffers with libdrm (more testing including rendering is in 
> progress
> as we get more familiar with Nouveau's user-space interface).
>
> This work should be considered as a RFC and a proof-of-concept for driving
> future Tegra GPUs with Nouveau. Some design choices need to be discussed and
> quite a few inelegant shortcuts were purposely taken to minimize the size of
> this first set. Or said otherwise, apart from the changes that add support for
> non-PCI GPUs, remarkably little code needs to be added to get GK20A to a point
> where it is actually running. This is very encouraging, and it will be
> interesting to keep improving this support and see where this gets us.
>
> The first part of this series (patches 01/09) adds support for platform 
> devices
> to Nouveau. Nouveau currently only supports PCI devices, and GK20A uses the
> platform bus and Device Tree. So the first step towards GK20A support is to
> abstract the PCI functions used by Nouveau (mainly resources range querying 
> and
> page mapping functions) and add platform device probing functions. For most of
> the existing chips, platform device support does not make any sense, so only 
> the
> subdev and engine drivers actually used by GK20A were updated to use these
> abstractions. If, for consistency reasons, it is deemed preferable to use them
> everywhere in the driver, we will do it in the next revision of this series.
>
> This part can be considered independently from the actual GK20A support, and I
> believe it would make sense to discuss what needs to be improved and drive it 
> to
> merge separately, as the remainder of the series will likely require more 
> work.
>
> The second part (10/14) updates existing subdev/engine drivers to support 
> GK20A,
> and adds a very simple memory driver that simulates dedicated video memory by
> allocating a large system memory chunk at boot time. This is clearly 
> sub-optimal
> and should not be merged, but allowed us to quickly bring GK20A up with 
> Nouveau.
> Other drivers changes are fairly small, and are here to handle the difference 
> in
> number of engines and units compared to desktop Kepler as well as to perform a
> few things usually done by the video BIOS (which Tegra does not feature).
>
> Finally, support for probing GK20A is added in the last 2 patches. It should 
> be
> noted that contrary to what Nouveau currently expects, GK20A does not embed 
> any
> display hardware (that part being handled by tegradrm). So this driver should
> really be only used through DRM render-nodes and collaborate with the display
> driver using PRIME. I have not yet figured out how to turn GK20A's 
> instantiation
> of Nouveau into a render-node only driver without breaking support for 
> existing
> desktop GPUs, and consequently the driver spawns a /dev/dri/cardX node which 
> we
> should try to get rid of.
>
> I guess my email address might surprise some of you, so let me anticipate some
> questions you might have. :P Yes, this work is endorsed by NVIDIA. Several 
> other
> NVIDIAns (CC'd), including core GPU experts, have provided significant 
> technical
> guidance and will continue their involvement. Special thanks go to Terje
> Bergstrom and Ken Adams for their invaluable GPU expertise, and Thierry Reding
> (at FOSDEM this weekend) for help with debugging and user-space testing.
>
> Let me also stress that although very exciting, this effort is still
> experimental, so I would like to make sure that nobody makes excessive
> expectations based on these few patches. The scope of this work is strictly
> limited to Tegra (although given the similarities desktop GPU support will
> certainly benefit from it indirectly), and we do not have any plan to work on
> user-space support. So do not uninstall that proprietary driver just yet. ;)
>
> With this being clarified, we are looking forward to getting your feedback and
> working with you guys to bring and improve Tegra K1 support into Nouveau! :)
>
> Alexandre Courbot (16):
>   drm/nouveau: handle -EACCES runtime PM return code
>   drm/nouveau: basic support for platform devices
>   drm/nouveau: add platform device probing function
>   drm/nouveau/fifo: support platform devices
>   drm/nouveau/bar: support platform devices
>   drm/nouveau/bar: only ioremap BAR3 if 

Re: [PATCH RESEND 5/10] xfstest: shared/001: Standard collapse range tests

2014-02-03 Thread Namjae Jeon
2014-02-04, Dave Chinner :
> On Sun, Feb 02, 2014 at 02:45:58PM +0900, Namjae Jeon wrote:
>> From: Namjae Jeon 
>>
>> This testcase(001) tries to test various corner cases
>> for fcollapse range functionality over different type of extents.
>>
>> Signed-off-by: Namjae Jeon 
>> Signed-off-by: Ashish Sangwan 
>
> Couple of things:
>
>>  -c "$map_cmd -v" $testfile | $filter_cmd
>>  [ $? -ne 0 ] && die_now
>>  _md5_checksum $testfile
>> @@ -415,10 +425,10 @@ _test_generic_punch()
>>  if [ "$remove_testfile" ]; then
>>  rm -f $testfile
>>  fi
>> -$XFS_IO_PROG -f -c "truncate 20k" \
>> --c "$alloc_cmd 0 8k" \
>> --c "pwrite 8k 8k" $sync_cmd \
>> --c "$zero_cmd 4k 8k" \
>> +$XFS_IO_PROG -f -c "truncate $(($multiple * 20))k" \
>> +-c "$alloc_cmd 0 $(($multiple * 8))k" \
>> +-c "pwrite $(($multiple * 8))k $(($multiple * 8))k" 
>> $sync_cmd \
>> +-c "$zero_cmd $(($multiple * 4))k $(($multiple * 8))k" \
>>  -c "$map_cmd -v" $testfile | $filter_cmd
>
Hi. Dave.
> This is unreadable, and therefore I'd consider that these changes
> render _test_generic_punch unmaintainable.
>
> Either it needs to be factored to be more readable, or we need a more
> readable way of representing the offsets and sizes if we want them
> to be variable. For example:
>
> _4k="$((multiple * 4))k"
> _8k="$((multiple * 8))k"
> _20k="$((multiple * 20))k"
>
> leads to:
>
>   $XFS_IO_PROG -f -c "truncate $_20k" \
>   -c "$alloc_cmd 0 $_8k" \
>   -c "pwrite $_8k $_8k" $sync_cmd \
>   -c "$zero_cmd $_4k $_8k" \
>   -c "$map_cmd -v" $testfile | $filter_cmd
>
> which is still readable and allows us to arbitrarily scale the
> parameters. It even allows us to handle different filesystem block
> sizes if we really want to
Okay, I will change it as you suggest.
>
>>  -c "$map_cmd -v" $testfile | $filter_cmd
>>  [ $? -ne 0 ] && die_now
>>  _md5_checksum $testfile
>>
>> +# If zero_cmd is fcollpase, don't check unaligned offsets
>> +if [ "$zero_cmd" == "fcollapse" ]; then
>> +if [ "$remove_testfile" ]; then
>> +rm -f $testfile
>> +rm -f $testfile.2
>> +fi
>> +return
>> +fi
>
> No need to remove the test files here - we remove them at
> test startup to ensure we have a known initial state
Okay.
>
>> +0: [0..63]: extent
>> +bb7df04e1b0a2570657527a7e108ae23
>> +13. data -> unwritten -> data
>> +0: [0..63]: extent
>> +0f0151cbed83e4bf6e5bde26e82ab115
>> +14. data -> hole @ EOF
>> +fallocate: Invalid argument
>> +0: [0..159]: extent
>
> This error appears in all the golden outputs. If it's correct, then
> perhaps it should be filtered out or commented somewhere to explain
> why it is expected.
Okay, I will add the comments to explain about this.

Thanks for your review :)
>
> Cheers,
>
> Dave.
> --
> Dave Chinner
> da...@fromorbit.com
>
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [patch] mm, compaction: avoid isolating pinned pages fix

2014-02-03 Thread Hugh Dickins
On Mon, 3 Feb 2014, David Rientjes wrote:
> On Tue, 4 Feb 2014, Joonsoo Kim wrote:
> 
> > > > Okay. It can't fix your situation. Anyway, *normal* anon pages may be 
> > > > mapped
> > > > and have positive page_count(), so your code such as
> > > > '!page_mapping(page) && page_count(page)' makes compaction skip these 
> > > > *normal*
> > > > anon pages and this is incorrect behaviour.
> > > > 
> > > 
> > > So how does that work with migrate_page_move_mapping() which demands 
> > > page_count(page) == 1 and the get_page_unless_zero() in 
> > > __isolate_lru_page()?
> > 
> > Before doing migrate_page_move_mapping(), try_to_unmap() is called so that 
> > all
> > mapping is unmapped. Then, remained page_count() is 1 which is grabbed by
> > __isolate_lru_page(). Am I missing something?
> > 
> 
> Ah, good point.  I wonder if we can get away with 
> page_count(page) - page_mapcount(page) > 1 to avoid the get_user_pages() 
> pin?

Something like that.  But please go back to migrate_page_move_mapping()
to factor in what it's additionally considering.  Whether you can share
code with it, I don't know - it has to do some things under a lock you
cannot take at the preliminary stage - you haven't isolated or locked
the page yet.

There is a separate issue, that a mapping may supply its own non-default
mapping->a_ops->migratepage(): can we assume that the page_counting is
the same whatever migratepage() is in use?  I'm not sure.

If you stick to special-casing PageAnon pages, you won't face that
issue; but your proposed change would be a lot more satisfying if we
can convince ourselves that it's good for !PageAnon too.  May need a
trawl through the different migratepage() methods that exist in tree.

Hugh
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [3.14-rc1] cirrus driver problem (qemu)

2014-02-03 Thread Dave Airlie
On Tue, Feb 4, 2014 at 1:34 AM, Sabrina Dubroca  wrote:
> When I boot 3.14-rc1 in qemu, I get the trace below. The console stops
> updating and I don't get a login prompt. I can login, but I can't see
> what I'm doing. I can login normally via SSH.
>
> If I revert the last commit in drivers/gpu/drm/cirrus:
>
> f4b4718b61d1d5a7442a4fd6863ea80c3a10e508 drm: ast,cirrus,mgag200: use 
> drm_can_sleep
>
> the problem is solved.
>

Hi, does the attached patch fix it?

Dave.
From c6feb881e00c8db9f9f73d099a08e86c5af79d50 Mon Sep 17 00:00:00 2001
From: Dave Airlie 
Date: Tue, 4 Feb 2014 13:19:08 +1000
Subject: [PATCH] drm: add in_interrupt to drm_can_sleep list of things

This should stop us taking locks when we shouldn't in the console drivers,

[1.749341] [ cut here ]
[1.749347] WARNING: CPU: 0 PID: 0 at kernel/locking/mutex.c:856 mutex_trylock+0x1e5/0x250()
[1.749348] DEBUG_LOCKS_WARN_ON(in_interrupt())
[1.749360] Modules linked in: ppdev cirrus syscopyarea sysfillrect sysimgblt drm_kms_helper evdev psmouse microcode serio_raw pcspkr ttm e1000 parport_pc parport processor button intel_agp drm intel_gtt i2c_piix4 ipv6 ext4 crc16 mbcache jbd2 sd_mod sr_mod cdrom ata_generic pata_acpi ata_piix 9pnet_virtio 9pnet libata scsi_mod
[1.749362] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 3.14.0-rc1-t1 #34
[1.749364] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
[1.749366]  0009 88001fc038c8 814e8456 88001fc03910
[1.749367]  88001fc03900 8106a0dd 88001d3ff990 0010
[1.749368]   01e0 88001cc3b000 88001fc03960
[1.749369] Call Trace:
[1.749372][] dump_stack+0x4d/0x6f
[1.749374]  [] warn_slowpath_common+0x7d/0xa0
[1.749375]  [] warn_slowpath_fmt+0x4c/0x50
[1.749377]  [] mutex_trylock+0x1e5/0x250
[1.749380]  [] cirrus_dirty_update+0x7c/0x2f0 [cirrus]
[1.749381]  [] cirrus_imageblit+0x2f/0x40 [cirrus]
[1.749388]  [] soft_cursor+0x1b4/0x250
[1.749390]  [] bit_cursor+0x613/0x650
[1.749391]  [] ? get_color.isra.15+0x31/0x140

Reported-by: Sabrina Dubroca 
Signed-off-by: Dave Airlie 
---
 include/drm/drmP.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/drm/drmP.h b/include/drm/drmP.h
index 1d4a920..3a98664 100644
--- a/include/drm/drmP.h
+++ b/include/drm/drmP.h
@@ -1665,7 +1665,7 @@ extern void drm_platform_exit(struct drm_driver *driver, struct platform_device
 /* returns true if currently okay to sleep */
 static __inline__ bool drm_can_sleep(void)
 {
-	if (in_atomic() || in_dbg_master() || irqs_disabled())
+	if (in_atomic() || in_dbg_master() || in_interrupt() || irqs_disabled())
 		return false;
 	return true;
 }
-- 
1.8.4.2



Re: [PATCH 0/2] tools: lockdep: build fixes

2014-02-03 Thread Sasha Levin

On 01/31/2014 04:35 PM, Ira W. Snyder wrote:

From: "Ira W. Snyder" 

Included are some fixes to the tools/lib/lockdep source tree to fix some
build issues.

Ira W. Snyder (2):
   tools: lockdep: fix include of asm/hash.h
   tools: lockdep: add include directory to allow tests to compile

  tools/lib/lockdep/Makefile| 2 +-
  tools/lib/lockdep/uinclude/asm/hash.h | 6 ++
  2 files changed, 7 insertions(+), 1 deletion(-)
  create mode 100644 tools/lib/lockdep/uinclude/asm/hash.h



Acked-by: Sasha Levin 


Ingo, how would you like the liblockdep things to work? Would you be picking
them yourself directly to the locking tree, or should I be sending a pull request?


Thanks,
Sasha
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: a LLC sched domain bug for panda board?

2014-02-03 Thread Preeti U Murthy
Hi Alex, Vincent,

On 02/04/2014 02:10 AM, Vincent Guittot wrote:
> Yes,  it's probably worth enabling by default for all ARM arch.
> 
> Vincent
> 
> On 02/04/2014 12:28 AM, Vincent Guittot wrote:
>> On 3 February 2014 17:27, Vincent Guittot 
> wrote:
>>> Have you checked that CONFIG_SCHED_LC is set ?
>>
>> sorry it's CONFIG_SCHED_MC
> 
> Thanks for the reminder! No, it wasn't set. Does that mean
> arch/arm/configs/omap2plus_defconfig needs to add this config?

Hmm... OK, let me think aloud. So it looks like the SMT, MC and the NUMA
sched domains are optional depending on the architecture. They are
config dependent. These domains could potentially exist on the processor
layout, but if the respective CONFIG options are not set, the scheduler
could very well ignore these levels.

What this means is that although the architecture could populate the
cpu_sibling_mask and cpu_coregroup_mask, the scheduler is not mandated
to schedule across the SMT and MC levels of the topology.
It's just the CPU sched domain which is guaranteed to be present no
matter what.

This is indeed interesting to note :) Thanks Alex for bringing up this
point :)

On PowerPC, the SCHED_MC option can never be set. It's not even optional.
On x86, it is on by default, and on ARM it looks like it's off by default.

Thanks,

Regards
Preeti U Murthy



> 
>>
>>>
>>>
>>> On 3 February 2014 17:17, Alex Shi  wrote:
 I just run the 3.14-rc1 kernel on panda board. The only domain for it is
 'CPU' domain, but this domain has no SD_SHARE_PKG_RESOURCES setting, it
 has no sd_llc.

 Guess the right domain for this board should be MC. So is it a bug?

 ..
 /proc/sys/kernel/sched_domain/cpu0/domain0/name:CPU
 ..
 /proc/sys/kernel/sched_domain/cpu1/domain0/name:CPU

 --
 Thanks
 Alex
 --
 To unsubscribe from this list: send the line "unsubscribe linux-kernel"
> in
 the body of a message to majord...@vger.kernel.org
 More majordomo info at  http://vger.kernel.org/majordomo-info.html
 Please read the FAQ at  http://www.tux.org/lkml/
> 
> --
> Thanks
> Alex
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v3 2/8] ARM: dts: sun7i: Add GMAC clock node to sun7i DTSI

2014-02-03 Thread Chen-Yu Tsai
On Tue, Feb 4, 2014 at 3:34 AM, Maxime Ripard
 wrote:
> On Mon, Feb 03, 2014 at 11:32:20AM +0800, Chen-Yu Tsai wrote:
>> The GMAC uses 1 of 2 sources for its transmit clock, depending on the
>> PHY interface mode. Add both sources as dummy clocks, and as parents
>> to the GMAC clock node.
>>
>> Signed-off-by: Chen-Yu Tsai 
>> ---
>>  arch/arm/boot/dts/sun7i-a20.dtsi | 28 
>>  1 file changed, 28 insertions(+)
>>
>> diff --git a/arch/arm/boot/dts/sun7i-a20.dtsi 
>> b/arch/arm/boot/dts/sun7i-a20.dtsi
>> index 1595e9a..fc7f470 100644
>> --- a/arch/arm/boot/dts/sun7i-a20.dtsi
>> +++ b/arch/arm/boot/dts/sun7i-a20.dtsi
>> @@ -314,6 +314,34 @@
>>   };
>>
>>   /*
>> +  * The following two are dummy clocks, placeholders used
>> +  * on gmac_tx clock. The actual frequency and availability
>> +  * depends on the external PHY, operation mode and link
>> +  * speed.
>> +  */
>
> If it depends on the external PHY, I guess that means it also depends
> on the board, right? Or is the GMAC supposed to always have that clock
> running at 25MHz, no matter what PHY is connected to it?

What I meant in the comment is that we cannot control the actual clock
rate of the TX clock. We can only select the source, and this is what
gmac_tx clock does. It is just a clock mux. The 125MHz and 25MHz clock
rates are used by the clk_set_rate in the stmmac glue layer to do
auto-reparenting.

The board dependent factor is what _type_ of PHY it is using, i.e.
MII, GMII, or RGMII. If it's MII, the PHY should provide the clock.
If it's RGMII, the internal clock would be used. GMII is a mix of
both. The actual clock rate depends on the link speed.

I should rephrase the comment along the lines of:

The following two are dummy clocks, placeholders used in the gmac_tx
clock. The gmac driver will choose one parent depending on the PHY
interface mode, using clk_set_rate auto-reparenting.
The actual TX clock rate is not controlled by the gmac_tx clock.


Cheers
ChenYu
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: mm: BUG in do_huge_pmd_wp_page

2014-02-03 Thread Sasha Levin

On 04/25/2013 10:01 PM, Dave Jones wrote:

On Thu, Apr 25, 2013 at 08:51:27PM -0400, Sasha Levin wrote:
  > On 04/24/2013 06:46 PM, Andrew Morton wrote:
  > > Guys, did this get fixed?
  >
  > I've stopped seeing that during fuzzing, so I guess that it got fixed 
somehow...

We've had reports of users hitting this in 3.8

eg:
https://bugzilla.redhat.com/show_bug.cgi?id=947985
https://bugzilla.redhat.com/show_bug.cgi?id=956730

I'm sure there are other reports of it too.

Would be good if we can figure out what fixed it (if it is actually fixed)
for backporting to stable


It's been a while (7 months?), but this one is back...

Just hit it again with today's -next:

[  762.701278] BUG: unable to handle kernel paging request at 88009eae6000
[  762.702462] IP: [] copy_page_rep+0x5/0x10
[  762.703369] PGD 84bb067 PUD 22fa81067 PMD 22f98b067 PTE 80009eae6060
[  762.704411] Oops:  [#1] PREEMPT SMP DEBUG_PAGEALLOC
[  762.705873] Dumping ftrace buffer:
[  762.707606](ftrace buffer empty)
[  762.708311] Modules linked in:
[  762.708762] CPU: 16 PID: 17920 Comm: trinity-c16 Tainted: GW
3.13.0-next-2
0140203-sasha-7-gf4985e2 #23
[  762.710135] task: 8801ac358000 ti: 880199234000 task.ti: 
880199234000
[  762.710135] RIP: 0010:[]  [] 
copy_page_rep+0x5/0x
10
[  762.710135] RSP: 0018:880199235c90  EFLAGS: 00010286
[  762.710135] RAX: 8002 RBX: 056db980 RCX: 0200
[  762.710135] RDX: 8801ac358000 RSI: 88009eae6000 RDI: 88015b6e6000
[  762.710135] RBP: 880199235cd8 R08:  R09: 
[  762.710135] R10: 0001 R11:  R12: 027ab980
[  762.710135] R13: 0200 R14: 00e6 R15: 8800
[  762.710135] FS:  7fb0804e1700() GS:88003da0() 
knlGS:0
000
[  762.710135] CS:  0010 DS:  ES:  CR0: 8005003b
[  762.710135] CR2: 88009eae6000 CR3: 000199225000 CR4: 06e0
[  762.710135] Stack:
[  762.710135]  81298995 8801a841ae00 88003d084520 
880199227090
[  762.710135]  80009ea008e5 8801a841ae00 ea00027a8000 
880199227090
[  762.710135]  ea00056d8000 880199235d58 812d7260 
880199235cf8
[  762.710135] Call Trace:
[  762.710135]  [] ? copy_user_huge_page+0x1a5/0x210
[  762.710135]  [] do_huge_pmd_wp_page+0x3d0/0x650
[  762.710135]  [] ? put_lock_stats+0xe/0x30
[  762.710135]  [] __handle_mm_fault+0x2b1/0x3d0
[  762.710135]  [] handle_mm_fault+0x133/0x1c0
[  762.710135]  [] __get_user_pages+0x438/0x630
[  762.710135]  [] ? put_lock_stats+0xe/0x30
[  762.710135]  [] __mlock_vma_pages_range+0xd4/0xe0
[  762.710135]  [] __mm_populate+0x110/0x190
[  762.710135]  [] SyS_mlockall+0x160/0x1b0
[  762.710135]  [] tracesys+0xdd/0xe2
[  762.710135] Code: 90 90 90 90 90 90 9c fa 65 48 3b 06 75 14 65 48 3b 56 08 75 0d 65 48 89 1e 65 
48 89 4e 08 9d b0 01 c3 9d 30 c0 c3 b9 00 02 00 00  48 a5 c3 0f 1f 80 00

00 00 00 eb ee 66 66 66 90 66 66 66 90
[  762.710135] RIP  [] copy_page_rep+0x5/0x10
[  762.710135]  RSP 
[  762.710135] CR2: 88009eae6000


Thanks,
Sasha
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [patch] mm, compaction: avoid isolating pinned pages fix

2014-02-03 Thread David Rientjes
On Tue, 4 Feb 2014, Joonsoo Kim wrote:

> > > Okay. It can't fix your situation. Anyway, *normal* anon pages may be 
> > > mapped
> > > and have positive page_count(), so your code such as
> > > '!page_mapping(page) && page_count(page)' makes compaction skip these 
> > > *normal*
> > > anon pages and this is incorrect behaviour.
> > > 
> > 
> > So how does that work with migrate_page_move_mapping() which demands 
> > page_count(page) == 1 and the get_page_unless_zero() in 
> > __isolate_lru_page()?
> 
> Before doing migrate_page_move_mapping(), try_to_unmap() is called so that all
> mapping is unmapped. Then, remained page_count() is 1 which is grabbed by
> __isolate_lru_page(). Am I missing something?
> 

Ah, good point.  I wonder if we can get away with 
page_count(page) - page_mapcount(page) > 1 to avoid the get_user_pages() 
pin?
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [ovs-discuss] Linus GIT Head OOPs reproducable in open vswitch when running mininet topology

2014-02-03 Thread Jesse Gross
On Fri, Jan 31, 2014 at 10:18 AM, Thomas Glanzmann  wrote:
>> Do you know if this happens with an older kernel or with a simpler topology?
>
> No, I don't. I just verified that the Ubuntu Mininet uses the
> openvswitch kernel module from openvswitch and not the one that is
> shipped with the kernel. Ubuntu precise does not crash with the exact same
> topology.

The kernel from Precise doesn't call the function that is triggering
the problem, so it's not too surprising that it doesn't have the same
issue.

It's not clear that this is actually a bug in the OVS code since it
happens in a different function and that function accesses data that
OVS doesn't really touch. Do you know if this happens with the bridge?
Or can you try bisecting? Or use gdb to track down the faulting
address?
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC 00/16] drm/nouveau: initial support for GK20A (Tegra K1)

2014-02-03 Thread Alexandre Courbot

On 02/03/2014 08:25 PM, David Herrmann wrote:

Hi

[..snip..]

Finally, support for probing GK20A is added in the last 2 patches. It should be
noted that contrary to what Nouveau currently expects, GK20A does not embed any
display hardware (that part being handled by tegradrm). So this driver should
really be only used through DRM render-nodes and collaborate with the display
driver using PRIME. I have not yet figured out how to turn GK20A's instantiation
of Nouveau into a render-node only driver without breaking support for existing
desktop GPUs, and consequently the driver spawns a /dev/dri/cardX node which we
should try to get rid of.


You cannot get rid of cardX currently. It is implied by DRIVER_MODESET
and that flag should actually be called NOT_A_LEGACY_DRIVER. So you
cannot remove it. I did try to replace DRIVER_MODESET by an inverted
DRIVER_LEGACY flag some time ago, but I thought it's not worth it.

Anyhow, you can easily add a new flag to make
drm_dev_register()/drm_dev_alloc() not create the drm_minor for
DRM_MINOR_LEGACY, which would prevent the card0 node from showing up.
But people started using the cardX interface as base interface so mesa
might not be able to open render-nodes if the related card-node is not
available (which is a bug in their code, so no reason to support that
by not adding stand-alone render-nodes).


Actually my mention of /dev/dri/cardX was misleading. I was rather 
thinking about getting rid of the DRIVER_MODESET flag to correctly 
expose what the card provides, not only to user-space, but to DRM 
itself. The legacy node is ok as long as DRM itself correctly knows what 
the driver can and cannot do and fails gracefully if the user tries to 
set a mode.


DRIVER_MODESET is statically set in nouveau_drm.c, and the reason why I 
cannot get rid of it is because the driver (and its features) is 
registered with drm_pci_init() before the card is probed and its actual 
features known.


For platform devices, you could check the card features before 
registering it with drm_platform_init(), but then you have the issue 
that the driver instance is referenced by every probed card, and thus 
you cannot have cards with different capabilities.


So it seems like handling this would require the driver_features to move 
from drm_driver to drm_device, but that's quite a core change. As 
pointed out by you and Daniel, we can certainly live with the control 
and legacy nodes. Nonetheless I'd be curious to know how (and if) this 
case can be correctly handled.


Alex.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v2] mm/zswap: add writethrough option

2014-02-03 Thread Minchan Kim
Hello Andrew,

On Mon, Feb 03, 2014 at 03:08:35PM -0800, Andrew Morton wrote:
> On Mon, 27 Jan 2014 09:01:19 -0500 Dan Streetman  wrote:
> 
> > Currently, zswap is writeback cache; stored pages are not sent
> > to swap disk, and when zswap wants to evict old pages it must
> > first write them back to swap cache/disk manually.  This avoids
> > swap out disk I/O up front, but only moves that disk I/O to
> > the writeback case (for pages that are evicted), and adds the
> > overhead of having to uncompress the evicted pages and the
> > need for an additional free page (to store the uncompressed page).
> > 
> > This optionally changes zswap to writethrough cache by enabling
> > frontswap_writethrough() before registering, so that any
> > successful page store will also be written to swap disk.  The
> > default remains writeback.  To enable writethrough, the param
> > zswap.writethrough=1 must be used at boot.
> > 
> > Whether writeback or writethrough will provide better performance
> > depends on many factors including disk I/O speed/throughput,
> > CPU speed(s), system load, etc.  In most cases it is likely
> > that writeback has better performance than writethrough before
> > zswap is full, but after zswap fills up writethrough has
> > better performance than writeback.
> > 
> > The reason to add this option now is, first to allow any zswap
> > user to be able to test using writethrough to determine if they
> > get better performance than using writeback, and second to allow
> > future updates to zswap, such as the possibility of dynamically
> > switching between writeback and writethrough.
> > 
> > ...
> >
> > Based on specjbb testing on my laptop, the results for both writeback
> > and writethrough are better than not using zswap at all, but writeback
> > does seem to be better than writethrough while zswap isn't full.  Once
> > it fills up, performance for writethrough is essentially close to not
> > using zswap, while writeback seems to be worse than not using zswap.
> > However, I think more testing on a wider span of systems and conditions
> > is needed.  Additionally, I'm not sure that specjbb is measuring true
> > performance under fully loaded cpu conditions, so additional cpu load
> > might need to be added or specjbb parameters modified (I took the
> > values from the 4 "warehouses" test run).
> > 
> > In any case though, I think having writethrough as an option is still
> > useful.  More changes could be made, such as changing from writeback
> > to writethrough based on the zswap % full.  And the patch doesn't
> > change default behavior - writethrough must be specifically enabled.
> > 
> > The %-ized numbers I got from specjbb on average, using the default
> > 20% max_pool_percent and varying the amount of heap used as shown:
> > 
> > ram | no zswap | writeback | writethrough
> > 75    93.08      100         96.90
> > 87    96.58      95.58       96.72
> > 100   92.29      89.73       86.75
> > 112   63.80      38.66       19.66
> > 125   4.79       29.90       15.75
> > 137   4.99       4.50        4.75
> > 150   4.28       4.62        5.01
> > 162   5.20       2.94        4.66
> > 175   5.71       2.11        4.84
> 
> Changelog is very useful, thanks for taking the time.
> 
> It does sound like the feature is of marginal benefit.  Is "zswap
> filled up" an interesting or useful case to optimize?
> 
> otoh the addition is pretty simple and we can later withdraw the whole
> thing without breaking anyone's systems.
> 
> What do people think?

IMHO, when overcommitting memory and using swap, it's really something
we should optimize once we have decided to use zswap's writeback.

But I don't think writethrough is the ideal solution for
the case where zswap is full. Sometimes, just dynamically disabling
zswap might be better because it reduces unnecessary
comp/decomp overhead.

Dan said that it's good to have because some user might, in the future,
find the right use case that we haven't found. Although I'm not a
huge fan of such justification for merging a patch (I have tried to push
my own patches several times with such a claim), I don't object to it
any more (actually, I have an idea to make zswap's writethrough useful, but
it isn't related to this topic), as long as we can withdraw it
easily if it turns out to be an obstacle to future enhancements.

Thanks.
-- 
Kind regards,
Minchan Kim
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v3 5/6] fat: permit to return phy block number by fibmap in fallocated region

2014-02-03 Thread OGAWA Hirofumi
Namjae Jeon  writes:

>>> /* fat_get_cluster() assumes the requested blocknr isn't truncated. */
>>> down_read(_I(mapping->host)->truncate_lock);
>>> +   /* To get block number beyond file size in fallocated region */
>>> +   atomic_set(_I(mapping->host)->beyond_isize, 1);
>>> blocknr = generic_block_bmap(mapping, block, fat_get_block);
>>> +   atomic_set(_I(mapping->host)->beyond_isize, 0);
>>> up_read(_I(mapping->host)->truncate_lock);
>>
>> This is racy. While user is using bmap, kernel can allocate new blocks.
>> We should use another function for this.
> I understand that fat can map fallocated blocks in read case while
> user is using bmap.
> But I can not find the case allocate new blocks.
> If I am missing something, Could you please elaborate more ?
> Is it a case of _bmap request returning the block number for block
> allocated in parallel write path ?

->beyond_isize is global to the inode. So the write(2) path on the same inode
can also see the 1 set by bmap() while another process is using bmap().
-- 
OGAWA Hirofumi 
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [patch] mm, compaction: avoid isolating pinned pages

2014-02-03 Thread Hugh Dickins
On Mon, 3 Feb 2014, David Rientjes wrote:
> On Mon, 3 Feb 2014, Mel Gorman wrote:
> 
> > > Page migration will fail for memory that is pinned in memory with, for
> > > example, get_user_pages().  In this case, it is unnecessary to take
> > > zone->lru_lock or isolating the page and passing it to page migration
> > > which will ultimately fail.
> > > 
> > > This is a racy check, the page can still change from under us, but in
> > > that case we'll just fail later when attempting to move the page.
> > > 
> > > This avoids very expensive memory compaction when faulting transparent
> > > hugepages after pinning a lot of memory with a Mellanox driver.
> > > 
> > > On a 128GB machine and pinning ~120GB of memory, before this patch we
> > > see the enormous disparity in the number of page migration failures
> > > because of the pinning (from /proc/vmstat):

120GB of memory on the active/inactive lrus but longterm pinned,
that's quite worrying: not just a great waste of time for compaction,
but for page reclaim also.  I suppose a fairly easy way around it would
be for the driver to use mlock too, moving them all to unevictable lru.

But in general, you may well be right that, racy as this isolation/
migration procedure necessarily is, in the face of longterm pinning it
may make more sense to test page_count before proceeding to isolation
rather than only after in migration.  We always took the view that it's
better to give up only at the last moment, but that may be a bad bet.

> > > 
> > > compact_blocks_moved 7609
> > > compact_pages_moved 3431
> > > compact_pagemigrate_failed 133219
> > > compact_stall 13
> > > 
> > > After the patch, it is much more efficient:
> > > 
> > > compact_blocks_moved 7998
> > > compact_pages_moved 6403
> > > compact_pagemigrate_failed 3
> > > compact_stall 15
> > > 
> > > Signed-off-by: David Rientjes 
> > > ---
> > >  mm/compaction.c | 8 
> > >  1 file changed, 8 insertions(+)
> > > 
> > > diff --git a/mm/compaction.c b/mm/compaction.c
> > > --- a/mm/compaction.c
> > > +++ b/mm/compaction.c
> > > @@ -578,6 +578,14 @@ isolate_migratepages_range(struct zone *zone, struct 
> > > compact_control *cc,
> > >   continue;
> > >   }
> > >  
> > > + /*
> > > +  * Migration will fail if an anonymous page is pinned in memory,
> > > +  * so avoid taking zone->lru_lock and isolating it unnecessarily
> > > +  * in an admittedly racy check.
> > > +  */
> > > + if (!page_mapping(page) && page_count(page))
> > > + continue;
> > > +
> > 
> > Are you sure about this? The page_count check migration does is this
> > 
> > int expected_count = 1 + extra_count;
> > if (!mapping) {
> > if (page_count(page) != expected_count)
> > return -EAGAIN;
> > return MIGRATEPAGE_SUCCESS;
> > }
> > 
> > spin_lock_irq(>tree_lock);
> > 
> > pslot = radix_tree_lookup_slot(>page_tree,
> > page_index(page));
> > 
> > expected_count += 1 + page_has_private(page);
> > 
> > Migration expects and can migrate pages with no mapping and a page count
> > but you are now skipping them. I think you may have intended to split
> > migrations page count into a helper or copy the logic.
> > 
> 
> Thanks for taking a look!
> 
> The patch is correct, it just shows my lack of a complete commit message 

I don't think so.  I agree with Mel that you should be reconsidering
those tests that migrate_page_move_mapping() makes, but remembering that
it's called at a stage between try_to_unmap() and remove_migration_ptes(),
when page_mapcount has been brought down to 0 - not the case here.

> which I'm struggling with recently.  In the case that this is addressing, 
> get_user_pages() already gives page_count(page) == 1, then 

But get_user_pages() brings the pages into user address space (if not
already there), page_mapcount 1 and page_count 1, and does an additional
pin on the page, page_count 2.  Or if it's a page_mapping page (perhaps
even PageAnon in SwapCache) there's another +1; if page_has_buffers
another +1; mapped into more user address spaces, +more.

Your if (!page_mapping(page) && page_count(page))
continue;
is letting through any Anon SwapCache pages (probably no great concern
in your 120GB example; but I don't understand why you want to special-
case Anon anyway, beyond your specific testcase); and refusing to
isolate all those unpinned anonymous pages mapped into userspace which
migration is perfectly capable of migrating.  If 120GB out of 128GB is
pinned, that won't be a significant proportion, and of course your
change saves a lot of wasted time and lock contention; but for most
people it's a considerable proportion of their memory, and needs to
be migratable.

I think Joonsoo is making the same point (though I disagree with the
test he suggested); but I've not yet read the 

Re: [PATCH v3 1/8] clk: sunxi: Add Allwinner A20/A31 GMAC clock unit

2014-02-03 Thread Chen-Yu Tsai
Hi,

On Tue, Feb 4, 2014 at 3:31 AM, Maxime Ripard
 wrote:
> Hi,
>
> On Mon, Feb 03, 2014 at 11:32:19AM +0800, Chen-Yu Tsai wrote:
>> The Allwinner A20/A31 clock module controls the transmit clock source
>> and interface type of the GMAC ethernet controller. Model this as
>> a single clock for GMAC drivers to use.
>>
>> Signed-off-by: Chen-Yu Tsai 
>> ---
>>  Documentation/devicetree/bindings/clock/sunxi.txt | 26 +++
>>  drivers/clk/sunxi/clk-sunxi.c | 83 
>> +++
>>  2 files changed, 109 insertions(+)
>>
>> diff --git a/Documentation/devicetree/bindings/clock/sunxi.txt 
>> b/Documentation/devicetree/bindings/clock/sunxi.txt
>> index 0cf679b..f43b4c0 100644
>> --- a/Documentation/devicetree/bindings/clock/sunxi.txt
>> +++ b/Documentation/devicetree/bindings/clock/sunxi.txt
>> @@ -37,6 +37,7 @@ Required properties:
>>   "allwinner,sun6i-a31-apb2-gates-clk" - for the APB2 gates on A31
>>   "allwinner,sun4i-mod0-clk" - for the module 0 family of clocks
>>   "allwinner,sun7i-a20-out-clk" - for the external output clocks
>> + "allwinner,sun7i-a20-gmac-clk" - for the GMAC clock module on A20/A31
>>
>>  Required properties for all clocks:
>>  - reg : shall be the control register address for the clock.
>> @@ -50,6 +51,9 @@ Required properties for all clocks:
>>   If the clock module only has one output, the name shall be the
>>   module name.
>>


>> +For "allwinner,sun7i-a20-gmac-clk", the parent clocks shall be fixed rate
>> +dummy clocks at 25 MHz and 125 MHz, respectively. See example.
>> +


>>  Clock consumers should specify the desired clocks they use with a
>>  "clocks" phandle cell. Consumers that are using a gated clock should
>>  provide an additional ID in their clock property. This ID is the
>> @@ -96,3 +100,25 @@ mmc0_clk: clk@01c20088 {
>>   clocks = <>, < 1>, < 1>;
>>   clock-output-names = "mmc0";
>>  };
>> +
>> +mii_phy_tx_clk: clk@2 {
>> + #clock-cells = <0>;
>> + compatible = "fixed-clock";
>> + clock-frequency = <25000000>;
>> + clock-output-names = "mii_phy_tx";
>> +};
>> +
>> +gmac_int_tx_clk: clk@3 {
>> + #clock-cells = <0>;
>> + compatible = "fixed-clock";
>> + clock-frequency = <125000000>;
>> + clock-output-names = "gmac_int_tx";
>> +};
>> +
>> +gmac_clk: clk@01c20164 {
>> + #clock-cells = <0>;
>> + compatible = "allwinner,sun7i-a20-gmac-clk";
>> + reg = <0x01c20164 0x4>;
>> + clocks = <&mii_phy_tx_clk>, <&gmac_int_tx_clk>;
>
> You should also document in which order you expect the parents to
> be. Or it will probably be easier to just use clock-names here.

Is it not clear from the "Required properties" section above?

>
>> + clock-output-names = "gmac";
>> +};
>> diff --git a/drivers/clk/sunxi/clk-sunxi.c b/drivers/clk/sunxi/clk-sunxi.c
>> index 736fb60..0b361d2 100644
>> --- a/drivers/clk/sunxi/clk-sunxi.c
>> +++ b/drivers/clk/sunxi/clk-sunxi.c
>> @@ -379,6 +379,89 @@ static void sun7i_a20_get_out_factors(u32 *freq, u32 
>> parent_rate,
>>
>>
>>  /**
>> + * sun7i_a20_gmac_clk_setup - Setup function for A20/A31 GMAC clock module
>> + *
>> + * This clock looks something like this
>> + *   
>> + *  MII TX clock from PHY >-|____|> to GMAC core
>> + *  GMAC Int. RGMII TX clk >|___\__/__gate---|> to PHY
>> + *  Ext. 125MHz RGMII TX clk >--|__divider__/|
>> + *  ||
>> + *
>> + * The external 125 MHz reference is optional, i.e. GMAC can use its
>> + * internal TX clock just fine. The A31 GMAC clock module does not have
>> + * the divider controls for the external reference.
>> + *
>> + * To keep it simple, let the GMAC use either the MII TX clock for MII mode,
>> + * and its internal TX clock for GMII and RGMII modes. The GMAC driver 
>> should
>> + * select the appropriate source and gate/ungate the output to the PHY.
>> + *
>> + * Only the GMAC should use this clock. Altering the clock so that it 
>> doesn't
>> + * match the GMAC's operation parameters will result in the GMAC not being
>> + * able to send traffic out. The GMAC driver should set the clock rate and
>> + * enable/disable this clock to configure the required state. The clock
>> + * driver then responds by auto-reparenting the clock.
>> + */
>> +
>> +#define SUN7I_A20_GMAC_GPIT  2
>> +#define SUN7I_A20_GMAC_MASK  0x3
>> +#define SUN7I_A20_GMAC_MAX_PARENTS   2
>> +
>> +static void __init sun7i_a20_gmac_clk_setup(struct device_node *node)
>> +{
>> + struct clk *clk;
>> + struct clk_mux *mux;
>> + struct clk_gate *gate;
>> + const char *clk_name = node->name;
>> + const char *parents[SUN7I_A20_GMAC_MAX_PARENTS];
>> + void *reg;
>> + int i = 0;
>> +
>> + /* allocate mux and gate clock structs */
>> + mux = kzalloc(sizeof(struct clk_mux), GFP_KERNEL);
>> + if (!mux)
>> + return;
>
> Newline.
>
>> + gate 

Re: + mm-utilc-add-kstrimdup.patch added to -mm tree

2014-02-03 Thread Andrew Morton
On Mon, 03 Feb 2014 17:28:16 -0800 Sebastian Capella 
 wrote:

> Quoting David Rientjes (2014-02-03 17:05:04)
> > The last we heard, I think Sebastian is looking to redo this series and 
> > this patch is no longer needed.  Sebastian?
> Hi David, Alexey,
> 
> I am in the process of reworking the patches.  I'm not sure if Andrew
> was just interested in having the kstrimdup utility function available.
> 
> Isn't it too late to impose userspace trimming of newlines for sysfs?
> It seems already fairly common and expected for the kernel to eat the
> trailing whitespace, or at least ignore it.  If we change this won't we
> be breaking userspace / tools / instructions / etc?
> 

We have quite a lot of codesites which open-code the newline trimming. 
Providing a library function to do this will result in goodness.

That being said, I don't plan to merge kstrimdup() until it has some users.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] kernel/kprobes.c: move cleanup_rp_inst() to where CONFIG_KRETPROBES enabled

2014-02-03 Thread Chen Gang
On 02/03/2014 11:42 PM, Masami Hiramatsu wrote:
> (2014/02/03 20:48), Chen Gang wrote:
>> On 02/02/2014 10:40 AM, Masami Hiramatsu wrote:
>>> (2014/02/01 21:17), Chen Gang wrote:
 When CONFIG_KRETPROBES disabled, cleanup_rp_inst() is useless too. It
 is only called by unregister_kretprobes() which is in CONFIG_KRETPROBES
 enabled area.

 The related warning (allmodconfig under avr32):

   kernel/kprobes.c:1181: warning: 'cleanup_rp_inst' defined but not used
>>>
>>> This patch itself looks good to me.
>>> And it seems that not only the cleanup_rp_inst, but also other
>>> kretprobe related functions should be moved (free_rp_inst,etc)
>>>
>>
>> OK, thanks, need/should I check them again and send patch v2 for them?
> 
> Yes, I'm happy to review it :)
> 
> Thank you!
> 

I will/should finish in these days (within 2014-02-07).

Thanks.
-- 
Chen Gang

Open, share and attitude like air, water and life which God blessed
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [patch] mm, compaction: avoid isolating pinned pages fix

2014-02-03 Thread Joonsoo Kim
On Mon, Feb 03, 2014 at 06:00:56PM -0800, David Rientjes wrote:
> On Tue, 4 Feb 2014, Joonsoo Kim wrote:
> 
> > Okay. It can't fix your situation. Anyway, *normal* anon pages may be mapped
> > and have positive page_count(), so your code such as
> > '!page_mapping(page) && page_count(page)' makes compaction skip these 
> > *normal*
> > anon pages and this is incorrect behaviour.
> > 
> 
> So how does that work with migrate_page_move_mapping() which demands 
> page_count(page) == 1 and the get_page_unless_zero() in 
> __isolate_lru_page()?

Before doing migrate_page_move_mapping(), try_to_unmap() is called so that all
mapping is unmapped. Then, remained page_count() is 1 which is grabbed by
__isolate_lru_page(). Am I missing something?

Thanks.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v5 16/16] ARM: Remove uprobes dependency on kprobes

2014-02-03 Thread David Long

On 02/03/14 10:45, Jon Medhurst (Tixy) wrote:

On Thu, 2014-01-23 at 15:05 -0500, David Long wrote:

From: "David A. Long" 

Now that arm uprobes support has been made separate from the arm kprobes code
the Kconfig can be changed to reflect that.

Signed-off-by: David A. Long 
---
  arch/arm/Kconfig | 1 -
  1 file changed, 1 deletion(-)

diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index fec5a6b..9ddc4ae 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -204,7 +204,6 @@ config NEED_DMA_MAP_STATE
 def_bool y

  config ARCH_SUPPORTS_UPROBES
-   depends on KPROBES
def_bool y

  config ARCH_HAS_DMA_SET_COHERENT_MASK



Was this patch meant to have other contents? If not, it seems a bit
pointless as all it does is remove a line added in the previous patch,
so should just be folded into that one.



That patch was added late to a much earlier rev.  It should have been 
merged with the previous patch before now.  I have just done so.


-dl

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH] security: select correct default LSM_MMAP_MIN_ADDR on arm on arm64

2014-02-03 Thread Colin Cross
Binaries compiled for arm may run on arm64 if CONFIG_COMPAT is
selected.  Set LSM_MMAP_MIN_ADDR to 32768 if ARM64 && COMPAT to
prevent selinux failures launching 32-bit static executables that
are mapped at 0x8000.

Signed-off-by: Colin Cross 
---
 security/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/security/Kconfig b/security/Kconfig
index e9c6ac724fef..beb86b500adf 100644
--- a/security/Kconfig
+++ b/security/Kconfig
@@ -103,7 +103,7 @@ config INTEL_TXT
 config LSM_MMAP_MIN_ADDR
int "Low address space for LSM to protect from user allocation"
depends on SECURITY && SECURITY_SELINUX
-   default 32768 if ARM
+   default 32768 if ARM || (ARM64 && COMPAT)
default 65536
help
  This is the portion of low virtual memory which should be protected
-- 
1.9.0.rc1.175.g0b1dcb5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v5 07/16] ARM: Remove use of struct kprobe from generic probes code

2014-02-03 Thread David Long

On 02/03/14 09:57, Jon Medhurst (Tixy) wrote:

On Thu, 2014-01-23 at 15:05 -0500, David Long wrote:

From: "David A. Long" 

Change the generic ARM probes code to pass in the opcode and 
architecture-specific
structure separately instead of using struct kprobe, so we do not pollute
code being used only for uprobes or other non-kprobes instruction
interpretation.

Signed-off-by: David A. Long 
---


One minor nit-pick...

[...]

diff --git a/arch/arm/kernel/kprobes-thumb.c b/arch/arm/kernel/kprobes-thumb.c
index c7ee290..cea707a 100644
--- a/arch/arm/kernel/kprobes-thumb.c
+++ b/arch/arm/kernel/kprobes-thumb.c

[...]

@@ -593,7 +590,7 @@ t16_emulate_pop_pc(struct kprobe *p, struct pt_regs *regs)
bx_write_pc(pc, regs);
  }

-static enum kprobe_insn __kprobes
+enum kprobe_insn __kprobes
  t16_decode_pop(kprobe_opcode_t insn, struct arch_specific_insn *asi,
struct decode_header *d)
  {


The above removal of 'static' appears to be an unneeded accidental
change?



Yes, that got lost during editing.  The change has been made.

-dl

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v5 05/16] ARM: use a function table for determining instruction interpreter action

2014-02-03 Thread David Long

On 02/03/14 09:24, Jon Medhurst (Tixy) wrote:

On Thu, 2014-01-23 at 15:05 -0500, David Long wrote:

From: "David A. Long" 

Make the instruction interpreter call back to semantic action functions
through a function pointer array provided by the invoker.  The interpreter
decodes the instructions into groups and uses the group number to index
into the supplied array.  kprobes and uprobes code will each supply their
own array of functions.

Signed-off-by: David A. Long 
---


[...]


--- a/arch/arm/kernel/probes.c
+++ b/arch/arm/kernel/probes.c
@@ -378,10 +378,11 @@ static const int decode_struct_sizes[NUM_DECODE_TYPES] = {
   */
  int __kprobes
  kprobe_decode_insn(kprobe_opcode_t insn, struct arch_specific_insn *asi,
-  const union decode_item *table, bool thumb)
+  const union decode_item *table, bool thumb,
+  const union decode_action *actions)
  {
-   const struct decode_header *h = (struct decode_header *)table;
-   const struct decode_header *next;
+   struct decode_header *h = (struct decode_header *)table;
+   struct decode_header *next;


The decode tables are fixed structures which nothing should want to
modify, so I think the const's above should be kept. I believe that
you've had to resort to changing them because the following typedef
lacks a 'const' on the final argument



I've made the change.


[...]

diff --git a/arch/arm/kernel/probes.h b/arch/arm/kernel/probes.h
index c610fa9..81b6e61 100644

[...]


+typedef enum kprobe_insn (probes_custom_decode_t)(kprobe_opcode_t,
+ struct arch_specific_insn *,
+ struct decode_header *);
+


Adding 'const' above will also have the knock on effect of requiring
const on all the 'custom decode' functions as well.



I've made those numerous changes as well.

-dl

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [patch] mm, compaction: avoid isolating pinned pages fix

2014-02-03 Thread David Rientjes
On Tue, 4 Feb 2014, Joonsoo Kim wrote:

> Okay. It can't fix your situation. Anyway, *normal* anon pages may be mapped
> and have positive page_count(), so your code such as
> '!page_mapping(page) && page_count(page)' makes compaction skip these *normal*
> anon pages and this is incorrect behaviour.
> 

So how does that work with migrate_page_move_mapping() which demands 
page_count(page) == 1 and the get_page_unless_zero() in 
__isolate_lru_page()?
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCHSET 0/5] tracing/uprobes: Support multi buffer and event trigger

2014-02-03 Thread Steven Rostedt
On Mon, 03 Feb 2014 14:06:12 +0900
Namhyung Kim  wrote:

> Ping!

Hi Namhyung,

I plan on getting these ready for the 3.15 queue. There was a bit too
much in 3.14 to add these on top of at the last minute.

Currently, I'm working on some bugs at work as well as some things I
found in mainline. I'll be reviewing these when I get a chance.

Thanks,

-- Steve
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [patch] mm, compaction: avoid isolating pinned pages fix

2014-02-03 Thread Joonsoo Kim
On Mon, Feb 03, 2014 at 05:20:46PM -0800, David Rientjes wrote:
> On Tue, 4 Feb 2014, Joonsoo Kim wrote:
> 
> > I think that you need more code to skip this type of page correctly.
> > Without page_mapped() check, this code makes migratable pages be skipped,
> > since if page_mapped() case, page_count() may be more than zero.
> > 
> > So I think that you need following change.
> > 
> > (!page_mapping(page) && !page_mapped(page) && page_count(page))
> > 
> 
> These pages returned by get_user_pages() will have a mapcount of 1 so this 
> wouldn't actually fix the massive lock contention.  page_mapping() is only 
> going to be NULL for pages off the lru like these are for 
> PAGE_MAPPING_ANON.

Okay. It can't fix your situation. Anyway, *normal* anon pages may be mapped
and have positive page_count(), so your code such as
'!page_mapping(page) && page_count(page)' makes compaction skip these *normal*
anon pages and this is incorrect behaviour.

Thanks.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH] nilfs2: add description of NILFS_IOCTL_SET_SUINFO ioctl

2014-02-03 Thread Ryusuke Konishi
Add description of NILFS_IOCTL_SET_SUINFO ioctl in
Documentation/filesystems/nilfs2.txt to make it up-to-date.

Signed-off-by: Ryusuke Konishi 
Cc: Andreas Rohner 
---
 Documentation/filesystems/nilfs2.txt |7 +++
 1 file changed, 7 insertions(+)

diff --git a/Documentation/filesystems/nilfs2.txt 
b/Documentation/filesystems/nilfs2.txt
index 06887d4..8b887ae 100644
--- a/Documentation/filesystems/nilfs2.txt
+++ b/Documentation/filesystems/nilfs2.txt
@@ -111,6 +111,13 @@ Table of NILFS2 specific ioctls
nilfs_resize utilities and by nilfs_cleanerd
daemon.
 
+ NILFS_IOCTL_SET_SUINFO Modify segment usage info of requested
+   segments. This ioctl is used by
+   nilfs_cleanerd daemon to skip unnecessary
+   cleaning operation of segments and reduce
+   performance penalty or wear of flash device
+   due to redundant move of in-use blocks.
+
  NILFS_IOCTL_GET_SUSTAT Return segment usage statistics. This ioctl
is used in lssu, nilfs_resize utilities and
by nilfs_cleanerd daemon.
-- 
1.7.9.3

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 4/4] nilfs2: implementation of NILFS_IOCTL_SET_SUINFO ioctl

2014-02-03 Thread Ryusuke Konishi
On Mon, 3 Feb 2014 13:41:01 -0800, Andrew Morton wrote:
> On Tue,  4 Feb 2014 01:50:44 +0900 Ryusuke Konishi 
>  wrote:
> 
>> With this ioctl the segment usage entries in the SUFILE can be
>> updated from userspace.
>> 
>> This is useful, because it allows the userspace GC to modify and update
>> segment usage entries for specific segments, which enables it to avoid
>> unnecessary write operations.
>> 
>> If a segment needs to be cleaned, but there is no or very little
>> reclaimable space in it, the cleaning operation basically degrades to
>> a useless moving operation. In the end the only thing that changes is
>> the location of the data and a timestamp in the segment usage
>> information. With this ioctl the GC can skip the cleaning and update
>> the segment usage entries directly instead.
>> 
>> This is basically a shortcut to cleaning the segment. It is still
>> necessary to read the segment summary information, but the writing of
>> the live blocks can be skipped if it's not worth it.
> 
> Documentation/filesystems/nilfs2.txt should be updated to document the
> new ioctl.
> 
> While we're in there, please check that the ioctl documentation is
> otherwise complete and up-to-date.  These things have a tendency to
> bitrot.

Got it.  I missed the recent effort by Vyacheslav which added
description on every ioctl in the document file.

I'll send a patch for this soon.

Thanks,
Ryusuke Konishi
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Tux3 Report: Our blocker list

2014-02-03 Thread Daniel Phillips

At Korea Linux Forum last fall, Linus asked, haven't I been hearing
about Tux3 for ten years? I said, no, that was Tux2, completely
different. You only heard about Tux3 for six years.

Well, that is about long enough to keep hearing about an out of tree
filesystem. Last time we talked about merging, the main criticisms were
stylistic things, long since fixed. After that, we decided to address
some glaring issues rather than let innocent victims hit them the hard
way. Though victims of a new filesystem are theoretically limited to
battle hardened veterans, in practice it does not work out that way. In
reality, if you just need to flip a config flag then all kinds of
people will try the code. If it then does stupid things it immediately 
acquires a reputation that could take years to shake. Not fun. So we 
decided to fill in some holes first.


Here is our remaining blocker list:

 1) Allocation policy: simple minded linear block allocation is good
for benchmarks but ages poorly, so add a respectable allocation
policy.

 2) Mmap consistency: mmap writes may interact with block forking
caused by write(2) to leave stale pages in cache - fix it.

 3) ENOSPC: Volume full conditions must be predicted by the frontend,
not detected in the backend when it is too late to enforce ACID
guarantees, and the prediction must be accurate or users will be
annoyed by ENOSPC errors on a volume that is far from full.

After that, plenty of issues remain before anyone should deploy Tux3 for
real work, however none are in the "fill up your volume and it eats
itself" category. Items 1 and 2 above are nearly done and item 3 is
designed in detail, so we are close to a flag day where we offer up the 
Tux3 patch for serious review.


You can watch our progress here:

https://github.com/OGAWAHirofumi/tux3/commits/hirofumi

and here:

   http://buildbot.tux3.org:8010/waterfall

This is the amazing test infrastructure Hirofumi set up using buildbot
and hardware contributed by Miracle Linux. It goes to work whenever new
patches arrive on Github. You can see it testing the allocation patches
that landed this weekend.

One thing that happened over the last couple of months is, we added
allocation group counts and thus adopted yet another main design
feature of Ext4. This required some new, persistent metadata, with a
risk of regressing our benchmarks, but we will actually end up more
efficient for reasons I will delve into on the Tux3 mailing list.

Incidentally, the Tux3 kernel patch grew very little over the last
year. In spite of many improvements, we remain just over 18K lines of
code including whitespace. By comparison, Ext4 is 52K, Btrfs is 94K and
XFS is 96K. Though none of these can be reasonably described as
bloated, Tux3 is tighter by a multiple.

Overall, we tend to devote as much work to removing code as adding it.
As a result, we think Tux3 upholds the traditional Unix Philosophy
pretty well. Though it is fashionable to attack this time honored credo
on the basis of practicality, you can have orthogonal design and great
functionality too. We view lightness and tightness as a major
contribution, ranking just as high as performance and resilience. This
is about both maintainability and personal satisfaction.

We do expect our code base to grow faster as the focus shifts from base
functionality to features and scaling. To name a few: snapshots; data
compression; directory indexing; online repair; quotas. But we also have
opportunities to remove code, so a year from now I expect a code base
that is only modestly bigger and includes most of this list.

Interested developers and testers are welcome to drop by for a chat:

http://tux3.org/contribute.html
irc.oftc.net #tux3

With most of the tedious groundwork out of the way, this is the fun
part of the process where we get to obsess endlessly over matters of
fit and finish.

Regards,

Daniel

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC 0/4] memcg: Low-limit reclaim

2014-02-03 Thread Greg Thelen
On Mon, Feb 03 2014, Michal Hocko wrote:

> On Thu 30-01-14 16:28:27, Greg Thelen wrote:
>> On Thu, Jan 30 2014, Michal Hocko wrote:
>> 
>> > On Wed 29-01-14 11:08:46, Greg Thelen wrote:
>> > [...]
>> >> The series looks useful.  We (Google) have been using something similar.
>> >> In practice such a low_limit (or memory guarantee), doesn't nest very
>> >> well.
>> >> 
>> >> Example:
>> >>   - parent_memcg: limit 500, low_limit 500, usage 500
>> >> 1 privately charged non-reclaimable page (e.g. mlock, slab)
>> >>   - child_memcg: limit 500, low_limit 500, usage 499
>> >
>> > I am not sure this is a good example. Your setup basically say that no
>> > single page should be reclaimed. I can imagine this might be useful in
>> > some cases and I would like to allow it but it sounds too extreme (e.g.
>> > a load which would start trashing heavily once the reclaim starts and it
>> > makes more sense to start it again rather than crowl - think about some
>> > mathematical simulation which might diverge).
>> 
>> Pages will still be reclaimed when usage_in_bytes exceeds
>> limit_in_bytes.  I see the low_limit as a way to tell the kernel: don't
>> reclaim my memory due to external pressure, but internal pressure is
>> different.
>
> That sounds strange and very confusing to me. What if the internal
> pressure comes from children memcgs? Lowlimit is intended for protecting
> a group from reclaim and it shouldn't matter whether the reclaim is a
> result of the internal or external pressure.
>
>> >> If a streaming file cache workload (e.g. sha1sum) starts gobbling up
>> >> page cache it will lead to an oom kill instead of reclaiming. 
>> >
>> > Does it make any sense to protect all of such memory although it is
>> > easily reclaimable?
>> 
>> I think protection makes sense in this case.  If I know my workload
>> needs 500 to operate well, then I reserve 500 using low_limit.  My app
>> doesn't want to run with less than its reservation.
>> 
>> >> One could argue that this is working as intended because child_memcg
>> >> was promised 500 but can only get 499.  So child_memcg is oom killed
>> >> rather than being forced to operate below its promised low limit.
>> >> 
>> >> This has led to various internal workarounds like:
>> >> - don't charge any memory to interior tree nodes (e.g. parent_memcg);
>> >>   only charge memory to cgroup leafs.  This gets tricky when dealing
>> >>   with reparented memory inherited to parent from child during cgroup
>> >>   deletion.
>> >
>> > Do those need any protection at all?
>> 
>> Interior tree nodes don't need protection from their children.  But
>> children and interior nodes need protection from siblings and parents.
>
> Why? They contains only reparented pages in the above case. Those would
> be #1 candidate for reclaim in most cases, no?

I think we're on the same page.  My example interior node has reclaimed
pages and is a #1 candidate for reclaim induced from charges against
parent_memcg, but not a candidate for reclaim due to global memory
pressure induced by a sibling of parent_memcg.

>> >> - don't set low_limit on non leafs (e.g. do not set low limit on
>> >>   parent_memcg).  This constrains the cgroup layout a bit.  Some
>> >>   customers want to purchase $MEM and setup their workload with a few
>> >>   child cgroups.  A system daemon hands out $MEM by setting low_limit
>> >>   for top-level containers (e.g. parent_memcg).  Thereafter such
>> >>   customers are able to partition their workload with sub memcg below
>> >>   child_memcg.  Example:
>> >>  parent_memcg
>> >>  \
>> >>   child_memcg
>> >> / \
>> >> server   backup
>> >
>> > I think that the low_limit makes sense where you actually want to
>> > protect something from reclaim. And backup sounds like a bad fit for
>> > that.
>> 
>> The backup job would presumably have a small low_limit, but it may still
>> have a minimum working set required to make useful forward progress.
>> 
>> Example:
>>   parent_memcg
>>   \
>>child_memcg limit 500, low_limit 500, usage 500
>>  / \
>>  |   backup   limit 10, low_limit 10, usage 10
>>  |
>>   server limit 490, low_limit 490, usage 490
>> 
>> One could argue that problems appear when
>> server.low_limit+backup.low_limit=child_memcg.limit.  So the safer
>> configuration is leave some padding:
>>   server.low_limit + backup.low_limit + padding = child_memcg.limit
>> but this just defers the problem.  As memory is reparented into parent,
>> then padding must grow.
>
> Which all sounds like a drawback of internal vs. external pressure
> semantic which you have mentioned above.

Huh?  I probably confused matters with the internal vs external talk
above.  Forgetting about that, I'm happy with the following
configuration assuming low_limit_fallback (ll_fallback) is eventually
available.

   parent_memcg
   \
child_memcg limit 500, low_limit 500, usage 500, ll_fallback 0
  

Re: + mm-utilc-add-kstrimdup.patch added to -mm tree

2014-02-03 Thread David Rientjes
On Mon, 3 Feb 2014, Sebastian Capella wrote:

> I am in the process of reworking the patches.  I'm not sure if Andrew
> was just interested in having the kstrimdup utility function available.
> 

I think the point Alexey is making is that whitespace is already stripped 
by the shell unless quoted; it seemed that you may have had a usecase 
where the string to be written would not be stripped by the shell for 
whatever reason and there was no convenient way to do it specifically for 
the file you were modifying?
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: + mm-utilc-add-kstrimdup.patch added to -mm tree

2014-02-03 Thread Sebastian Capella
Quoting David Rientjes (2014-02-03 17:05:04)
> The last we heard, I think Sebastian is looking to redo this series and 
> this patch is no longer needed.  Sebastian?
Hi David, Alexey,

I am in the process of reworking the patches.  I'm not sure if Andrew
was just interested in having the kstrimdup utility function available.

Isn't it too late to impose userspace trimming of newlines for sysfs?
It seems already fairly common and expected for the kernel to eat the
trailing whitespace, or at least ignore it.  If we change this won't we
be breaking userspace / tools / instructions / etc?

I'll try to post the new patches soon.

Thanks,

Sebastian

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[patch] mm, compaction: avoid isolating pinned pages fix

2014-02-03 Thread David Rientjes
On Tue, 4 Feb 2014, Joonsoo Kim wrote:

> I think that you need more code to skip this type of page correctly.
> Without page_mapped() check, this code makes migratable pages be skipped,
> since if page_mapped() case, page_count() may be more than zero.
> 
> So I think that you need following change.
> 
> (!page_mapping(page) && !page_mapped(page) && page_count(page))
> 

These pages returned by get_user_pages() will have a mapcount of 1 so this 
wouldn't actually fix the massive lock contention.  page_mapping() is only 
going to be NULL for pages off the lru like these are for 
PAGE_MAPPING_ANON.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v10 00/16] Volatile Ranges v10

2014-02-03 Thread Minchan Kim
On Fri, Jan 31, 2014 at 11:49:01AM -0500, Johannes Weiner wrote:
> On Wed, Jan 29, 2014 at 02:11:02PM +0900, Minchan Kim wrote:
> > It's interesting timing, I posted this patch on New Year's Day
> > and received an in-depth design review on Lunar New Year's Day. :)
> > It's almost 0-day review. :)
> 
> That's the only way I can do 0-day reviews ;)
> 
> > On Tue, Jan 28, 2014 at 07:03:59PM -0500, Johannes Weiner wrote:
> > > Hello Minchan,
> > > 
> > > On Thu, Jan 02, 2014 at 04:12:08PM +0900, Minchan Kim wrote:
> > > > Hey all,
> > > > 
> > > > Happy New Year!
> > > > 
> > > > I know it's bad timing to send this unfamiliar large patchset for
> > > > review but hope there are some guys with freshed-brain in new year
> > > > all over the world. :)
> > > > And most important thing is that before I dive into lots of testing,
> > > > I'd like to make an agreement on design issues and others
> > > > 
> > > > o Syscall interface
> > > 
> > > Why do we need another syscall for this?  Can't we extend madvise to
> > 
> > Yeb. I should have written the reason. Early versions in this patchset
> > had used madvise with VMA handling but it was terrible performance for
> > ebizzy workload by mmap_sem's downside lock due to merging/split VMA.
> > Even it was worse than old so I gave up the VMA approach.
> > 
> > You could see the difference.
> > https://lkml.org/lkml/2013/10/8/63
> 
> So the compared kernels are 4 releases apart and the test happened
> inside a VM.  It's also not really apparent from that link what the
> tested workload is doing.  We first have to agree that it's doing
> nothing that could be avoided.  E.g. we wouldn't introduce an
> optimized version of write() because an application that writes 4G at
> one byte per call is having problems.

About the ebizzy workload: the process allocates several chunks, then
threads start to allocate their own chunks and *copy* the content from a
random one of the preallocated chunks into their own chunk.
It means lots of threads are page-faulting so mmap_sem write-side
lock is really critical point for performance.
(I don't know ebizzy is really good for real practice but at least,
several papers and benchmark suites have used it so we couldn't
ignore. And per-thread allocator are really popular these days)

With VMA approach, we need mmap_sem write-side lock twice to mark/unmark
VM_VOLATILE in vma->vm_flags so with my experiment, the performance was
terrible as I said on link.

I don't think the situation of current kernel would be better than old.
And virtualization is a really important technique these days so we couldn't
ignore that although I tested it on VM for convenience. If you want,
I surely can test it on bare box.

> 
> The vroot lock has the same locking granularity as mmap_sem.  Why is
> mmap_sem more contended in this test?

It seems above explanation is enough.

> 
> > > take MADV_VOLATILE, MADV_NONVOLATILE, and return -ENOMEM if something
> > > in the range was purged?
> > 
> > In that case, -ENOMEM would have duplicated meaning "Purged" and "Out
> > of memory so failed in the middle of the system call processing" and
> > later could be a problem so we need to return value to indicate
> > how many bytes are succeeded so far so it means we need additional
> > out parameter. But yes, we can solve it by modifying semantic and
> > behavior (ex, as you said below, we could just unmark volatile
> > successfully if user pass (offset, len) consistent with marked volatile
> > ranges. (IOW, if we give up overlapping/subrange marking/unmarking
> > usecase. I expect it makes code simple further).
> > It's request from John so If he is okay, I'm no problem.
> 
> Yes, I don't insist on using madvise.  And it's too early to decide on
> an interface before we have fully nailed the semantics and features.
> 
> > > > o Not bind with vma split/merge logic to prevent mmap_sem cost and
> > > > o Not bind with vma split/merge logic to avoid vm_area_struct memory
> > > >   footprint.
> > > 
> > > VMAs are there to track attributes of memory ranges.  Duplicating
> > > large parts of their functionality and co-maintaining both structures
> > > on create, destroy, split, and merge means duplicate code and complex
> > > interactions.
> > > 
> > > 1. You need to define semantics and coordinate what happens when the
> > >vma underlying a volatile range changes.
> > > 
> > >Either you have to strictly co-maintain both range objects, or you
> > > >have weird behavior like volatility outliving a vma and then applying
> > >to a separate vma created in its place.
> > > 
> > >Userspace won't get this right, and even in the kernel this is
> > >error prone and adds a lot to the complexity of vma management.
> > 
> > Current semantic is following as,
> > Vma handling logic in mm doesn't need to know vrange handling because
> > vrange's internal logic always checks validity of the vma but
> > one thing to do in vma logic is only clearing old volatile ranges
> > on creating new vma.
> > (Look 

Re: + mm-utilc-add-kstrimdup.patch added to -mm tree

2014-02-03 Thread David Rientjes
On Mon, 3 Feb 2014, Alexey Dobriyan wrote:

> > kstrimdup() creates a whitespace-trimmed duplicate of the passed in
> > null-terminated string.  This is useful for strings coming from sysfs that
> > often include trailing whitespace due to user input.
> 
> I think kernel should be strict in what it accepts, otherwise
> case-insensitivity and
> what not will be added some day.
> 
> Shell will trim whitespace for you.
> 

The last we heard, I think Sebastian is looking to redo this series and 
this patch is no longer needed.  Sebastian?
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[patch 09/10] lib: radix_tree: tree node interface

2014-02-03 Thread Johannes Weiner
Make struct radix_tree_node part of the public interface and provide
API functions to create, look up, and delete whole nodes.  Refactor
the existing insert, look up, delete functions on top of these new
node primitives.

This will allow the VM to track and garbage collect page cache radix
tree nodes.

Signed-off-by: Johannes Weiner 
Reviewed-by: Rik van Riel 
---
 include/linux/radix-tree.h |  34 ++
 lib/radix-tree.c   | 261 +
 2 files changed, 180 insertions(+), 115 deletions(-)

diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index e8be53ecfc45..13636c40bc42 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -60,6 +60,33 @@ static inline int radix_tree_is_indirect_ptr(void *ptr)
 
 #define RADIX_TREE_MAX_TAGS 3
 
+#ifdef __KERNEL__
+#define RADIX_TREE_MAP_SHIFT   (CONFIG_BASE_SMALL ? 4 : 6)
+#else
+#define RADIX_TREE_MAP_SHIFT   3   /* For more stressful testing */
+#endif
+
+#define RADIX_TREE_MAP_SIZE(1UL << RADIX_TREE_MAP_SHIFT)
+#define RADIX_TREE_MAP_MASK(RADIX_TREE_MAP_SIZE-1)
+
+#define RADIX_TREE_TAG_LONGS   \
+   ((RADIX_TREE_MAP_SIZE + BITS_PER_LONG - 1) / BITS_PER_LONG)
+
+struct radix_tree_node {
+   unsigned intheight; /* Height from the bottom */
+   unsigned intcount;
+   union {
+   struct radix_tree_node *parent; /* Used when ascending tree */
+   struct rcu_head rcu_head;   /* Used when freeing node */
+   };
+   void __rcu  *slots[RADIX_TREE_MAP_SIZE];
+   unsigned long   tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS];
+};
+
+#define RADIX_TREE_INDEX_BITS  (8 /* CHAR_BIT */ * sizeof(unsigned long))
+#define RADIX_TREE_MAX_PATH (DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \
+ RADIX_TREE_MAP_SHIFT))
+
 /* root tags are stored in gfp_mask, shifted by __GFP_BITS_SHIFT */
 struct radix_tree_root {
unsigned intheight;
@@ -101,6 +128,7 @@ do {
\
  *   concurrently with other readers.
  *
  * The notable exceptions to this rule are the following functions:
+ * __radix_tree_lookup
  * radix_tree_lookup
  * radix_tree_lookup_slot
  * radix_tree_tag_get
@@ -216,9 +244,15 @@ static inline void radix_tree_replace_slot(void **pslot, 
void *item)
rcu_assign_pointer(*pslot, item);
 }
 
+int __radix_tree_create(struct radix_tree_root *root, unsigned long index,
+   struct radix_tree_node **nodep, void ***slotp);
 int radix_tree_insert(struct radix_tree_root *, unsigned long, void *);
+void *__radix_tree_lookup(struct radix_tree_root *root, unsigned long index,
+ struct radix_tree_node **nodep, void ***slotp);
 void *radix_tree_lookup(struct radix_tree_root *, unsigned long);
 void **radix_tree_lookup_slot(struct radix_tree_root *, unsigned long);
+bool __radix_tree_delete_node(struct radix_tree_root *root, unsigned long 
index,
+ struct radix_tree_node *node);
 void *radix_tree_delete_item(struct radix_tree_root *, unsigned long, void *);
 void *radix_tree_delete(struct radix_tree_root *, unsigned long);
 unsigned int
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index e8adb5d8a184..e601c56a43d0 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -35,33 +35,6 @@
 #include  /* in_interrupt() */
 
 
-#ifdef __KERNEL__
-#define RADIX_TREE_MAP_SHIFT   (CONFIG_BASE_SMALL ? 4 : 6)
-#else
-#define RADIX_TREE_MAP_SHIFT   3   /* For more stressful testing */
-#endif
-
-#define RADIX_TREE_MAP_SIZE(1UL << RADIX_TREE_MAP_SHIFT)
-#define RADIX_TREE_MAP_MASK(RADIX_TREE_MAP_SIZE-1)
-
-#define RADIX_TREE_TAG_LONGS   \
-   ((RADIX_TREE_MAP_SIZE + BITS_PER_LONG - 1) / BITS_PER_LONG)
-
-struct radix_tree_node {
-   unsigned intheight; /* Height from the bottom */
-   unsigned intcount;
-   union {
-   struct radix_tree_node *parent; /* Used when ascending tree */
-   struct rcu_head rcu_head;   /* Used when freeing node */
-   };
-   void __rcu  *slots[RADIX_TREE_MAP_SIZE];
-   unsigned long   tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS];
-};
-
-#define RADIX_TREE_INDEX_BITS  (8 /* CHAR_BIT */ * sizeof(unsigned long))
-#define RADIX_TREE_MAX_PATH (DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \
- RADIX_TREE_MAP_SHIFT))
-
 /*
  * The height_to_maxindex array needs to be one deeper than the maximum
  * path as height 0 holds only 1 entry.
@@ -387,23 +360,28 @@ out:
 }
 
 /**
- * radix_tree_insert-insert into a radix tree
+ * __radix_tree_create -   create a slot in a radix tree
  * @root:  radix tree root
  * @index: index key
- * @item:  item to insert
+ * @nodep: returns node
+ * @slotp: returns slot
  *
- * 

[patch 00/10] mm: thrash detection-based file cache sizing v9

2014-02-03 Thread Johannes Weiner
Changes in this revision

o Fix vmstat build problems on UP (Fengguang Wu's build bot)

o Clarify why optimistic radix_tree_node->private_list link checking
  is safe without holding the list_lru lock (Dave Chinner)

o Assert locking balance when the list_lru isolator says it dropped
  the list lock (Dave Chinner)

o Remove remnant of a manual reclaim counter in the shadow isolator,
  the list_lru-provided accounting is accurate now that we added
  LRU_REMOVED_RETRY (Dave Chinner)

o Set an object limit for the shadow shrinker instead of messing with
  its seeks setting.  The configured seeks define how pressure applied
  to pages translates to pressure on the object pool, in itself it is
  not enough to replace proper object valuation to classify expired
  and in-use objects.  Shadow nodes contain up to 64 shadow entries
  from different/alternating zones that have their own atomic age
  counter, so determining if a node is overall expired is crazy
  expensive.  Instead, use an object limit above which nodes are very
  likely to be expired.

o __pagevec_lookup and __find_get_pages kerneldoc fixes (Minchan Kim)

o radix_tree_node->count accessors for pages and shadows (Minchan Kim)

o Rebase to v3.14-rc1 and add review tags

Summary

The VM maintains cached filesystem pages on two types of lists.  One
list holds the pages recently faulted into the cache, the other list
holds pages that have been referenced repeatedly on that first list.
The idea is to prefer reclaiming young pages over those that have
shown to benefit from caching in the past.  We call the recently used
list "inactive list" and the frequently used list "active list".

Currently, the VM aims for a 1:1 ratio between the lists, which is the
"perfect" trade-off between the ability to *protect* frequently used
pages and the ability to *detect* frequently used pages.  This means
that working set changes bigger than half of cache memory go
undetected and thrash indefinitely, whereas working sets bigger than
half of cache memory are unprotected against used-once streams that
don't even need caching.

This happens on file servers and media streaming servers, where the
popular files and file sections change over time.  Even though the
individual files might be smaller than half of memory, concurrent
access to many of them may still result in their inter-reference
distance being greater than half of memory.  It's also been reported
as a problem on database workloads that switch back and forth between
tables that are bigger than half of memory.  In these cases the VM
never recognizes the new working set and will for the remainder of the
workload thrash disk data which could easily live in memory.

Historically, every reclaim scan of the inactive list also took a
smaller number of pages from the tail of the active list and moved
them to the head of the inactive list.  This model gave established
working sets more gracetime in the face of temporary use-once streams,
but ultimately was not significantly better than a FIFO policy and
still thrashed cache based on eviction speed, rather than actual
demand for cache.

This series solves the problem by maintaining a history of pages
evicted from the inactive list, enabling the VM to detect frequently
used pages regardless of inactive list size and facilitate working set
transitions.

Tests

The reported database workload is easily demonstrated on a 8G machine
with two filesets of 6G each.  This fio workload operates on one set first,
then switches to the other.  The VM should obviously always cache the
set that the workload is currently using.

This test is based on a problem encountered by Citus Data customers:
http://citusdata.com/blog/72-linux-memory-manager-and-your-big-data

unpatched:
db1: READ: io=98304MB, aggrb=885559KB/s, minb=885559KB/s, maxb=885559KB/s, 
mint= 113672msec, maxt= 113672msec
db2: READ: io=98304MB, aggrb= 66169KB/s, minb= 66169KB/s, maxb= 66169KB/s, 
mint=1521302msec, maxt=1521302msec
sdb: ios=835750/4, merge=2/1, ticks=4659739/60016, in_queue=4719203, util=98.92%

real    27m15.541s
user    0m19.059s
sys     0m51.459s

patched:
db1: READ: io=98304MB, aggrb=877783KB/s, minb=877783KB/s, maxb=877783KB/s, 
mint=114679msec, maxt=114679msec
db2: READ: io=98304MB, aggrb=397449KB/s, minb=397449KB/s, maxb=397449KB/s, 
mint=253273msec, maxt=253273msec
sdb: ios=170587/4, merge=2/1, ticks=954910/61123, in_queue=1015923, util=90.40%

real    6m8.630s
user    0m14.714s
sys     0m31.233s

As can be seen, the unpatched kernel simply never adapts to the
workingset change and db2 is stuck indefinitely with secondary storage
speed.  The patched kernel needs 2-3 iterations over db2 before it
replaces db1 and reaches full memory speed.  Given the unbounded
negative effect of the existing VM behavior, these patches should be
considered correctness fixes rather than performance optimizations.

Another test resembles a fileserver or streaming server workload,
where data in 

[patch 04/10] mm: shmem: save one radix tree lookup when truncating swapped pages

2014-02-03 Thread Johannes Weiner
Page cache radix tree slots are usually stabilized by the page lock,
but shmem's swap cookies have no such thing.  Because the overall
truncation loop is lockless, the swap entry is currently confirmed by
a tree lookup and then deleted by another tree lookup under the same
tree lock region.

Use radix_tree_delete_item() instead, which does the verification and
deletion with only one lookup.  This also allows removing the
delete-only special case from shmem_radix_tree_replace().

Signed-off-by: Johannes Weiner 
Reviewed-by: Minchan Kim 
Reviewed-by: Rik van Riel 
---
 mm/shmem.c | 25 -
 1 file changed, 12 insertions(+), 13 deletions(-)

diff --git a/mm/shmem.c b/mm/shmem.c
index 1f18c9d0d93e..e470997010cd 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -242,19 +242,17 @@ static int shmem_radix_tree_replace(struct address_space 
*mapping,
pgoff_t index, void *expected, void *replacement)
 {
void **pslot;
-   void *item = NULL;
+   void *item;
 
VM_BUG_ON(!expected);
+   VM_BUG_ON(!replacement);
pslot = radix_tree_lookup_slot(>page_tree, index);
-   if (pslot)
-   item = radix_tree_deref_slot_protected(pslot,
-   >tree_lock);
+   if (!pslot)
+   return -ENOENT;
+   item = radix_tree_deref_slot_protected(pslot, >tree_lock);
if (item != expected)
return -ENOENT;
-   if (replacement)
-   radix_tree_replace_slot(pslot, replacement);
-   else
-   radix_tree_delete(>page_tree, index);
+   radix_tree_replace_slot(pslot, replacement);
return 0;
 }
 
@@ -386,14 +384,15 @@ export:
 static int shmem_free_swap(struct address_space *mapping,
   pgoff_t index, void *radswap)
 {
-   int error;
+   void *old;
 
spin_lock_irq(>tree_lock);
-   error = shmem_radix_tree_replace(mapping, index, radswap, NULL);
+   old = radix_tree_delete_item(>page_tree, index, radswap);
spin_unlock_irq(>tree_lock);
-   if (!error)
-   free_swap_and_cache(radix_to_swp_entry(radswap));
-   return error;
+   if (old != radswap)
+   return -ENOENT;
+   free_swap_and_cache(radix_to_swp_entry(radswap));
+   return 0;
 }
 
 /*
-- 
1.8.5.3

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[patch 02/10] fs: cachefiles: use add_to_page_cache_lru()

2014-02-03 Thread Johannes Weiner
This code used to have its own lru cache pagevec up until a0b8cab3
("mm: remove lru parameter from __pagevec_lru_add and remove parts of
pagevec API").  Now it's just add_to_page_cache() followed by
lru_cache_add(), might as well use add_to_page_cache_lru() directly.

Signed-off-by: Johannes Weiner 
Reviewed-by: Rik van Riel 
Reviewed-by: Minchan Kim 
---
 fs/cachefiles/rdwr.c | 33 +
 1 file changed, 13 insertions(+), 20 deletions(-)

diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index ebaff368120d..4b1fb5ca65b8 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -265,24 +265,22 @@ static int cachefiles_read_backing_file_one(struct 
cachefiles_object *object,
goto nomem_monitor;
}
 
-   ret = add_to_page_cache(newpage, bmapping,
-   netpage->index, cachefiles_gfp);
+   ret = add_to_page_cache_lru(newpage, bmapping,
+   netpage->index, cachefiles_gfp);
if (ret == 0)
goto installed_new_backing_page;
if (ret != -EEXIST)
goto nomem_page;
}
 
-   /* we've installed a new backing page, so now we need to add it
-* to the LRU list and start it reading */
+   /* we've installed a new backing page, so now we need to start
+* it reading */
 installed_new_backing_page:
_debug("- new %p", newpage);
 
backpage = newpage;
newpage = NULL;
 
-   lru_cache_add_file(backpage);
-
 read_backing_page:
ret = bmapping->a_ops->readpage(NULL, backpage);
if (ret < 0)
@@ -510,24 +508,23 @@ static int cachefiles_read_backing_file(struct 
cachefiles_object *object,
goto nomem;
}
 
-   ret = add_to_page_cache(newpage, bmapping,
-   netpage->index, cachefiles_gfp);
+   ret = add_to_page_cache_lru(newpage, bmapping,
+   netpage->index,
+   cachefiles_gfp);
if (ret == 0)
goto installed_new_backing_page;
if (ret != -EEXIST)
goto nomem;
}
 
-   /* we've installed a new backing page, so now we need to add it
-* to the LRU list and start it reading */
+   /* we've installed a new backing page, so now we need
+* to start it reading */
installed_new_backing_page:
_debug("- new %p", newpage);
 
backpage = newpage;
newpage = NULL;
 
-   lru_cache_add_file(backpage);
-
reread_backing_page:
ret = bmapping->a_ops->readpage(NULL, backpage);
if (ret < 0)
@@ -538,8 +535,8 @@ static int cachefiles_read_backing_file(struct 
cachefiles_object *object,
monitor_backing_page:
_debug("- monitor add");
 
-   ret = add_to_page_cache(netpage, op->mapping, netpage->index,
-   cachefiles_gfp);
+   ret = add_to_page_cache_lru(netpage, op->mapping,
+   netpage->index, cachefiles_gfp);
if (ret < 0) {
if (ret == -EEXIST) {
page_cache_release(netpage);
@@ -549,8 +546,6 @@ static int cachefiles_read_backing_file(struct 
cachefiles_object *object,
goto nomem;
}
 
-   lru_cache_add_file(netpage);
-
/* install a monitor */
page_cache_get(netpage);
monitor->netfs_page = netpage;
@@ -613,8 +608,8 @@ static int cachefiles_read_backing_file(struct 
cachefiles_object *object,
backing_page_already_uptodate:
_debug("- uptodate");
 
-   ret = add_to_page_cache(netpage, op->mapping, netpage->index,
-   cachefiles_gfp);
+   ret = add_to_page_cache_lru(netpage, op->mapping,
+   netpage->index, cachefiles_gfp);
if (ret < 0) {
if (ret == -EEXIST) {
page_cache_release(netpage);
@@ -631,8 +626,6 @@ static int cachefiles_read_backing_file(struct 
cachefiles_object *object,
 
fscache_mark_page_cached(op, netpage);
 
-   lru_cache_add_file(netpage);
-
/* the netpage is unlocked and marked up to date here */
fscache_end_io(op, netpage, 0);
page_cache_release(netpage);
-- 
1.8.5.3

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in

[patch 07/10] mm + fs: store shadow entries in page cache

2014-02-03 Thread Johannes Weiner
Reclaim will be leaving shadow entries in the page cache radix tree
upon evicting the real page.  As those pages are found from the LRU,
an iput() can lead to the inode being freed concurrently.  At this
point, reclaim must no longer install shadow pages because the inode
freeing code needs to ensure the page tree is really empty.

Add an address_space flag, AS_EXITING, that the inode freeing code
sets under the tree lock before doing the final truncate.  Reclaim
will check for this flag before installing shadow pages.

Signed-off-by: Johannes Weiner 
Reviewed-by: Rik van Riel 
Reviewed-by: Minchan Kim 
---
 Documentation/filesystems/porting   |  6 +--
 drivers/staging/lustre/lustre/llite/llite_lib.c |  2 +-
 fs/9p/vfs_inode.c   |  2 +-
 fs/affs/inode.c |  2 +-
 fs/afs/inode.c  |  2 +-
 fs/bfs/inode.c  |  2 +-
 fs/block_dev.c  |  4 +-
 fs/btrfs/inode.c|  2 +-
 fs/cifs/cifsfs.c|  2 +-
 fs/coda/inode.c |  2 +-
 fs/ecryptfs/super.c |  2 +-
 fs/exofs/inode.c|  2 +-
 fs/ext2/inode.c |  2 +-
 fs/ext3/inode.c |  2 +-
 fs/ext4/inode.c |  4 +-
 fs/f2fs/inode.c |  2 +-
 fs/fat/inode.c  |  2 +-
 fs/freevxfs/vxfs_inode.c|  2 +-
 fs/fuse/inode.c |  2 +-
 fs/gfs2/super.c |  2 +-
 fs/hfs/inode.c  |  2 +-
 fs/hfsplus/super.c  |  2 +-
 fs/hostfs/hostfs_kern.c |  2 +-
 fs/hpfs/inode.c |  2 +-
 fs/inode.c  |  4 +-
 fs/jffs2/fs.c   |  2 +-
 fs/jfs/inode.c  |  4 +-
 fs/kernfs/inode.c   |  2 +-
 fs/logfs/readwrite.c|  2 +-
 fs/minix/inode.c|  2 +-
 fs/ncpfs/inode.c|  2 +-
 fs/nfs/inode.c  |  2 +-
 fs/nfs/nfs4super.c  |  2 +-
 fs/nilfs2/inode.c   |  6 +--
 fs/ntfs/inode.c |  2 +-
 fs/ocfs2/inode.c|  4 +-
 fs/omfs/inode.c |  2 +-
 fs/proc/inode.c |  2 +-
 fs/reiserfs/inode.c |  2 +-
 fs/sysv/inode.c |  2 +-
 fs/ubifs/super.c|  2 +-
 fs/udf/inode.c  |  4 +-
 fs/ufs/inode.c  |  2 +-
 fs/xfs/xfs_super.c  |  2 +-
 include/linux/fs.h  |  1 +
 include/linux/mm.h  |  1 +
 include/linux/pagemap.h | 13 +-
 mm/filemap.c| 33 ---
 mm/truncate.c   | 54 +++--
 mm/vmscan.c |  2 +-
 50 files changed, 147 insertions(+), 65 deletions(-)

diff --git a/Documentation/filesystems/porting 
b/Documentation/filesystems/porting
index fe2b7ae6f962..0f3a1390bf00 100644
--- a/Documentation/filesystems/porting
+++ b/Documentation/filesystems/porting
@@ -295,9 +295,9 @@ in the beginning of ->setattr unconditionally.
->clear_inode() and ->delete_inode() are gone; ->evict_inode() should
 be used instead.  It gets called whenever the inode is evicted, whether it has
 remaining links or not.  Caller does *not* evict the pagecache or 
inode-associated
-metadata buffers; getting rid of those is responsibility of method, as it had
-been for ->delete_inode(). Caller makes sure async writeback cannot be running
-for the inode while (or after) ->evict_inode() is called.
+metadata buffers; the method has to use truncate_inode_pages_final() to get rid
+of those. Caller makes sure async writeback cannot be running for the inode 
while
+(or after) ->evict_inode() is called.
 
->drop_inode() returns int now; it's called on final iput() with
 inode->i_lock held and it returns true if filesystems wants the inode to be
diff --git a/drivers/staging/lustre/lustre/llite/llite_lib.c 
b/drivers/staging/lustre/lustre/llite/llite_lib.c
index 6cfdb9e4b74b..fc6aac3cfe00 100644
--- a/drivers/staging/lustre/lustre/llite/llite_lib.c
+++ b/drivers/staging/lustre/lustre/llite/llite_lib.c
@@ -1877,7 +1877,7 @@ void ll_delete_inode(struct inode *inode)

[patch 10/10] mm: keep page cache radix tree nodes in check

2014-02-03 Thread Johannes Weiner
Previously, page cache radix tree nodes were freed after reclaim
emptied out their page pointers.  But now reclaim stores shadow
entries in their place, which are only reclaimed when the inodes
themselves are reclaimed.  This is problematic for bigger files that
are still in use after they have a significant amount of their cache
reclaimed, without any of those pages actually refaulting.  The shadow
entries will just sit there and waste memory.  In the worst case, the
shadow entries will accumulate until the machine runs out of memory.

To get this under control, the VM will track radix tree nodes
exclusively containing shadow entries on a per-NUMA node list.
Per-NUMA rather than global because we expect the radix tree nodes
themselves to be allocated node-locally and we want to reduce
cross-node references of otherwise independent cache workloads.  A
simple shrinker will then reclaim these nodes on memory pressure.

A few things need to be stored in the radix tree node to implement the
shadow node LRU and allow tree deletions coming from the list:

1. There is no index available that would describe the reverse path
   from the node up to the tree root, which is needed to perform a
   deletion.  To solve this, encode in each node its offset inside the
   parent.  This can be stored in the unused upper bits of the same
   member that stores the node's height at no extra space cost.

2. The number of shadow entries needs to be counted in addition to the
   regular entries, to quickly detect when the node is ready to go to
   the shadow node LRU list.  The current entry count is an unsigned
   int but the maximum number of entries is 64, so a shadow counter
   can easily be stored in the unused upper bits.

3. Tree modification needs tree lock and tree root, which are located
   in the address space, so store an address_space backpointer in the
   node.  The parent pointer of the node is in a union with the 2-word
   rcu_head, so the backpointer comes at no extra cost as well.

4. The node needs to be linked to an LRU list, which requires a list
   head inside the node.  This does increase the size of the node, but
   it does not change the number of objects that fit into a slab page.

Signed-off-by: Johannes Weiner 
Reviewed-by: Rik van Riel 
Reviewed-by: Minchan Kim 
---
 include/linux/list_lru.h   |   2 +
 include/linux/mmzone.h |   1 +
 include/linux/radix-tree.h |  32 +++---
 include/linux/swap.h   |  31 ++
 lib/radix-tree.c   |  36 +++-
 mm/filemap.c   |  90 +++-
 mm/list_lru.c  |  10 
 mm/truncate.c  |  26 -
 mm/vmstat.c|   1 +
 mm/workingset.c| 143 +
 10 files changed, 332 insertions(+), 40 deletions(-)

diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h
index 3ce541753c88..b02fc233eadd 100644
--- a/include/linux/list_lru.h
+++ b/include/linux/list_lru.h
@@ -13,6 +13,8 @@
 /* list_lru_walk_cb has to always return one of those */
 enum lru_status {
LRU_REMOVED,/* item removed from list */
+   LRU_REMOVED_RETRY,  /* item removed, but lock has been
+  dropped and reacquired */
LRU_ROTATE, /* item referenced, give another pass */
LRU_SKIP,   /* item cannot be locked, skip */
LRU_RETRY,  /* item not freeable. May drop the lock
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index b4bdeb411a4d..934820b3249c 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -144,6 +144,7 @@ enum zone_stat_item {
 #endif
WORKINGSET_REFAULT,
WORKINGSET_ACTIVATE,
+   WORKINGSET_NODERECLAIM,
NR_ANON_TRANSPARENT_HUGEPAGES,
NR_FREE_CMA_PAGES,
NR_VM_ZONE_STAT_ITEMS };
diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index 13636c40bc42..33170dbd9db4 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -72,21 +72,37 @@ static inline int radix_tree_is_indirect_ptr(void *ptr)
 #define RADIX_TREE_TAG_LONGS   \
((RADIX_TREE_MAP_SIZE + BITS_PER_LONG - 1) / BITS_PER_LONG)
 
+#define RADIX_TREE_INDEX_BITS  (8 /* CHAR_BIT */ * sizeof(unsigned long))
+#define RADIX_TREE_MAX_PATH (DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \
+ RADIX_TREE_MAP_SHIFT))
+
+/* Height component in node->path */
+#define RADIX_TREE_HEIGHT_SHIFT(RADIX_TREE_MAX_PATH + 1)
+#define RADIX_TREE_HEIGHT_MASK ((1UL << RADIX_TREE_HEIGHT_SHIFT) - 1)
+
+/* Internally used bits of node->count */
+#define RADIX_TREE_COUNT_SHIFT (RADIX_TREE_MAP_SHIFT + 1)
+#define RADIX_TREE_COUNT_MASK  ((1UL << RADIX_TREE_COUNT_SHIFT) - 1)
+
 struct radix_tree_node {
-   unsigned intheight; /* Height from the bottom */
+   unsigned intpath;   /* Offset in parent & height from the bottom */

[patch 01/10] mm: vmstat: fix UP zone state accounting

2014-02-03 Thread Johannes Weiner
Fengguang Wu's build testing spotted problems with inc_zone_state()
and dec_zone_state() on UP configurations in out-of-tree patches.

inc_zone_state() is declared but not defined, dec_zone_state() is
missing entirely.

Just like with *_zone_page_state(), they can be defined like their
preemption-unsafe counterparts on UP.

Signed-off-by: Johannes Weiner 
---
 include/linux/vmstat.h | 29 +++--
 1 file changed, 15 insertions(+), 14 deletions(-)

diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index a67b38415768..a32dbd2c2155 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -179,8 +179,6 @@ extern void zone_statistics(struct zone *, struct zone *, 
gfp_t gfp);
 #define add_zone_page_state(__z, __i, __d) mod_zone_page_state(__z, __i, __d)
 #define sub_zone_page_state(__z, __i, __d) mod_zone_page_state(__z, __i, 
-(__d))
 
-extern void inc_zone_state(struct zone *, enum zone_stat_item);
-
 #ifdef CONFIG_SMP
 void __mod_zone_page_state(struct zone *, enum zone_stat_item item, int);
 void __inc_zone_page_state(struct page *, enum zone_stat_item);
@@ -216,24 +214,12 @@ static inline void __mod_zone_page_state(struct zone 
*zone,
zone_page_state_add(delta, zone, item);
 }
 
-static inline void __inc_zone_state(struct zone *zone, enum zone_stat_item 
item)
-{
-   atomic_long_inc(>vm_stat[item]);
-   atomic_long_inc(_stat[item]);
-}
-
 static inline void __inc_zone_page_state(struct page *page,
enum zone_stat_item item)
 {
__inc_zone_state(page_zone(page), item);
 }
 
-static inline void __dec_zone_state(struct zone *zone, enum zone_stat_item 
item)
-{
-   atomic_long_dec(>vm_stat[item]);
-   atomic_long_dec(_stat[item]);
-}
-
 static inline void __dec_zone_page_state(struct page *page,
enum zone_stat_item item)
 {
@@ -248,6 +234,21 @@ static inline void __dec_zone_page_state(struct page *page,
 #define dec_zone_page_state __dec_zone_page_state
 #define mod_zone_page_state __mod_zone_page_state
 
+static inline void __inc_zone_state(struct zone *zone, enum zone_stat_item 
item)
+{
+   atomic_long_inc(>vm_stat[item]);
+   atomic_long_inc(_stat[item]);
+}
+
+static inline void __dec_zone_state(struct zone *zone, enum zone_stat_item 
item)
+{
+   atomic_long_dec(>vm_stat[item]);
+   atomic_long_dec(_stat[item]);
+}
+
+#define inc_zone_state __inc_zone_state
+#define dec_zone_state __dec_zone_state
+
 #define set_pgdat_percpu_threshold(pgdat, callback) { }
 
 static inline void refresh_cpu_vm_stats(int cpu) { }
-- 
1.8.5.3

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[patch 08/10] mm: thrash detection-based file cache sizing

2014-02-03 Thread Johannes Weiner
The VM maintains cached filesystem pages on two types of lists.  One
list holds the pages recently faulted into the cache, the other list
holds pages that have been referenced repeatedly on that first list.
The idea is to prefer reclaiming young pages over those that have
shown to benefit from caching in the past.  We call the recently used
list "inactive list" and the frequently used list "active list".

Currently, the VM aims for a 1:1 ratio between the lists, which is the
"perfect" trade-off between the ability to *protect* frequently used
pages and the ability to *detect* frequently used pages.  This means
that working set changes bigger than half of cache memory go
undetected and thrash indefinitely, whereas working sets bigger than
half of cache memory are unprotected against used-once streams that
don't even need caching.

Historically, every reclaim scan of the inactive list also took a
smaller number of pages from the tail of the active list and moved
them to the head of the inactive list.  This model gave established
working sets more gracetime in the face of temporary use-once streams,
but ultimately was not significantly better than a FIFO policy and
still thrashed cache based on eviction speed, rather than actual
demand for cache.

This patch solves one half of the problem by decoupling the ability to
detect working set changes from the inactive list size.  By
maintaining a history of recently evicted file pages it can detect
frequently used pages with an arbitrarily small inactive list size,
and subsequently apply pressure on the active list based on actual
demand for cache, not just overall eviction speed.

Every zone maintains a counter that tracks inactive list aging speed.
When a page is evicted, a snapshot of this counter is stored in the
now-empty page cache radix tree slot.  On refault, the minimum access
distance of the page can be assessed, to evaluate whether the page
should be part of the active list or not.

This fixes the VM's blindness towards working set changes in excess of
the inactive list.  And it's the foundation to further improve the
protection ability and reduce the minimum inactive list size of 50%.

Signed-off-by: Johannes Weiner 
Reviewed-by: Rik van Riel 
Reviewed-by: Minchan Kim 
Reviewed-by: Bob Liu 
---
 include/linux/mmzone.h |   5 +
 include/linux/swap.h   |   5 +
 mm/Makefile|   2 +-
 mm/filemap.c   |  61 
 mm/swap.c  |   2 +
 mm/vmscan.c|  24 -
 mm/vmstat.c|   2 +
 mm/workingset.c| 253 +
 8 files changed, 331 insertions(+), 23 deletions(-)
 create mode 100644 mm/workingset.c

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 5f2052c83154..b4bdeb411a4d 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -142,6 +142,8 @@ enum zone_stat_item {
NUMA_LOCAL, /* allocation from local node */
NUMA_OTHER, /* allocation from other node */
 #endif
+   WORKINGSET_REFAULT,
+   WORKINGSET_ACTIVATE,
NR_ANON_TRANSPARENT_HUGEPAGES,
NR_FREE_CMA_PAGES,
NR_VM_ZONE_STAT_ITEMS };
@@ -392,6 +394,9 @@ struct zone {
spinlock_t  lru_lock;
struct lruvec   lruvec;
 
+   /* Evictions & activations on the inactive file list */
+   atomic_long_t   inactive_age;
+
unsigned long   pages_scanned; /* since last reclaim */
unsigned long   flags; /* zone flags, see below */
 
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 46ba0c6c219f..b83cf61403ed 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -260,6 +260,11 @@ struct swap_list_t {
int next;   /* swapfile to be used next */
 };
 
+/* linux/mm/workingset.c */
+void *workingset_eviction(struct address_space *mapping, struct page *page);
+bool workingset_refault(void *shadow);
+void workingset_activation(struct page *page);
+
 /* linux/mm/page_alloc.c */
 extern unsigned long totalram_pages;
 extern unsigned long totalreserve_pages;
diff --git a/mm/Makefile b/mm/Makefile
index 310c90a09264..cdd741519ee0 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -17,7 +17,7 @@ obj-y := filemap.o mempool.o oom_kill.o 
fadvise.o \
   util.o mmzone.o vmstat.o backing-dev.o \
   mm_init.o mmu_context.o percpu.o slab_common.o \
   compaction.o balloon_compaction.o \
-  interval_tree.o list_lru.o $(mmu-y)
+  interval_tree.o list_lru.o workingset.o $(mmu-y)
 
 obj-y += init-mm.o
 
diff --git a/mm/filemap.c b/mm/filemap.c
index 18f80d418f83..33ceebf4d577 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -469,7 +469,7 @@ int replace_page_cache_page(struct page *old, struct page 
*new, gfp_t gfp_mask)
 EXPORT_SYMBOL_GPL(replace_page_cache_page);
 
 static int 

[patch 05/10] mm: filemap: move radix tree hole searching here

2014-02-03 Thread Johannes Weiner
The radix tree hole searching code is only used for page cache, for
example the readahead code trying to get a picture of the area
surrounding a fault.

It sufficed to rely on the radix tree definition of holes, which is
"empty tree slot".  This is about to change, though, as shadow
page descriptors will be stored in the page cache after the actual
pages get evicted from memory.

Move the functions over to mm/filemap.c and make them native page
cache operations, where they can later be adapted to handle the new
definition of "page cache hole".

Signed-off-by: Johannes Weiner 
Reviewed-by: Rik van Riel 
Reviewed-by: Minchan Kim 
---
 fs/nfs/blocklayout/blocklayout.c |  2 +-
 include/linux/pagemap.h  |  5 +++
 include/linux/radix-tree.h   |  4 ---
 lib/radix-tree.c | 75 ---
 mm/filemap.c | 76 
 mm/readahead.c   |  4 +--
 6 files changed, 84 insertions(+), 82 deletions(-)

diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 56ff823ca82e..65d849bdf77a 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -1213,7 +1213,7 @@ static u64 pnfs_num_cont_bytes(struct inode *inode, 
pgoff_t idx)
end = DIV_ROUND_UP(i_size_read(inode), PAGE_CACHE_SIZE);
if (end != NFS_I(inode)->npages) {
rcu_read_lock();
-   end = radix_tree_next_hole(>page_tree, idx + 1, 
ULONG_MAX);
+   end = page_cache_next_hole(mapping, idx + 1, ULONG_MAX);
rcu_read_unlock();
}
 
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 1710d1b060ba..52d56872fe26 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -243,6 +243,11 @@ static inline struct page 
*page_cache_alloc_readahead(struct address_space *x)
 
 typedef int filler_t(void *, struct page *);
 
+pgoff_t page_cache_next_hole(struct address_space *mapping,
+pgoff_t index, unsigned long max_scan);
+pgoff_t page_cache_prev_hole(struct address_space *mapping,
+pgoff_t index, unsigned long max_scan);
+
 extern struct page * find_get_page(struct address_space *mapping,
pgoff_t index);
 extern struct page * find_lock_page(struct address_space *mapping,
diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index 1bf0a9c388d9..e8be53ecfc45 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -227,10 +227,6 @@ radix_tree_gang_lookup(struct radix_tree_root *root, void 
**results,
 unsigned int radix_tree_gang_lookup_slot(struct radix_tree_root *root,
void ***results, unsigned long *indices,
unsigned long first_index, unsigned int max_items);
-unsigned long radix_tree_next_hole(struct radix_tree_root *root,
-   unsigned long index, unsigned long max_scan);
-unsigned long radix_tree_prev_hole(struct radix_tree_root *root,
-   unsigned long index, unsigned long max_scan);
 int radix_tree_preload(gfp_t gfp_mask);
 int radix_tree_maybe_preload(gfp_t gfp_mask);
 void radix_tree_init(void);
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index f442e3243607..e8adb5d8a184 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -946,81 +946,6 @@ next:
 }
 EXPORT_SYMBOL(radix_tree_range_tag_if_tagged);
 
-
-/**
- * radix_tree_next_hole-find the next hole (not-present entry)
- * @root:  tree root
- * @index: index key
- * @max_scan:  maximum range to search
- *
- * Search the set [index, min(index+max_scan-1, MAX_INDEX)] for the lowest
- * indexed hole.
- *
- * Returns: the index of the hole if found, otherwise returns an index
- * outside of the set specified (in which case 'return - index >= max_scan'
- * will be true). In rare cases of index wrap-around, 0 will be returned.
- *
- * radix_tree_next_hole may be called under rcu_read_lock. However, like
- * radix_tree_gang_lookup, this will not atomically search a snapshot of
- * the tree at a single point in time. For example, if a hole is created
- * at index 5, then subsequently a hole is created at index 10,
- * radix_tree_next_hole covering both indexes may return 10 if called
- * under rcu_read_lock.
- */
-unsigned long radix_tree_next_hole(struct radix_tree_root *root,
-   unsigned long index, unsigned long max_scan)
-{
-   unsigned long i;
-
-   for (i = 0; i < max_scan; i++) {
-   if (!radix_tree_lookup(root, index))
-   break;
-   index++;
-   if (index == 0)
-   break;
-   }
-
-   return index;
-}
-EXPORT_SYMBOL(radix_tree_next_hole);
-
-/**
- * radix_tree_prev_hole-find the prev hole 

[patch 06/10] mm + fs: prepare for non-page entries in page cache radix trees

2014-02-03 Thread Johannes Weiner
shmem mappings already contain exceptional entries where swap slot
information is remembered.

To be able to store eviction information for regular page cache,
prepare every site dealing with the radix trees directly to handle
entries other than pages.

The common lookup functions will filter out non-page entries and
return NULL for page cache holes, just as before.  But provide a raw
version of the API which returns non-page entries as well, and switch
shmem over to use it.

Signed-off-by: Johannes Weiner 
Reviewed-by: Rik van Riel 
Reviewed-by: Minchan Kim 
---
 fs/btrfs/compression.c   |   2 +-
 include/linux/mm.h   |   8 ++
 include/linux/pagemap.h  |  15 ++--
 include/linux/pagevec.h  |   3 +
 include/linux/shmem_fs.h |   1 +
 mm/filemap.c | 197 +--
 mm/mincore.c |  20 +++--
 mm/readahead.c   |   2 +-
 mm/shmem.c   |  97 +--
 mm/swap.c|  48 
 mm/truncate.c|  73 ++
 11 files changed, 338 insertions(+), 128 deletions(-)

diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index e2600cdb6c25..1b8d21b681f2 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -472,7 +472,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
rcu_read_lock();
page = radix_tree_lookup(>page_tree, pg_index);
rcu_read_unlock();
-   if (page) {
+   if (page && !radix_tree_exceptional_entry(page)) {
misses++;
if (misses > 4)
break;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index f28f46eade6a..d684ac125482 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1031,6 +1031,14 @@ extern void show_free_areas(unsigned int flags);
 extern bool skip_free_areas_node(unsigned int flags, int nid);
 
 int shmem_zero_setup(struct vm_area_struct *);
+#ifdef CONFIG_SHMEM
+bool shmem_mapping(struct address_space *mapping);
+#else
+static inline bool shmem_mapping(struct address_space *mapping)
+{
+   return false;
+}
+#endif
 
 extern int can_do_mlock(void);
 extern int user_shm_lock(size_t, struct user_struct *);
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 52d56872fe26..2eeca3c83b0f 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -248,12 +248,15 @@ pgoff_t page_cache_next_hole(struct address_space 
*mapping,
 pgoff_t page_cache_prev_hole(struct address_space *mapping,
 pgoff_t index, unsigned long max_scan);
 
-extern struct page * find_get_page(struct address_space *mapping,
-   pgoff_t index);
-extern struct page * find_lock_page(struct address_space *mapping,
-   pgoff_t index);
-extern struct page * find_or_create_page(struct address_space *mapping,
-   pgoff_t index, gfp_t gfp_mask);
+struct page *__find_get_page(struct address_space *mapping, pgoff_t offset);
+struct page *find_get_page(struct address_space *mapping, pgoff_t offset);
+struct page *__find_lock_page(struct address_space *mapping, pgoff_t offset);
+struct page *find_lock_page(struct address_space *mapping, pgoff_t offset);
+struct page *find_or_create_page(struct address_space *mapping, pgoff_t index,
+gfp_t gfp_mask);
+unsigned __find_get_pages(struct address_space *mapping, pgoff_t start,
+ unsigned int nr_pages, struct page **pages,
+ pgoff_t *indices);
 unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
unsigned int nr_pages, struct page **pages);
 unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t start,
diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h
index e4dbfab37729..3c6b8b1e945b 100644
--- a/include/linux/pagevec.h
+++ b/include/linux/pagevec.h
@@ -22,6 +22,9 @@ struct pagevec {
 
 void __pagevec_release(struct pagevec *pvec);
 void __pagevec_lru_add(struct pagevec *pvec);
+unsigned __pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
+ pgoff_t start, unsigned nr_pages, pgoff_t *indices);
+void pagevec_remove_exceptionals(struct pagevec *pvec);
 unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
pgoff_t start, unsigned nr_pages);
 unsigned pagevec_lookup_tag(struct pagevec *pvec,
diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index 9d55438bc4ad..4d1771c2d29f 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -51,6 +51,7 @@ extern struct file *shmem_kernel_file_setup(const char *name, 
loff_t size,
unsigned long flags);
 extern int shmem_zero_setup(struct vm_area_struct *);
 extern int shmem_lock(struct file *file, int 

[patch 03/10] lib: radix-tree: radix_tree_delete_item()

2014-02-03 Thread Johannes Weiner
Provide a function that does not just delete an entry at a given
index, but also allows passing in an expected item.  Delete only if
that item is still located at the specified index.

This is handy when lockless tree traversals want to delete entries as
well because they don't have to do a second, locked lookup to verify
the slot has not changed under them before deleting the entry.

Signed-off-by: Johannes Weiner 
Reviewed-by: Minchan Kim 
Reviewed-by: Rik van Riel 
---
 include/linux/radix-tree.h |  1 +
 lib/radix-tree.c   | 31 +++
 2 files changed, 28 insertions(+), 4 deletions(-)

diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index 403940787be1..1bf0a9c388d9 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -219,6 +219,7 @@ static inline void radix_tree_replace_slot(void **pslot, 
void *item)
 int radix_tree_insert(struct radix_tree_root *, unsigned long, void *);
 void *radix_tree_lookup(struct radix_tree_root *, unsigned long);
 void **radix_tree_lookup_slot(struct radix_tree_root *, unsigned long);
+void *radix_tree_delete_item(struct radix_tree_root *, unsigned long, void *);
 void *radix_tree_delete(struct radix_tree_root *, unsigned long);
 unsigned int
 radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index 7811ed3b4e70..f442e3243607 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -1335,15 +1335,18 @@ static inline void radix_tree_shrink(struct 
radix_tree_root *root)
 }
 
 /**
- * radix_tree_delete-delete an item from a radix tree
+ * radix_tree_delete_item-delete an item from a radix tree
  * @root:  radix tree root
  * @index: index key
+ * @item:  expected item
  *
- * Remove the item at @index from the radix tree rooted at @root.
+ * Remove @item at @index from the radix tree rooted at @root.
  *
- * Returns the address of the deleted item, or NULL if it was not present.
+ * Returns the address of the deleted item, or NULL if it was not present
+ * or the entry at the given @index was not @item.
  */
-void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
+void *radix_tree_delete_item(struct radix_tree_root *root,
+unsigned long index, void *item)
 {
struct radix_tree_node *node = NULL;
struct radix_tree_node *slot = NULL;
@@ -1378,6 +1381,11 @@ void *radix_tree_delete(struct radix_tree_root *root, 
unsigned long index)
if (slot == NULL)
goto out;
 
+   if (item && slot != item) {
+   slot = NULL;
+   goto out;
+   }
+
/*
 * Clear all tags associated with the item to be deleted.
 * This way of doing it would be inefficient, but seldom is any set.
@@ -1422,6 +1430,21 @@ void *radix_tree_delete(struct radix_tree_root *root, 
unsigned long index)
 out:
return slot;
 }
+EXPORT_SYMBOL(radix_tree_delete_item);
+
+/**
+ * radix_tree_delete-delete an item from a radix tree
+ * @root:  radix tree root
+ * @index: index key
+ *
+ * Remove the item at @index from the radix tree rooted at @root.
+ *
+ * Returns the address of the deleted item, or NULL if it was not present.
+ */
+void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
+{
+   return radix_tree_delete_item(root, index, NULL);
+}
 EXPORT_SYMBOL(radix_tree_delete);
 
 /**
-- 
1.8.5.3

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 2/2] numa, mem-hotplug: Fix array index overflow when synchronizing nid to memblock.reserved.

2014-02-03 Thread Josh Boyer
On Tue, Jan 28, 2014 at 10:24 AM, Dave Jones  wrote:
> On Tue, Jan 28, 2014 at 05:05:16PM +0800, Tang Chen wrote:
>  > The following path will cause array out of bound.
>  >
>  > memblock_add_region() will always set nid in memblock.reserved to 
> MAX_NUMNODES.
>  > In numa_register_memblks(), after we set all nid to correct valus in 
> memblock.reserved,
>  > we called setup_node_data(), and used memblock_alloc_nid() to allocate 
> memory, with
>  > nid set to MAX_NUMNODES.
>  >
>  > The nodemask_t type can be seen as a bit array. And the index is 0 ~ 
> MAX_NUMNODES-1.
>  >
>  > After that, when we call node_set() in numa_clear_kernel_node_hotplug(), 
> the nodemask_t
>  > got an index of value MAX_NUMNODES, which is out of [0 ~ MAX_NUMNODES-1].
>  >
>  > See below:
>  >
>  > numa_init()
>  >  |---> numa_register_memblks()
>  >  |  |---> memblock_set_node(memory)  set correct nid in 
> memblock.memory
>  >  |  |---> memblock_set_node(reserved)set correct nid in 
> memblock.reserved
>  >  |  |..
>  >  |  |---> setup_node_data()
>  >  | |---> memblock_alloc_nid()here, nid is set to 
> MAX_NUMNODES (1024)
>  >  |..
>  >  |---> numa_clear_kernel_node_hotplug()
>  > |---> node_set() here, we have an index 1024, 
> and overflowed
>  >
>  > This patch moves nid setting to numa_clear_kernel_node_hotplug() to fix 
> this problem.
>  >
>  > Reported-by: Dave Jones 
>  > Signed-off-by: Tang Chen 
>  > Tested-by: Gu Zheng 
>  > ---
>  >  arch/x86/mm/numa.c | 19 +++
>  >  1 file changed, 11 insertions(+), 8 deletions(-)
>
> This does seem to solve the problem (In conjunction with David's variant of 
> the other patch).

Is this (and the first in the series) going to land in Linus' tree
soon?  I don't see them in -rc1 and people are still hitting the early
oops Dave did without this.

josh
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


linux-next: manual merge of the parisc-hd tree with Linus' tree

2014-02-03 Thread Stephen Rothwell
Hi all,

Today's linux-next merge of the parisc-hd tree got a conflict in
arch/parisc/include/asm/elf.h between commit 9dabf60dc4ab ("parisc: add
flexible mmap memory layout support") from Linus' tree and commit
13de8ec38997 ("parisc: add flexible mmap memory layout support") from the
parisc-hd tree.

It seems that this tree was rebased before being sent to Linus (I
understand why) so I will just drop the whole parisc tree for today.
Please just tidy up.

-- 
Cheers,
Stephen Rothwells...@canb.auug.org.au


pgp8B5QIiWk9c.pgp
Description: PGP signature


Re: [PATCH v3 3/5] spi: sunxi: Add Allwinner A31 SPI controller driver

2014-02-03 Thread Mark Brown
On Fri, Jan 31, 2014 at 11:47:04PM +0100, Maxime Ripard wrote:
> On Fri, Jan 31, 2014 at 12:48:09PM +, Mark Brown wrote:
> > On Fri, Jan 31, 2014 at 11:55:50AM +0100, Maxime Ripard wrote:

> > > + pm_runtime_enable(>dev);
> > > + if (!pm_runtime_enabled(>dev)) {
> > > + ret = sun6i_spi_runtime_resume(>dev);
> > > + if (ret) {
> > > + dev_err(>dev, "Couldn't resume the device\n");
> > > + return ret;
> > > + }
> > > + }

> > No, as discussed don't do this - notice how other drivers aren't written
> > this way either.  Like I said leave the device powered on startup and
> > then let it be idled by runtime PM.

> Well, some SPI drivers are actually written like that (all the tegra

It's not been done consistently, no - that should be fixed.

> SPI drivers for example). It's not an excuse, but waking up the device
> only to put it back in suspend right away seems kind of

It isn't awesome, no.  Ideally the runtime PM code would do this but
then you couldn't ifdef the operations which as far as I can tell is the
main thing people want from disabling it and it gets complicated for
devices that genuinely do power up on startup so here we are.

> inefficient. Plus, the pm_runtime_idle callback you suggested are
> actually calling runtime_idle, while we want to call runtime_suspend.

Yeah, I didn't actually check if I was looking at the right call there.


signature.asc
Description: Digital signature


Re: [PATCH 3/8] mm, hugetlb: fix race in region tracking

2014-02-03 Thread Andrew Morton
On Tue, 28 Jan 2014 17:19:38 -0800 Davidlohr Bueso  wrote:

> On Tue, 2014-01-28 at 19:36 -0500, Naoya Horiguchi wrote:
> > On Mon, Jan 27, 2014 at 06:34:17PM -0800, Davidlohr Bueso wrote:
> [...]
> > > > If this retry is really essential for the fix, please comment the reason
> > > > both in patch description and inline comment. It's very important for
> > > > future code maintenance.
> > > 
> > > So we locate the corresponding region in the reserve map, and if we are
> > > below the current region, then we allocate a new one. Since we dropped
> > > the lock to allocate memory, we have to make sure that we still need the
> > > new region and that we don't race with the new status of the reservation
> > > map. This is the whole point of the retry, and I don't see it being
> > > suboptimal.
> > 
> > I'm afraid that you don't explain why you need drop the lock for memory
> > allocation. Are you saying that this unlocking comes from the difference
> > between rwsem and spin lock?
> 
> Because you cannot go to sleep while holding a spinlock, which is
> exactly what kmalloc(GFP_KERNEL) can do. We *might* get a way with it
> with GFP_ATOMIC, I dunno, but I certainly prefer this approach better.

yup.  You could do

foo = kmalloc(size, GFP_NOWAIT);
if (!foo) {
spin_unlock(...);
foo = kmalloc(size, GFP_KERNEL);
if (!foo)
...
spin_lock(...);
}

that avoids the lock/unlock once per allocation.  But it also increases
the lock's average hold times


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2 3/3] watchdog: Add tegra watchdog

2014-02-03 Thread Andrew Chew
Add a driver for the hardware watchdogs in NVIDIA Tegra SoCs (tegra30 and
later).  This driver will configure one watchdog timer that will reset the
system in the case of a watchdog timeout.

This driver binds to the nvidia,tegra30-timer device node and gets its
register base from there.

Signed-off-by: Andrew Chew 
---
 Documentation/watchdog/watchdog-parameters.txt |   5 +
 drivers/watchdog/Kconfig   |  11 +
 drivers/watchdog/Makefile  |   1 +
 drivers/watchdog/tegra_wdt.c   | 372 +
 4 files changed, 389 insertions(+)
 create mode 100644 drivers/watchdog/tegra_wdt.c

diff --git a/Documentation/watchdog/watchdog-parameters.txt 
b/Documentation/watchdog/watchdog-parameters.txt
index f9492fe..b39f355 100644
--- a/Documentation/watchdog/watchdog-parameters.txt
+++ b/Documentation/watchdog/watchdog-parameters.txt
@@ -325,6 +325,11 @@ soft_noboot: Softdog action, set to 1 to ignore reboots, 0 
to reboot
 stmp3xxx_wdt:
 heartbeat: Watchdog heartbeat period in seconds from 1 to 4194304, default 19
 -
+tegra_wdt:
+heartbeat: Watchdog heartbeats in seconds. (default = 120)
+nowayout: Watchdog cannot be stopped once started
+   (default=kernel config parameter)
+-
 ts72xx_wdt:
 timeout: Watchdog timeout in seconds. (1 <= timeout <= 8, default=8)
 nowayout: Disable watchdog shutdown on close
diff --git a/drivers/watchdog/Kconfig b/drivers/watchdog/Kconfig
index 4c4c566..2852447 100644
--- a/drivers/watchdog/Kconfig
+++ b/drivers/watchdog/Kconfig
@@ -420,6 +420,17 @@ config SIRFSOC_WATCHDOG
  Support for CSR SiRFprimaII and SiRFatlasVI watchdog. When
  the watchdog triggers the system will be reset.
 
+config TEGRA_WATCHDOG
+   tristate "Tegra watchdog"
+   depends on ARCH_TEGRA
+   select WATCHDOG_CORE
+   help
+ Say Y here to include support for the watchdog timer
+ embedded in NVIDIA Tegra SoCs.
+
+ To compile this driver as a module, choose M here: the
+ module will be called tegra_wdt.
+
 # AVR32 Architecture
 
 config AT32AP700X_WDT
diff --git a/drivers/watchdog/Makefile b/drivers/watchdog/Makefile
index 985a66c..1b5f3d5 100644
--- a/drivers/watchdog/Makefile
+++ b/drivers/watchdog/Makefile
@@ -58,6 +58,7 @@ obj-$(CONFIG_BCM2835_WDT) += bcm2835_wdt.o
 obj-$(CONFIG_MOXART_WDT) += moxart_wdt.o
 obj-$(CONFIG_SIRFSOC_WATCHDOG) += sirfsoc_wdt.o
 obj-$(CONFIG_BCM_KONA_WDT) += bcm_kona_wdt.o
+obj-$(CONFIG_TEGRA_WATCHDOG) += tegra_wdt.o
 
 # AVR32 Architecture
 obj-$(CONFIG_AT32AP700X_WDT) += at32ap700x_wdt.o
diff --git a/drivers/watchdog/tegra_wdt.c b/drivers/watchdog/tegra_wdt.c
new file mode 100644
index 000..eebe7fc
--- /dev/null
+++ b/drivers/watchdog/tegra_wdt.c
@@ -0,0 +1,372 @@
+/*
+ * Copyright (c) 2013, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+
+/* minimum and maximum watchdog trigger timeout, in seconds */
+#define MIN_WDT_TIMEOUT1
+#define MAX_WDT_TIMEOUT255
+
+/*
+ * Base of the WDT registers, from the timer base address.  There are
+ * actually 5 watchdogs that can be configured (by pairing with an available
+ * timer), at bases 0x100 + (WDT ID) * 0x20, where WDT ID is 0 through 4.
+ * This driver only configures the first watchdog (WDT ID 0).
+ */
+#define WDT_BASE   0x100
+#define WDT_ID 0
+#define WDT_TIMER_BASE TEGRA30_TIMER_WDT_BASE
+#define WDT_TIMER_ID   TEGRA30_TIMER_WDT_ID
+
+/* WDT registers */
+#define WDT_CFG0x0
+#define WDT_CFG_PERIOD_SHIFT   4
+#define WDT_CFG_PERIOD_MASK0xff
+#define WDT_CFG_INT_EN (1 << 12)
+#define WDT_CFG_PMC2CAR_RST_EN (1 << 15)
+#define WDT_STS0x4
+#define WDT_STS_COUNT_SHIFT4
+#define WDT_STS_COUNT_MASK 0xff
+#define WDT_STS_EXP_SHIFT  12
+#define WDT_STS_EXP_MASK   0x3
+#define WDT_CMD0x8
+#define WDT_CMD_START_COUNTER  (1 << 0)
+#define WDT_CMD_DISABLE_COUNTER(1 << 1)
+#define WDT_UNLOCK (0xc)
+#define WDT_UNLOCK_PATTERN (0xc45a << 0)
+
+/* Timer registers */
+#define TIMER_PTV  0x0

[PATCH v2 2/3] clocksource: tegra: Define timer bases in header file

2014-02-03 Thread Andrew Chew
Added timers that are present in tegra30 and later, that are NOT in tegra20.

Also, some of these timer bases are needed in the tegra watchdog driver, so
separate them out into a header file that both the clocksource driver and
the watchdog driver can share.

Signed-off-by: Andrew Chew 
---
 drivers/clocksource/tegra20_timer.c | 15 ++---
 include/clocksource/tegra_timer.h   | 43 +
 2 files changed, 49 insertions(+), 9 deletions(-)
 create mode 100644 include/clocksource/tegra_timer.h

diff --git a/drivers/clocksource/tegra20_timer.c 
b/drivers/clocksource/tegra20_timer.c
index 73cfa56..2c49643 100644
--- a/drivers/clocksource/tegra20_timer.c
+++ b/drivers/clocksource/tegra20_timer.c
@@ -28,6 +28,8 @@
 #include 
 #include 
 
+#include 
+
 #include 
 #include 
 
@@ -39,11 +41,6 @@
 #define TIMERUS_USEC_CFG 0x14
 #define TIMERUS_CNTR_FREEZE 0x4c
 
-#define TIMER1_BASE 0x0
-#define TIMER2_BASE 0x8
-#define TIMER3_BASE 0x50
-#define TIMER4_BASE 0x58
-
 #define TIMER_PTV 0x0
 #define TIMER_PCR 0x4
 
@@ -64,7 +61,7 @@ static int tegra_timer_set_next_event(unsigned long cycles,
u32 reg;
 
reg = 0x8000 | ((cycles > 1) ? (cycles-1) : 0);
-   timer_writel(reg, TIMER3_BASE + TIMER_PTV);
+   timer_writel(reg, TEGRA20_TIMER3_BASE + TIMER_PTV);
 
return 0;
 }
@@ -74,12 +71,12 @@ static void tegra_timer_set_mode(enum clock_event_mode mode,
 {
u32 reg;
 
-   timer_writel(0, TIMER3_BASE + TIMER_PTV);
+   timer_writel(0, TEGRA20_TIMER3_BASE + TIMER_PTV);
 
switch (mode) {
case CLOCK_EVT_MODE_PERIODIC:
reg = 0xC000 | ((100/HZ)-1);
-   timer_writel(reg, TIMER3_BASE + TIMER_PTV);
+   timer_writel(reg, TEGRA20_TIMER3_BASE + TIMER_PTV);
break;
case CLOCK_EVT_MODE_ONESHOT:
break;
@@ -142,7 +139,7 @@ static void tegra_read_persistent_clock(struct timespec *ts)
 static irqreturn_t tegra_timer_interrupt(int irq, void *dev_id)
 {
struct clock_event_device *evt = (struct clock_event_device *)dev_id;
-   timer_writel(1<<30, TIMER3_BASE + TIMER_PCR);
+   timer_writel(1<<30, TEGRA20_TIMER3_BASE + TIMER_PCR);
evt->event_handler(evt);
return IRQ_HANDLED;
 }
diff --git a/include/clocksource/tegra_timer.h 
b/include/clocksource/tegra_timer.h
new file mode 100644
index 000..ea0bc8b
--- /dev/null
+++ b/include/clocksource/tegra_timer.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright (C) 2010 Google, Inc.
+ *
+ * Author:
+ * Colin Cross 
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#ifndef __CLOCKSOURCE_TEGRA_TIMER_H
+#define __CLOCKSOURCE_TEGRA_TIMER_H
+
+/* Tegra 20 timers */
+#define TEGRA20_TIMER1_BASE0x0
+#define TEGRA20_TIMER2_BASE0x8
+#define TEGRA20_TIMER3_BASE0x50
+#define TEGRA20_TIMER4_BASE0x58
+
+/* Tegra 30 timers */
+#define TEGRA30_TIMER1_BASETEGRA20_TIMER1_BASE
+#define TEGRA30_TIMER2_BASETEGRA20_TIMER2_BASE
+#define TEGRA30_TIMER3_BASETEGRA20_TIMER3_BASE
+#define TEGRA30_TIMER4_BASETEGRA20_TIMER4_BASE
+#define TEGRA30_TIMER5_BASE0x60
+#define TEGRA30_TIMER6_BASE0x68
+#define TEGRA30_TIMER7_BASE0x70
+#define TEGRA30_TIMER8_BASE0x78
+#define TEGRA30_TIMER9_BASE0x80
+#define TEGRA30_TIMER0_BASE0x88
+
+/* Used by the tegra watchdog timer */
+#define TEGRA30_TIMER_WDT_BASE TEGRA30_TIMER5_BASE
+#define TEGRA30_TIMER_WDT_ID   5
+
+#endif /* __CLOCKSOURCE_TEGRA_TIMER_H */
-- 
1.8.1.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2 1/3] clocksource: tegra: Add nvidia,tegra30-timer compat

2014-02-03 Thread Andrew Chew
There are some differences between tegra20's timer registers and tegra30's
(and later).  For example, tegra30 has more timers.  In addition, watchdogs
are not present in tegra20.

Add this compatibility string in order to be able to distinguish
whether the additional timers and watchdogs are there or not.

Signed-off-by: Andrew Chew 
Acked-by: Stephen Warren 
---
 drivers/clocksource/tegra20_timer.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/clocksource/tegra20_timer.c 
b/drivers/clocksource/tegra20_timer.c
index d1869f0..73cfa56 100644
--- a/drivers/clocksource/tegra20_timer.c
+++ b/drivers/clocksource/tegra20_timer.c
@@ -218,6 +218,7 @@ static void __init tegra20_init_timer(struct device_node 
*np)
0x1, 0x1fff);
 }
 CLOCKSOURCE_OF_DECLARE(tegra20_timer, "nvidia,tegra20-timer", 
tegra20_init_timer);
+CLOCKSOURCE_OF_DECLARE(tegra30_timer, "nvidia,tegra30-timer", 
tegra20_init_timer);
 
 static void __init tegra20_init_rtc(struct device_node *np)
 {
-- 
1.8.1.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2 0/3] tegra30 watchdog support

2014-02-03 Thread Andrew Chew
This patch series ultimately adds watchdog support for tegra30 and later
chips.

The existing tegra clocksource driver (drivers/clocksource/tegra20_timer.c)
sadly does not distinguish between tegra20 and tegra30 (and later), which
it should have done since the contents of the timer register base have
changed significantly.  In particular, tegra30 (and later) has more timers,
and also hardware watchdog registers.

The first patch adds nvidia,tegra30-timer to the list of compatibility
strings for the tegra timer device tree node, so that we can distinguish
between tegra20 and tegra30 (and later).

The second patch separates out some macros that are interesting to other
drivers (in particular, the tegra watchdog driver), and also adds the
missing timers that are present in tegra30 and later.

The third patch adds the actual watchdog driver.  This driver configures
a single watchdog (watchdog 0), pairs it with timer 5
(defined as TEGRA30_TIMER_WDT_* in the shared header file from the previous
patch), and sets it up so that timer expiration causes the target
system to reset.

I've decided to encapsulate all related changes into one patch series, since
I did not modify any device tree bindings and therefore don't need to review
dt changes separately.  This way, everything can be seen within its complete
context.

Andrew Chew (3):
  clocksource: tegra: Add nvidia,tegra30-timer compat
  clocksource: tegra: Define timer bases in header file
  watchdog: Add tegra watchdog

 Documentation/watchdog/watchdog-parameters.txt |   5 +
 drivers/clocksource/tegra20_timer.c|  16 +-
 drivers/watchdog/Kconfig   |  11 +
 drivers/watchdog/Makefile  |   1 +
 drivers/watchdog/tegra_wdt.c   | 372 +
 include/clocksource/tegra_timer.h  |  43 +++
 6 files changed, 439 insertions(+), 9 deletions(-)
 create mode 100644 drivers/watchdog/tegra_wdt.c
 create mode 100644 include/clocksource/tegra_timer.h

-- 
1.8.1.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: Need help in bug in isolate_migratepages_range

2014-02-03 Thread David Rientjes
On Mon, 3 Feb 2014, Vlastimil Babka wrote:

> It seems to come from balloon_page_movable() and its test page_count(page) ==
> 1.
> 

Hmm, I think it might be because compound_head() == NULL here.  Holger, 
this looks like a race condition when allocating a compound page, did you 
only see it once or is it actually reproducible?

I think this happens when a new compound page is allocated and PageBuddy 
is cleared before prep_compound_page() and then we see PageTail(p) set but 
p->first_page is not yet initialized.  Is there any way to avoid memory 
barriers in compound_page()?
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [patch] mm, compaction: avoid isolating pinned pages

2014-02-03 Thread Joonsoo Kim
On Mon, Feb 03, 2014 at 02:49:32AM -0800, David Rientjes wrote:
> On Mon, 3 Feb 2014, Mel Gorman wrote:
> 
> > > Page migration will fail for memory that is pinned in memory with, for
> > > example, get_user_pages().  In this case, it is unnecessary to take
> > > zone->lru_lock or isolating the page and passing it to page migration
> > > which will ultimately fail.
> > > 
> > > This is a racy check, the page can still change from under us, but in
> > > that case we'll just fail later when attempting to move the page.
> > > 
> > > This avoids very expensive memory compaction when faulting transparent
> > > hugepages after pinning a lot of memory with a Mellanox driver.
> > > 
> > > On a 128GB machine and pinning ~120GB of memory, before this patch we
> > > see the enormous disparity in the number of page migration failures
> > > because of the pinning (from /proc/vmstat):
> > > 
> > > compact_blocks_moved 7609
> > > compact_pages_moved 3431
> > > compact_pagemigrate_failed 133219
> > > compact_stall 13
> > > 
> > > After the patch, it is much more efficient:
> > > 
> > > compact_blocks_moved 7998
> > > compact_pages_moved 6403
> > > compact_pagemigrate_failed 3
> > > compact_stall 15
> > > 
> > > Signed-off-by: David Rientjes 
> > > ---
> > >  mm/compaction.c | 8 
> > >  1 file changed, 8 insertions(+)
> > > 
> > > diff --git a/mm/compaction.c b/mm/compaction.c
> > > --- a/mm/compaction.c
> > > +++ b/mm/compaction.c
> > > @@ -578,6 +578,14 @@ isolate_migratepages_range(struct zone *zone, struct 
> > > compact_control *cc,
> > >   continue;
> > >   }
> > >  
> > > + /*
> > > +  * Migration will fail if an anonymous page is pinned in memory,
> > > +  * so avoid taking zone->lru_lock and isolating it unnecessarily
> > > +  * in an admittedly racy check.
> > > +  */
> > > + if (!page_mapping(page) && page_count(page))
> > > + continue;
> > > +

Hello,

I think that you need more code to skip this type of page correctly.
Without a page_mapped() check, this code causes migratable pages to be skipped,
since page_count() may be greater than zero when the page is mapped.

So I think that you need following change.

(!page_mapping(page) && !page_mapped(page) && page_count(page))

Thanks.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 0/4] w1: refcnt fix, skip non-error send, docs

2014-02-03 Thread z...@ioremap.net
Hi

03.02.2014, 05:15, "David Fries" :

>  I could submit these patches as is, which would require the previous
>  set, or I could merge the documentation into the previous set and
>  resubmit them all since they haven't made it into the kernel tree yet.
>  Opinions?
>
>  Here's a small refcnt fix, skipping sending non-error messages, and
>  documentation and comment updates.
>
>  non-error error messages:
>  Currently every master or slave command is sending a response with
>  w1_netlink_send_error no matter if there is an error or not.  This
>  makes commands like list slaves W1_CMD_LIST_SLAVES or W1_CMD_READ
>  return two messages, one with data and one without.  That is a problem
>  with the list slaves because they are identical except for one having
>  data and one not, and since there could be no slaves known to the
>  kernel you can't just discard the no data case, unless the program
>  were to expect two replies.  So I propose only sending the error reply
>  if there is an error, in which case there wouldn't be a normal reply
>  (such as read).  This would mean commands like write would no longer
>  return a response unless there was an error.  If an application wanted
>  to verify the kernel received the write message it could follow it by
>  a read to verify the data or just that read came after write and had a
>  response so write must have completed without error.  I think it is
>  safe to do away with the extra replies.  If someone sees a big enough
>  need for this, I could modify it so all commands return one response,
>  with commands like write always calling send error even if there
>  wasn't one.

I created this protocol to handle cases where nothing is returned, yet
userspace still learns that the operation has completed. Also, you cannot
really change it at this point - there are already userspace applications
which may depend on the last ack to find out that their request completed.

Reference counter fix is correct, please submit it in the separate patch.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 1/2] ALSA: hda - hdmi: introduce patch_nvhdmi()

2014-02-03 Thread Stephen Warren
From: Anssi Hannula 

(This is a backport of *part* of upstream 611885bc963a "ALSA: hda -
hdmi: Disallow unsupported 2ch remapping on NVIDIA codecs" to stable
3.10 through 3.12. Later stable kernels already contain all of the
original patch.)

Mainline commit 611885bc963a "ALSA: hda - hdmi: Disallow unsupported 2ch
remapping on NVIDIA codecs" introduces function patch_nvhdmi(). That
function is edited by 75fae117a5db "ALSA: hda/hdmi - allow PIN_OUT to be
dynamically enabled". In order to backport the PIN_OUT patch, I am first
back-porting just the addition of function patch_nvhdmi(), so that the
conflicts applying the PIN_OUT patch are simplified.

Ideally, one might backport all of 611885bc963a. However, that commit
doesn't apply to stable kernels, since it relies on a chain of other
patches which implement new features.

Signed-off-by: Anssi Hannula 
Signed-off-by: Takashi Iwai 
[swarren, extracted just a small part of the original patch]
Signed-off-by: Stephen Warren 
---
 sound/pci/hda/patch_hdmi.c | 60 --
 1 file changed, 37 insertions(+), 23 deletions(-)

diff --git a/sound/pci/hda/patch_hdmi.c b/sound/pci/hda/patch_hdmi.c
index f26c42c92db7..aad73a1fc2cd 100644
--- a/sound/pci/hda/patch_hdmi.c
+++ b/sound/pci/hda/patch_hdmi.c
@@ -2560,6 +2560,20 @@ static int patch_nvhdmi_8ch_7x(struct hda_codec *codec)
return 0;
 }
 
+static int patch_nvhdmi(struct hda_codec *codec)
+{
+   struct hdmi_spec *spec;
+   int err;
+
+   err = patch_generic_hdmi(codec);
+   if (err)
+   return err;
+
+   spec = codec->spec;
+
+   return 0;
+}
+
 /*
  * ATI-specific implementations
  *
@@ -2632,30 +2646,30 @@ static const struct hda_codec_preset 
snd_hda_preset_hdmi[] = {
 { .id = 0x10de0005, .name = "MCP77/78 HDMI",   .patch = patch_nvhdmi_8ch_7x },
 { .id = 0x10de0006, .name = "MCP77/78 HDMI",   .patch = patch_nvhdmi_8ch_7x },
 { .id = 0x10de0007, .name = "MCP79/7A HDMI",   .patch = patch_nvhdmi_8ch_7x },
-{ .id = 0x10de000a, .name = "GPU 0a HDMI/DP",  .patch = patch_generic_hdmi },
-{ .id = 0x10de000b, .name = "GPU 0b HDMI/DP",  .patch = patch_generic_hdmi },
-{ .id = 0x10de000c, .name = "MCP89 HDMI",  .patch = patch_generic_hdmi },
-{ .id = 0x10de000d, .name = "GPU 0d HDMI/DP",  .patch = patch_generic_hdmi },
-{ .id = 0x10de0010, .name = "GPU 10 HDMI/DP",  .patch = patch_generic_hdmi },
-{ .id = 0x10de0011, .name = "GPU 11 HDMI/DP",  .patch = patch_generic_hdmi },
-{ .id = 0x10de0012, .name = "GPU 12 HDMI/DP",  .patch = patch_generic_hdmi },
-{ .id = 0x10de0013, .name = "GPU 13 HDMI/DP",  .patch = patch_generic_hdmi },
-{ .id = 0x10de0014, .name = "GPU 14 HDMI/DP",  .patch = patch_generic_hdmi },
-{ .id = 0x10de0015, .name = "GPU 15 HDMI/DP",  .patch = patch_generic_hdmi },
-{ .id = 0x10de0016, .name = "GPU 16 HDMI/DP",  .patch = patch_generic_hdmi },
+{ .id = 0x10de000a, .name = "GPU 0a HDMI/DP",  .patch = patch_nvhdmi },
+{ .id = 0x10de000b, .name = "GPU 0b HDMI/DP",  .patch = patch_nvhdmi },
+{ .id = 0x10de000c, .name = "MCP89 HDMI",  .patch = patch_nvhdmi },
+{ .id = 0x10de000d, .name = "GPU 0d HDMI/DP",  .patch = patch_nvhdmi },
+{ .id = 0x10de0010, .name = "GPU 10 HDMI/DP",  .patch = patch_nvhdmi },
+{ .id = 0x10de0011, .name = "GPU 11 HDMI/DP",  .patch = patch_nvhdmi },
+{ .id = 0x10de0012, .name = "GPU 12 HDMI/DP",  .patch = patch_nvhdmi },
+{ .id = 0x10de0013, .name = "GPU 13 HDMI/DP",  .patch = patch_nvhdmi },
+{ .id = 0x10de0014, .name = "GPU 14 HDMI/DP",  .patch = patch_nvhdmi },
+{ .id = 0x10de0015, .name = "GPU 15 HDMI/DP",  .patch = patch_nvhdmi },
+{ .id = 0x10de0016, .name = "GPU 16 HDMI/DP",  .patch = patch_nvhdmi },
 /* 17 is known to be absent */
-{ .id = 0x10de0018, .name = "GPU 18 HDMI/DP",  .patch = patch_generic_hdmi },
-{ .id = 0x10de0019, .name = "GPU 19 HDMI/DP",  .patch = patch_generic_hdmi },
-{ .id = 0x10de001a, .name = "GPU 1a HDMI/DP",  .patch = patch_generic_hdmi },
-{ .id = 0x10de001b, .name = "GPU 1b HDMI/DP",  .patch = patch_generic_hdmi },
-{ .id = 0x10de001c, .name = "GPU 1c HDMI/DP",  .patch = patch_generic_hdmi },
-{ .id = 0x10de0040, .name = "GPU 40 HDMI/DP",  .patch = patch_generic_hdmi },
-{ .id = 0x10de0041, .name = "GPU 41 HDMI/DP",  .patch = patch_generic_hdmi },
-{ .id = 0x10de0042, .name = "GPU 42 HDMI/DP",  .patch = patch_generic_hdmi },
-{ .id = 0x10de0043, .name = "GPU 43 HDMI/DP",  .patch = patch_generic_hdmi },
-{ .id = 0x10de0044, .name = "GPU 44 HDMI/DP",  .patch = patch_generic_hdmi },
-{ .id = 0x10de0051, .name = "GPU 51 HDMI/DP",  .patch = patch_generic_hdmi },
-{ .id = 0x10de0060, .name = "GPU 60 HDMI/DP",  .patch = patch_generic_hdmi },
+{ .id = 0x10de0018, .name = "GPU 18 HDMI/DP",  .patch = patch_nvhdmi },
+{ .id = 0x10de0019, .name = "GPU 19 HDMI/DP",  .patch = patch_nvhdmi },
+{ .id = 0x10de001a, .name = "GPU 1a HDMI/DP",  .patch = patch_nvhdmi },
+{ .id = 0x10de001b, .name = "GPU 1b HDMI/DP",  .patch = patch_nvhdmi },
+{ .id = 0x10de001c, .name = "GPU 1c HDMI/DP",  

  1   2   3   4   5   6   7   8   9   10   >